[Swift-commit] r3766 - in provenancedb: . apps/oops
noreply at svn.ci.uchicago.edu
Sun Dec 12 17:29:55 CST 2010
Author: lgadelha
Date: 2010-12-12 17:29:54 -0600 (Sun, 12 Dec 2010)
New Revision: 3766
Modified:
provenancedb/apps/oops/oops_extractor.sh
provenancedb/apps/oops/raptor_extractor.sh
provenancedb/prov-init.sql
Log:
Minor changes to OOPS provenance extractor.
Modified: provenancedb/apps/oops/oops_extractor.sh
===================================================================
--- provenancedb/apps/oops/oops_extractor.sh 2010-12-11 22:46:59 UTC (rev 3765)
+++ provenancedb/apps/oops/oops_extractor.sh 2010-12-12 23:29:54 UTC (rev 3766)
@@ -19,13 +19,11 @@
for k in $(ls -1);
do
cd $PROTESTS_HOME/$k
- for i in $(ls | grep run.loops);
+ for i in $(ls | grep run.raptorloops; ls | grep run.loops);
do
cd $IMPORT_HOME
if ! grep --silent $i provdb_imported; then
if grep --silent "Swift finished with no errors" $PROTESTS_HOME/$k/$i/psim.loops-*.log; then
- cd swift-logs
- cd import
# swift-prov-import-all-logs also controls what already has been
# imported, so it does not repeat work
echo "export LOGREPO=$PROTESTS_HOME/$k/$i" > $PROVDB_HOME/etc/provenance.config
@@ -38,40 +36,85 @@
# annotate workflows with their oops runid
OOPS_RUN_ID=$(echo $i | awk -F . '{print $3}')
cd $PROTESTS_HOME/$k/$i
- LOG_FILENAME=$(ls | grep psim.loops- | grep "\."log$)
+ LOG_FILENAME=$(ls *-*-*-*.log)
WORKFLOW_ID=$(echo "select id from workflow where log_filename like '%$LOG_FILENAME%'" | $SQLCMD -t | awk '{print $1}')
cd $IMPORT_HOME/swift-logs
echo "insert into annot_wf_txt (id, name, value) values ('$WORKFLOW_ID','oops_run_id','$OOPS_RUN_ID');" | $SQLCMD
- #
- echo "select file.id,file.filename from process, ds_usage, ds_containment, file where process.id=ds_usage.process_id and ds_usage.dataset_id=out_id and file.id=ds_containment.in_id and filename like '%.params' and process.name='PrepLoop' and process.workflow_id='$WORKFLOW_ID';" > query.sql;
-
+ # extracts scientific parameters given as input to the workflow in file *.params.
+ echo "select file.id,file.filename from process, ds_usage, ds_containment, file where process.id=ds_usage.process_id and ds_usage.dataset_id=out_id and file.id=ds_containment.in_id and filename like '%.params' and process.name='PrepLoop' and process.workflow_id='$WORKFLOW_ID' and ds_usage.direction='O';" > query.sql;
+
$SQLCMD -t -A -F " " -f query.sql -o result.txt
- DATASET_ID=$(awk '{print $1}' result.txt)
- FILENAME=$(awk '{print $2}' result.txt | sed 's/file:\/\/localhost\///g')
+ DATASET_ID=$(awk '{if (NR==1) print $1}' result.txt)
+ FILENAME=$(awk '{if (NR==1) print $2}' result.txt | sed 's/file:\/\/localhost\///g')
- cd $PROTESTS_HOME/$k/run.loops.$OOPS_RUN_ID
+ cd $PROTESTS_HOME/$k/$i
while read line; do
NAME=$(echo $line | awk 'BEGIN { FS = "=" }; {print $1}')
+ RIGHT=$(echo $line | awk 'BEGIN { FS = "=" }; {print $2}')
if [ "$NAME" = "SAMPLE RANGE" ]; then
- VALUE1=$(echo $line | awk 'BEGIN { FS = "=" }; {print $2}' | awk 'BEGIN { FS = "-" }; {print $1}')
- VALUE2=$(echo $line | awk 'BEGIN { FS = "=" }; {print $2}' | awk 'BEGIN { FS = "-" }; {print $2}')
- echo "insert into annot_ds_num values ('$DATASET_ID', '$NAME BEGIN', $VALUE1);" | $SQLCMD
- echo "insert into annot_ds_num values ('$DATASET_ID', '$NAME END', $VALUE2);" | $SQLCMD
+ echo "insert into annot_ds_txt values ('$DATASET_ID', 'sample_range', '$RIGHT');" | $SQLCMD
fi
if [ "$NAME" = "RESTRAIN DISTANCE" ]; then
- VALUE1=$(echo $line | awk 'BEGIN { FS = "=" }; {print $2}' | awk 'BEGIN { FS = "," }; {print $1}')
+ VALUE1=$(echo $RIGHT | awk 'BEGIN { FS = "," }; {print $1}')
VALUE2=$(echo $line | awk 'BEGIN { FS = "=" }; {print $2}' | awk 'BEGIN { FS = "," }; {print $2}')
- echo "insert into annot_ds_num values ('$DATASET_ID', '$NAME 1', $VALUE1);" | $SQLCMD
- echo "insert into annot_ds_num values ('$DATASET_ID', '$NAME 2', $VALUE2);" | $SQLCMD
+ echo "insert into annot_ds_num values ('$DATASET_ID', 'restrain_distance_1', $VALUE1);" | $SQLCMD
+ echo "insert into annot_ds_num values ('$DATASET_ID', 'restrain_distance_2', $VALUE2);" | $SQLCMD
fi
if [ "$NAME" = "MAXIMUM NUMBER OF STEPS" ]; then
- VALUE=$(echo $line | awk 'BEGIN { FS = "=" }; {print $2}')
- echo "insert into annot_ds_num values ('$DATASET_ID', '$NAME', $VALUE);" | $SQLCMD
+ echo "insert into annot_ds_num values ('$DATASET_ID', 'maximum_number_of_steps', $RIGHT);" | $SQLCMD
fi
done < $FILENAME
+
+ # extracts post-run statistics recorded by LoopModel in its *.log output files.
+ # relevant lines:
+ # zone2 (Initial Energy: -21352.116911)
+ # Total Function Evaluations: 20000
+ # Accepted transitions: 7410
+ # Increasing transitions: 4525
+ # Decreasing transitions: 2885
+ # Rejected transitions: 12590
+ # Final Energy: -27152.264775
+ # Final Temp: 79.778142
+ # Total Running Time: 18006
+
+ echo "select file.id,file.filename from process, ds_usage, ds_containment, file where process.id=ds_usage.process_id and ds_usage.dataset_id=out_id and file.id=ds_containment.in_id and filename like '%.log' and process.name='LoopModel' and process.workflow_id='$WORKFLOW_ID' and ds_usage.direction='O';" > query.sql;
+
+ $SQLCMD -t -A -F " " -f query.sql -o result.txt
+
+ while read dataset filename; do
+ DATASET_ID=$(awk '{if (NR==1) print $1}' result.txt)
+ FILENAME=$(awk '{if (NR==1) print $2}' result.txt | sed 's/file:\/\/localhost\///g')
+ while read token1 token2 token3 token4; do
+ if [ "$token2" = "(Initial Energy:" ]; then
+ initialenergy=$(echo $token3 | awk 'BEGIN { FS = "\)" }; {print $1}')
+ echo "insert into annot_ds_num (id, name, value) values ('$dataset', 'initial_energy', $initialenergy);" | $SQLCMD
+ fi
+ if [ "$token1" = "Total" && "$token2" = "Function" && "$token3" = "Evaluations:" ]; then
+ echo "insert into annot_ds_num (id, name, value) values ('$dataset', 'total_function_evaluations', $token4);" | $SQLCMD
+ fi
+ if [ "$token1" = "Increasing" && "$token2" = "transitions:" ]; then
+ echo "insert into annot_ds_num (id, name, value) values ('$dataset', 'accepted_increasing_transitions', $token3);" | $SQLCMD
+ fi
+ if [ "$token1" = "Decreasing" && "$token2" = "transitions:" ]; then
+ echo "insert into annot_ds_num (id, name, value) values ('$dataset', 'accepted_decreasing_transitions', $token3);" | $SQLCMD
+ fi
+ if [ "$token1" = "Rejected" && "$token2" = "transitions:" ]; then
+ echo "insert into annot_ds_num (id, name, value) values ('$dataset', 'rejected_transitions', $token3);" | $SQLCMD
+ fi
+ if [ "$token1" = "Final" && "$token2" = "Energy:" ]; then
+ echo "insert into annot_ds_num (id, name, value) values ('$dataset', 'final_energy', $token3);" | $SQLCMD
+ fi
+ if [ "$token1" = "Final" && "$token2" = "Temp:" ]; then
+ echo "insert into annot_ds_num (id, name, value) values ('$dataset', 'final_temp', $token3);" | $SQLCMD
+ fi
+ if [ "$token1" = "Total" && "$token2" = "Running" && "$token3" = "Time:" ]; then
+ echo "insert into annot_ds_num (id, name, value) values ('$dataset', 'total_running_time', $token4);" | $SQLCMD
+ fi
+ done < result.txt
+ done < $FILENAME
fi
fi
done
Modified: provenancedb/apps/oops/raptor_extractor.sh
===================================================================
--- provenancedb/apps/oops/raptor_extractor.sh 2010-12-11 22:46:59 UTC (rev 3765)
+++ provenancedb/apps/oops/raptor_extractor.sh 2010-12-12 23:29:54 UTC (rev 3766)
@@ -20,7 +20,7 @@
for k in $(ls -1);
do
cd $PROTESTS_HOME/$k
- for i in $(ls | grep run.raptorloops);
+ for i in $(ls | grep run.raptorloops; ls | grep run.loops);
do
cd $IMPORT_HOME
if ! grep --silent $i provdb_imported_raptor; then
@@ -33,7 +33,7 @@
cd $IMPORT_HOME
echo $i >> provdb_imported_raptor
cd swift-logs
- # annotate workflows with their oops runid
+ # annotate workflows with their oops runid
OOPS_RUN_ID=$(echo $i | awk -F . '{print $3}')
cd $PROTESTS_HOME/$k/$i
LOG_FILENAME=$(ls | grep RaptorLoops- | grep "\."log$)
@@ -45,8 +45,8 @@
$SQLCMD -t -A -F " " -f query.sql -o result.txt
- DATASET_ID=`awk '{if (NR==1) print $1}' result.txt`
- FILENAME=`awk '{if (NR==1) print $2}' result.txt | sed 's/file:\/\/localhost\///g'`
+ DATASET_ID=$(awk '{if (NR==1) print $1}' result.txt)
+ FILENAME=$(awk '{if (NR==1) print $2}' result.txt | sed 's/file:\/\/localhost\///g')
cd $PROTESTS_HOME/$k/run.raptorloops.$OOPS_RUN_ID
Modified: provenancedb/prov-init.sql
===================================================================
--- provenancedb/prov-init.sql 2010-12-11 22:46:59 UTC (rev 3765)
+++ provenancedb/prov-init.sql 2010-12-12 23:29:54 UTC (rev 3766)
@@ -1,41 +1,41 @@
-- this is the schema definition used for the main relational provenance
-- implementation (in both sqlite3 and postgres)
-DROP TABLE dataset CASCADE;
-DROP TABLE file CASCADE;
-DROP TABLE variable CASCADE;
-DROP TABLE ds_containment CASCADE;
-DROP TABLE process CASCADE;
-DROP TABLE execute CASCADE;
-DROP TABLE execute2 CASCADE;
-DROP TABLE workflow CASCADE;
-DROP TABLE ds_usage CASCADE;
-DROP TABLE annot_ds_num CASCADE;
-DROP TABLE annot_ds_txt CASCADE;
-DROP TABLE annot_ds_bool CASCADE;
-DROP TABLE annot_p_num CASCADE;
-DROP TABLE annot_p_txt CASCADE;
-DROP TABLE annot_p_bool CASCADE;
-DROP TABLE annot_wf_num CASCADE;
-DROP TABLE annot_wf_txt CASCADE;
-DROP TABLE annot_wf_bool CASCADE;
--- DROP TABLE extrainfo CASCADE;
-DROP TABLE createarray CASCADE;
-DROP TABLE createarray_member CASCADE;
-DROP TABLE array_range CASCADE;
+drop table dataset cascade;
+drop table file cascade;
+drop table variable cascade;
+drop table ds_containment cascade;
+drop table process cascade;
+drop table execute cascade;
+drop table execute2 cascade;
+drop table workflow cascade;
+drop table ds_usage cascade;
+drop table annot_ds_num cascade;
+drop table annot_ds_txt cascade;
+drop table annot_ds_bool cascade;
+drop table annot_p_num cascade;
+drop table annot_p_txt cascade;
+drop table annot_p_bool cascade;
+drop table annot_wf_num cascade;
+drop table annot_wf_txt cascade;
+drop table annot_wf_bool cascade;
+-- drop table extrainfo cascade;
+drop table createarray cascade;
+drop table createarray_member cascade;
+drop table array_range cascade;
-- workflow stores some information about each workflow log that has
-- been seen by the importer: the log filename, swift version and import
-- status.
-- Might be interesting to store xml translation of the swiftscript code
-- here for prospective provenance/versioning
-CREATE TABLE workflow
- (id VARCHAR(256) PRIMARY KEY,
- log_filename VARCHAR(2048),
- swift_version VARCHAR(16),
- import_status VARCHAR(16),
- start_time NUMERIC,
- duration NUMERIC
+create table workflow
+ (id varchar(256) primary key,
+ log_filename varchar(2048),
+ swift_version varchar(16),
+ import_status varchar(16),
+ start_time numeric,
+ duration numeric
);
-- workflow_run stores the start time and duration for each workflow
@@ -47,14 +47,14 @@
-- );
-- dataset stores all dataset identifiers.
-CREATE TABLE dataset
- (id VARCHAR(256) PRIMARY KEY
+create table dataset
+ (id varchar(256) primary key
);
-- file stores the filename mapped to each dataset.
-CREATE TABLE file
- ( id VARCHAR(256) PRIMARY KEY REFERENCES dataset (id) ON DELETE CASCADE,
- filename VARCHAR(2048)
+create table file
+ ( id varchar(256) primary key references dataset (id) on delete cascade,
+ filename varchar(2048)
);
-- dataset_values stores the value for each dataset which is known to have
@@ -62,9 +62,9 @@
-- to expose that value as an SQL type other than a string, and so (for
-- example) SQL numerical operations should not be expected to work, even
-- though the user knows that a particular dataset stores a numeric value.
-CREATE TABLE variable
- ( id VARCHAR(256) PRIMARY KEY REFERENCES dataset (id) ON DELETE CASCADE,
- value VARCHAR(2048)
+create table variable
+ ( id varchar(256) primary key references dataset (id) on delete cascade,
+ value varchar(2048)
);
-- dataset_containment stores the containment hierarchy between
@@ -74,10 +74,10 @@
-- constructors and accessors, rather than, or in addition to,
-- a containment hierarchy. The relationship (such as array index or
-- structure member name) should also be stored in this table.
-CREATE TABLE ds_containment
- ( out_id VARCHAR(256) REFERENCES dataset (id) ON DELETE CASCADE,
- in_id VARCHAR(256) REFERENCES dataset (id) ON DELETE CASCADE,
- PRIMARY KEY (out_id,in_id)
+create table ds_containment
+ ( out_id varchar(256) references dataset (id) on delete cascade,
+ in_id varchar(256) references dataset (id) on delete cascade,
+ primary key (out_id,in_id)
);
-- process gives information about each process (in the OPM sense)
@@ -88,114 +88,114 @@
-- Having this type here seems poor normalisation, though?
-- process types: internal, rootthread, execute, function, compound, scope, operator
-- maybe create a table for each type?
-CREATE TABLE process
- (id VARCHAR(256) PRIMARY KEY,
- type VARCHAR(16),
- name VARCHAR(256), -- in the case of an execute this refers to the transformation name in tc.data
- workflow_id VARCHAR(256) REFERENCES workflow (id) ON DELETE CASCADE -- normalize: workflow_id of sub-procedure determined
+create table process
+ (id varchar(256) primary key,
+ type varchar(16),
+ name varchar(256), -- in the case of an execute this refers to the transformation name in tc.data
+ workflow_id varchar(256) references workflow (id) on delete cascade -- normalize: workflow_id of sub-procedure determined
-- by compound procedure
);
-- this gives information about each execute.
-- each execute is identified by a unique URI. other information from
-- swift logs is also stored here. an execute is an OPM process.
-CREATE TABLE execute
- (id VARCHAR(256) PRIMARY KEY REFERENCES process (id) ON DELETE CASCADE,
- procedure_name VARCHAR(256), -- name of the app procedure that invokes the transformation
- start_time NUMERIC,
- duration NUMERIC,
- final_state VARCHAR(16),
- scratch VARCHAR(2048)
+create table execute
+ (id varchar(256) primary key references process (id) on delete cascade,
+ procedure_name varchar(256), -- name of the app procedure that invokes the transformation
+ start_time numeric,
+ duration numeric,
+ final_state varchar(16),
+ scratch varchar(2048)
);
-- this gives information about each execute2, which is an attempt to
-- perform an execution. the execute2 id is tied to per-execution-attempt
-- information such as wrapper logs
-CREATE TABLE execute2
- (id VARCHAR(256) PRIMARY KEY,
- execute_id VARCHAR(256) REFERENCES execute (id) ON DELETE CASCADE,
- start_time NUMERIC,
- duration NUMERIC,
- final_state VARCHAR(16),
- site VARCHAR(256)
+create table execute2
+ (id varchar(256) primary key,
+ execute_id varchar(256) references execute (id) on delete cascade,
+ start_time numeric,
+ duration numeric,
+ final_state varchar(16),
+ site varchar(256)
);
-- dataset_usage records usage relationships between processes and datasets;
-- in SwiftScript terms, the input and output parameters for each
-- application procedure invocation; in OPM terms, the artificts which are
-- input to and output from each process that is a Swift execution
-CREATE TABLE ds_usage
- (process_id VARCHAR(256) REFERENCES process(id) ON DELETE CASCADE,
- direction CHAR(1), -- I or O for input or output
- dataset_id VARCHAR(256) REFERENCES dataset(id) ON DELETE CASCADE,
- param_name VARCHAR(256), -- the name of the parameter in this execute that
+create table ds_usage
+ (process_id varchar(256) references process(id) on delete cascade,
+ direction char(1), -- I or O for input or output
+ dataset_id varchar(256) references dataset(id) on delete cascade,
+ param_name varchar(256), -- the name of the parameter in this execute that
-- this dataset was bound to. sometimes this must
-- be contrived (for example, in positional varargs)
- PRIMARY KEY (process_id,direction,dataset_id,param_name)
+ primary key (process_id,direction,dataset_id,param_name)
);
-- annotations
-CREATE TABLE annot_ds_num
- ( id VARCHAR(256) REFERENCES dataset (id) ON DELETE CASCADE,
- name VARCHAR(256),
- value NUMERIC,
- PRIMARY KEY (id, name)
+create table annot_ds_num
+ ( id varchar(256) references dataset (id) on delete cascade,
+ name varchar(256),
+ value numeric,
+ primary key (id, name)
);
-CREATE TABLE annot_ds_txt
- ( id VARCHAR(256) REFERENCES dataset (id) ON DELETE CASCADE,
- name VARCHAR(256),
- value VARCHAR(2048),
- PRIMARY KEY (id, name)
+create table annot_ds_txt
+ ( id varchar(256) references dataset (id) on delete cascade,
+ name varchar(256),
+ value varchar(2048),
+ primary key (id, name)
);
-CREATE TABLE annot_ds_bool
- ( id VARCHAR(256) REFERENCES dataset (id) ON DELETE CASCADE,
- name VARCHAR(256),
- value BOOLEAN,
- PRIMARY KEY (id, name)
+create table annot_ds_bool
+ ( id varchar(256) references dataset (id) on delete cascade,
+ name varchar(256),
+ value boolean,
+ primary key (id, name)
);
-CREATE TABLE annot_p_num
- ( id VARCHAR(256) REFERENCES process (id) ON DELETE CASCADE,
- name VARCHAR(256),
- value NUMERIC,
- PRIMARY KEY (id, name)
+create table annot_p_num
+ ( id varchar(256) references process (id) on delete cascade,
+ name varchar(256),
+ value numeric,
+ primary key (id, name)
);
-CREATE TABLE annot_p_txt
- ( id VARCHAR(256) REFERENCES process (id) ON DELETE CASCADE,
- name VARCHAR(256),
- value VARCHAR(2048),
- PRIMARY KEY (id, name)
+create table annot_p_txt
+ ( id varchar(256) references process (id) on delete cascade,
+ name varchar(256),
+ value varchar(2048),
+ primary key (id, name)
);
-CREATE TABLE annot_p_bool
- ( id VARCHAR(256) REFERENCES process (id) ON DELETE CASCADE,
- name VARCHAR(256),
- value BOOLEAN,
- PRIMARY KEY (id, name)
+create table annot_p_bool
+ ( id varchar(256) references process (id) on delete cascade,
+ name varchar(256),
+ value boolean,
+ primary key (id, name)
);
-CREATE TABLE annot_wf_num
- ( id VARCHAR(256) REFERENCES workflow (id) ON DELETE CASCADE,
- name VARCHAR(256),
- value NUMERIC,
- PRIMARY KEY (id, name)
+create table annot_wf_num
+ ( id varchar(256) references workflow (id) on delete cascade,
+ name varchar(256),
+ value numeric,
+ primary key (id, name)
);
-CREATE TABLE annot_wf_txt
- ( id VARCHAR(256) REFERENCES workflow (id) ON DELETE CASCADE,
- name VARCHAR(256),
- value VARCHAR(2048),
- PRIMARY KEY (id, name)
+create table annot_wf_txt
+ ( id varchar(256) references workflow (id) on delete cascade,
+ name varchar(256),
+ value varchar(2048),
+ primary key (id, name)
);
-CREATE TABLE annot_wf_bool
- ( id VARCHAR(256) REFERENCES workflow (id) ON DELETE CASCADE,
- name VARCHAR(2048),
- value BOOLEAN,
- PRIMARY KEY (id, name)
+create table annot_wf_bool
+ ( id varchar(256) references workflow (id) on delete cascade,
+ name varchar(2048),
+ value boolean,
+ primary key (id, name)
);
-- extrainfo stores lines generated by the SWIFT_EXTRA_INFO feature
@@ -211,29 +211,29 @@
-- terms of accessors and constructors.
-- It is unclear which is the better representation.
-CREATE TABLE createarray
- ( id VARCHAR(256) PRIMARY KEY
+create table createarray
+ ( id varchar(256) primary key
);
-CREATE TABLE createarray_member
- ( array_id VARCHAR(256) REFERENCES createarray (id) ON DELETE CASCADE,
- ix VARCHAR(256),
- member_id VARCHAR(256),
- PRIMARY KEY (array_id, ix)
+create table createarray_member
+ ( array_id varchar(256) references createarray (id) on delete cascade,
+ ix varchar(256),
+ member_id varchar(256),
+ primary key (array_id, ix)
);
-- TODO step
-CREATE TABLE array_range
- ( array_id VARCHAR(256) REFERENCES createarray (id) ON DELETE CASCADE,
- from_id VARCHAR(256),
- to_id VARCHAR(256),
- step_id VARCHAR(256), -- nullable, if step is unspecified
- PRIMARY KEY (array_id,from_id,to_id,step_id)
+create table array_range
+ ( array_id varchar(256) references createarray (id) on delete cascade,
+ from_id varchar(256),
+ to_id varchar(256),
+ step_id varchar(256), -- nullable, if step is unspecified
+ primary key (array_id,from_id,to_id,step_id)
);
-- this GRANT does not work for sqlite; you'll get a syntax error but
-- ignore it, as it is not needed in sqlite
-GRANT ALL ON
+grant all on
dataset,
file,
variable,
@@ -256,4 +256,4 @@
createarray,
createarray_member,
array_range
-TO public, operators;
+to public, operators;
More information about the Swift-commit
mailing list