[Swift-commit] r3397 - in provenancedb: . apps/oops
noreply at svn.ci.uchicago.edu
Mon Jun 21 13:23:45 CDT 2010
Author: lgadelha
Date: 2010-06-21 13:23:44 -0500 (Mon, 21 Jun 2010)
New Revision: 3397
Modified:
provenancedb/apps/oops/oops_extractor.sh
provenancedb/prov-init.sql
Log:
Modified: provenancedb/apps/oops/oops_extractor.sh
===================================================================
--- provenancedb/apps/oops/oops_extractor.sh 2010-06-18 21:21:53 UTC (rev 3396)
+++ provenancedb/apps/oops/oops_extractor.sh 2010-06-21 18:23:44 UTC (rev 3397)
@@ -6,63 +6,78 @@
# OOPS' Swift logs.
PROVDB_HOME=~/provenancedb
-PROTESTS_HOME=~/protests
+PROTESTS_HOME=/home/aashish/CASP
+IMPORT_HOME=~/protests
source $PROVDB_HOME/etc/provenance.config
# provdb_imported records runs already imported to the provenance database
-cd $PROTESTS_HOME
+cd $IMPORT_HOME
if [ ! -a provdb_imported ]; then
touch provdb_imported
fi
-
-for i in `ls | grep run.loops`;
-do
- cd $PROTESTS_HOME
- if ! grep $i provdb_imported; then
- if grep "Swift finished with no errors" $i/psim.loops-*.log; then
- cd swift-logs
- for j in `ls ../$i | grep psim.loops-`; do
- ln -s ../$i/$j
- done
- cd import
- # swift-prov-import-all-logs also controls what already has been
- # imported, so it does not repeat work
- $PROVDB_HOME/swift-prov-import-all-logs
- cd $PROTESTS_HOME
- echo $i >> provdb_imported
-
- # annotate workflows with their oops runid
- OOPS_RUN_ID=`echo $i | awk -F . '{print $3}'`
- LOG_FILENAME=`ls $i | grep psim.loops- | grep "\."log$`
- WORKFLOW_ID=`echo "select workflow_id from known_workflows where workflow_log_filename like '%$LOG_FILENAME%'" | $SQLCMD -t | awk '{print $1}'`
- echo "insert into annotations values ('$WORKFLOW_ID','oops_run_id','$OOPS_RUN_ID');" | $SQLCMD
-
- # annotate dataset with scientific parameters passed to doLoopRound
-
- # TODO: check why it is not recording doLoopRound in processes_in_workflows
- #echo "select dataset_filenames.dataset_id,dataset_filenames.filename from dataset_usage,invocation_procedure_names,dataset_containment,dataset_filenames,processes_in_workflows where dataset_usage.process_id=invocation_procedure_names.execute_id and dataset_containment.inner_dataset_id=dataset_filenames.dataset_id and procedure_name='loopModel' and param_name='d' and dataset_containment.outer_dataset_id=dataset_usage.dataset_id and dataset_filenames.filename like '%.params%' and processes_in_workflows.process_id=dataset_usage.process_id and processes_in_workflows.workflow_id='$WORKFLOW_ID';" > query.sql
-
- # using this as a workaround for the problem above, it will return nSim identical tuples
- echo "select dataset_filenames.dataset_id,dataset_filenames.filename from dataset_usage,invocation_procedure_names,dataset_containment,dataset_filenames,processes_in_workflows where dataset_usage.process_id=invocation_procedure_names.execute_id and dataset_containment.inner_dataset_id=dataset_filenames.dataset_id and procedure_name='loopModel' and param_name='d' and dataset_containment.outer_dataset_id=dataset_usage.dataset_id and dataset_filenames.filename like '%.params%' and processes_in_workflows.process_id=dataset_usage.process_id and processes_in_workflows.workflow_id='$WORKFLOW_ID';" > query.sql
-
- $SQLCMD -t -A -F " " -f query.sql -o result.txt
-
- #DATASET_ID=`awk '{print $1}' result.txt`
- DATASET_ID=`awk '{if (NR==1) print $1}' result.txt`
-
- #FILENAME=`awk '{print $2}' result.txt | sed 's/file:\/\/localhost\///g'`
- FILENAME=`awk '{if (NR==1) print $2}' result.txt | sed 's/file:\/\/localhost\///g'`
-
- cd $PROTESTS_HOME/run.loops.$OOPS_RUN_ID
-
- while read line
- do
- NAME=`echo $line | awk 'BEGIN { FS = "=" }; {print $1}'`
- VALUE=`echo $line | awk 'BEGIN { FS = "=" }; {print $2}'`
- echo "insert into annotations values ('$DATASET_ID', '$NAME', '$VALUE');" | $SQLCMD
- done < $FILENAME
+cd $PROTESTS_HOME
+for k in `ls -1`;
+do
+ cd $PROTESTS_HOME/$k
+ for i in `ls | grep run.loops`;
+ do
+ cd $IMPORT_HOME
+ if ! grep --silent $i provdb_imported; then
+ if grep --silent "Swift finished with no errors" $PROTESTS_HOME/$k/$i/psim.loops-*.log; then
+ cd swift-logs
+ for j in `ls $PROTESTS_HOME/$k/$i | grep psim.loops-`; do
+ ln -s $PROTESTS_HOME/$k/$i/$j
+ done
+ cd import
+ # swift-prov-import-all-logs also controls what already has been
+ # imported, so it does not repeat work
+ $PROVDB_HOME/swift-prov-import-all-logs
+ cd $IMPORT_HOME
+ echo $i >> provdb_imported
+ cd swift-logs
+ # annotate workflows with their oops runid
+ OOPS_RUN_ID=`echo $i | awk -F . '{print $3}'`
+ cd $PROTESTS_HOME/$k/$i
+ LOG_FILENAME=`ls | grep psim.loops- | grep "\."log$`
+ WORKFLOW_ID=`echo "select workflow_id from known_workflows where workflow_log_filename like '%$LOG_FILENAME%'" | $SQLCMD -t | awk '{print $1}'`
+ cd $IMPORT_HOME/swift-logs
+ echo "insert into workflow_annotations_varchar values ('$WORKFLOW_ID','oops_run_id','$OOPS_RUN_ID');" | $SQLCMD
+
+ # using this as a workaround for the problem above, it will return nSim identical tuples
+ echo "select dataset_filenames.dataset_id,dataset_filenames.filename from dataset_usage,invocation_procedure_names,dataset_containment,dataset_filenames,processes_in_workflows where dataset_usage.process_id=invocation_procedure_names.execute_id and dataset_containment.inner_dataset_id=dataset_filenames.dataset_id and procedure_name='loopModel' and param_name='d' and dataset_containment.outer_dataset_id=dataset_usage.dataset_id and dataset_filenames.filename like '%.params%' and processes_in_workflows.process_id=dataset_usage.process_id and processes_in_workflows.workflow_id='$WORKFLOW_ID';" > query.sql
+
+ $SQLCMD -t -A -F " " -f query.sql -o result.txt
+
+ #DATASET_ID=`awk '{print $1}' result.txt`
+ DATASET_ID=`awk '{if (NR==1) print $1}' result.txt`
+
+ #FILENAME=`awk '{print $2}' result.txt | sed 's/file:\/\/localhost\///g'`
+ FILENAME=`awk '{if (NR==1) print $2}' result.txt | sed 's/file:\/\/localhost\///g'`
+
+ cd $PROTESTS_HOME/$k/run.loops.$OOPS_RUN_ID
+
+ while read line; do
+ NAME=`echo $line | awk 'BEGIN { FS = "=" }; {print $1}'`
+ if [ "$NAME" = "SAMPLE RANGE" ]; then
+ VALUE1=`echo $line | awk 'BEGIN { FS = "=" }; {print $2}' | awk 'BEGIN { FS = "-" }; {print $1}'`
+ VALUE2=`echo $line | awk 'BEGIN { FS = "=" }; {print $2}' | awk 'BEGIN { FS = "-" }; {print $2}'`
+ echo "insert into dataset_annotations_numeric values ('$DATASET_ID', '$NAME BEGIN', $VALUE1);" | $SQLCMD
+ echo "insert into dataset_annotations_numeric values ('$DATASET_ID', '$NAME END', $VALUE2);" | $SQLCMD
+ fi
+ if [ "$NAME" = "RESTRAIN DISTANCE" ]; then
+ VALUE1=`echo $line | awk 'BEGIN { FS = "=" }; {print $2}' | awk 'BEGIN { FS = "," }; {print $1}'`
+ VALUE2=`echo $line | awk 'BEGIN { FS = "=" }; {print $2}' | awk 'BEGIN { FS = "," }; {print $2}'`
+ echo "insert into dataset_annotations_numeric values ('$DATASET_ID', '$NAME 1', $VALUE1);" | $SQLCMD
+ echo "insert into dataset_annotations_numeric values ('$DATASET_ID', '$NAME 2', $VALUE2);" | $SQLCMD
+ fi
+ if [ "$NAME" = "MAXIMUM NUMBER OF STEPS" ]; then
+ VALUE=`echo $line | awk 'BEGIN { FS = "=" }; {print $2}'`
+ echo "insert into dataset_annotations_numeric values ('$DATASET_ID', '$NAME', $VALUE);" | $SQLCMD
+ fi
+ done < $FILENAME
+ fi
fi
- fi
+ done
done
Modified: provenancedb/prov-init.sql
===================================================================
--- provenancedb/prov-init.sql 2010-06-18 21:21:53 UTC (rev 3396)
+++ provenancedb/prov-init.sql 2010-06-21 18:23:44 UTC (rev 3397)
@@ -17,23 +17,30 @@
DROP TABLE createarray;
DROP TABLE createarray_member;
DROP TABLE array_range;
-DROP TABLE annotations;
-
-
+DROP TABLE dataset_annotations_numeric;
+DROP TABLE dataset_annotations_varchar;
+DROP TABLE dataset_annotations_boolean;
+DROP TABLE process_annotations_numeric;
+DROP TABLE process_annotations_varchar;
+DROP TABLE process_annotations_boolean;
+DROP TABLE workflow_annotations_numeric;
+DROP TABLE workflow_annotations_varchar;
+DROP TABLE workflow_annotations_boolean;
-- associates each process with its containing workflow
-- TODO - perhaps a workflow is itself a big big process?
-- in which case this looks very much like a compound/app
-- containment?
CREATE TABLE processes_in_workflows
- (workflow_id char(256),
- process_id char(256)
+ (workflow_id varchar(2048),
+ process_id varchar(2048),
+ primary key (workflow_id, process_id)
);
-- processes gives information about each process (in the OPM sense)
-- it is augmented by information in other tables
CREATE TABLE processes
- (id char(256) PRIMARY KEY, -- a uri
- type char(16) -- specifies the type of process. for any type, it
+ (id varchar(2048) PRIMARY KEY, -- a uri
+ type varchar(16) -- specifies the type of process. for any type, it
-- must be the case that the specific type table
-- has an entry for this process.
-- Having this type here seems poor normalisation, though?
@@ -44,12 +51,12 @@
-- each execute is identified by a unique URI. other information from
-- swift logs is also stored here. an execute is an OPM process.
CREATE TABLE executes
- (id char(256) PRIMARY KEY, -- actually foreign key to processes
+ (id varchar(2048) PRIMARY KEY, -- actually foreign key to processes
starttime numeric,
duration numeric,
- finalstate char(256),
- app char(256),
- scratch char(256)
+ finalstate varchar(2048),
+ app varchar(2048),
+ scratch varchar(2048)
);
-- this gives information about each execute2, which is an attempt to
@@ -57,12 +64,12 @@
-- information such as wrapper logs
CREATE TABLE execute2s
- (id char(256) PRIMARY KEY,
- execute_id char(256), -- secondary key to executes and processes tables
+ (id varchar(2048) PRIMARY KEY,
+ execute_id varchar(2048), -- secondary key to executes and processes tables
starttime numeric,
duration numeric,
- finalstate char(256),
- site char(256)
+ finalstate varchar(2048),
+ site varchar(2048)
);
-- dataset_usage records usage relationships between processes and datasets;
@@ -74,11 +81,11 @@
-- dataset_id for common queries? maybe add arbitrary ID for sake of it?
CREATE TABLE dataset_usage
- (process_id char(256), -- foreign key but not enforced because maybe process
+ (process_id varchar(2048), -- foreign key but not enforced because maybe process
-- doesn't exist at time. same type as processes.id
direction char(1), -- I or O for input or output
- dataset_id char(256), -- this will perhaps key against dataset table
- param_name char(256) -- the name of the parameter in this execute that
+ dataset_id varchar(2048), -- this will perhaps key against dataset table
+ param_name varchar(2048) -- the name of the parameter in this execute that
-- this dataset was bound to. sometimes this must
-- be contrived (for example, in positional varargs)
);
@@ -89,11 +96,9 @@
-- TODO probably desirable that this is part of executes table
-- but for now this is the easiest to pull data from logs.
-
--- TODO primary key should be execute_id
CREATE TABLE invocation_procedure_names
- (execute_id char(256),
- procedure_name char(256)
+ (execute_id varchar(2048) PRIMARY KEY,
+ procedure_name varchar(2048)
);
@@ -107,19 +112,17 @@
-- a containment hierarchy. The relationship (such as array index or
-- structure member name) should also be stored in this table.
CREATE TABLE dataset_containment
- ( outer_dataset_id char(256),
- inner_dataset_id char(256)
+ ( outer_dataset_id varchar(2048),
+ inner_dataset_id varchar(2048)
);
-- dataset_filenames stores the filename mapped to each dataset. As some
-- datasets do not have filenames, it should not be expected that
-- every dataset will have a row in this table
-
--- TODO dataset_id should be primary key
CREATE TABLE dataset_filenames
- ( dataset_id char(256),
- filename char(256)
+ ( dataset_id varchar(2048) PRIMARY KEY,
+ filename varchar(2048)
);
-- dataset_values stores the value for each dataset which is known to have
@@ -128,8 +131,8 @@
-- example) SQL numerical operations should not be expected to work, even
-- though the user knows that a particular dataset stores a numeric value.
CREATE TABLE dataset_values
- ( dataset_id char(256), -- should be primary key
- value char(256)
+ ( dataset_id varchar(2048) PRIMARY KEY,
+ value varchar(2048)
);
-- The above dataset_* tables are the original containment representation
@@ -139,21 +142,21 @@
-- It is unclear which is the better representation.
CREATE TABLE createarray
- ( array_id char(256)
+ ( array_id varchar(2048)
);
CREATE TABLE createarray_member
- ( array_id char(256),
- ix char(256),
- member_id char(256)
+ ( array_id varchar(2048),
+ ix varchar(2048),
+ member_id varchar(2048)
);
-- TODO step
CREATE TABLE array_range
- ( array_id char(256),
- from_id char(256),
- to_id char(256),
- step_id char(256) -- nullable, if step is unspecified
+ ( array_id varchar(2048),
+ from_id varchar(2048),
+ to_id varchar(2048),
+ step_id varchar(2048) -- nullable, if step is unspecified
);
-- known_workflows stores some information about each workflow log that has
@@ -161,36 +164,90 @@
-- status.
CREATE TABLE known_workflows
(
- workflow_id char(256),
- workflow_log_filename char(256),
- version char(256),
- importstatus char(256)
+ workflow_id varchar(2048) PRIMARY KEY,
+ workflow_log_filename varchar(2048),
+ version varchar(2048),
+ importstatus varchar(2048)
);
-- workflow_events stores the start time and duration for each workflow
-- that has been successfully imported.
CREATE TABLE workflow_events
- ( workflow_id char(256),
+ ( workflow_id varchar(2048) PRIMARY KEY,
starttime numeric,
duration numeric
);
-- extrainfo stores lines generated by the SWIFT_EXTRA_INFO feature
CREATE TABLE extrainfo
- ( execute2id char(256),
- extrainfo char(1024)
+ ( execute2id varchar(2048),
+ extrainfo varchar(1024)
);
-- annotations
-CREATE TABLE annotations
- ( id char(256), -- either dataset_id, process_id, or workflow_id
- name char(256),
- value char(256)
+CREATE TABLE dataset_annotations_numeric
+ ( dataset_id varchar(2048),
+ name varchar(2048),
+ value numeric,
+ primary key(dataset_id, name)
);
+CREATE TABLE dataset_annotations_varchar
+ ( dataset_id varchar(2048),
+ name varchar(2048),
+ value varchar(4096),
+ primary key(dataset_id, name)
+ );
+
+CREATE TABLE dataset_annotations_boolean
+ ( dataset_id varchar(2048),
+ name varchar(2048),
+ value boolean,
+ primary key(dataset_id, name)
+ );
+
+CREATE TABLE process_annotations_numeric
+ ( process_id varchar(2048),
+ name varchar(2048),
+ value numeric,
+ primary key(process_id, name)
+ );
+
+CREATE TABLE process_annotations_varchar
+ ( process_id varchar(2048),
+ name varchar(2048),
+ value varchar(1024),
+ primary key(process_id, name)
+ );
+
+CREATE TABLE process_annotations_boolean
+ ( process_id varchar(2048),
+ name varchar(2048),
+ value boolean,
+ primary key(process_id, name)
+ );
+
+CREATE TABLE workflow_annotations_numeric
+ ( workflow_id varchar(2048),
+ name varchar(2048),
+ value numeric,
+ primary key(workflow_id, name)
+ );
+
+CREATE TABLE workflow_annotations_varchar
+ ( workflow_id varchar(2048),
+ name varchar(2048),
+ value varchar(1024),
+ primary key(workflow_id, name)
+ );
+
+CREATE TABLE workflow_annotations_boolean
+ ( workflow_id varchar(2048),
+ name varchar(2048),
+ value boolean,
+ primary key(workflow_id, name)
+ );
-- this GRANT does not work for sqlite; you'll get a syntax error but
-- ignore it, as it is not needed in sqlite
grant all on dataset_containment, dataset_filenames, dataset_usage, processes_in_workflows, invocation_procedure_names, known_workflows, workflow_events to public, operators;
-
-
More information about the Swift-commit mailing list