[Swift-commit] r3397 - in provenancedb: . apps/oops
noreply at svn.ci.uchicago.edu
Mon Jun 21 13:23:45 CDT 2010
Author: lgadelha
Date: 2010-06-21 13:23:44 -0500 (Mon, 21 Jun 2010)
New Revision: 3397
Modified:
provenancedb/apps/oops/oops_extractor.sh
provenancedb/prov-init.sql
Log:
Modified: provenancedb/apps/oops/oops_extractor.sh
===================================================================
--- provenancedb/apps/oops/oops_extractor.sh 2010-06-18 21:21:53 UTC (rev 3396)
+++ provenancedb/apps/oops/oops_extractor.sh 2010-06-21 18:23:44 UTC (rev 3397)
@@ -6,63 +6,78 @@
# OOPS' Swift logs.
PROVDB_HOME=~/provenancedb
-PROTESTS_HOME=~/protests
+PROTESTS_HOME=/home/aashish/CASP
+IMPORT_HOME=~/protests
source $PROVDB_HOME/etc/provenance.config
# provdb_imported records runs already imported to the provenance database
-cd $PROTESTS_HOME
+cd $IMPORT_HOME
if [ ! -a provdb_imported ]; then
touch provdb_imported
fi
-
-for i in `ls | grep run.loops`;
-do
- cd $PROTESTS_HOME
- if ! grep $i provdb_imported; then
- if grep "Swift finished with no errors" $i/psim.loops-*.log; then
- cd swift-logs
- for j in `ls ../$i | grep psim.loops-`; do
- ln -s ../$i/$j
- done
- cd import
- # swift-prov-import-all-logs also controls what already has been
- # imported, so it does not repeat work
- $PROVDB_HOME/swift-prov-import-all-logs
- cd $PROTESTS_HOME
- echo $i >> provdb_imported
-
- # annotate workflows with their oops runid
- OOPS_RUN_ID=`echo $i | awk -F . '{print $3}'`
- LOG_FILENAME=`ls $i | grep psim.loops- | grep "\."log$`
- WORKFLOW_ID=`echo "select workflow_id from known_workflows where workflow_log_filename like '%$LOG_FILENAME%'" | $SQLCMD -t | awk '{print $1}'`
- echo "insert into annotations values ('$WORKFLOW_ID','oops_run_id','$OOPS_RUN_ID');" | $SQLCMD
-
- # annotate dataset with scientific parameters passed to doLoopRound
-
- # TODO: check why it is not recording doLoopRound in processes_in_workflows
- #echo "select dataset_filenames.dataset_id,dataset_filenames.filename from dataset_usage,invocation_procedure_names,dataset_containment,dataset_filenames,processes_in_workflows where dataset_usage.process_id=invocation_procedure_names.execute_id and dataset_containment.inner_dataset_id=dataset_filenames.dataset_id and procedure_name='loopModel' and param_name='d' and dataset_containment.outer_dataset_id=dataset_usage.dataset_id and dataset_filenames.filename like '%.params%' and processes_in_workflows.process_id=dataset_usage.process_id and processes_in_workflows.workflow_id='$WORKFLOW_ID';" > query.sql
-
- # using this as a workaround for the problem above, it will return nSim identical tuples
- echo "select dataset_filenames.dataset_id,dataset_filenames.filename from dataset_usage,invocation_procedure_names,dataset_containment,dataset_filenames,processes_in_workflows where dataset_usage.process_id=invocation_procedure_names.execute_id and dataset_containment.inner_dataset_id=dataset_filenames.dataset_id and procedure_name='loopModel' and param_name='d' and dataset_containment.outer_dataset_id=dataset_usage.dataset_id and dataset_filenames.filename like '%.params%' and processes_in_workflows.process_id=dataset_usage.process_id and processes_in_workflows.workflow_id='$WORKFLOW_ID';" > query.sql
-
- $SQLCMD -t -A -F " " -f query.sql -o result.txt
-
- #DATASET_ID=`awk '{print $1}' result.txt`
- DATASET_ID=`awk '{if (NR==1) print $1}' result.txt`
-
- #FILENAME=`awk '{print $2}' result.txt | sed 's/file:\/\/localhost\///g'`
- FILENAME=`awk '{if (NR==1) print $2}' result.txt | sed 's/file:\/\/localhost\///g'`
-
- cd $PROTESTS_HOME/run.loops.$OOPS_RUN_ID
-
- while read line
- do
- NAME=`echo $line | awk 'BEGIN { FS = "=" }; {print $1}'`
- VALUE=`echo $line | awk 'BEGIN { FS = "=" }; {print $2}'`
- echo "insert into annotations values ('$DATASET_ID', '$NAME', '$VALUE');" | $SQLCMD
- done < $FILENAME
+cd $PROTESTS_HOME
+for k in `ls -1`;
+do
+ cd $PROTESTS_HOME/$k
+ for i in `ls | grep run.loops`;
+ do
+ cd $IMPORT_HOME
+ if ! grep --silent $i provdb_imported; then
+ if grep --silent "Swift finished with no errors" $PROTESTS_HOME/$k/$i/psim.loops-*.log; then
+ cd swift-logs
+ for j in `ls $PROTESTS_HOME/$k/$i | grep psim.loops-`; do
+ ln -s $PROTESTS_HOME/$k/$i/$j
+ done
+ cd import
+ # swift-prov-import-all-logs also controls what already has been
+ # imported, so it does not repeat work
+ $PROVDB_HOME/swift-prov-import-all-logs
+ cd $IMPORT_HOME
+ echo $i >> provdb_imported
+ cd swift-logs
+ # annotate workflows with their oops runid
+ OOPS_RUN_ID=`echo $i | awk -F . '{print $3}'`
+ cd $PROTESTS_HOME/$k/$i
+ LOG_FILENAME=`ls | grep psim.loops- | grep "\."log$`
+ WORKFLOW_ID=`echo "select workflow_id from known_workflows where workflow_log_filename like '%$LOG_FILENAME%'" | $SQLCMD -t | awk '{print $1}'`
+ cd $IMPORT_HOME/swift-logs
+ echo "insert into workflow_annotations_varchar values ('$WORKFLOW_ID','oops_run_id','$OOPS_RUN_ID');" | $SQLCMD
+
+ # using this as a workaround for the problem above, it will return nSim identical tuples
+ echo "select dataset_filenames.dataset_id,dataset_filenames.filename from dataset_usage,invocation_procedure_names,dataset_containment,dataset_filenames,processes_in_workflows where dataset_usage.process_id=invocation_procedure_names.execute_id and dataset_containment.inner_dataset_id=dataset_filenames.dataset_id and procedure_name='loopModel' and param_name='d' and dataset_containment.outer_dataset_id=dataset_usage.dataset_id and dataset_filenames.filename like '%.params%' and processes_in_workflows.process_id=dataset_usage.process_id and processes_in_workflows.workflow_id='$WORKFLOW_ID';" > query.sql
+
+ $SQLCMD -t -A -F " " -f query.sql -o result.txt
+
+ #DATASET_ID=`awk '{print $1}' result.txt`
+ DATASET_ID=`awk '{if (NR==1) print $1}' result.txt`
+
+ #FILENAME=`awk '{print $2}' result.txt | sed 's/file:\/\/localhost\///g'`
+ FILENAME=`awk '{if (NR==1) print $2}' result.txt | sed 's/file:\/\/localhost\///g'`
+
+ cd $PROTESTS_HOME/$k/run.loops.$OOPS_RUN_ID
+
+ while read line; do
+ NAME=`echo $line | awk 'BEGIN { FS = "=" }; {print $1}'`
+ if [ "$NAME" = "SAMPLE RANGE" ]; then
+ VALUE1=`echo $line | awk 'BEGIN { FS = "=" }; {print $2}' | awk 'BEGIN { FS = "-" }; {print $1}'`
+ VALUE2=`echo $line | awk 'BEGIN { FS = "=" }; {print $2}' | awk 'BEGIN { FS = "-" }; {print $2}'`
+ echo "insert into dataset_annotations_numeric values ('$DATASET_ID', '$NAME BEGIN', $VALUE1);" | $SQLCMD
+ echo "insert into dataset_annotations_numeric values ('$DATASET_ID', '$NAME END', $VALUE2);" | $SQLCMD
+ fi
+ if [ "$NAME" = "RESTRAIN DISTANCE" ]; then
+ VALUE1=`echo $line | awk 'BEGIN { FS = "=" }; {print $2}' | awk 'BEGIN { FS = "," }; {print $1}'`
+ VALUE2=`echo $line | awk 'BEGIN { FS = "=" }; {print $2}' | awk 'BEGIN { FS = "," }; {print $2}'`
+ echo "insert into dataset_annotations_numeric values ('$DATASET_ID', '$NAME 1', $VALUE1);" | $SQLCMD
+ echo "insert into dataset_annotations_numeric values ('$DATASET_ID', '$NAME 2', $VALUE2);" | $SQLCMD
+ fi
+ if [ "$NAME" = "MAXIMUM NUMBER OF STEPS" ]; then
+ VALUE=`echo $line | awk 'BEGIN { FS = "=" }; {print $2}'`
+ echo "insert into dataset_annotations_numeric values ('$DATASET_ID', '$NAME', $VALUE);" | $SQLCMD
+ fi
+ done < $FILENAME
+ fi
fi
- fi
+ done
done
Modified: provenancedb/prov-init.sql
===================================================================
--- provenancedb/prov-init.sql 2010-06-18 21:21:53 UTC (rev 3396)
+++ provenancedb/prov-init.sql 2010-06-21 18:23:44 UTC (rev 3397)
@@ -17,23 +17,30 @@
DROP TABLE createarray;
DROP TABLE createarray_member;
DROP TABLE array_range;
-DROP TABLE annotations;
-
-
+DROP TABLE dataset_annotations_numeric;
+DROP TABLE dataset_annotations_varchar;
+DROP TABLE dataset_annotations_boolean;
+DROP TABLE process_annotations_numeric;
+DROP TABLE process_annotations_varchar;
+DROP TABLE process_annotations_boolean;
+DROP TABLE workflow_annotations_numeric;
+DROP TABLE workflow_annotations_varchar;
+DROP TABLE workflow_annotations_boolean;
-- associates each process with its containing workflow
-- TODO - perhaps a workflow is itself a big big process?
-- in which case this looks very much like a compound/app
-- containment?
CREATE TABLE processes_in_workflows
- (workflow_id char(256),
- process_id char(256)
+ (workflow_id varchar(2048),
+ process_id varchar(2048),
+ primary key (workflow_id, process_id)
);
-- processes gives information about each process (in the OPM sense)
-- it is augmented by information in other tables
CREATE TABLE processes
- (id char(256) PRIMARY KEY, -- a uri
- type char(16) -- specifies the type of process. for any type, it
+ (id varchar(2048) PRIMARY KEY, -- a uri
+ type varchar(16) -- specifies the type of process. for any type, it
-- must be the case that the specific type table
-- has an entry for this process.
-- Having this type here seems poor normalisation, though?
@@ -44,12 +51,12 @@
-- each execute is identified by a unique URI. other information from
-- swift logs is also stored here. an execute is an OPM process.
CREATE TABLE executes
- (id char(256) PRIMARY KEY, -- actually foreign key to processes
+ (id varchar(2048) PRIMARY KEY, -- actually foreign key to processes
starttime numeric,
duration numeric,
- finalstate char(256),
- app char(256),
- scratch char(256)
+ finalstate varchar(2048),
+ app varchar(2048),
+ scratch varchar(2048)
);
-- this gives information about each execute2, which is an attempt to
@@ -57,12 +64,12 @@
-- information such as wrapper logs
CREATE TABLE execute2s
- (id char(256) PRIMARY KEY,
- execute_id char(256), -- secondary key to executes and processes tables
+ (id varchar(2048) PRIMARY KEY,
+ execute_id varchar(2048), -- secondary key to executes and processes tables
starttime numeric,
duration numeric,
- finalstate char(256),
- site char(256)
+ finalstate varchar(2048),
+ site varchar(2048)
);
-- dataset_usage records usage relationships between processes and datasets;
@@ -74,11 +81,11 @@
-- dataset_id for common queries? maybe add arbitrary ID for sake of it?
CREATE TABLE dataset_usage
- (process_id char(256), -- foreign key but not enforced because maybe process
+ (process_id varchar(2048), -- foreign key but not enforced because maybe process
-- doesn't exist at time. same type as processes.id
direction char(1), -- I or O for input or output
- dataset_id char(256), -- this will perhaps key against dataset table
- param_name char(256) -- the name of the parameter in this execute that
+ dataset_id varchar(2048), -- this will perhaps key against dataset table
+ param_name varchar(2048) -- the name of the parameter in this execute that
-- this dataset was bound to. sometimes this must
-- be contrived (for example, in positional varargs)
);
@@ -89,11 +96,9 @@
-- TODO probably desirable that this is part of executes table
-- but for now this is the easiest to pull data from logs.
-
--- TODO primary key should be execute_id
CREATE TABLE invocation_procedure_names
- (execute_id char(256),
- procedure_name char(256)
+ (execute_id varchar(2048) PRIMARY KEY,
+ procedure_name varchar(2048)
);
@@ -107,19 +112,17 @@
-- a containment hierarchy. The relationship (such as array index or
-- structure member name) should also be stored in this table.
CREATE TABLE dataset_containment
- ( outer_dataset_id char(256),
- inner_dataset_id char(256)
+ ( outer_dataset_id varchar(2048),
+ inner_dataset_id varchar(2048)
);
-- dataset_filenames stores the filename mapped to each dataset. As some
-- datasets do not have filenames, it should not be expected that
-- every dataset will have a row in this table
-
--- TODO dataset_id should be primary key
CREATE TABLE dataset_filenames
- ( dataset_id char(256),
- filename char(256)
+ ( dataset_id varchar(2048) PRIMARY KEY,
+ filename varchar(2048)
);
-- dataset_values stores the value for each dataset which is known to have
@@ -128,8 +131,8 @@
-- example) SQL numerical operations should not be expected to work, even
-- though the user knows that a particular dataset stores a numeric value.
CREATE TABLE dataset_values
- ( dataset_id char(256), -- should be primary key
- value char(256)
+ ( dataset_id varchar(2048) PRIMARY KEY,
+ value varchar(2048)
);
-- The above dataset_* tables are the original containment representation
@@ -139,21 +142,21 @@
-- It is unclear which is the better representation.
CREATE TABLE createarray
- ( array_id char(256)
+ ( array_id varchar(2048)
);
CREATE TABLE createarray_member
- ( array_id char(256),
- ix char(256),
- member_id char(256)
+ ( array_id varchar(2048),
+ ix varchar(2048),
+ member_id varchar(2048)
);
-- TODO step
CREATE TABLE array_range
- ( array_id char(256),
- from_id char(256),
- to_id char(256),
- step_id char(256) -- nullable, if step is unspecified
+ ( array_id varchar(2048),
+ from_id varchar(2048),
+ to_id varchar(2048),
+ step_id varchar(2048) -- nullable, if step is unspecified
);
-- known_workflows stores some information about each workflow log that has
@@ -161,36 +164,90 @@
-- status.
CREATE TABLE known_workflows
(
- workflow_id char(256),
- workflow_log_filename char(256),
- version char(256),
- importstatus char(256)
+ workflow_id varchar(2048) PRIMARY KEY,
+ workflow_log_filename varchar(2048),
+ version varchar(2048),
+ importstatus varchar(2048)
);
-- workflow_events stores the start time and duration for each workflow
-- that has been successfully imported.
CREATE TABLE workflow_events
- ( workflow_id char(256),
+ ( workflow_id varchar(2048) PRIMARY KEY,
starttime numeric,
duration numeric
);
-- extrainfo stores lines generated by the SWIFT_EXTRA_INFO feature
CREATE TABLE extrainfo
- ( execute2id char(256),
- extrainfo char(1024)
+ ( execute2id varchar(2048),
+ extrainfo varchar(1024)
);
-- annotations
-CREATE TABLE annotations
- ( id char(256), -- either dataset_id, process_id, or workflow_id
- name char(256),
- value char(256)
+CREATE TABLE dataset_annotations_numeric
+ ( dataset_id varchar(2048),
+ name varchar(2048),
+ value numeric,
+ primary key(dataset_id, name)
);
+CREATE TABLE dataset_annotations_varchar
+ ( dataset_id varchar(2048),
+ name varchar(2048),
+ value varchar(4096),
+ primary key(dataset_id, name)
+ );
+
+CREATE TABLE dataset_annotations_boolean
+ ( dataset_id varchar(2048),
+ name varchar(2048),
+ value boolean,
+ primary key(dataset_id, name)
+ );
+
+CREATE TABLE process_annotations_numeric
+ ( process_id varchar(2048),
+ name varchar(2048),
+ value numeric,
+ primary key(process_id, name)
+ );
+
+CREATE TABLE process_annotations_varchar
+ ( process_id varchar(2048),
+ name varchar(2048),
+ value varchar(1024),
+ primary key(process_id, name)
+ );
+
+CREATE TABLE process_annotations_boolean
+ ( process_id varchar(2048),
+ name varchar(2048),
+ value boolean,
+ primary key(process_id, name)
+ );
+
+CREATE TABLE workflow_annotations_numeric
+ ( workflow_id varchar(2048),
+ name varchar(2048),
+ value numeric,
+ primary key(workflow_id, name)
+ );
+
+CREATE TABLE workflow_annotations_varchar
+ ( workflow_id varchar(2048),
+ name varchar(2048),
+ value varchar(1024),
+ primary key(workflow_id, name)
+ );
+
+CREATE TABLE workflow_annotations_boolean
+ ( workflow_id varchar(2048),
+ name varchar(2048),
+ value boolean,
+ primary key(workflow_id, name)
+ );
-- this GRANT does not work for sqlite; you'll get a syntax error but
-- ignore it, as it is not needed in sqlite
grant all on dataset_containment, dataset_filenames, dataset_usage, processes_in_workflows, invocation_procedure_names, known_workflows, workflow_events to public, operators;
-
-
More information about the Swift-commit mailing list