[Swift-commit] r3706 - provenancedb
noreply@svn.ci.uchicago.edu
Wed Nov 10 12:18:39 CST 2010
Author: lgadelha
Date: 2010-11-10 12:18:39 -0600 (Wed, 10 Nov 2010)
New Revision: 3706
Modified:
provenancedb/README
provenancedb/prepare-provenance-chart
provenancedb/prov-init.sql
provenancedb/prov-sql-generate-transitive-closures.sql
provenancedb/prov-to-sql.sh
provenancedb/swift-prov-import-all-logs
Log:
New database schema, and corrections to import scripts.
Modified: provenancedb/README
===================================================================
--- provenancedb/README 2010-11-08 20:30:09 UTC (rev 3705)
+++ provenancedb/README 2010-11-10 18:18:39 UTC (rev 3706)
@@ -1,12 +1,9 @@
-provenance database prototype
+Edit the file etc/provenance.config to define the local configuration. Set the variable LOGREPO to the directory containing the log files to import. For instance:
-At the moment, you have pretty much no chance of getting this working.
+export LOGREPO=~/swift-logs/
-Working notes are in docbook form in provenance.xml
+The command used to connect to the database should be defined in the variable SQLCMD. For example, to connect to CI's PostgreSQL database:
-To import, run:
+export SQLCMD="psql -h db.ci.uchicago.edu -U provdb provdb"
- ./import-all
-
-with no parameters.
-
+The script ./swift-prov-import-all-logs imports provenance information from the log files in $LOGREPO into the database. The command-line option rebuild initializes the database before importing. The file prov-init.sql contains the database schema.
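
For reference, etc/provenance.config is just a shell fragment sourced by the import scripts; a minimal sketch (both values are illustrative, not shipped defaults):

  # directory holding the Swift log files to be imported
  export LOGREPO=~/swift-logs/
  # command used to reach the provenance database (here: CI PostgreSQL)
  export SQLCMD="psql -h db.ci.uchicago.edu -U provdb provdb"
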
Modified: provenancedb/prepare-provenance-chart
===================================================================
--- provenancedb/prepare-provenance-chart 2010-11-08 20:30:09 UTC (rev 3705)
+++ provenancedb/prepare-provenance-chart 2010-11-10 18:18:39 UTC (rev 3706)
@@ -8,8 +8,8 @@
export RUNID=$(basename $1 .log)
-export WFID="tag:benc@ci.uchicago.edu,2008:swiftlogs:execute:${RUNID}:"
-export EXECUTE2PREFIX="tag:benc@ci.uchicago.edu,2008:swiftlogs:execute2:${RUNID}:"
+export WFID="tag:ci.uchicago.edu,2008:swiftlogs:execute:${RUNID}:"
+export EXECUTE2PREFIX="tag:ci.uchicago.edu,2008:swiftlogs:execute2:${RUNID}:"
# will output log information about datasets from a log file passed as $1
@@ -30,8 +30,8 @@
cat $1 | grep ' OPERATOR ' | sed 's/^.*thread=\([^ ]*\) operator="\([^ ]*\)" lhs=\([^ ]*\) rhs=\([^ ]*\) result=\([^ ]*\).*$/\1 \2 \3 \4 \5/' > operators.txt
-# 2009-03-19 19:15:35,244+0100 INFO vdl:arguments FUNCTION id=88000-0-4-4 name="filename" result=tag:benc@ci.uchicago.edu,2008:swift:dataset:20090319-1915-xj8flg13:720000000060
-# 2009-03-19 19:15:35,246+0100 INFO vdl:arguments FUNCTIONPARAMETER id=88001-0-4-4 input=tag:benc@ci.uchicago.edu,2008:swift:dataset:20090319-1915-xj8flg13:720000000058
+# 2009-03-19 19:15:35,244+0100 INFO vdl:arguments FUNCTION id=88000-0-4-4 name="filename" result=tag:ci.uchicago.edu,2008:swift:dataset:20090319-1915-xj8flg13:720000000060
+# 2009-03-19 19:15:35,246+0100 INFO vdl:arguments FUNCTIONPARAMETER id=88001-0-4-4 input=tag:ci.uchicago.edu,2008:swift:dataset:20090319-1915-xj8flg13:720000000058
cat $1 | grep ' FUNCTION ' | sed "s/^.*id=\([^ ]*\) name=\([^ ]*\) result=\([^ ]*\).*\$/$WFID\1 \2 \3/" > functions.txt
# the IDs in functions.txt should be unique...
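
To check what these extraction passes produce, prepare-provenance-chart can be run by hand on a single log (the log name below is hypothetical); it writes operators.txt, functions.txt and the other intermediate files into the current directory:

  # run the extraction passes over one Swift log (illustrative path)
  ./prepare-provenance-chart $LOGREPO/psim-20101110-1218-abcdefg.log
  # inspect a couple of the generated intermediate files
  head operators.txt functions.txt
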
Modified: provenancedb/prov-init.sql
===================================================================
--- provenancedb/prov-init.sql 2010-11-08 20:30:09 UTC (rev 3705)
+++ provenancedb/prov-init.sql 2010-11-10 18:18:39 UTC (rev 3706)
@@ -1,253 +1,259 @@
-
-- this is the schema definition used for the main relational provenance
-- implementation (in both sqlite3 and postgres)
-
-DROP TABLE processes;
-DROP TABLE executes;
-DROP TABLE execute2s;
-DROP TABLE dataset_usage;
-DROP TABLE invocation_procedure_names;
-DROP TABLE dataset_containment;
-DROP TABLE dataset_filenames;
-DROP TABLE processes_in_workflows;
-DROP TABLE dataset_values;
-DROP TABLE known_workflows;
-DROP TABLE workflow_events;
+
+-- drop in dependency order (children before the tables they reference),
+-- so that the foreign keys do not block the drops
+DROP TABLE annot_ds_num;
+DROP TABLE annot_ds_txt;
+DROP TABLE annot_ds_bool;
+DROP TABLE annot_p_num;
+DROP TABLE annot_p_txt;
+DROP TABLE annot_p_bool;
+DROP TABLE annot_wf_num;
+DROP TABLE annot_wf_txt;
+DROP TABLE annot_wf_bool;
+DROP TABLE ds_usage;
+DROP TABLE ds_containment;
+DROP TABLE file;
+DROP TABLE variable;
+DROP TABLE execute2;
+DROP TABLE execute;
+DROP TABLE process;
+DROP TABLE dataset;
+DROP TABLE workflow;
DROP TABLE extrainfo;
DROP TABLE createarray;
DROP TABLE createarray_member;
DROP TABLE array_range;
-DROP TABLE dataset_annotations_numeric;
-DROP TABLE dataset_annotations_varchar;
-DROP TABLE dataset_annotations_boolean;
-DROP TABLE process_annotations_numeric;
-DROP TABLE process_annotations_varchar;
-DROP TABLE process_annotations_boolean;
-DROP TABLE workflow_annotations_numeric;
-DROP TABLE workflow_annotations_varchar;
-DROP TABLE workflow_annotations_boolean;
--- associates each process with its containing workflow
--- TODO - perhaps a workflow is itself a big big process?
--- in which case this looks very much like a compound/app
--- containment?
-CREATE TABLE processes_in_workflows
- (workflow_id varchar(2048),
- process_id varchar(2048),
- primary key (workflow_id, process_id)
- );
--- processes gives information about each process (in the OPM sense)
--- it is augmented by information in other tables
-CREATE TABLE processes
- (id varchar(2048) PRIMARY KEY, -- a uri
- type varchar(16) -- specifies the type of process. for any type, it
- -- must be the case that the specific type table
- -- has an entry for this process.
- -- Having this type here seems poor normalisation, though?
+-- workflow stores some information about each workflow log that has
+-- been seen by the importer: the log filename, Swift version, and import
+-- status.
+-- It might be interesting to store an XML translation of the SwiftScript
+-- code here for prospective provenance/versioning.
+CREATE TABLE workflow
+ (id varchar(256) PRIMARY KEY,
+ log_filename varchar(2048),
+ swift_version varchar(16),
+ import_status varchar(16),
+ start_time numeric,
+ duration numeric
);
+-- workflow_run stores the start time and duration for each workflow
+-- that has been successfully imported.
+--CREATE TABLE wf_run
+-- (id varchar(256) PRIMARY KEY REFERENCES workflow (id) ON DELETE CASCADE,
+-- start_time numeric,
+-- duration numeric
+-- );
--- this gives information about each execute.
--- each execute is identified by a unique URI. other information from
--- swift logs is also stored here. an execute is an OPM process.
-CREATE TABLE executes
- (id varchar(2048) PRIMARY KEY, -- actually foreign key to processes
- starttime numeric,
- duration numeric,
- finalstate varchar(2048),
- app varchar(2048),
- scratch varchar(2048)
+-- dataset stores all dataset identifiers.
+CREATE TABLE dataset
+ (id varchar(256) PRIMARY KEY
);
--- this gives information about each execute2, which is an attempt to
--- perform an execution. the execute2 id is tied to per-execution-attempt
--- information such as wrapper logs
-
-CREATE TABLE execute2s
- (id varchar(2048) PRIMARY KEY,
- execute_id varchar(2048), -- secondary key to executes and processes tables
- starttime numeric,
- duration numeric,
- finalstate varchar(2048),
- site varchar(2048)
+-- file stores the filename mapped to each dataset.
+CREATE TABLE file
+ ( id varchar(256) PRIMARY KEY REFERENCES dataset (id) ON DELETE CASCADE,
+ filename varchar(2048)
);
--- dataset_usage records usage relationships between processes and datasets;
--- in SwiftScript terms, the input and output parameters for each
--- application procedure invocation; in OPM terms, the artificts which are
--- input to and output from each process that is a Swift execution
-
--- TODO: no primary key here. should probably index both on execute_id and on
--- dataset_id for common queries? maybe add arbitrary ID for sake of it?
-
-CREATE TABLE dataset_usage
- (process_id varchar(2048), -- foreign key but not enforced because maybe process
- -- doesn't exist at time. same type as processes.id
- direction char(1), -- I or O for input or output
- dataset_id varchar(2048), -- this will perhaps key against dataset table
- param_name varchar(2048) -- the name of the parameter in this execute that
- -- this dataset was bound to. sometimes this must
- -- be contrived (for example, in positional varargs)
+-- variable stores the value for each dataset which is known to have
+-- a value (which is all assigned primitive types). No attempt is made here
+-- to expose that value as an SQL type other than a string, and so (for
+-- example) SQL numerical operations should not be expected to work, even
+-- though the user knows that a particular dataset stores a numeric value.
+CREATE TABLE variable
+ ( id varchar(256) PRIMARY KEY REFERENCES dataset (id) ON DELETE CASCADE,
+ value varchar(2048)
);
-
--- invocation_procedure_name maps each execute ID to the name of its
--- SwiftScript procedure
-
--- TODO probably desirable that this is part of executes table
--- but for now this is the easiest to pull data from logs.
-CREATE TABLE invocation_procedure_names
- (execute_id varchar(2048) PRIMARY KEY,
- procedure_name varchar(2048)
- );
-
-
-- dataset_containment stores the containment hierarchy between
-- container datasets (arrays and structs) and their contents.
-
--- outer_dataset_id contains inner_dataset_id
-
+-- out_id contains in_id
-- TODO this should perhaps be replaced with a more OPM-like model of
-- constructors and accessors, rather than, or in addition to,
-- a containment hierarchy. The relationship (such as array index or
-- structure member name) should also be stored in this table.
-CREATE TABLE dataset_containment
- ( outer_dataset_id varchar(2048),
- inner_dataset_id varchar(2048)
+CREATE TABLE ds_containment
+ ( out_id varchar(256) REFERENCES dataset (id) ON DELETE CASCADE,
+ in_id varchar(256) REFERENCES dataset (id) ON DELETE CASCADE,
+ PRIMARY KEY (out_id,in_id)
);
-
--- dataset_filenames stores the filename mapped to each dataset. As some
--- datasets do not have filenames, it should not be expected that
--- every dataset will have a row in this table
-CREATE TABLE dataset_filenames
- ( dataset_id varchar(2048) PRIMARY KEY,
- filename varchar(2048)
+-- process gives information about each process (in the OPM sense).
+-- It is augmented by information in other tables.
+-- The type column specifies the type of the process; for any type, the
+-- corresponding type-specific table must have an entry for this process.
+-- Having this type here seems poor normalisation, though?
+-- process types: internal, rootthread, execute, function, compound, scope, operator
+-- maybe create a table for each type?
+CREATE TABLE process
+ (id varchar(256) PRIMARY KEY,
+ type varchar(16),
+ name varchar(256), -- in the case of an execute this refers to the transformation name in tc.data
+ workflow_id varchar(256) REFERENCES workflow (id) ON DELETE CASCADE -- normalize: workflow_id of sub-procedure determined
+ -- by compound procedure
);
--- dataset_values stores the value for each dataset which is known to have
--- a value (which is all assigned primitive types). No attempt is made here
--- to expose that value as an SQL type other than a string, and so (for
--- example) SQL numerical operations should not be expected to work, even
--- though the user knows that a particular dataset stores a numeric value.
-CREATE TABLE dataset_values
- ( dataset_id varchar(2048) PRIMARY KEY,
- value varchar(2048)
+-- this gives information about each execute.
+-- each execute is identified by a unique URI. other information from
+-- swift logs is also stored here. an execute is an OPM process.
+CREATE TABLE execute
+ (id varchar(256) PRIMARY KEY REFERENCES process (id) ON DELETE CASCADE,
+ procedure_name varchar(256), -- name of the app procedure that invokes the transformation
+ start_time numeric,
+ duration numeric,
+ final_state varchar(16),
+ scratch varchar(2048)
);
--- The above dataset_* tables are the original containment representation
--- of datasets.
--- Now here are some tables to represent complex datasets differently, in
--- terms of accessors and constructors.
--- It is unclear which is the better representation.
-
-CREATE TABLE createarray
- ( array_id varchar(2048)
+-- this gives information about each execute2, which is an attempt to
+-- perform an execution. the execute2 id is tied to per-execution-attempt
+-- information such as wrapper logs
+CREATE TABLE execute2
+ (id varchar(256) PRIMARY KEY,
+ execute_id varchar(256) REFERENCES execute (id) ON DELETE CASCADE,
+ start_time numeric,
+ duration numeric,
+ final_state varchar(16),
+ site varchar(256)
);
-CREATE TABLE createarray_member
- ( array_id varchar(2048),
- ix varchar(2048),
- member_id varchar(2048)
+-- ds_usage records usage relationships between processes and datasets;
+-- in SwiftScript terms, the input and output parameters for each
+-- application procedure invocation; in OPM terms, the artifacts which are
+-- input to and output from each process that is a Swift execution
+CREATE TABLE ds_usage
+ (process_id varchar(256) REFERENCES process(id) ON DELETE CASCADE,
+ direction char(1), -- I or O for input or output
+ dataset_id varchar(256) REFERENCES dataset(id) ON DELETE CASCADE,
+ param_name varchar(256), -- the name of the parameter in this execute that
+ -- this dataset was bound to. sometimes this must
+ -- be contrived (for example, in positional varargs)
+ PRIMARY KEY (process_id,direction,dataset_id,param_name)
);
--- TODO step
-CREATE TABLE array_range
- ( array_id varchar(2048),
- from_id varchar(2048),
- to_id varchar(2048),
- step_id varchar(2048) -- nullable, if step is unspecified
- );
-
--- known_workflows stores some information about each workflow log that has
--- been seen by the importer: the log filename, swift version and import
--- status.
-CREATE TABLE known_workflows
- (
- workflow_id varchar(2048) PRIMARY KEY,
- workflow_log_filename varchar(2048),
- version varchar(2048),
- importstatus varchar(2048)
- );
-
-
--- workflow_events stores the start time and duration for each workflow
--- that has been successfully imported.
-CREATE TABLE workflow_events
- ( workflow_id varchar(2048) PRIMARY KEY,
- starttime numeric,
- duration numeric
- );
-
--- extrainfo stores lines generated by the SWIFT_EXTRA_INFO feature
-CREATE TABLE extrainfo
- ( execute2id varchar(2048),
- extrainfo varchar(1024)
- );
-
-- annotations
-CREATE TABLE dataset_annotations_numeric
- ( dataset_id varchar(2048),
- name varchar(2048),
+CREATE TABLE annot_ds_num
+ ( id varchar(256) REFERENCES dataset (id) ON DELETE CASCADE,
+ name varchar(256),
value numeric,
- primary key(dataset_id, name)
+ PRIMARY KEY (id, name)
);
-CREATE TABLE dataset_annotations_varchar
- ( dataset_id varchar(2048),
- name varchar(2048),
- value varchar(4096),
- primary key(dataset_id, name)
+CREATE TABLE annot_ds_txt
+ ( id varchar(256) REFERENCES dataset (id) ON DELETE CASCADE,
+ name varchar(256),
+ value varchar(2048),
+ PRIMARY KEY (id, name)
);
-CREATE TABLE dataset_annotations_boolean
- ( dataset_id varchar(2048),
- name varchar(2048),
+CREATE TABLE annot_ds_bool
+ ( id varchar(256) REFERENCES dataset (id) ON DELETE CASCADE,
+ name varchar(256),
value boolean,
- primary key(dataset_id, name)
+ PRIMARY KEY (id, name)
);
-CREATE TABLE process_annotations_numeric
- ( process_id varchar(2048),
- name varchar(2048),
+CREATE TABLE annot_p_num
+ ( id varchar(256) REFERENCES process (id) ON DELETE CASCADE,
+ name varchar(256),
value numeric,
- primary key(process_id, name)
+ PRIMARY KEY (id, name)
);
-CREATE TABLE process_annotations_varchar
- ( process_id varchar(2048),
- name varchar(2048),
- value varchar(1024),
- primary key(process_id, name)
+CREATE TABLE annot_p_txt
+ ( id varchar(256) REFERENCES process (id) ON DELETE CASCADE,
+ name varchar(256),
+ value varchar(2048),
+ PRIMARY KEY (id, name)
);
-CREATE TABLE process_annotations_boolean
- ( process_id varchar(2048),
- name varchar(2048),
+CREATE TABLE annot_p_bool
+ ( id varchar(256) REFERENCES process (id) ON DELETE CASCADE,
+ name varchar(256),
value boolean,
- primary key(process_id, name)
+ PRIMARY KEY (id, name)
);
-CREATE TABLE workflow_annotations_numeric
- ( workflow_id varchar(2048),
- name varchar(2048),
+CREATE TABLE annot_wf_num
+ ( id varchar(256) REFERENCES workflow (id) ON DELETE CASCADE,
+ name varchar(256),
value numeric,
- primary key(workflow_id, name)
+ PRIMARY KEY (id, name)
);
-CREATE TABLE workflow_annotations_varchar
- ( workflow_id varchar(2048),
- name varchar(2048),
- value varchar(1024),
- primary key(workflow_id, name)
+CREATE TABLE annot_wf_txt
+ ( id varchar(256) REFERENCES workflow (id) ON DELETE CASCADE,
+ name varchar(256),
+ value varchar(2048),
+ PRIMARY KEY (id, name)
);
-CREATE TABLE workflow_annotations_boolean
- ( workflow_id varchar(2048),
- name varchar(2048),
+CREATE TABLE annot_wf_bool
+ ( id varchar(256) REFERENCES workflow (id) ON DELETE CASCADE,
+   name varchar(256),
value boolean,
- primary key(workflow_id, name)
+ PRIMARY KEY (id, name)
);
+
+-- extrainfo stores lines generated by the SWIFT_EXTRA_INFO feature
+-- extrainfo will be processed into annotation tables
+-- CREATE TABLE extrainfo
+-- ( id varchar(256) PRIMARY KEY REFERENCES execute (id) ON DELETE CASCADE,
+-- extrainfo varchar(2048)
+-- );
+
+-- The above dataset tables are the original containment representation
+-- of datasets.
+-- Now here are some tables to represent complex datasets differently, in
+-- terms of accessors and constructors.
+-- It is unclear which is the better representation.
+
+CREATE TABLE createarray
+ ( id varchar(256) PRIMARY KEY
+ );
+
+CREATE TABLE createarray_member
+ ( array_id varchar(256) REFERENCES createarray (id) ON DELETE CASCADE,
+ ix varchar(256),
+ member_id varchar(256),
+ PRIMARY KEY (array_id, ix)
+ );
+
+-- TODO step
+CREATE TABLE array_range
+ ( array_id varchar(256) REFERENCES createarray (id) ON DELETE CASCADE,
+ from_id varchar(256),
+ to_id varchar(256),
+ step_id varchar(256), -- nullable, if step is unspecified
+ PRIMARY KEY (array_id,from_id,to_id,step_id)
+ );
+
-- this GRANT does not work for sqlite; you'll get a syntax error but
-- ignore it, as it is not needed in sqlite
-grant all on dataset_containment, dataset_filenames, dataset_usage, processes_in_workflows, invocation_procedure_names, known_workflows, workflow_events to public, operators;
+grant all on
+ dataset,
+ file,
+ variable,
+ ds_containment,
+ process,
+ execute,
+ execute2,
+ workflow,
+ ds_usage,
+ annot_ds_num,
+ annot_ds_txt,
+ annot_ds_bool,
+ annot_p_num,
+ annot_p_txt,
+ annot_p_bool,
+ annot_wf_num,
+ annot_wf_txt,
+ annot_wf_bool,
+ extrainfo,
+ createarray,
+ createarray_member,
+ array_range
+to public, operators;
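
As a quick sanity check of the new schema, a query along the following lines (a sketch, assuming the PostgreSQL connection in $SQLCMD) lists the input files bound to each app execution:

  $SQLCMD -c "
    SELECT p.name, u.param_name, f.filename
      FROM process p
      JOIN ds_usage u ON u.process_id = p.id AND u.direction = 'I'
      JOIN file f ON f.id = u.dataset_id
     WHERE p.type = 'execute';"
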
Modified: provenancedb/prov-sql-generate-transitive-closures.sql
===================================================================
--- provenancedb/prov-sql-generate-transitive-closures.sql 2010-11-08 20:30:09 UTC (rev 3705)
+++ provenancedb/prov-sql-generate-transitive-closures.sql 2010-11-10 18:18:39 UTC (rev 3706)
@@ -10,13 +10,13 @@
-- of same?
CREATE TABLE trans
- (before char(128),
- after char(128),
+ (before varchar(2048),
+ after varchar(2048),
CONSTRAINT no_duplicate_arcs_in_trans UNIQUE (before, after));
CREATE TABLE transbase
- (before char(128),
- after char(128),
+ (before varchar(2048),
+ after varchar(2048),
CONSTRAINT no_duplicate_arcs_in_transbase UNIQUE (before, after)
);
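
With transbase holding the direct dependency arcs and trans their transitive closure, lineage questions become single lookups; for example, all ancestors of a dataset (the dataset URI below is a placeholder):

  $SQLCMD -c "SELECT before FROM trans
               WHERE after = 'tag:ci.uchicago.edu,2008:swift:dataset:EXAMPLE';"
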
Modified: provenancedb/prov-to-sql.sh
===================================================================
--- provenancedb/prov-to-sql.sh 2010-11-08 20:30:09 UTC (rev 3705)
+++ provenancedb/prov-to-sql.sh 2010-11-10 18:18:39 UTC (rev 3706)
@@ -1,12 +1,11 @@
#!/bin/bash
-
export RUNID=$(basename $1 .log)
-export WFID="tag:benc@ci.uchicago.edu,2008:swiftlogs:execute:${RUNID}:"
+export WFID="tag:ci.uchicago.edu,2008:swiftlogs:execute:${RUNID}:"
# TODO is there already a URI form for identifying workflows?
-export WF="tag:benc@ci.uchicago.edu,2008:swiftlogs:execute:${RUNID}:run"
+export WF="tag:ci.uchicago.edu,2008:swiftlogs:execute:${RUNID}:run"
echo Generating SQL for $RUNID
@@ -14,110 +13,147 @@
# this gives a distinction between the root process for a workflow and the
# workflow itself. perhaps better to model the workflow as a process
-echo "INSERT INTO processes_in_workflows (process_id, workflow_id) VALUES ('${WFID}0', '$WF');" >> tmp-import.sql
- echo "INSERT INTO processes (id, type) VALUES ('${WFID}0', 'rootthread');" >> tmp-import.sql
+echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('${WFID}0', 'rootthread', '$RUNID', '$WF');" >> tmp-import.sql
-while read time duration thread localthread endstate app scratch; do
- echo "INSERT INTO processes (id, type) VALUES ('$thread', 'execute');" >> tmp-import.sql
- echo "INSERT INTO processes_in_workflows (process_id, workflow_id) VALUES ('$thread', '$WF');" >> tmp-import.sql
- echo "INSERT INTO executes (id, starttime, duration, finalstate, app, scratch) VALUES ('$thread', $time, $duration, '$endstate', '$app', '$scratch');" >> tmp-import.sql
+
+while read time duration thread localthread endstate tr_name scratch; do
+ echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$thread', 'execute', '$tr_name', '$WF');" >> tmp-import.sql
+ echo "INSERT INTO execute (id, start_time, duration, final_state, scratch) VALUES ('$thread', $time, $duration, '$endstate', '$scratch');" >> tmp-import.sql
done < execute.global.event
-while read starttime duration globalid id endstate thread site scratch; do
+while read start_time duration globalid id endstate thread site scratch; do
+ # cut off the last component of the thread, so that we end up at the
+ # parent thread id which should correspond with the execute-level ID
+ inv_id="$WFID$(echo $thread | sed 's/-[^-]*$//')"
+ echo "INSERT INTO execute2 (id, execute_id, start_time, duration, final_state, site) VALUES ('$globalid', '$inv_id', $start_time, $duration, '$endstate', '$site');" >> tmp-import.sql
+done < execute2.global.event
- # cut off the last component of the thread, so that we end up at the
- # parent thread id which should correspond with the execute-level ID
- execute_id="$WFID$(echo $thread | sed 's/-[^-]*$//')"
- echo "INSERT INTO execute2s (id, execute_id, starttime, duration, finalstate, site) VALUES ('$globalid', '$execute_id', '$starttime', '$duration', '$endstate', '$site');" >> tmp-import.sql
-done < execute2.global.event
-while read thread name lhs rhs result; do
- operatorid="${WFID}operator:$thread"
- echo "INSERT INTO processes (id, type) VALUES ('$operatorid', 'operator');" >> tmp-import.sql
- echo "INSERT INTO invocation_procedure_names (execute_id, procedure_name) VALUES ('$operatorid','$name'); " >> tmp-import.sql
- echo "INSERT INTO processes_in_workflows (process_id, workflow_id) VALUES ('$operatorid', '$WF');" >> tmp-import.sql
- echo "INSERT INTO dataset_usage (process_id, direction, dataset_id, param_name) VALUES ('$operatorid', 'I', '$lhs', 'lhs');" >> tmp-import.sql
- echo "INSERT INTO dataset_usage (process_id, direction, dataset_id, param_name) VALUES ('$operatorid', 'I', '$rhs', 'rhs');" >> tmp-import.sql
- echo "INSERT INTO dataset_usage (process_id, direction, dataset_id, param_name) VALUES ('$operatorid', 'O', '$result', 'result');" >> tmp-import.sql
+while read col1 col2 col3 col4 col5 threadst namest lhsst rhsst resultst; do
+ thread=`echo $threadst | awk 'BEGIN { FS = "=" }; {print $2}'`
+  name=`echo $namest | awk 'BEGIN { FS = "=" }; {print $2}'`
+ lhs=`echo $lhsst | awk 'BEGIN { FS = "=" }; {print $2}'`
+ rhs=`echo $rhsst | awk 'BEGIN { FS = "=" }; {print $2}'`
+ result=`echo $resultst | awk 'BEGIN { FS = "=" }; {print $2}'`
+
+ operatorid="${WFID}operator:$thread"
+
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$lhs';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$lhs');" >> tmp-import.sql
+ fi
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$rhs';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$rhs');" >> tmp-import.sql
+ fi
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$result';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$result');" >> tmp-import.sql
+ fi
+ echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$operatorid', 'operator', '$name', '$WF');" >> tmp-import.sql
+ echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$operatorid', 'I', '$lhs', 'lhs');" >> tmp-import.sql
+ echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$operatorid', 'I', '$rhs', 'rhs');" >> tmp-import.sql
+ echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$operatorid', 'O', '$result', 'result');" >> tmp-import.sql
done < operators.txt
while read id name output; do
- echo "INSERT INTO processes (id, type) VALUES ('$id', 'function');" >> tmp-import.sql
- echo "INSERT INTO processes_in_workflows (process_id, workflow_id) VALUES ('$id', '$WF');" >> tmp-import.sql
- echo "INSERT INTO dataset_usage (process_id, direction, dataset_id, param_name) VALUES ('$id', 'O', '$output', 'result');" >> tmp-import.sql
- echo "INSERT INTO invocation_procedure_names (execute_id, procedure_name) VALUES ('$id', '$name');" >> tmp-import.sql
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$id';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$id');" >> tmp-import.sql
+ fi
+ echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$id', 'function', '$name', '$WF');" >> tmp-import.sql
+ echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$id', 'O', '$output', 'result');" >> tmp-import.sql
done < functions.txt
while read id value; do
-# TODO need ordering/naming
- echo "INSERT INTO dataset_usage (process_id, direction, dataset_id, param_name) VALUES ('$id', 'I', '$value', 'undefined');" >> tmp-import.sql
+ # TODO need ordering/naming
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$id';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$id');" >> tmp-import.sql
+ fi
+ echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$id', 'I', '$value', 'undefined');" >> tmp-import.sql
done < function-inputs.txt
while read thread direction dataset variable rest; do
- if [ "$direction" == "input" ] ; then
- dir=I
- else
- dir=O
- fi
- echo "INSERT INTO dataset_usage (process_id, direction, dataset_id, param_name) VALUES ('$thread', '$dir', '$dataset', '$variable');" >> tmp-import.sql
+ if [ "$direction" == "input" ] ; then
+ dir=I
+ else
+ dir=O
+ fi
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$dataset';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$dataset');" >> tmp-import.sql
+ fi
+ echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$thread', '$dir', '$dataset', '$variable');" >> tmp-import.sql
done < tie-data-invocs.txt
while read thread appname; do
- echo "INSERT INTO invocation_procedure_names (execute_id, procedure_name) VALUES ('$thread', '$appname');" >> tmp-import.sql
-
+ echo "UPDATE execute SET procedure_name='$appname' WHERE id='$thread';" >> tmp-import.sql
done < invocation-procedure-names.txt
while read outer inner; do
- echo "INSERT INTO dataset_containment (outer_dataset_id, inner_dataset_id) VALUES ('$outer', '$inner');" >> tmp-import.sql
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$outer';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$outer');" >> tmp-import.sql
+ fi
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$inner';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$inner');" >> tmp-import.sql
+ fi
+ echo "INSERT INTO ds_containment (out_id, in_id) VALUES ('$outer', '$inner');" >> tmp-import.sql
done < tie-containers.txt
while read dataset filename; do
- echo "INSERT INTO dataset_filenames (dataset_id, filename) VALUES ('$dataset', '$filename');" >> tmp-import.sql
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$dataset';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$dataset');" >> tmp-import.sql
+ fi
+ echo "INSERT INTO file (id, filename) VALUES ('$dataset', '$filename');" >> tmp-import.sql
done < dataset-filenames.txt
while read dataset value; do
- echo "INSERT INTO dataset_values (dataset_id, value) VALUES ('$dataset', '$value');" >> tmp-import.sql
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$dataset';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$dataset');" >> tmp-import.sql
+ fi
+ echo "INSERT INTO variable (id, value) VALUES ('$dataset', '$value');" >> tmp-import.sql
done < dataset-values.txt
-while read start duration wfid; do
- echo "INSERT INTO workflow_events (workflow_id,starttime, duration) VALUES ('$wfid', '$start', '$duration');" >> tmp-import.sql
+while read start duration wfid rest; do
+ echo "UPDATE workflow SET start_time=$start WHERE id='$WF';" >> tmp-import.sql
+ echo "UPDATE workflow SET duration=$duration WHERE id='$WF';" >> tmp-import.sql
done < workflow.event
while read id extrainfo ; do
-# TODO this will not like quotes and things like that in extrainfo
- echo "INSERT INTO extrainfo (execute2id, extrainfo) VALUES ('$id', '$extrainfo');" >> tmp-import.sql
+ # TODO this will not like quotes and things like that in extrainfo
+ echo "INSERT INTO extrainfo (id, extrainfo) VALUES ('$id', '$extrainfo');" >> tmp-import.sql
done < extrainfo.txt
# TODO this could merge with other naming tables
-while read start duration thread finalstate procname ; do
- compoundid=$WFID$thread
- echo "INSERT INTO processes (id, type) VALUES ('$compoundid', 'compound');" >> tmp-import.sql
- echo "INSERT INTO processes_in_workflows (process_id, workflow_id) VALUES ('$compound', '$WF');" >> tmp-import.sql
- echo "INSERT INTO invocation_procedure_names (execute_id, procedure_name) VALUES ('$compoundid', '$procname');" >> tmp-import.sql
+while read start duration thread final_state procname ; do
+ compoundid=$WFID$thread
+ echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$compoundid', 'compound', '$procname', '$WF');" >> tmp-import.sql
done < compound.event
-while read start duration thread finalstate procname ; do
- fqid=$WFID$thread
- echo "INSERT INTO processes (id, type) VALUES ('$fqid', 'internal');" >> tmp-import.sql
- echo "INSERT INTO processes_in_workflows (process_id, workflow_id) VALUES ('$fqid', '$WF');" >> tmp-import.sql
- echo "INSERT INTO invocation_procedure_names (execute_id, procedure_name) VALUES ('$fqid', '$procname');" >> tmp-import.sql
+while read start duration thread final_state procname ; do
+ fqid=$WFID$thread
+ echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$fqid', 'internal', '$procname', '$WF');" >> tmp-import.sql
done < internalproc.event
+while read t ; do
+ thread="${WFID}$t"
+ echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$thread', 'scope', 'scope', '$WF');" >> tmp-import.sql
+done < scopes.txt
+
while read id ; do
- echo "INSERT INTO createarray (array_id) VALUES ('$id');" >> tmp-import.sql
+ echo "INSERT INTO createarray (array_id) VALUES ('$id');" >> tmp-import.sql
done < createarray.txt
while read arrayid index memberid ; do
- echo "INSERT INTO createarray_member (array_id, ix, member_id) VALUES ('$arrayid', '$index', '$memberid');" >> tmp-import.sql
+ echo "INSERT INTO createarray_member (array_id, ix, member_id) VALUES ('$arrayid', '$index', '$memberid');" >> tmp-import.sql
done < createarray-members.txt
-while read t ; do
- thread="${WFID}$t"
- echo "INSERT INTO processes (id, type) VALUES ('$thread', 'scope');" >> tmp-import.sql
- echo "INSERT INTO processes_in_workflows (process_id, workflow_id) VALUES ('$thread', '$WF');" >> tmp-import.sql
-done < scopes.txt
-
echo Sending SQL to DB
$SQLCMD < tmp-import.sql
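
The dataset existence check above is repeated verbatim in several of the loops; a possible refactoring (a sketch, not part of this commit) would pull it into one helper:

  # emit an INSERT for a dataset id only if it is not already in the database
  ensure_dataset() {
    local ds="$1"
    local existing=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$ds';")
    if [ "$existing" -eq "0" ]; then
      echo "INSERT INTO dataset (id) VALUES ('$ds');" >> tmp-import.sql
    fi
  }
  # usage inside the loops: ensure_dataset "$lhs"; ensure_dataset "$rhs"; ensure_dataset "$result"
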
Modified: provenancedb/swift-prov-import-all-logs
===================================================================
--- provenancedb/swift-prov-import-all-logs 2010-11-08 20:30:09 UTC (rev 3705)
+++ provenancedb/swift-prov-import-all-logs 2010-11-10 18:18:39 UTC (rev 3706)
@@ -11,11 +11,13 @@
source $PROVDIR/etc/provenance.config
export PATH=$PROVDIR:$PATH
+# this generates a file with one line per log, of the form:
+# <starttime> <swift version> <logfilename>
swift-plot-log $LOGREPO everylog-vs-versions.data
if [ "$?" != "0" ]; then
- echo swift-plot-log failed when building everylog-vs-versions.data
- exit 1
+ echo swift-plot-log failed when building everylog-vs-versions.data
+ exit 1
fi
# TODO better tmp handling than always using the same name in a shared
@@ -24,53 +26,55 @@
echo first commandline param is $1
if [ "$1" == "rebuild" ]; then
- echo CLEANING DATABASE
- $SQLCMD < $PROVDIR/prov-init.sql
+ echo CLEANING DATABASE
+ $SQLCMD < $PROVDIR/prov-init.sql
fi
while read start version filename; do
+
+ export IDIR=$(echo $filename | sed 's/\.log$/.d/')
+ echo IDIR=$IDIR
+ if [ $version -ge 1538 ]; then
+ echo -n "Log: $filename ... "
+
+ # TODO: does not work in sqlite
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from workflow where log_filename='$filename';")
+
+ if [ "$EXISTING" -eq "0" ]; then
+ echo IMPORTING
+
+ if grep --silent "DEBUG Loader Swift finished with no errors" $filename; then
+ wfstatus="SUCCESS"
+ else
+ wfstatus="FAIL"
+ fi
- export IDIR=$(echo $filename | sed 's/\.log$/.d/')
- echo IDIR=$IDIR
- if [ $version -ge 1538 ]; then
- echo -n "Log: $filename ... "
+ export RUNID=$(basename $filename .log)
+    # changing tag authority from the e-mail benc@ci.uchicago.edu to the dnsname ci.uchicago.edu
+ export WF="tag:ci.uchicago.edu,2008:swiftlogs:execute:${RUNID}:run"
+
+ echo "INSERT INTO workflow (id, log_filename, swift_version, import_status) VALUES ('$WF','$filename','$version','$wfstatus');" | $SQLCMD
-# TODO: make it work in general (sqlite, mysql, ...)
-# Works only with PostgreSQL
-EXISTING=$($SQLCMD --tuples-only -c "select count(*) from known_workflows where workflow_log_filename='$filename';")
-
-if [ "$EXISTING" -eq "0" ]; then
- echo IMPORTING
-
- if grep --silent "DEBUG Loader Swift finished with no errors" $filename; then
- wfstatus="SUCCESS"
- else
- wfstatus="FAIL"
+
+ echo version $version in log file $filename
+ echo ============= will import =============
+ prepare-for-import $filename
+ if [ "$?" != "0" ]; then
+ echo prepare-for-import failed
+ exit 2
+ fi
+ import-run-to-sql $filename
+ if [ "$?" != "0" ]; then
+ echo import-run-to-sql failed
+ exit 3
+ fi
+
+ # import-run-to-xml $filename
+
+ else
+ echo SKIP: Already known in workflow
+ fi
fi
-
- echo version $version in log file $filename
- echo ============= will import =============
- prepare-for-import $filename
- if [ "$?" != "0" ]; then
- echo prepare-for-import failed
- exit 2
- fi
- import-run-to-sql $filename
- if [ "$?" != "0" ]; then
- echo import-run-to-sql failed
- exit 3
- fi
-
-# import-run-to-xml $filename
-
- export RUNID=$(basename $filename .log)
- export WF="tag:benc@ci.uchicago.edu,2008:swiftlogs:execute:${RUNID}:run"
-
- echo "INSERT INTO known_workflows (workflow_id, workflow_log_filename, version, importstatus) VALUES ('$WF','$filename','$version','$wfstatus');" | $SQLCMD
-else
- echo SKIP: Already known in workflow
-fi
- fi
done < /tmp/everylog-vs-versions.data
# now pull the XML data into eXist, in as few runs as possible to avoid
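
Typical invocations, given the configuration described in the README:

  # wipe and reinitialize the database, then import every log under $LOGREPO
  ./swift-prov-import-all-logs rebuild
  # incremental run: only logs whose filename is not yet in the workflow table are imported
  ./swift-prov-import-all-logs
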