[Swift-commit] r4224 - in provenancedb: . apps/oops etc
lgadelha at ci.uchicago.edu
lgadelha at ci.uchicago.edu
Sun Mar 27 16:54:23 CDT 2011
Author: lgadelha
Date: 2011-03-27 16:54:23 -0500 (Sun, 27 Mar 2011)
New Revision: 4224
Removed:
provenancedb/etc/provenance.config.soju
Modified:
provenancedb/apps/oops/oops_extractor.sh
provenancedb/info-to-extrainfo
provenancedb/pql_functions.sql
provenancedb/prov-init.sql
provenancedb/prov-to-sql.sh
provenancedb/swift-prov-import-all-logs
Log:
Update schema to include runtime_info relation.
Minor updates.
Modified: provenancedb/apps/oops/oops_extractor.sh
===================================================================
--- provenancedb/apps/oops/oops_extractor.sh 2011-03-23 19:02:12 UTC (rev 4223)
+++ provenancedb/apps/oops/oops_extractor.sh 2011-03-27 21:54:23 UTC (rev 4224)
@@ -82,7 +82,7 @@
if [ -n "$FILENAME" ]; then
SEQLENGTH=$(awk '{if (NR==2) print $1}' $FILENAME | wc -c)
- echo "insert into annot_ds_txt values ('$DATASET_ID', 'fasta_sequence_length', $SEQLENGTH);" | $SQLCMD
+ echo "insert into annot_ds_num values ('$DATASET_ID', 'fasta_sequence_length', $SEQLENGTH);" | $SQLCMD
fi
# extracts scientific parameters given as output by the workflow in *.log.
# relevant lines:
Deleted: provenancedb/etc/provenance.config.soju
===================================================================
--- provenancedb/etc/provenance.config.soju 2011-03-23 19:02:12 UTC (rev 4223)
+++ provenancedb/etc/provenance.config.soju 2011-03-27 21:54:23 UTC (rev 4224)
@@ -1,10 +0,0 @@
-# file to source that sets variables for the various paths that are
-# presently hardcoded
-
-# this is the path to log repo on benc's laptop
-export LOGREPO=/Users/benc/work/everylog/
-export IDIR=/Users/benc/work/everylog/
-
-export SQLCMD="sqlite3 provdb "
-# export SQLCMD="psql -p 5435 -d provdb "
-
Modified: provenancedb/info-to-extrainfo
===================================================================
--- provenancedb/info-to-extrainfo 2011-03-23 19:02:12 UTC (rev 4223)
+++ provenancedb/info-to-extrainfo 2011-03-27 21:54:23 UTC (rev 4224)
@@ -3,17 +3,17 @@
# must have $EXECUTE2PREFIX set appropriately by the containing script
while read t d id rest; do
- echo processing wrapper file for job $id >&2
- record=$(find ${IDIR} -name ${id}-info)
-
- globalid=$EXECUTE2PREFIX$id
-
- if [ "X$record" != "X" ] && [ -f $record ] ; then
-
- grep '^EXTRAINFO=' $record | sed "s/^EXTRAINFO=\(.*\)$/$globalid \1/"
-
- else
- echo no wrapper log for $id >&2
- fi
-
+ echo processing wrapper file for job $id >&2
+ record=$(find ${IDIR} -name ${id}-info)
+
+ globalid=$EXECUTE2PREFIX$id
+
+ if [ "X$record" != "X" ] && [ -f $record ] ; then
+
+ grep '^EXTRAINFO=' $record | sed "s/^EXTRAINFO=\(.*\)$/$globalid \1/"
+
+ else
+ echo no wrapper log for $id >&2
+ fi
+
done < execute2.event
Modified: provenancedb/pql_functions.sql
===================================================================
--- provenancedb/pql_functions.sql 2011-03-23 19:02:12 UTC (rev 4223)
+++ provenancedb/pql_functions.sql 2011-03-27 21:54:23 UTC (rev 4224)
@@ -3,79 +3,134 @@
-- SQL Functions
--- list_runs_* lists workflows recorded in the database by id or log_filename
-DROP TYPE list_runs_type CASCADE;
+-- lists variations in a parameter's value across workflows, for parameters that are in-memory variables
-CREATE TYPE list_runs_type
-AS (id VARCHAR,
- log_filename VARCHAR,
- swift_version VARCHAR,
- start_time TIMESTAMP WITH TIME ZONE,
- duration NUMERIC,
- final_state VARCHAR);
-CREATE OR REPLACE FUNCTION list_runs()
-RETURNS SETOF list_runs_type AS $$
- SELECT id, log_filename, swift_version, TO_TIMESTAMP(start_time),
- duration, import_status AS final_state
- FROM workflow;
+CREATE OR REPLACE FUNCTION compare_run_by_parameter(param_name VARCHAR)
+RETURNS TABLE (
+ run_id VARCHAR,
+ param_name VARCHAR,
+ value VARCHAR
+)
+AS $$
+ SELECT process.workflow_id, ds_usage.param_name, variable.value
+ FROM variable, ds_usage, process
+ WHERE variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND
+ ds_usage.param_name=$1
+ GROUP BY process.workflow_id, ds_usage.param_name, variable.value;
$$ LANGUAGE SQL;
+CREATE OR REPLACE FUNCTION compare_run_by_parameter(param_name1 VARCHAR, param_name2 VARCHAR)
+RETURNS TABLE (
+ workflow_id VARCHAR,
+ param_name1 VARCHAR,
+ value1 VARCHAR,
+ param_name2 VARCHAR,
+ value2 VARCHAR
+)
+AS $$
+ SELECT *
+ FROM compare_run_by_parameter($1) as t
+ INNER JOIN
+ compare_run_by_parameter($2) as s
+ USING (workflow_id);
+$$ LANGUAGE SQL;
--- lists variations in a parameter's value across workflows
-DROP TYPE param_across_wf_type CASCADE;
-CREATE TYPE param_across_wf_type AS (workflow VARCHAR, parameter VARCHAR, value VARCHAR);
-
-CREATE OR REPLACE FUNCTION param_across_wf(param_name VARCHAR)
-RETURNS SETOF param_across_wf_type AS $$
- SELECT workflow.log_filename,ds_usage.param_name,variable.value
- FROM variable,ds_usage,process,workflow
- WHERE variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND
- process.workflow_id=workflow.id AND ds_usage.param_name=$1
- GROUP BY workflow.log_filename,ds_usage.param_name,variable.value;
+CREATE OR REPLACE FUNCTION compare_run_by_parameter(param_name1 VARCHAR, param_name2 VARCHAR, param_name3 VARCHAR)
+RETURNS TABLE (
+ workflow_id VARCHAR,
+ param_name1 VARCHAR,
+ value1 VARCHAR,
+ param_name2 VARCHAR,
+ value2 VARCHAR,
+ param_name3 VARCHAR,
+ value3 VARCHAR
+)
+AS $$
+ SELECT *
+ FROM compare_run_by_parameter($1, $2) as t
+ INNER JOIN
+ compare_run_by_parameter($3) as s
+ USING (workflow_id);
$$ LANGUAGE SQL;
-drop type annot_across_wf_type cascade;
-create type annot_across_wf_type as (workflow varchar, process varchar, name varchar, value numeric);
-create or replace function annot_across_wf(name varchar)
-returns setof annot_across_wf_type as $$
- select process.workflow_id,process.id,annot_ds_num.name,annot_ds_num.value
- from process,ds_usage,ds_containment,annot_ds_num
- where process.id=ds_usage.process_id and ds_usage.dataset_id=ds_containment.out_id and
- ds_containment.in_id=annot_ds_num.id and annot_ds_num.name=$1
- order by process.workflow_id,process.id;
-$$ language sql;
+CREATE OR REPLACE FUNCTION compare_run_by_annot_num(name VARCHAR)
+RETURNS TABLE (
+ workflow_id VARCHAR,
+ name VARCHAR,
+ value NUMERIC
+)
+AS $$
+ SELECT process.workflow_id, annot_ds_num.name, annot_ds_num.value
+ FROM annot_ds_num,ds_usage,ds_containment,process
+ WHERE annot_ds_num.id=ds_containment.in_id AND ds_containment.out_id=ds_usage.dataset_id AND
+ ds_usage.process_id=process.id AND annot_ds_num.name=$1
+ UNION
+ SELECT process.workflow_id, annot_ds_num.name, annot_ds_num.value
+ FROM process, ds_usage, annot_ds_num
+ WHERE process.id=ds_usage.process_id and ds_usage.dataset_id=annot_ds_num.id and
+ annot_ds_num.name=$1
+ UNION
+ SELECT process.workflow_id, annot_p_num.name, annot_p_num.value
+ FROM process, annot_p_num
+ WHERE process.id=annot_p_num.id and annot_p_num.name=$1
+ UNION
+ SELECT workflow.id as workflow_id, annot_wf_num.name, annot_wf_num.value
+ FROM workflow, annot_wf_num
+ WHERE workflow.id=annot_wf_num.id and annot_wf_num.name=$1
+$$ LANGUAGE SQL;
+CREATE OR REPLACE FUNCTION compare_run_by_annot_txt(name VARCHAR)
+RETURNS TABLE (
+ workflow_id VARCHAR,
+ name VARCHAR,
+ value VARCHAR)
+AS $$
+ SELECT process.workflow_id, annot_ds_txt.name, annot_ds_txt.value
+ FROM process, ds_usage, annot_ds_txt
+ WHERE process.id=ds_usage.process_id and ds_usage.dataset_id=annot_ds_txt.id and
+ annot_ds_txt.name=$1
+ UNION
+ SELECT process.workflow_id, annot_p_txt.name, annot_p_txt.value
+ FROM process, annot_p_txt
+ WHERE process.id=annot_p_txt.id and annot_p_txt.name=$1
+ UNION
+ SELECT workflow.id as workflow_id, annot_wf_txt.name, annot_wf_txt.value
+ FROM workflow, annot_wf_txt
+ WHERE workflow.id=annot_wf_txt.id and annot_wf_txt.name=$1
+$$ LANGUAGE SQL;
+CREATE OR REPLACE FUNCTION compare_run_by_annot_bool(name VARCHAR)
+RETURNS TABLE (
+ workflow_id VARCHAR,
+ name VARCHAR,
+ value BOOLEAN
+)
+AS $$
+ SELECT process.workflow_id, annot_ds_bool.name, annot_ds_bool.value
+ FROM process, ds_usage, annot_ds_bool
+ WHERE process.id=ds_usage.process_id and ds_usage.dataset_id=annot_ds_bool.id and
+ annot_ds_bool.name=$1
+ UNION
+ SELECT process.workflow_id, annot_p_bool.name, annot_p_bool.value
+ FROM process, annot_p_bool
+ WHERE process.id=annot_p_bool.id and annot_p_bool.name=$1
+ UNION
+ SELECT workflow.id as workflow_id, annot_wf_bool.name, annot_wf_bool.value
+ FROM workflow, annot_wf_bool
+ WHERE workflow.id=annot_wf_bool.id and annot_wf_bool.name=$1
+$$ LANGUAGE SQL;
--- lists variations of the values of a set of parameters
-
-DROP TYPE param_across_wf_class_type CASCADE;
-CREATE TYPE param_across_wf_class_type AS (workflow VARCHAR, parameter VARCHAR, value VARCHAR);
-
-CREATE OR REPLACE FUNCTION param_across_wf_class(param_name VARCHAR, wf_class VARACHAR)
-RETURNS SETOF param_across_wf_type AS $$
-
-$$ LANGUAGE plpgsql;
-
-- correlate a parameter with workflow runtime statistics
-DROP TYPE correlate_param_runtime_type CASCADE;
-CREATE TYPE correlate_param_runtime_type
-AS (workflow VARCHAR,
+CREATE OR REPLACE FUNCTION correlate_param_runtime(param_name VARCHAR)
+RETURNS TABLE (
+ workflow VARCHAR,
workflow_starttime TIMESTAMP WITH TIME ZONE,
workflow_duration NUMERIC,
parameter VARCHAR,
- parameter_value VARCHAR);
-
-CREATE OR REPLACE FUNCTION correlate_param_runtime(param_name VARCHAR)
-RETURNS SETOF correlate_param_runtime_type AS $$
- SELECT A.workflow,to_timestamp(B.start_time),B.duration,A.parameter,A.value
- FROM param_across_wf($1) AS A, workflow AS B
- WHERE A.workflow=B.log_filename;
-$$ LANGUAGE SQL;
-
-CREATE OR REPLACE FUNCTION correlate_param_runtime(param_name VARCHAR)
-RETURNS SETOF correlate_param_runtime_type AS $$
+ parameter_value VARCHAR
+)
+AS $$
SELECT workflow.id,to_timestamp(workflow.start_time),workflow.duration,ds_usage.param_name,variable.value
FROM variable,ds_usage,process,workflow
WHERE variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND
@@ -90,73 +145,13 @@
WHERE process.workflow_id=$1;
$$ LANGUAGE SQL;
--- lists distinct processes in a
-
--- OOPS-specific functions
-
-
-
-CREATE OR REPLACE FUNCTION list_oops_runs() RETURNS SETOF VARCHAR AS $$
- SELECT DISTINCT(value) FROM annot_wf_txt WHERE annot_wf_txt.name=('oops_run_id');
-$$ LANGUAGE SQL;
-
-DROP TYPE oops_param_across_wf_type CASCADE;
-CREATE TYPE oops_param_across_wf_type AS (oops_run_id VARCHAR, param_name VARCHAR, variable VARCHAR);
-CREATE OR REPLACE FUNCTION oops_param_across_wf(VARCHAR) RETURNS SETOF oops_param_across_wf_type AS $$
- SELECT annot_wf_txt.value,ds_usage.param_name,variable.value
- FROM variable,ds_usage,process,annot_wf_txt
- WHERE variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND process.workflow_id=annot_wf_txt.id AND
- ds_usage.param_name=$1 AND annot_wf_txt.name='oops_run_id'
- GROUP BY annot_wf_txt.name,annot_wf_txt.value,ds_usage.param_name,variable.value;
-$$ LANGUAGE SQL;
-
-CREATE OR REPLACE FUNCTION list_oops_runs() RETURNS SETOF VARCHAR AS $$
- SELECT DISTINCT(value) FROM annot_wf_txt WHERE annot_wf_txt.name=('oops_run_id');
-$$ LANGUAGE SQL;
-
-DROP TYPE oops_summary CASCADE;
-CREATE TYPE oops_summary AS (oops_run_id VARCHAR, start_time TIMESTAMP WITH TIME ZONE, duration_sec NUMERIC, swift_version VARCHAR);
-CREATE OR REPLACE FUNCTION oops_run_summary(varchar) RETURNS SETOF oops_summary AS $$
- SELECT annot_wf_txt.value as oops_run_id, to_timestamp(workflow.start_time) as start_time,
- workflow.duration as duration_sec,workflow.swift_version as swift_version
- FROM annot_wf_txt,workflow
- WHERE annot_wf_txt.id=workflow.id and annot_wf_txt.name='oops_run_id' and annot_wf_txt.value=$1;
-$$ LANGUAGE SQL;
-
-CREATE OR REPLACE FUNCTION oops_process_names(varchar) RETURNS SETOF varchar AS $$
- SELECT DISTINCT(process.name)
- FROM process, annot_wf_txt
- WHERE process.workflow_id=annot_wf_txt.id AND
- annot_wf_txt.name='oops_run_id' AND annot_wf_txt.value=$1;
-$$ LANGUAGE SQL;
-
-DROP TYPE oops_wf_param_summary CASCADE;
-CREATE TYPE oops_wf_param_summary AS (oops_run_id varchar, param_name varchar, value varchar);
-CREATE OR REPLACE FUNCTION oops_variable_summary() RETURNS SETOF oops_wf_param_summary AS $$
- SELECT annot_wf_txt.value,ds_usage.param_name,variable.value
- FROM variable,ds_usage,process,annot_wf_txt
- WHERE variable.id=ds_usage.dataset_id and ds_usage.process_id=process.id and process.workflow_id=annot_wf_txt.id;
-$$ LANGUAGE SQL;
-
-
-DROP TYPE oops_param_summary CASCADE;
-CREATE TYPE oops_param_summary AS (param_name varchar, value varchar);
-CREATE OR REPLACE FUNCTION oops_science_summary(varchar) RETURNS SETOF oops_param_summary AS $$
- SELECT ds_usage.param_name,variable.value
- FROM variable,ds_usage,process,annot_wf_txt
- WHERE variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND process.workflow_id=annot_wf_txt.id AND
- (ds_usage.param_name='proteinId' OR ds_usage.param_name='targetId' OR ds_usage.param_name='seqFile' OR
- ds_usage.param_name='prot' OR ds_usage.param_name='prepTarFile' OR ds_usage.param_name='nSim') AND
- annot_wf_txt.name='oops_run_id' AND annot_wf_txt.value=$1;
-$$ LANGUAGE SQL;
-
-- recursive query to find ancestor entities in a provenance graph
CREATE OR REPLACE FUNCTION ancestors(varchar) RETURNS SETOF varchar AS $$
WITH RECURSIVE anc(ancestor,descendant) AS
(
SELECT parent AS ancestor, child AS descendant FROM parent_of WHERE child=$1
- UNION
+ UNION
SELECT parent_of.parent AS ancestor, anc.descendant AS descendant
FROM anc,parent_of
WHERE anc.ancestor=parent_of.child
@@ -164,5 +159,77 @@
SELECT ancestor FROM anc
$$ LANGUAGE SQL;
+-- compare_run(<list of param_names, annotation keys>): compares runs by parameter values and annotations
-
+CREATE OR REPLACE FUNCTION compare_run(VARIADIC args VARCHAR[])
+RETURNS SETOF RECORD AS $$
+DECLARE
+ i INTEGER;
+ q VARCHAR;
+ selectq VARCHAR;
+ fromq VARCHAR;
+ property VARCHAR;
+ property_type VARCHAR;
+ function_name VARCHAR;
+BEGIN
+ selectq := 'SELECT *';
+ FOR i IN array_lower(args, 1)..array_upper(args, 1) LOOP
+ property_type := split_part(args[i], '=', 1);
+ property := split_part(args[i], '=', 2);
+ CASE property_type
+ WHEN 'param_name' THEN
+ function_name := 'compare_run_by_parameter';
+ WHEN 'annot_num' THEN
+ function_name := 'compare_run_by_annot_num';
+ WHEN 'annot_txt' THEN
+ function_name := 'compare_run_by_annot_txt';
+ WHEN 'annot_bool' THEN
+ function_name := 'compare_run_by_annot_bool';
+ END CASE;
+ IF i = 1 THEN
+ fromq := function_name || '(''' || property || ''') as t' || i;
+ ELSE
+ fromq := fromq || ' INNER JOIN ' || function_name || '(''' || property || ''') as t' || i || ' USING (workflow_id)';
+ END IF;
+ END LOOP;
+ q := selectq || ' FROM ' || fromq;
+ RETURN QUERY EXECUTE q;
+END;
+$$ LANGUAGE plpgsql;
+
+
+CREATE OR REPLACE FUNCTION compare_run_print(VARIADIC args VARCHAR[])
+RETURNS VARCHAR AS $$
+DECLARE
+ i INTEGER;
+ q VARCHAR;
+ selectq VARCHAR;
+ fromq VARCHAR;
+ property VARCHAR;
+ property_type VARCHAR;
+ function_name VARCHAR;
+BEGIN
+ selectq := 'SELECT *';
+ FOR i IN array_lower(args, 1)..array_upper(args, 1) LOOP
+ property_type := split_part(args[i], '=', 1);
+ property := split_part(args[i], '=', 2);
+ CASE property_type
+ WHEN 'param_name' THEN
+ function_name := 'compare_run_by_parameter';
+ WHEN 'annot_num' THEN
+ function_name := 'compare_run_by_annot_num';
+ WHEN 'annot_txt' THEN
+ function_name := 'compare_run_by_annot_txt';
+ WHEN 'annot_bool' THEN
+ function_name := 'compare_run_by_annot_bool';
+ END CASE;
+ IF i = 1 THEN
+ fromq := function_name || '(''' || property || ''') as t' || i;
+ ELSE
+ fromq := fromq || ' INNER JOIN ' || function_name || '(''' || property || ''') as t' || i || ' USING (workflow_id)';
+ END IF;
+ END LOOP;
+ q := selectq || ' FROM ' || fromq;
+ RETURN q;
+END;
+$$ LANGUAGE plpgsql;
Modified: provenancedb/prov-init.sql
===================================================================
--- provenancedb/prov-init.sql 2011-03-23 19:02:12 UTC (rev 4223)
+++ provenancedb/prov-init.sql 2011-03-27 21:54:23 UTC (rev 4224)
@@ -1,35 +1,30 @@
-- this is the schema definition used for the main relational provenance
-- implementation (in both sqlite3 and postgres)
+drop table run cascade;
+drop table process cascade;
+drop table app_invocation cascade;
+drop table app_execution cascade;
+drop table runtime_info cascade;
drop table dataset cascade;
drop table file cascade;
-drop table variable cascade;
-drop table ds_containment cascade;
-drop table process cascade;
-drop table execute cascade;
-drop table execute2 cascade;
-drop table workflow cascade;
-drop table ds_usage cascade;
-drop table annot_ds_num cascade;
-drop table annot_ds_txt cascade;
-drop table annot_ds_bool cascade;
-drop table annot_p_num cascade;
-drop table annot_p_txt cascade;
-drop table annot_p_bool cascade;
-drop table annot_wf_num cascade;
-drop table annot_wf_txt cascade;
-drop table annot_wf_bool cascade;
--- drop table extrainfo cascade;
-drop table createarray cascade;
-drop table createarray_member cascade;
-drop table array_range cascade;
+drop table in_memory cascade;
+drop table dataset_containment cascade;
+drop table dataset_consumption cascade;
+drop table dataset_production cascade;
+drop table annot_dataset_numeric cascade;
+drop table annot_dataset_text cascade;
+drop table annot_process_numeric cascade;
+drop table annot_process_text cascade;
+drop table annot_run_numeric cascade;
+drop table annot_run_text cascade;
--- workflow stores some information about each workflow log that has
+-- run stores information about each script run log that has
-- been seen by the importer: the log filename, swift version and import
-- status.
--- Might be interesting to store xml translation of the swiftscript code
--- here for prospective provenance/versioning
-create table workflow
+-- Might be interesting to store xml translation of the Swift script
+-- here for prospective provenance and versioning.
+create table run
(id varchar(256) primary key,
log_filename varchar(2048),
swift_version varchar(16),
@@ -38,68 +33,24 @@
duration numeric
);
--- workflow_run stores the start time and duration for each workflow
--- that has been successfully imported.
---CREATE TABLE wf_run
--- (id varchar(256) PRIMARY KEY REFERENCES workflow (id) ON DELETE CASCADE,
--- start_time numeric,
--- duration numeric
--- );
-
--- dataset stores all dataset identifiers.
-create table dataset
- (id varchar(256) primary key
- );
-
--- file stores the filename mapped to each dataset.
-create table file
- ( id varchar(256) primary key references dataset (id) on delete cascade,
- filename varchar(2048)
- );
-
--- dataset_values stores the value for each dataset which is known to have
--- a value (which is all assigned primitive types). No attempt is made here
--- to expose that value as an SQL type other than a string, and so (for
--- example) SQL numerical operations should not be expected to work, even
--- though the user knows that a particular dataset stores a numeric value.
-create table variable
- ( id varchar(256) primary key references dataset (id) on delete cascade,
- value varchar(2048)
- );
-
--- dataset_containment stores the containment hierarchy between
--- container datasets (arrays and structs) and their contents.
--- out_id contains in_id
--- TODO this should perhaps be replaced with a more OPM-like model of
--- constructors and accessors, rather than, or in addition to,
--- a containment hierarchy. The relationship (such as array index or
--- structure member name) should also be stored in this table.
-create table ds_containment
- ( out_id varchar(256) references dataset (id) on delete cascade,
- in_id varchar(256) references dataset (id) on delete cascade,
- primary key (out_id,in_id)
- );
-
-- process gives information about each process (in the OPM sense)
-- it is augmented by information in other tables
-- specifies the type of process. for any type, it
-- must be the case that the specific type table
-- has an entry for this process.
--- Having this type here seems poor normalisation, though?
-- process types: internal, rootthread, execute, function, compound, scope, operator
--- maybe create a table for each type?
create table process
(id varchar(256) primary key,
type varchar(16),
name varchar(256), -- in the case of an execute this refers to the transformation name in tc.data
- workflow_id varchar(256) references workflow (id) on delete cascade -- normalize: workflow_id of sub-procedure determined
+ run_id varchar(256) references run (id) on delete cascade -- normalize: workflow_id of sub-procedure determined
-- by compound procedure
);
-- this gives information about each execute.
-- each execute is identified by a unique URI. other information from
-- swift logs is also stored here. an execute is an OPM process.
-create table execute
+create table app_invocation
(id varchar(256) primary key references process (id) on delete cascade,
procedure_name varchar(256), -- name of the app procedure that invokes the transformation
start_time numeric,
@@ -111,156 +62,159 @@
-- this gives information about each execute2, which is an attempt to
-- perform an execution. the execute2 id is tied to per-execution-attempt
-- information such as wrapper logs
-create table execute2
+create table app_execution
(id varchar(256) primary key,
- execute_id varchar(256) references execute (id) on delete cascade,
+ app_invocation_id varchar(256) references app_invocation (id) on delete cascade,
start_time numeric,
duration numeric,
final_state varchar(16),
site varchar(256)
);
+-- app execution runtime info extracted from the /proc filesystem (assumes the app
+-- executed on a Linux host)
+create table runtime_info
+ ( app_execution_id varchar(256) references app_execution (id) on delete cascade,
+ event_timestamp numeric,
+ cpu_usage numeric,
+ max_phys_mem numeric,
+ max_virtual_mem numeric,
+ io_read_bytes numeric,
+ io_write_bytes numeric,
+ primary key (app_execution_id, event_timestamp)
+ );
+
+-- dataset stores all dataset identifiers.
+create table dataset
+ (id varchar(256) primary key
+ );
+
+-- file stores the filename mapped to each dataset.
+create table file
+ ( id varchar(256) primary key references dataset (id) on delete cascade,
+ filename varchar(2048)
+ );
+
+-- dataset_values stores the value for each dataset which is known to have
+-- a value (which is all assigned primitive types). No attempt is made here
+-- to expose that value as an SQL type other than a string, and so (for
+-- example) SQL numerical operations should not be expected to work, even
+-- though the user knows that a particular dataset stores a numeric value.
+create table in_memory
+ ( id varchar(256) primary key references dataset (id) on delete cascade,
+ value varchar(2048)
+ );
+
+-- dataset_containment stores the containment hierarchy between
+-- container datasets (arrays and structs) and their contents.
+-- out_id contains in_id
+-- TODO this should perhaps be replaced with a more OPM-like model of
+-- constructors and accessors, rather than, or in addition to,
+-- a containment hierarchy. The relationship (such as array index or
+-- structure member name) should also be stored in this table.
+create table dataset_containment
+ ( out_id varchar(256) references dataset (id) on delete cascade,
+ in_id varchar(256) references dataset (id) on delete cascade,
+ primary key (out_id,in_id)
+ );
+
+
-- dataset_usage records usage relationships between processes and datasets;
-- in SwiftScript terms, the input and output parameters for each
-- application procedure invocation; in OPM terms, the artificts which are
-- input to and output from each process that is a Swift execution
-create table ds_usage
+create table dataset_consumption
(process_id varchar(256) references process(id) on delete cascade,
- direction char(1), -- I or O for input or output
dataset_id varchar(256) references dataset(id) on delete cascade,
param_name varchar(256), -- the name of the parameter in this execute that
-- this dataset was bound to. sometimes this must
-- be contrived (for example, in positional varargs)
- primary key (process_id,direction,dataset_id,param_name)
+ primary key (process_id,dataset_id,param_name)
);
+create table dataset_production
+ (process_id varchar(256) references process(id) on delete cascade,
+ dataset_id varchar(256) references dataset(id) on delete cascade,
+ param_name varchar(256), -- the name of the parameter in this execute that
+ -- this dataset was bound to. sometimes this must
+ -- be contrived (for example, in positional varargs)
+ primary key (process_id,dataset_id,param_name)
+ );
+
+
-- annotations
-create table annot_ds_num
- ( id varchar(256) references dataset (id) on delete cascade,
+create table annot_dataset_numeric
+ ( dataset_id varchar(256) references dataset (id) on delete cascade,
name varchar(256),
value numeric,
- primary key (id, name)
+ primary key (dataset_id, name)
);
-create table annot_ds_txt
- ( id varchar(256) references dataset (id) on delete cascade,
+create table annot_dataset_text
+ ( dataset_id varchar(256) references dataset (id) on delete cascade,
name varchar(256),
value varchar(2048),
- primary key (id, name)
+ primary key (dataset_id, name)
);
-create table annot_ds_bool
- ( id varchar(256) references dataset (id) on delete cascade,
+create table annot_process_numeric
+ ( process_id varchar(256) references process (id) on delete cascade,
name varchar(256),
- value boolean,
- primary key (id, name)
- );
-
-create table annot_p_num
- ( id varchar(256) references process (id) on delete cascade,
- name varchar(256),
value numeric,
- primary key (id, name)
+ primary key (process_id, name)
);
-create table annot_p_txt
- ( id varchar(256) references process (id) on delete cascade,
+create table annot_process_text
+ ( process_id varchar(256) references process (id) on delete cascade,
name varchar(256),
value varchar(2048),
- primary key (id, name)
+ primary key (process_id, name)
);
-create table annot_p_bool
- ( id varchar(256) references process (id) on delete cascade,
+create table annot_run_numeric
+ ( run_id varchar(256) references run (id) on delete cascade,
name varchar(256),
- value boolean,
- primary key (id, name)
- );
-
-create table annot_wf_num
- ( id varchar(256) references workflow (id) on delete cascade,
- name varchar(256),
value numeric,
- primary key (id, name)
+ primary key (run_id, name)
);
-create table annot_wf_txt
- ( id varchar(256) references workflow (id) on delete cascade,
+create table annot_run_text
+ ( run_id varchar(256) references run (id) on delete cascade,
name varchar(256),
value varchar(2048),
- primary key (id, name)
+ primary key (run_id, name)
);
-create table annot_wf_bool
- ( id varchar(256) references workflow (id) on delete cascade,
- name varchar(2048),
- value boolean,
- primary key (id, name)
- );
-
-create view parent_of as
- select process_id as parent,dataset_id as child from ds_usage where direction='O'
+create view provenance_graph as
+ select process_id as parent,dataset_id as child from dataset_production
union
- select dataset_id as parent,process_id as child from ds_usage where direction='I'
+ select dataset_id as parent,process_id as child from dataset_consumption
union
- select out_id as parent,in_id as child from ds_containment;
+ select out_id as parent,in_id as child from dataset_containment;
--- extrainfo stores lines generated by the SWIFT_EXTRA_INFO feature
--- extrainfo will be processes into annotation tables
--- CREATE TABLE extrainfo
--- ( id varchar(256) PRIMARY KEY REFERENCES execute (id) ON DELETE CASCADE,
--- extrainfo varchar(2048)
--- );
--- The above dataset_* tables are the original containment representation
--- of datasets.
--- Now here are some tables to represent complex datasets differently, in
--- terms of accessors and constructors.
--- It is unclear which is the better representation.
+CREATE VIEW run_date AS
+ SELECT id, log_filename, swift_version, import_status,
+ to_timestamp(start_time) as start_time, duration
+ FROM run;
-create table createarray
- ( id varchar(256) primary key
- );
+CREATE VIEW annot_text AS
+ SELECT *
+ FROM annot_run_text
+ UNION
+ SELECT *
+ FROM annot_dataset_text
+ UNION
+ SELECT *
+ FROM annot_process_text;
-create table createarray_member
- ( array_id varchar(256) references createarray (id) on delete cascade,
- ix varchar(256),
- member_id varchar(256),
- primary key (array_id, ix)
- );
+CREATE VIEW annot_numeric AS
+ SELECT *
+ FROM annot_run_numeric
+ UNION
+ SELECT *
+ FROM annot_dataset_numeric
+ UNION
+ SELECT *
+ FROM annot_process_numeric;
--- TODO step
-create table array_range
- ( array_id varchar(256) references createarray (id) on delete cascade,
- from_id varchar(256),
- to_id varchar(256),
- step_id varchar(256), -- nullable, if step is unspecified
- primary key (array_id,from_id,to_id,step_id)
- );
-
--- this GRANT does not work for sqlite; you'll get a syntax error but
--- ignore it, as it is not needed in sqlite
-grant all on
- dataset,
- file,
- variable,
- ds_containment,
- process,
- execute,
- execute2,
- workflow,
- ds_usage,
- annot_ds_num,
- annot_ds_txt,
- annot_ds_bool,
- annot_p_num,
- annot_p_txt,
- annot_p_bool,
- annot_wf_num,
- annot_wf_txt,
- annot_wf_bool,
--- extrainfo,
- createarray,
- createarray_member,
- array_range
-to public, operators;
Modified: provenancedb/prov-to-sql.sh
===================================================================
--- provenancedb/prov-to-sql.sh 2011-03-23 19:02:12 UTC (rev 4223)
+++ provenancedb/prov-to-sql.sh 2011-03-27 21:54:23 UTC (rev 4224)
@@ -8,22 +8,23 @@
export WF="${RUNID}"
echo Generating SQL for $RUNID
+rm -f tmp-u.sql tmp-ds.sql tmp-p.sql tmp-e.sql tmp-e2.sql tmp-dsu.sql tmp-dsc.sql tmp-f.sql tmp-v.sql tmp-import.sql import.sql extrainfo.sql fields.txt
# this gives a distinction between the root process for a workflow and the
# workflow itself. perhaps better to model the workflow as a process
-echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('${WFID}0', 'rootthread', '$RUNID', '$WF');" > tmp-p.sql
+echo "INSERT INTO process (id, type, name, run_id) VALUES ('${WFID}0', 'rootthread', '$RUNID', '$WF');" > tmp-p.sql
while read time duration thread localthread endstate tr_name scratch; do
- echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$thread', 'execute', '$tr_name', '$WF');" >> tmp-p.sql
- echo "INSERT INTO execute (id, start_time, duration, final_state, scratch) VALUES ('$thread', $time, $duration, '$endstate', '$scratch');" > tmp-e.sql
+ echo "INSERT INTO process (id, type, name, run_id) VALUES ('$thread', 'execute', '$tr_name', '$WF');" >> tmp-p.sql
+ echo "INSERT INTO app_invocation (id, start_time, duration, final_state, scratch) VALUES ('$thread', $time, $duration, '$endstate', '$scratch');" >> tmp-e.sql
done < execute.global.event
while read start_time duration globalid id endstate thread site scratch; do
# cut off the last component of the thread, so that we end up at the
# parent thread id which should correspond with the execute-level ID
inv_id="$WFID$(echo $thread | sed 's/-[^-]*$//')"
- echo "INSERT INTO execute2 (id, execute_id, start_time, duration, final_state, site) VALUES ('$globalid', '$inv_id', $start_time, $duration, '$endstate', '$site');" > tmp-e2.sql
+ echo "INSERT INTO app_execution (id, app_invocation_id, start_time, duration, final_state, site) VALUES ('$globalid', '$inv_id', $start_time, $duration, '$endstate', '$site');" >> tmp-e2.sql
done < execute2.global.event
while read col1 col2 col3 col4 col5 thread name lhs rhs result; do
@@ -41,13 +42,13 @@
result=$(echo $result | sed -e 's/tag:benc@ci.uchicago.edu,2008:swift://g')
fi
- echo "INSERT INTO dataset (id) VALUES ('$lhs');" > tmp-ds.sql
+ echo "INSERT INTO dataset (id) VALUES ('$lhs');" >> tmp-ds.sql
echo "INSERT INTO dataset (id) VALUES ('$rhs');" >> tmp-ds.sql
echo "INSERT INTO dataset (id) VALUES ('$result');" >> tmp-ds.sql
- echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$operatorid', 'operator', '$name', '$WF');" >> tmp-p.sql
- echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$operatorid', 'I', '$lhs', 'lhs');" > tmp-dsu.sql
- echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$operatorid', 'I', '$rhs', 'rhs');" >> tmp-dsu.sql
- echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$operatorid', 'O', '$result', 'result');" >> tmp-dsu.sql
+ echo "INSERT INTO process (id, type, name, run_id) VALUES ('$operatorid', 'operator', '$name', '$WF');" >> tmp-p.sql
+ echo "INSERT INTO dataset_consumption (process_id, dataset_id, param_name) VALUES ('$operatorid', '$lhs', 'lhs');" >> tmp-dsu.sql
+ echo "INSERT INTO dataset_consumption (process_id, dataset_id, param_name) VALUES ('$operatorid', '$rhs', 'rhs');" >> tmp-dsu.sql
+ echo "INSERT INTO dataset_production (process_id, dataset_id, param_name) VALUES ('$operatorid', '$result', 'result');" >> tmp-dsu.sql
done < operators.txt
while read id name output; do
@@ -55,8 +56,8 @@
output=$(echo $output | sed -e 's/tag:benc@ci.uchicago.edu,2008:swift://g')
fi
echo "INSERT INTO dataset (id) VALUES ('$output');" >> tmp-ds.sql
- echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$id', 'function', '$name', '$WF');" >> tmp-p.sql
- echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$id', 'O', '$output', 'result');" >> tmp-dsu.sql
+ echo "INSERT INTO process (id, type, name, run_id) VALUES ('$id', 'function', '$name', '$WF');" >> tmp-p.sql
+ echo "INSERT INTO dataset_production (process_id, dataset_id, param_name) VALUES ('$id', '$output', 'result');" >> tmp-dsu.sql
done < functions.txt
while read id value; do
@@ -67,12 +68,12 @@
fi
echo "INSERT INTO dataset (id) VALUES ('$value');" >> tmp-ds.sql
- echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$id', 'I', '$value', 'undefined');" >> tmp-dsu.sql
+ echo "INSERT INTO dataset_consumption (process_id, dataset_id, param_name) VALUES ('$id', '$value', 'undefined');" >> tmp-dsu.sql
done < function-inputs.txt
while read thread appname; do
- echo "UPDATE execute SET procedure_name='$appname' WHERE id='$thread';" >> tmp-e.sql
+ echo "UPDATE app_invocation SET procedure_name='$appname' WHERE id='$thread';" >> tmp-e.sql
done < invocation-procedure-names.txt
while read outer inner; do
@@ -84,7 +85,7 @@
echo "INSERT INTO dataset (id) VALUES ('$outer');" >> tmp-ds.sql
echo "INSERT INTO dataset (id) VALUES ('$inner');" >> tmp-ds.sql
- echo "INSERT INTO ds_containment (out_id, in_id) VALUES ('$outer', '$inner');" > tmp-dsc.sql
+ echo "INSERT INTO dataset_containment (out_id, in_id) VALUES ('$outer', '$inner');" >> tmp-dsc.sql
done < tie-containers.txt
while read dataset filename; do
@@ -94,7 +95,7 @@
fi
echo "INSERT INTO dataset (id) VALUES ('$dataset');" >> tmp-ds.sql
- echo "INSERT INTO file (id, filename) VALUES ('$dataset', '$filename');" > tmp-f.sql
+ echo "INSERT INTO file (id, filename) VALUES ('$dataset', '$filename');" >> tmp-f.sql
done < dataset-filenames.txt
while read dataset value; do
@@ -104,77 +105,52 @@
fi
echo "INSERT INTO dataset (id) VALUES ('$dataset');" >> tmp-ds.sql
- echo "INSERT INTO variable (id, value) VALUES ('$dataset', '$value');" > tmp-v.sql
+ echo "INSERT INTO in_memory (id, value) VALUES ('$dataset', '$value');" >> tmp-v.sql
done < dataset-values.txt
while read start duration wfid rest; do
- echo "UPDATE workflow SET start_time=$start WHERE id='$WF';" > tmp-u.sql
- echo "UPDATE workflow SET duration=$duration WHERE id='$WF';" >> tmp-u.sql
+ echo "UPDATE run SET start_time=$start WHERE id='$WF';" >> tmp-u.sql
+ echo "UPDATE run SET duration=$duration WHERE id='$WF';" >> tmp-u.sql
done < workflow.event
-
-while read id extrainfo; do
- echo $extrainfo | awk -F ";" '{ for (i = 1; i <= NF; i++)
- print $i
- }' | awk -F "=" '{ print $1 " " $2 }' | awk -F ":" '{ print $1 " " $2 }' > fields.txt
- while read name type value; do
- if [ "$type" = "num" ]; then
- echo "INSERT INTO annot_p_num (id, name, value) VALUES ('$id', '$name', $value);" > tmp-import.sql
- fi
- if [ "$type" = "txt" ]; then
- echo "INSERT INTO annot_p_txt (id, name, value) VALUES ('$id', '$name', '$value');" >> tmp-import.sql
- fi
- if [ "$type" = "bool" ]; then
- echo "INSERT INTO annot_p_bool (id, name, value) VALUES ('$id', '$name', $value);" >> tmp-import.sql
- fi
- done < fields.txt
-done < extrainfo.txt
-
# TODO this could merge with other naming tables
while read start duration thread final_state procname ; do
if [ "$duration" != "last-event-line" ]; then
compoundid=$WFID$thread
- echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$compoundid', 'compound', '$procname', '$WF');" >> tmp-p.sql
+ echo "INSERT INTO process (id, type, name, run_id) VALUES ('$compoundid', 'compound', '$procname', '$WF');" >> tmp-p.sql
fi
done < compound.event
while read start duration thread final_state procname ; do
if [ "$duration" != "last-event-line" ]; then
fqid=$WFID$thread
- echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$fqid', 'internal', '$procname', '$WF');" >> tmp-p.sql
+ echo "INSERT INTO process (id, type, name, run_id) VALUES ('$fqid', 'internal', '$procname', '$WF');" >> tmp-p.sql
fi
done < internalproc.event
while read t ; do
thread="${WFID}$t"
- echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$thread', 'scope', 'scope', '$WF');" >> tmp-p.sql
+ echo "INSERT INTO process (id, type, name, run_id) VALUES ('$thread', 'scope', 'scope', '$WF');" >> tmp-p.sql
done < scopes.txt
while read thread direction dataset variable rest; do
- if [ "$direction" == "input" ] ; then
- dir=I
- else
- dir=O
- fi
if [ $version -le 3726 ]; then
dataset=$(echo $dataset | sed -e 's/tag:benc@ci.uchicago.edu,2008:swift://g')
fi
+ if [ "$direction" == "input" ] ; then
+ table=dataset_consumption
+ else
+ table=dataset_production
+ fi
+
echo "INSERT INTO dataset (id) VALUES ('$dataset');" >> tmp-ds.sql
- echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$thread', '$dir', '$dataset', '$variable');" >> tmp-dsu.sql
+ echo "INSERT INTO $table (process_id, dataset_id, param_name) VALUES ('$thread', '$dataset', '$variable');" >> tmp-dsu.sql
done < tie-data-invocs.txt
-while read id ; do
- echo "INSERT INTO createarray (array_id) VALUES ('$id');" >> tmp-import.sql
-done < createarray.txt
-
-while read arrayid index memberid ; do
- echo "INSERT INTO createarray_member (array_id, ix, member_id) VALUES ('$arrayid', '$index', '$memberid');" >> tmp-import.sql
-done < createarray-members.txt
-
-cat tmp-ds.sql | sort | uniq > import.sql
+cat tmp-ds.sql | sort | uniq >> import.sql
cat tmp-f.sql | sort | uniq >> import.sql
cat tmp-v.sql | sort | uniq >> import.sql
cat tmp-p.sql | sort | uniq >> import.sql
@@ -187,7 +163,23 @@
echo Sending SQL to DB
$SQLCMD < import.sql
-rm -f tmp-u.sql tmp-ds.sql tmp-p.sql tmp-e.sql tmp-e2.sql tmp-dsu.sql tmp-dsc.sql tmp-f.sql tmp-v.sql tmp-import.sql import.sql fields.txt
+while read execute2_id extrainfo; do
+ echo $extrainfo | awk -F ";" '{ for (i = 1; i <= NF; i++)
+ print $i
+ }' | awk -F "=" '{ print $1 " " $2 }' | awk -F ":" '{ print $1 " " $2 }' > fields.txt
+ id=$($SQLCMD --tuples-only -c "select app_invocation_id from app_execution where id='$execute2_id';" | awk '{print $1}')
+ while read name type value; do
+ if [ "$type" = "num" ]; then
+ echo "INSERT INTO annot_process_numeric (id, name, value) VALUES ('$id', '$name', $value);" >> extrainfo.sql
+ fi
+ if [ "$type" = "txt" ]; then
+ echo "INSERT INTO annot_process_text (id, name, value) VALUES ('$id', '$name', '$value');" >> extrainfo.sql
+ fi
+ done < fields.txt
+done < extrainfo.txt
+
+$SQLCMD < extrainfo.sql
+
echo Finished sending SQL to DB
Modified: provenancedb/swift-prov-import-all-logs
===================================================================
--- provenancedb/swift-prov-import-all-logs 2011-03-23 19:02:12 UTC (rev 4223)
+++ provenancedb/swift-prov-import-all-logs 2011-03-27 21:54:23 UTC (rev 4224)
@@ -38,7 +38,7 @@
echo -n "Log: $filename ... "
# TODO: does not work in sqlite
- EXISTING=$($SQLCMD --tuples-only -c "select count(*) from workflow where log_filename='$filename';")
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from run where log_filename='$filename';")
if [ "$EXISTING" -eq "0" ]; then
PROV_ENABLED=$(grep provenanceid $filename | wc -l)
@@ -55,7 +55,7 @@
export WF="${RUNID}"
- echo "INSERT INTO workflow (id, log_filename, swift_version, import_status) VALUES ('$WF','$filename','$version','$wfstatus');" | $SQLCMD
+ echo "INSERT INTO run (id, log_filename, swift_version, import_status) VALUES ('$WF','$filename','$version','$wfstatus');" | $SQLCMD
echo version $version in log file $filename
echo ============= will import =============
More information about the Swift-commit
mailing list