[Swift-commit] r3725 - provenancedb

noreply at svn.ci.uchicago.edu noreply at svn.ci.uchicago.edu
Thu Nov 25 16:57:40 CST 2010


Author: lgadelha
Date: 2010-11-25 16:57:39 -0600 (Thu, 25 Nov 2010)
New Revision: 3725

Added:
   provenancedb/pql_functions.sql
Modified:
   provenancedb/prov-init.sql
   provenancedb/prov-to-sql.sh
   provenancedb/swift-prov-import-all-logs
Log:


Added: provenancedb/pql_functions.sql
===================================================================
--- provenancedb/pql_functions.sql	                        (rev 0)
+++ provenancedb/pql_functions.sql	2010-11-25 22:57:39 UTC (rev 3725)
@@ -0,0 +1,95 @@
+-- SQL Functions
+
+CREATE OR REPLACE FUNCTION list_runs() RETURNS SETOF VARCHAR AS $$
+       SELECT DISTINCT(log_filename) FROM workflow;
+$$ LANGUAGE SQL;
+
+-- lists distinct processes by name in a workflow
+
+CREATE OR REPLACE FUNCTION process_names(wf_id VARCHAR) RETURNS SETOF VARCHAR AS $$
+	   SELECT DISTINCT(process.name)
+	   FROM   process
+       WHERE  process.workflow_id=$1;
+$$ LANGUAGE SQL;
+
+-- lists variations in a parameter's value across workflows
+
+DROP TYPE param_across_wf_type CASCADE;
+CREATE TYPE param_across_wf_type AS (workflow VARCHAR, parameter VARCHAR, value VARCHAR);
+
+CREATE OR REPLACE FUNCTION param_across_wf(param_name VARCHAR) RETURNS SETOF param_across_wf_type AS $$
+	SELECT workflow.log_filename,ds_usage.param_name,variable.value
+	FROM   variable,ds_usage,process,workflow
+	WHERE  variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND 
+	       process.workflow_id=workflow.id AND ds_usage.param_name=$1 
+	GROUP BY workflow.log_filename,ds_usage.param_name,variable.value;
+$$ LANGUAGE SQL;
+
+-- correlate a parameter with workflow runtime statistics
+DROP TYPE correlate_param_runtime_type CASCADE;
+CREATE TYPE correlate_param_runtime_type AS (workflow VARCHAR, workflow_starttime TIMESTAMP WITH TIME ZONE, workflow_duration NUMERIC, parameter VARCHAR, parameter_value VARCHAR);
+
+CREATE OR REPLACE FUNCTION correlate_param_runtime(param_name VARCHAR) RETURNS SETOF correlate_param_runtime_type AS $$
+	SELECT A.workflow,to_timestamp(B.start_time),B.duration,A.parameter,A.value
+	FROM   param_across_wf($1) AS A, workflow AS B
+	WHERE  A.workflow=B.log_filename;
+$$ LANGUAGE SQL;
+
+-- OOPS-specific functions
+
+CREATE OR REPLACE FUNCTION list_oops_runs() RETURNS SETOF VARCHAR AS $$
+	SELECT DISTINCT(value) FROM annot_wf_txt WHERE annot_wf_txt.name=('oops_run_id');
+$$ LANGUAGE SQL;
+
+DROP TYPE oops_param_across_wf_type CASCADE;
+CREATE TYPE oops_param_across_wf_type AS (oops_run_id VARCHAR, param_name VARCHAR, variable VARCHAR);
+
+CREATE OR REPLACE FUNCTION oops_param_across_wf(VARCHAR) RETURNS SETOF oops_param_across_wf_type AS $$
+	SELECT annot_wf_txt.value,ds_usage.param_name,variable.value
+	FROM   variable,ds_usage,process,annot_wf_txt
+	WHERE  variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND process.workflow_id=annot_wf_txt.id AND
+	       ds_usage.param_name=$1 AND annot_wf_txt.name='oops_run_id'
+	GROUP BY annot_wf_txt.name,annot_wf_txt.value,ds_usage.param_name,variable.value;
+$$ LANGUAGE SQL;
+
+CREATE OR REPLACE FUNCTION list_oops_runs() RETURNS SETOF VARCHAR AS $$
+       SELECT DISTINCT(value) FROM annot_wf_txt WHERE annot_wf_txt.name=('oops_run_id');
+$$ LANGUAGE SQL;
+
+DROP TYPE oops_summary CASCADE;
+CREATE TYPE oops_summary AS (oops_run_id VARCHAR, start_time TIMESTAMP WITH TIME ZONE, duration_sec NUMERIC, swift_version VARCHAR);
+
+CREATE OR REPLACE FUNCTION oops_run_summary(varchar) RETURNS SETOF oops_summary AS $$
+       SELECT annot_wf_txt.value as oops_run_id, to_timestamp(workflow.start_time) as start_time,
+              workflow.duration as duration_sec,workflow.swift_version as swift_version
+       FROM   annot_wf_txt,workflow
+       WHERE  annot_wf_txt.id=workflow.id and annot_wf_txt.name='oops_run_id' and annot_wf_txt.value=$1;
+$$ LANGUAGE SQL;
+
+CREATE OR REPLACE FUNCTION oops_process_names(varchar) RETURNS SETOF varchar AS $$
+	   SELECT DISTINCT(process.name)
+	   FROM   process, annot_wf_txt
+       WHERE  process.workflow_id=annot_wf_txt.id AND 
+              annot_wf_txt.name='oops_run_id' AND annot_wf_txt.value=$1;
+$$ LANGUAGE SQL;
+
+DROP TYPE oops_wf_param_summary CASCADE;
+CREATE TYPE oops_wf_param_summary AS (oops_run_id varchar, param_name varchar, value varchar);
+
+CREATE OR REPLACE FUNCTION oops_variable_summary() RETURNS SETOF oops_wf_param_summary AS $$
+	SELECT annot_wf_txt.value,ds_usage.param_name,variable.value 
+	FROM   variable,ds_usage,process,annot_wf_txt
+	WHERE  variable.id=ds_usage.dataset_id and ds_usage.process_id=process.id and process.workflow_id=annot_wf_txt.id;
+$$ LANGUAGE SQL;
+
+DROP TYPE oops_param_summary CASCADE;
+CREATE TYPE oops_param_summary AS (param_name varchar, value varchar);
+
+CREATE OR REPLACE FUNCTION oops_science_summary(varchar) RETURNS SETOF oops_param_summary AS $$
+	SELECT ds_usage.param_name,variable.value
+	FROM   variable,ds_usage,process,annot_wf_txt
+	WHERE  variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND process.workflow_id=annot_wf_txt.id AND
+	       (ds_usage.param_name='proteinId' OR ds_usage.param_name='targetId' OR ds_usage.param_name='seqFile' OR 
+	       ds_usage.param_name='prot' OR ds_usage.param_name='prepTarFile' OR ds_usage.param_name='nSim') AND
+	       annot_wf_txt.name='oops_run_id' AND annot_wf_txt.value=$1;
+$$ LANGUAGE SQL;

Modified: provenancedb/prov-init.sql
===================================================================
--- provenancedb/prov-init.sql	2010-11-24 17:18:12 UTC (rev 3724)
+++ provenancedb/prov-init.sql	2010-11-25 22:57:39 UTC (rev 3725)
@@ -19,7 +19,7 @@
 DROP TABLE annot_wf_num CASCADE;
 DROP TABLE annot_wf_txt CASCADE;
 DROP TABLE annot_wf_bool CASCADE;
-DROP TABLE extrainfo CASCADE;
+-- DROP TABLE extrainfo CASCADE;
 DROP TABLE createarray CASCADE;
 DROP TABLE createarray_member CASCADE;
 DROP TABLE array_range CASCADE;
@@ -30,12 +30,12 @@
 -- Might be interesting to store xml translation of the swiftscript code
 -- here for prospective provenance/versioning
 CREATE TABLE workflow
-    (id            varchar(256) PRIMARY KEY,
-     log_filename  varchar(2048),
-     swift_version varchar(16),
-     import_status varchar(16),
-     start_time    numeric,
-     duration      numeric
+    (id            VARCHAR(256) PRIMARY KEY,
+     log_filename  VARCHAR(2048),
+     swift_version VARCHAR(16),
+     import_status VARCHAR(16),
+     start_time    NUMERIC,
+     duration      NUMERIC
     );
 
 -- workflow_run stores the start time and duration for each workflow
@@ -48,13 +48,13 @@
 
 -- dataset stores all dataset identifiers.
 CREATE TABLE dataset
-    (id varchar(256) PRIMARY KEY
+    (id VARCHAR(256) PRIMARY KEY
     );
 
 -- file stores the filename mapped to each dataset. 
 CREATE TABLE file
-    ( id         varchar(256) PRIMARY KEY REFERENCES dataset (id) ON DELETE CASCADE,
-      filename   varchar(2048)
+    ( id         VARCHAR(256) PRIMARY KEY REFERENCES dataset (id) ON DELETE CASCADE,
+      filename   VARCHAR(2048)
     );
 
 -- dataset_values stores the value for each dataset which is known to have
@@ -63,8 +63,8 @@
 -- example) SQL numerical operations should not be expected to work, even
 -- though the user knows that a particular dataset stores a numeric value.
 CREATE TABLE variable
-    ( id    varchar(256) PRIMARY KEY REFERENCES dataset (id) ON DELETE CASCADE,
-      value varchar(2048)
+    ( id    VARCHAR(256) PRIMARY KEY REFERENCES dataset (id) ON DELETE CASCADE,
+      value VARCHAR(2048)
     );
 
 -- dataset_containment stores the containment hierarchy between
@@ -75,8 +75,8 @@
 -- a containment hierarchy. The relationship (such as array index or
 -- structure member name) should also be stored in this table.
 CREATE TABLE ds_containment
-    ( out_id varchar(256) REFERENCES dataset (id) ON DELETE CASCADE,
-      in_id  varchar(256) REFERENCES dataset (id) ON DELETE CASCADE,
+    ( out_id VARCHAR(256) REFERENCES dataset (id) ON DELETE CASCADE,
+      in_id  VARCHAR(256) REFERENCES dataset (id) ON DELETE CASCADE,
       PRIMARY KEY (out_id,in_id)
     );
 
@@ -89,10 +89,10 @@
 -- process types: internal, rootthread, execute, function, compound, scope, operator
 -- maybe create a table for each type?
 CREATE TABLE process
-    (id          varchar(256) PRIMARY KEY, 
-     type        varchar(16),
-     name	 varchar(256), -- in the case of an execute this refers to the transformation name in tc.data
-     workflow_id varchar(256) REFERENCES workflow (id) ON DELETE CASCADE   -- normalize: workflow_id of sub-procedure determined
+    (id          VARCHAR(256) PRIMARY KEY, 
+     type        VARCHAR(16),
+     name	     VARCHAR(256), -- in the case of an execute this refers to the transformation name in tc.data
+     workflow_id VARCHAR(256) REFERENCES workflow (id) ON DELETE CASCADE   -- normalize: workflow_id of sub-procedure determined
           	 	      		 	       	  	 	   -- by compound procedure 
     );
 
@@ -100,24 +100,24 @@
 -- each execute is identified by a unique URI. other information from
 -- swift logs is also stored here. an execute is an OPM process.
 CREATE TABLE execute
-    (id             varchar(256) PRIMARY KEY REFERENCES process (id) ON DELETE CASCADE,  
-     procedure_name varchar(256), -- name of the app procedure that invokes the transformation
-     start_time     numeric,
-     duration       numeric,
-     final_state    varchar(16),
-     scratch        varchar(2048)
+    (id             VARCHAR(256) PRIMARY KEY REFERENCES process (id) ON DELETE CASCADE,  
+     procedure_name VARCHAR(256), -- name of the app procedure that invokes the transformation
+     start_time     NUMERIC,
+     duration       NUMERIC,
+     final_state    VARCHAR(16),
+     scratch        VARCHAR(2048)
     );
 
 -- this gives information about each execute2, which is an attempt to
 -- perform an execution. the execute2 id is tied to per-execution-attempt
 -- information such as wrapper logs
 CREATE TABLE execute2
-    (id          varchar(256) PRIMARY KEY,
-     execute_id  varchar(256) REFERENCES execute (id) ON DELETE CASCADE, 
-     start_time  numeric,
-     duration    numeric,
-     final_state varchar(16),
-     site        varchar(256)
+    (id          VARCHAR(256) PRIMARY KEY,
+     execute_id  VARCHAR(256) REFERENCES execute (id) ON DELETE CASCADE, 
+     start_time  NUMERIC,
+     duration    NUMERIC,
+     final_state VARCHAR(16),
+     site        VARCHAR(256)
     );
 
 -- dataset_usage records usage relationships between processes and datasets;
@@ -125,10 +125,10 @@
 -- application procedure invocation; in OPM terms, the artificts which are
 -- input to and output from each process that is a Swift execution
 CREATE TABLE ds_usage
-    (process_id varchar(256) REFERENCES process(id) ON DELETE CASCADE, 
-     direction  char(1), -- I or O for input or output
-     dataset_id varchar(256) REFERENCES dataset(id) ON DELETE CASCADE,
-     param_name varchar(256), -- the name of the parameter in this execute that
+    (process_id VARCHAR(256) REFERENCES process(id) ON DELETE CASCADE, 
+     direction  CHAR(1), -- I or O for input or output
+     dataset_id VARCHAR(256) REFERENCES dataset(id) ON DELETE CASCADE,
+     param_name VARCHAR(256), -- the name of the parameter in this execute that
                               -- this dataset was bound to. sometimes this must
                               -- be contrived (for example, in positional varargs)
      PRIMARY KEY (process_id,direction,dataset_id,param_name)
@@ -136,65 +136,65 @@
 
 -- annotations
 CREATE TABLE annot_ds_num
-   ( id    varchar(256) REFERENCES dataset (id) ON DELETE CASCADE, 
-     name  varchar(256),
-     value numeric,
+   ( id    VARCHAR(256) REFERENCES dataset (id) ON DELETE CASCADE, 
+     name  VARCHAR(256),
+     value NUMERIC,
      PRIMARY KEY (id, name)
    );
 
 CREATE TABLE annot_ds_txt
-   ( id    varchar(256) REFERENCES dataset (id) ON DELETE CASCADE, 
-     name  varchar(256),
-     value varchar(2048),
+   ( id    VARCHAR(256) REFERENCES dataset (id) ON DELETE CASCADE, 
+     name  VARCHAR(256),
+     value VARCHAR(2048),
      PRIMARY KEY (id, name)
    );
 
 CREATE TABLE annot_ds_bool
-   ( id    varchar(256) REFERENCES dataset (id) ON DELETE CASCADE, 
-     name  varchar(256),
-     value boolean,
+   ( id    VARCHAR(256) REFERENCES dataset (id) ON DELETE CASCADE, 
+     name  VARCHAR(256),
+     value BOOLEAN,
      PRIMARY KEY (id, name)
    );
 
 CREATE TABLE annot_p_num
-   ( id    varchar(256) REFERENCES process (id) ON DELETE CASCADE, 
-     name  varchar(256),
-     value numeric,
+   ( id    VARCHAR(256) REFERENCES process (id) ON DELETE CASCADE, 
+     name  VARCHAR(256),
+     value NUMERIC,
      PRIMARY KEY (id, name)
    );
 
 CREATE TABLE annot_p_txt
-   ( id    varchar(256) REFERENCES process (id) ON DELETE CASCADE, 
-     name  varchar(256),
-     value varchar(2048),
+   ( id    VARCHAR(256) REFERENCES process (id) ON DELETE CASCADE, 
+     name  VARCHAR(256),
+     value VARCHAR(2048),
      PRIMARY KEY (id, name)
    );
 
 CREATE TABLE annot_p_bool
-   ( id    varchar(256) REFERENCES process (id) ON DELETE CASCADE, 
-     name  varchar(256),
-     value boolean,
+   ( id    VARCHAR(256) REFERENCES process (id) ON DELETE CASCADE, 
+     name  VARCHAR(256),
+     value BOOLEAN,
      PRIMARY KEY (id, name)
    );
 
 CREATE TABLE annot_wf_num
-   ( id    varchar(256) REFERENCES workflow (id) ON DELETE CASCADE, 
-     name  varchar(256),
-     value numeric,
+   ( id    VARCHAR(256) REFERENCES workflow (id) ON DELETE CASCADE, 
+     name  VARCHAR(256),
+     value NUMERIC,
      PRIMARY KEY (id, name)
    );
 
 CREATE TABLE annot_wf_txt
-   ( id    varchar(256) REFERENCES workflow (id) ON DELETE CASCADE, 
-     name  varchar(256),
-     value varchar(2048),
+   ( id    VARCHAR(256) REFERENCES workflow (id) ON DELETE CASCADE, 
+     name  VARCHAR(256),
+     value VARCHAR(2048),
      PRIMARY KEY (id, name)
    );
 
 CREATE TABLE annot_wf_bool
-   ( id    varchar(256) REFERENCES workflow (id) ON DELETE CASCADE, 
-     name  varchar(2048),
-     value boolean,
+   ( id    VARCHAR(256) REFERENCES workflow (id) ON DELETE CASCADE, 
+     name  VARCHAR(2048),
+     value BOOLEAN,
      PRIMARY KEY (id, name)
    );
 
@@ -212,78 +212,30 @@
 -- It is unclear which is the better representation.
 
 CREATE TABLE createarray
-    ( id varchar(256) PRIMARY KEY
+    ( id VARCHAR(256) PRIMARY KEY
     );
 
 CREATE TABLE createarray_member
-    ( array_id  varchar(256) REFERENCES createarray (id) ON DELETE CASCADE,
-      ix        varchar(256),
-      member_id varchar(256),
+    ( array_id  VARCHAR(256) REFERENCES createarray (id) ON DELETE CASCADE,
+      ix        VARCHAR(256),
+      member_id VARCHAR(256),
       PRIMARY KEY (array_id, ix)
     );
 
 -- TODO step
 CREATE TABLE array_range
-    ( array_id varchar(256) REFERENCES createarray (id) ON DELETE CASCADE,
-      from_id  varchar(256),
-      to_id    varchar(256),
-      step_id  varchar(256), -- nullable, if step is unspecified
+    ( array_id VARCHAR(256) REFERENCES createarray (id) ON DELETE CASCADE,
+      from_id  VARCHAR(256),
+      to_id    VARCHAR(256),
+      step_id  VARCHAR(256), -- nullable, if step is unspecified
       PRIMARY KEY (array_id,from_id,to_id,step_id)
     );
 
 
--- lists distinct processes by name in a workflow
 
-CREATE OR REPLACE FUNCTION process_names(varchar) RETURNS SETOF varchar AS $$
-	   SELECT DISTINCT(process.name)
-	   FROM   process
-       WHERE  process.workflow_id=$1;
-$$ LANGUAGE SQL;
-
-
--- OOPS-specific functions
-
-CREATE OR REPLACE FUNCTION list_oops_runs() RETURNS SETOF varchar AS $$
-       SELECT DISTINCT(value) FROM annot_wf_txt WHERE annot_wf_txt.name=('oops_run_id');
-$$ LANGUAGE SQL;
-
-CREATE TYPE oops_summary AS (oops_run_id varchar, start_time timestamp with time zone, duration_sec numeric, swift_version varchar);
-
-CREATE OR REPLACE FUNCTION oops_run_summary(varchar) RETURNS SETOF oops_summary AS $$
-       SELECT annot_wf_txt.value as oops_run_id, to_timestamp(workflow.start_time) as start_time,
-              workflow.duration as duration_sec,workflow.swift_version as swift_version
-       FROM   annot_wf_txt,workflow
-       WHERE  annot_wf_txt.id=workflow.id and annot_wf_txt.name='oops_run_id' and annot_wf_txt.value=$1;
-$$ LANGUAGE SQL;
-
-CREATE OR REPLACE FUNCTION oops_process_names(varchar) RETURNS SETOF varchar AS $$
-	   SELECT DISTINCT(process.name)
-	   FROM   process, annot_wf_txt
-       WHERE  process.workflow_id=annot_wf_txt.id AND 
-              annot_wf_txt.name='oops_run_id' AND annot_wf_txt.value=$1;
-$$ LANGUAGE SQL;
-
-CREATE TYPE oops_variable_summary AS (oops_run_id varchar, param_name varchar, value varchar);
-
-CREATE OR REPLACE FUNCTION oops_variable_summary(varchar) RETURNS SETOF oops_variable_summary $$
-	SELECT annot_wf_txt.value,ds_usage.param_name,variable.value 
-	FROM   variable,ds_usage,process,annot_wf_txt
-	WHERE  variable.id=ds_usage.dataset_id and ds_usage.process_id=process.id and process.workflow_id=annot_wf_txt.id;
-$$ LANGUAGE SQL;
-
-CREATE OR REPLACE FUNCTION oops_science_summary(varchar) RETURNS SETOF oops_variable_summary AS $$
-	SELECT ds_usage.param_name,variable.value
-	FROM   variable,ds_usage,process,annot_wf_txt
-	WHERE  variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND process.workflow_id=annot_wf_txt.id AND
-	       (ds_usage.param_name='proteinId' OR ds_usage.param_name='targetId' OR ds_usage.param_name='seqFile' OR 
-	       ds_usage.param_name='prot' OR ds_usage.param_name='prepTarFile' OR ds_usage.param_name='nSim') AND
-	       annot_wf_txt.name='oops_run_id' AND annot_wf_txt.value='h733';
-$$ LANGUAGE SQL;
-
-
 -- this GRANT does not work for sqlite; you'll get a syntax error but
 -- ignore it, as it is not needed in sqlite
-grant all on  
+GRANT ALL ON  
  dataset,
  file,
  variable,
@@ -306,4 +258,4 @@
  createarray,
  createarray_member,
  array_range
-to public, operators;
+TO public, operators;

Modified: provenancedb/prov-to-sql.sh
===================================================================
--- provenancedb/prov-to-sql.sh	2010-11-24 17:18:12 UTC (rev 3724)
+++ provenancedb/prov-to-sql.sh	2010-11-25 22:57:39 UTC (rev 3725)
@@ -27,8 +27,6 @@
     echo "INSERT INTO execute2 (id, execute_id, start_time, duration, final_state, site) VALUES ('$globalid', '$inv_id', $start_time, $duration, '$endstate', '$site');" >> tmp-e2.sql
 done < execute2.global.event
 
-
-
 while read col1 col2 col3 col4 col5 threadst namest lhsst rhsst resultst; do
     thread=`echo $threadst | awk 'BEGIN { FS = "=" }; {print $2}'`
     name=`echo $namest | awk 'BEGIN { FS = "=" }; {print $2}'`

Modified: provenancedb/swift-prov-import-all-logs
===================================================================
--- provenancedb/swift-prov-import-all-logs	2010-11-24 17:18:12 UTC (rev 3724)
+++ provenancedb/swift-prov-import-all-logs	2010-11-25 22:57:39 UTC (rev 3725)
@@ -50,7 +50,6 @@
 	    fi
 
 	    export RUNID=$(basename $filename .log)
-	    # changing tag authority from e-mail benc at ci.uchicago.edu to dnsname ci.uchicago.edu
 	    export WF="tag:ci.uchicago.edu,2008:swiftlogs:execute:${RUNID}:run"
 	    
 	    echo "INSERT INTO workflow (id, log_filename, swift_version, import_status) VALUES ('$WF','$filename','$version','$wfstatus');" | $SQLCMD




More information about the Swift-commit mailing list