[Swift-commit] r3725 - provenancedb
noreply at svn.ci.uchicago.edu
noreply at svn.ci.uchicago.edu
Thu Nov 25 16:57:40 CST 2010
Author: lgadelha
Date: 2010-11-25 16:57:39 -0600 (Thu, 25 Nov 2010)
New Revision: 3725
Added:
provenancedb/pql_functions.sql
Modified:
provenancedb/prov-init.sql
provenancedb/prov-to-sql.sh
provenancedb/swift-prov-import-all-logs
Log:
Added: provenancedb/pql_functions.sql
===================================================================
--- provenancedb/pql_functions.sql (rev 0)
+++ provenancedb/pql_functions.sql 2010-11-25 22:57:39 UTC (rev 3725)
@@ -0,0 +1,95 @@
+-- SQL Functions
+
+CREATE OR REPLACE FUNCTION list_runs() RETURNS SETOF VARCHAR AS $$
+ SELECT DISTINCT(log_filename) FROM workflow;
+$$ LANGUAGE SQL;
+
+-- lists distinct processes by name in a workflow
+
+CREATE OR REPLACE FUNCTION process_names(wf_id VARCHAR) RETURNS SETOF VARCHAR AS $$
+ SELECT DISTINCT(process.name)
+ FROM process
+ WHERE process.workflow_id=$1;
+$$ LANGUAGE SQL;
+
+-- lists variations in a parameter's value across workflows
+
+DROP TYPE param_across_wf_type CASCADE;
+CREATE TYPE param_across_wf_type AS (workflow VARCHAR, parameter VARCHAR, value VARCHAR);
+
+CREATE OR REPLACE FUNCTION param_across_wf(param_name VARCHAR) RETURNS SETOF param_across_wf_type AS $$
+ SELECT workflow.log_filename,ds_usage.param_name,variable.value
+ FROM variable,ds_usage,process,workflow
+ WHERE variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND
+ process.workflow_id=workflow.id AND ds_usage.param_name=$1
+ GROUP BY workflow.log_filename,ds_usage.param_name,variable.value;
+$$ LANGUAGE SQL;
+
+-- correlate a parameter with workflow runtime statistics
+DROP TYPE correlate_param_runtime_type CASCADE;
+CREATE TYPE correlate_param_runtime_type AS (workflow VARCHAR, workflow_starttime TIMESTAMP WITH TIME ZONE, workflow_duration NUMERIC, parameter VARCHAR, parameter_value VARCHAR);
+
+CREATE OR REPLACE FUNCTION correlate_param_runtime(param_name VARCHAR) RETURNS SETOF correlate_param_runtime_type AS $$
+ SELECT A.workflow,to_timestamp(B.start_time),B.duration,A.parameter,A.value
+ FROM param_across_wf($1) AS A, workflow AS B
+ WHERE A.workflow=B.log_filename;
+$$ LANGUAGE SQL;
+
+-- OOPS-specific functions
+
+CREATE OR REPLACE FUNCTION list_oops_runs() RETURNS SETOF VARCHAR AS $$
+ SELECT DISTINCT(value) FROM annot_wf_txt WHERE annot_wf_txt.name=('oops_run_id');
+$$ LANGUAGE SQL;
+
+DROP TYPE oops_param_across_wf_type CASCADE;
+CREATE TYPE oops_param_across_wf_type AS (oops_run_id VARCHAR, param_name VARCHAR, variable VARCHAR);
+
+CREATE OR REPLACE FUNCTION oops_param_across_wf(VARCHAR) RETURNS SETOF oops_param_across_wf_type AS $$
+ SELECT annot_wf_txt.value,ds_usage.param_name,variable.value
+ FROM variable,ds_usage,process,annot_wf_txt
+ WHERE variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND process.workflow_id=annot_wf_txt.id AND
+ ds_usage.param_name=$1 AND annot_wf_txt.name='oops_run_id'
+ GROUP BY annot_wf_txt.name,annot_wf_txt.value,ds_usage.param_name,variable.value;
+$$ LANGUAGE SQL;
+
+CREATE OR REPLACE FUNCTION list_oops_runs() RETURNS SETOF VARCHAR AS $$
+ SELECT DISTINCT(value) FROM annot_wf_txt WHERE annot_wf_txt.name=('oops_run_id');
+$$ LANGUAGE SQL;
+
+DROP TYPE oops_summary CASCADE;
+CREATE TYPE oops_summary AS (oops_run_id VARCHAR, start_time TIMESTAMP WITH TIME ZONE, duration_sec NUMERIC, swift_version VARCHAR);
+
+CREATE OR REPLACE FUNCTION oops_run_summary(varchar) RETURNS SETOF oops_summary AS $$
+ SELECT annot_wf_txt.value as oops_run_id, to_timestamp(workflow.start_time) as start_time,
+ workflow.duration as duration_sec,workflow.swift_version as swift_version
+ FROM annot_wf_txt,workflow
+ WHERE annot_wf_txt.id=workflow.id and annot_wf_txt.name='oops_run_id' and annot_wf_txt.value=$1;
+$$ LANGUAGE SQL;
+
+CREATE OR REPLACE FUNCTION oops_process_names(varchar) RETURNS SETOF varchar AS $$
+ SELECT DISTINCT(process.name)
+ FROM process, annot_wf_txt
+ WHERE process.workflow_id=annot_wf_txt.id AND
+ annot_wf_txt.name='oops_run_id' AND annot_wf_txt.value=$1;
+$$ LANGUAGE SQL;
+
+DROP TYPE oops_wf_param_summary CASCADE;
+CREATE TYPE oops_wf_param_summary AS (oops_run_id varchar, param_name varchar, value varchar);
+
+CREATE OR REPLACE FUNCTION oops_variable_summary() RETURNS SETOF oops_wf_param_summary AS $$
+ SELECT annot_wf_txt.value,ds_usage.param_name,variable.value
+ FROM variable,ds_usage,process,annot_wf_txt
+ WHERE variable.id=ds_usage.dataset_id and ds_usage.process_id=process.id and process.workflow_id=annot_wf_txt.id;
+$$ LANGUAGE SQL;
+
+DROP TYPE oops_param_summary CASCADE;
+CREATE TYPE oops_param_summary AS (param_name varchar, value varchar);
+
+CREATE OR REPLACE FUNCTION oops_science_summary(varchar) RETURNS SETOF oops_param_summary AS $$
+ SELECT ds_usage.param_name,variable.value
+ FROM variable,ds_usage,process,annot_wf_txt
+ WHERE variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND process.workflow_id=annot_wf_txt.id AND
+ (ds_usage.param_name='proteinId' OR ds_usage.param_name='targetId' OR ds_usage.param_name='seqFile' OR
+ ds_usage.param_name='prot' OR ds_usage.param_name='prepTarFile' OR ds_usage.param_name='nSim') AND
+ annot_wf_txt.name='oops_run_id' AND annot_wf_txt.value=$1;
+$$ LANGUAGE SQL;
Modified: provenancedb/prov-init.sql
===================================================================
--- provenancedb/prov-init.sql 2010-11-24 17:18:12 UTC (rev 3724)
+++ provenancedb/prov-init.sql 2010-11-25 22:57:39 UTC (rev 3725)
@@ -19,7 +19,7 @@
DROP TABLE annot_wf_num CASCADE;
DROP TABLE annot_wf_txt CASCADE;
DROP TABLE annot_wf_bool CASCADE;
-DROP TABLE extrainfo CASCADE;
+-- DROP TABLE extrainfo CASCADE;
DROP TABLE createarray CASCADE;
DROP TABLE createarray_member CASCADE;
DROP TABLE array_range CASCADE;
@@ -30,12 +30,12 @@
-- Might be interesting to store xml translation of the swiftscript code
-- here for prospective provenance/versioning
CREATE TABLE workflow
- (id varchar(256) PRIMARY KEY,
- log_filename varchar(2048),
- swift_version varchar(16),
- import_status varchar(16),
- start_time numeric,
- duration numeric
+ (id VARCHAR(256) PRIMARY KEY,
+ log_filename VARCHAR(2048),
+ swift_version VARCHAR(16),
+ import_status VARCHAR(16),
+ start_time NUMERIC,
+ duration NUMERIC
);
-- workflow_run stores the start time and duration for each workflow
@@ -48,13 +48,13 @@
-- dataset stores all dataset identifiers.
CREATE TABLE dataset
- (id varchar(256) PRIMARY KEY
+ (id VARCHAR(256) PRIMARY KEY
);
-- file stores the filename mapped to each dataset.
CREATE TABLE file
- ( id varchar(256) PRIMARY KEY REFERENCES dataset (id) ON DELETE CASCADE,
- filename varchar(2048)
+ ( id VARCHAR(256) PRIMARY KEY REFERENCES dataset (id) ON DELETE CASCADE,
+ filename VARCHAR(2048)
);
-- dataset_values stores the value for each dataset which is known to have
@@ -63,8 +63,8 @@
-- example) SQL numerical operations should not be expected to work, even
-- though the user knows that a particular dataset stores a numeric value.
CREATE TABLE variable
- ( id varchar(256) PRIMARY KEY REFERENCES dataset (id) ON DELETE CASCADE,
- value varchar(2048)
+ ( id VARCHAR(256) PRIMARY KEY REFERENCES dataset (id) ON DELETE CASCADE,
+ value VARCHAR(2048)
);
-- dataset_containment stores the containment hierarchy between
@@ -75,8 +75,8 @@
-- a containment hierarchy. The relationship (such as array index or
-- structure member name) should also be stored in this table.
CREATE TABLE ds_containment
- ( out_id varchar(256) REFERENCES dataset (id) ON DELETE CASCADE,
- in_id varchar(256) REFERENCES dataset (id) ON DELETE CASCADE,
+ ( out_id VARCHAR(256) REFERENCES dataset (id) ON DELETE CASCADE,
+ in_id VARCHAR(256) REFERENCES dataset (id) ON DELETE CASCADE,
PRIMARY KEY (out_id,in_id)
);
@@ -89,10 +89,10 @@
-- process types: internal, rootthread, execute, function, compound, scope, operator
-- maybe create a table for each type?
CREATE TABLE process
- (id varchar(256) PRIMARY KEY,
- type varchar(16),
- name varchar(256), -- in the case of an execute this refers to the transformation name in tc.data
- workflow_id varchar(256) REFERENCES workflow (id) ON DELETE CASCADE -- normalize: workflow_id of sub-procedure determined
+ (id VARCHAR(256) PRIMARY KEY,
+ type VARCHAR(16),
+ name VARCHAR(256), -- in the case of an execute this refers to the transformation name in tc.data
+ workflow_id VARCHAR(256) REFERENCES workflow (id) ON DELETE CASCADE -- normalize: workflow_id of sub-procedure determined
-- by compound procedure
);
@@ -100,24 +100,24 @@
-- each execute is identified by a unique URI. other information from
-- swift logs is also stored here. an execute is an OPM process.
CREATE TABLE execute
- (id varchar(256) PRIMARY KEY REFERENCES process (id) ON DELETE CASCADE,
- procedure_name varchar(256), -- name of the app procedure that invokes the transformation
- start_time numeric,
- duration numeric,
- final_state varchar(16),
- scratch varchar(2048)
+ (id VARCHAR(256) PRIMARY KEY REFERENCES process (id) ON DELETE CASCADE,
+ procedure_name VARCHAR(256), -- name of the app procedure that invokes the transformation
+ start_time NUMERIC,
+ duration NUMERIC,
+ final_state VARCHAR(16),
+ scratch VARCHAR(2048)
);
-- this gives information about each execute2, which is an attempt to
-- perform an execution. the execute2 id is tied to per-execution-attempt
-- information such as wrapper logs
CREATE TABLE execute2
- (id varchar(256) PRIMARY KEY,
- execute_id varchar(256) REFERENCES execute (id) ON DELETE CASCADE,
- start_time numeric,
- duration numeric,
- final_state varchar(16),
- site varchar(256)
+ (id VARCHAR(256) PRIMARY KEY,
+ execute_id VARCHAR(256) REFERENCES execute (id) ON DELETE CASCADE,
+ start_time NUMERIC,
+ duration NUMERIC,
+ final_state VARCHAR(16),
+ site VARCHAR(256)
);
-- dataset_usage records usage relationships between processes and datasets;
@@ -125,10 +125,10 @@
-- application procedure invocation; in OPM terms, the artificts which are
-- input to and output from each process that is a Swift execution
CREATE TABLE ds_usage
- (process_id varchar(256) REFERENCES process(id) ON DELETE CASCADE,
- direction char(1), -- I or O for input or output
- dataset_id varchar(256) REFERENCES dataset(id) ON DELETE CASCADE,
- param_name varchar(256), -- the name of the parameter in this execute that
+ (process_id VARCHAR(256) REFERENCES process(id) ON DELETE CASCADE,
+ direction CHAR(1), -- I or O for input or output
+ dataset_id VARCHAR(256) REFERENCES dataset(id) ON DELETE CASCADE,
+ param_name VARCHAR(256), -- the name of the parameter in this execute that
-- this dataset was bound to. sometimes this must
-- be contrived (for example, in positional varargs)
PRIMARY KEY (process_id,direction,dataset_id,param_name)
@@ -136,65 +136,65 @@
-- annotations
CREATE TABLE annot_ds_num
- ( id varchar(256) REFERENCES dataset (id) ON DELETE CASCADE,
- name varchar(256),
- value numeric,
+ ( id VARCHAR(256) REFERENCES dataset (id) ON DELETE CASCADE,
+ name VARCHAR(256),
+ value NUMERIC,
PRIMARY KEY (id, name)
);
CREATE TABLE annot_ds_txt
- ( id varchar(256) REFERENCES dataset (id) ON DELETE CASCADE,
- name varchar(256),
- value varchar(2048),
+ ( id VARCHAR(256) REFERENCES dataset (id) ON DELETE CASCADE,
+ name VARCHAR(256),
+ value VARCHAR(2048),
PRIMARY KEY (id, name)
);
CREATE TABLE annot_ds_bool
- ( id varchar(256) REFERENCES dataset (id) ON DELETE CASCADE,
- name varchar(256),
- value boolean,
+ ( id VARCHAR(256) REFERENCES dataset (id) ON DELETE CASCADE,
+ name VARCHAR(256),
+ value BOOLEAN,
PRIMARY KEY (id, name)
);
CREATE TABLE annot_p_num
- ( id varchar(256) REFERENCES process (id) ON DELETE CASCADE,
- name varchar(256),
- value numeric,
+ ( id VARCHAR(256) REFERENCES process (id) ON DELETE CASCADE,
+ name VARCHAR(256),
+ value NUMERIC,
PRIMARY KEY (id, name)
);
CREATE TABLE annot_p_txt
- ( id varchar(256) REFERENCES process (id) ON DELETE CASCADE,
- name varchar(256),
- value varchar(2048),
+ ( id VARCHAR(256) REFERENCES process (id) ON DELETE CASCADE,
+ name VARCHAR(256),
+ value VARCHAR(2048),
PRIMARY KEY (id, name)
);
CREATE TABLE annot_p_bool
- ( id varchar(256) REFERENCES process (id) ON DELETE CASCADE,
- name varchar(256),
- value boolean,
+ ( id VARCHAR(256) REFERENCES process (id) ON DELETE CASCADE,
+ name VARCHAR(256),
+ value BOOLEAN,
PRIMARY KEY (id, name)
);
CREATE TABLE annot_wf_num
- ( id varchar(256) REFERENCES workflow (id) ON DELETE CASCADE,
- name varchar(256),
- value numeric,
+ ( id VARCHAR(256) REFERENCES workflow (id) ON DELETE CASCADE,
+ name VARCHAR(256),
+ value NUMERIC,
PRIMARY KEY (id, name)
);
CREATE TABLE annot_wf_txt
- ( id varchar(256) REFERENCES workflow (id) ON DELETE CASCADE,
- name varchar(256),
- value varchar(2048),
+ ( id VARCHAR(256) REFERENCES workflow (id) ON DELETE CASCADE,
+ name VARCHAR(256),
+ value VARCHAR(2048),
PRIMARY KEY (id, name)
);
CREATE TABLE annot_wf_bool
- ( id varchar(256) REFERENCES workflow (id) ON DELETE CASCADE,
- name varchar(2048),
- value boolean,
+ ( id VARCHAR(256) REFERENCES workflow (id) ON DELETE CASCADE,
+ name VARCHAR(2048),
+ value BOOLEAN,
PRIMARY KEY (id, name)
);
@@ -212,78 +212,30 @@
-- It is unclear which is the better representation.
CREATE TABLE createarray
- ( id varchar(256) PRIMARY KEY
+ ( id VARCHAR(256) PRIMARY KEY
);
CREATE TABLE createarray_member
- ( array_id varchar(256) REFERENCES createarray (id) ON DELETE CASCADE,
- ix varchar(256),
- member_id varchar(256),
+ ( array_id VARCHAR(256) REFERENCES createarray (id) ON DELETE CASCADE,
+ ix VARCHAR(256),
+ member_id VARCHAR(256),
PRIMARY KEY (array_id, ix)
);
-- TODO step
CREATE TABLE array_range
- ( array_id varchar(256) REFERENCES createarray (id) ON DELETE CASCADE,
- from_id varchar(256),
- to_id varchar(256),
- step_id varchar(256), -- nullable, if step is unspecified
+ ( array_id VARCHAR(256) REFERENCES createarray (id) ON DELETE CASCADE,
+ from_id VARCHAR(256),
+ to_id VARCHAR(256),
+ step_id VARCHAR(256), -- nullable, if step is unspecified
PRIMARY KEY (array_id,from_id,to_id,step_id)
);
--- lists distinct processes by name in a workflow
-CREATE OR REPLACE FUNCTION process_names(varchar) RETURNS SETOF varchar AS $$
- SELECT DISTINCT(process.name)
- FROM process
- WHERE process.workflow_id=$1;
-$$ LANGUAGE SQL;
-
-
--- OOPS-specific functions
-
-CREATE OR REPLACE FUNCTION list_oops_runs() RETURNS SETOF varchar AS $$
- SELECT DISTINCT(value) FROM annot_wf_txt WHERE annot_wf_txt.name=('oops_run_id');
-$$ LANGUAGE SQL;
-
-CREATE TYPE oops_summary AS (oops_run_id varchar, start_time timestamp with time zone, duration_sec numeric, swift_version varchar);
-
-CREATE OR REPLACE FUNCTION oops_run_summary(varchar) RETURNS SETOF oops_summary AS $$
- SELECT annot_wf_txt.value as oops_run_id, to_timestamp(workflow.start_time) as start_time,
- workflow.duration as duration_sec,workflow.swift_version as swift_version
- FROM annot_wf_txt,workflow
- WHERE annot_wf_txt.id=workflow.id and annot_wf_txt.name='oops_run_id' and annot_wf_txt.value=$1;
-$$ LANGUAGE SQL;
-
-CREATE OR REPLACE FUNCTION oops_process_names(varchar) RETURNS SETOF varchar AS $$
- SELECT DISTINCT(process.name)
- FROM process, annot_wf_txt
- WHERE process.workflow_id=annot_wf_txt.id AND
- annot_wf_txt.name='oops_run_id' AND annot_wf_txt.value=$1;
-$$ LANGUAGE SQL;
-
-CREATE TYPE oops_variable_summary AS (oops_run_id varchar, param_name varchar, value varchar);
-
-CREATE OR REPLACE FUNCTION oops_variable_summary(varchar) RETURNS SETOF oops_variable_summary $$
- SELECT annot_wf_txt.value,ds_usage.param_name,variable.value
- FROM variable,ds_usage,process,annot_wf_txt
- WHERE variable.id=ds_usage.dataset_id and ds_usage.process_id=process.id and process.workflow_id=annot_wf_txt.id;
-$$ LANGUAGE SQL;
-
-CREATE OR REPLACE FUNCTION oops_science_summary(varchar) RETURNS SETOF oops_variable_summary AS $$
- SELECT ds_usage.param_name,variable.value
- FROM variable,ds_usage,process,annot_wf_txt
- WHERE variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND process.workflow_id=annot_wf_txt.id AND
- (ds_usage.param_name='proteinId' OR ds_usage.param_name='targetId' OR ds_usage.param_name='seqFile' OR
- ds_usage.param_name='prot' OR ds_usage.param_name='prepTarFile' OR ds_usage.param_name='nSim') AND
- annot_wf_txt.name='oops_run_id' AND annot_wf_txt.value='h733';
-$$ LANGUAGE SQL;
-
-
-- this GRANT does not work for sqlite; you'll get a syntax error but
-- ignore it, as it is not needed in sqlite
-grant all on
+GRANT ALL ON
dataset,
file,
variable,
@@ -306,4 +258,4 @@
createarray,
createarray_member,
array_range
-to public, operators;
+TO public, operators;
Modified: provenancedb/prov-to-sql.sh
===================================================================
--- provenancedb/prov-to-sql.sh 2010-11-24 17:18:12 UTC (rev 3724)
+++ provenancedb/prov-to-sql.sh 2010-11-25 22:57:39 UTC (rev 3725)
@@ -27,8 +27,6 @@
echo "INSERT INTO execute2 (id, execute_id, start_time, duration, final_state, site) VALUES ('$globalid', '$inv_id', $start_time, $duration, '$endstate', '$site');" >> tmp-e2.sql
done < execute2.global.event
-
-
while read col1 col2 col3 col4 col5 threadst namest lhsst rhsst resultst; do
thread=`echo $threadst | awk 'BEGIN { FS = "=" }; {print $2}'`
name=`echo $namest | awk 'BEGIN { FS = "=" }; {print $2}'`
Modified: provenancedb/swift-prov-import-all-logs
===================================================================
--- provenancedb/swift-prov-import-all-logs 2010-11-24 17:18:12 UTC (rev 3724)
+++ provenancedb/swift-prov-import-all-logs 2010-11-25 22:57:39 UTC (rev 3725)
@@ -50,7 +50,6 @@
fi
export RUNID=$(basename $filename .log)
- # changing tag authority from e-mail benc at ci.uchicago.edu to dnsname ci.uchicago.edu
export WF="tag:ci.uchicago.edu,2008:swiftlogs:execute:${RUNID}:run"
echo "INSERT INTO workflow (id, log_filename, swift_version, import_status) VALUES ('$WF','$filename','$version','$wfstatus');" | $SQLCMD
More information about the Swift-commit
mailing list