[Swift-commit] r4224 - in provenancedb: . apps/oops etc
lgadelha at ci.uchicago.edu
lgadelha at ci.uchicago.edu
Sun Mar 27 16:54:23 CDT 2011
Author: lgadelha
Date: 2011-03-27 16:54:23 -0500 (Sun, 27 Mar 2011)
New Revision: 4224
Removed:
provenancedb/etc/provenance.config.soju
Modified:
provenancedb/apps/oops/oops_extractor.sh
provenancedb/info-to-extrainfo
provenancedb/pql_functions.sql
provenancedb/prov-init.sql
provenancedb/prov-to-sql.sh
provenancedb/swift-prov-import-all-logs
Log:
Update schema to include runtime_info relation.
Minor updates.
Modified: provenancedb/apps/oops/oops_extractor.sh
===================================================================
--- provenancedb/apps/oops/oops_extractor.sh 2011-03-23 19:02:12 UTC (rev 4223)
+++ provenancedb/apps/oops/oops_extractor.sh 2011-03-27 21:54:23 UTC (rev 4224)
@@ -82,7 +82,7 @@
if [ -n "$FILENAME" ]; then
SEQLENGTH=$(awk '{if (NR==2) print $1}' $FILENAME | wc -c)
- echo "insert into annot_ds_txt values ('$DATASET_ID', 'fasta_sequence_length', $SEQLENGTH);" | $SQLCMD
+ echo "insert into annot_ds_num values ('$DATASET_ID', 'fasta_sequence_length', $SEQLENGTH);" | $SQLCMD
fi
# extracts scientific parameters given as output by the workflow in *.log.
# relevant lines:
Deleted: provenancedb/etc/provenance.config.soju
===================================================================
--- provenancedb/etc/provenance.config.soju 2011-03-23 19:02:12 UTC (rev 4223)
+++ provenancedb/etc/provenance.config.soju 2011-03-27 21:54:23 UTC (rev 4224)
@@ -1,10 +0,0 @@
-# file to source that sets variables for the various paths that are
-# presently hardcoded
-
-# this is the path to log repo on benc's laptop
-export LOGREPO=/Users/benc/work/everylog/
-export IDIR=/Users/benc/work/everylog/
-
-export SQLCMD="sqlite3 provdb "
-# export SQLCMD="psql -p 5435 -d provdb "
-
Modified: provenancedb/info-to-extrainfo
===================================================================
--- provenancedb/info-to-extrainfo 2011-03-23 19:02:12 UTC (rev 4223)
+++ provenancedb/info-to-extrainfo 2011-03-27 21:54:23 UTC (rev 4224)
@@ -3,17 +3,17 @@
# must have $EXECUTE2PREFIX set appropriately by the containing script
while read t d id rest; do
- echo processing wrapper file for job $id >&2
- record=$(find ${IDIR} -name ${id}-info)
-
- globalid=$EXECUTE2PREFIX$id
-
- if [ "X$record" != "X" ] && [ -f $record ] ; then
-
- grep '^EXTRAINFO=' $record | sed "s/^EXTRAINFO=\(.*\)$/$globalid \1/"
-
- else
- echo no wrapper log for $id >&2
- fi
-
+ echo processing wrapper file for job $id >&2
+ record=$(find ${IDIR} -name ${id}-info)
+
+ globalid=$EXECUTE2PREFIX$id
+
+ if [ "X$record" != "X" ] && [ -f $record ] ; then
+
+ grep '^EXTRAINFO=' $record | sed "s/^EXTRAINFO=\(.*\)$/$globalid \1/"
+
+ else
+ echo no wrapper log for $id >&2
+ fi
+
done < execute2.event
Modified: provenancedb/pql_functions.sql
===================================================================
--- provenancedb/pql_functions.sql 2011-03-23 19:02:12 UTC (rev 4223)
+++ provenancedb/pql_functions.sql 2011-03-27 21:54:23 UTC (rev 4224)
@@ -3,79 +3,134 @@
-- SQL Functions
--- list_runs_* lists workflows recorded in the database by id or log_filename
-DROP TYPE list_runs_type CASCADE;
+-- lists variations in a parameter's value across workflows, for parameters that are in-memory variables
-CREATE TYPE list_runs_type
-AS (id VARCHAR,
- log_filename VARCHAR,
- swift_version VARCHAR,
- start_time TIMESTAMP WITH TIME ZONE,
- duration NUMERIC,
- final_state VARCHAR);
-CREATE OR REPLACE FUNCTION list_runs()
-RETURNS SETOF list_runs_type AS $$
- SELECT id, log_filename, swift_version, TO_TIMESTAMP(start_time),
- duration, import_status AS final_state
- FROM workflow;
+CREATE OR REPLACE FUNCTION compare_run_by_parameter(param_name VARCHAR)
+RETURNS TABLE (
+ run_id VARCHAR,
+ param_name VARCHAR,
+ value VARCHAR
+)
+AS $$
+ SELECT process.workflow_id, ds_usage.param_name, variable.value
+ FROM variable, ds_usage, process
+ WHERE variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND
+ ds_usage.param_name=$1
+ GROUP BY process.workflow_id, ds_usage.param_name, variable.value;
$$ LANGUAGE SQL;
+CREATE OR REPLACE FUNCTION compare_run_by_parameter(param_name1 VARCHAR, param_name2 VARCHAR)
+RETURNS TABLE (
+ workflow_id VARCHAR,
+ param_name1 VARCHAR,
+ value1 VARCHAR,
+ param_name2 VARCHAR,
+ value2 VARCHAR
+)
+AS $$
+ SELECT *
+ FROM compare_run_by_parameter($1) as t
+ INNER JOIN
+ compare_run_by_parameter($2) as s
+ USING (workflow_id);
+$$ LANGUAGE SQL;
--- lists variations in a parameter's value across workflows
-DROP TYPE param_across_wf_type CASCADE;
-CREATE TYPE param_across_wf_type AS (workflow VARCHAR, parameter VARCHAR, value VARCHAR);
-
-CREATE OR REPLACE FUNCTION param_across_wf(param_name VARCHAR)
-RETURNS SETOF param_across_wf_type AS $$
- SELECT workflow.log_filename,ds_usage.param_name,variable.value
- FROM variable,ds_usage,process,workflow
- WHERE variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND
- process.workflow_id=workflow.id AND ds_usage.param_name=$1
- GROUP BY workflow.log_filename,ds_usage.param_name,variable.value;
+CREATE OR REPLACE FUNCTION compare_run_by_parameter(param_name1 VARCHAR, param_name2 VARCHAR, param_name3 VARCHAR)
+RETURNS TABLE (
+ workflow_id VARCHAR,
+ param_name1 VARCHAR,
+ value1 VARCHAR,
+ param_name2 VARCHAR,
+ value2 VARCHAR,
+ param_name3 VARCHAR,
+ value3 VARCHAR
+)
+AS $$
+ SELECT *
+ FROM compare_run_by_parameter($1, $2) as t
+ INNER JOIN
+ compare_run_by_parameter($3) as s
+ USING (workflow_id);
$$ LANGUAGE SQL;
-drop type annot_across_wf_type cascade;
-create type annot_across_wf_type as (workflow varchar, process varchar, name varchar, value numeric);
-create or replace function annot_across_wf(name varchar)
-returns setof annot_across_wf_type as $$
- select process.workflow_id,process.id,annot_ds_num.name,annot_ds_num.value
- from process,ds_usage,ds_containment,annot_ds_num
- where process.id=ds_usage.process_id and ds_usage.dataset_id=ds_containment.out_id and
- ds_containment.in_id=annot_ds_num.id and annot_ds_num.name=$1
- order by process.workflow_id,process.id;
-$$ language sql;
+CREATE OR REPLACE FUNCTION compare_run_by_annot_num(name VARCHAR)
+RETURNS TABLE (
+ workflow_id VARCHAR,
+ name VARCHAR,
+ value NUMERIC
+)
+AS $$
+ SELECT process.workflow_id, annot_ds_num.name, annot_ds_num.value
+ FROM annot_ds_num,ds_usage,ds_containment,process
+ WHERE annot_ds_num.id=ds_containment.in_id AND ds_containment.out_id=ds_usage.dataset_id AND
+ ds_usage.process_id=process.id AND annot_ds_num.name=$1
+ UNION
+ SELECT process.workflow_id, annot_ds_num.name, annot_ds_num.value
+ FROM process, ds_usage, annot_ds_num
+ WHERE process.id=ds_usage.process_id and ds_usage.dataset_id=annot_ds_num.id and
+ annot_ds_num.name=$1
+ UNION
+ SELECT process.workflow_id, annot_p_num.name, annot_p_num.value
+ FROM process, annot_p_num
+ WHERE process.id=annot_p_num.id and annot_p_num.name=$1
+ UNION
+ SELECT workflow.id as workflow_id, annot_wf_num.name, annot_wf_num.value
+ FROM workflow, annot_wf_num
+ WHERE workflow.id=annot_wf_num.id and annot_wf_num.name=$1
+$$ LANGUAGE SQL;
+CREATE OR REPLACE FUNCTION compare_run_by_annot_txt(name VARCHAR)
+RETURNS TABLE (
+ workflow_id VARCHAR,
+ name VARCHAR,
+ value VARCHAR)
+AS $$
+ SELECT process.workflow_id, annot_ds_txt.name, annot_ds_txt.value
+ FROM process, ds_usage, annot_ds_txt
+ WHERE process.id=ds_usage.process_id and ds_usage.dataset_id=annot_ds_txt.id and
+ annot_ds_txt.name=$1
+ UNION
+ SELECT process.workflow_id, annot_p_txt.name, annot_p_txt.value
+ FROM process, annot_p_txt
+ WHERE process.id=annot_p_txt.id and annot_p_txt.name=$1
+ UNION
+ SELECT workflow.id as workflow_id, annot_wf_txt.name, annot_wf_txt.value
+ FROM workflow, annot_wf_txt
+ WHERE workflow.id=annot_wf_txt.id and annot_wf_txt.name=$1
+$$ LANGUAGE SQL;
+CREATE OR REPLACE FUNCTION compare_run_by_annot_bool(name VARCHAR)
+RETURNS TABLE (
+ workflow_id VARCHAR,
+ name VARCHAR,
+ value BOOLEAN
+)
+AS $$
+ SELECT process.workflow_id, annot_ds_bool.name, annot_ds_bool.value
+ FROM process, ds_usage, annot_ds_bool
+ WHERE process.id=ds_usage.process_id and ds_usage.dataset_id=annot_ds_bool.id and
+ annot_ds_bool.name=$1
+ UNION
+ SELECT process.workflow_id, annot_p_bool.name, annot_p_bool.value
+ FROM process, annot_p_bool
+ WHERE process.id=annot_p_bool.id and annot_p_bool.name=$1
+ UNION
+ SELECT workflow.id as workflow_id, annot_wf_bool.name, annot_wf_bool.value
+ FROM workflow, annot_wf_bool
+ WHERE workflow.id=annot_wf_bool.id and annot_wf_bool.name=$1
+$$ LANGUAGE SQL;
--- lists variations of the values of a set of parameters
-
-DROP TYPE param_across_wf_class_type CASCADE;
-CREATE TYPE param_across_wf_class_type AS (workflow VARCHAR, parameter VARCHAR, value VARCHAR);
-
-CREATE OR REPLACE FUNCTION param_across_wf_class(param_name VARCHAR, wf_class VARACHAR)
-RETURNS SETOF param_across_wf_type AS $$
-
-$$ LANGUAGE plpgsql;
-
-- correlate a parameter with workflow runtime statistics
-DROP TYPE correlate_param_runtime_type CASCADE;
-CREATE TYPE correlate_param_runtime_type
-AS (workflow VARCHAR,
+CREATE OR REPLACE FUNCTION correlate_param_runtime(param_name VARCHAR)
+RETURNS TABLE (
+ workflow VARCHAR,
workflow_starttime TIMESTAMP WITH TIME ZONE,
workflow_duration NUMERIC,
parameter VARCHAR,
- parameter_value VARCHAR);
-
-CREATE OR REPLACE FUNCTION correlate_param_runtime(param_name VARCHAR)
-RETURNS SETOF correlate_param_runtime_type AS $$
- SELECT A.workflow,to_timestamp(B.start_time),B.duration,A.parameter,A.value
- FROM param_across_wf($1) AS A, workflow AS B
- WHERE A.workflow=B.log_filename;
-$$ LANGUAGE SQL;
-
-CREATE OR REPLACE FUNCTION correlate_param_runtime(param_name VARCHAR)
-RETURNS SETOF correlate_param_runtime_type AS $$
+ parameter_value VARCHAR
+)
+AS $$
SELECT workflow.id,to_timestamp(workflow.start_time),workflow.duration,ds_usage.param_name,variable.value
FROM variable,ds_usage,process,workflow
WHERE variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND
@@ -90,73 +145,13 @@
WHERE process.workflow_id=$1;
$$ LANGUAGE SQL;
--- lists distinct processes in a
-
--- OOPS-specific functions
-
-
-
-CREATE OR REPLACE FUNCTION list_oops_runs() RETURNS SETOF VARCHAR AS $$
- SELECT DISTINCT(value) FROM annot_wf_txt WHERE annot_wf_txt.name=('oops_run_id');
-$$ LANGUAGE SQL;
-
-DROP TYPE oops_param_across_wf_type CASCADE;
-CREATE TYPE oops_param_across_wf_type AS (oops_run_id VARCHAR, param_name VARCHAR, variable VARCHAR);
-CREATE OR REPLACE FUNCTION oops_param_across_wf(VARCHAR) RETURNS SETOF oops_param_across_wf_type AS $$
- SELECT annot_wf_txt.value,ds_usage.param_name,variable.value
- FROM variable,ds_usage,process,annot_wf_txt
- WHERE variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND process.workflow_id=annot_wf_txt.id AND
- ds_usage.param_name=$1 AND annot_wf_txt.name='oops_run_id'
- GROUP BY annot_wf_txt.name,annot_wf_txt.value,ds_usage.param_name,variable.value;
-$$ LANGUAGE SQL;
-
-CREATE OR REPLACE FUNCTION list_oops_runs() RETURNS SETOF VARCHAR AS $$
- SELECT DISTINCT(value) FROM annot_wf_txt WHERE annot_wf_txt.name=('oops_run_id');
-$$ LANGUAGE SQL;
-
-DROP TYPE oops_summary CASCADE;
-CREATE TYPE oops_summary AS (oops_run_id VARCHAR, start_time TIMESTAMP WITH TIME ZONE, duration_sec NUMERIC, swift_version VARCHAR);
-CREATE OR REPLACE FUNCTION oops_run_summary(varchar) RETURNS SETOF oops_summary AS $$
- SELECT annot_wf_txt.value as oops_run_id, to_timestamp(workflow.start_time) as start_time,
- workflow.duration as duration_sec,workflow.swift_version as swift_version
- FROM annot_wf_txt,workflow
- WHERE annot_wf_txt.id=workflow.id and annot_wf_txt.name='oops_run_id' and annot_wf_txt.value=$1;
-$$ LANGUAGE SQL;
-
-CREATE OR REPLACE FUNCTION oops_process_names(varchar) RETURNS SETOF varchar AS $$
- SELECT DISTINCT(process.name)
- FROM process, annot_wf_txt
- WHERE process.workflow_id=annot_wf_txt.id AND
- annot_wf_txt.name='oops_run_id' AND annot_wf_txt.value=$1;
-$$ LANGUAGE SQL;
-
-DROP TYPE oops_wf_param_summary CASCADE;
-CREATE TYPE oops_wf_param_summary AS (oops_run_id varchar, param_name varchar, value varchar);
-CREATE OR REPLACE FUNCTION oops_variable_summary() RETURNS SETOF oops_wf_param_summary AS $$
- SELECT annot_wf_txt.value,ds_usage.param_name,variable.value
- FROM variable,ds_usage,process,annot_wf_txt
- WHERE variable.id=ds_usage.dataset_id and ds_usage.process_id=process.id and process.workflow_id=annot_wf_txt.id;
-$$ LANGUAGE SQL;
-
-
-DROP TYPE oops_param_summary CASCADE;
-CREATE TYPE oops_param_summary AS (param_name varchar, value varchar);
-CREATE OR REPLACE FUNCTION oops_science_summary(varchar) RETURNS SETOF oops_param_summary AS $$
- SELECT ds_usage.param_name,variable.value
- FROM variable,ds_usage,process,annot_wf_txt
- WHERE variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND process.workflow_id=annot_wf_txt.id AND
- (ds_usage.param_name='proteinId' OR ds_usage.param_name='targetId' OR ds_usage.param_name='seqFile' OR
- ds_usage.param_name='prot' OR ds_usage.param_name='prepTarFile' OR ds_usage.param_name='nSim') AND
- annot_wf_txt.name='oops_run_id' AND annot_wf_txt.value=$1;
-$$ LANGUAGE SQL;
-
-- recursive query to find ancestor entities in a provenance graph
CREATE OR REPLACE FUNCTION ancestors(varchar) RETURNS SETOF varchar AS $$
WITH RECURSIVE anc(ancestor,descendant) AS
(
SELECT parent AS ancestor, child AS descendant FROM parent_of WHERE child=$1
- UNION
+ UNION
SELECT parent_of.parent AS ancestor, anc.descendant AS descendant
FROM anc,parent_of
WHERE anc.ancestor=parent_of.child
@@ -164,5 +159,77 @@
SELECT ancestor FROM anc
$$ LANGUAGE SQL;
+-- compare_run(<list of param_names, annotation keys>): compares runs by parameter values and annotations
-
+CREATE OR REPLACE FUNCTION compare_run(VARIADIC args VARCHAR[])
+RETURNS SETOF RECORD AS $$
+DECLARE
+ i INTEGER;
+ q VARCHAR;
+ selectq VARCHAR;
+ fromq VARCHAR;
+ property VARCHAR;
+ property_type VARCHAR;
+ function_name VARCHAR;
+BEGIN
+ selectq := 'SELECT *';
+ FOR i IN array_lower(args, 1)..array_upper(args, 1) LOOP
+ property_type := split_part(args[i], '=', 1);
+ property := split_part(args[i], '=', 2);
+ CASE property_type
+ WHEN 'param_name' THEN
+ function_name := 'compare_run_by_parameter';
+ WHEN 'annot_num' THEN
+ function_name := 'compare_run_by_annot_num';
+ WHEN 'annot_txt' THEN
+ function_name := 'compare_run_by_annot_txt';
+ WHEN 'annot_bool' THEN
+ function_name := 'compare_run_by_annot_bool';
+ END CASE;
+ IF i = 1 THEN
+ fromq := function_name || '(''' || property || ''') as t' || i;
+ ELSE
+ fromq := fromq || ' INNER JOIN ' || function_name || '(''' || property || ''') as t' || i || ' USING (workflow_id)';
+ END IF;
+ END LOOP;
+ q := selectq || ' FROM ' || fromq;
+ RETURN QUERY EXECUTE q;
+END;
+$$ LANGUAGE plpgsql;
+
+
+CREATE OR REPLACE FUNCTION compare_run_print(VARIADIC args VARCHAR[])
+RETURNS VARCHAR AS $$
+DECLARE
+ i INTEGER;
+ q VARCHAR;
+ selectq VARCHAR;
+ fromq VARCHAR;
+ property VARCHAR;
+ property_type VARCHAR;
+ function_name VARCHAR;
+BEGIN
+ selectq := 'SELECT *';
+ FOR i IN array_lower(args, 1)..array_upper(args, 1) LOOP
+ property_type := split_part(args[i], '=', 1);
+ property := split_part(args[i], '=', 2);
+ CASE property_type
+ WHEN 'param_name' THEN
+ function_name := 'compare_run_by_parameter';
+ WHEN 'annot_num' THEN
+ function_name := 'compare_run_by_annot_num';
+ WHEN 'annot_txt' THEN
+ function_name := 'compare_run_by_annot_txt';
+ WHEN 'annot_bool' THEN
+ function_name := 'compare_run_by_annot_bool';
+ END CASE;
+ IF i = 1 THEN
+ fromq := function_name || '(''' || property || ''') as t' || i;
+ ELSE
+ fromq := fromq || ' INNER JOIN ' || function_name || '(''' || property || ''') as t' || i || ' USING (workflow_id)';
+ END IF;
+ END LOOP;
+ q := selectq || ' FROM ' || fromq;
+ RETURN q;
+END;
+$$ LANGUAGE plpgsql;
Modified: provenancedb/prov-init.sql
===================================================================
--- provenancedb/prov-init.sql 2011-03-23 19:02:12 UTC (rev 4223)
+++ provenancedb/prov-init.sql 2011-03-27 21:54:23 UTC (rev 4224)
@@ -1,35 +1,30 @@
-- this is the schema definition used for the main relational provenance
-- implementation (in both sqlite3 and postgres)
+drop table run cascade;
+drop table process cascade;
+drop table app_invocation cascade;
+drop table app_execution cascade;
+drop table runtime_info cascade;
drop table dataset cascade;
drop table file cascade;
-drop table variable cascade;
-drop table ds_containment cascade;
-drop table process cascade;
-drop table execute cascade;
-drop table execute2 cascade;
-drop table workflow cascade;
-drop table ds_usage cascade;
-drop table annot_ds_num cascade;
-drop table annot_ds_txt cascade;
-drop table annot_ds_bool cascade;
-drop table annot_p_num cascade;
-drop table annot_p_txt cascade;
-drop table annot_p_bool cascade;
-drop table annot_wf_num cascade;
-drop table annot_wf_txt cascade;
-drop table annot_wf_bool cascade;
--- drop table extrainfo cascade;
-drop table createarray cascade;
-drop table createarray_member cascade;
-drop table array_range cascade;
+drop table in_memory cascade;
+drop table dataset_containment cascade;
+drop table dataset_consumption cascade;
+drop table dataset_production cascade;
+drop table annot_dataset_numeric cascade;
+drop table annot_dataset_text cascade;
+drop table annot_process_numeric cascade;
+drop table annot_process_text cascade;
+drop table annot_run_numeric cascade;
+drop table annot_run_text cascade;
--- workflow stores some information about each workflow log that has
+-- run stores information about each script run log that has
-- been seen by the importer: the log filename, swift version and import
-- status.
--- Might be interesting to store xml translation of the swiftscript code
--- here for prospective provenance/versioning
-create table workflow
+-- Might be interesting to store xml translation of the Swift script
+-- here for prospective provenance and versioning.
+create table run
(id varchar(256) primary key,
log_filename varchar(2048),
swift_version varchar(16),
@@ -38,68 +33,24 @@
duration numeric
);
--- workflow_run stores the start time and duration for each workflow
--- that has been successfully imported.
---CREATE TABLE wf_run
--- (id varchar(256) PRIMARY KEY REFERENCES workflow (id) ON DELETE CASCADE,
--- start_time numeric,
--- duration numeric
--- );
-
--- dataset stores all dataset identifiers.
-create table dataset
- (id varchar(256) primary key
- );
-
--- file stores the filename mapped to each dataset.
-create table file
- ( id varchar(256) primary key references dataset (id) on delete cascade,
- filename varchar(2048)
- );
-
--- dataset_values stores the value for each dataset which is known to have
--- a value (which is all assigned primitive types). No attempt is made here
--- to expose that value as an SQL type other than a string, and so (for
--- example) SQL numerical operations should not be expected to work, even
--- though the user knows that a particular dataset stores a numeric value.
-create table variable
- ( id varchar(256) primary key references dataset (id) on delete cascade,
- value varchar(2048)
- );
-
--- dataset_containment stores the containment hierarchy between
--- container datasets (arrays and structs) and their contents.
--- out_id contains in_id
--- TODO this should perhaps be replaced with a more OPM-like model of
--- constructors and accessors, rather than, or in addition to,
--- a containment hierarchy. The relationship (such as array index or
--- structure member name) should also be stored in this table.
-create table ds_containment
- ( out_id varchar(256) references dataset (id) on delete cascade,
- in_id varchar(256) references dataset (id) on delete cascade,
- primary key (out_id,in_id)
- );
-
-- process gives information about each process (in the OPM sense)
-- it is augmented by information in other tables
-- specifies the type of process. for any type, it
-- must be the case that the specific type table
-- has an entry for this process.
--- Having this type here seems poor normalisation, though?
-- process types: internal, rootthread, execute, function, compound, scope, operator
--- maybe create a table for each type?
create table process
(id varchar(256) primary key,
type varchar(16),
name varchar(256), -- in the case of an execute this refers to the transformation name in tc.data
- workflow_id varchar(256) references workflow (id) on delete cascade -- normalize: workflow_id of sub-procedure determined
+ run_id varchar(256) references run (id) on delete cascade -- normalize: workflow_id of sub-procedure determined
-- by compound procedure
);
-- this gives information about each execute.
-- each execute is identified by a unique URI. other information from
-- swift logs is also stored here. an execute is an OPM process.
-create table execute
+create table app_invocation
(id varchar(256) primary key references process (id) on delete cascade,
procedure_name varchar(256), -- name of the app procedure that invokes the transformation
start_time numeric,
@@ -111,156 +62,159 @@
-- this gives information about each execute2, which is an attempt to
-- perform an execution. the execute2 id is tied to per-execution-attempt
-- information such as wrapper logs
-create table execute2
+create table app_execution
(id varchar(256) primary key,
- execute_id varchar(256) references execute (id) on delete cascade,
+ app_invocation_id varchar(256) references app_invocation (id) on delete cascade,
start_time numeric,
duration numeric,
final_state varchar(16),
site varchar(256)
);
+-- app execution runtime info extracted from the /proc filesystem (assumes the app
+-- executed on a Linux host)
+create table runtime_info
+ ( app_execution_id varchar(256) references app_execution (id) on delete cascade,
+ event_timestamp numeric,
+ cpu_usage numeric,
+ max_phys_mem numeric,
+ max_virtual_mem numeric,
+ io_read_bytes numeric,
+ io_write_bytes numeric,
+ primary key (app_execution_id, event_timestamp)
+ );
+
+-- dataset stores all dataset identifiers.
+create table dataset
+ (id varchar(256) primary key
+ );
+
+-- file stores the filename mapped to each dataset.
+create table file
+ ( id varchar(256) primary key references dataset (id) on delete cascade,
+ filename varchar(2048)
+ );
+
+-- dataset_values stores the value for each dataset which is known to have
+-- a value (which is all assigned primitive types). No attempt is made here
+-- to expose that value as an SQL type other than a string, and so (for
+-- example) SQL numerical operations should not be expected to work, even
+-- though the user knows that a particular dataset stores a numeric value.
+create table in_memory
+ ( id varchar(256) primary key references dataset (id) on delete cascade,
+ value varchar(2048)
+ );
+
+-- dataset_containment stores the containment hierarchy between
+-- container datasets (arrays and structs) and their contents.
+-- out_id contains in_id
+-- TODO this should perhaps be replaced with a more OPM-like model of
+-- constructors and accessors, rather than, or in addition to,
+-- a containment hierarchy. The relationship (such as array index or
+-- structure member name) should also be stored in this table.
+create table dataset_containment
+ ( out_id varchar(256) references dataset (id) on delete cascade,
+ in_id varchar(256) references dataset (id) on delete cascade,
+ primary key (out_id,in_id)
+ );
+
+
-- dataset_usage records usage relationships between processes and datasets;
-- in SwiftScript terms, the input and output parameters for each
-- application procedure invocation; in OPM terms, the artificts which are
-- input to and output from each process that is a Swift execution
-create table ds_usage
+create table dataset_consumption
(process_id varchar(256) references process(id) on delete cascade,
- direction char(1), -- I or O for input or output
dataset_id varchar(256) references dataset(id) on delete cascade,
param_name varchar(256), -- the name of the parameter in this execute that
-- this dataset was bound to. sometimes this must
-- be contrived (for example, in positional varargs)
- primary key (process_id,direction,dataset_id,param_name)
+ primary key (process_id,dataset_id,param_name)
);
+create table dataset_production
+ (process_id varchar(256) references process(id) on delete cascade,
+ dataset_id varchar(256) references dataset(id) on delete cascade,
+ param_name varchar(256), -- the name of the parameter in this execute that
+ -- this dataset was bound to. sometimes this must
+ -- be contrived (for example, in positional varargs)
+ primary key (process_id,dataset_id,param_name)
+ );
+
+
-- annotations
-create table annot_ds_num
- ( id varchar(256) references dataset (id) on delete cascade,
+create table annot_dataset_numeric
+ ( dataset_id varchar(256) references dataset (id) on delete cascade,
name varchar(256),
value numeric,
- primary key (id, name)
+ primary key (dataset_id, name)
);
-create table annot_ds_txt
- ( id varchar(256) references dataset (id) on delete cascade,
+create table annot_dataset_text
+ ( dataset_id varchar(256) references dataset (id) on delete cascade,
name varchar(256),
value varchar(2048),
- primary key (id, name)
+ primary key (dataset_id, name)
);
-create table annot_ds_bool
- ( id varchar(256) references dataset (id) on delete cascade,
+create table annot_process_numeric
+ ( process_id varchar(256) references process (id) on delete cascade,
name varchar(256),
- value boolean,
- primary key (id, name)
- );
-
-create table annot_p_num
- ( id varchar(256) references process (id) on delete cascade,
- name varchar(256),
value numeric,
- primary key (id, name)
+ primary key (process_id, name)
);
-create table annot_p_txt
- ( id varchar(256) references process (id) on delete cascade,
+create table annot_process_text
+ ( process_id varchar(256) references process (id) on delete cascade,
name varchar(256),
value varchar(2048),
- primary key (id, name)
+ primary key (process_id, name)
);
-create table annot_p_bool
- ( id varchar(256) references process (id) on delete cascade,
+create table annot_run_numeric
+ ( run_id varchar(256) references run (id) on delete cascade,
name varchar(256),
- value boolean,
- primary key (id, name)
- );
-
-create table annot_wf_num
- ( id varchar(256) references workflow (id) on delete cascade,
- name varchar(256),
value numeric,
- primary key (id, name)
+ primary key (run_id, name)
);
-create table annot_wf_txt
- ( id varchar(256) references workflow (id) on delete cascade,
+create table annot_run_text
+ ( run_id varchar(256) references run (id) on delete cascade,
name varchar(256),
value varchar(2048),
- primary key (id, name)
+ primary key (run_id, name)
);
-create table annot_wf_bool
- ( id varchar(256) references workflow (id) on delete cascade,
- name varchar(2048),
- value boolean,
- primary key (id, name)
- );
-
-create view parent_of as
- select process_id as parent,dataset_id as child from ds_usage where direction='O'
+create view provenance_graph as
+ select process_id as parent,dataset_id as child from dataset_production
union
- select dataset_id as parent,process_id as child from ds_usage where direction='I'
+ select dataset_id as parent,process_id as child from dataset_consumption
union
- select out_id as parent,in_id as child from ds_containment;
+ select out_id as parent,in_id as child from dataset_containment;
--- extrainfo stores lines generated by the SWIFT_EXTRA_INFO feature
--- extrainfo will be processes into annotation tables
--- CREATE TABLE extrainfo
--- ( id varchar(256) PRIMARY KEY REFERENCES execute (id) ON DELETE CASCADE,
--- extrainfo varchar(2048)
--- );
--- The above dataset_* tables are the original containment representation
--- of datasets.
--- Now here are some tables to represent complex datasets differently, in
--- terms of accessors and constructors.
--- It is unclear which is the better representation.
+CREATE VIEW run_date AS
+ SELECT id, log_filename, swift_version, import_status,
+ to_timestamp(start_time) as start_time, duration
+ FROM run;
-create table createarray
- ( id varchar(256) primary key
- );
+CREATE VIEW annot_text AS
+ SELECT *
+ FROM annot_run_text
+ UNION
+ SELECT *
+ FROM annot_dataset_text
+ UNION
+ SELECT *
+ FROM annot_process_text;
-create table createarray_member
- ( array_id varchar(256) references createarray (id) on delete cascade,
- ix varchar(256),
- member_id varchar(256),
- primary key (array_id, ix)
- );
+CREATE VIEW annot_numeric AS
+ SELECT *
+ FROM annot_run_numeric
+ UNION
+ SELECT *
+ FROM annot_dataset_numeric
+ UNION
+ SELECT *
+ FROM annot_process_numeric;
--- TODO step
-create table array_range
- ( array_id varchar(256) references createarray (id) on delete cascade,
- from_id varchar(256),
- to_id varchar(256),
- step_id varchar(256), -- nullable, if step is unspecified
- primary key (array_id,from_id,to_id,step_id)
- );
-
--- this GRANT does not work for sqlite; you'll get a syntax error but
--- ignore it, as it is not needed in sqlite
-grant all on
- dataset,
- file,
- variable,
- ds_containment,
- process,
- execute,
- execute2,
- workflow,
- ds_usage,
- annot_ds_num,
- annot_ds_txt,
- annot_ds_bool,
- annot_p_num,
- annot_p_txt,
- annot_p_bool,
- annot_wf_num,
- annot_wf_txt,
- annot_wf_bool,
--- extrainfo,
- createarray,
- createarray_member,
- array_range
-to public, operators;
Modified: provenancedb/prov-to-sql.sh
===================================================================
--- provenancedb/prov-to-sql.sh 2011-03-23 19:02:12 UTC (rev 4223)
+++ provenancedb/prov-to-sql.sh 2011-03-27 21:54:23 UTC (rev 4224)
@@ -8,22 +8,23 @@
export WF="${RUNID}"
echo Generating SQL for $RUNID
+rm -f tmp-u.sql tmp-ds.sql tmp-p.sql tmp-e.sql tmp-e2.sql tmp-dsu.sql tmp-dsc.sql tmp-f.sql tmp-v.sql tmp-import.sql import.sql extrainfo.sql fields.txt
# this gives a distinction between the root process for a workflow and the
# workflow itself. perhaps better to model the workflow as a process
-echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('${WFID}0', 'rootthread', '$RUNID', '$WF');" > tmp-p.sql
+echo "INSERT INTO process (id, type, name, run_id) VALUES ('${WFID}0', 'rootthread', '$RUNID', '$WF');" > tmp-p.sql
while read time duration thread localthread endstate tr_name scratch; do
- echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$thread', 'execute', '$tr_name', '$WF');" >> tmp-p.sql
- echo "INSERT INTO execute (id, start_time, duration, final_state, scratch) VALUES ('$thread', $time, $duration, '$endstate', '$scratch');" > tmp-e.sql
+ echo "INSERT INTO process (id, type, name, run_id) VALUES ('$thread', 'execute', '$tr_name', '$WF');" >> tmp-p.sql
+ echo "INSERT INTO app_invocation (id, start_time, duration, final_state, scratch) VALUES ('$thread', $time, $duration, '$endstate', '$scratch');" >> tmp-e.sql
done < execute.global.event
while read start_time duration globalid id endstate thread site scratch; do
# cut off the last component of the thread, so that we end up at the
# parent thread id which should correspond with the execute-level ID
inv_id="$WFID$(echo $thread | sed 's/-[^-]*$//')"
- echo "INSERT INTO execute2 (id, execute_id, start_time, duration, final_state, site) VALUES ('$globalid', '$inv_id', $start_time, $duration, '$endstate', '$site');" > tmp-e2.sql
+ echo "INSERT INTO app_execution (id, app_invocation_id, start_time, duration, final_state, site) VALUES ('$globalid', '$inv_id', $start_time, $duration, '$endstate', '$site');" >> tmp-e2.sql
done < execute2.global.event
while read col1 col2 col3 col4 col5 thread name lhs rhs result; do
@@ -41,13 +42,13 @@
result=$(echo $result | sed -e 's/tag:benc@ci.uchicago.edu,2008:swift://g')
fi
- echo "INSERT INTO dataset (id) VALUES ('$lhs');" > tmp-ds.sql
+ echo "INSERT INTO dataset (id) VALUES ('$lhs');" >> tmp-ds.sql
echo "INSERT INTO dataset (id) VALUES ('$rhs');" >> tmp-ds.sql
echo "INSERT INTO dataset (id) VALUES ('$result');" >> tmp-ds.sql
- echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$operatorid', 'operator', '$name', '$WF');" >> tmp-p.sql
- echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$operatorid', 'I', '$lhs', 'lhs');" > tmp-dsu.sql
- echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$operatorid', 'I', '$rhs', 'rhs');" >> tmp-dsu.sql
- echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$operatorid', 'O', '$result', 'result');" >> tmp-dsu.sql
+ echo "INSERT INTO process (id, type, name, run_id) VALUES ('$operatorid', 'operator', '$name', '$WF');" >> tmp-p.sql
+ echo "INSERT INTO dataset_consumption (process_id, dataset_id, param_name) VALUES ('$operatorid', '$lhs', 'lhs');" >> tmp-dsu.sql
+ echo "INSERT INTO dataset_consumption (process_id, dataset_id, param_name) VALUES ('$operatorid', '$rhs', 'rhs');" >> tmp-dsu.sql
+ echo "INSERT INTO dataset_production (process_id, dataset_id, param_name) VALUES ('$operatorid', '$result', 'result');" >> tmp-dsu.sql
done < operators.txt
while read id name output; do
@@ -55,8 +56,8 @@
output=$(echo $output | sed -e 's/tag:benc@ci.uchicago.edu,2008:swift://g')
fi
echo "INSERT INTO dataset (id) VALUES ('$output');" >> tmp-ds.sql
- echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$id', 'function', '$name', '$WF');" >> tmp-p.sql
- echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$id', 'O', '$output', 'result');" >> tmp-dsu.sql
+ echo "INSERT INTO process (id, type, name, run_id) VALUES ('$id', 'function', '$name', '$WF');" >> tmp-p.sql
+ echo "INSERT INTO dataset_production (process_id, dataset_id, param_name) VALUES ('$id', '$output', 'result');" >> tmp-dsu.sql
done < functions.txt
while read id value; do
@@ -67,12 +68,12 @@
fi
echo "INSERT INTO dataset (id) VALUES ('$value');" >> tmp-ds.sql
- echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$id', 'I', '$value', 'undefined');" >> tmp-dsu.sql
+ echo "INSERT INTO dataset_consumption (process_id, dataset_id, param_name) VALUES ('$id', '$value', 'undefined');" >> tmp-dsu.sql
done < function-inputs.txt
while read thread appname; do
- echo "UPDATE execute SET procedure_name='$appname' WHERE id='$thread';" >> tmp-e.sql
+ echo "UPDATE app_invocation SET procedure_name='$appname' WHERE id='$thread';" >> tmp-e.sql
done < invocation-procedure-names.txt
while read outer inner; do
@@ -84,7 +85,7 @@
echo "INSERT INTO dataset (id) VALUES ('$outer');" >> tmp-ds.sql
echo "INSERT INTO dataset (id) VALUES ('$inner');" >> tmp-ds.sql
- echo "INSERT INTO ds_containment (out_id, in_id) VALUES ('$outer', '$inner');" > tmp-dsc.sql
+ echo "INSERT INTO dataset_containment (out_id, in_id) VALUES ('$outer', '$inner');" >> tmp-dsc.sql
done < tie-containers.txt
while read dataset filename; do
@@ -94,7 +95,7 @@
fi
echo "INSERT INTO dataset (id) VALUES ('$dataset');" >> tmp-ds.sql
- echo "INSERT INTO file (id, filename) VALUES ('$dataset', '$filename');" > tmp-f.sql
+ echo "INSERT INTO file (id, filename) VALUES ('$dataset', '$filename');" >> tmp-f.sql
done < dataset-filenames.txt
while read dataset value; do
@@ -104,77 +105,52 @@
fi
echo "INSERT INTO dataset (id) VALUES ('$dataset');" >> tmp-ds.sql
- echo "INSERT INTO variable (id, value) VALUES ('$dataset', '$value');" > tmp-v.sql
+ echo "INSERT INTO in_memory (id, value) VALUES ('$dataset', '$value');" >> tmp-v.sql
done < dataset-values.txt
while read start duration wfid rest; do
- echo "UPDATE workflow SET start_time=$start WHERE id='$WF';" > tmp-u.sql
- echo "UPDATE workflow SET duration=$duration WHERE id='$WF';" >> tmp-u.sql
+ echo "UPDATE run SET start_time=$start WHERE id='$WF';" >> tmp-u.sql
+ echo "UPDATE run SET duration=$duration WHERE id='$WF';" >> tmp-u.sql
done < workflow.event
-
-while read id extrainfo; do
- echo $extrainfo | awk -F ";" '{ for (i = 1; i <= NF; i++)
- print $i
- }' | awk -F "=" '{ print $1 " " $2 }' | awk -F ":" '{ print $1 " " $2 }' > fields.txt
- while read name type value; do
- if [ "$type" = "num" ]; then
- echo "INSERT INTO annot_p_num (id, name, value) VALUES ('$id', '$name', $value);" > tmp-import.sql
- fi
- if [ "$type" = "txt" ]; then
- echo "INSERT INTO annot_p_txt (id, name, value) VALUES ('$id', '$name', '$value');" >> tmp-import.sql
- fi
- if [ "$type" = "bool" ]; then
- echo "INSERT INTO annot_p_bool (id, name, value) VALUES ('$id', '$name', $value);" >> tmp-import.sql
- fi
- done < fields.txt
-done < extrainfo.txt
-
# TODO this could merge with other naming tables
while read start duration thread final_state procname ; do
if [ "$duration" != "last-event-line" ]; then
compoundid=$WFID$thread
- echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$compoundid', 'compound', '$procname', '$WF');" >> tmp-p.sql
+ echo "INSERT INTO process (id, type, name, run_id) VALUES ('$compoundid', 'compound', '$procname', '$WF');" >> tmp-p.sql
fi
done < compound.event
while read start duration thread final_state procname ; do
if [ "$duration" != "last-event-line" ]; then
fqid=$WFID$thread
- echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$fqid', 'internal', '$procname', '$WF');" >> tmp-p.sql
+ echo "INSERT INTO process (id, type, name, run_id) VALUES ('$fqid', 'internal', '$procname', '$WF');" >> tmp-p.sql
fi
done < internalproc.event
while read t ; do
thread="${WFID}$t"
- echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$thread', 'scope', 'scope', '$WF');" >> tmp-p.sql
+ echo "INSERT INTO process (id, type, name, run_id) VALUES ('$thread', 'scope', 'scope', '$WF');" >> tmp-p.sql
done < scopes.txt
while read thread direction dataset variable rest; do
- if [ "$direction" == "input" ] ; then
- dir=I
- else
- dir=O
- fi
if [ $version -le 3726 ]; then
dataset=$(echo $dataset | sed -e 's/tag:benc@ci.uchicago.edu,2008:swift://g')
fi
+ if [ "$direction" == "input" ] ; then
+ table=dataset_consumption
+ else
+ table=dataset_production
+ fi
+
echo "INSERT INTO dataset (id) VALUES ('$dataset');" >> tmp-ds.sql
- echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$thread', '$dir', '$dataset', '$variable');" >> tmp-dsu.sql
+ echo "INSERT INTO $table (process_id, dataset_id, param_name) VALUES ('$thread', '$dataset', '$variable');" >> tmp-dsu.sql
done < tie-data-invocs.txt
-while read id ; do
- echo "INSERT INTO createarray (array_id) VALUES ('$id');" >> tmp-import.sql
-done < createarray.txt
-
-while read arrayid index memberid ; do
- echo "INSERT INTO createarray_member (array_id, ix, member_id) VALUES ('$arrayid', '$index', '$memberid');" >> tmp-import.sql
-done < createarray-members.txt
-
-cat tmp-ds.sql | sort | uniq > import.sql
+cat tmp-ds.sql | sort | uniq >> import.sql
cat tmp-f.sql | sort | uniq >> import.sql
cat tmp-v.sql | sort | uniq >> import.sql
cat tmp-p.sql | sort | uniq >> import.sql
@@ -187,7 +163,23 @@
echo Sending SQL to DB
$SQLCMD < import.sql
-rm -f tmp-u.sql tmp-ds.sql tmp-p.sql tmp-e.sql tmp-e2.sql tmp-dsu.sql tmp-dsc.sql tmp-f.sql tmp-v.sql tmp-import.sql import.sql fields.txt
+while read execute2_id extrainfo; do
+ echo $extrainfo | awk -F ";" '{ for (i = 1; i <= NF; i++)
+ print $i
+ }' | awk -F "=" '{ print $1 " " $2 }' | awk -F ":" '{ print $1 " " $2 }' > fields.txt
+ id=$($SQLCMD --tuples-only -c "select app_invocation_id from app_execution where id='$execute2_id';" | awk '{print $1}')
+ while read name type value; do
+ if [ "$type" = "num" ]; then
+ echo "INSERT INTO annot_process_numeric (id, name, value) VALUES ('$id', '$name', $value);" >> extrainfo.sql
+ fi
+ if [ "$type" = "txt" ]; then
+ echo "INSERT INTO annot_process_text (id, name, value) VALUES ('$id', '$name', '$value');" >> extrainfo.sql
+ fi
+ done < fields.txt
+done < extrainfo.txt
+
+$SQLCMD < extrainfo.sql
+
echo Finished sending SQL to DB
Modified: provenancedb/swift-prov-import-all-logs
===================================================================
--- provenancedb/swift-prov-import-all-logs 2011-03-23 19:02:12 UTC (rev 4223)
+++ provenancedb/swift-prov-import-all-logs 2011-03-27 21:54:23 UTC (rev 4224)
@@ -38,7 +38,7 @@
echo -n "Log: $filename ... "
# TODO: does not work in sqlite
- EXISTING=$($SQLCMD --tuples-only -c "select count(*) from workflow where log_filename='$filename';")
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from run where log_filename='$filename';")
if [ "$EXISTING" -eq "0" ]; then
PROV_ENABLED=$(grep provenanceid $filename | wc -l)
@@ -55,7 +55,7 @@
export WF="${RUNID}"
- echo "INSERT INTO workflow (id, log_filename, swift_version, import_status) VALUES ('$WF','$filename','$version','$wfstatus');" | $SQLCMD
+ echo "INSERT INTO run (id, log_filename, swift_version, import_status) VALUES ('$WF','$filename','$version','$wfstatus');" | $SQLCMD
echo version $version in log file $filename
echo ============= will import =============
More information about the Swift-commit
mailing list