[Swift-commit] r3706 - provenancedb
noreply@svn.ci.uchicago.edu
Wed Nov 10 12:18:39 CST 2010
Author: lgadelha
Date: 2010-11-10 12:18:39 -0600 (Wed, 10 Nov 2010)
New Revision: 3706
Modified:
provenancedb/README
provenancedb/prepare-provenance-chart
provenancedb/prov-init.sql
provenancedb/prov-sql-generate-transitive-closures.sql
provenancedb/prov-to-sql.sh
provenancedb/swift-prov-import-all-logs
Log:
New database schema, and corrections to import scripts.
Modified: provenancedb/README
===================================================================
--- provenancedb/README 2010-11-08 20:30:09 UTC (rev 3705)
+++ provenancedb/README 2010-11-10 18:18:39 UTC (rev 3706)
@@ -1,12 +1,9 @@
-provenance database prototype
+Edit the file etc/provenance.config to define the local configuration. Set the variable LOGREPO to the directory containing the log files to import. For instance:
-At the moment, you have pretty much no chance of getting this working.
+export LOGREPO=~/swift-logs/
-Working notes are in docbook form in provenance.xml
+The command used to connect to the database should be defined in the variable SQLCMD. For example, to connect to CI's PostgreSQL database:
-To import, run:
+export SQLCMD="psql -h db.ci.uchicago.edu -U provdb provdb"
- ./import-all
-
-with no parameters.
-
+The script ./swift-prov-import-all-logs imports provenance information from the log files in $LOGREPO into the database. The command-line option rebuild initializes the database before importing. The file prov-init.sql contains the database schema.
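
For reference, etc/provenance.config is just a shell fragment sourced by the import scripts; a minimal sketch (both values are illustrative, not shipped defaults):

  # directory holding the Swift log files to be imported
  export LOGREPO=~/swift-logs/
  # command used to reach the provenance database (here: CI PostgreSQL)
  export SQLCMD="psql -h db.ci.uchicago.edu -U provdb provdb"
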
Modified: provenancedb/prepare-provenance-chart
===================================================================
--- provenancedb/prepare-provenance-chart 2010-11-08 20:30:09 UTC (rev 3705)
+++ provenancedb/prepare-provenance-chart 2010-11-10 18:18:39 UTC (rev 3706)
@@ -8,8 +8,8 @@
export RUNID=$(basename $1 .log)
-export WFID="tag:benc@ci.uchicago.edu,2008:swiftlogs:execute:${RUNID}:"
-export EXECUTE2PREFIX="tag:benc@ci.uchicago.edu,2008:swiftlogs:execute2:${RUNID}:"
+export WFID="tag:ci.uchicago.edu,2008:swiftlogs:execute:${RUNID}:"
+export EXECUTE2PREFIX="tag:ci.uchicago.edu,2008:swiftlogs:execute2:${RUNID}:"
# will output log information about datasets from a log file passed as $1
@@ -30,8 +30,8 @@
cat $1 | grep ' OPERATOR ' | sed 's/^.*thread=\([^ ]*\) operator="\([^ ]*\)" lhs=\([^ ]*\) rhs=\([^ ]*\) result=\([^ ]*\).*$/\1 \2 \3 \4 \5/' > operators.txt
-# 2009-03-19 19:15:35,244+0100 INFO vdl:arguments FUNCTION id=88000-0-4-4 name="filename" result=tag:benc@ci.uchicago.edu,2008:swift:dataset:20090319-1915-xj8flg13:720000000060
-# 2009-03-19 19:15:35,246+0100 INFO vdl:arguments FUNCTIONPARAMETER id=88001-0-4-4 input=tag:benc@ci.uchicago.edu,2008:swift:dataset:20090319-1915-xj8flg13:720000000058
+# 2009-03-19 19:15:35,244+0100 INFO vdl:arguments FUNCTION id=88000-0-4-4 name="filename" result=tag:ci.uchicago.edu,2008:swift:dataset:20090319-1915-xj8flg13:720000000060
+# 2009-03-19 19:15:35,246+0100 INFO vdl:arguments FUNCTIONPARAMETER id=88001-0-4-4 input=tag:ci.uchicago.edu,2008:swift:dataset:20090319-1915-xj8flg13:720000000058
cat $1 | grep ' FUNCTION ' | sed "s/^.*id=\([^ ]*\) name=\([^ ]*\) result=\([^ ]*\).*\$/$WFID\1 \2 \3/" > functions.txt
# the IDs in functions.txt should be unique...
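
To check what these extraction passes produce, prepare-provenance-chart can be run by hand on a single log (the log name below is hypothetical); it writes operators.txt, functions.txt and the other intermediate files into the current directory:

  # run the extraction passes over one Swift log (illustrative path)
  ./prepare-provenance-chart $LOGREPO/psim-20101110-1218-abcdefg.log
  # inspect a couple of the generated intermediate files
  head operators.txt functions.txt
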
Modified: provenancedb/prov-init.sql
===================================================================
--- provenancedb/prov-init.sql 2010-11-08 20:30:09 UTC (rev 3705)
+++ provenancedb/prov-init.sql 2010-11-10 18:18:39 UTC (rev 3706)
@@ -1,253 +1,259 @@
-
-- this is the schema definition used for the main relational provenance
-- implementation (in both sqlite3 and postgres)
-
-DROP TABLE processes;
-DROP TABLE executes;
-DROP TABLE execute2s;
-DROP TABLE dataset_usage;
-DROP TABLE invocation_procedure_names;
-DROP TABLE dataset_containment;
-DROP TABLE dataset_filenames;
-DROP TABLE processes_in_workflows;
-DROP TABLE dataset_values;
-DROP TABLE known_workflows;
-DROP TABLE workflow_events;
+
+-- drop in dependency order (children before the tables they reference),
+-- so that the foreign keys do not block the drops
+DROP TABLE annot_ds_num;
+DROP TABLE annot_ds_txt;
+DROP TABLE annot_ds_bool;
+DROP TABLE annot_p_num;
+DROP TABLE annot_p_txt;
+DROP TABLE annot_p_bool;
+DROP TABLE annot_wf_num;
+DROP TABLE annot_wf_txt;
+DROP TABLE annot_wf_bool;
+DROP TABLE ds_usage;
+DROP TABLE ds_containment;
+DROP TABLE file;
+DROP TABLE variable;
+DROP TABLE execute2;
+DROP TABLE execute;
+DROP TABLE process;
+DROP TABLE dataset;
+DROP TABLE workflow;
DROP TABLE extrainfo;
DROP TABLE createarray;
DROP TABLE createarray_member;
DROP TABLE array_range;
-DROP TABLE dataset_annotations_numeric;
-DROP TABLE dataset_annotations_varchar;
-DROP TABLE dataset_annotations_boolean;
-DROP TABLE process_annotations_numeric;
-DROP TABLE process_annotations_varchar;
-DROP TABLE process_annotations_boolean;
-DROP TABLE workflow_annotations_numeric;
-DROP TABLE workflow_annotations_varchar;
-DROP TABLE workflow_annotations_boolean;
--- associates each process with its containing workflow
--- TODO - perhaps a workflow is itself a big big process?
--- in which case this looks very much like a compound/app
--- containment?
-CREATE TABLE processes_in_workflows
- (workflow_id varchar(2048),
- process_id varchar(2048),
- primary key (workflow_id, process_id)
- );
--- processes gives information about each process (in the OPM sense)
--- it is augmented by information in other tables
-CREATE TABLE processes
- (id varchar(2048) PRIMARY KEY, -- a uri
- type varchar(16) -- specifies the type of process. for any type, it
- -- must be the case that the specific type table
- -- has an entry for this process.
- -- Having this type here seems poor normalisation, though?
+-- workflow stores some information about each workflow log that has
+-- been seen by the importer: the log filename, Swift version, and import
+-- status.
+-- It might be interesting to store an XML translation of the SwiftScript
+-- code here for prospective provenance/versioning.
+CREATE TABLE workflow
+ (id varchar(256) PRIMARY KEY,
+ log_filename varchar(2048),
+ swift_version varchar(16),
+ import_status varchar(16),
+ start_time numeric,
+ duration numeric
);
+-- workflow_run stores the start time and duration for each workflow
+-- that has been successfully imported.
+--CREATE TABLE wf_run
+-- (id varchar(256) PRIMARY KEY REFERENCES workflow (id) ON DELETE CASCADE,
+-- start_time numeric,
+-- duration numeric
+-- );
--- this gives information about each execute.
--- each execute is identified by a unique URI. other information from
--- swift logs is also stored here. an execute is an OPM process.
-CREATE TABLE executes
- (id varchar(2048) PRIMARY KEY, -- actually foreign key to processes
- starttime numeric,
- duration numeric,
- finalstate varchar(2048),
- app varchar(2048),
- scratch varchar(2048)
+-- dataset stores all dataset identifiers.
+CREATE TABLE dataset
+ (id varchar(256) PRIMARY KEY
);
--- this gives information about each execute2, which is an attempt to
--- perform an execution. the execute2 id is tied to per-execution-attempt
--- information such as wrapper logs
-
-CREATE TABLE execute2s
- (id varchar(2048) PRIMARY KEY,
- execute_id varchar(2048), -- secondary key to executes and processes tables
- starttime numeric,
- duration numeric,
- finalstate varchar(2048),
- site varchar(2048)
+-- file stores the filename mapped to each dataset.
+CREATE TABLE file
+ ( id varchar(256) PRIMARY KEY REFERENCES dataset (id) ON DELETE CASCADE,
+ filename varchar(2048)
);
--- dataset_usage records usage relationships between processes and datasets;
--- in SwiftScript terms, the input and output parameters for each
--- application procedure invocation; in OPM terms, the artificts which are
--- input to and output from each process that is a Swift execution
-
--- TODO: no primary key here. should probably index both on execute_id and on
--- dataset_id for common queries? maybe add arbitrary ID for sake of it?
-
-CREATE TABLE dataset_usage
- (process_id varchar(2048), -- foreign key but not enforced because maybe process
- -- doesn't exist at time. same type as processes.id
- direction char(1), -- I or O for input or output
- dataset_id varchar(2048), -- this will perhaps key against dataset table
- param_name varchar(2048) -- the name of the parameter in this execute that
- -- this dataset was bound to. sometimes this must
- -- be contrived (for example, in positional varargs)
+-- variable stores the value for each dataset which is known to have
+-- a value (which is all assigned primitive types). No attempt is made here
+-- to expose that value as an SQL type other than a string, and so (for
+-- example) SQL numerical operations should not be expected to work, even
+-- though the user knows that a particular dataset stores a numeric value.
+CREATE TABLE variable
+ ( id varchar(256) PRIMARY KEY REFERENCES dataset (id) ON DELETE CASCADE,
+ value varchar(2048)
);
-
--- invocation_procedure_name maps each execute ID to the name of its
--- SwiftScript procedure
-
--- TODO probably desirable that this is part of executes table
--- but for now this is the easiest to pull data from logs.
-CREATE TABLE invocation_procedure_names
- (execute_id varchar(2048) PRIMARY KEY,
- procedure_name varchar(2048)
- );
-
-
-- dataset_containment stores the containment hierarchy between
-- container datasets (arrays and structs) and their contents.
-
--- outer_dataset_id contains inner_dataset_id
-
+-- out_id contains in_id
-- TODO this should perhaps be replaced with a more OPM-like model of
-- constructors and accessors, rather than, or in addition to,
-- a containment hierarchy. The relationship (such as array index or
-- structure member name) should also be stored in this table.
-CREATE TABLE dataset_containment
- ( outer_dataset_id varchar(2048),
- inner_dataset_id varchar(2048)
+CREATE TABLE ds_containment
+ ( out_id varchar(256) REFERENCES dataset (id) ON DELETE CASCADE,
+ in_id varchar(256) REFERENCES dataset (id) ON DELETE CASCADE,
+ PRIMARY KEY (out_id,in_id)
);
-
--- dataset_filenames stores the filename mapped to each dataset. As some
--- datasets do not have filenames, it should not be expected that
--- every dataset will have a row in this table
-CREATE TABLE dataset_filenames
- ( dataset_id varchar(2048) PRIMARY KEY,
- filename varchar(2048)
+-- process gives information about each process (in the OPM sense).
+-- It is augmented by information in other tables.
+-- The type column specifies the type of the process; for any type, the
+-- corresponding type-specific table must have an entry for this process.
+-- Having this type here seems poor normalisation, though?
+-- process types: internal, rootthread, execute, function, compound, scope, operator
+-- maybe create a table for each type?
+CREATE TABLE process
+ (id varchar(256) PRIMARY KEY,
+ type varchar(16),
+ name varchar(256), -- in the case of an execute this refers to the transformation name in tc.data
+ workflow_id varchar(256) REFERENCES workflow (id) ON DELETE CASCADE -- normalize: workflow_id of sub-procedure determined
+ -- by compound procedure
);
--- dataset_values stores the value for each dataset which is known to have
--- a value (which is all assigned primitive types). No attempt is made here
--- to expose that value as an SQL type other than a string, and so (for
--- example) SQL numerical operations should not be expected to work, even
--- though the user knows that a particular dataset stores a numeric value.
-CREATE TABLE dataset_values
- ( dataset_id varchar(2048) PRIMARY KEY,
- value varchar(2048)
+-- this gives information about each execute.
+-- each execute is identified by a unique URI. other information from
+-- swift logs is also stored here. an execute is an OPM process.
+CREATE TABLE execute
+ (id varchar(256) PRIMARY KEY REFERENCES process (id) ON DELETE CASCADE,
+ procedure_name varchar(256), -- name of the app procedure that invokes the transformation
+ start_time numeric,
+ duration numeric,
+ final_state varchar(16),
+ scratch varchar(2048)
);
--- The above dataset_* tables are the original containment representation
--- of datasets.
--- Now here are some tables to represent complex datasets differently, in
--- terms of accessors and constructors.
--- It is unclear which is the better representation.
-
-CREATE TABLE createarray
- ( array_id varchar(2048)
+-- this gives information about each execute2, which is an attempt to
+-- perform an execution. the execute2 id is tied to per-execution-attempt
+-- information such as wrapper logs
+CREATE TABLE execute2
+ (id varchar(256) PRIMARY KEY,
+ execute_id varchar(256) REFERENCES execute (id) ON DELETE CASCADE,
+ start_time numeric,
+ duration numeric,
+ final_state varchar(16),
+ site varchar(256)
);
-CREATE TABLE createarray_member
- ( array_id varchar(2048),
- ix varchar(2048),
- member_id varchar(2048)
+-- ds_usage records usage relationships between processes and datasets;
+-- in SwiftScript terms, the input and output parameters for each
+-- application procedure invocation; in OPM terms, the artifacts which are
+-- input to and output from each process that is a Swift execution
+CREATE TABLE ds_usage
+ (process_id varchar(256) REFERENCES process(id) ON DELETE CASCADE,
+ direction char(1), -- I or O for input or output
+ dataset_id varchar(256) REFERENCES dataset(id) ON DELETE CASCADE,
+ param_name varchar(256), -- the name of the parameter in this execute that
+ -- this dataset was bound to. sometimes this must
+ -- be contrived (for example, in positional varargs)
+ PRIMARY KEY (process_id,direction,dataset_id,param_name)
);
--- TODO step
-CREATE TABLE array_range
- ( array_id varchar(2048),
- from_id varchar(2048),
- to_id varchar(2048),
- step_id varchar(2048) -- nullable, if step is unspecified
- );
-
--- known_workflows stores some information about each workflow log that has
--- been seen by the importer: the log filename, swift version and import
--- status.
-CREATE TABLE known_workflows
- (
- workflow_id varchar(2048) PRIMARY KEY,
- workflow_log_filename varchar(2048),
- version varchar(2048),
- importstatus varchar(2048)
- );
-
-
--- workflow_events stores the start time and duration for each workflow
--- that has been successfully imported.
-CREATE TABLE workflow_events
- ( workflow_id varchar(2048) PRIMARY KEY,
- starttime numeric,
- duration numeric
- );
-
--- extrainfo stores lines generated by the SWIFT_EXTRA_INFO feature
-CREATE TABLE extrainfo
- ( execute2id varchar(2048),
- extrainfo varchar(1024)
- );
-
-- annotations
-CREATE TABLE dataset_annotations_numeric
- ( dataset_id varchar(2048),
- name varchar(2048),
+CREATE TABLE annot_ds_num
+ ( id varchar(256) REFERENCES dataset (id) ON DELETE CASCADE,
+ name varchar(256),
value numeric,
- primary key(dataset_id, name)
+ PRIMARY KEY (id, name)
);
-CREATE TABLE dataset_annotations_varchar
- ( dataset_id varchar(2048),
- name varchar(2048),
- value varchar(4096),
- primary key(dataset_id, name)
+CREATE TABLE annot_ds_txt
+ ( id varchar(256) REFERENCES dataset (id) ON DELETE CASCADE,
+ name varchar(256),
+ value varchar(2048),
+ PRIMARY KEY (id, name)
);
-CREATE TABLE dataset_annotations_boolean
- ( dataset_id varchar(2048),
- name varchar(2048),
+CREATE TABLE annot_ds_bool
+ ( id varchar(256) REFERENCES dataset (id) ON DELETE CASCADE,
+ name varchar(256),
value boolean,
- primary key(dataset_id, name)
+ PRIMARY KEY (id, name)
);
-CREATE TABLE process_annotations_numeric
- ( process_id varchar(2048),
- name varchar(2048),
+CREATE TABLE annot_p_num
+ ( id varchar(256) REFERENCES process (id) ON DELETE CASCADE,
+ name varchar(256),
value numeric,
- primary key(process_id, name)
+ PRIMARY KEY (id, name)
);
-CREATE TABLE process_annotations_varchar
- ( process_id varchar(2048),
- name varchar(2048),
- value varchar(1024),
- primary key(process_id, name)
+CREATE TABLE annot_p_txt
+ ( id varchar(256) REFERENCES process (id) ON DELETE CASCADE,
+ name varchar(256),
+ value varchar(2048),
+ PRIMARY KEY (id, name)
);
-CREATE TABLE process_annotations_boolean
- ( process_id varchar(2048),
- name varchar(2048),
+CREATE TABLE annot_p_bool
+ ( id varchar(256) REFERENCES process (id) ON DELETE CASCADE,
+ name varchar(256),
value boolean,
- primary key(process_id, name)
+ PRIMARY KEY (id, name)
);
-CREATE TABLE workflow_annotations_numeric
- ( workflow_id varchar(2048),
- name varchar(2048),
+CREATE TABLE annot_wf_num
+ ( id varchar(256) REFERENCES workflow (id) ON DELETE CASCADE,
+ name varchar(256),
value numeric,
- primary key(workflow_id, name)
+ PRIMARY KEY (id, name)
);
-CREATE TABLE workflow_annotations_varchar
- ( workflow_id varchar(2048),
- name varchar(2048),
- value varchar(1024),
- primary key(workflow_id, name)
+CREATE TABLE annot_wf_txt
+ ( id varchar(256) REFERENCES workflow (id) ON DELETE CASCADE,
+ name varchar(256),
+ value varchar(2048),
+ PRIMARY KEY (id, name)
);
-CREATE TABLE workflow_annotations_boolean
- ( workflow_id varchar(2048),
- name varchar(2048),
+CREATE TABLE annot_wf_bool
+ ( id varchar(256) REFERENCES workflow (id) ON DELETE CASCADE,
+   name varchar(256),
value boolean,
- primary key(workflow_id, name)
+ PRIMARY KEY (id, name)
);
+
+-- extrainfo stores lines generated by the SWIFT_EXTRA_INFO feature
+-- extrainfo will be processed into annotation tables
+-- CREATE TABLE extrainfo
+-- ( id varchar(256) PRIMARY KEY REFERENCES execute (id) ON DELETE CASCADE,
+-- extrainfo varchar(2048)
+-- );
+
+-- The above dataset tables are the original containment representation
+-- of datasets.
+-- Now here are some tables to represent complex datasets differently, in
+-- terms of accessors and constructors.
+-- It is unclear which is the better representation.
+
+CREATE TABLE createarray
+ ( id varchar(256) PRIMARY KEY
+ );
+
+CREATE TABLE createarray_member
+ ( array_id varchar(256) REFERENCES createarray (id) ON DELETE CASCADE,
+ ix varchar(256),
+ member_id varchar(256),
+ PRIMARY KEY (array_id, ix)
+ );
+
+-- TODO step
+CREATE TABLE array_range
+ ( array_id varchar(256) REFERENCES createarray (id) ON DELETE CASCADE,
+ from_id varchar(256),
+ to_id varchar(256),
+ step_id varchar(256), -- nullable, if step is unspecified
+ PRIMARY KEY (array_id,from_id,to_id,step_id)
+ );
+
-- this GRANT does not work for sqlite; you'll get a syntax error but
-- ignore it, as it is not needed in sqlite
-grant all on dataset_containment, dataset_filenames, dataset_usage, processes_in_workflows, invocation_procedure_names, known_workflows, workflow_events to public, operators;
+grant all on
+ dataset,
+ file,
+ variable,
+ ds_containment,
+ process,
+ execute,
+ execute2,
+ workflow,
+ ds_usage,
+ annot_ds_num,
+ annot_ds_txt,
+ annot_ds_bool,
+ annot_p_num,
+ annot_p_txt,
+ annot_p_bool,
+ annot_wf_num,
+ annot_wf_txt,
+ annot_wf_bool,
+ extrainfo,
+ createarray,
+ createarray_member,
+ array_range
+to public, operators;
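
As a quick sanity check of the new schema, a query along the following lines (a sketch, assuming the PostgreSQL connection in $SQLCMD) lists the input files bound to each app execution:

  $SQLCMD -c "
    SELECT p.name, u.param_name, f.filename
      FROM process p
      JOIN ds_usage u ON u.process_id = p.id AND u.direction = 'I'
      JOIN file f ON f.id = u.dataset_id
     WHERE p.type = 'execute';"
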
Modified: provenancedb/prov-sql-generate-transitive-closures.sql
===================================================================
--- provenancedb/prov-sql-generate-transitive-closures.sql 2010-11-08 20:30:09 UTC (rev 3705)
+++ provenancedb/prov-sql-generate-transitive-closures.sql 2010-11-10 18:18:39 UTC (rev 3706)
@@ -10,13 +10,13 @@
-- of same?
CREATE TABLE trans
- (before char(128),
- after char(128),
+ (before varchar(2048),
+ after varchar(2048),
CONSTRAINT no_duplicate_arcs_in_trans UNIQUE (before, after));
CREATE TABLE transbase
- (before char(128),
- after char(128),
+ (before varchar(2048),
+ after varchar(2048),
CONSTRAINT no_duplicate_arcs_in_transbase UNIQUE (before, after)
);
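
With transbase holding the direct dependency arcs and trans their transitive closure, lineage questions become single lookups; for example, all ancestors of a dataset (the dataset URI below is a placeholder):

  $SQLCMD -c "SELECT before FROM trans
               WHERE after = 'tag:ci.uchicago.edu,2008:swift:dataset:EXAMPLE';"
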
Modified: provenancedb/prov-to-sql.sh
===================================================================
--- provenancedb/prov-to-sql.sh 2010-11-08 20:30:09 UTC (rev 3705)
+++ provenancedb/prov-to-sql.sh 2010-11-10 18:18:39 UTC (rev 3706)
@@ -1,12 +1,11 @@
#!/bin/bash
-
export RUNID=$(basename $1 .log)
-export WFID="tag:benc@ci.uchicago.edu,2008:swiftlogs:execute:${RUNID}:"
+export WFID="tag:ci.uchicago.edu,2008:swiftlogs:execute:${RUNID}:"
# TODO is there already a URI form for identifying workflows?
-export WF="tag:benc@ci.uchicago.edu,2008:swiftlogs:execute:${RUNID}:run"
+export WF="tag:ci.uchicago.edu,2008:swiftlogs:execute:${RUNID}:run"
echo Generating SQL for $RUNID
@@ -14,110 +13,147 @@
# this gives a distinction between the root process for a workflow and the
# workflow itself. perhaps better to model the workflow as a process
-echo "INSERT INTO processes_in_workflows (process_id, workflow_id) VALUES ('${WFID}0', '$WF');" >> tmp-import.sql
- echo "INSERT INTO processes (id, type) VALUES ('${WFID}0', 'rootthread');" >> tmp-import.sql
+echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('${WFID}0', 'rootthread', '$RUNID', '$WF');" >> tmp-import.sql
-while read time duration thread localthread endstate app scratch; do
- echo "INSERT INTO processes (id, type) VALUES ('$thread', 'execute');" >> tmp-import.sql
- echo "INSERT INTO processes_in_workflows (process_id, workflow_id) VALUES ('$thread', '$WF');" >> tmp-import.sql
- echo "INSERT INTO executes (id, starttime, duration, finalstate, app, scratch) VALUES ('$thread', $time, $duration, '$endstate', '$app', '$scratch');" >> tmp-import.sql
+
+while read time duration thread localthread endstate tr_name scratch; do
+ echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$thread', 'execute', '$tr_name', '$WF');" >> tmp-import.sql
+ echo "INSERT INTO execute (id, start_time, duration, final_state, scratch) VALUES ('$thread', $time, $duration, '$endstate', '$scratch');" >> tmp-import.sql
done < execute.global.event
-while read starttime duration globalid id endstate thread site scratch; do
+while read start_time duration globalid id endstate thread site scratch; do
+ # cut off the last component of the thread, so that we end up at the
+ # parent thread id which should correspond with the execute-level ID
+ inv_id="$WFID$(echo $thread | sed 's/-[^-]*$//')"
+ echo "INSERT INTO execute2 (id, execute_id, start_time, duration, final_state, site) VALUES ('$globalid', '$inv_id', $start_time, $duration, '$endstate', '$site');" >> tmp-import.sql
+done < execute2.global.event
- # cut off the last component of the thread, so that we end up at the
- # parent thread id which should correspond with the execute-level ID
- execute_id="$WFID$(echo $thread | sed 's/-[^-]*$//')"
- echo "INSERT INTO execute2s (id, execute_id, starttime, duration, finalstate, site) VALUES ('$globalid', '$execute_id', '$starttime', '$duration', '$endstate', '$site');" >> tmp-import.sql
-done < execute2.global.event
-while read thread name lhs rhs result; do
- operatorid="${WFID}operator:$thread"
- echo "INSERT INTO processes (id, type) VALUES ('$operatorid', 'operator');" >> tmp-import.sql
- echo "INSERT INTO invocation_procedure_names (execute_id, procedure_name) VALUES ('$operatorid','$name'); " >> tmp-import.sql
- echo "INSERT INTO processes_in_workflows (process_id, workflow_id) VALUES ('$operatorid', '$WF');" >> tmp-import.sql
- echo "INSERT INTO dataset_usage (process_id, direction, dataset_id, param_name) VALUES ('$operatorid', 'I', '$lhs', 'lhs');" >> tmp-import.sql
- echo "INSERT INTO dataset_usage (process_id, direction, dataset_id, param_name) VALUES ('$operatorid', 'I', '$rhs', 'rhs');" >> tmp-import.sql
- echo "INSERT INTO dataset_usage (process_id, direction, dataset_id, param_name) VALUES ('$operatorid', 'O', '$result', 'result');" >> tmp-import.sql
+while read col1 col2 col3 col4 col5 threadst namest lhsst rhsst resultst; do
+ thread=`echo $threadst | awk 'BEGIN { FS = "=" }; {print $2}'`
+  name=`echo $namest | awk 'BEGIN { FS = "=" }; {print $2}'`
+ lhs=`echo $lhsst | awk 'BEGIN { FS = "=" }; {print $2}'`
+ rhs=`echo $rhsst | awk 'BEGIN { FS = "=" }; {print $2}'`
+ result=`echo $resultst | awk 'BEGIN { FS = "=" }; {print $2}'`
+
+ operatorid="${WFID}operator:$thread"
+
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$lhs';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$lhs');" >> tmp-import.sql
+ fi
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$rhs';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$rhs');" >> tmp-import.sql
+ fi
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$result';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$result');" >> tmp-import.sql
+ fi
+ echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$operatorid', 'operator', '$name', '$WF');" >> tmp-import.sql
+ echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$operatorid', 'I', '$lhs', 'lhs');" >> tmp-import.sql
+ echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$operatorid', 'I', '$rhs', 'rhs');" >> tmp-import.sql
+ echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$operatorid', 'O', '$result', 'result');" >> tmp-import.sql
done < operators.txt
while read id name output; do
- echo "INSERT INTO processes (id, type) VALUES ('$id', 'function');" >> tmp-import.sql
- echo "INSERT INTO processes_in_workflows (process_id, workflow_id) VALUES ('$id', '$WF');" >> tmp-import.sql
- echo "INSERT INTO dataset_usage (process_id, direction, dataset_id, param_name) VALUES ('$id', 'O', '$output', 'result');" >> tmp-import.sql
- echo "INSERT INTO invocation_procedure_names (execute_id, procedure_name) VALUES ('$id', '$name');" >> tmp-import.sql
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$id';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$id');" >> tmp-import.sql
+ fi
+ echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$id', 'function', '$name', '$WF');" >> tmp-import.sql
+ echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$id', 'O', '$output', 'result');" >> tmp-import.sql
done < functions.txt
while read id value; do
-# TODO need ordering/naming
- echo "INSERT INTO dataset_usage (process_id, direction, dataset_id, param_name) VALUES ('$id', 'I', '$value', 'undefined');" >> tmp-import.sql
+ # TODO need ordering/naming
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$id';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$id');" >> tmp-import.sql
+ fi
+ echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$id', 'I', '$value', 'undefined');" >> tmp-import.sql
done < function-inputs.txt
while read thread direction dataset variable rest; do
- if [ "$direction" == "input" ] ; then
- dir=I
- else
- dir=O
- fi
- echo "INSERT INTO dataset_usage (process_id, direction, dataset_id, param_name) VALUES ('$thread', '$dir', '$dataset', '$variable');" >> tmp-import.sql
+ if [ "$direction" == "input" ] ; then
+ dir=I
+ else
+ dir=O
+ fi
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$dataset';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$dataset');" >> tmp-import.sql
+ fi
+ echo "INSERT INTO ds_usage (process_id, direction, dataset_id, param_name) VALUES ('$thread', '$dir', '$dataset', '$variable');" >> tmp-import.sql
done < tie-data-invocs.txt
while read thread appname; do
- echo "INSERT INTO invocation_procedure_names (execute_id, procedure_name) VALUES ('$thread', '$appname');" >> tmp-import.sql
-
+ echo "UPDATE execute SET procedure_name='$appname' WHERE id='$thread';" >> tmp-import.sql
done < invocation-procedure-names.txt
while read outer inner; do
- echo "INSERT INTO dataset_containment (outer_dataset_id, inner_dataset_id) VALUES ('$outer', '$inner');" >> tmp-import.sql
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$outer';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$outer');" >> tmp-import.sql
+ fi
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$inner';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$inner');" >> tmp-import.sql
+ fi
+ echo "INSERT INTO ds_containment (out_id, in_id) VALUES ('$outer', '$inner');" >> tmp-import.sql
done < tie-containers.txt
while read dataset filename; do
- echo "INSERT INTO dataset_filenames (dataset_id, filename) VALUES ('$dataset', '$filename');" >> tmp-import.sql
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$dataset';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$dataset');" >> tmp-import.sql
+ fi
+ echo "INSERT INTO file (id, filename) VALUES ('$dataset', '$filename');" >> tmp-import.sql
done < dataset-filenames.txt
while read dataset value; do
- echo "INSERT INTO dataset_values (dataset_id, value) VALUES ('$dataset', '$value');" >> tmp-import.sql
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$dataset';")
+ if [ "$EXISTING" -eq "0" ]; then
+ echo "INSERT INTO dataset (id) VALUES ('$dataset');" >> tmp-import.sql
+ fi
+ echo "INSERT INTO variable (id, value) VALUES ('$dataset', '$value');" >> tmp-import.sql
done < dataset-values.txt
-while read start duration wfid; do
- echo "INSERT INTO workflow_events (workflow_id,starttime, duration) VALUES ('$wfid', '$start', '$duration');" >> tmp-import.sql
+while read start duration wfid rest; do
+ echo "UPDATE workflow SET start_time=$start WHERE id='$WF';" >> tmp-import.sql
+ echo "UPDATE workflow SET duration=$duration WHERE id='$WF';" >> tmp-import.sql
done < workflow.event
while read id extrainfo ; do
-# TODO this will not like quotes and things like that in extrainfo
- echo "INSERT INTO extrainfo (execute2id, extrainfo) VALUES ('$id', '$extrainfo');" >> tmp-import.sql
+ # TODO this will not like quotes and things like that in extrainfo
+ echo "INSERT INTO extrainfo (id, extrainfo) VALUES ('$id', '$extrainfo');" >> tmp-import.sql
done < extrainfo.txt
# TODO this could merge with other naming tables
-while read start duration thread finalstate procname ; do
- compoundid=$WFID$thread
- echo "INSERT INTO processes (id, type) VALUES ('$compoundid', 'compound');" >> tmp-import.sql
- echo "INSERT INTO processes_in_workflows (process_id, workflow_id) VALUES ('$compound', '$WF');" >> tmp-import.sql
- echo "INSERT INTO invocation_procedure_names (execute_id, procedure_name) VALUES ('$compoundid', '$procname');" >> tmp-import.sql
+while read start duration thread final_state procname ; do
+ compoundid=$WFID$thread
+ echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$compoundid', 'compound', '$procname', '$WF');" >> tmp-import.sql
done < compound.event
-while read start duration thread finalstate procname ; do
- fqid=$WFID$thread
- echo "INSERT INTO processes (id, type) VALUES ('$fqid', 'internal');" >> tmp-import.sql
- echo "INSERT INTO processes_in_workflows (process_id, workflow_id) VALUES ('$fqid', '$WF');" >> tmp-import.sql
- echo "INSERT INTO invocation_procedure_names (execute_id, procedure_name) VALUES ('$fqid', '$procname');" >> tmp-import.sql
+while read start duration thread final_state procname ; do
+ fqid=$WFID$thread
+ echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$fqid', 'internal', '$procname', '$WF');" >> tmp-import.sql
done < internalproc.event
+while read t ; do
+ thread="${WFID}$t"
+ echo "INSERT INTO process (id, type, name, workflow_id) VALUES ('$thread', 'scope', 'scope', '$WF');" >> tmp-import.sql
+done < scopes.txt
+
while read id ; do
- echo "INSERT INTO createarray (array_id) VALUES ('$id');" >> tmp-import.sql
+ echo "INSERT INTO createarray (array_id) VALUES ('$id');" >> tmp-import.sql
done < createarray.txt
while read arrayid index memberid ; do
- echo "INSERT INTO createarray_member (array_id, ix, member_id) VALUES ('$arrayid', '$index', '$memberid');" >> tmp-import.sql
+ echo "INSERT INTO createarray_member (array_id, ix, member_id) VALUES ('$arrayid', '$index', '$memberid');" >> tmp-import.sql
done < createarray-members.txt
-while read t ; do
- thread="${WFID}$t"
- echo "INSERT INTO processes (id, type) VALUES ('$thread', 'scope');" >> tmp-import.sql
- echo "INSERT INTO processes_in_workflows (process_id, workflow_id) VALUES ('$thread', '$WF');" >> tmp-import.sql
-done < scopes.txt
-
echo Sending SQL to DB
$SQLCMD < tmp-import.sql
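
The dataset existence check above is repeated verbatim in several of the loops; a possible refactoring (a sketch, not part of this commit) would pull it into one helper:

  # emit an INSERT for a dataset id only if it is not already in the database
  ensure_dataset() {
    local ds="$1"
    local existing=$($SQLCMD --tuples-only -c "select count(*) from dataset where id='$ds';")
    if [ "$existing" -eq "0" ]; then
      echo "INSERT INTO dataset (id) VALUES ('$ds');" >> tmp-import.sql
    fi
  }
  # usage inside the loops: ensure_dataset "$lhs"; ensure_dataset "$rhs"; ensure_dataset "$result"
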
Modified: provenancedb/swift-prov-import-all-logs
===================================================================
--- provenancedb/swift-prov-import-all-logs 2010-11-08 20:30:09 UTC (rev 3705)
+++ provenancedb/swift-prov-import-all-logs 2010-11-10 18:18:39 UTC (rev 3706)
@@ -11,11 +11,13 @@
source $PROVDIR/etc/provenance.config
export PATH=$PROVDIR:$PATH
+# this generates a file with one line per log, of the form:
+# <starttime> <swift version> <logfilename>
swift-plot-log $LOGREPO everylog-vs-versions.data
if [ "$?" != "0" ]; then
- echo swift-plot-log failed when building everylog-vs-versions.data
- exit 1
+ echo swift-plot-log failed when building everylog-vs-versions.data
+ exit 1
fi
# TODO better tmp handling than always using the same name in a shared
@@ -24,53 +26,55 @@
echo first commandline param is $1
if [ "$1" == "rebuild" ]; then
- echo CLEANING DATABASE
- $SQLCMD < $PROVDIR/prov-init.sql
+ echo CLEANING DATABASE
+ $SQLCMD < $PROVDIR/prov-init.sql
fi
while read start version filename; do
+
+ export IDIR=$(echo $filename | sed 's/\.log$/.d/')
+ echo IDIR=$IDIR
+ if [ $version -ge 1538 ]; then
+ echo -n "Log: $filename ... "
+
+ # TODO: does not work in sqlite
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from workflow where log_filename='$filename';")
+
+ if [ "$EXISTING" -eq "0" ]; then
+ echo IMPORTING
+
+ if grep --silent "DEBUG Loader Swift finished with no errors" $filename; then
+ wfstatus="SUCCESS"
+ else
+ wfstatus="FAIL"
+ fi
- export IDIR=$(echo $filename | sed 's/\.log$/.d/')
- echo IDIR=$IDIR
- if [ $version -ge 1538 ]; then
- echo -n "Log: $filename ... "
+ export RUNID=$(basename $filename .log)
+    # changing tag authority from the e-mail benc@ci.uchicago.edu to the dnsname ci.uchicago.edu
+ export WF="tag:ci.uchicago.edu,2008:swiftlogs:execute:${RUNID}:run"
+
+ echo "INSERT INTO workflow (id, log_filename, swift_version, import_status) VALUES ('$WF','$filename','$version','$wfstatus');" | $SQLCMD
-# TODO: make it work in general (sqlite, mysql, ...)
-# Works only with PostgreSQL
-EXISTING=$($SQLCMD --tuples-only -c "select count(*) from known_workflows where workflow_log_filename='$filename';")
-
-if [ "$EXISTING" -eq "0" ]; then
- echo IMPORTING
-
- if grep --silent "DEBUG Loader Swift finished with no errors" $filename; then
- wfstatus="SUCCESS"
- else
- wfstatus="FAIL"
+
+ echo version $version in log file $filename
+ echo ============= will import =============
+ prepare-for-import $filename
+ if [ "$?" != "0" ]; then
+ echo prepare-for-import failed
+ exit 2
+ fi
+ import-run-to-sql $filename
+ if [ "$?" != "0" ]; then
+ echo import-run-to-sql failed
+ exit 3
+ fi
+
+ # import-run-to-xml $filename
+
+ else
+ echo SKIP: Already known in workflow
+ fi
fi
-
- echo version $version in log file $filename
- echo ============= will import =============
- prepare-for-import $filename
- if [ "$?" != "0" ]; then
- echo prepare-for-import failed
- exit 2
- fi
- import-run-to-sql $filename
- if [ "$?" != "0" ]; then
- echo import-run-to-sql failed
- exit 3
- fi
-
-# import-run-to-xml $filename
-
- export RUNID=$(basename $filename .log)
- export WF="tag:benc@ci.uchicago.edu,2008:swiftlogs:execute:${RUNID}:run"
-
- echo "INSERT INTO known_workflows (workflow_id, workflow_log_filename, version, importstatus) VALUES ('$WF','$filename','$version','$wfstatus');" | $SQLCMD
-else
- echo SKIP: Already known in workflow
-fi
- fi
done < /tmp/everylog-vs-versions.data
# now pull the XML data into eXist, in as few runs as possible to avoid
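
Typical invocations, given the configuration described in the README:

  # wipe and reinitialize the database, then import every log under $LOGREPO
  ./swift-prov-import-all-logs rebuild
  # incremental run: only logs whose filename is not yet in the workflow table are imported
  ./swift-prov-import-all-logs
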