[Swift-commit] r4469 - in provenancedb: . apps/oops
lgadelha at ci.uchicago.edu
Thu May 12 19:24:49 CDT 2011
Author: lgadelha
Date: 2011-05-12 19:24:49 -0500 (Thu, 12 May 2011)
New Revision: 4469
Modified:
provenancedb/apps/oops/oops_extractor.sh
provenancedb/pql_functions.sql
provenancedb/prov-init.sql
provenancedb/prov-to-sql.sh
provenancedb/swift-prov-import-all-logs
Log:
Schema update: shorten table and column names (process -> proc, dataset -> ds, annot_* -> a_*, etc.), add a cog_version column to run, have prov-to-sql.sh issue SQL directly through $SQLCMD -c instead of building import.sql, and adjust pql_functions.sql and the OOPS extractor to the new names.
Modified: provenancedb/apps/oops/oops_extractor.sh
===================================================================
--- provenancedb/apps/oops/oops_extractor.sh 2011-05-12 21:22:39 UTC (rev 4468)
+++ provenancedb/apps/oops/oops_extractor.sh 2011-05-13 00:24:49 UTC (rev 4469)
@@ -4,9 +4,10 @@
#
PROVDB_HOME=~/provenancedb
-PROTESTS_HOME=/gpfs/pads/oops/aashish/CASP
+PROTESTS_HOME=/gpfs/pads/oops/aashish/CASP9
IMPORT_HOME=~/protests
+cp $PROVDB_HOME/etc/provenance.config.ci $PROVDB_HOME/etc/provenance.config
source $PROVDB_HOME/etc/provenance.config
# provdb_imported records runs already imported to the provenance database
@@ -39,13 +40,11 @@
LOG_FILENAME=$(ls *-*-*-*.log)
WORKFLOW_ID=$(echo "select id from run where log_filename like '%$LOG_FILENAME%'" | $SQLCMD -t | awk '{print $1}')
cd $IMPORT_HOME/swift-logs
- echo "insert into annot_run_text (run_id, name, value) values ('$WORKFLOW_ID','oops_run_id','$OOPS_RUN_ID');" | $SQLCMD
+ $SQLCMD -c "insert into a_run_t (run_id, name, value) values ('$WORKFLOW_ID','oops_run_id','$OOPS_RUN_ID');"
#extracts scientific parameters given as input to the workflow in file *.params.
- echo "select file.id,file.filename from process, dataset_production, dataset_containment, file where process.id=dataset_production.process_id and dataset_production.dataset_id=out_id and file.id=dataset_containment.in_id and filename like '%.params' and process.name='PrepLoop' and process.run_id='$WORKFLOW_ID';" > query.sql;
+ $SQLCMD -t -A -F " " -c "select file.id,file.name from proc, ds_out, ds_cont, file where proc.id=ds_out.proc_id and ds_out.ds_id=out_id and file.id=ds_cont.in_id and file.name like '%.params' and proc.name='PrepLoop' and proc.run_id='$WORKFLOW_ID';" -o result.txt
- $SQLCMD -t -A -F " " -f query.sql -o result.txt
-
DATASET_ID=$(awk '{if (NR==1) print $1}' result.txt)
FILENAME=$(awk '{if (NR==1) print $2}' result.txt | sed 's/file:\/\/localhost\///g')
@@ -55,26 +54,24 @@
NAME=$(echo $line | awk 'BEGIN { FS = "=" }; {print $1}')
RIGHT=$(echo $line | awk 'BEGIN { FS = "=" }; {print $2}')
if [ "$NAME" = "SAMPLE RANGE" ]; then
- echo "insert into annot_dataset_text values ('$DATASET_ID', 'sample_range', '$RIGHT');" | $SQLCMD
+ $SQLCMD -c "insert into a_ds_t values ('$DATASET_ID', 'sample_range', '$RIGHT');"
fi
if [ "$NAME" = "RESTRAIN DISTANCE" ]; then
VALUE1=$(echo $RIGHT | awk 'BEGIN { FS = "," }; {print $1}')
VALUE2=$(echo $line | awk 'BEGIN { FS = "=" }; {print $2}' | awk 'BEGIN { FS = "," }; {print $2}')
- echo "insert into annot_dataset_numeric values ('$DATASET_ID', 'restrain_distance_1', $VALUE1);" | $SQLCMD
- echo "insert into annot_dataset_numeric values ('$DATASET_ID', 'restrain_distance_2', $VALUE2);" | $SQLCMD
+ $SQLCMD -c "insert into a_ds_n values ('$DATASET_ID', 'restrain_distance_1', $VALUE1);"
+ $SQLCMD -c "insert into a_ds_n values ('$DATASET_ID', 'restrain_distance_2', $VALUE2);"
fi
if [ "$NAME" = "MAXIMUM NUMBER OF STEPS" ]; then
- echo "insert into annot_dataset_numeric values ('$DATASET_ID', 'maximum_number_of_steps', $RIGHT);" | $SQLCMD
+ $SQLCMD -c "insert into a_ds_n values ('$DATASET_ID', 'maximum_number_of_steps', $RIGHT);"
fi
done < $FILENAME
#extracts length of the fasta sequence given as input to the workflow in file *.fasta.
cd $IMPORT_HOME/swift-logs
- echo "select file.id,file.filename from process, dataset_production, dataset_containment, file where process.id=dataset_production.process_id and dataset_production.dataset_id=out_id and file.id=dataset_containment.in_id and filename like '%.fasta' and process.name='PrepLoop' and process.run_id='$WORKFLOW_ID';" > query.sql;
+ $SQLCMD -t -A -F " " -c "select file.id,file.name from proc, ds_out, ds_cont, file where proc.id=ds_out.proc_id and ds_out.ds_id=out_id and file.id=ds_cont.in_id and file.name like '%.fasta' and proc.name='PrepLoop' and proc.run_id='$WORKFLOW_ID';" -o result.txt
- $SQLCMD -t -A -F " " -f query.sql -o result.txt
-
DATASET_ID=$(awk '{if (NR==1) print $1}' result.txt)
FILENAME=$(awk '{if (NR==1) print $2}' result.txt | sed 's/file:\/\/localhost\///g')
@@ -82,7 +79,7 @@
if [ -n "$FILENAME" ]; then
SEQLENGTH=$(awk '{if (NR==2) print $1}' $FILENAME | wc -c)
- echo "insert into annot_dataset_numeric values ('$DATASET_ID', 'fasta_sequence_length', $SEQLENGTH);" | $SQLCMD
+ $SQLCMD -c "insert into a_ds_n values ('$DATASET_ID', 'fasta_sequence_length', $SEQLENGTH);"
fi
# extracts scientific parameters given as output by the workflow in *.log.
@@ -99,10 +96,8 @@
cd $IMPORT_HOME/swift-logs
- echo "select file.id,file.filename from process, dataset_production, dataset_containment, file where process.id=dataset_production.process_id and dataset_production.dataset_id=out_id and file.id=dataset_containment.in_id and filename like '%.log' and process.name='LoopModel' and process.run_id='$WORKFLOW_ID';" > query.sql;
+ $SQLCMD -t -A -F " " -c "select file.id,file.name from proc, ds_out, ds_cont, file where proc.id=ds_out.proc_id and ds_out.ds_id=out_id and file.id=ds_cont.in_id and file.name like '%.log' and proc.name='LoopModel' and proc.run_id='$WORKFLOW_ID';" -o result.txt
- $SQLCMD -t -A -F " " -f query.sql -o result.txt
-
cd $PROTESTS_HOME/$k/$i
while read dataset filename; do
@@ -110,28 +105,28 @@
while read token1 token2 token3 token4; do
if [ "$token2" = "(Initial Energy:" ]; then
initialenergy=$(echo $token3 | awk 'BEGIN { FS = "\)" }; {print $1}')
- echo "insert into annot_dataset_numeric (dataset_id, name, value) values ('$dataset', 'initial_energy', $initialenergy);" | $SQLCMD
+ $SQLCMD -c "insert into a_ds_n (ds_id, name, value) values ('$dataset', 'initial_energy', $initialenergy);"
fi
if [ "$token1" = "Total" ] && [ "$token2" = "Function" ] && [ "$token3" = "Evaluations:" ]; then
- echo "insert into annot_dataset_numeric (dataset_id, name, value) values ('$dataset', 'total_function_evaluations', $token4);" | $SQLCMD
+ $SQLCMD -c "insert into a_ds_n (ds_id, name, value) values ('$dataset', 'total_function_evaluations', $token4);"
fi
if [ "$token1" = "Increasing" ] && [ "$token2" = "transitions:" ]; then
- echo "insert into annot_dataset_numeric (dataset_id, name, value) values ('$dataset', 'accepted_increasing_transitions', $token3);" | $SQLCMD
+ $SQLCMD -c "insert into a_ds_n (ds_id, name, value) values ('$dataset', 'accepted_increasing_transitions', $token3);"
fi
if [ "$token1" = "Decreasing" ] && [ "$token2" = "transitions:" ]; then
- echo "insert into annot_dataset_numeric (dataset_id, name, value) values ('$dataset', 'accepted_decreasing_transitions', $token3);" | $SQLCMD
+ $SQLCMD -c "insert into a_ds_n (ds_id, name, value) values ('$dataset', 'accepted_decreasing_transitions', $token3);"
fi
if [ "$token1" = "Rejected" ] && [ "$token2" = "transitions:" ]; then
- echo "insert into annot_dataset_numeric (dataset_id, name, value) values ('$dataset', 'rejected_transitions', $token3);" | $SQLCMD
+ $SQLCMD -c "insert into a_ds_n (ds_id, name, value) values ('$dataset', 'rejected_transitions', $token3);"
fi
if [ "$token1" = "Final" ] && [ "$token2" = "Energy:" ]; then
- echo "insert into annot_dataset_numeric (dataset_id, name, value) values ('$dataset', 'final_energy', $token3);" | $SQLCMD
+ $SQLCMD -c "insert into a_ds_n (ds_id, name, value) values ('$dataset', 'final_energy', $token3);"
fi
if [ "$token1" = "Final" ] && [ "$token2" = "Temp:" ]; then
- echo "insert into annot_dataset_numeric (dataset_id, name, value) values ('$dataset', 'final_temp', $token3);" | $SQLCMD
+ $SQLCMD -c "insert into a_ds_n (ds_id, name, value) values ('$dataset', 'final_temp', $token3);"
fi
if [ "$token1" = "Total" ] && [ "$token2" = "Running" ] && [ "$token3" = "Time:" ]; then
- echo "insert into annot_dataset_numeric (dataset_id, name, value) values ('$dataset', 'total_running_time', $token4);" | $SQLCMD
+ $SQLCMD -c "insert into a_ds_n (ds_id, name, value) values ('$dataset', 'total_running_time', $token4);"
fi
done < $FILENAME
done < $IMPORT_HOME/swift-logs/result.txt
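(A usage sketch, not part of the committed diff: assuming the renamed tables a_run_t, proc, ds_out, ds_cont and a_ds_n defined in prov-init.sql later in this revision, the annotations written by this extractor could be queried roughly as follows; the join path and annotation names mirror the inserts above.)

    -- sketch: list the final energy recorded for each OOPS run id
    SELECT r.value AS oops_run_id, d.value AS final_energy
    FROM a_run_t r, proc p, ds_out o, ds_cont c, a_ds_n d
    WHERE r.name = 'oops_run_id'
      AND p.run_id = r.run_id
      AND p.name = 'LoopModel'
      AND o.proc_id = p.id
      AND c.out_id = o.ds_id
      AND d.ds_id = c.in_id
      AND d.name = 'final_energy';
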
Modified: provenancedb/pql_functions.sql
===================================================================
--- provenancedb/pql_functions.sql 2011-05-12 21:22:39 UTC (rev 4468)
+++ provenancedb/pql_functions.sql 2011-05-13 00:24:49 UTC (rev 4469)
@@ -4,37 +4,158 @@
-- SQL Functions
-- lists variations in a parameter's value across workflows, for parameters that are in-memory variables
+drop view in_mem_in cascade;
-CREATE OR REPLACE FUNCTION compare_run_by_parameter(param_name VARCHAR)
-RETURNS TABLE (
- run_id VARCHAR,
- param_name VARCHAR,
- value VARCHAR
-)
-AS $$
- SELECT process.workflow_id, ds_usage.param_name, variable.value
- FROM variable, ds_usage, process
- WHERE variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND
- ds_usage.param_name=$1
- GROUP BY process.workflow_id, ds_usage.param_name, variable.value;
-$$ LANGUAGE SQL;
+create view in_mem_in as
+ select proc.run_id, proc.id as proc_id,
+ proc.name as proc_name, ds_in.ds_id,
+ ds_in.param, in_mem.value
+ from proc, ds_in, in_mem
+ where proc.id=ds_in.proc_id and ds_in.ds_id=in_mem.id;
+drop view in_mem_out cascade;
+
+create view in_mem_out as
+ select proc.run_id, proc.id as proc_id,
+ proc.name as proc_name, ds_out.ds_id,
+ ds_out.param, in_mem.value
+ from proc, ds_out, in_mem
+ where proc.id=ds_out.proc_id and ds_out.ds_id=in_mem.id;
+
+drop view in_mem_use cascade;
+
+create view in_mem_use as
+ select * from in_mem_in
+ union
+ select * from in_mem_out;
+
+drop view file_in cascade;
+
+create view file_in as
+ select proc.run_id, proc.id as proc_id,
+ proc.name as proc_name, ds_in.ds_id,
+ ds_in.param, file.name as value
+ from proc, ds_in, file
+ where proc.id=ds_in.proc_id and ds_in.ds_id=file.id;
+
+drop view file_out cascade;
+
+create view file_out as
+ select proc.run_id, proc.id as proc_id,
+ proc.name as proc_name, ds_out.ds_id,
+ ds_out.param, file.name as value
+ from proc, ds_out, file
+ where proc.id=ds_out.proc_id and ds_out.ds_id=file.id;
+
+drop view file_use cascade;
+
+create view file_use as
+ select * from file_in
+ union
+ select * from file_out;
+
+drop view ds_use cascade;
+
+create view ds_use as
+ select * from in_mem_use
+ union
+ select * from file_use;
+
+drop view a_t cascade;
+
+create view a_t as
+ select run.id, a_run_t.name, a_run_t.value
+ from run, a_run_t
+ where run.id=a_run_t.run_id
+ union
+ select proc.run_id, a_proc_t.name, a_proc_t.value
+ from proc, a_proc_t
+ where proc.id=a_proc_t.proc_id
+ union
+ select ds_use.run_id, a_ds_t.name, a_ds_t.value
+ from ds_use, a_ds_t
+ where ds_use.ds_id=a_ds_t.ds_id;
+
+drop view a_n cascade;
+
+create view a_n as
+ select run.id, a_run_n.name, a_run_n.value
+ from run, a_run_n
+ where run.id=a_run_n.run_id
+ union
+ select proc.run_id, a_proc_n.name, a_proc_n.value
+ from proc, a_proc_n
+ where proc.id=a_proc_n.proc_id
+ union
+ select ds_use.run_id, a_ds_n.name, a_ds_n.value
+ from ds_use, a_ds_n
+ where ds_use.ds_id=a_ds_n.ds_id;
+
+
+
+
+drop type compare_run_by_parameter_type cascade;
+create type compare_run_by_parameter_type as (run_id varchar, param varchar, value varchar);
+
+create or replace function compare_run_by_parameter(param_name varchar)
+returns setof compare_run_by_parameter_type
+as $$
+ select run_id, param, value
+ from ds_use
+ where param=$1;
+$$ language sql;
+
+-- PostgreSQL >= 9.0
+-- CREATE OR REPLACE FUNCTION compare_run_by_parameter(param_name VARCHAR)
+-- RETURNS TABLE (
+-- run_id VARCHAR,
+-- param VARCHAR,
+-- value VARCHAR
+-- )
+-- AS $$
+-- SELECT proc.run_id, ds_out.param, in_mem.value
+-- FROM in_mem, ds_out, proc
+-- WHERE in_mem.id=ds_out.ds_id AND ds_out.proc_id=proc.id AND
+-- ds_out.param=$1
+-- GROUP BY proc.run_id, ds_out.param, in_mem.value
+-- UNION
+-- SELECT proc.run_id, ds_in.param, in_mem.value
+-- FROM in_mem, ds_in, proc
+-- WHERE in_mem.id=ds_in.ds_id AND ds_in.proc_id=proc.id AND
+-- ds_in.param=$1
+-- GROUP BY proc.run_id, ds_in.param, in_mem.value
+--$$ LANGUAGE SQL;
+
+DROP TYPE compare_run_by_parameter_type2;
+CREATE TYPE compare_run_by_parameter_type2 AS (run_id VARCHAR, param1 VARCHAR, value1 VARCHAR, param2 VARCHAR, value2 VARCHAR);
+
CREATE OR REPLACE FUNCTION compare_run_by_parameter(param_name1 VARCHAR, param_name2 VARCHAR)
-RETURNS TABLE (
- workflow_id VARCHAR,
- param_name1 VARCHAR,
- value1 VARCHAR,
- param_name2 VARCHAR,
- value2 VARCHAR
-)
+RETURNS SETOF compare_run_by_parameter_type2
AS $$
SELECT *
FROM compare_run_by_parameter($1) as t
INNER JOIN
compare_run_by_parameter($2) as s
- USING (workflow_id);
+ USING (run_id);
$$ LANGUAGE SQL;
+--CREATE OR REPLACE FUNCTION compare_run_by_parameter(param_name1 VARCHAR, param_name2 VARCHAR)
+--RETURNS TABLE (
+-- workflow_id VARCHAR,
+-- param_name1 VARCHAR,
+-- value1 VARCHAR,
+-- param_name2 VARCHAR,
+-- value2 VARCHAR
+--)
+--AS $$
+-- SELECT *
+-- FROM compare_run_by_parameter($1) as t
+-- INNER JOIN
+-- compare_run_by_parameter($2) as s
+-- USING (workflow_id);
+--$$ LANGUAGE SQL;
+
+
CREATE OR REPLACE FUNCTION compare_run_by_parameter(param_name1 VARCHAR, param_name2 VARCHAR, param_name3 VARCHAR)
RETURNS TABLE (
workflow_id VARCHAR,
@@ -80,6 +201,7 @@
WHERE workflow.id=annot_wf_num.id and annot_wf_num.name=$1
$$ LANGUAGE SQL;
+
CREATE OR REPLACE FUNCTION compare_run_by_annot_txt(name VARCHAR)
RETURNS TABLE (
workflow_id VARCHAR,
@@ -100,6 +222,7 @@
WHERE workflow.id=annot_wf_txt.id and annot_wf_txt.name=$1
$$ LANGUAGE SQL;
+
CREATE OR REPLACE FUNCTION compare_run_by_annot_bool(name VARCHAR)
RETURNS TABLE (
workflow_id VARCHAR,
@@ -121,6 +244,7 @@
WHERE workflow.id=annot_wf_bool.id and annot_wf_bool.name=$1
$$ LANGUAGE SQL;
+
-- correlate a parameter with workflow runtime statistics
CREATE OR REPLACE FUNCTION correlate_param_runtime(param_name VARCHAR)
RETURNS TABLE (
@@ -137,16 +261,7 @@
process.workflow_id=workflow.id AND ds_usage.param_name=$1
$$ LANGUAGE SQL;
--- lists distinct processes by name in a workflow
-CREATE OR REPLACE FUNCTION process_names(wf_id VARCHAR)
-RETURNS SETOF VARCHAR AS $$
- SELECT DISTINCT(process.name)
- FROM process
- WHERE process.workflow_id=$1;
-$$ LANGUAGE SQL;
-
-- recursive query to find ancestor entities in a provenance graph
-
CREATE OR REPLACE FUNCTION ancestors(varchar) RETURNS SETOF varchar AS $$
WITH RECURSIVE anc(ancestor,descendant) AS
(
@@ -159,8 +274,8 @@
SELECT ancestor FROM anc
$$ LANGUAGE SQL;
+
-- compare(<entity>, <list of param_names, annotations keys>
-
CREATE OR REPLACE FUNCTION compare_run(VARIADIC args VARCHAR[])
RETURNS SETOF RECORD AS $$
DECLARE
@@ -198,38 +313,3 @@
$$ LANGUAGE plpgsql;
-CREATE OR REPLACE FUNCTION compare_run_print(VARIADIC args VARCHAR[])
-RETURNS VARCHAR AS $$
-DECLARE
- i INTEGER;
- q VARCHAR;
- selectq VARCHAR;
- fromq VARCHAR;
- property VARCHAR;
- property_type VARCHAR;
- function_name VARCHAR;
-BEGIN
- selectq := 'SELECT *';
- FOR i IN array_lower(args, 1)..array_upper(args, 1) LOOP
- property_type := split_part(args[i], '=', 1);
- property := split_part(args[i], '=', 2);
- CASE property_type
- WHEN 'param_name' THEN
- function_name := 'compare_run_by_parameter';
- WHEN 'annot_num' THEN
- function_name := 'compare_run_by_annot_num';
- WHEN 'annot_txt' THEN
- function_name := 'compare_run_by_annot_txt';
- WHEN 'annot_bool' THEN
- function_name := 'compare_run_by_annot_bool';
- END CASE;
- IF i = 1 THEN
- fromq := function_name || '(''' || property || ''') as t' || i;
- ELSE
- fromq := fromq || ' INNER JOIN ' || function_name || '(''' || property || ''') as t' || i || ' USING (workflow_id)';
- END IF;
- END LOOP;
- q := selectq || ' FROM ' || fromq;
- RETURN q;
-END;
-$$ LANGUAGE plpgsql;
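(A usage sketch, not part of the committed file: the reworked one- and two-parameter comparisons can be called directly; the parameter names below are hypothetical placeholders.)

    -- one parameter: one (run_id, param, value) row per use of that parameter
    SELECT * FROM compare_run_by_parameter('sample_range');
    -- two parameters: the two result sets are inner-joined on run_id, as in the function body above
    SELECT * FROM compare_run_by_parameter('sample_range', 'maximum_number_of_steps');
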
Modified: provenancedb/prov-init.sql
===================================================================
--- provenancedb/prov-init.sql 2011-05-12 21:22:39 UTC (rev 4468)
+++ provenancedb/prov-init.sql 2011-05-13 00:24:49 UTC (rev 4469)
@@ -1,24 +1,25 @@
-- this is the schema definition used for the main relational provenance
-- implementation (in both sqlite3 and postgres)
-
+
drop table run cascade;
-drop table process cascade;
-drop table app_invocation cascade;
-drop table app_execution cascade;
-drop table runtime_info cascade;
-drop table dataset cascade;
+drop table proc cascade;
+drop table app_inv cascade;
+drop table app_exec cascade;
+drop table rt_info cascade;
+drop table ds cascade;
drop table file cascade;
-drop table in_memory cascade;
-drop table dataset_containment cascade;
-drop table dataset_consumption cascade;
-drop table dataset_production cascade;
-drop table annot_dataset_numeric cascade;
-drop table annot_dataset_text cascade;
-drop table annot_process_numeric cascade;
-drop table annot_process_text cascade;
-drop table annot_run_numeric cascade;
-drop table annot_run_text cascade;
+drop table in_mem cascade;
+drop table ds_cont cascade;
+drop table ds_in cascade;
+drop table ds_out cascade;
+drop table a_ds_n cascade;
+drop table a_ds_t cascade;
+drop table a_proc_n cascade;
+drop table a_proc_t cascade;
+drop table a_run_n cascade;
+drop table a_run_t cascade;
+
-- run stores information about each script run log that has
-- been seen by the importer: the log filename, swift version and import
-- status.
@@ -28,7 +29,8 @@
(id varchar(256) primary key,
log_filename varchar(2048),
swift_version varchar(16),
- final_state varchar(16),
+ cog_version varchar(16),
+ final_state varchar(16),
start_time numeric,
duration numeric
);
@@ -39,7 +41,7 @@
-- must be the case that the specific type table
-- has an entry for this process.
-- process types: internal, rootthread, execute, function, compound, scope, operator
-create table process
+create table proc
(id varchar(256) primary key,
type varchar(16),
name varchar(256), -- in the case of an execute this refers to the transformation name in tc.data
@@ -50,9 +52,9 @@
-- this gives information about each execute.
-- each execute is identified by a unique URI. other information from
-- swift logs is also stored here. an execute is an OPM process.
-create table app_invocation
- (id varchar(256) primary key references process (id) on delete cascade,
- procedure_name varchar(256), -- name of the app procedure that invokes the transformation
+create table app_inv
+ (id varchar(256) primary key references proc (id) on delete cascade,
+ proc_name varchar(256), -- name of the app procedure that invokes the transformation
start_time numeric,
duration numeric,
final_state varchar(16),
@@ -62,9 +64,9 @@
-- this gives information about each execute2, which is an attempt to
-- perform an execution. the execute2 id is tied to per-execution-attempt
-- information such as wrapper logs
-create table app_execution
+create table app_exec
(id varchar(256) primary key,
- app_invocation_id varchar(256) references app_invocation (id) on delete cascade,
+ app_inv_id varchar(256) references app_inv (id) on delete cascade,
start_time numeric,
duration numeric,
final_state varchar(16),
@@ -73,26 +75,26 @@
-- app execution runtime info extracted from the /proc filesystem (assumes the app executed
-- in a Linux host)
-create table runtime_info
- ( app_execution_id varchar(256) references app_execution (id) on delete cascade,
+create table rt_info
+ ( app_exec_id varchar(256) references app_exec (id) on delete cascade,
tstamp numeric,
cpu_usage numeric,
max_phys_mem numeric,
- max_virtual_mem numeric,
- io_read_bytes numeric,
- io_write_bytes numeric,
- primary key (app_execution_id, tstamp)
+ max_virt_mem numeric,
+ io_read numeric,
+ io_write numeric,
+ primary key (app_exec_id, tstamp)
);
-- ds stores all dataset identifiers.
-create table dataset
+create table ds
(id varchar(256) primary key
);
-- file stores the filename mapped to each dataset.
create table file
- ( id varchar(256) primary key references dataset (id) on delete cascade,
- filename varchar(2048)
+ ( id varchar(256) primary key references ds (id) on delete cascade,
+ name varchar(2048)
);
-- dataset_values stores the value for each dataset which is known to have
@@ -100,8 +102,8 @@
-- to expose that value as an SQL type other than a string, and so (for
-- example) SQL numerical operations should not be expected to work, even
-- though the user knows that a particular dataset stores a numeric value.
-create table in_memory
- ( id varchar(256) primary key references dataset (id) on delete cascade,
+create table in_mem
+ ( id varchar(256) primary key references ds (id) on delete cascade,
value varchar(2048)
);
@@ -112,9 +114,9 @@
-- constructors and accessors, rather than, or in addition to,
-- a containment hierarchy. The relationship (such as array index or
-- structure member name) should also be stored in this table.
-create table dataset_containment
- ( out_id varchar(256) references dataset (id) on delete cascade,
- in_id varchar(256) references dataset (id) on delete cascade,
+create table ds_cont
+ ( out_id varchar(256) references ds (id) on delete cascade,
+ in_id varchar(256) references ds (id) on delete cascade,
primary key (out_id,in_id)
);
@@ -123,98 +125,99 @@
-- in SwiftScript terms, the input and output parameters for each
-- application procedure invocation; in OPM terms, the artificts which are
-- input to and output from each process that is a Swift execution
-create table dataset_consumption
- (process_id varchar(256) references process(id) on delete cascade,
- dataset_id varchar(256) references dataset(id) on delete cascade,
- param_name varchar(256), -- the name of the parameter in this execute that
+create table ds_in
+ (proc_id varchar(256) references proc(id) on delete cascade,
+ ds_id varchar(256) references ds(id) on delete cascade,
+ param varchar(256), -- the name of the parameter in this execute that
-- this dataset was bound to. sometimes this must
-- be contrived (for example, in positional varargs)
- primary key (process_id,dataset_id,param_name)
+ primary key (proc_id,ds_id,param)
);
-create table dataset_production
- (process_id varchar(256) references process(id) on delete cascade,
- dataset_id varchar(256) references dataset(id) on delete cascade,
- param_name varchar(256), -- the name of the parameter in this execute that
+create table ds_out
+ (proc_id varchar(256) references proc(id) on delete cascade,
+ ds_id varchar(256) references ds(id) on delete cascade,
+ param varchar(256), -- the name of the parameter in this execute that
-- this dataset was bound to. sometimes this must
-- be contrived (for example, in positional varargs)
- primary key (process_id,dataset_id,param_name)
+ primary key (proc_id,ds_id,param)
);
--- annotations
-create table annot_dataset_numeric
- ( dataset_id varchar(256) references dataset (id) on delete cascade,
+-- annotations (_n: numeric, _t: text)
+create table a_ds_n
+ ( ds_id varchar(256) references ds (id) on delete cascade,
name varchar(256),
value numeric,
- primary key (dataset_id, name)
+ primary key (ds_id, name)
);
-create table annot_dataset_text
- ( dataset_id varchar(256) references dataset (id) on delete cascade,
+create table a_ds_t
+ ( ds_id varchar(256) references ds (id) on delete cascade,
name varchar(256),
value varchar(2048),
- primary key (dataset_id, name)
+ primary key (ds_id, name)
);
-create table annot_process_numeric
- ( process_id varchar(256) references process (id) on delete cascade,
- name varchar(256),
- value numeric,
- primary key (process_id, name)
+create table a_proc_n
+ ( proc_id varchar(256) references proc (id) on delete cascade,
+ name varchar(256),
+ value numeric,
+ primary key (proc_id, name)
);
-create table annot_process_text
- ( process_id varchar(256) references process (id) on delete cascade,
- name varchar(256),
- value varchar(2048),
- primary key (process_id, name)
+create table a_proc_t
+ ( proc_id varchar(256) references proc (id) on delete cascade,
+ name varchar(256),
+ value varchar(2048),
+ primary key (proc_id, name)
);
-create table annot_run_numeric
+create table a_run_n
( run_id varchar(256) references run (id) on delete cascade,
- name varchar(256),
- value numeric,
+ name varchar(256),
+ value numeric,
primary key (run_id, name)
);
-create table annot_run_text
+create table a_run_t
( run_id varchar(256) references run (id) on delete cascade,
- name varchar(256),
- value varchar(2048),
+ name varchar(256),
+ value varchar(2048),
primary key (run_id, name)
);
-create view provenance_graph as
- select process_id as parent,dataset_id as child from dataset_production
+create view pgraph_edge as
+ select proc_id as parent,ds_id as child from ds_out
union
- select dataset_id as parent,process_id as child from dataset_consumption
+ select ds_id as parent,proc_id as child from ds_in
union
- select out_id as parent,in_id as child from dataset_containment;
+ select out_id as parent,in_id as child from ds_cont;
+-- continue renaming from here
CREATE VIEW run_date AS
- SELECT id, log_filename, swift_version, final_state,
+ SELECT id, log_filename, swift_version, cog_version, final_state,
to_timestamp(start_time) as start_time, duration
FROM run;
-CREATE VIEW annot_text AS
+CREATE VIEW a_t AS
SELECT *
- FROM annot_run_text
+ FROM a_run_t
UNION
SELECT *
- FROM annot_dataset_text
+ FROM a_ds_t
UNION
SELECT *
- FROM annot_process_text;
+ FROM a_proc_t;
-CREATE VIEW annot_numeric AS
+CREATE VIEW a_n AS
SELECT *
- FROM annot_run_numeric
+ FROM a_run_n
UNION
SELECT *
- FROM annot_dataset_numeric
+ FROM a_ds_n
UNION
SELECT *
- FROM annot_process_numeric;
+ FROM a_proc_n;
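(For illustration only, not part of the commit: the renamed pgraph_edge view can drive the same kind of recursive traversal that the ancestors() function in pql_functions.sql performs; the dataset id below is a placeholder.)

    -- sketch: all ancestors of one dataset in the provenance graph
    WITH RECURSIVE anc(ancestor, descendant) AS (
        SELECT parent, child FROM pgraph_edge
        WHERE child = 'dataset-id-placeholder'   -- placeholder id
      UNION
        SELECT e.parent, a.descendant
        FROM pgraph_edge e, anc a
        WHERE e.child = a.ancestor
    )
    SELECT ancestor FROM anc;
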
Modified: provenancedb/prov-to-sql.sh
===================================================================
--- provenancedb/prov-to-sql.sh 2011-05-12 21:22:39 UTC (rev 4468)
+++ provenancedb/prov-to-sql.sh 2011-05-13 00:24:49 UTC (rev 4469)
@@ -8,23 +8,21 @@
export WF="${RUNID}"
echo Generating SQL for $RUNID
-rm -f tmp-u.sql tmp-ds.sql tmp-p.sql tmp-e.sql tmp-e2.sql tmp-dsu.sql tmp-dsc.sql tmp-f.sql tmp-v.sql tmp-import.sql import.sql extrainfo.sql fields.txt runtime.sql
-
# this gives a distinction between the root process for a workflow and the
# workflow itself. perhaps better to model the workflow as a process
-echo "INSERT INTO process (id, type, name, run_id) VALUES ('${WFID}0', 'rootthread', '$RUNID', '$WF');" > tmp-p.sql
+$SQLCMD -c "INSERT INTO proc (id, type, name, run_id) VALUES ('${WFID}0', 'rootthread', '$RUNID', '$WF');"
while read time duration thread localthread endstate tr_name scratch; do
- echo "INSERT INTO process (id, type, name, run_id) VALUES ('$thread', 'execute', '$tr_name', '$WF');" >> tmp-p.sql
- echo "INSERT INTO app_invocation (id, start_time, duration, final_state, scratch) VALUES ('$thread', $time, $duration, '$endstate', '$scratch');" >> tmp-e.sql
+ $SQLCMD -c "INSERT INTO proc (id, type, name, run_id) VALUES ('$thread', 'execute', '$tr_name', '$WF');"
+ $SQLCMD -c "INSERT INTO app_inv (id, start_time, duration, final_state, scratch) VALUES ('$thread', $time, $duration, '$endstate', '$scratch');"
done < execute.global.event
while read start_time duration globalid id endstate thread site scratch; do
# cut off the last component of the thread, so that we end up at the
# parent thread id which should correspond with the execute-level ID
inv_id="$WFID$(echo $thread | sed 's/-[^-]*$//')"
- echo "INSERT INTO app_execution (id, app_invocation_id, start_time, duration, final_state, site) VALUES ('$globalid', '$inv_id', $start_time, $duration, '$endstate', '$site');" >> tmp-e2.sql
+ $SQLCMD -c "INSERT INTO app_exec (id, app_inv_id, start_time, duration, final_state, site) VALUES ('$globalid', '$inv_id', $start_time, $duration, '$endstate', '$site');"
done < execute2.global.event
while read col1 col2 col3 col4 col5 thread name lhs rhs result; do
@@ -42,22 +40,22 @@
result=$(echo $result | sed -e 's/tag:benc at ci.uchicago.edu,2008:swift://g')
fi
- echo "INSERT INTO dataset (id) VALUES ('$lhs');" >> tmp-ds.sql
- echo "INSERT INTO dataset (id) VALUES ('$rhs');" >> tmp-ds.sql
- echo "INSERT INTO dataset (id) VALUES ('$result');" >> tmp-ds.sql
- echo "INSERT INTO process (id, type, name, run_id) VALUES ('$operatorid', 'operator', '$name', '$WF');" >> tmp-p.sql
- echo "INSERT INTO dataset_consumption (process_id, dataset_id, param_name) VALUES ('$operatorid', '$lhs', 'lhs');" >> tmp-dsu.sql
- echo "INSERT INTO dataset_consumption (process_id, dataset_id, param_name) VALUES ('$operatorid', '$rhs', 'rhs');" >> tmp-dsu.sql
- echo "INSERT INTO dataset_production (process_id, dataset_id, param_name) VALUES ('$operatorid', '$result', 'result');" >> tmp-dsu.sql
+ $SQLCMD -c "INSERT INTO ds (id) VALUES ('$lhs');"
+ $SQLCMD -c "INSERT INTO ds (id) VALUES ('$rhs');"
+ $SQLCMD -c "INSERT INTO ds (id) VALUES ('$result');"
+ $SQLCMD -c "INSERT INTO proc (id, type, name, run_id) VALUES ('$operatorid', 'operator', '$name', '$WF');"
+ $SQLCMD -c "INSERT INTO ds_in (proc_id, ds_id, param) VALUES ('$operatorid', '$lhs', 'lhs');"
+ $SQLCMD -c "INSERT INTO ds_in (proc_id, ds_id, param) VALUES ('$operatorid', '$rhs', 'rhs');"
+ $SQLCMD -c "INSERT INTO ds_out (proc_id, ds_id, param) VALUES ('$operatorid', '$result', 'result');"
done < operators.txt
while read id name output; do
if [ $version -le 3726 ]; then
output=$(echo $output | sed -e 's/tag:benc at ci.uchicago.edu,2008:swift://g')
fi
- echo "INSERT INTO dataset (id) VALUES ('$output');" >> tmp-ds.sql
- echo "INSERT INTO process (id, type, name, run_id) VALUES ('$id', 'function', '$name', '$WF');" >> tmp-p.sql
- echo "INSERT INTO dataset_production (process_id, dataset_id, param_name) VALUES ('$id', '$output', 'result');" >> tmp-dsu.sql
+ $SQLCMD -c "INSERT INTO ds (id) VALUES ('$output');"
+ $SQLCMD -c "INSERT INTO proc (id, type, name, run_id) VALUES ('$id', 'function', '$name', '$WF');"
+ $SQLCMD -c "INSERT INTO ds_out (proc_id, ds_id, param) VALUES ('$id', '$output', 'result');"
done < functions.txt
while read id value; do
@@ -67,13 +65,13 @@
value=$(echo $value | sed -e 's/tag:benc at ci.uchicago.edu,2008:swift://g')
fi
- echo "INSERT INTO dataset (id) VALUES ('$value');" >> tmp-ds.sql
- echo "INSERT INTO dataset_consumption (process_id, dataset_id, param_name) VALUES ('$id', '$value', 'undefined');" >> tmp-dsu.sql
+ $SQLCMD -c "INSERT INTO ds (id) VALUES ('$value');"
+ $SQLCMD -c "INSERT INTO ds_in (proc_id, ds_id, param) VALUES ('$id', '$value', 'undefined');"
done < function-inputs.txt
while read thread appname; do
- echo "UPDATE app_invocation SET procedure_name='$appname' WHERE id='$thread';" >> tmp-e.sql
+ $SQLCMD -c "UPDATE app_inv SET proc_name='$appname' WHERE id='$thread';"
done < invocation-procedure-names.txt
while read outer inner; do
@@ -83,9 +81,9 @@
inner=$(echo $inner | sed -e 's/tag:benc at ci.uchicago.edu,2008:swift://g')
fi
- echo "INSERT INTO dataset (id) VALUES ('$outer');" >> tmp-ds.sql
- echo "INSERT INTO dataset (id) VALUES ('$inner');" >> tmp-ds.sql
- echo "INSERT INTO dataset_containment (out_id, in_id) VALUES ('$outer', '$inner');" >> tmp-dsc.sql
+ $SQLCMD -c "INSERT INTO ds (id) VALUES ('$outer');"
+ $SQLCMD -c "INSERT INTO ds (id) VALUES ('$inner');"
+ $SQLCMD -c "INSERT INTO ds_cont (out_id, in_id) VALUES ('$outer', '$inner');"
done < tie-containers.txt
while read dataset filename; do
@@ -94,8 +92,8 @@
dataset=$(echo $dataset | sed -e 's/tag:benc at ci.uchicago.edu,2008:swift://g')
fi
- echo "INSERT INTO dataset (id) VALUES ('$dataset');" >> tmp-ds.sql
- echo "INSERT INTO file (id, filename) VALUES ('$dataset', '$filename');" >> tmp-f.sql
+ $SQLCMD -c "INSERT INTO ds (id) VALUES ('$dataset');"
+ $SQLCMD -c "INSERT INTO file (id, name) VALUES ('$dataset', '$filename');"
done < dataset-filenames.txt
while read dataset value; do
@@ -104,13 +102,13 @@
dataset=$(echo $dataset | sed -e 's/tag:benc at ci.uchicago.edu,2008:swift://g')
fi
- echo "INSERT INTO dataset (id) VALUES ('$dataset');" >> tmp-ds.sql
- echo "INSERT INTO in_memory (id, value) VALUES ('$dataset', '$value');" >> tmp-v.sql
+ $SQLCMD -c "INSERT INTO ds (id) VALUES ('$dataset');"
+ $SQLCMD -c "INSERT INTO in_mem (id, value) VALUES ('$dataset', '$value');"
done < dataset-values.txt
while read start duration wfid rest; do
- echo "UPDATE run SET start_time=$start WHERE id='$WF';" >> tmp-u.sql
- echo "UPDATE run SET duration=$duration WHERE id='$WF';" >> tmp-u.sql
+ $SQLCMD -c "UPDATE run SET start_time=$start WHERE id='$WF';"
+ $SQLCMD -c "UPDATE run SET duration=$duration WHERE id='$WF';"
done < workflow.event
@@ -118,20 +116,20 @@
while read start duration thread final_state procname ; do
if [ "$duration" != "last-event-line" ]; then
compoundid=$WFID$thread
- echo "INSERT INTO process (id, type, name, run_id) VALUES ('$compoundid', 'compound', '$procname', '$WF');" >> tmp-p.sql
+ $SQLCMD -c "INSERT INTO proc (id, type, name, run_id) VALUES ('$compoundid', 'compound', '$procname', '$WF');"
fi
done < compound.event
while read start duration thread final_state procname ; do
if [ "$duration" != "last-event-line" ]; then
fqid=$WFID$thread
- echo "INSERT INTO process (id, type, name, run_id) VALUES ('$fqid', 'internal', '$procname', '$WF');" >> tmp-p.sql
+ $SQLCMD -c "INSERT INTO proc (id, type, name, run_id) VALUES ('$fqid', 'internal', '$procname', '$WF');"
fi
done < internalproc.event
while read t ; do
thread="${WFID}$t"
- echo "INSERT INTO process (id, type, name, run_id) VALUES ('$thread', 'scope', 'scope', '$WF');" >> tmp-p.sql
+ $SQLCMD -c "INSERT INTO proc (id, type, name, run_id) VALUES ('$thread', 'scope', 'scope', '$WF');"
done < scopes.txt
while read thread direction dataset variable rest; do
@@ -141,57 +139,46 @@
fi
if [ "$direction" == "input" ] ; then
- table=dataset_consumption
+ table=ds_in
else
- table=dataset_production
+ table=ds_out
fi
-
- echo "INSERT INTO dataset (id) VALUES ('$dataset');" >> tmp-ds.sql
- echo "INSERT INTO $table (process_id, dataset_id, param_name) VALUES ('$thread', '$dataset', '$variable');" >> tmp-dsu.sql
+
+ EXISTING=$($SQLCMD --tuples-only -c "select count(*) from ds where ds.id='$dataset';")
+
+ if [ "$EXISTING" -eq "0" ]; then
+ $SQLCMD -c "INSERT INTO ds (id) VALUES ('$dataset');"
+ fi
+ $SQLCMD -c "INSERT INTO $table (proc_id, ds_id, param) VALUES ('$thread', '$dataset', '$variable');"
done < tie-data-invocs.txt
-cat tmp-ds.sql | sort | uniq >> import.sql
-cat tmp-f.sql | sort | uniq >> import.sql
-cat tmp-v.sql | sort | uniq >> import.sql
-cat tmp-p.sql | sort | uniq >> import.sql
-cat tmp-e.sql | sort | uniq >> import.sql
-cat tmp-e2.sql | sort | uniq >> import.sql
-cat tmp-dsu.sql | sort | uniq >> import.sql
-cat tmp-dsc.sql | sort | uniq >> import.sql
-cat tmp-import.sql | sort | uniq >> import.sql
-cat tmp-u.sql | sort | uniq >> import.sql
-echo Sending SQL to DB
-
-$SQLCMD < import.sql
-
-while read execute2_id extrainfo; do
- echo $extrainfo | awk -F ";" '{ for (i = 1; i <= NF; i++)
- print $i
+if [ -f extrainfo.txt ]; then
+ while read execute2_id extrainfo; do
+ echo $extrainfo | awk -F ";" '{ for (i = 1; i <= NF; i++)
+ print $i
}' | awk -F "=" '{ print $1 " " $2 }' | awk -F ":" '{ print $1 " " $2 }' > fields.txt
- id=$($SQLCMD --tuples-only -c "select app_invocation_id from app_execution where id='$execute2_id';" | awk '{print $1}')
- while read name type value; do
- if [ "$type" = "num" ]; then
- echo "INSERT INTO annot_process_numeric (id, name, value) VALUES ('$id', '$name', $value);" >> extrainfo.sql
- fi
- if [ "$type" = "txt" ]; then
- echo "INSERT INTO annot_process_text (id, name, value) VALUES ('$id', '$name', '$value');" >> extrainfo.sql
- fi
- done < fields.txt
-done < extrainfo.txt
+ id=$($SQLCMD --tuples-only -c "select app_inv_id from app_exec where id='$execute2_id';" | awk '{print $1}')
+ while read name type value; do
+ if [ "$type" = "num" ]; then
+ $SQLCMD -c "INSERT INTO a_proc_n (id, name, value) VALUES ('$id', '$name', $value);"
+ fi
+ if [ "$type" = "txt" ]; then
+ $SQLCMD -c "INSERT INTO a_proc_t (id, name, value) VALUES ('$id', '$name', '$value');"
+ fi
+ done < fields.txt
+ done < extrainfo.txt
+fi
-$SQLCMD < extrainfo.sql
+if [ -f runtime.txt ]; then
+ while read execute2_id runtime; do
+ timestamp=$(echo $runtime | awk -F "," '{print $1}' | awk -F ":" '{print $2}')
+ cpu_usage=$(echo $runtime | awk -F "," '{print $2}' | awk -F ":" '{print $2}')
+ max_phys_mem=$(echo $runtime | awk -F "," '{print $3}' | awk -F ":" '{print $2}')
+ max_virtual_mem=$(echo $runtime | awk -F "," '{print $4}' | awk -F ":" '{print $2}')
+ io_read_bytes=$(echo $runtime | awk -F "," '{print $5}' | awk -F ":" '{print $2}')
+ io_write_bytes=$(echo $runtime | awk -F "," '{print $6}' | awk -F ":" '{print $2}')
+ $SQLCMD -c "INSERT INTO runtime_info (app_execution_id, tstamp, cpu_usage, max_phys_mem, max_virtual_mem, io_read_bytes, io_write_bytes) VALUES ('$execute2_id', $timestamp, $cpu_usage, $max_phys_mem, $max_virtual_mem, $io_read_bytes, $io_write_bytes);"
+ done < runtime.txt
+fi
-while read execute2_id runtime; do
- timestamp=$(echo $runtime | awk -F "," '{print $1}' | awk -F ":" '{print $2}')
- cpu_usage=$(echo $runtime | awk -F "," '{print $2}' | awk -F ":" '{print $2}')
- max_phys_mem=$(echo $runtime | awk -F "," '{print $3}' | awk -F ":" '{print $2}')
- max_virtual_mem=$(echo $runtime | awk -F "," '{print $4}' | awk -F ":" '{print $2}')
- io_read_bytes=$(echo $runtime | awk -F "," '{print $5}' | awk -F ":" '{print $2}')
- io_write_bytes=$(echo $runtime | awk -F "," '{print $6}' | awk -F ":" '{print $2}')
- echo "INSERT INTO runtime_info (app_execution_id, tstamp, cpu_usage, max_phys_mem, max_virtual_mem, io_read_bytes, io_write_bytes) VALUES ('$execute2_id', $timestamp, $cpu_usage, $max_phys_mem, $max_virtual_mem, $io_read_bytes, $io_write_bytes);" >> runtime.sql
-done < runtime.txt
-
-$SQLCMD < runtime.sql
-
-echo Finished sending SQL to DB
-
+echo Finished sending SQL to DB
\ No newline at end of file
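(Sketch only, not part of the committed script: the runtime-info loop above still issues its INSERT against the pre-rename identifiers; against the rt_info table defined in prov-init.sql in this same revision, the equivalent statement would presumably be shaped as below. All values are placeholders.)

    -- sketch: numeric values are placeholders, not real measurements
    INSERT INTO rt_info (app_exec_id, tstamp, cpu_usage, max_phys_mem, max_virt_mem, io_read, io_write)
    VALUES ('execute2-id-placeholder', 1305214800, 12.5, 104857600, 209715200, 4096, 8192);
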
Modified: provenancedb/swift-prov-import-all-logs
===================================================================
--- provenancedb/swift-prov-import-all-logs 2011-05-12 21:22:39 UTC (rev 4468)
+++ provenancedb/swift-prov-import-all-logs 2011-05-13 00:24:49 UTC (rev 4469)
@@ -11,7 +11,7 @@
source $PROVDIR/etc/provenance.config
export PATH=$PROVDIR:$PATH
-# this generates a file with pairs like:
+# this generates a file with tuples like:
# <starttime> <swift version> <logfilename>
swift-plot-log $LOGREPO everylog-vs-versions.data
@@ -55,7 +55,7 @@
export WF="${RUNID}"
- echo "INSERT INTO run (id, log_filename, swift_version, final_state) VALUES ('$WF','$filename','$version','$wfstatus');" | $SQLCMD
+ $SQLCMD -c "INSERT INTO run (id, log_filename, swift_version, cog_version, final_state) VALUES ('$WF','$filename','$version', '', '$wfstatus');"
echo version $version in log file $filename
echo ============= will import =============