[Swift-commit] r4469 - in provenancedb: . apps/oops

lgadelha at ci.uchicago.edu lgadelha at ci.uchicago.edu
Thu May 12 19:24:49 CDT 2011


Author: lgadelha
Date: 2011-05-12 19:24:49 -0500 (Thu, 12 May 2011)
New Revision: 4469

Modified:
   provenancedb/apps/oops/oops_extractor.sh
   provenancedb/pql_functions.sql
   provenancedb/prov-init.sql
   provenancedb/prov-to-sql.sh
   provenancedb/swift-prov-import-all-logs
Log:
Schema update.


Modified: provenancedb/apps/oops/oops_extractor.sh
===================================================================
--- provenancedb/apps/oops/oops_extractor.sh	2011-05-12 21:22:39 UTC (rev 4468)
+++ provenancedb/apps/oops/oops_extractor.sh	2011-05-13 00:24:49 UTC (rev 4469)
@@ -4,9 +4,10 @@
 # 
 
 PROVDB_HOME=~/provenancedb
-PROTESTS_HOME=/gpfs/pads/oops/aashish/CASP
+PROTESTS_HOME=/gpfs/pads/oops/aashish/CASP9
 IMPORT_HOME=~/protests
 
+cp $PROVDB_HOME/etc/provenance.config.ci $PROVDB_HOME/etc/provenance.config
 source $PROVDB_HOME/etc/provenance.config
 
 # provdb_imported records runs already imported to the provenance database
@@ -39,13 +40,11 @@
 		LOG_FILENAME=$(ls *-*-*-*.log)
  		WORKFLOW_ID=$(echo "select id from run where log_filename like '%$LOG_FILENAME%'" | $SQLCMD -t | awk '{print $1}')
 		cd $IMPORT_HOME/swift-logs
-		echo "insert into annot_run_text (run_id, name, value) values ('$WORKFLOW_ID','oops_run_id','$OOPS_RUN_ID');" | $SQLCMD
+		$SQLCMD -c "insert into a_run_t (run_id, name, value) values ('$WORKFLOW_ID','oops_run_id','$OOPS_RUN_ID');" 
 		
 		#extracts scientific parameters given as input to the workflow in file *.params.
-		echo "select file.id,file.filename from process, dataset_production, dataset_containment, file where  process.id=dataset_production.process_id and dataset_production.dataset_id=out_id and file.id=dataset_containment.in_id and filename like '%.params' and process.name='PrepLoop' and process.run_id='$WORKFLOW_ID';" > query.sql;
+		$SQLCMD -t -A -F " " -c "select file.id,file.name from proc, ds_out, ds_cont, file where  proc.id=ds_out.proc_id and ds_out.ds_id=out_id and file.id=ds_cont.in_id and file.name like '%.params' and proc.name='PrepLoop' and proc.run_id='$WORKFLOW_ID';" -o result.txt
 		
-		$SQLCMD -t -A -F " " -f query.sql -o result.txt
-		
 		DATASET_ID=$(awk '{if (NR==1) print $1}' result.txt)
 		FILENAME=$(awk '{if (NR==1) print $2}' result.txt | sed 's/file:\/\/localhost\///g')
 		
@@ -55,26 +54,24 @@
 		    NAME=$(echo $line | awk 'BEGIN { FS = "=" }; {print $1}')
 		    RIGHT=$(echo $line | awk 'BEGIN { FS = "=" }; {print $2}')
 		    if [ "$NAME" = "SAMPLE RANGE" ]; then
-			echo "insert into annot_dataset_text values ('$DATASET_ID', 'sample_range', '$RIGHT');" | $SQLCMD
+			$SQLCMD -c "insert into a_ds_t values ('$DATASET_ID', 'sample_range', '$RIGHT');"
 		    fi 
 		    if [ "$NAME" = "RESTRAIN DISTANCE" ]; then
 			VALUE1=$(echo $RIGHT | awk 'BEGIN { FS = "," }; {print $1}')
 			VALUE2=$(echo $line | awk 'BEGIN { FS = "=" }; {print $2}' | awk 'BEGIN { FS = "," }; {print $2}')
-			echo "insert into annot_dataset_numeric values ('$DATASET_ID', 'restrain_distance_1', $VALUE1);" | $SQLCMD
-			echo "insert into annot_dataset_numeric values ('$DATASET_ID', 'restrain_distance_2', $VALUE2);" | $SQLCMD
+			$SQLCMD -c "insert into a_ds_n values ('$DATASET_ID', 'restrain_distance_1', $VALUE1);"
+			$SQLCMD -c "insert into a_ds_n values ('$DATASET_ID', 'restrain_distance_2', $VALUE2);"
 		    fi 
 		    if [ "$NAME" = "MAXIMUM NUMBER OF STEPS" ]; then
-			echo "insert into annot_dataset_numeric values ('$DATASET_ID', 'maximum_number_of_steps', $RIGHT);" | $SQLCMD
+			$SQLCMD -c "insert into a_ds_n values ('$DATASET_ID', 'maximum_number_of_steps', $RIGHT);"
 		    fi 
 		done < $FILENAME
 
 		#extracts length of the fasta sequence given as input to the workflow in file *.fasta.
 		cd $IMPORT_HOME/swift-logs
 
-		echo "select file.id,file.filename from process, dataset_production, dataset_containment, file where  process.id=dataset_production.process_id and dataset_production.dataset_id=out_id and file.id=dataset_containment.in_id and filename like '%.fasta' and process.name='PrepLoop' and process.run_id='$WORKFLOW_ID';" > query.sql;
+		$SQLCMD -t -A -F " " -c "select file.id,file.name from proc, ds_out, ds_cont, file where  proc.id=ds_out.proc_id and ds_out.ds_id=out_id and file.id=ds_cont.in_id and file.name like '%.fasta' and proc.name='PrepLoop' and proc.run_id='$WORKFLOW_ID';" -o result.txt
 		
-		$SQLCMD -t -A -F " " -f query.sql -o result.txt
-		
 		DATASET_ID=$(awk '{if (NR==1) print $1}' result.txt)
 		FILENAME=$(awk '{if (NR==1) print $2}' result.txt | sed 's/file:\/\/localhost\///g')
 		
@@ -82,7 +79,7 @@
 		
 		if [ -n "$FILENAME" ]; then
 		    SEQLENGTH=$(awk '{if (NR==2) print $1}' $FILENAME | wc -c)
-		    echo "insert into annot_dataset_numeric values ('$DATASET_ID', 'fasta_sequence_length', $SEQLENGTH);" | $SQLCMD
+		    $SQLCMD -c "insert into a_ds_n values ('$DATASET_ID', 'fasta_sequence_length', $SEQLENGTH);"
 		fi
 
 		# extracts scientific parameters given as output by the workflow in *.log.
@@ -99,10 +96,8 @@
 
 		cd $IMPORT_HOME/swift-logs
 	    
-		echo "select file.id,file.filename from process, dataset_production, dataset_containment, file where  process.id=dataset_production.process_id and dataset_production.dataset_id=out_id and file.id=dataset_containment.in_id and filename like '%.log' and process.name='LoopModel' and process.run_id='$WORKFLOW_ID';" > query.sql;
+		$SQLCMD -t -A -F " " -c "select file.id,file.name from proc, ds_out, ds_cont, file where  proc.id=ds_out.proc_id and ds_out.ds_id=out_id and file.id=ds_cont.in_id and file.name like '%.log' and proc.name='LoopModel' and proc.run_id='$WORKFLOW_ID';" -o result.txt
 
-		$SQLCMD -t -A -F " " -f query.sql -o result.txt
-
 		cd $PROTESTS_HOME/$k/$i
 		
 		while read dataset filename; do
@@ -110,28 +105,28 @@
 		    while read token1 token2 token3 token4; do
 			if [ "$token2" = "(Initial Energy:" ]; then
 			    initialenergy=$(echo $token3 | awk 'BEGIN { FS = "\)" }; {print $1}')
-			    echo "insert into annot_dataset_numeric (dataset_id, name, value) values ('$dataset', 'initial_energy', $initialenergy);" | $SQLCMD
+			    $SQLCMD -c "insert into a_ds_n (ds_id, name, value) values ('$dataset', 'initial_energy', $initialenergy);"
 			fi
 			if [ "$token1" = "Total" ] && [ "$token2" = "Function" ] && [ "$token3" = "Evaluations:" ]; then
-			    echo "insert into annot_dataset_numeric (dataset_id, name, value) values ('$dataset', 'total_function_evaluations', $token4);" | $SQLCMD
+			    $SQLCMD -c "insert into a_ds_n (ds_id, name, value) values ('$dataset', 'total_function_evaluations', $token4);"
 			fi 
 			if [ "$token1" = "Increasing" ] && [ "$token2" = "transitions:" ]; then
-			    echo "insert into annot_dataset_numeric (dataset_id, name, value) values ('$dataset', 'accepted_increasing_transitions', $token3);" | $SQLCMD
+			    $SQLCMD -c "insert into a_ds_n (ds_id, name, value) values ('$dataset', 'accepted_increasing_transitions', $token3);"
 			fi 
 			if [ "$token1" = "Decreasing" ] && [ "$token2" = "transitions:" ]; then
-			    echo "insert into annot_dataset_numeric (dataset_id, name, value) values ('$dataset', 'accepted_decreasing_transitions', $token3);" | $SQLCMD
+			    $SQLCMD -c "insert into a_ds_n (ds_id, name, value) values ('$dataset', 'accepted_decreasing_transitions', $token3);"
 			fi 
 			if [ "$token1" = "Rejected" ] && [ "$token2" = "transitions:" ]; then
-			    echo "insert into annot_dataset_numeric (dataset_id, name, value) values ('$dataset', 'rejected_transitions', $token3);" | $SQLCMD
+			    $SQLCMD -c "insert into a_ds_n (ds_id, name, value) values ('$dataset', 'rejected_transitions', $token3);"
 			fi 
 			if [ "$token1" = "Final" ] && [ "$token2" = "Energy:" ]; then
-			    echo "insert into annot_dataset_numeric (dataset_id, name, value) values ('$dataset', 'final_energy', $token3);" | $SQLCMD
+			    $SQLCMD -c "insert into a_ds_n (ds_id, name, value) values ('$dataset', 'final_energy', $token3);"
 			fi 
 			if [ "$token1" = "Final" ] && [ "$token2" = "Temp:" ]; then
-			    echo "insert into annot_dataset_numeric (dataset_id, name, value) values ('$dataset', 'final_temp', $token3);" | $SQLCMD
+			    $SQLCMD -c "insert into a_ds_n (ds_id, name, value) values ('$dataset', 'final_temp', $token3);"
 			fi 
 			if [ "$token1" = "Total" ] && [ "$token2" = "Running" ] && [ "$token3" = "Time:" ]; then
-			    echo "insert into annot_dataset_numeric (dataset_id, name, value) values ('$dataset', 'total_running_time', $token4);" | $SQLCMD
+			    $SQLCMD -c "insert into a_ds_n (ds_id, name, value) values ('$dataset', 'total_running_time', $token4);"
 			fi 
 		    done < $FILENAME
 		done < $IMPORT_HOME/swift-logs/result.txt

Modified: provenancedb/pql_functions.sql
===================================================================
--- provenancedb/pql_functions.sql	2011-05-12 21:22:39 UTC (rev 4468)
+++ provenancedb/pql_functions.sql	2011-05-13 00:24:49 UTC (rev 4469)
@@ -4,37 +4,158 @@
 -- SQL Functions
 
 -- lists variations in a parameter's value across workflows, for parameters that are in-memory variables
+drop view in_mem_in cascade;
 
-CREATE OR REPLACE FUNCTION compare_run_by_parameter(param_name VARCHAR) 
-RETURNS TABLE (
-  run_id VARCHAR, 
-  param_name VARCHAR, 
-  value VARCHAR
-)
-AS $$
-  SELECT   process.workflow_id, ds_usage.param_name, variable.value
-  FROM     variable, ds_usage, process
-  WHERE    variable.id=ds_usage.dataset_id AND ds_usage.process_id=process.id AND 
-           ds_usage.param_name=$1 
-  GROUP BY process.workflow_id, ds_usage.param_name, variable.value;
-$$ LANGUAGE SQL;
+create view in_mem_in as
+       select proc.run_id, proc.id as proc_id, 
+              proc.name as proc_name, ds_in.ds_id, 
+              ds_in.param, in_mem.value
+       from   proc, ds_in, in_mem
+       where  proc.id=ds_in.proc_id and ds_in.ds_id=in_mem.id;
 
+drop view in_mem_out cascade;
+
+create view in_mem_out as
+       select proc.run_id, proc.id as proc_id, 
+              proc.name as proc_name, ds_out.ds_id, 
+              ds_out.param, in_mem.value
+       from   proc, ds_out, in_mem
+       where  proc.id=ds_out.proc_id and ds_out.ds_id=in_mem.id;
+ 
+drop view in_mem_use cascade;
+	    
+create view in_mem_use as
+       select * from in_mem_in
+     union
+       select * from in_mem_out;      
+
+drop view file_in cascade;
+
+create view file_in as
+       select proc.run_id, proc.id as proc_id, 
+              proc.name as proc_name, ds_in.ds_id, 
+              ds_in.param, file.name as value
+       from   proc, ds_in, file
+       where  proc.id=ds_in.proc_id and ds_in.ds_id=file.id;
+
+drop view file_out cascade;
+
+create view file_out as
+       select proc.run_id, proc.id as proc_id, 
+              proc.name as proc_name, ds_out.ds_id, 
+              ds_out.param, file.name as value
+       from   proc, ds_out, file
+       where  proc.id=ds_out.proc_id and ds_out.ds_id=file.id;
+ 	    
+drop view file_use cascade;
+
+create view file_use as
+       select * from file_in
+     union
+       select * from file_out;
+
+drop view ds_use cascade;
+
+create view ds_use as
+       select * from in_mem_use
+     union
+       select * from file_use;
+
+drop view a_t cascade;
+
+create view a_t as
+       select run.id, a_run_t.name, a_run_t.value
+       from   run, a_run_t 
+       where  run.id=a_run_t.run_id
+     union 
+       select proc.run_id, a_proc_t.name, a_proc_t.value
+       from   proc, a_proc_t 
+       where  proc.id=a_proc_t.proc_id
+     union
+       select ds_use.run_id, a_ds_t.name, a_ds_t.value
+       from   ds_use, a_ds_t
+       where  ds_use.ds_id=a_ds_t.ds_id;
+
+drop view a_n cascade;
+
+create view a_n as
+       select run.id, a_run_n.name, a_run_n.value
+       from   run, a_run_n 
+       where  run.id=a_run_n.run_id
+     union 
+       select proc.run_id, a_proc_n.name, a_proc_n.value
+       from   proc, a_proc_n 
+       where  proc.id=a_proc_n.proc_id
+     union
+       select ds_use.run_id, a_ds_n.name, a_ds_n.value
+       from   ds_use, a_ds_n
+       where  ds_use.ds_id=a_ds_n.ds_id;
+
+
+
+
+drop type compare_run_by_parameter_type cascade;
+create type compare_run_by_parameter_type as (run_id varchar, param varchar, value varchar);
+
+create or replace function compare_run_by_parameter(param_name varchar)
+returns setof compare_run_by_parameter_type
+as $$
+   select run_id, param, value
+   from   ds_use
+   where  param=$1;
+$$ language sql;
+
+-- PostgreSQL >= 9.0
+-- CREATE OR REPLACE FUNCTION compare_run_by_parameter(param_name VARCHAR) 
+-- RETURNS TABLE (
+--   run_id VARCHAR, 
+--   param VARCHAR, 
+--   value VARCHAR
+-- )
+-- AS $$
+--    SELECT   proc.run_id, ds_out.param, in_mem.value
+--    FROM     in_mem, ds_out, proc
+--    WHERE    in_mem.id=ds_out.ds_id AND ds_out.proc_id=proc.id AND 
+--             ds_out.param=$1 
+--    GROUP BY proc.run_id, ds_out.param, in_mem.value
+--  UNION
+--    SELECT   proc.run_id, ds_in.param, in_mem.value
+--    FROM     in_mem, ds_in, proc
+--    WHERE    in_mem.id=ds_in.ds_id AND ds_in.proc_id=proc.id AND 
+--             ds_in.param=$1 
+--    GROUP BY proc.run_id, ds_in.param, in_mem.value	
+--$$ LANGUAGE SQL;
+
+DROP TYPE compare_run_by_parameter_type2; 
+CREATE TYPE compare_run_by_parameter_type2 AS (run_id VARCHAR, param1 VARCHAR, value1 VARCHAR, param2 VARCHAR, value2 VARCHAR);
+
 CREATE OR REPLACE FUNCTION compare_run_by_parameter(param_name1 VARCHAR, param_name2 VARCHAR) 
-RETURNS TABLE (
-  workflow_id VARCHAR, 
-  param_name1 VARCHAR, 
-  value1 VARCHAR, 
-  param_name2 VARCHAR, 
-  value2 VARCHAR
-) 
+RETURNS SETOF compare_run_by_parameter_type2
 AS $$
   SELECT * 
   FROM   compare_run_by_parameter($1) as t 
          INNER JOIN 
          compare_run_by_parameter($2) as s 
-         USING (workflow_id); 
+         USING (run_id); 
 $$ LANGUAGE SQL;
 
+--CREATE OR REPLACE FUNCTION compare_run_by_parameter(param_name1 VARCHAR, param_name2 VARCHAR) 
+--RETURNS TABLE (
+--  workflow_id VARCHAR, 
+--  param_name1 VARCHAR, 
+--  value1 VARCHAR, 
+--  param_name2 VARCHAR, 
+--  value2 VARCHAR
+--) 
+--AS $$
+--  SELECT * 
+--  FROM   compare_run_by_parameter($1) as t 
+--         INNER JOIN 
+--         compare_run_by_parameter($2) as s 
+--         USING (workflow_id); 
+--$$ LANGUAGE SQL;
+
+
 CREATE OR REPLACE FUNCTION compare_run_by_parameter(param_name1 VARCHAR, param_name2 VARCHAR, param_name3 VARCHAR) 
 RETURNS TABLE (
   workflow_id VARCHAR, 
@@ -80,6 +201,7 @@
     WHERE  workflow.id=annot_wf_num.id and annot_wf_num.name=$1
 $$ LANGUAGE SQL;
 
+
 CREATE OR REPLACE FUNCTION compare_run_by_annot_txt(name VARCHAR)
 RETURNS TABLE (
   workflow_id VARCHAR, 
@@ -100,6 +222,7 @@
     WHERE    workflow.id=annot_wf_txt.id and annot_wf_txt.name=$1
 $$ LANGUAGE SQL;
 
+
 CREATE OR REPLACE FUNCTION compare_run_by_annot_bool(name VARCHAR)
 RETURNS TABLE (
   workflow_id VARCHAR,
@@ -121,6 +244,7 @@
     WHERE    workflow.id=annot_wf_bool.id and annot_wf_bool.name=$1
 $$ LANGUAGE SQL;
 
+
 -- correlate a parameter with workflow runtime statistics
 CREATE OR REPLACE FUNCTION correlate_param_runtime(param_name VARCHAR) 
 RETURNS TABLE (
@@ -137,16 +261,7 @@
 	       process.workflow_id=workflow.id AND ds_usage.param_name=$1 
 $$ LANGUAGE SQL;
 
--- lists distinct processes by name in a workflow
-CREATE OR REPLACE FUNCTION process_names(wf_id VARCHAR) 
-RETURNS SETOF VARCHAR AS $$
-	SELECT DISTINCT(process.name)
-	FROM   process
-	WHERE  process.workflow_id=$1;
-$$ LANGUAGE SQL;
-
 -- recursive query to find ancestor entities in a provenance graph
-
 CREATE OR REPLACE FUNCTION ancestors(varchar) RETURNS SETOF varchar AS $$
        WITH RECURSIVE anc(ancestor,descendant) AS
          (    
@@ -159,8 +274,8 @@
        SELECT ancestor FROM anc
 $$ LANGUAGE SQL;
 
+
 -- compare(<entity>, <list of param_names, annotations keys>
-
 CREATE OR REPLACE FUNCTION compare_run(VARIADIC args VARCHAR[])
 RETURNS SETOF RECORD AS $$
 DECLARE 
@@ -198,38 +313,3 @@
 $$ LANGUAGE plpgsql;
 
 
-CREATE OR REPLACE FUNCTION compare_run_print(VARIADIC args VARCHAR[])
-RETURNS VARCHAR AS $$
-DECLARE 
-  i             INTEGER;
-  q             VARCHAR;
-  selectq       VARCHAR;
-  fromq         VARCHAR;
-  property      VARCHAR;
-  property_type VARCHAR;
-  function_name VARCHAR;
-BEGIN 
-  selectq := 'SELECT *';
-  FOR i IN array_lower(args, 1)..array_upper(args, 1) LOOP
-    property_type := split_part(args[i], '=', 1);
-    property := split_part(args[i], '=', 2);
-    CASE property_type
-    WHEN 'param_name' THEN
-      function_name := 'compare_run_by_parameter';
-    WHEN 'annot_num' THEN
-      function_name := 'compare_run_by_annot_num';
-    WHEN 'annot_txt' THEN
-      function_name := 'compare_run_by_annot_txt';
-    WHEN 'annot_bool' THEN
-      function_name := 'compare_run_by_annot_bool';
-    END CASE;
-    IF i = 1 THEN
-      fromq := function_name || '(''' || property || ''') as t' || i;
-    ELSE
-      fromq := fromq || ' INNER JOIN ' || function_name || '(''' || property || ''') as t' || i || ' USING (workflow_id)';
-    END IF;
-  END LOOP;
-  q := selectq || ' FROM ' || fromq;
-  RETURN q;
-END;
-$$ LANGUAGE plpgsql;

Modified: provenancedb/prov-init.sql
===================================================================
--- provenancedb/prov-init.sql	2011-05-12 21:22:39 UTC (rev 4468)
+++ provenancedb/prov-init.sql	2011-05-13 00:24:49 UTC (rev 4469)
@@ -1,24 +1,25 @@
 -- this is the schema definition used for the main relational provenance
 -- implementation (in both sqlite3 and postgres)
-   
+
 drop table run cascade;
-drop table process cascade;
-drop table app_invocation cascade;
-drop table app_execution cascade;
-drop table runtime_info cascade;
-drop table dataset cascade;
+drop table proc cascade;
+drop table app_inv cascade;
+drop table app_exec cascade;
+drop table rt_info cascade;
+drop table ds cascade;
 drop table file cascade;
-drop table in_memory cascade;
-drop table dataset_containment cascade;
-drop table dataset_consumption cascade;
-drop table dataset_production cascade;
-drop table annot_dataset_numeric cascade;
-drop table annot_dataset_text cascade;
-drop table annot_process_numeric cascade;
-drop table annot_process_text cascade;
-drop table annot_run_numeric cascade;
-drop table annot_run_text cascade;
+drop table in_mem cascade;
+drop table ds_cont cascade;
+drop table ds_in cascade;
+drop table ds_out cascade;
+drop table a_ds_n cascade;
+drop table a_ds_t cascade;
+drop table a_proc_n cascade;
+drop table a_proc_t cascade;
+drop table a_run_n cascade;
+drop table a_run_t cascade;
 
+
 -- run stores information about each script run log that has
 -- been seen by the importer: the log filename, swift version and import
 -- status.
@@ -28,7 +29,8 @@
     (id            varchar(256) primary key,
      log_filename  varchar(2048),
      swift_version varchar(16),
-     final_state  varchar(16),
+     cog_version   varchar(16),
+     final_state   varchar(16),
      start_time    numeric,
      duration      numeric
     );
@@ -39,7 +41,7 @@
 -- must be the case that the specific type table
 -- has an entry for this process.
 -- process types: internal, rootthread, execute, function, compound, scope, operator
-create table process
+create table proc
     (id     varchar(256) primary key, 
      type   varchar(16),
      name   varchar(256), -- in the case of an execute this refers to the transformation name in tc.data
@@ -50,9 +52,9 @@
 -- this gives information about each execute.
 -- each execute is identified by a unique URI. other information from
 -- swift logs is also stored here. an execute is an OPM process.
-create table app_invocation
-    (id             varchar(256) primary key references process (id) on delete cascade,  
-     procedure_name varchar(256), -- name of the app procedure that invokes the transformation
+create table app_inv
+    (id             varchar(256) primary key references proc (id) on delete cascade,  
+     proc_name      varchar(256), -- name of the app procedure that invokes the transformation
      start_time     numeric,
      duration       numeric,
      final_state    varchar(16),
@@ -62,9 +64,9 @@
 -- this gives information about each execute2, which is an attempt to
 -- perform an execution. the execute2 id is tied to per-execution-attempt
 -- information such as wrapper logs
-create table app_execution
+create table app_exec
     (id                varchar(256) primary key,
-     app_invocation_id varchar(256) references app_invocation (id) on delete cascade, 
+     app_inv_id        varchar(256) references app_inv (id) on delete cascade, 
      start_time        numeric,
      duration          numeric,
      final_state       varchar(16),
@@ -73,26 +75,26 @@
 
 -- app execution runtime info extracted from the /proc filesystem (assumes the app executed
 -- in a Linux host) 
-create table runtime_info
-   ( app_execution_id   varchar(256) references app_execution (id) on delete cascade, 
+create table rt_info
+   ( app_exec_id        varchar(256) references app_exec (id) on delete cascade, 
      tstamp		numeric,
      cpu_usage          numeric,
      max_phys_mem	numeric,
-     max_virtual_mem	numeric,
-     io_read_bytes	numeric,
-     io_write_bytes	numeric,
-     primary key (app_execution_id, tstamp)
+     max_virt_mem	numeric,
+     io_read		numeric,
+     io_write		numeric,
+     primary key (app_exec_id, tstamp)
    );
 
 -- ds stores all dataset identifiers.
-create table dataset
+create table ds
     (id varchar(256) primary key
     );
 
 -- file stores the filename mapped to each dataset. 
 create table file
-    ( id         varchar(256) primary key references dataset (id) on delete cascade,
-      filename   varchar(2048)
+    ( id         varchar(256) primary key references ds (id) on delete cascade,
+      name       varchar(2048)
     );
 
 -- dataset_values stores the value for each dataset which is known to have
@@ -100,8 +102,8 @@
 -- to expose that value as an SQL type other than a string, and so (for
 -- example) SQL numerical operations should not be expected to work, even
 -- though the user knows that a particular dataset stores a numeric value.
-create table in_memory
-    ( id    varchar(256) primary key references dataset (id) on delete cascade,
+create table in_mem
+    ( id    varchar(256) primary key references ds (id) on delete cascade,
       value varchar(2048)
     );
 
@@ -112,9 +114,9 @@
 -- constructors and accessors, rather than, or in addition to,
 -- a containment hierarchy. The relationship (such as array index or
 -- structure member name) should also be stored in this table.
-create table dataset_containment
-    ( out_id varchar(256) references dataset (id) on delete cascade,
-      in_id  varchar(256) references dataset (id) on delete cascade,
+create table ds_cont
+    ( out_id varchar(256) references ds (id) on delete cascade,
+      in_id  varchar(256) references ds (id) on delete cascade,
       primary key (out_id,in_id)
     );
 
@@ -123,98 +125,99 @@
 -- in SwiftScript terms, the input and output parameters for each
 -- application procedure invocation; in OPM terms, the artificts which are
 -- input to and output from each process that is a Swift execution
-create table dataset_consumption
-    (process_id varchar(256) references process(id) on delete cascade, 
-     dataset_id varchar(256) references dataset(id) on delete cascade,
-     param_name varchar(256), -- the name of the parameter in this execute that
+create table ds_in
+    (proc_id varchar(256) references proc(id) on delete cascade, 
+     ds_id   varchar(256) references ds(id) on delete cascade,
+     param   varchar(256), -- the name of the parameter in this execute that
                               -- this dataset was bound to. sometimes this must
                               -- be contrived (for example, in positional varargs)
-     primary key (process_id,dataset_id,param_name)
+     primary key (proc_id,ds_id,param)
     );
 
-create table dataset_production
-    (process_id varchar(256) references process(id) on delete cascade, 
-     dataset_id varchar(256) references dataset(id) on delete cascade,
-     param_name varchar(256), -- the name of the parameter in this execute that
+create table ds_out
+    (proc_id varchar(256) references proc(id) on delete cascade, 
+     ds_id   varchar(256) references ds(id) on delete cascade,
+     param   varchar(256), -- the name of the parameter in this execute that
                               -- this dataset was bound to. sometimes this must
                               -- be contrived (for example, in positional varargs)
-     primary key (process_id,dataset_id,param_name)
+     primary key (proc_id,ds_id,param)
     );
 
 
--- annotations
-create table annot_dataset_numeric
-   ( dataset_id    varchar(256) references dataset (id) on delete cascade, 
+-- annotations (_n: numeric, _t: text)
+create table a_ds_n
+   ( ds_id varchar(256) references ds (id) on delete cascade, 
      name  varchar(256),
      value numeric,
-     primary key (dataset_id, name)
+     primary key (ds_id, name)
    );
 
-create table annot_dataset_text
-   ( dataset_id    varchar(256) references dataset (id) on delete cascade, 
+create table a_ds_t
+   ( ds_id varchar(256) references ds (id) on delete cascade, 
      name  varchar(256),
      value varchar(2048),
-     primary key (dataset_id, name)
+     primary key (ds_id, name)
    );
 
-create table annot_process_numeric
-   ( process_id    varchar(256) references process (id) on delete cascade, 
-     name  varchar(256),
-     value numeric,
-     primary key (process_id, name)
+create table a_proc_n
+   ( proc_id    varchar(256) references proc (id) on delete cascade, 
+     name       varchar(256),
+     value      numeric,
+     primary key (proc_id, name)
    );
 
-create table annot_process_text
-   ( process_id    varchar(256) references process (id) on delete cascade, 
-     name  varchar(256),
-     value varchar(2048),
-     primary key (process_id, name)
+create table a_proc_t
+   ( proc_id    varchar(256) references proc (id) on delete cascade, 
+     name       varchar(256),
+     value      varchar(2048),
+     primary key (proc_id, name)
    );
 
-create table annot_run_numeric
+create table a_run_n
    ( run_id    varchar(256) references run (id) on delete cascade, 
-     name  varchar(256),
-     value numeric,
+     name      varchar(256),
+     value     numeric,
      primary key (run_id, name)
    );
 
-create table annot_run_text
+create table a_run_t
    ( run_id    varchar(256) references run (id) on delete cascade, 
-     name  varchar(256),
-     value varchar(2048),
+     name      varchar(256),
+     value     varchar(2048),
      primary key (run_id, name)
    );
 
-create view provenance_graph as 
-       select process_id as parent,dataset_id as child from dataset_production
+create view pgraph_edge as 
+       select proc_id as parent,ds_id as child from ds_out
        union
-       select dataset_id as parent,process_id as child from dataset_consumption
+       select ds_id as parent,proc_id as child from ds_in
        union 
-       select out_id as parent,in_id as child from dataset_containment;
+       select out_id as parent,in_id as child from ds_cont;
 
+-- continue renaming from here
 
 CREATE VIEW run_date AS
-  SELECT id, log_filename, swift_version, final_state, 
+  SELECT id, log_filename, swift_version, cog_version, final_state, 
          to_timestamp(start_time) as start_time, duration
   FROM   run;
 
-CREATE VIEW annot_text AS
+CREATE VIEW a_t AS
     SELECT *
-    FROM annot_run_text 
+    FROM a_run_t 
   UNION
     SELECT * 
-    FROM annot_dataset_text 
+    FROM a_ds_t 
   UNION 
     SELECT * 
-    FROM annot_process_text;
+    FROM a_proc_t;
 
-CREATE VIEW annot_numeric AS
+CREATE VIEW a_n AS
     SELECT *
-    FROM annot_run_numeric 
+    FROM a_run_n 
   UNION
     SELECT * 
-    FROM annot_dataset_numeric 
+    FROM a_ds_n 
   UNION 
     SELECT * 
-    FROM annot_process_numeric;
+    FROM a_proc_n;
 

Modified: provenancedb/prov-to-sql.sh
===================================================================
--- provenancedb/prov-to-sql.sh	2011-05-12 21:22:39 UTC (rev 4468)
+++ provenancedb/prov-to-sql.sh	2011-05-13 00:24:49 UTC (rev 4469)
@@ -8,23 +8,21 @@
 export WF="${RUNID}"
 
 echo Generating SQL for $RUNID
-rm -f tmp-u.sql tmp-ds.sql tmp-p.sql tmp-e.sql tmp-e2.sql tmp-dsu.sql tmp-dsc.sql tmp-f.sql tmp-v.sql tmp-import.sql import.sql extrainfo.sql fields.txt runtime.sql
 
-
 # this gives a distinction between the root process for a workflow and the
 # workflow itself. perhaps better to model the workflow as a process
-echo "INSERT INTO process (id, type, name, run_id) VALUES ('${WFID}0', 'rootthread', '$RUNID', '$WF');" > tmp-p.sql
+$SQLCMD -c "INSERT INTO proc (id, type, name, run_id) VALUES ('${WFID}0', 'rootthread', '$RUNID', '$WF');"
 
 while read time duration thread localthread endstate tr_name scratch; do
-    echo "INSERT INTO process (id, type, name, run_id) VALUES ('$thread', 'execute', '$tr_name', '$WF');" >> tmp-p.sql
-    echo "INSERT INTO app_invocation (id, start_time, duration, final_state, scratch) VALUES ('$thread', $time, $duration, '$endstate', '$scratch');" >> tmp-e.sql
+    $SQLCMD -c "INSERT INTO proc (id, type, name, run_id) VALUES ('$thread', 'execute', '$tr_name', '$WF');" 
+    $SQLCMD -c "INSERT INTO app_inv (id, start_time, duration, final_state, scratch) VALUES ('$thread', $time, $duration, '$endstate', '$scratch');" 
 done < execute.global.event
 
 while read start_time duration globalid id endstate thread site scratch; do
     # cut off the last component of the thread, so that we end up at the
     # parent thread id which should correspond with the execute-level ID
     inv_id="$WFID$(echo $thread | sed 's/-[^-]*$//')"
-    echo "INSERT INTO app_execution (id, app_invocation_id, start_time, duration, final_state, site) VALUES ('$globalid', '$inv_id', $start_time, $duration, '$endstate', '$site');" >> tmp-e2.sql
+    $SQLCMD -c  "INSERT INTO app_exec (id, app_inv_id, start_time, duration, final_state, site) VALUES ('$globalid', '$inv_id', $start_time, $duration, '$endstate', '$site');"
 done < execute2.global.event
 
 while read col1 col2 col3 col4 col5 thread name lhs rhs result; do
@@ -42,22 +40,22 @@
 	result=$(echo $result | sed -e 's/tag:benc at ci.uchicago.edu,2008:swift://g')
     fi
     
-    echo "INSERT INTO dataset (id) VALUES ('$lhs');" >> tmp-ds.sql
-    echo "INSERT INTO dataset (id) VALUES ('$rhs');" >> tmp-ds.sql
-    echo "INSERT INTO dataset (id) VALUES ('$result');" >> tmp-ds.sql
-    echo "INSERT INTO process (id, type, name, run_id) VALUES ('$operatorid', 'operator', '$name', '$WF');" >> tmp-p.sql
-    echo "INSERT INTO dataset_consumption (process_id, dataset_id, param_name) VALUES ('$operatorid', '$lhs', 'lhs');" >> tmp-dsu.sql
-    echo "INSERT INTO dataset_consumption (process_id, dataset_id, param_name) VALUES ('$operatorid', '$rhs', 'rhs');" >> tmp-dsu.sql
-    echo "INSERT INTO dataset_production (process_id, dataset_id, param_name) VALUES ('$operatorid', '$result', 'result');" >> tmp-dsu.sql
+    $SQLCMD -c  "INSERT INTO ds (id) VALUES ('$lhs');"
+    $SQLCMD -c  "INSERT INTO ds (id) VALUES ('$rhs');"
+    $SQLCMD -c  "INSERT INTO ds (id) VALUES ('$result');"
+    $SQLCMD -c  "INSERT INTO proc (id, type, name, run_id) VALUES ('$operatorid', 'operator', '$name', '$WF');"
+    $SQLCMD -c  "INSERT INTO ds_in (proc_id, ds_id, param) VALUES ('$operatorid', '$lhs', 'lhs');"
+    $SQLCMD -c  "INSERT INTO ds_in (proc_id, ds_id, param) VALUES ('$operatorid', '$rhs', 'rhs');"
+    $SQLCMD -c  "INSERT INTO ds_out (proc_id, ds_id, param) VALUES ('$operatorid', '$result', 'result');"
 done < operators.txt
 
 while read id name output; do
     if [ $version -le 3726 ]; then
 	output=$(echo $output | sed -e 's/tag:benc at ci.uchicago.edu,2008:swift://g')
     fi
-    echo "INSERT INTO dataset (id) VALUES ('$output');" >> tmp-ds.sql
-    echo "INSERT INTO process (id, type, name, run_id) VALUES ('$id', 'function', '$name', '$WF');" >> tmp-p.sql
-    echo "INSERT INTO dataset_production (process_id, dataset_id, param_name) VALUES ('$id', '$output', 'result');" >> tmp-dsu.sql
+    $SQLCMD -c  "INSERT INTO ds (id) VALUES ('$output');"
+    $SQLCMD -c  "INSERT INTO proc (id, type, name, run_id) VALUES ('$id', 'function', '$name', '$WF');"
+    $SQLCMD -c  "INSERT INTO ds_out (proc_id, ds_id, param) VALUES ('$id', '$output', 'result');"
 done < functions.txt
 
 while read id value; do
@@ -67,13 +65,13 @@
 	value=$(echo $value | sed -e 's/tag:benc at ci.uchicago.edu,2008:swift://g')
     fi
     
-    echo "INSERT INTO dataset (id) VALUES ('$value');" >> tmp-ds.sql
-    echo "INSERT INTO dataset_consumption (process_id, dataset_id, param_name) VALUES ('$id', '$value', 'undefined');" >> tmp-dsu.sql
+    $SQLCMD -c  "INSERT INTO ds (id) VALUES ('$value');"
+    $SQLCMD -c  "INSERT INTO ds_in (proc_id, ds_id, param) VALUES ('$id', '$value', 'undefined');"
 done < function-inputs.txt
 
 
 while read thread appname; do
-    echo "UPDATE app_invocation SET procedure_name='$appname' WHERE id='$thread';" >> tmp-e.sql
+    $SQLCMD -c  "UPDATE app_inv SET proc_name='$appname' WHERE id='$thread';"
 done < invocation-procedure-names.txt
 
 while read outer inner; do
@@ -83,9 +81,9 @@
 	inner=$(echo $inner | sed -e 's/tag:benc at ci.uchicago.edu,2008:swift://g')
     fi
 
-    echo "INSERT INTO dataset (id) VALUES ('$outer');" >> tmp-ds.sql
-    echo "INSERT INTO dataset (id) VALUES ('$inner');" >> tmp-ds.sql
-    echo "INSERT INTO dataset_containment (out_id, in_id) VALUES ('$outer', '$inner');" >> tmp-dsc.sql
+    $SQLCMD -c  "INSERT INTO ds (id) VALUES ('$outer');"
+    $SQLCMD -c  "INSERT INTO ds (id) VALUES ('$inner');"
+    $SQLCMD -c  "INSERT INTO ds_cont (out_id, in_id) VALUES ('$outer', '$inner');"
 done < tie-containers.txt
 
 while read dataset filename; do
@@ -94,8 +92,8 @@
 	dataset=$(echo $dataset | sed -e 's/tag:benc at ci.uchicago.edu,2008:swift://g')
     fi
 
-    echo "INSERT INTO dataset (id) VALUES ('$dataset');" >> tmp-ds.sql
-    echo "INSERT INTO file (id, filename) VALUES ('$dataset', '$filename');" >> tmp-f.sql
+    $SQLCMD -c "INSERT INTO ds (id) VALUES ('$dataset');"
+    $SQLCMD -c "INSERT INTO file (id, name) VALUES ('$dataset', '$filename');"
 done < dataset-filenames.txt
 
 while read dataset value; do
@@ -104,13 +102,13 @@
 	dataset=$(echo $dataset | sed -e 's/tag:benc at ci.uchicago.edu,2008:swift://g')
     fi
 
-    echo "INSERT INTO dataset (id) VALUES ('$dataset');" >> tmp-ds.sql
-    echo "INSERT INTO in_memory (id, value) VALUES ('$dataset', '$value');" >> tmp-v.sql
+    $SQLCMD -c "INSERT INTO ds (id) VALUES ('$dataset');"
+    $SQLCMD -c "INSERT INTO in_mem (id, value) VALUES ('$dataset', '$value');"
 done < dataset-values.txt
 
 while read start duration wfid rest; do
-    echo "UPDATE run SET start_time=$start WHERE id='$WF';" >> tmp-u.sql
-    echo "UPDATE run SET duration=$duration WHERE id='$WF';" >> tmp-u.sql
+    $SQLCMD -c "UPDATE run SET start_time=$start WHERE id='$WF';"
+    $SQLCMD -c "UPDATE run SET duration=$duration WHERE id='$WF';"
 done < workflow.event
 
 
@@ -118,20 +116,20 @@
 while read start duration thread final_state procname ; do
     if [ "$duration" != "last-event-line" ]; then
 	compoundid=$WFID$thread
-	echo "INSERT INTO process (id, type, name, run_id) VALUES ('$compoundid', 'compound', '$procname', '$WF');" >> tmp-p.sql
+	$SQLCMD -c "INSERT INTO proc (id, type, name, run_id) VALUES ('$compoundid', 'compound', '$procname', '$WF');"
     fi
 done < compound.event
 
 while read start duration thread final_state procname ; do
     if [ "$duration" != "last-event-line" ]; then
 	fqid=$WFID$thread
-	echo "INSERT INTO process (id, type, name, run_id) VALUES ('$fqid', 'internal', '$procname', '$WF');" >> tmp-p.sql
+	$SQLCMD -c "INSERT INTO proc (id, type, name, run_id) VALUES ('$fqid', 'internal', '$procname', '$WF');"
     fi	
 done < internalproc.event
 
 while read t ; do 
     thread="${WFID}$t"
-    echo "INSERT INTO process (id, type, name, run_id) VALUES ('$thread', 'scope', 'scope', '$WF');" >> tmp-p.sql
+    $SQLCMD -c "INSERT INTO proc (id, type, name, run_id) VALUES ('$thread', 'scope', 'scope', '$WF');"
 done < scopes.txt
 
 while read thread direction dataset variable rest; do 
@@ -141,57 +139,46 @@
     fi
 
     if [ "$direction" == "input" ] ; then
-	table=dataset_consumption
+	table=ds_in
     else
-	table=dataset_production
+	table=ds_out
     fi
-
-    echo "INSERT INTO dataset (id) VALUES ('$dataset');" >> tmp-ds.sql
-    echo "INSERT INTO $table (process_id, dataset_id, param_name) VALUES ('$thread', '$dataset', '$variable');" >> tmp-dsu.sql
+    
+    EXISTING=$($SQLCMD --tuples-only -c "select count(*) from ds where ds.id='$dataset';")
+    
+    if [ "$EXISTING" -eq "0" ];  then
+	$SQLCMD -c "INSERT INTO ds (id) VALUES ('$dataset');"
+    fi 
+    $SQLCMD -c "INSERT INTO $table (proc_id, ds_id, param) VALUES ('$thread', '$dataset', '$variable');"
 done < tie-data-invocs.txt
 
-cat tmp-ds.sql | sort | uniq >> import.sql
-cat tmp-f.sql | sort | uniq >> import.sql
-cat tmp-v.sql | sort | uniq >> import.sql
-cat tmp-p.sql | sort | uniq >> import.sql
-cat tmp-e.sql | sort | uniq >> import.sql
-cat tmp-e2.sql | sort | uniq >> import.sql
-cat tmp-dsu.sql | sort | uniq >> import.sql
-cat tmp-dsc.sql | sort | uniq >> import.sql
-cat tmp-import.sql | sort | uniq >> import.sql
-cat tmp-u.sql | sort | uniq >> import.sql
-echo Sending SQL to DB
-
-$SQLCMD < import.sql
-
-while read execute2_id extrainfo; do
-    echo $extrainfo | awk -F ";"  '{ for (i = 1; i <= NF; i++)
-                                             print $i
+if [ -f extrainfo.txt ]; then
+    while read execute2_id extrainfo; do
+	echo $extrainfo | awk -F ";"  '{ for (i = 1; i <= NF; i++)
+                                               print $i
                                          }' | awk -F "=" '{ print $1 " " $2 }' | awk -F ":" '{ print $1 " " $2 }' > fields.txt
-    id=$($SQLCMD --tuples-only -c "select app_invocation_id from app_execution where id='$execute2_id';" | awk '{print $1}')
-    while read name type value; do
-	if [ "$type" = "num" ]; then
-	    echo "INSERT INTO annot_process_numeric (id, name, value) VALUES ('$id', '$name', $value);" >> extrainfo.sql
-	fi 
-	if [ "$type" = "txt" ]; then
-	    echo "INSERT INTO annot_process_text (id, name, value) VALUES ('$id', '$name', '$value');" >> extrainfo.sql
-	fi
-    done < fields.txt
-done < extrainfo.txt
+	id=$($SQLCMD --tuples-only -c "select app_inv_id from app_exec where id='$execute2_id';" | awk '{print $1}')
+	while read name type value; do
+	    if [ "$type" = "num" ]; then
+		$SQLCMD -c "INSERT INTO a_proc_n (id, name, value) VALUES ('$id', '$name', $value);"
+	    fi 
+	    if [ "$type" = "txt" ]; then
+		$SQLCMD -c "INSERT INTO a_proc_t (id, name, value) VALUES ('$id', '$name', '$value');"
+	    fi
+	done < fields.txt
+    done < extrainfo.txt
+fi
 
-$SQLCMD < extrainfo.sql
+if [ -f runtime.txt ]; then
+    while read execute2_id runtime; do
+	timestamp=$(echo $runtime | awk -F "," '{print $1}' | awk -F ":" '{print $2}')
+	cpu_usage=$(echo $runtime | awk -F "," '{print $2}' | awk -F ":" '{print $2}')
+	max_phys_mem=$(echo $runtime | awk -F "," '{print $3}' | awk -F ":" '{print $2}')
+	max_virtual_mem=$(echo $runtime | awk -F "," '{print $4}' | awk -F ":" '{print $2}')
+	io_read_bytes=$(echo $runtime | awk -F "," '{print $5}' | awk -F ":" '{print $2}')
+	io_write_bytes=$(echo $runtime | awk -F "," '{print $6}' | awk -F ":" '{print $2}')
+	$SQLCMD -c "INSERT INTO runtime_info (app_execution_id, tstamp, cpu_usage, max_phys_mem, max_virtual_mem, io_read_bytes, io_write_bytes) VALUES ('$execute2_id', $timestamp, $cpu_usage, $max_phys_mem, $max_virtual_mem, $io_read_bytes, $io_write_bytes);"
+    done < runtime.txt
+fi
 
-while read execute2_id runtime; do
-    timestamp=$(echo $runtime | awk -F "," '{print $1}' | awk -F ":" '{print $2}')
-    cpu_usage=$(echo $runtime | awk -F "," '{print $2}' | awk -F ":" '{print $2}')
-    max_phys_mem=$(echo $runtime | awk -F "," '{print $3}' | awk -F ":" '{print $2}')
-    max_virtual_mem=$(echo $runtime | awk -F "," '{print $4}' | awk -F ":" '{print $2}')
-    io_read_bytes=$(echo $runtime | awk -F "," '{print $5}' | awk -F ":" '{print $2}')
-    io_write_bytes=$(echo $runtime | awk -F "," '{print $6}' | awk -F ":" '{print $2}')
-    echo "INSERT INTO runtime_info (app_execution_id, tstamp, cpu_usage, max_phys_mem, max_virtual_mem, io_read_bytes, io_write_bytes) VALUES ('$execute2_id', $timestamp, $cpu_usage, $max_phys_mem, $max_virtual_mem, $io_read_bytes, $io_write_bytes);" >> runtime.sql
-done < runtime.txt
-
-$SQLCMD < runtime.sql
-
-echo Finished sending SQL to DB
-
+echo Finished sending SQL to DB
\ No newline at end of file

Modified: provenancedb/swift-prov-import-all-logs
===================================================================
--- provenancedb/swift-prov-import-all-logs	2011-05-12 21:22:39 UTC (rev 4468)
+++ provenancedb/swift-prov-import-all-logs	2011-05-13 00:24:49 UTC (rev 4469)
@@ -11,7 +11,7 @@
 source $PROVDIR/etc/provenance.config
 export PATH=$PROVDIR:$PATH
 
-# this generates a file with pairs like:
+# this generates a file with tuples like:
 # <starttime> <swift version> <logfilename> 
 swift-plot-log $LOGREPO everylog-vs-versions.data
 
@@ -55,7 +55,7 @@
 		
 		export WF="${RUNID}"
 		
-		echo "INSERT INTO run (id, log_filename, swift_version, final_state) VALUES ('$WF','$filename','$version','$wfstatus');" | $SQLCMD
+		$SQLCMD -c "INSERT INTO run (id, log_filename, swift_version, cog_version, final_state) VALUES ('$WF','$filename','$version', '', '$wfstatus');" 
 		
 		echo version $version in log file $filename
 		echo ============= will import =============




More information about the Swift-commit mailing list