[Swift-commit] r5786 - in provenancedb: . swift_mod

lgadelha at ci.uchicago.edu
Thu May 10 07:19:03 CDT 2012


Author: lgadelha
Date: 2012-05-10 07:27:54 -0500 (Thu, 10 May 2012)
New Revision: 5786

Added:
   provenancedb/swift_mod/
   provenancedb/swift_mod/_swiftwrap_runtime_aggregate
   provenancedb/swift_mod/_swiftwrap_runtime_snapshots
   provenancedb/swift_mod/create-everylog-vs-versions-data
Modified:
   provenancedb/pql_functions.sql
   provenancedb/prepare-provenance-chart
   provenancedb/prov-init.sql
   provenancedb/prov-to-sql.sh
   provenancedb/swift-prov-import-all-logs
Log:
- Added prospective provenance import scripts.
- Created the swift_mod directory, which contains the Swift modifications needed to gather runtime information about application executions.


Modified: provenancedb/pql_functions.sql
===================================================================
--- provenancedb/pql_functions.sql	2012-05-09 22:18:28 UTC (rev 5785)
+++ provenancedb/pql_functions.sql	2012-05-10 12:27:54 UTC (rev 5786)
@@ -12,8 +12,8 @@
 returns setof compare_run_by_parameter_type
 as $$
    select run_id, parameter, value
-   from   ds_io,fun_call,primitive
-   where  fun_call.id=ds_io.fun_call_id and ds_io.ds_id=primitive.id and parameter=$1;
+   from   dataset_io,fun_call,primitive
+   where  fun_call.id=dataset_io.function_call_id and dataset_io.dataset_id=primitive.id and parameter=$1;
 $$ language sql;
 
 -- PostgreSQL >= 9.0
@@ -26,13 +26,13 @@
 -- AS $$
 --    SELECT   fun_call.run_id, ds_out.parameter, primitive.value
 --    FROM     primitive, ds_out, fun_call
---    WHERE    primitive.id=ds_out.ds_id AND ds_out.fun_call_id=fun_call.id AND 
+--    WHERE    primitive.id=ds_out.dataset_id AND ds_out.fun_call_id=fun_call.id AND 
 --             ds_out.parameter=$1 
 --    GROUP BY fun_call.run_id, ds_out.parameter, primitive.value
 --  UNION
 --    SELECT   fun_call.run_id, ds_in.parameter, primitive.value
 --    FROM     primitive, ds_in, fun_call
---    WHERE    primitive.id=ds_in.ds_id AND ds_in.fun_call_id=fun_call.id AND 
+--    WHERE    primitive.id=ds_in.dataset_id AND ds_in.fun_call_id=fun_call.id AND 
 --             ds_in.parameter=$1 
 --    GROUP BY fun_call.run_id, ds_in.parameter, primitive.value	
 --$$ LANGUAGE SQL;
@@ -54,105 +54,105 @@
 --         USING (workflow_id); 
 --$$ LANGUAGE SQL;
 
-DROP TYPE compare_run_by_annot_num_type;
+DROP TYPE compare_run_by_annot_num_type CASCADE;
 CREATE TYPE compare_run_by_annot_num_type as (run_id VARCHAR, name VARCHAR, value NUMERIC);
 
 CREATE OR REPLACE FUNCTION compare_run_by_annot_num(name VARCHAR)
 RETURNS SETOF compare_run_by_annot_num_type
 AS $$
-    SELECT fun_call.run_id, annot_ds_num.name, annot_ds_num.value
-    FROM   annot_ds_num,ds_io,ds_cont,fun_call
-    WHERE  annot_ds_num.ds_id=ds_cont.in_id AND ds_cont.out_id=ds_io.ds_id AND
-           ds_io.fun_call_id=fun_call.id AND annot_ds_num.name=$1
+    SELECT fun_call.run_id, annot_dataset_num.name, annot_dataset_num.value
+    FROM   annot_dataset_num,dataset_io,dataset_containment,fun_call
+    WHERE  annot_dataset_num.dataset_id=dataset_containment.in_id AND dataset_containment.out_id=dataset_io.dataset_id AND
+           dataset_io.function_call_id=fun_call.id AND annot_dataset_num.name=$1
   UNION
-    SELECT fun_call.run_id, annot_ds_num.name, annot_ds_num.value 
-    FROM   fun_call, ds_io, annot_ds_num
-    WHERE  fun_call.id=ds_io.fun_call_id and ds_io.ds_id=annot_ds_num.ds_id and
-           annot_ds_num.name=$1
+    SELECT fun_call.run_id, annot_dataset_num.name, annot_dataset_num.value 
+    FROM   fun_call, dataset_io, annot_dataset_num
+    WHERE  fun_call.id=dataset_io.function_call_id and dataset_io.dataset_id=annot_dataset_num.dataset_id and
+           annot_dataset_num.name=$1
   UNION
-    SELECT fun_call.run_id, annot_fun_call_num.name, annot_fun_call_num.value 
-    FROM   fun_call, annot_fun_call_num
-    WHERE  fun_call.id=annot_fun_call_num.fun_call_id and annot_fun_call_num.name=$1
+    SELECT fun_call.run_id, annot_function_call_num.name, annot_function_call_num.value 
+    FROM   fun_call, annot_function_call_num
+    WHERE  fun_call.id=annot_function_call_num.function_call_id and annot_function_call_num.name=$1
   UNION
-    SELECT run.id as run_id, annot_run_num.name, annot_run_num.value 
-    FROM   run, annot_run_num
-    WHERE  run.id=annot_run_num.run_id and annot_run_num.name=$1
+    SELECT run.id as run_id, annot_script_run_num.name, annot_script_run_num.value 
+    FROM   run, annot_script_run_num
+    WHERE  run.id=annot_script_run_num.script_run_id and annot_script_run_num.name=$1
 $$ LANGUAGE SQL;
 
-DROP TYPE compare_run_by_key_numeric_type;
+DROP TYPE compare_run_by_key_numeric_type CASCADE;
 CREATE TYPE compare_run_by_key_numeric_type as (run_id VARCHAR, name VARCHAR, value NUMERIC);
 
 CREATE OR REPLACE FUNCTION compare_run_by_key_numeric(name VARCHAR)
 RETURNS SETOF compare_run_by_key_numeric_type
 AS $$
-    SELECT fun_call.run_id, annot_ds_n.name, annot_ds_n.value
-    FROM   annot_ds_n,ds_io,ds_cont,fun_call
-    WHERE  annot_ds_n.ds_id=ds_cont.in_id AND ds_cont.out_id=ds_io.ds_id AND
-           ds_io.fun_call_id=fun_call.id AND annot_ds_n.name=$1
+    SELECT fun_call.run_id, annot_dataset_num.name, annot_dataset_num.value
+    FROM   annot_dataset_num,dataset_io,dataset_containment,fun_call
+    WHERE  annot_dataset_num.dataset_id=dataset_containment.in_id AND dataset_containment.out_id=dataset_io.dataset_id AND
+           dataset_io.function_call_id=fun_call.id AND annot_dataset_num.name=$1
   UNION
-    SELECT fun_call.run_id, annot_ds_n.name, annot_ds_n.value 
-    FROM   fun_call, ds_io, annot_ds_n
-    WHERE  fun_call.id=ds_io.fun_call_id and ds_io.ds_id=annot_ds_n.ds_id and
-           annot_ds_n.name=$1
+    SELECT fun_call.run_id, annot_dataset_num.name, annot_dataset_num.value 
+    FROM   fun_call, dataset_io, annot_dataset_num
+    WHERE  fun_call.id=dataset_io.function_call_id and dataset_io.dataset_id=annot_dataset_num.dataset_id and
+           annot_dataset_num.name=$1
   UNION
-    SELECT fun_call.run_id, annot_fun_call_n.name, annot_fun_call_n.value 
-    FROM   fun_call, annot_fun_call_n
-    WHERE  fun_call.id=annot_fun_call_n.fun_call_id and annot_fun_call_n.name=$1
+    SELECT fun_call.run_id, annot_function_call_num.name, annot_function_call_num.value 
+    FROM   fun_call, annot_function_call_num
+    WHERE  fun_call.id=annot_function_call_num.function_call_id and annot_function_call_num.name=$1
   UNION
-    SELECT run.id as run_id, annot_run_n.name, annot_run_n.value 
-    FROM   run, annot_run_n
-    WHERE  run.id=annot_run_n.run_id and annot_run_n.name=$1
+    SELECT run.id as run_id, annot_script_run_num.name, annot_script_run_num.value 
+    FROM   run, annot_script_run_num
+    WHERE  run.id=annot_script_run_num.script_run_id and annot_script_run_num.name=$1
 $$ LANGUAGE SQL;
 
 
-DROP TYPE compare_run_by_annot_txt_type;
+DROP TYPE compare_run_by_annot_txt_type CASCADE;
 CREATE TYPE compare_run_by_annot_txt_type as (run_id VARCHAR, name VARCHAR, value VARCHAR);
 
 CREATE OR REPLACE FUNCTION compare_run_by_annot_txt(name VARCHAR)
 RETURNS SETOF compare_run_by_annot_txt_type
 AS $$
-    SELECT fun_call.run_id, annot_ds_text.name, annot_ds_text.value
-    FROM   annot_ds_text,ds_io,ds_cont,fun_call
-    WHERE  annot_ds_text.ds_id=ds_cont.in_id AND ds_cont.out_id=ds_io.ds_id AND
-           ds_io.fun_call_id=fun_call.id AND annot_ds_text.name=$1
+    SELECT fun_call.run_id, annot_dataset_text.name, annot_dataset_text.value
+    FROM   annot_dataset_text,dataset_io,dataset_containment,fun_call
+    WHERE  annot_dataset_text.dataset_id=dataset_containment.in_id AND dataset_containment.out_id=dataset_io.dataset_id AND
+           dataset_io.function_call_id=fun_call.id AND annot_dataset_text.name=$1
   UNION
-    SELECT fun_call.run_id, annot_ds_text.name, annot_ds_text.value 
-    FROM   fun_call, ds_io, annot_ds_text
-    WHERE  fun_call.id=ds_io.fun_call_id and ds_io.ds_id=annot_ds_text.ds_id and
-           annot_ds_text.name=$1
+    SELECT fun_call.run_id, annot_dataset_text.name, annot_dataset_text.value 
+    FROM   fun_call, dataset_io, annot_dataset_text
+    WHERE  fun_call.id=dataset_io.function_call_id and dataset_io.dataset_id=annot_dataset_text.dataset_id and
+           annot_dataset_text.name=$1
   UNION
-    SELECT fun_call.run_id, annot_fun_call_text.name, annot_fun_call_text.value 
-    FROM   fun_call, annot_fun_call_text
-    WHERE  fun_call.id=annot_fun_call_text.fun_call_id and annot_fun_call_text.name=$1
+    SELECT fun_call.run_id, annot_function_call_text.name, annot_function_call_text.value 
+    FROM   fun_call, annot_function_call_text
+    WHERE  fun_call.id=annot_function_call_text.function_call_id and annot_function_call_text.name=$1
   UNION
-    SELECT run.id as run_id, annot_run_text.name, annot_run_text.value 
-    FROM   run, annot_run_text
-    WHERE  run.id=annot_run_text.run_id and annot_run_text.name=$1
+    SELECT run.id as run_id, annot_script_run_text.name, annot_script_run_text.value 
+    FROM   run, annot_script_run_text
+    WHERE  run.id=annot_script_run_text.script_run_id and annot_script_run_text.name=$1
 $$ LANGUAGE SQL;
 
-DROP TYPE compare_run_by_key_text_type;
+DROP TYPE compare_run_by_key_text_type CASCADE;
 CREATE TYPE compare_run_by_key_text_type as (run_id VARCHAR, name VARCHAR, value VARCHAR);
 
 CREATE OR REPLACE FUNCTION compare_run_by_key_text(name VARCHAR)
 RETURNS SETOF compare_run_by_key_text_type
 AS $$
-    SELECT fun_call.run_id, annot_ds_text.name, annot_ds_text.value
-    FROM   annot_ds_text,ds_io,ds_cont,fun_call
-    WHERE  annot_ds_text.ds_id=ds_cont.in_id AND ds_cont.out_id=ds_io.ds_id AND
-           ds_io.fun_call_id=fun_call.id AND annot_ds_text.name=$1
+    SELECT fun_call.run_id, annot_dataset_text.name, annot_dataset_text.value
+    FROM   annot_dataset_text,dataset_io,dataset_containment,fun_call
+    WHERE  annot_dataset_text.dataset_id=dataset_containment.in_id AND dataset_containment.out_id=dataset_io.dataset_id AND
+           dataset_io.function_call_id=fun_call.id AND annot_dataset_text.name=$1
   UNION
-    SELECT fun_call.run_id, annot_ds_text.name, annot_ds_text.value 
-    FROM   fun_call, ds_io, annot_ds_text
-    WHERE  fun_call.id=ds_io.fun_call_id and ds_io.ds_id=annot_ds_text.ds_id and
-           annot_ds_text.name=$1
+    SELECT fun_call.run_id, annot_dataset_text.name, annot_dataset_text.value 
+    FROM   fun_call, dataset_io, annot_dataset_text
+    WHERE  fun_call.id=dataset_io.function_call_id and dataset_io.dataset_id=annot_dataset_text.dataset_id and
+           annot_dataset_text.name=$1
   UNION
-    SELECT fun_call.run_id, annot_fun_call_text.name, annot_fun_call_text.value 
-    FROM   fun_call, annot_fun_call_text
-    WHERE  fun_call.id=annot_fun_call_text.fun_call_id and annot_fun_call_text.name=$1
+    SELECT fun_call.run_id, annot_function_call_text.name, annot_function_call_text.value 
+    FROM   fun_call, annot_function_call_text
+    WHERE  fun_call.id=annot_function_call_text.function_call_id and annot_function_call_text.name=$1
   UNION
-    SELECT run.id as run_id, annot_run_text.name, annot_run_text.value 
-    FROM   run, annot_run_text
-    WHERE  run.id=annot_run_text.run_id and annot_run_text.name=$1
+    SELECT run.id as run_id, annot_script_run_text.name, annot_script_run_text.value 
+    FROM   run, annot_script_run_text
+    WHERE  run.id=annot_script_run_text.script_run_id and annot_script_run_text.name=$1
 $$ LANGUAGE SQL;
 
 -- CREATE OR REPLACE FUNCTION compare_run_by_annot_num(name VARCHAR)
@@ -162,23 +162,23 @@
 --   value NUMERIC
 -- )
 -- AS $$
---     SELECT fun_call.workflow_id, annot_ds_num.name, annot_ds_num.value
---     FROM   annot_ds_num,ds_usage,ds_containment,fun_call
---     WHERE  annot_ds_num.id=ds_containment.in_id AND ds_containment.out_id=ds_usage.dataset_id AND
---            ds_usage.fun_call_id=fun_call.id AND annot_ds_num.name=$1
+--     SELECT fun_call.workflow_id, annot_dataset_num.name, annot_dataset_num.value
+--     FROM   annot_dataset_num,dataset_io,dataset_containment,fun_call
+--     WHERE  annot_dataset_num.id=dataset_containment.in_id AND dataset_containment.out_id=dataset_io.dataset_id AND
+--            dataset_io.function_call_id=fun_call.id AND annot_dataset_num.name=$1
 --   UNION
---     SELECT fun_call.workflow_id, annot_ds_num.name, annot_ds_num.value 
---     FROM   fun_call, ds_usage, annot_ds_num
---     WHERE  fun_call.id=ds_usage.fun_call_id and ds_usage.dataset_id=annot_ds_num.id and
---            annot_ds_num.name=$1
+--     SELECT fun_call.workflow_id, annot_dataset_num.name, annot_dataset_num.value 
+--     FROM   fun_call, dataset_io, annot_dataset_num
+--     WHERE  fun_call.id=dataset_io.function_call_id and dataset_io.dataset_id=annot_dataset_num.id and
+--            annot_dataset_num.name=$1
 --   UNION
 --     SELECT fun_call.workflow_id, annot_p_num.name, annot_p_num.value 
 --     FROM   fun_call, annot_p_num
 --     WHERE  fun_call.id=annot_p_num.id and annot_p_num.name=$1
 --   UNION
---     SELECT workflow.id as workflow_id, annot_wf_num.name, annot_wf_num.value 
+--     SELECT script_run.id as workflow_id, annot_wf_num.name, annot_wf_num.value 
 --     FROM   workflow, annot_wf_num
---     WHERE  workflow.id=annot_wf_num.id and annot_wf_num.name=$1
+--     WHERE  script_run.id=annot_wf_num.id and annot_wf_num.name=$1
 -- $$ LANGUAGE SQL;
 
 
@@ -188,57 +188,57 @@
 --   name VARCHAR, 
 --   value VARCHAR) 
 -- AS $$
---     SELECT   fun_call.workflow_id, annot_ds_txt.name, annot_ds_txt.value 
---     FROM     fun_call, ds_usage, annot_ds_txt
---     WHERE    fun_call.id=ds_usage.fun_call_id and ds_usage.dataset_id=annot_ds_txt.id and
---              annot_ds_txt.name=$1
+--     SELECT   fun_call.workflow_id, annot_dataset_txt.name, annot_dataset_txt.value 
+--     FROM     fun_call, dataset_io, annot_dataset_txt
+--     WHERE    fun_call.id=dataset_io.function_call_id and dataset_io.dataset_id=annot_dataset_txt.id and
+--              annot_dataset_txt.name=$1
 --   UNION
 --     SELECT   fun_call.workflow_id, annot_p_txt.name, annot_p_txt.value 
 --     FROM     fun_call, annot_p_txt
 --     WHERE    fun_call.id=annot_p_txt.id and annot_p_txt.name=$1
 --   UNION
---     SELECT   workflow.id as workflow_id, annot_wf_txt.name, annot_wf_txt.value 
+--     SELECT   script_run.id as workflow_id, annot_wf_txt.name, annot_wf_txt.value 
 --     FROM     workflow, annot_wf_txt
---     WHERE    workflow.id=annot_wf_txt.id and annot_wf_txt.name=$1
+--     WHERE    script_run.id=annot_wf_txt.id and annot_wf_txt.name=$1
 -- $$ LANGUAGE SQL;
 
 
-CREATE OR REPLACE FUNCTION compare_run_by_annot_bool(name VARCHAR)
-RETURNS TABLE (
-  workflow_id VARCHAR,
-  name VARCHAR, 
-  value BOOLEAN
-) 
-AS $$
-    SELECT   fun_call.workflow_id, annot_ds_bool.name, annot_ds_bool.value 
-    FROM     fun_call, ds_usage, annot_ds_bool
-    WHERE    fun_call.id=ds_usage.fun_call_id and ds_usage.dataset_id=annot_ds_bool.id and
-             annot_ds_bool.name=$1
-  UNION
-    SELECT   fun_call.workflow_id, annot_p_bool.name, annot_p_bool.value 
-    FROM     fun_call, annot_p_bool
-    WHERE    fun_call.id=annot_p_bool.id and annot_p_bool.name=$1
-  UNION
-    SELECT   workflow.id as workflow_id, annot_wf_bool.name, annot_wf_bool.value 
-    FROM     workflow, annot_wf_bool
-    WHERE    workflow.id=annot_wf_bool.id and annot_wf_bool.name=$1
-$$ LANGUAGE SQL;
+--CREATE OR REPLACE FUNCTION compare_run_by_annot_bool(name VARCHAR)
+--RETURNS TABLE (
+--  workflow_id VARCHAR,
+--  name VARCHAR, 
+--  value BOOLEAN
+--) 
+--AS $$
+--    SELECT   fun_call.workflow_id, annot_dataset_bool.name, annot_dataset_bool.value 
+--    FROM     fun_call, dataset_io, annot_dataset_bool
+--    WHERE    fun_call.id=dataset_io.function_call_id and dataset_io.dataset_id=annot_dataset_bool.id and
+--             annot_dataset_bool.name=$1
+--  UNION
+--    SELECT   fun_call.workflow_id, annot_p_bool.name, annot_p_bool.value 
+--    FROM     fun_call, annot_p_bool
+--    WHERE    fun_call.id=annot_p_bool.id and annot_p_bool.name=$1
+--  UNION
+--    SELECT   script_run.id as workflow_id, annot_wf_bool.name, annot_wf_bool.value 
+--    FROM     workflow, annot_wf_bool
+--   WHERE    script_run.id=annot_wf_bool.id and annot_wf_bool.name=$1
+--$$ LANGUAGE SQL;
 
 
 -- correlate a parameter with workflow runtime statistics
 CREATE OR REPLACE FUNCTION correlate_parameter_runtime(parameter_name VARCHAR) 
 RETURNS TABLE (
-    workflow VARCHAR,  
-    workflow_starttime TIMESTAMP WITH TIME ZONE, 
-    workflow_duration NUMERIC, 
+    run VARCHAR,  
+    starttime TIMESTAMP WITH TIME ZONE, 
+    duration NUMERIC, 
     parameter VARCHAR, 
-    parameter_value VARCHAR
+    value VARCHAR
 ) 
 AS $$
-	SELECT workflow.id,to_timestamp(workflow.start_time),workflow.duration,ds_usage.parameter_name,variable.value
-	FROM   variable,ds_usage,fun_call,workflow
-	WHERE  variable.id=ds_usage.dataset_id AND ds_usage.fun_call_id=fun_call.id AND 
-	       fun_call.workflow_id=workflow.id AND ds_usage.param_name=$1 
+	SELECT script_run.id,script_run.start_time,script_run.duration,dataset_io.parameter,dataset.value
+	FROM   dataset,dataset_io,fun_call,script_run
+	WHERE  dataset.id=dataset_io.dataset_id AND dataset_io.function_call_id=fun_call.id AND 
+	       fun_call.run_id=script_run.id AND dataset_io.parameter=$1 
 $$ LANGUAGE SQL;
 
 -- recursive query to find ancestor entities in a provenance graph
@@ -247,13 +247,13 @@
   WITH RECURSIVE anc(ancestor,descendant) AS
   (    
        SELECT parent AS ancestor, child AS descendant 
-       FROM   parent_of 
+       FROM   provenance_graph_edge 
        WHERE child=$1
      UNION
-       SELECT parent_of.parent AS ancestor, 
+       SELECT provenance_graph_edge.parent AS ancestor, 
               anc.descendant AS descendant
-       FROM   anc,parent_of
-       WHERE  anc.ancestor=parent_of.child
+       FROM   anc, provenance_graph_edge
+       WHERE  anc.ancestor=provenance_graph_edge.child
   )
   SELECT ancestor FROM anc
 $$ LANGUAGE SQL;
@@ -280,15 +280,13 @@
       function_name := 'compare_run_by_parameter';
     WHEN 'annot_num' THEN
       function_name := 'compare_run_by_annot_num';
-    WHEN 'annot_txt' THEN
-      function_name := 'compare_run_by_annot_txt';
-    WHEN 'annot_bool' THEN
-      function_name := 'compare_run_by_annot_bool';
+    WHEN 'annot_text' THEN
+      function_name := 'compare_run_by_annot_text';
     END CASE;
     IF i = 1 THEN
       fromq := function_name || '(''' || property || ''') as t' || i;
     ELSE
-      fromq := fromq || ' INNER JOIN ' || function_name || '(''' || property || ''') as t' || i || ' USING (workflow_id)';
+      fromq := fromq || ' INNER JOIN ' || function_name || '(''' || property || ''') as t' || i || ' USING (run_id)';
     END IF;
   END LOOP;
   q := selectq || ' FROM ' || fromq;

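As an illustration of the renamed interfaces, the comparison functions can be joined on run_id much like the dynamically built query at the end of this file. This is only a sketch against an already populated provenance database; the annotation name 'input_size' and parameter name 'n' are hypothetical:

    -- Hypothetical annotation/parameter names; assumes imported runs.
    SELECT t1.run_id, t1.value AS annotation_value, t2.value AS parameter_value
    FROM   compare_run_by_annot_num('input_size') AS t1
           INNER JOIN compare_run_by_parameter('n') AS t2 USING (run_id);
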
Modified: provenancedb/prepare-provenance-chart
===================================================================
--- provenancedb/prepare-provenance-chart	2012-05-09 22:18:28 UTC (rev 5785)
+++ provenancedb/prepare-provenance-chart	2012-05-10 12:27:54 UTC (rev 5786)
@@ -1,52 +1,22 @@
 #!/bin/bash
 
-# invent an ID. doesn't need to have any meaning. needs to be unique.
-# uuidgen for now.
-# any time we ouptut a thread, prefix the WFID on the front.
-# should we do this in the logging code in swift proper? - it would be a
-# nicer way of getting the runid into the URI
-
 export RUNID=$(basename $1 .log)
-
-export WFID="execute:${RUNID}:"
-export EXECUTE2PREFIX="execute2:${RUNID}:"
-
-# will output log information about datasets from a log file passed as $1
-
+export WFID="${RUNID}:"
+export EXECUTE2PREFIX="${RUNID}:"
 cat $1 | grep ' PARAM ' | sed "s/^.* thread=\([^ ]*\).*direction=\([^ ]*\).*variable=\([^ ]*\).*provenanceid=\([^ ]*\).*\$/${WFID}\1 \2 \4 \3/" > tie-data-invocs.txt
-
-# 2007-12-13 14:29:13,967+0000 INFO  AbstractDataNode dataset 2461363 has child 938665
 cat $1 | grep ' CONTAINMENT ' | sed 's/^.*parent=\([^ ]*\) child=\([^ ]*\)$/\1 \2/' > tie-containers.txt
-
-#AbstractDataNode dataset 3814442 has filename file://localhost/q.out
 cat $1 | grep ' FILENAME ' | sed 's/^.*dataset=\([^ ]*\) filename=\([^ ]*\).*$/\1 \2/' | sort | uniq > dataset-filenames.txt
-
 cat $1 | grep ' VALUE ' | sed 's/^.*dataset=\([^ ]*\) VALUE=\(.*\)$/\1 \2/' | sort | uniq > dataset-values.txt
-
 cat $1 | grep ' PROCEDURE ' | sed "s/^.*thread=\([^ ]*\) name=\([^ ]*\)\$/${WFID}\1 \2/" > invocation-procedure-names.txt
-
-info-to-extrainfo > extrainfo.txt
-
-info-to-runtime > runtime.txt
-
 cat $1 | grep ' OPERATOR ' | sed 's/^.*thread=\([^ ]*\) operator="\([^ ]*\)" lhs=\([^ ]*\) rhs=\([^ ]*\) result=\([^ ]*\).*$/\1 \2 \3 \4 \5/' > operators.txt
-
-
-# 2009-03-19 19:15:35,244+0100 INFO  vdl:arguments FUNCTION id=88000-0-4-4 name="f ilename" result=dataset:20090319-1915-xj8flg 13:720000000060
-# 2009-03-19 19:15:35,246+0100 INFO  vdl:arguments FUNCTIONPARAMETER id=88001-0-4- 4 input=dataset:20090319-1915-xj8flg13:72000 0000058
-
 cat $1 | grep ' FUNCTION ' | sed "s/^.*id=\([^ ]*\) name=\([^ ]*\) result=\([^ ]*\).*\$/$WFID\1 \2 \3/" > functions.txt
-# the IDs in functions.txt should be unique...
-# we could even test that...
-
 cat $1 | grep ' FUNCTIONPARAMETER ' | sed "s/^.*id=\([^ ]*\) input=\([^ ]*\).*\$/$WFID\1 \2/" > function-inputs.txt
-
 cat $1 | grep ' CREATEARRAY START ' | sed 's/^.* array=\([^ ]*\).*$/\1/' > createarray.txt
-
 cat $1 | grep ' CREATEARRAY MEMBER ' | sed 's/^.* array=\([^ ]*\) index=\([^ ]*\) member=\([^ ]*\).*$/\1 \2 \3/' > createarray-members.txt
-
 cat $1 | grep ' ARRAYRANGE ' | sed 's/^.* thread=\([^ ]*\).*$/\1/' > arrayrange.txt
-
-# 2009-05-18 21:19:20,295+0200 INFO  vdl:mains SCOPE thread=0-6-5-1-5
-
 cat $1 | grep ' SCOPE ' | sed 's/^.* thread=\([^ ]*\).*/\1/' > scopes.txt
+awk '/BEGIN SWIFTSCRIPT/,/END SWIFTSCRIPT/{if (!/BEGIN SWIFTSCRIPT/&&!/END SWIFTSCRIPT/)print}' $1 > script.txt
+awk '/BEGIN SITES/,/END SITES/{if (!/BEGIN SITES/&&!/END SITES/)print}' $1 > sites.txt
+awk '/BEGIN TC/,/END TC/{if (!/BEGIN TC/&&!/END TC/)print}' $1 > tc.txt
+info-to-extrainfo > extrainfo.txt
+info-to-runtime > runtime.txt

Modified: provenancedb/prov-init.sql
===================================================================
--- provenancedb/prov-init.sql	2012-05-09 22:18:28 UTC (rev 5785)
+++ provenancedb/prov-init.sql	2012-05-10 12:27:54 UTC (rev 5786)
@@ -1,46 +1,52 @@
 -- this is the schema definition used for the main relational provenance
 -- implementation (in both sqlite3 and postgres)
 
--- base relations
-drop table app_catalog cascade;
-drop table site_catalog cascade;
-drop table script cascade;
-drop table run cascade;
-drop table fun_call cascade;
-drop table app_fun_call cascade;
-drop table app_exec cascade;
+drop view  script_run;
+drop view  function_call;
+drop view  application_execution;
+drop view  runtime_info;
+drop view  dataset;
+drop view  dataset_io;
+drop view  provenance_graph_edge;
+drop table annot_script_run_num cascade;
+drop table annot_script_run_text cascade;
+drop table annot_function_call_num cascade;
+drop table annot_function_call_text cascade;
+drop table annot_app_exec_num cascade;
+drop table annot_app_exec_text cascade;
+drop table annot_dataset_num cascade;
+drop table annot_dataset_text cascade;
 drop table rt_info cascade;
-drop table ds cascade;
+drop table app_exec cascade;
+drop table app_fun_call cascade;
+drop table dataset_in cascade;
+drop table dataset_out cascade;
+drop table fun_call cascade;
+drop table run cascade;
+drop table tc_file cascade;
+drop table sites_file cascade;
+drop table script cascade;
 drop table mapped cascade;
 drop table primitive cascade;
-drop table ds_containment cascade;
-drop table ds_in cascade;
-drop table ds_out cascade;
-drop table annot_run_num cascade;
-drop table annot_run_text cascade;
-drop table annot_fun_call_num cascade;
-drop table annot_fun_call_text cascade;
-drop table annot_app_exec_num cascade;
-drop table annot_app_exec_text cascade;
-drop table annot_ds_num cascade;
-drop table annot_ds_text cascade;
+drop table dataset_containment cascade;
+drop table ds cascade;
 
 -- application_catalog stores tc.file
-create table app_catalog (
+create table tc_file (
        hash_value		 varchar(256) primary key,
        content			 text
 );
 
 -- application_catalog stores tc.file
-create table site_catalog (
-	hash_value		varchar(256) primary key,
-	content			text
+create table sites_file (
+	hash_value      varchar(256) primary key,
+	content         text
 );
-	
+
 -- script stores Swift script source codes
 create table script (
-	hash_value		varchar(256) primary key,
-	content			text
+	hash_value      varchar(256) primary key,
+	content         text
 );
 
 -- run stores information about each script run log that has
@@ -48,122 +54,134 @@
 -- status.
 -- Might be interesting to store xml translation of the Swift script
 -- here for prospective provenance and versioning.
-create table run
-    (
-     id				varchar(256) primary key,
-     log_filename 		varchar(2048),
-     swift_version 		varchar(16),
-     cog_version   		varchar(16),
-     final_state   		varchar(32),
-     start_time    		numeric,
-     duration      		numeric,
-     script_filename		varchar(2048),
-     script_hash		varchar(256) references script (hash_value),
-     application_catalog_hash	varchar(256) references app_catalog (hash_value),
-     site_catalog_hash	   	varchar(256) references site_catalog (hash_value)
+create table run (
+     id                 varchar(256) primary key,
+     log_filename       varchar(2048),
+     swift_version      varchar(16),
+     cog_version        varchar(16),
+     final_state        varchar(32),
+     start_time         numeric,
+     duration           numeric,
+     script_filename    varchar(2048),
+     script_hash        varchar(256) references script (hash_value),
+     tc_file_hash   	varchar(256) references tc_file (hash_value),
+     sites_file_hash    varchar(256) references sites_file (hash_value)
 );
 
+create view script_run as
+  select id, log_filename, swift_version, cog_version, final_state, 
+         to_timestamp(start_time) as start_time, duration, script_filename,
+	 script_hash, tc_file_hash, sites_file_hash
+  from   run;
+
 -- process gives information about each process (in the OPM sense)
 -- it is augmented by information in other tables
 -- specifies the type of process. for any type, it
 -- must be the case that the specific type table
 -- has an entry for this process.
 -- process types: internal, rootthread, execute, function, compound, scope, operator
-create table fun_call
-    (
-     id     varchar(256), 
-     run_id varchar(256) references run (id) on delete cascade,  
-     type   varchar(16),
-     name   varchar(256), -- in the case of an execute this refers to the transformation name in tc.data
-     primary key (id, run_id)
-									   									      	 
+create table fun_call (
+     id      	    varchar(256) primary key, 
+     run_id  	    varchar(256) references run (id) on delete cascade,  
+     type    	    varchar(16),
+     name    	    varchar(256) -- in the case of an execute this refers to the transformation name in tc.data
 );
 
 -- this gives information about each execute.
 -- each execute is identified by a unique URI. other information from
 -- swift logs is also stored here. an execute is an OPM process.
-create table app_fun_call
-    (
-     id			varchar(256),  
-     run_id		varchar(256),
-     name      		varchar(256), -- name of the app procedure that invokes the transformation
-     start_time	    	numeric,
-     duration		numeric,
-     final_state	varchar(32),
-     scratch		varchar(2048),
-     foreign key (id, run_id) references fun_call,
-     primary key (id, run_id)
+create table app_fun_call (
+     id             varchar(256) primary key references fun_call (id),  
+     name      	    varchar(256), -- name of the app procedure that invokes the transformation
+     start_time     numeric,
+     duration       numeric,
+     final_state    varchar(32),
+     scratch        varchar(2048)
 );
 
+create view function_call as 
+    select fun_call.id, fun_call.name, fun_call.type, app_fun_call.name as app_catalog_name, fun_call.run_id as script_run_id,  
+           to_timestamp(app_fun_call.start_time) as start_time, app_fun_call.duration, app_fun_call.final_state, app_fun_call.scratch
+    from
+      fun_call
+    left outer join
+      app_fun_call 
+    on fun_call.id=app_fun_call.id;
+
 -- this gives information about each application execution attempt, including
 -- aggregate resource consumption. the app_exec_id is tied to per-execution-attempt
 -- information such as wrapper logs
-create table app_exec
-    (
-     id                varchar(256),
-     app_fun_call_id   varchar(256), 
-     run_id	       varchar(256),
+create table app_exec (
+     id                varchar(256) primary key,
+     app_fun_call_id   varchar(256) references app_fun_call (id), 
      start_time        numeric,
      duration          numeric,
      final_state       varchar(32),
-     site              varchar(256),
-     maxrss	       numeric,
-     walltime	       numeric,
-     systime	       numeric,
-     usertime	       numeric,
-     cpu	       numeric,
-     fsin	       numeric,
-     fsout	       numeric,
-     timesswpd         numeric,
-     socketrecv	       numeric,
-     socketsent	       numeric,
-     majpfaults        numeric,
-     minpfaults        numeric,
-     ctxswinv	       numeric,
-     ctxswvol	       numeric,
-     foreign key(app_fun_call_id, run_id) references app_fun_call,
-     primary key(id, app_fun_call_id, run_id)
+     site              varchar(256)
 );
+--     maxrss            numeric,
+--     walltime          numeric,
+--     systime           numeric,
+--     usertime          numeric,
+--     cpu               numeric,
+--     fsin              numeric,
+--     fsout             numeric,
+--     timesswpd         numeric,
+--     socketrecv        numeric,
+--     socketsent        numeric,
+--     majpfaults        numeric,
+--     minpfaults        numeric,
+--     ctxswinv          numeric,
+--     ctxswvol          numeric,
 
+create view application_execution as
+  select id, app_fun_call_id as function_call_id, to_timestamp(start_time) as start_time, duration, final_state, site
+  from   app_exec;
+
+
 -- app execution runtime info extracted from the /proc filesystem (assumes the app executed
 -- in a Linux host) 
-create table rt_info
-   ( 
-     app_exec_id        varchar(256), 
-     app_fun_call_id    varchar(256), 
-     run_id	        varchar(256),    
+create table rt_info ( 
+     app_exec_id        varchar(256) references app_exec (id), 
      timestamp		numeric,
      cpu_usage          numeric,
      max_phys_mem	numeric,
      max_virt_mem	numeric,
      io_read		numeric,
      io_write		numeric,
-     foreign key (app_exec_id, app_fun_call_id, run_id) references app_exec,
-     primary key (app_exec_id, app_fun_call_id, run_id, timestamp)
+     primary key (app_exec_id, timestamp)
 );
 
+create view runtime_info as
+  select app_exec_id, 
+	 to_timestamp(timestamp) as timestamp, 
+	 cpu_usage, 
+	 max_phys_mem, 
+	 max_virt_mem, 
+	 io_read, 
+	 io_write
+  from rt_info;
+
 -- ds stores all dataset identifiers.
-create table ds
-    (
+create table ds (
       id	 varchar(256) primary key
-    );
+);
 
 -- file stores the filename mapped to each dataset. 
-create table mapped
-    ( 
+create table mapped ( 
       id	 varchar(256) primary key references ds (id) on delete cascade,
       filename   varchar(2048)
-    );
+);
 
 -- dataset_values stores the value for each dataset which is known to have
 -- a value (which is all assigned primitive types). No attempt is made here
 -- to expose that value as an SQL type other than a string, and so (for
 -- example) SQL numerical operations should not be expected to work, even
 -- though the user knows that a particular dataset stores a numeric value.
-create table primitive
-    ( id    varchar(256) primary key references ds (id) on delete cascade,
-      value varchar(2048)
-    );
+create table primitive ( 
+       id    varchar(256) primary key references ds (id) on delete cascade,
+       value varchar(2048)
+);
 
 -- dataset_containment stores the containment hierarchy between
 -- container datasets (arrays and structs) and their contents.
@@ -172,215 +190,108 @@
 -- constructors and accessors, rather than, or in addition to,
 -- a containment hierarchy. The relationship (such as array index or
 -- structure member name) should also be stored in this table.
-create table ds_containment
-    ( out_id varchar(256) references ds (id) on delete cascade,
-      in_id  varchar(256) references ds (id) on delete cascade,
-      primary key (out_id,in_id)
-    );
+create table dataset_containment ( 
+       out_id varchar(256) references ds (id) on delete cascade,
+       in_id  varchar(256) references ds (id) on delete cascade,
+       primary key (out_id,in_id)
+);
 
+create view dataset as 
+    select mapped.id, 'mapped' as type, mapped.filename, null as value
+    from mapped
+  union all 
+    select primitive.id, 'primitive' as type, null as filename, primitive.value
+    from primitive
+  union all
+    select dataset_containment.out_id as id, 'composite' as type, null as filename, null as value 
+    from dataset_containment;
 
 -- dataset_usage records usage relationships between processes and datasets;
 -- in SwiftScript terms, the input and output parameters for each
 -- application procedure invocation; in OPM terms, the artificts which are
 -- input to and output from each process that is a Swift execution
-create table ds_in
-    (
-     fun_call_id	varchar(256), 
-     run_id		varchar(256),
-     ds_id   		varchar(256) references ds (id) on delete cascade,
-     parameter   	varchar(256), -- the name of the parameter in this execute that
-                             	      -- this dataset was bound to. sometimes this must
-                              	      -- be contrived (for example, in positional varargs)
-     foreign key (fun_call_id, run_id) references fun_call,
-     primary key (fun_call_id, run_id, ds_id, parameter)
-    );
+create table dataset_in (
+       function_call_id	varchar(256) references fun_call (id), 
+       dataset_id   	varchar(256) references ds (id) on delete cascade,
+       parameter   	varchar(256), -- the name of the parameter in this execute that
+                             	  -- this dataset was bound to. sometimes this must
+                              	  -- be contrived (for example, in positional varargs)
+       primary key (function_call_id, dataset_id, parameter)
+ );
 
-create table ds_out
-    (
-     fun_call_id	varchar(256), 
-     run_id		varchar(256),
-     ds_id   		varchar(256) references ds (id) on delete cascade,
-     parameter   	varchar(256), -- the name of the parameter in this execute that
-                              	      -- this dataset was bound to. sometimes this must
-                              	      -- be contrived (for example, in positional varargs)
-     foreign key (fun_call_id, run_id) references fun_call,
-     primary key (fun_call_id, run_id, ds_id, parameter)
-    );
+create table dataset_out (
+       function_call_id	varchar(256) references fun_call (id), 
+       dataset_id   	varchar(256) references ds (id) on delete cascade,
+       parameter   	varchar(256), -- the name of the parameter in this execute that
+                              	  -- this dataset was bound to. sometimes this must
+                              	  -- be contrived (for example, in positional varargs)
+       primary key (function_call_id, dataset_id, parameter)
+);
 
+create view dataset_io as
+ select dataset_in.function_call_id, dataset_in.dataset_id, dataset_in.parameter, 'I' as type
+ from   dataset_in
+union all 
+ select dataset_out.function_call_id, dataset_out.dataset_id, dataset_out.parameter, 'O' as type
+ from   dataset_out;
 
--- annotations 
-create table annot_ds_num
-   ( ds_id varchar(256) references ds (id) on delete cascade, 
-     name  varchar(256),
-     value numeric,
-     primary key (ds_id, name)
-   );
+create table annot_script_run_num ( 
+       script_run_id    varchar(256) references run (id) on delete cascade, 
+       name      varchar(256),
+       value     numeric,
+       primary key (script_run_id, name)
+);
 
-create table annot_ds_text
-   ( ds_id varchar(256) references ds (id) on delete cascade, 
-     name  varchar(256),
-     value varchar(2048),
-     primary key (ds_id, name)
-   );
-
-create table annot_fun_call_num
-   ( fun_call_id	varchar(256), 
-     run_id	        varchar(256),    
-     name       	varchar(256),
-     value      	numeric,
-     foreign key (fun_call_id, run_id) references fun_call,
-     primary key (fun_call_id, run_id, name)
-   );
-
-create table annot_fun_call_text
-   ( fun_call_id	varchar(256), 
-     run_id	        varchar(256),    
-     name       	varchar(256),
-     value      	varchar(2048),
-     foreign key (fun_call_id, run_id) references fun_call,
-     primary key (fun_call_id, run_id, name)
-   );
-
-create table annot_run_num
-   ( run_id    varchar(256) references run (id) on delete cascade, 
+create table annot_script_run_text ( script_run_id    varchar(256) references run (id) on delete cascade, 
      name      varchar(256),
-     value     numeric,
-     primary key (run_id, name)
-   );
-
-create table annot_run_text
-   ( run_id    varchar(256) references run (id) on delete cascade, 
-     name      varchar(256),
      value     varchar(2048),
-     primary key (run_id, name)
-   );
+     primary key (script_run_id, name)
+);
 
-create table annot_app_exec_num
-   ( id			varchar(256), 
-     app_fun_call_id    varchar(256), 
-     run_id	        varchar(256),    
-     name      		varchar(256),
-     value     		numeric,
-     foreign key (id, app_fun_call_id, run_id) references app_exec,
-     primary key (id, app_fun_call_id, run_id, name)
-   );
+create table annot_function_call_num ( 
+       function_call_id	varchar(256) references fun_call (id) on delete cascade, 
+       name       	varchar(256),
+       value      	numeric,
+       primary key (function_call_id, name)
+);
 
-create table annot_app_exec_text
-   ( id			varchar(256), 
-     app_fun_call_id    varchar(256), 
-     run_id	        varchar(256),    
-     name      		varchar(256),
-     value     		varchar(2048),
-     foreign key (id, app_fun_call_id, run_id) references app_exec,
-     primary key (id, app_fun_call_id, run_id, name)
-   );
+create table annot_function_call_text ( 
+       function_call_id	varchar(256) references fun_call (id) on delete cascade, 
+       name       	varchar(256),
+       value      	varchar(2048),
+       primary key (function_call_id, name)
+);
 
+create table annot_app_exec_num ( 
+       app_exec_id            varchar(256) references app_exec (id) on delete cascade, 
+       name      		    varchar(256),
+       value     		    numeric,
+       primary key (app_exec_id, name)
+);
 
--- create table iq
---    ( idx      serial primary key,
---     q        varchar(2048)
---   );
-drop view ds_io;
-create view ds_io as
- select ds_in.fun_call_id as function_call_id, ds_in.ds_id as variable_id, ds_in.parameter
- from   ds_in
-union all 
- select ds_out.fun_call_id as function_call_id, ds_out.ds_id as variable_id, ds_out.parameter
- from   ds_out;
+create table annot_app_exec_text ( 
+       app_exec_id            varchar(256) references app_exec (id) on delete cascade, 
+       name      		    varchar(256),
+       value     		    varchar(2048),
+       primary key (app_exec_id, name)
+);
 
-drop view pgraph_edge;
-create view pgraph_edge as 
-       select fun_call_id as parent,ds_id as child from ds_out
-       union all
-       select ds_id as parent,fun_call_id as child from ds_in
-       union all
-       select out_id as parent,in_id as child from ds_containment;
+create table annot_dataset_num ( 
+       dataset_id varchar(256) references ds (id) on delete cascade, 
+       name  varchar(256),
+       value numeric,
+       primary key (dataset_id, name)
+);
 
-drop view annot_text cascade;
-create view annot_text as
-    select *
-    from annot_run_text 
-  union all
-    select * 
-    from annot_ds_text 
-  union all 
-    select * 
-    from annot_fun_call_text;
+create table annot_dataset_text( 
+       dataset_id varchar(256) references ds (id) on delete cascade, 
+       name  varchar(256),
+       value varchar(2048),
+       primary key (dataset_id, name)
+);
 
-drop view annot_num cascade;
-create view annot_num as
-    select *
-    from annot_run_num 
-  union all
-    select * 
-    from annot_ds_num 
-  union all 
-    select * 
-    from annot_fun_call_num;
-
--- views used for queries based on the schema summary
-
-drop view function_call;
-
-create view function_call as 
-    select fun_call.id, fun_call.name as name, fun_call.type, app_fun_call.name as app_catalog_name, fun_call.run_id as script_run_id,  
-           to_timestamp(app_fun_call.start_time) as start_time, app_fun_call.duration, app_fun_call.final_state, app_fun_call.scratch 
-    from fun_call 
-    left outer join 
-    app_fun_call on fun_call.id=app_fun_call.id;
-
-drop view variable;
-
-create view variable as 
-    select mapped.id, 'mapped' as type, mapped.filename, null as value
-    from mapped
-  union all 
-    select primitive.id, 'primitive' as type, null as filename, primitive.value
-    from primitive
-  union all
-    select ds_containment.out_id as id, 'composite' as type, null as filename, null as value 
-    from ds_containment;
-
-drop view annotation;
-
-create view annotation as 
-    select annot_text.run_id as id, annot_text.name as key, annot_text.value as string_value, null as numeric_value
-    from annot_text
-  union all
-    select annot_num.run_id as id, annot_num.name as key, null as string_value, annot_num.value as numeric_value
-    from annot_num;
-   
-drop view script_run;
-
-create view script_run as
-  select id, log_filename, swift_version, cog_version, final_state, 
-         to_timestamp(start_time) as start_time, duration
-  from   run;
-
-drop view application_execution;
-
-create view application_execution as
-  select id, app_fun_call_id as function_call_id, to_timestamp(start_time) as start_time, duration, final_state, site
-  from   app_exec;
-
-drop view runtime_info;
-
-create view runtime_info as
-  select app_exec_id as application_execution_id, to_timestamp(timestamp) as timestamp, cpu_usage, max_phys_mem, max_virt_mem, io_read, io_write
-  from rt_info;
-
-drop view produces;
-
-create view produces as
-  select fun_call_id as function_call_id, ds_id as variable_id, parameter from ds_out;
-
-drop view consumes;
-
-create view consumes as
-  select fun_call_id as function_call_id, ds_id as variable_id, parameter from ds_in;
-
-drop view variable_containment;
-
-create view variable_containment as
-  select out_id as container, in_id as containee
-  from   ds_containment;
+create view provenance_graph_edge as 
+       select function_call_id as parent, dataset_id as child from dataset_out
+    union all
+       select dataset_id as parent, function_call_id as child from dataset_in;
+	

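A sketch of how the new views are meant to be queried together, assuming a populated database; the run identifier below is hypothetical:

    -- Datasets consumed ('I') and produced ('O') by each function call of one run.
    SELECT fc.name, io.type AS direction, io.parameter, d.filename, d.value
    FROM   function_call fc
           JOIN dataset_io io ON io.function_call_id = fc.id
           JOIN dataset d     ON d.id = io.dataset_id
    WHERE  fc.script_run_id = 'run-id-goes-here';  -- hypothetical run id
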
Modified: provenancedb/prov-to-sql.sh
===================================================================
--- provenancedb/prov-to-sql.sh	2012-05-09 22:18:28 UTC (rev 5785)
+++ provenancedb/prov-to-sql.sh	2012-05-10 12:27:54 UTC (rev 5786)
@@ -2,7 +2,7 @@
 
 export RUNID=$(basename $1 .log)
 
-export WFID="execute:${RUNID}:"
+export WFID="${RUNID}:"
 
 # TODO is there already a URI form for identifying workflows?
 export WF="${RUNID}"
@@ -16,16 +16,18 @@
 
 echo "    - Function calls."
 while read time duration thread localthread endstate tr_name scratch; do
-    echo "INSERT INTO fun_call (id, type, run_id) VALUES ('$localthread', 'execute', '$WF');"  >> /tmp/$RUNID.sql
-    echo "INSERT INTO app_fun_call (id, run_id, name, start_time, duration, final_state, scratch) VALUES ('$localthread', '$WF', '$tr_name', $time, $duration, '$endstate', '$scratch');"   >> /tmp/$RUNID.sql
+    id=$(echo "$thread" | sed "s/execute\://")
+    echo "INSERT INTO fun_call (id, type, run_id) VALUES ('$id', 'execute', '$WF');"  >> /tmp/$RUNID.sql
+    echo "INSERT INTO app_fun_call (id, name, start_time, duration, final_state, scratch) VALUES ('$id', '$tr_name', $time, $duration, '$endstate', '$scratch');"   >> /tmp/$RUNID.sql
 done < execute.global.event
 
 echo "    - Application executions."
 while read start_time duration globalid id endstate thread site scratch; do
     # cut off the last component of the thread, so that we end up at the
     # parent thread id which should correspond with the execute-level ID
-    inv_id="$(echo $thread | sed 's/-[^-]*$//')"
-    echo  "INSERT INTO app_exec (id, app_fun_call_id, run_id, start_time, duration, final_state, site) VALUES ('$id', '$inv_id', '$WF', $start_time, $duration, '$endstate', '$site');"  >> /tmp/$RUNID.sql
+    inv_id="$WFID$(echo $thread | sed 's/-[^-]*$//')"
+    eid=$(echo "$globalid" | sed "s/execute2\://")   
+    echo  "INSERT INTO app_exec (id, app_fun_call_id, start_time, duration, final_state, site) VALUES ('$eid', '$inv_id', $start_time, $duration, '$endstate', '$site');"  >> /tmp/$RUNID.sql
 done < execute2.global.event
 
 echo "    - Mapped variables."
@@ -44,10 +46,11 @@
 while read outer inner; do
     echo  "INSERT INTO ds (id) VALUES ('$outer');"  >> /tmp/$RUNID.sql
     echo  "INSERT INTO ds (id) VALUES ('$inner');"  >> /tmp/$RUNID.sql
-    echo  "INSERT INTO ds_containment (out_id, in_id) VALUES ('$outer', '$inner');"  >> /tmp/$RUNID.sql
-    echo  "INSERT INTO fun_call (id, type, name, run_id) VALUES ('constructor:$outer', 'constructor', 'constructor', '$WF');"  >> /tmp/$RUNID.sql
-    echo  "INSERT INTO ds_in (fun_call_id, ds_id, parameter) VALUES ('constructor:$outer', '$inner', 'element');"  >> /tmp/$RUNID.sql
-    echo  "INSERT INTO ds_out (fun_call_id, ds_id, parameter) VALUES ('constructor:$outer', '$outer', 'collection');"  >> /tmp/$RUNID.sql
+    echo  "INSERT INTO dataset_containment (out_id, in_id) VALUES ('$outer', '$inner');"  >> /tmp/$RUNID.sql
+    cid=$(echo $outer | awk -F "-" '{print $3}')
+    echo  "INSERT INTO fun_call (id, type, name, run_id) VALUES ('${WFID}$cid', 'constructor', 'constructor', '$WF');"  >> /tmp/$RUNID.sql
+    echo  "INSERT INTO dataset_in (function_call_id, dataset_id, parameter) VALUES ('${WFID}$cid', '$inner', 'element');"  >> /tmp/$RUNID.sql
+    echo  "INSERT INTO dataset_out (function_call_id, dataset_id, parameter) VALUES ('${WFID}$cid', '$outer', 'collection');"  >> /tmp/$RUNID.sql
 done < tie-containers.txt
 
 echo "    - Operator calls."
@@ -58,29 +61,27 @@
     rhs=$(echo $rhs | awk 'BEGIN { FS = "=" }; {print $2}')
     result=$(echo $result | awk 'BEGIN { FS = "=" }; {print $2}')
     
-    operatorid="operator:$thread"
+    operatorid="${WFID}operator:$thread"
     
     echo  "INSERT INTO ds (id) VALUES ('$lhs');" >> /tmp/$RUNID.sql
     echo  "INSERT INTO ds (id) VALUES ('$rhs');" >> /tmp/$RUNID.sql
     echo  "INSERT INTO ds (id) VALUES ('$result');" >> /tmp/$RUNID.sql
     echo  "INSERT INTO fun_call (id, type, name, run_id) VALUES ('$operatorid', 'operator', '$name', '$WF');"  >> /tmp/$RUNID.sql
-    echo  "INSERT INTO ds_in (fun_call_id, ds_id, parameter) VALUES ('$operatorid', '$lhs', 'lhs');"  >> /tmp/$RUNID.sql
-    echo  "INSERT INTO ds_in (fun_call_id, ds_id, parameter) VALUES ('$operatorid', '$rhs', 'rhs');"  >> /tmp/$RUNID.sql
-    echo  "INSERT INTO ds_out (fun_call_id, ds_id, parameter) VALUES ('$operatorid', '$result', 'result');"  >> /tmp/$RUNID.sql
+    echo  "INSERT INTO dataset_in (function_call_id, dataset_id, parameter) VALUES ('$operatorid', '$lhs', 'lhs');"  >> /tmp/$RUNID.sql
+    echo  "INSERT INTO dataset_in (function_call_id, dataset_id, parameter) VALUES ('$operatorid', '$rhs', 'rhs');"  >> /tmp/$RUNID.sql
+    echo  "INSERT INTO dataset_out (function_call_id, dataset_id, parameter) VALUES ('$operatorid', '$result', 'result');"  >> /tmp/$RUNID.sql
 done < operators.txt
 
 echo "    - Built-in function calls."
 while read id name output; do
-    fid=$(echo $id | awk -F ":" '{print $3}')
     echo  "INSERT INTO ds (id) VALUES ('$output');"  >> /tmp/$RUNID.sql
-    echo  "INSERT INTO fun_call (id, type, name, run_id) VALUES ('$fid', 'function', '$name', '$WF');"  >> /tmp/$RUNID.sql
-    echo  "INSERT INTO ds_out (fun_call_id, ds_id, parameter) VALUES ('$fid', '$output', 'result');"  >> /tmp/$RUNID.sql
+    echo  "INSERT INTO fun_call (id, type, name, run_id) VALUES ('$id', 'function', '$name', '$WF');"  >> /tmp/$RUNID.sql
+    echo  "INSERT INTO dataset_out (function_call_id, dataset_id, parameter) VALUES ('$id', '$output', 'result');"  >> /tmp/$RUNID.sql
 done < functions.txt
 
 while read id value; do
-    fid=$(echo $id | awk -F ":" '{print $3}')
     echo  "INSERT INTO ds (id) VALUES ('$value');" >> /tmp/$RUNID.sql
-    echo  "INSERT INTO ds_in (fun_call_id, ds_id, parameter) VALUES ('$fid', '$value', 'undefined');"  >> /tmp/$RUNID.sql
+    echo  "INSERT INTO dataset_in (function_call_id, dataset_id, parameter) VALUES ('$id', '$value', 'undefined');"  >> /tmp/$RUNID.sql
 done < function-inputs.txt
 
 
@@ -95,87 +96,99 @@
 echo "    - Compound functions."
 while read start duration thread final_state procname ; do
     if [ "$duration" != "last-event-line" ]; then
-	echo "INSERT INTO fun_call (id, type, name, run_id) VALUES ('$thread', 'compound', '$procname', '$WF');"  >> /tmp/$RUNID.sql
+	compoundid=$WFID$thread
+	echo "INSERT INTO fun_call (id, type, name, run_id) VALUES ('$compoundid', 'compound', '$procname', '$WF');"  >> /tmp/$RUNID.sql
     fi
 done < compound.event
 
 while read start duration thread final_state procname ; do
     if [ "$duration" != "last-event-line" ]; then
-	echo "INSERT INTO fun_call (id, type, name, run_id) VALUES ('$thread', 'internal', '$procname', '$WF');"  >> /tmp/$RUNID.sql
+	fqid=$WFID$thread
+	echo "INSERT INTO fun_call (id, type, name, run_id) VALUES ('$fqid', 'internal', '$procname', '$WF');"  >> /tmp/$RUNID.sql
     fi	
 done < internalproc.event
 
 while read t ; do 
-    echo "INSERT INTO fun_call (id, type, name, run_id) VALUES ('$t', 'scope', 'scope', '$WF');"  >> /tmp/$RUNID.sql
+    thread="${WFID}$t"
+    echo "INSERT INTO fun_call (id, type, name, run_id) VALUES ('$thread', 'scope', 'scope', '$WF');"  >> /tmp/$RUNID.sql
 done < scopes.txt
 
-echo "    - Variable consumption and production."
+echo "    - Dataset consumption and production."
 while read thread direction dataset variable rest; do 
-    fid=$(echo $thread | awk -F ":" '{print $3}')
     if [ "$direction" == "input" ] ; then
-	table=ds_in
+	table=dataset_in
     else
-	table=ds_out
+	table=dataset_out
     fi
     
     echo "INSERT INTO ds (id) VALUES ('$dataset');"  >> /tmp/$RUNID.sql
-    echo "INSERT INTO $table (fun_call_id, ds_id, parameter) VALUES ('$fid', '$dataset', '$variable');"  >> /tmp/$RUNID.sql
+    echo "INSERT INTO $table (function_call_id, dataset_id, parameter) VALUES ('$thread', '$dataset', '$variable');"  >> /tmp/$RUNID.sql
 done < tie-data-invocs.txt
 
 
 echo "    - Wrapper log resource consumption info."
 if [ -f runtime.txt ]; then
     while read execute2_id runtime; do
-	#timestamp=$(echo $runtime | awk -F "," '{print $1}' | awk -F ":" '{print $2}')
-	#cpu_usage=$(echo $runtime | awk -F "," '{print $2}' | awk -F ":" '{print $2}')
-	#max_phys_mem=$(echo $runtime | awk -F "," '{print $3}' | awk -F ":" '{print $2}')
-	#max_virtual_mem=$(echo $runtime | awk -F "," '{print $4}' | awk -F ":" '{print $2}')
-	#io_read_bytes=$(echo $runtime | awk -F "," '{print $5}' | awk -F ":" '{print $2}')
-	#io_write_bytes=$(echo $runtime | awk -F "," '{print $6}' | awk -F ":" '{print $2}')
-	#echo "INSERT INTO rt_info (app_exec_id, tstamp, cpu_usage, max_phys_mem, max_virt_mem, io_read, io_write) VALUES ('$execute2_id', $timestamp, $cpu_usage, $max_phys_mem, $max_virtual_mem, $io_read_bytes, $io_write_bytes);"  >> /tmp/$RUNID.sql
-	maxrss=$(echo $runtime | awk -F "," '{print $1}' | awk -F ":" '{print $2}')
-	walltime=$(echo $runtime | awk -F "," '{print $2}' | awk -F ":" '{print $2}')
-	systime=$(echo $runtime | awk -F "," '{print $3}' | awk -F ":" '{print $2}')
-	usertime=$(echo $runtime | awk -F "," '{print $4}' | awk -F ":" '{print $2}')
-	cpu=$(echo $runtime | awk -F "," '{print $5}' | awk -F ":" '{print $2}' | awk -F "%" '{print $1}')
-	fsin=$(echo $runtime | awk -F "," '{print $6}' | awk -F ":" '{print $2}')
-	fsout=$(echo $runtime | awk -F "," '{print $7}' | awk -F ":" '{print $2}')
-	timesswapped=$(echo $runtime | awk -F "," '{print $8}' | awk -F ":" '{print $2}')
-	socketrecv=$(echo $runtime | awk -F "," '{print $9}' | awk -F ":" '{print $2}')
-	socketsent=$(echo $runtime | awk -F "," '{print $10}' | awk -F ":" '{print $2}')
-	majorpagefaults=$(echo $runtime | awk -F "," '{print $11}' | awk -F ":" '{print $2}')
-	minorpagefaults=$(echo $runtime | awk -F "," '{print $12}' | awk -F ":" '{print $2}')
-	contextswitchesinv=$(echo $runtime | awk -F "," '{print $13}' | awk -F ":" '{print $2}')
-	contextswitchesvol=$(echo $runtime | awk -F "," '{print $14}' | awk -F ":" '{print $2}')
-	echo "UPDATE app_exec SET maxrss=$maxrss, walltime=$walltime, systime=$systime, usertime=$usertime, cpu=$cpu, fsin=$fsin, fsout=$fsout, timesswpd=$timesswapped, socketrecv=$socketrecv, socketsent=$socketsent, majpfaults=$majorpagefaults, minpfaults=$minorpagefaults, ctxswinv=$contextswitchesinv, ctxswvol=$contextswitchesvol where id='$execute2_id' and run_id='$WF';"  >> /tmp/$RUNID.sql
+	for key in $(echo maxrss walltime systime usertime cpu fsin fsout timesswapped socketrecv socketsent majorpagefaults minorpagefaults contextswitchesinv contextswitchesvol); do
+	    value=$(echo $runtime | awk -F "," '{print $1}' | awk -F ":" '{print $2}')
+	    echo "INSERT INTO annot_app_exec_num VALUES ('$execute2_id','$key',$value)"  >>  /tmp/$RUNID.sql
+	done
     done < runtime.txt
 fi
 
 echo "    - Function call names."
 while read thread appname; do
-    fid=$(echo $thread | awk -F ":" '{print $3}')
-    echo  "UPDATE fun_call SET name='$appname' WHERE id='$fid';"  >> /tmp/$RUNID.sql
+    echo  "UPDATE fun_call SET name='$appname' WHERE id='$thread';"  >> /tmp/$RUNID.sql
 done < invocation-procedure-names.txt
 
 echo "    - Wrapper log extra info."
 if [ -f extrainfo.txt ]; then
     while read execute2_id extrainfo; do
-	eid=$(echo $execute2_id | awk -F ":" '{print $3}')
 	echo $extrainfo | awk -F ";"  '{ for (i = 1; i <= NF; i++)
                                                print $i
                                          }' | awk -F "=" '{ print $1 " " $2 }' | awk -F ":" '{ print $1 " " $2 }' > fields.txt
-	fid=$($SQLCMD --tuples-only -c "select app_fun_call_id from app_exec where id='$eid' and run_id='$WF';" | awk '{print $1}')
+	id=$($SQLCMD --tuples-only -c "select app_fun_call_id from app_exec where id='$execute2_id';" | awk '{print $1}')
 	while read name type value; do
 	    if [ "$type" = "num" ]; then
-		echo "INSERT INTO annot_app_exec_num (id, fun_call_id, run_id, name, value) VALUES ('$eid', '$fid', '$name', $value);"  >> /tmp/$RUNID.sql
+		echo "INSERT INTO annot_app_exec_num (id, name, value) VALUES ('$id', '$name', $value);"  >> /tmp/$RUNID.sql
 	    fi 
 	    if [ "$type" = "txt" ]; then
-		echo "INSERT INTO annot_app_exec_text (id, fun_call_id, run_id, name, value) VALUES ('$eid', '$fid', '$name', '$value');"  >> /tmp/$RUNID.sql
+		echo "INSERT INTO annot_app_exec_text (id, name, value) VALUES ('$id', '$name', '$value');"  >> /tmp/$RUNID.sql
 	    fi
 	done < fields.txt
     done < extrainfo.txt
 fi
 
+echo "    - Prospective provenance (script, tc, sites)."
+script_hash=$(openssl dgst -sha1  script.txt | awk  '{ print $2 }')
+EXISTING=$($SQLCMD --tuples-only -c "select count(*) from script where hash_value='$script_hash';")
+if [ "$EXISTING" -eq "0" ];  then
+    content=$(cat script.txt | sed "s/'/''/g")
+    echo "INSERT INTO script VALUES ('$script_hash', '$content');" >> /tmp/$RUNID.sql
+fi
+echo "UPDATE run SET script_hash='$script_hash' WHERE id='$WF';" >> /tmp/$RUNID.sql
+
+
+    
+tc_hash=$(openssl dgst -sha1  tc.txt | awk  '{ print $2 }')
+EXISTING=$($SQLCMD --tuples-only -c "select count(*) from tc_file where hash_value='$tc_hash';")
+if [ "$EXISTING" -eq "0" ];  then
+    content=$(cat tc.txt | sed "s/'/''/g")
+    echo "INSERT INTO tc_file VALUES ('$tc_hash', '$content');" >> /tmp/$RUNID.sql
+fi
+echo "UPDATE run SET tc_file_hash='$tc_hash' WHERE id='$WF';" >> /tmp/$RUNID.sql
+
+
+sites_hash=$(openssl dgst -sha1  sites.txt | awk  '{ print $2 }')
+EXISTING=$($SQLCMD --tuples-only -c "select count(*) from sites_file where hash_value='$sites_hash';")
+if [ "$EXISTING" -eq "0" ];  then
+    content=$(cat sites.txt | sed "s/'/''/g")
+    echo "INSERT INTO sites_file VALUES ('$sites_hash', '$content');" >> /tmp/$RUNID.sql
+fi
+echo "UPDATE run SET sites_file_hash='$sites_hash' WHERE id='$WF';" >> /tmp/$RUNID.sql
+
 echo "Finished SQL generation."
 echo "Exporting provenance to database..."
 $SQLCMD -f /tmp/$RUNID.sql 1> /dev/null 2> /tmp/$RUNID-provenancedb-error.log
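
For reference, the content-addressed pattern used above for script.txt, tc.txt and
sites.txt can be exercised on its own. A minimal sketch, assuming SQLCMD points at a
psql-style client and that the script table created by prov-init.sql already exists
(the provdb database name is only an example):

    #!/bin/bash
    # Store a file in the script table keyed by its SHA-1, inserting it only once.
    SQLCMD=${SQLCMD:-"psql -d provdb"}   # assumption: psql client and a provdb database
    file=script.txt

    hash=$(openssl dgst -sha1 "$file" | awk '{print $2}')
    existing=$($SQLCMD --tuples-only -c "select count(*) from script where hash_value='$hash';")

    if [ "$existing" -eq 0 ]; then
        # double single quotes so the file body survives SQL string quoting
        content=$(sed "s/'/''/g" "$file")
        $SQLCMD -c "INSERT INTO script VALUES ('$hash', '$content');"
    fi
    echo "$file stored under hash $hash"

Runs that reuse an unmodified script, tc or sites file therefore share a single
stored copy, and each run row only records the corresponding hash.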

Modified: provenancedb/swift-prov-import-all-logs
===================================================================
--- provenancedb/swift-prov-import-all-logs	2012-05-09 22:18:28 UTC (rev 5785)
+++ provenancedb/swift-prov-import-all-logs	2012-05-10 12:27:54 UTC (rev 5786)
@@ -31,16 +31,16 @@
     $SQLCMD < $PROVDIR/prov-init.sql
 fi
 
-while read start version filename; do
+while read start version cogversion filename; do
     
     export IDIR=$(echo $filename | sed 's/\.log$/.d/')
     COG_VERSION=$(grep -m 1 -E 'Swift .* swift-r[0-9]*' $filename | sed 's/.*Swift .* cog-r\([0-9]*\).*/\1/')
     echo IDIR=$IDIR
     if [ $version -ge 1538 ]; then
 	echo -n "Log: $filename ... "
-	
         # TODO: does not work in sqlite
-	EXISTING=$($SQLCMD --tuples-only -c "select count(*) from run where log_filename='$filename';")
+	fname=$(echo $filename | sed -e 's:[^\`].*/::')
+	EXISTING=$($SQLCMD --tuples-only -c "select count(*) from run where log_filename='$fname';")
 	
 	if [ "$EXISTING" -eq "0" ];  then
 	    PROV_ENABLED=$(grep provenanceid $filename | wc -l)
@@ -54,11 +54,11 @@
 		fi
 		
 		export RUNID=$(basename $filename .log)
-		
+		source_file=$(echo $fname | sed "s/-[0-9]*-[0-9]*-[0-9a-z]*\.log$/\.swift/")	
 		export WF="${RUNID}"
 		
 		#echo "BEGIN TRANSACTION;" > /tmp/$WF.sql
-		echo "INSERT INTO run (id, log_filename, swift_version, cog_version, final_state) VALUES ('$WF','$filename','$version', '$COG_VERSION', '$wfstatus');" >> /tmp/$WF.sql
+		echo "INSERT INTO run (id, log_filename, script_filename, swift_version, cog_version, final_state) VALUES ('$WF','$fname','$source_file','$version', '$COG_VERSION', '$wfstatus');" >> /tmp/$WF.sql
 		
 		echo version $version in log file $filename
 		echo ============= will import =============
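
The import loop above now expects four fields per line of everylog-vs-versions.data
(start time, Swift revision, cog revision, log path) and derives the run identifier,
the stored log filename and the source script name from the log path. A minimal
sketch of that derivation, using an illustrative log path:

    #!/bin/bash
    # One data line in the format written by create-everylog-vs-versions-data
    # (all values below are made up for illustration).
    line="1336652874 5786 3398 /home/user/runs/myscript-20120510-1200-abc123xy.log"

    read start version cogversion filename <<< "$line"

    RUNID=$(basename "$filename" .log)      # myscript-20120510-1200-abc123xy
    fname=$(basename "$filename")           # myscript-20120510-1200-abc123xy.log
    source_file=$(echo "$fname" | sed 's/-[0-9]*-[0-9]*-[0-9a-z]*\.log$/.swift/')
    echo "run=$RUNID log=$fname script=$source_file swift=r$version cog=r$cogversion"

(The script itself strips the directory with a sed expression rather than basename;
for ordinary paths the result is the same.)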

Added: provenancedb/swift_mod/_swiftwrap_runtime_aggregate
===================================================================
--- provenancedb/swift_mod/_swiftwrap_runtime_aggregate	                        (rev 0)
+++ provenancedb/swift_mod/_swiftwrap_runtime_aggregate	2012-05-10 12:27:54 UTC (rev 5786)
@@ -0,0 +1,605 @@
+#!/bin/bash
+# this script must be invoked inside of bash, not plain sh
+# note that this script modifies $IFS
+
+# Toggle debugging output from debug()
+DEBUG=0
+
+infosection() {
+        echo >& "$INFO"
+	echo "_____________________________________________________________________________" >& "$INFO"
+	echo >& "$INFO"
+	echo "        $1" >& "$INFO"
+	echo "_____________________________________________________________________________" >& "$INFO"
+	echo >& "$INFO"
+}
+
+gather_proc_info()
+{
+        #TIME_PID=$!
+		#sleep 0.1
+		##EXEC_PID=$#(ps -o pid --ppid $TIME_PID --no-headers)
+	    EXEC_PID=$!
+		SAMPLING_INTERVAL=0.1
+        while true
+        do
+
+                MAX_VIRTUAL_MEM=$(grep "VmPeak:" /proc/$EXEC_PID/status | awk '{print $2}')
+                if [ -z "$MAX_VIRTUAL_MEM" ]; then
+            #process finished
+                        break
+                fi
+
+                MAX_PHYS_MEM=$(grep "VmHWM:" /proc/$EXEC_PID/status | awk '{print $2}')
+                if [ -z "$MAX_PHYS_MEM" ]; then
+            #process finished
+                        break
+                fi
+
+                READ_BYTES=$(grep "read_bytes:" /proc/$EXEC_PID/io | awk '{print $2}')
+                if [ -z "$READ_BYTES" ]; then
+            #process finished
+                        break
+                fi
+
+                WRITE_BYTES=$(grep "^write_bytes:" /proc/$EXEC_PID/io | awk '{print $2}')
+                if [ -z "$WRITE_BYTES" ]; then
+            #process finished
+                        break
+                fi
+
+                STEP_DATE=$(date +%s).$(date +%N)
+                PSLINE=$(ps auxw | grep $EXEC_PID | grep -v grep)
+                if [ -z "$PSLINE" ]; then
+            #process finished
+                        break
+                fi
+                CPU_USAGE=$(echo $PSLINE | awk '{print $3}')
+                log "RUNTIME_INFO=timestamp:$STEP_DATE,cpu_usage:$CPU_USAGE,max_phys_mem:$MAX_PHYS_MEM,max_virtual_mem:$MAX_VIRTUAL_MEM,io_read_bytes:$READ_BYTES,io_write_bytes:$WRITE_BYTES"
+                INT_SAMPLING_INTERVAL=$(echo "$SAMPLING_INTERVAL/1" | bc)
+                if [ "$INT_SAMPLING_INTERVAL" -lt 60 ]; then
+                        SAMPLING_INTERVAL=$(echo "$SAMPLING_INTERVAL+0.1" | bc)
+                fi
+				sleep $SAMPLING_INTERVAL
+        done
+        wait $EXEC_PID
+}
+
+info() {
+	infosection "command line"
+	echo $COMMANDLINE 2>&1 >& "$INFO"
+	infosection "uname -a"
+	uname -a 2>&1 >& "$INFO"
+	infosection "id"
+	id 2>&1 >& "$INFO"
+	infosection "env"
+	env 2>&1 >& "$INFO"
+	infosection "df"
+	df 2>&1 >& "$INFO"
+        if [ -e "/proc/cpuinfo" ]; then
+		infosection "/proc/cpuinfo"
+		cat /proc/cpuinfo 2>&1 >& "$INFO"
+	fi
+	if [ -e "/proc/meminfo" ]; then
+		infosection "/proc/meminfo"
+		cat /proc/meminfo 2>&1 >& "$INFO"
+	fi
+	if [ -f "$STDOUT" ] ; then
+		infosection "stdout"
+		cat $STDOUT >& "$INFO"
+	fi
+	if [ -f "$STDERR" ] ; then
+		infosection "stderr"
+		cat $STDERR >& "$INFO"
+	fi
+}
+
+logstate() {
+	echo "Progress " `date +"%Y-%m-%d %H:%M:%S.%N%z"` " $@" >& "$INFO"
+}
+
+log() {
+	echo "$@" >& "$INFO"
+}
+
+debug() {
+	[[ $DEBUG == 1 ]] && echo "$@" >& "$INFO"
+}
+
+fail() {
+	EC=$1
+	shift
+	
+	if [ "$STATUSMODE" != "files" ]; then
+		mkdir -p $WFDIR/status/$JOBDIR
+	fi
+	
+	echo $@ >"$WFDIR/status/$JOBDIR/${ID}-error"
+		
+	log $@
+	info
+	if [ "$STATUSMODE" = "files" ]; then
+		exit 0
+	else
+		exit $EC
+	fi
+}
+
+checkError() {
+	if [ "$?" != "0" ]; then
+		fail $@
+	fi
+}
+
+checkEmpty() {
+	if [ "$1" == "" ]; then
+		shift
+		fail 254 $@
+	fi
+}
+
+checkparamfile() {
+	log "checking for paramfile"
+	if [ "$1" == "-p" ]; then
+		JOBDIR=$2
+		PARAMFILE=${WFDIR}/parameters/${JOBDIR}/param-${ID}
+	fi
+	log "paramfile is: $PARAMFILE"
+}
+
+getarg() {
+	NAME=$1
+	shift
+	VALUE=""
+	SHIFTCOUNT=0
+	if [ "$PARAMFILE" == "" ] && [ "$1" == "$NAME" ]; then
+		shift
+		let "SHIFTCOUNT=$SHIFTCOUNT+1"
+		while [ "${1:0:1}" != "-" ] && [ "$#" != "0" ]; do
+			VALUE="$VALUE $1"
+			shift
+			let "SHIFTCOUNT=$SHIFTCOUNT+1"
+		done
+		VALUE="${VALUE:1}"
+	elif [ "$PARAMFILE" != "" ] && grep -E "^$NAME " $PARAMFILE ; then
+		VALUE=$(grep -E "^$NAME " $PARAMFILE | cut -d ' ' -f 2-)
+	else
+		fail 254 "Missing $NAME argument"
+	fi
+}
+
+openinfo() {
+	exec 3<> $1
+	INFO=3
+}
+
+closeinfo() {
+	exec 3>&-
+}
+
+contains() {
+	ARRAY=$1
+	X=$2
+
+	for a in ${!ARRAY}
+	do
+		if [[ ${a} == ${X} ]]; then
+			return 0
+		fi
+	done
+	return 1
+}
+
+genScripts() {
+	echo "#!/bin/bash" > run.sh
+	echo -n "\"$EXEC\" " >> run.sh
+	for CMDARG in "${CMDARGS[@]}"; do
+    	echo -n "\"$CMDARG\" " >> run.sh
+	done
+	echo "1>\"$STDOUT\" 2>\"$STDERR\"" >> run.sh
+	chmod +x run.sh
+}
+
+cdm_local_output()
+{
+ 	L=$1
+
+	if [[ $CDM_FILE == "" ]]; then
+		return
+	fi
+
+ 	CDM_POLICY=$( cdm_lookup shared/cdm.pl $CDM_FILE $L )
+	if [[ $CDM_POLICY == "LOCAL" ]]; then
+		cdm_local_output_perform $L $CDM_POLICY
+	fi
+}
+
+cdm_local_output_perform()
+{
+	L=$1
+	TOOL=$2
+	REMOTE_DIR=$3
+	FLAGS=$3
+	log "Copying $REMOTE_DIR/$FILE to $JOBDIR/$FILE"
+	mkdir -p $REMOTE_DIR
+	checkError 254 "CDM[LOCAL]: mkdir -p $REMOTE_DIR failed!"
+	$TOOL $FLAGS $JOBDIR/$FILE $REMOTE_DIR/$FILE
+	checkError 254 "CDM[LOCAL]: Tool failed!"
+}
+
+cdm_gather()
+{
+	GATHER_OUTPUT=${*}
+	if [[ $CDM_FILE == "" ]]; then
+		return
+	fi
+	if [[ $GATHER_OUTPUT == "" ]]; then
+		return
+	fi
+
+	cdm_gather_action $GATHER_MAX $GATHER_OUTPUT
+}
+
+COMMANDLINE=$@
+
+# get the parent directory of the directory containing _swiftwrap, to use
+# as the run directory
+# this assumes that _swiftwrap is being executed from the top level of
+# the shared directory, and that shared directory is in the top level
+# of the workflow run directory
+WFDIR=$(dirname $(dirname $0))
+
+cd $WFDIR
+
+# make the WFDIR absolute
+WFDIR=$(pwd)
+PARAMFILE=
+
+openinfo "wrapper.log"
+ID=$1
+checkEmpty "$ID" "Missing job ID"
+
+shift
+
+checkparamfile "$@"
+
+# JOBDIR might have been assigned through the -p option, or might
+# be a parameter here
+if [ "$JOBDIR" == "" ] ; then
+	getarg "-jobdir" "$@"
+	JOBDIR=$VALUE
+	shift $SHIFTCOUNT
+fi
+
+getarg "-scratch" "$@"
+SCRATCH=$VALUE
+shift $SHIFTCOUNT
+
+if [ "X$PROGRESSIVE_INFO" == "X" ] && [ "X$SCRATCH" != "X" ]; then
+	INFODIR=$SCRATCH
+else
+	INFODIR=$WFDIR/info/$JOBDIR
+fi
+checkEmpty "$JOBDIR" "Missing job directory prefix"
+mkdir -p $INFODIR
+closeinfo
+
+if [ -z $MPI_RANK ]; then
+	INFOFILE="$INFODIR/${ID}-info"
+else
+	INFOFILE="$INFODIR/${ID}-${MPI_RANK}-info"
+fi
+rm -f $INFOFILE
+openinfo "$INFOFILE"
+
+logstate "LOG_START"
+infosection "Wrapper (_swiftwrap)"
+
+getarg "-e" "$@"
+EXEC=$VALUE
+shift $SHIFTCOUNT
+
+getarg "-out" "$@"
+STDOUT=$VALUE
+shift $SHIFTCOUNT
+
+getarg "-err" "$@"
+STDERR=$VALUE
+shift $SHIFTCOUNT
+
+getarg "-i" "$@"
+STDIN=$VALUE
+shift $SHIFTCOUNT
+
+getarg "-d" "$@"
+DIRS=$VALUE
+shift $SHIFTCOUNT
+
+getarg "-if" "$@"
+INF=$VALUE
+shift $SHIFTCOUNT
+
+getarg "-of" "$@"
+OUTF=$VALUE
+shift $SHIFTCOUNT
+
+getarg "-k" "$@"
+KICKSTART=$VALUE
+shift $SHIFTCOUNT
+
+getarg "-cdmfile" "$@"
+CDM_FILE=
+if [ "X$VALUE" != "X" ]; then
+	CDM_FILE=shared/$VALUE
+fi
+shift $SHIFTCOUNT
+
+getarg "-status" "$@"
+STATUSMODE=$VALUE
+shift $SHIFTCOUNT
+
+declare -a CMDARGS
+if [ "$PARAMFILE" == "" ] && [ "$1" == "-a" ] ; then
+	shift
+	CMDARGS=("$@")
+elif [ "$PARAMFILE" != "" ] ; then
+	CMDARGS=()
+	FIRST=1
+	while read line ; do
+		if [ "$FIRST" == "1" ] ; then
+			CMDARGS=("$line")
+			FIRST=0
+		else
+			CMDARGS=("${CMDARGS[@]}" "$line")
+		fi
+	done < <(grep -E "^-a " $PARAMFILE | cut -d " " -f 2-)
+else
+	fail 254 "Missing arguments (-a option)"
+fi
+
+if [ "$STATUSMODE" = "files" ]; then
+	mkdir -p $WFDIR/status/$JOBDIR
+fi
+
+if [ "X$CDM_FILE" != "X" ]; then
+	logstate "SOURCE_CDM_LIB $WFDIR/shared/cdm_lib.sh"
+	source $WFDIR/shared/cdm_lib.sh
+	checkError 254 "Could not source: $WFDIR/shared/cdm_lib.sh"
+fi
+
+if [ "X$SCRATCH" != "X" ]; then
+	log "Job directory mode is: local copy"
+	DIR=$SCRATCH/$JOBDIR/$ID
+	COPYNOTLINK=1
+else
+	log "Job directory mode is: link on shared filesystem"
+	DIR=jobs/$JOBDIR/$ID
+	COPYNOTLINK=0
+fi
+
+PATH=$PATH:/bin:/usr/bin
+
+if [ "$PATHPREFIX" != "" ]; then
+	export PATH=$PATHPREFIX:$PATH
+fi
+
+if [ "$SWIFT_EXTRA_INFO" != "" ]; then
+	log "EXTRAINFO=$($SWIFT_EXTRA_INFO)"
+fi
+
+if [ "X${EXEC:0:1}" != "X/" ] ; then
+	export ORIGEXEC=$EXEC
+	export EXEC=$(which $EXEC)
+	if [ "X$EXEC" = "X" ] ; then
+		fail 254 "Cannot find executable $ORIGEXEC on site system path"
+	fi
+fi
+
+debug "PID=$$"
+log "PWD=$PWD"
+log "DIR=$DIR"
+log "EXEC=$EXEC"
+log "STDIN=$STDIN"
+log "STDOUT=$STDOUT"
+log "STDERR=$STDERR"
+log "DIRS=$DIRS"
+log "INF=$INF"
+log "OUTF=$OUTF"
+log "KICKSTART=$KICKSTART"
+log "CDM_FILE=$CDM_FILE"
+[ -z $MPI_RANK ] && log "MPI_RANK=$MPI_RANK"
+log "ARGS=$@"
+log "ARGC=$#"
+IFS="|"
+
+logstate "CREATE_JOBDIR"
+mkdir -p $DIR
+checkError 254 "Failed to create job directory $DIR"
+log "Created job directory: $DIR"
+
+if [[ $MPI_RANK == "" || $MPI_RANK == 0 ]]; then
+
+logstate "CREATE_INPUTDIR"
+for D in $DIRS ; do
+	mkdir -p "$DIR/$D" 2>&1 >>"$INFO"
+	checkError 254 "Failed to create input directory $D"
+	log "Created output directory: $DIR/$D"
+done
+
+logstate "LINK_INPUTS"
+for L in $INF ; do
+    CDM_POLICY="DEFAULT"
+	if [[ $CDM_FILE != "" ]]; then
+		CDM_POLICY=$( cdm_lookup shared/cdm.pl $CDM_FILE $L )
+	fi
+	if [[ $CDM_POLICY != "DEFAULT" && $CDM_POLICY != "EXTERNAL"* ]]; then
+		log "CDM_POLICY: $L -> $CDM_POLICY"
+		eval cdm_action $DIR "INPUT" $L $CDM_POLICY
+		continue
+	fi
+	if [ $COPYNOTLINK = 1 ]; then
+		cp "$WFDIR/shared/$L" "$DIR/$L" 2>&1 >& $INFO
+		checkError 254 "Failed to copy input file $L"
+		log "Copied input: $WFDIR/shared/$L to $DIR/$L"
+	else
+		[ -f $WFDIR/shared/$L ]
+		checkError 254 "Could not locate input file: $L"
+		ln -s "$WFDIR/shared/$L" "$DIR/$L" 2>&1 >& $INFO
+		checkError 254 "Failed to link input file $L"
+		log "Linked input: $WFDIR/shared/$L to $DIR/$L"
+	fi
+done
+
+if [ ! -z $CDM_FILE ]; then
+    logstate "LINK_CDM_OUTPUTS"
+    SKIPPED_OUTPUT=()
+	GATHER_OUTPUT=()
+	for L in $OUTF ; do
+		CDM_POLICY=$( cdm_lookup shared/cdm.pl $CDM_FILE $L )
+		if [[ $CDM_POLICY != "DEFAULT" &&
+			  $CDM_POLICY != "BROADCAST"* ]]; then
+			log "CDM_POLICY: $L -> $CDM_POLICY"
+    	    eval cdm_action $DIR "OUTPUT" $L $CDM_POLICY
+			SKIPPED_OUTPUT=( $SKIPPED_OUTPUT $L )
+		fi
+		if [ $CDM_POLICY == "GATHER" ]; then
+			GATHER_OUTPUT=( $GATHER_OUTPUT $L )
+		elif [ $CDM_POLICY == "LOCAL" ]; then
+			CDM_LOCAL_OUTPUT=( $CDM_LOCAL_OUTPUT $L )
+		fi
+	done
+fi
+
+fi # MPI_RANK==0
+
+debug "Moving to jobdir: $DIR"
+cd $DIR
+if [ $? != 0 ]; then
+	log "PWD: $PWD"
+	log $( find . )
+	fail 254 "Could not cd to: $DIR"
+fi
+logstate "EXECUTE"
+
+debug "Command line: $EXEC ${CMDARGS[@]}"
+
+if [ ! -f "$EXEC" ]; then
+	fail 254 "The executable $EXEC does not exist"
+fi
+if [ ! -x "$EXEC" ]; then
+	fail 254 "The executable $EXEC does not have the executable bit set"
+fi
+if [ "$KICKSTART" == "" ]; then
+	if [ "$STDIN" == "" ]; then
+		if [ "$SWIFT_GEN_SCRIPTS" != "" ]; then
+			genScripts
+		fi
+		/usr/bin/time -f "RUNTIME_AGGR=maxrss:%M,walltime:%e,systime:%S,usertime:%U,cpu:%P,fsin:%I,fsout:%O,timesswapped:%W,socketrecv:%r,socketsent:%s,majorpagefaults:%F,minorpagefaults:%R,contextswitchesinv:%c,contextswitchesvol:%w" "$EXEC" "${CMDARGS[@]}" 1>"$STDOUT" 2>"$STDERR"
+		APPEXIT=$?
+		#"$EXEC" "${CMDARGS[@]}" 1>"$STDOUT" 2>"$STDERR" &
+		#gather_proc_info
+		RTAGGR=$(grep RUNTIME_AGGR "$STDERR")
+		log "$RTAGGR"
+	else
+		if [ "$SWIFT_GEN_SCRIPTS" != "" ]; then
+			genScripts
+		fi
+		/usr/bin/time -f "RUNTIME_AGGR=maxrss:%M,walltime:%e,systime:%S,usertime:%U,cpu:%P,fsin:%I,fsout:%O,timesswapped:%W,socketrecv:%r,socketsent:%s,majorpagefaults:%F,minorpagefaults:%R,contextswitchesinv:%c,contextswitchesvol:%w" "$EXEC" "${CMDARGS[@]}" 1>"$STDOUT" 2>"$STDERR" <"$STDIN"
+		APPEXIT=$?
+		#"$EXEC" "${CMDARGS[@]}" 1>"$STDOUT" 2>"$STDERR" <"$STDIN" &
+		#gather_proc_info
+		RTAGGR=$(grep RUNTIME_AGGR "$STDERR")
+		log "$RTAGGR"
+	fi
+	if [ "$APPEXIT" != "0" ]; then
+		fail $APPEXIT "Application $EXEC failed with an exit code of $APPEXIT"
+	fi
+else
+	if [ ! -f "$KICKSTART" ]; then
+		log "Kickstart executable ($KICKSTART) not found"
+		fail 254 "The Kickstart executable ($KICKSTART) was not found"
+	elif [ ! -x "$KICKSTART" ]; then
+		log "Kickstart executable ($KICKSTART) is not executable"
+		fail 254 "The Kickstart executable ($KICKSTART) does not have the executable bit set"
+	else
+		mkdir -p $WFDIR/kickstart/$JOBDIR
+		log "Using Kickstart ($KICKSTART)"
+		if [ "$STDIN" == "" ]; then
+			"$KICKSTART" -H -o "$STDOUT" -e "$STDERR" "$EXEC" "$@" 1>kickstart.xml 2>"$STDERR"
+		else
+			"$KICKSTART" -H -o "$STDOUT" -i "$STDIN" -e "$STDERR" "$EXEC" "$@" 1>kickstart.xml 2>"$STDERR"
+		fi
+		export APPEXIT=$?
+		mv -f kickstart.xml "$WFDIR/kickstart/$JOBDIR/$ID-kickstart.xml" 2>&1 >& "$INFO"
+		checkError 254 "Failed to copy Kickstart record to shared directory"
+		if [ "$APPEXIT" != "0" ]; then
+			fail $APPEXIT "Application $EXEC failed with an exit code of $APPEXIT"
+		fi
+	fi
+fi
+
+log "Moving back to workflow directory $WFDIR"
+cd $WFDIR
+if [ $? != 0 ]; then
+	fail 254 "Could not cd to workflow directory: $WFDIR"
+fi
+
+logstate "EXECUTE_DONE"
+log "Job ran successfully"
+
+if [[ $MPI_RANK == "" || $MPI_RANK == 0 ]]; then
+
+MISSING=
+for O in $OUTF ; do
+	if [ ! -f "$DIR/$O" ]; then
+		if [ "$MISSING" == "" ]; then
+			MISSING=$O
+		else
+			MISSING="$MISSING, $O"
+		fi
+	fi
+done
+if [ "$MISSING" != "" ]; then
+	log $( find . )
+	fail 254 "The following output files were not created by the application: $MISSING"
+fi
+
+logstate "MOVING_OUTPUTS $OUTF"
+for O in $OUTF ; do
+	if ! contains SKIPPED_OUTPUT $O ; then
+		mv "$DIR/$O" "$WFDIR/shared/$O" 2>&1 >&	"$INFO"
+		checkError 254 "Failed to move output file $O to shared directory"
+	fi
+done
+
+cdm_local_output $CDM_LOCAL_OUTPUT
+cdm_gather $GATHER_OUTPUT
+
+logstate "RM_JOBDIR"
+rm -rf "$DIR" 2>&1 >& "$INFO"
+checkError 254 "Failed to remove job directory $DIR"
+
+if [ "$STATUSMODE" = "files" ]; then
+	logstate "TOUCH_SUCCESS"
+	touch $WFDIR/status/${JOBDIR}/${ID}-success
+fi
+
+else
+	# Allow rank 0 to write output
+	sleep 1
+fi # MPI_RANK==0
+
+logstate "END"
+
+closeinfo
+
+if [ "X$PROGRESSIVE_INFO" == "X" ] && [ "X$SCRATCH" != "X" ]; then
+	mkdir -p "$WFDIR/info/$JOBDIR"
+	mv "$INFODIR/${ID}-info" "$WFDIR/info/$JOBDIR/${ID}-info"
+fi
+
+# ensure we exit with a 0 after a successful execution
+exit 0
+
+# Local Variables:
+# mode: sh
+# sh-basic-offset: 4
+# tab-width: 4
+# indent-tabs-mode: 1
+# End:


Property changes on: provenancedb/swift_mod/_swiftwrap_runtime_aggregate
___________________________________________________________________
Added: svn:executable
   + *
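
The aggregate wrapper relies on GNU time's -f option to emit a single RUNTIME_AGGR
line, which the runtime.txt loop in prov-to-sql.sh above turns into
annot_app_exec_num rows. A minimal standalone sketch of that round trip, assuming
GNU time is installed as /usr/bin/time and using /bin/sleep as a stand-in
application:

    #!/bin/bash
    FMT="RUNTIME_AGGR=maxrss:%M,walltime:%e,systime:%S,usertime:%U,cpu:%P,fsin:%I,fsout:%O,timesswapped:%W,socketrecv:%r,socketsent:%s,majorpagefaults:%F,minorpagefaults:%R,contextswitchesinv:%c,contextswitchesvol:%w"

    # Run a sample executable; GNU time writes the formatted line to stderr.
    /usr/bin/time -f "$FMT" /bin/sleep 1 2> time.err

    # Split the comma-separated key:value pairs the same way the importer does.
    grep RUNTIME_AGGR time.err | sed 's/^RUNTIME_AGGR=//' \
        | tr ',' '\n' | awk -F ':' '{printf "%-22s %s\n", $1, $2}'

Here %M is the maximum resident set size in kilobytes and %P the CPU share; some
counters (for example swaps and socket messages) are typically reported as zero on
current Linux kernels.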

Added: provenancedb/swift_mod/_swiftwrap_runtime_snapshots
===================================================================
--- provenancedb/swift_mod/_swiftwrap_runtime_snapshots	                        (rev 0)
+++ provenancedb/swift_mod/_swiftwrap_runtime_snapshots	2012-05-10 12:27:54 UTC (rev 5786)
@@ -0,0 +1,595 @@
+#!/bin/bash
+# this script must be invoked inside of bash, not plain sh
+# note that this script modifies $IFS
+
+# Toggle debugging output from debug()
+DEBUG=0
+
+infosection() {
+        echo >& "$INFO"
+	echo "_____________________________________________________________________________" >& "$INFO"
+	echo >& "$INFO"
+	echo "        $1" >& "$INFO"
+	echo "_____________________________________________________________________________" >& "$INFO"
+	echo >& "$INFO"
+}
+
+gather_proc_info()
+{
+	EXEC_PID=$!
+	SAMPLING_INTERVAL=1
+	while true
+	do
+		sleep $SAMPLING_INTERVAL
+		
+		MAX_VIRTUAL_MEM=$(grep "VmPeak:" /proc/$EXEC_PID/status | awk '{print $2}')
+		if [ -z "$MAX_VIRTUAL_MEM" ]; then
+            #process finished
+			break
+		fi 
+
+		MAX_PHYS_MEM=$(grep "VmHWM:" /proc/$EXEC_PID/status | awk '{print $2}')
+		if [ -z "$MAX_PHYS_MEM" ]; then
+            #process finished
+			break
+		fi 
+				
+		READ_BYTES=$(grep "read_bytes:" /proc/$EXEC_PID/io | awk '{print $2}') 
+		if [ -z "$READ_BYTES" ]; then
+            #process finished
+			break
+		fi
+
+		WRITE_BYTES=$(grep "^write_bytes:" /proc/$EXEC_PID/io | awk '{print $2}') 
+		if [ -z "$WRITE_BYTES" ]; then
+            #process finished
+			break
+		fi
+		
+		STEP_DATE=$(date +%s)
+		PSLINE=$(ps auxw | grep $EXEC_PID | grep -v grep)
+		if [ -z "$PSLINE" ]; then
+            #process finished
+			break
+		fi
+		CPU_USAGE=$(echo $PSLINE | awk '{print $3}')
+		log "RUNTIME_INFO=timestamp:$STEP_DATE,cpu_usage:$CPU_USAGE,max_phys_mem:$MAX_PHYS_MEM,max_virtual_mem:$MAX_VIRTUAL_MEM,io_read_bytes:$READ_BYTES,io_write_bytes:$WRITE_BYTES"
+		if [ "$SAMPLING_INTERVAL" -lt 60 ]; then
+			let "SAMPLING_INTERVAL=$SAMPLING_INTERVAL+1"
+		fi
+	done
+	wait $EXEC_PID
+}
+
+info() {
+	infosection "command line"
+	echo $COMMANDLINE 2>&1 >& "$INFO"
+	infosection "uname -a"
+	uname -a 2>&1 >& "$INFO"
+	infosection "id"
+	id 2>&1 >& "$INFO"
+	infosection "env"
+	env 2>&1 >& "$INFO"
+	infosection "df"
+	df 2>&1 >& "$INFO"
+        if [ -e "/proc/cpuinfo" ]; then
+		infosection "/proc/cpuinfo"
+		cat /proc/cpuinfo 2>&1 >& "$INFO"
+	fi
+	if [ -e "/proc/meminfo" ]; then
+		infosection "/proc/meminfo"
+		cat /proc/meminfo 2>&1 >& "$INFO"
+	fi
+	if [ -f "$STDOUT" ] ; then
+		infosection "stdout"
+		cat $STDOUT >& "$INFO"
+	fi
+	if [ -f "$STDERR" ] ; then
+		infosection "stderr"
+		cat $STDERR >& "$INFO"
+	fi
+}
+
+logstate() {
+	echo "Progress " `date +"%Y-%m-%d %H:%M:%S.%N%z"` " $@" >& "$INFO"
+}
+
+log() {
+	echo "$@" >& "$INFO"
+}
+
+debug() {
+	[[ $DEBUG == 1 ]] && echo "$@" >& "$INFO"
+}
+
+fail() {
+	EC=$1
+	shift
+	
+	if [ "$STATUSMODE" != "files" ]; then
+		mkdir -p $WFDIR/status/$JOBDIR
+	fi
+	
+	echo $@ >"$WFDIR/status/$JOBDIR/${ID}-error"
+		
+	log $@
+	info
+	if [ "$STATUSMODE" = "files" ]; then
+		exit 0
+	else
+		exit $EC
+	fi
+}
+
+checkError() {
+	if [ "$?" != "0" ]; then
+		fail $@
+	fi
+}
+
+checkEmpty() {
+	if [ "$1" == "" ]; then
+		shift
+		fail 254 $@
+	fi
+}
+
+checkparamfile() {
+	log "checking for paramfile"
+	if [ "$1" == "-p" ]; then
+		JOBDIR=$2
+		PARAMFILE=${WFDIR}/parameters/${JOBDIR}/param-${ID}
+	fi
+	log "paramfile is: $PARAMFILE"
+}
+
+getarg() {
+	NAME=$1
+	shift
+	VALUE=""
+	SHIFTCOUNT=0
+	if [ "$PARAMFILE" == "" ] && [ "$1" == "$NAME" ]; then
+		shift
+		let "SHIFTCOUNT=$SHIFTCOUNT+1"
+		while [ "${1:0:1}" != "-" ] && [ "$#" != "0" ]; do
+			VALUE="$VALUE $1"
+			shift
+			let "SHIFTCOUNT=$SHIFTCOUNT+1"
+		done
+		VALUE="${VALUE:1}"
+	elif [ "$PARAMFILE" != "" ] && grep -E "^$NAME " $PARAMFILE ; then
+		VALUE=$(grep -E "^$NAME " $PARAMFILE | cut -d ' ' -f 2-)
+	else
+		fail 254 "Missing $NAME argument"
+	fi
+}
+
+openinfo() {
+	exec 3<> $1
+	INFO=3
+}
+
+closeinfo() {
+	exec 3>&-
+}
+
+contains() {
+	ARRAY=$1
+	X=$2
+
+	for a in ${!ARRAY}
+	do
+		if [[ ${a} == ${X} ]]; then
+			return 0
+		fi
+	done
+	return 1
+}
+
+genScripts() {
+	echo "#!/bin/bash" > run.sh
+	echo -n "\"$EXEC\" " >> run.sh
+	for CMDARG in "${CMDARGS[@]}"; do
+    	echo -n "\"$CMDARG\" " >> run.sh
+	done
+	echo "1>\"$STDOUT\" 2>\"$STDERR\"" >> run.sh
+	chmod +x run.sh
+}
+
+cdm_local_output()
+{
+ 	L=$1
+
+	if [[ $CDM_FILE == "" ]]; then
+		return
+	fi
+
+ 	CDM_POLICY=$( cdm_lookup shared/cdm.pl $CDM_FILE $L )
+	if [[ $CDM_POLICY == "LOCAL" ]]; then
+		cdm_local_output_perform $L $CDM_POLICY
+	fi
+}
+
+cdm_local_output_perform()
+{
+	L=$1
+	TOOL=$2
+	REMOTE_DIR=$3
+	FLAGS=$3
+	log "Copying $REMOTE_DIR/$FILE to $JOBDIR/$FILE"
+	mkdir -p $REMOTE_DIR
+	checkError 254 "CDM[LOCAL]: mkdir -p $REMOTE_DIR failed!"
+	$TOOL $FLAGS $JOBDIR/$FILE $REMOTE_DIR/$FILE
+	checkError 254 "CDM[LOCAL]: Tool failed!"
+}
+
+cdm_gather()
+{
+	GATHER_OUTPUT=${*}
+	if [[ $CDM_FILE == "" ]]; then
+		return
+	fi
+	if [[ $GATHER_OUTPUT == "" ]]; then
+		return
+	fi
+
+	cdm_gather_action $GATHER_MAX $GATHER_OUTPUT
+}
+
+COMMANDLINE=$@
+
+# get the parent directory of the directory containing _swiftwrap, to use
+# as the run directory
+# this assumes that _swiftwrap is being executed from the top level of
+# the shared directory, and that shared directory is in the top level
+# of the workflow run directory
+WFDIR=$(dirname $(dirname $0))
+
+cd $WFDIR
+
+# make the WFDIR absolute
+WFDIR=$(pwd)
+PARAMFILE=
+
+openinfo "wrapper.log"
+ID=$1
+checkEmpty "$ID" "Missing job ID"
+
+shift
+
+checkparamfile "$@"
+
+# JOBDIR might have been assigned through the -p option, or might
+# be a parameter here
+if [ "$JOBDIR" == "" ] ; then
+	getarg "-jobdir" "$@"
+	JOBDIR=$VALUE
+	shift $SHIFTCOUNT
+fi
+
+getarg "-scratch" "$@"
+SCRATCH=$VALUE
+shift $SHIFTCOUNT
+
+if [ "X$PROGRESSIVE_INFO" == "X" ] && [ "X$SCRATCH" != "X" ]; then
+	INFODIR=$SCRATCH
+else
+	INFODIR=$WFDIR/info/$JOBDIR
+fi
+checkEmpty "$JOBDIR" "Missing job directory prefix"
+mkdir -p $INFODIR
+closeinfo
+
+if [ -z $MPI_RANK ]; then
+	INFOFILE="$INFODIR/${ID}-info"
+else
+	INFOFILE="$INFODIR/${ID}-${MPI_RANK}-info"
+fi
+rm -f $INFOFILE
+openinfo "$INFOFILE"
+
+logstate "LOG_START"
+infosection "Wrapper (_swiftwrap)"
+
+getarg "-e" "$@"
+EXEC=$VALUE
+shift $SHIFTCOUNT
+
+getarg "-out" "$@"
+STDOUT=$VALUE
+shift $SHIFTCOUNT
+
+getarg "-err" "$@"
+STDERR=$VALUE
+shift $SHIFTCOUNT
+
+getarg "-i" "$@"
+STDIN=$VALUE
+shift $SHIFTCOUNT
+
+getarg "-d" "$@"
+DIRS=$VALUE
+shift $SHIFTCOUNT
+
+getarg "-if" "$@"
+INF=$VALUE
+shift $SHIFTCOUNT
+
+getarg "-of" "$@"
+OUTF=$VALUE
+shift $SHIFTCOUNT
+
+getarg "-k" "$@"
+KICKSTART=$VALUE
+shift $SHIFTCOUNT
+
+getarg "-cdmfile" "$@"
+CDM_FILE=
+if [ "X$VALUE" != "X" ]; then
+	CDM_FILE=shared/$VALUE
+fi
+shift $SHIFTCOUNT
+
+getarg "-status" "$@"
+STATUSMODE=$VALUE
+shift $SHIFTCOUNT
+
+declare -a CMDARGS
+if [ "$PARAMFILE" == "" ] && [ "$1" == "-a" ] ; then
+	shift
+	CMDARGS=("$@")
+elif [ "$PARAMFILE" != "" ] ; then
+	CMDARGS=()
+	FIRST=1
+	while read line ; do
+		if [ "$FIRST" == "1" ] ; then
+			CMDARGS=("$line")
+			FIRST=0
+		else
+			CMDARGS=("${CMDARGS[@]}" "$line")
+		fi
+	done < <(grep -E "^-a " $PARAMFILE | cut -d " " -f 2-)
+else
+	fail 254 "Missing arguments (-a option)"
+fi
+
+if [ "$STATUSMODE" = "files" ]; then
+	mkdir -p $WFDIR/status/$JOBDIR
+fi
+
+if [ "X$CDM_FILE" != "X" ]; then
+	logstate "SOURCE_CDM_LIB $WFDIR/shared/cdm_lib.sh"
+	source $WFDIR/shared/cdm_lib.sh
+	checkError 254 "Could not source: $WFDIR/shared/cdm_lib.sh"
+fi
+
+if [ "X$SCRATCH" != "X" ]; then
+	log "Job directory mode is: local copy"
+	DIR=$SCRATCH/$JOBDIR/$ID
+	COPYNOTLINK=1
+else
+	log "Job directory mode is: link on shared filesystem"
+	DIR=jobs/$JOBDIR/$ID
+	COPYNOTLINK=0
+fi
+
+PATH=$PATH:/bin:/usr/bin
+
+if [ "$PATHPREFIX" != "" ]; then
+	export PATH=$PATHPREFIX:$PATH
+fi
+
+if [ "$SWIFT_EXTRA_INFO" != "" ]; then
+	log "EXTRAINFO=$($SWIFT_EXTRA_INFO)"
+fi
+
+if [ "X${EXEC:0:1}" != "X/" ] ; then
+	export ORIGEXEC=$EXEC
+	export EXEC=$(which $EXEC)
+	if [ "X$EXEC" = "X" ] ; then
+		fail 254 "Cannot find executable $ORIGEXEC on site system path"
+	fi
+fi
+
+debug "PID=$$"
+log "PWD=$PWD"
+log "DIR=$DIR"
+log "EXEC=$EXEC"
+log "STDIN=$STDIN"
+log "STDOUT=$STDOUT"
+log "STDERR=$STDERR"
+log "DIRS=$DIRS"
+log "INF=$INF"
+log "OUTF=$OUTF"
+log "KICKSTART=$KICKSTART"
+log "CDM_FILE=$CDM_FILE"
+[ -z $MPI_RANK ] && log "MPI_RANK=$MPI_RANK"
+log "ARGS=$@"
+log "ARGC=$#"
+IFS="|"
+
+logstate "CREATE_JOBDIR"
+mkdir -p $DIR
+checkError 254 "Failed to create job directory $DIR"
+log "Created job directory: $DIR"
+
+if [[ $MPI_RANK == "" || $MPI_RANK == 0 ]]; then
+
+logstate "CREATE_INPUTDIR"
+for D in $DIRS ; do
+	mkdir -p "$DIR/$D" 2>&1 >>"$INFO"
+	checkError 254 "Failed to create input directory $D"
+	log "Created output directory: $DIR/$D"
+done
+
+logstate "LINK_INPUTS"
+for L in $INF ; do
+    CDM_POLICY="DEFAULT"
+	if [[ $CDM_FILE != "" ]]; then
+		CDM_POLICY=$( cdm_lookup shared/cdm.pl $CDM_FILE $L )
+	fi
+	if [[ $CDM_POLICY != "DEFAULT" && $CDM_POLICY != "EXTERNAL"* ]]; then
+		log "CDM_POLICY: $L -> $CDM_POLICY"
+		eval cdm_action $DIR "INPUT" $L $CDM_POLICY
+		continue
+	fi
+	if [ $COPYNOTLINK = 1 ]; then
+		cp "$WFDIR/shared/$L" "$DIR/$L" 2>&1 >& $INFO
+		checkError 254 "Failed to copy input file $L"
+		log "Copied input: $WFDIR/shared/$L to $DIR/$L"
+	else
+		[ -f $WFDIR/shared/$L ]
+		checkError 254 "Could not locate input file: $L"
+		ln -s "$WFDIR/shared/$L" "$DIR/$L" 2>&1 >& $INFO
+		checkError 254 "Failed to link input file $L"
+		log "Linked input: $WFDIR/shared/$L to $DIR/$L"
+	fi
+done
+
+if [ ! -z $CDM_FILE ]; then
+    logstate "LINK_CDM_OUTPUTS"
+    SKIPPED_OUTPUT=()
+	GATHER_OUTPUT=()
+	for L in $OUTF ; do
+		CDM_POLICY=$( cdm_lookup shared/cdm.pl $CDM_FILE $L )
+		if [[ $CDM_POLICY != "DEFAULT" &&
+			  $CDM_POLICY != "BROADCAST"* ]]; then
+			log "CDM_POLICY: $L -> $CDM_POLICY"
+    	    eval cdm_action $DIR "OUTPUT" $L $CDM_POLICY
+			SKIPPED_OUTPUT=( $SKIPPED_OUTPUT $L )
+		fi
+		if [ $CDM_POLICY == "GATHER" ]; then
+			GATHER_OUTPUT=( $GATHER_OUTPUT $L )
+		elif [ $CDM_POLICY == "LOCAL" ]; then
+			CDM_LOCAL_OUTPUT=( $CDM_LOCAL_OUTPUT $L )
+		fi
+	done
+fi
+
+fi # MPI_RANK==0
+
+debug "Moving to jobdir: $DIR"
+cd $DIR
+if [ $? != 0 ]; then
+	log "PWD: $PWD"
+	log $( find . )
+	fail 254 "Could not cd to: $DIR"
+fi
+logstate "EXECUTE"
+
+debug "Command line: $EXEC ${CMDARGS[@]}"
+
+if [ ! -f "$EXEC" ]; then
+	fail 254 "The executable $EXEC does not exist"
+fi
+if [ ! -x "$EXEC" ]; then
+	fail 254 "The executable $EXEC does not have the executable bit set"
+fi
+if [ "$KICKSTART" == "" ]; then
+	if [ "$STDIN" == "" ]; then
+		if [ "$SWIFT_GEN_SCRIPTS" != "" ]; then
+			genScripts
+		fi
+		"$EXEC" "${CMDARGS[@]}" 1>"$STDOUT" 2>"$STDERR" &
+		gather_proc_info
+	else
+		if [ "$SWIFT_GEN_SCRIPTS" != "" ]; then
+			genScripts
+		fi
+		"$EXEC" "${CMDARGS[@]}" 1>"$STDOUT" 2>"$STDERR" <"$STDIN" &
+		gather_proc_info
+	fi
+	checkError $? "Application $EXEC failed with an exit code of $?"
+else
+	if [ ! -f "$KICKSTART" ]; then
+		log "Kickstart executable ($KICKSTART) not found"
+		fail 254 "The Kickstart executable ($KICKSTART) was not found"
+	elif [ ! -x "$KICKSTART" ]; then
+		log "Kickstart executable ($KICKSTART) is not executable"
+		fail 254 "The Kickstart executable ($KICKSTART) does not have the executable bit set"
+	else
+		mkdir -p $WFDIR/kickstart/$JOBDIR
+		log "Using Kickstart ($KICKSTART)"
+		if [ "$STDIN" == "" ]; then
+			"$KICKSTART" -H -o "$STDOUT" -e "$STDERR" "$EXEC" "$@" 1>kickstart.xml 2>"$STDERR"
+		else
+			"$KICKSTART" -H -o "$STDOUT" -i "$STDIN" -e "$STDERR" "$EXEC" "$@" 1>kickstart.xml 2>"$STDERR"
+		fi
+		export APPEXIT=$?
+		mv -f kickstart.xml "$WFDIR/kickstart/$JOBDIR/$ID-kickstart.xml" 2>&1 >& "$INFO"
+		checkError 254 "Failed to copy Kickstart record to shared directory"
+		if [ "$APPEXIT" != "0" ]; then
+			fail $APPEXIT "Application $EXEC failed with an exit code of $APPEXIT"
+		fi
+	fi
+fi
+
+log "Moving back to workflow directory $WFDIR"
+cd $WFDIR
+if [ $? != 0 ]; then
+	fail 254 "Could not cd to workflow directory: $WFDIR"
+fi
+
+logstate "EXECUTE_DONE"
+log "Job ran successfully"
+
+if [[ $MPI_RANK == "" || $MPI_RANK == 0 ]]; then
+
+MISSING=
+for O in $OUTF ; do
+	if [ ! -f "$DIR/$O" ]; then
+		if [ "$MISSING" == "" ]; then
+			MISSING=$O
+		else
+			MISSING="$MISSING, $O"
+		fi
+	fi
+done
+if [ "$MISSING" != "" ]; then
+	log $( find . )
+	fail 254 "The following output files were not created by the application: $MISSING"
+fi
+
+logstate "MOVING_OUTPUTS $OUTF"
+for O in $OUTF ; do
+	if ! contains SKIPPED_OUTPUT $O ; then
+		mv "$DIR/$O" "$WFDIR/shared/$O" 2>&1 >&	"$INFO"
+		checkError 254 "Failed to move output file $O to shared directory"
+	fi
+done
+
+cdm_local_output $CDM_LOCAL_OUTPUT
+cdm_gather $GATHER_OUTPUT
+
+logstate "RM_JOBDIR"
+rm -rf "$DIR" 2>&1 >& "$INFO"
+checkError 254 "Failed to remove job directory $DIR"
+
+if [ "$STATUSMODE" = "files" ]; then
+	logstate "TOUCH_SUCCESS"
+	touch $WFDIR/status/${JOBDIR}/${ID}-success
+fi
+
+else
+	# Allow rank 0 to write output
+	sleep 1
+fi # MPI_RANK==0
+
+logstate "END"
+
+closeinfo
+
+if [ "X$PROGRESSIVE_INFO" == "X" ] && [ "X$SCRATCH" != "X" ]; then
+	mkdir -p "$WFDIR/info/$JOBDIR"
+	mv "$INFODIR/${ID}-info" "$WFDIR/info/$JOBDIR/${ID}-info"
+fi
+
+# ensure we exit with a 0 after a successful execution
+exit 0
+
+# Local Variables:
+# mode: sh
+# sh-basic-offset: 4
+# tab-width: 4
+# indent-tabs-mode: 1
+# End:


Property changes on: provenancedb/swift_mod/_swiftwrap_runtime_snapshots
___________________________________________________________________
Added: svn:executable
   + *
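
The snapshots wrapper starts the application in the background and polls /proc for
periodic RUNTIME_INFO lines. The core of that sampling loop, reduced to a
standalone sketch (Linux-only; reading /proc/<pid>/io may require running as the
same user as the child; /bin/sleep again stands in for the application):

    #!/bin/bash
    # Launch the "application" in the background and remember its pid.
    /bin/sleep 5 &
    pid=$!

    while kill -0 "$pid" 2>/dev/null; do
        vmpeak=$(awk '/^VmPeak:/ {print $2}' /proc/$pid/status 2>/dev/null)
        vmhwm=$(awk '/^VmHWM:/ {print $2}' /proc/$pid/status 2>/dev/null)
        rbytes=$(awk '/^read_bytes:/ {print $2}' /proc/$pid/io 2>/dev/null)
        wbytes=$(awk '/^write_bytes:/ {print $2}' /proc/$pid/io 2>/dev/null)
        cpu=$(ps -o %cpu= -p "$pid" | tr -d ' ')
        echo "RUNTIME_INFO=timestamp:$(date +%s),cpu_usage:$cpu,max_phys_mem:$vmhwm,max_virtual_mem:$vmpeak,io_read_bytes:$rbytes,io_write_bytes:$wbytes"
        sleep 1
    done
    wait "$pid"

The wrapper itself additionally stretches the sampling interval by one second per
sample, up to one minute, so long-running jobs do not flood the wrapper log.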

Added: provenancedb/swift_mod/create-everylog-vs-versions-data
===================================================================
--- provenancedb/swift_mod/create-everylog-vs-versions-data	                        (rev 0)
+++ provenancedb/swift_mod/create-everylog-vs-versions-data	2012-05-10 12:27:54 UTC (rev 5786)
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+echo Creating log/version data file: everylog-vs-versions.data
+
+rm -f everylog-vs-versions.data
+
+
+for logfile in `find $1 -name \*.log -and -not -name cluster-\* -and -not -name swift.log $EXTRALOGRESTRICTION`; do
+    SWIFT_VERSION=$(grep -m 1 -E 'Swift .* swift-r[0-9]*' $logfile | sed 's/.*Swift .* swift-r\([0-9]*\).*/\1/')
+    COG_VERSION=$(grep -m 1 -E 'Swift .* swift-r[0-9]*' $logfile | sed 's/.*Swift .* cog-r\([0-9]*\).*/\1/')
+    START=$(head -n1 < $logfile | iso-to-secs | cut -f 1 -d ' ')
+    if [ "X$SWIFT_VERSION" == "X" ]; then
+	SWIFT_VERSION=na
+	COG_VERSION=na
+    fi
+    if [ "X$START" != "X" ]; then
+	echo $START $SWIFT_VERSION $COG_VERSION $logfile >> everylog-vs-versions.data
+    fi
+    
+done
+
+echo Finished creating log/version data file


Property changes on: provenancedb/swift_mod/create-everylog-vs-versions-data
___________________________________________________________________
Added: svn:executable
   + *
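
create-everylog-vs-versions-data is the producer of the everylog-vs-versions.data
file that the modified swift-prov-import-all-logs reads. A typical invocation and
the shape of its output, with an illustrative path and revision numbers:

    # Scan a directory tree of run logs; each output line has the form
    #   <start-seconds> <swift-revision> <cog-revision> <log-path>
    ./create-everylog-vs-versions-data /home/user/swift-runs
    head -n1 everylog-vs-versions.data
    # e.g.: 1336652874 5786 3398 /home/user/swift-runs/myscript-20120510-1200-abc123xy.log

Logs whose first line cannot be converted to seconds are skipped, and logs without a
recognisable Swift version string are recorded with "na" in both version columns.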




More information about the Swift-commit mailing list