[Swift-commit] r4968 - provenancedb

lgadelha at ci.uchicago.edu lgadelha at ci.uchicago.edu
Tue Aug 9 08:16:30 CDT 2011


Author: lgadelha
Date: 2011-08-09 08:16:29 -0500 (Tue, 09 Aug 2011)
New Revision: 4968

Added:
   provenancedb/ProvSQL.g
   provenancedb/compare_run.py
   provenancedb/compare_run.sh
   provenancedb/import-run-to-datalog
   provenancedb/prov-to-datalog.sh
   provenancedb/provenancedb-rules.datalog
   provenancedb/swift-prov-import-all-logs-datalog
Removed:
   provenancedb/prov-to-swipl.sh
   provenancedb/xq.xq
Modified:
   provenancedb/pql_functions.sql
   provenancedb/prov-init.sql
   provenancedb/prov-to-sql.sh
   provenancedb/swift-prov-import-all-logs
Log:
Added ProvSQL grammar. Added query templates for comparison/correlation query patterns. Added Datalog import for experimentation. 


Added: provenancedb/ProvSQL.g
===================================================================
--- provenancedb/ProvSQL.g	                        (rev 0)
+++ provenancedb/ProvSQL.g	2011-08-09 13:16:29 UTC (rev 4968)
@@ -0,0 +1,309 @@
+grammar ProvSQL;
+
+@header {
+	import java.util.HashSet;
+	import java.util.HashMap;
+	import java.util.Iterator;
+	import org.jgrapht.*;
+	import org.jgrapht.alg.DijkstraShortestPath;
+	import org.jgrapht.graph.*;
+}
+
+@members {
+	String selectClause = new String();
+	String fromClause = new String();
+	String whereClauseJoinExpressions = new String();
+	String whereClause = new String(); 
+	boolean hasWhereJoinExpression;
+	HashSet<String> relations = new HashSet<String>();
+	UndirectedGraph<String,DefaultEdge> schemaGraph;
+	HashSet<DefaultEdge> joinEdges;
+	
+	// Ideally it could receive a DB schema in SQL and build the graph automatically
+	public static UndirectedGraph<String,DefaultEdge> buildGraph() {
+		UndirectedGraph<String,DefaultEdge> schemaGraph = new Multigraph<String,DefaultEdge>(DefaultEdge.class);
+
+		schemaGraph.addVertex("a_run_t");
+		schemaGraph.addVertex("a_run_n");
+		schemaGraph.addVertex("run");
+		schemaGraph.addVertex("proc");
+		schemaGraph.addVertex("a_proc_n");
+		schemaGraph.addVertex("a_proc_t");
+		schemaGraph.addVertex("app_inv");
+		schemaGraph.addVertex("app_exec");
+		schemaGraph.addVertex("rt_info");
+		schemaGraph.addVertex("ds_in");
+		schemaGraph.addVertex("ds_out");
+		schemaGraph.addVertex("ds");
+		schemaGraph.addVertex("file");
+		schemaGraph.addVertex("in_mem");
+		schemaGraph.addVertex("a_ds_t");
+		schemaGraph.addVertex("a_ds_n");
+		schemaGraph.addVertex("ds_cont");
+		schemaGraph.addEdge("a_run_t", "run");
+		schemaGraph.addEdge("a_run_n", "run");
+		schemaGraph.addEdge("run","proc");
+		schemaGraph.addEdge("proc", "a_proc_t");
+		schemaGraph.addEdge("proc", "a_proc_n");
+		schemaGraph.addEdge("proc", "ds_out");
+		schemaGraph.addEdge("proc", "ds_in");
+		schemaGraph.addEdge("proc", "app_inv");
+		schemaGraph.addEdge("app_inv", "app_exec");
+		schemaGraph.addEdge("app_exec", "rt_info");
+		schemaGraph.addEdge("ds", "ds_in");
+		schemaGraph.addEdge("ds", "ds_out");
+		schemaGraph.addEdge("ds", "a_ds_t");
+		schemaGraph.addEdge("ds", "a_ds_n");
+		schemaGraph.addEdge("ds", "file");
+		schemaGraph.addEdge("ds", "in_mem");
+		schemaGraph.addEdge("ds", "ds_cont");
+		schemaGraph.addEdge("ds", "ds_cont");
+
+		return schemaGraph;
+	}
+
+	private static HashSet<DefaultEdge> computeJoinEdges(
+			UndirectedGraph<String, DefaultEdge> schemaGraph,
+			HashSet<String> relations) {
+		HashSet<DefaultEdge> jEdges = new HashSet<DefaultEdge>();
+		Iterator<String> i = relations.iterator();
+		String first = new String();
+		if(i.hasNext())
+			first += i.next();
+		while(i.hasNext()) {
+			DijkstraShortestPath<String, DefaultEdge> sP = new DijkstraShortestPath<String, DefaultEdge>(schemaGraph, first, i.next());
+			Iterator<DefaultEdge> j = (sP.getPathEdgeList()).iterator();
+			while(j.hasNext())
+				jEdges.add(j.next());
+		}
+
+		
+		return jEdges;
+	}
+
+	
+	public static String computeFrom(UndirectedGraph<String,DefaultEdge> schemaGraph, HashSet<DefaultEdge> joinEdges, HashSet<String> qrels) {
+		HashSet<String> fromRels = new HashSet<String>();
+		String fromq = " FROM ";
+		Iterator<DefaultEdge> i = joinEdges.iterator();
+		while(i.hasNext()) {
+			DefaultEdge aux = i.next();
+			// If ds_in or ds_out were not in the original select clause's relations and they are on the joinEdges
+			// then one has to make sure that both consumed and produced datasets are considered in the join so there
+			// is no loss of information. One alternative, implemented here, is to replace these occurrences by the ds
+			// view, which is a union of ds_in and ds_out.
+			if(qrels.contains("ds_in") || qrels.contains("ds_out")) {
+				fromRels.add(schemaGraph.getEdgeSource(aux));
+				fromRels.add(schemaGraph.getEdgeTarget(aux));				
+			}
+			else {
+				if(aux.equals(schemaGraph.getEdge("ds_in","proc")) || 
+						aux.equals(schemaGraph.getEdge("ds_in","ds")) ||
+						aux.equals(schemaGraph.getEdge("ds_out","proc")) ||
+						aux.equals(schemaGraph.getEdge("ds_out","ds"))) {
+					fromRels.add("ds");
+					fromRels.add("ds_use");
+					fromRels.add("proc");
+				}
+				else {
+					fromRels.add(schemaGraph.getEdgeSource(aux));
+					fromRels.add(schemaGraph.getEdgeTarget(aux));				
+				}
+
+			}
+		}
+		Iterator<String> j = fromRels.iterator();
+		if(j.hasNext())
+			fromq += j.next();
+		while(j.hasNext())
+			fromq+=","+j.next();
+		
+		return fromq;
+	}
+
+
+	public static String computeJoinExpressions(UndirectedGraph<String,DefaultEdge> schemaGraph, HashSet<DefaultEdge> jEdges, HashSet<String> qrels) {
+
+		HashMap<DefaultEdge,String> joinExpressions = new HashMap<DefaultEdge, String>();
+		String joinExpressionsString = new String();
+
+		joinExpressions.put(schemaGraph.getEdge("a_run_t", "run"), "a_run_t.run_id=run.id");
+		joinExpressions.put(schemaGraph.getEdge("a_run_n", "run"), "a_run_n.run_id=run.id");
+		joinExpressions.put(schemaGraph.getEdge("run", "proc"), "run.id=proc.run_id");
+		joinExpressions.put(schemaGraph.getEdge("proc", "a_proc_t"), "proc.id=a_proc_t.proc_id");
+		joinExpressions.put(schemaGraph.getEdge("proc", "a_proc_n"), "proc.id=a_proc_n.proc_id");
+		joinExpressions.put(schemaGraph.getEdge("proc", "ds_out"), "proc.id=ds_out.proc_id");
+		joinExpressions.put(schemaGraph.getEdge("proc", "ds_in"), "proc.id=ds_in.proc_id");
+		joinExpressions.put(schemaGraph.getEdge("proc", "app_inv"), "proc.id=app_inv.id");
+		joinExpressions.put(schemaGraph.getEdge("app_inv", "app_exec"), "app_inv.id=app_exec.app_inv_id");
+		joinExpressions.put(schemaGraph.getEdge("app_exec", "rt_info"), "app_exec.id=rt_info.app_exec_id");
+		joinExpressions.put(schemaGraph.getEdge("ds", "ds_in"), "ds.id=ds_in.id");
+		joinExpressions.put(schemaGraph.getEdge("ds", "ds_out"), "ds.id=ds_out.id");
+		joinExpressions.put(schemaGraph.getEdge("ds", "a_ds_t"), "ds.id=a_ds_t.ds_id");
+		joinExpressions.put(schemaGraph.getEdge("ds", "a_ds_n"), "ds.id=a_ds_n.ds_id");
+		joinExpressions.put(schemaGraph.getEdge("ds", "file"), "ds.id=file.id");
+		joinExpressions.put(schemaGraph.getEdge("ds", "in_mem"), "ds.id=in_mem.id");
+		joinExpressions.put(schemaGraph.getEdge("ds", "ds_cont"), "ds.id=ds_cont.in_id");
+		joinExpressions.put(schemaGraph.getEdge("ds", "ds_cont"), "ds.id=ds_cont.out_id");
+
+		Iterator<DefaultEdge> i = jEdges.iterator();
+		if(i.hasNext()) {
+			DefaultEdge aux = i.next();
+			if(qrels.contains("ds_in") || qrels.contains("ds_out")) {
+				joinExpressionsString = joinExpressions.get(aux);
+			}
+			else {
+				if(aux.equals(schemaGraph.getEdge("ds_in","proc")) || aux.equals(schemaGraph.getEdge("ds_out","proc")))
+					joinExpressionsString = "ds_use.proc_id=proc.id";
+				else if(aux.equals(schemaGraph.getEdge("ds_in","ds")) || aux.equals(schemaGraph.getEdge("ds_out","ds"))) 
+					joinExpressionsString = "ds_use.ds_id=ds.id";
+				else {
+					joinExpressionsString = joinExpressions.get(aux);
+				}
+
+			}    		
+		}
+
+
+		while(i.hasNext()) {
+			DefaultEdge aux = i.next();
+			if(qrels.contains("ds_in") || qrels.contains("ds_out")) {
+				joinExpressionsString += " AND " + joinExpressions.get(aux);
+			}
+			else {
+				if(aux.equals(schemaGraph.getEdge("ds_in","proc")) || aux.equals(schemaGraph.getEdge("ds_out","proc")))
+					joinExpressionsString += " AND " + "ds_use.proc_id=proc.id";
+				else if(aux.equals(schemaGraph.getEdge("ds_in","ds")) || aux.equals(schemaGraph.getEdge("ds_out","ds"))) 
+					joinExpressionsString += " AND " + "ds_use.ds_id=ds.id";
+				else {
+					joinExpressionsString += " AND " + joinExpressions.get(aux);
+				}
+
+			}    		
+		}
+
+		return joinExpressionsString;
+	}
+	
+}
+
+query	:	SELECT selectExpression 
+		{ 
+			schemaGraph = buildGraph();
+			joinEdges = computeJoinEdges(schemaGraph, relations);
+			hasWhereJoinExpression=true;
+
+			System.out.print("SELECT " + selectClause);
+			fromClause += computeFrom(schemaGraph, joinEdges, relations);
+
+			System.out.print(fromClause);
+			
+			whereClauseJoinExpressions += computeJoinExpressions(schemaGraph, joinEdges, relations);
+			
+			if(!whereClauseJoinExpressions.isEmpty()) {
+				hasWhereJoinExpression=true;
+				System.out.print(" WHERE " + whereClauseJoinExpressions);
+			}
+
+		}
+		(WHERE whereExpression
+		{
+			if(hasWhereJoinExpression)
+				System.out.print(",");
+			else
+				System.out.print(" WHERE ");
+			System.out.print(whereClause);
+		}
+		)? 
+		SEMICOLON 
+		{
+			System.out.print(";");
+		}
+		;     
+
+selectExpression
+	:	a=entityAttribute 
+		{ 
+			selectClause += $a.text; 
+			relations.add($a.text.split("\\.")[0]);
+			if($a.text.split("\\.").length == 1)
+				selectClause +=  ".*";
+		}
+		(COLON b=entityAttribute 
+		{ 
+			selectClause += "," + $b.text; 
+			relations.add($b.text.split("\\.")[0]);
+			if($b.text.split("\\.").length == 1)
+				selectClause +=  ".*";
+		}
+		)*
+	;
+
+whereExpression	
+	:	c=whereAtom
+		{
+			whereClause += $c.text;
+		} 
+		(
+			(AND 
+			{
+				whereClause += " AND ";
+			}
+			| OR
+			{
+				whereClause += " OR ";
+			}
+			) d=whereAtom
+			{
+				whereClause += $d.text;
+			}
+		)* 
+	;
+
+whereAtom 
+	:	entityAttribute OP (STRING | INT | FLOAT)
+	|	entityAttribute BETWEEN STRING AND STRING;
+
+entityAttribute	:	ID (DOT ID)?;
+
+OP	:	'=' | '>' | '>=' | '<' | '<=';
+
+SELECT 	:	's' 'e' 'l' 'e' 'c' 't';
+
+WHERE	:	'w' 'h' 'e' 'r' 'e';
+
+AND	:	'a' 'n' 'd';
+
+OR	:	'o' 'r';
+
+DOT	:	'.';
+
+COLON	:	',';
+
+BETWEEN	:	'b' 'e' 't' 'w' 'e' 'e' 'n';
+
+SEMICOLON	:	';';
+
+ID  :	('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'0'..'9'|'_'|'-')*
+    ;
+
+INT :	'0'..'9'+
+    ;
+
+FLOAT
+    :   ('0'..'9')+ '.' ('0'..'9')* 
+    |   '.' ('0'..'9')+ 
+    |   ('0'..'9')+ 
+    ;
+
+STRING
+    :  '\'' ( 'a'..'z' | 'A'..'Z' | '_' | '-' | '0'..'9' | '.')* '\''
+    ;
+    
+NEWLINE	:	'\r' ?	'\n';
+
+WS	:	(' ' |'\t' |'\n' |'\r' )+	
+	{
+		skip();
+	}
+	;

Added: provenancedb/compare_run.py
===================================================================
--- provenancedb/compare_run.py	                        (rev 0)
+++ provenancedb/compare_run.py	2011-08-09 13:16:29 UTC (rev 4968)
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+
+import sys
+selectClause = 'SELECT run_id'
+fromClause = 'FROM'
+nId = 0
+for arg in sys.argv:
+    argTokens = arg.partition('=')
+    
+    if argTokens[0] == 'annot_num' or argTokens[0] == 'annot_txt' or argTokens[0] == 'param':
+        key = argTokens[2]
+        nId+=1
+        sId = 'j%s' % nId
+        selectClause += ', ' + sId + '.value as ' + key
+        if nId>1:
+            fromClause += ' INNER JOIN'
+        fromClause += ' compare_run_by_' + argTokens[0] + '(\'' + key + '\') as ' + sId
+        if nId>1:
+            fromClause += ' USING (run_id)'
+
+query = selectClause + ' ' + fromClause + ';'
+
+print query
+        
+    
+    
+    


Property changes on: provenancedb/compare_run.py
___________________________________________________________________
Added: svn:executable
   + *

Added: provenancedb/compare_run.sh
===================================================================
--- provenancedb/compare_run.sh	                        (rev 0)
+++ provenancedb/compare_run.sh	2011-08-09 13:16:29 UTC (rev 4968)
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+PROVDIR=$(dirname $0)
+pushd $PROVDIR
+PROVDIR=$(pwd)
+popd
+
+source $PROVDIR/etc/provenance.config
+export PATH=$PROVDIR:$PATH
+
+# TODO: Check python's version, should be >=2.6
+echo "DROP VIEW temp;" | $SQLCMD
+temp_view=$(python $PROVDIR/compare_run.py $@)
+echo "CREATE VIEW temp AS " $temp_view | $SQLCMD
+echo "SELECT * FROM temp;" | $SQLCMD
\ No newline at end of file


Property changes on: provenancedb/compare_run.sh
___________________________________________________________________
Added: svn:executable
   + *

Added: provenancedb/import-run-to-datalog
===================================================================
--- provenancedb/import-run-to-datalog	                        (rev 0)
+++ provenancedb/import-run-to-datalog	2011-08-09 13:16:29 UTC (rev 4968)
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# this should be the main driver script which can be run right after a
+# workflow has finished and will do everything necessary to import all
+# of the provenance information into the Datalog provenance database.
+
+# invoke with:   import-run-to-datalog [logfile]
+# where [logfile] is a full path *not relative* to the log file
+# with kickstart records expected to be in the same directory as the
+# log file.
+
+version=$version prov-to-datalog.sh $1
+


Property changes on: provenancedb/import-run-to-datalog
___________________________________________________________________
Added: svn:executable
   + *

Modified: provenancedb/pql_functions.sql
===================================================================
--- provenancedb/pql_functions.sql	2011-08-09 11:16:34 UTC (rev 4967)
+++ provenancedb/pql_functions.sql	2011-08-09 13:16:29 UTC (rev 4968)
@@ -5,7 +5,6 @@
 
 -- lists variations in a parameter's value across workflows, for parameters that are in-memory variables
 drop view in_mem_in cascade;
-
 create view in_mem_in as
        select proc.run_id, proc.id as proc_id, 
               proc.name as proc_name, ds_in.ds_id, 
@@ -14,7 +13,6 @@
        where  proc.id=ds_in.proc_id and ds_in.ds_id=in_mem.id;
 
 drop view in_mem_out cascade;
-
 create view in_mem_out as
        select proc.run_id, proc.id as proc_id, 
               proc.name as proc_name, ds_out.ds_id, 
@@ -23,14 +21,12 @@
        where  proc.id=ds_out.proc_id and ds_out.ds_id=in_mem.id;
  
 drop view in_mem_use cascade;
-	    
 create view in_mem_use as
        select * from in_mem_in
      union
        select * from in_mem_out;      
 
 drop view file_in cascade;
-
 create view file_in as
        select proc.run_id, proc.id as proc_id, 
               proc.name as proc_name, ds_in.ds_id, 
@@ -39,7 +35,6 @@
        where  proc.id=ds_in.proc_id and ds_in.ds_id=file.id;
 
 drop view file_out cascade;
-
 create view file_out as
        select proc.run_id, proc.id as proc_id, 
               proc.name as proc_name, ds_out.ds_id, 
@@ -48,21 +43,18 @@
        where  proc.id=ds_out.proc_id and ds_out.ds_id=file.id;
  	    
 drop view file_use cascade;
-
 create view file_use as
        select * from file_in
      union
        select * from file_out;
 
-drop view ds_use cascade;
-
-create view ds_use as
+drop view ds_param_value cascade;
+create view ds_param_value as
        select * from in_mem_use
      union
        select * from file_use;
 
 drop view a_t cascade;
-
 create view a_t as
        select run.id, a_run_t.name, a_run_t.value
        from   run, a_run_t 
@@ -77,7 +69,6 @@
        where  ds_use.ds_id=a_ds_t.ds_id;
 
 drop view a_n cascade;
-
 create view a_n as
        select run.id, a_run_n.name, a_run_n.value
        from   run, a_run_n 
@@ -91,18 +82,23 @@
        from   ds_use, a_ds_n
        where  ds_use.ds_id=a_ds_n.ds_id;
 
+drop view ds_use cascade;
+create view ds_use as
+       select * from ds_in
+     union all
+       select * from ds_out;
 
 
 
-drop type compare_run_by_parameter_type cascade;
-create type compare_run_by_parameter_type as (run_id varchar, param varchar, value varchar);
+drop type compare_run_by_param_type cascade;
+create type compare_run_by_param_type as (run_id varchar, param varchar, value varchar);
 
-create or replace function compare_run_by_parameter(param_name varchar)
-returns setof compare_run_by_parameter_type
+create or replace function compare_run_by_param(param_name varchar)
+returns setof compare_run_by_param_type
 as $$
    select run_id, param, value
-   from   ds_use
-   where  param=$1;
+   from   ds_use,proc,in_mem
+   where  proc.id=ds_use.proc_id and ds_use.ds_id=in_mem.id and param=$1;
 $$ language sql;
 
 -- PostgreSQL >= 9.0
@@ -126,19 +122,7 @@
 --    GROUP BY proc.run_id, ds_in.param, in_mem.value	
 --$$ LANGUAGE SQL;
 
-DROP TYPE compare_run_by_parameter_type2; 
-CREATE TYPE compare_run_by_parameter_type2 AS (run_id VARCHAR, param1 VARCHAR, value1 VARCHAR, param2 VARCHAR, value2 VARCHAR);
 
-CREATE OR REPLACE FUNCTION compare_run_by_parameter(param_name1 VARCHAR, param_name2 VARCHAR) 
-RETURNS SETOF compare_run_by_parameter_type2
-AS $$
-  SELECT * 
-  FROM   compare_run_by_parameter($1) as t 
-         INNER JOIN 
-         compare_run_by_parameter($2) as s 
-         USING (run_id); 
-$$ LANGUAGE SQL;
-
 --CREATE OR REPLACE FUNCTION compare_run_by_parameter(param_name1 VARCHAR, param_name2 VARCHAR) 
 --RETURNS TABLE (
 --  workflow_id VARCHAR, 
@@ -156,73 +140,105 @@
 --$$ LANGUAGE SQL;
 
 
-CREATE OR REPLACE FUNCTION compare_run_by_parameter(param_name1 VARCHAR, param_name2 VARCHAR, param_name3 VARCHAR) 
-RETURNS TABLE (
-  workflow_id VARCHAR, 
-  param_name1 VARCHAR, 
-  value1 VARCHAR, 
-  param_name2 VARCHAR, 
-  value2 VARCHAR, 
-  param_name3 VARCHAR, 
-  value3 VARCHAR
-) 
-AS $$
-  SELECT * 
-  FROM   compare_run_by_parameter($1, $2) as t 
-         INNER JOIN 
-         compare_run_by_parameter($3) as s 
-         USING (workflow_id); 
-$$ LANGUAGE SQL;
+DROP TYPE compare_run_by_annot_num_type;
+CREATE TYPE compare_run_by_annot_num_type as (run_id VARCHAR, name VARCHAR, value NUMERIC);
 
-
 CREATE OR REPLACE FUNCTION compare_run_by_annot_num(name VARCHAR)
-RETURNS TABLE (
-  workflow_id VARCHAR, 
-  name VARCHAR, 
-  value NUMERIC
-)
+RETURNS SETOF compare_run_by_annot_num_type
 AS $$
-    SELECT process.workflow_id, annot_ds_num.name, annot_ds_num.value
-    FROM   annot_ds_num,ds_usage,ds_containment,process
-    WHERE  annot_ds_num.id=ds_containment.in_id AND ds_containment.out_id=ds_usage.dataset_id AND
-           ds_usage.process_id=process.id AND annot_ds_num.name=$1
+    SELECT proc.run_id, a_ds_n.name, a_ds_n.value
+    FROM   a_ds_n,ds_use,ds_cont,proc
+    WHERE  a_ds_n.ds_id=ds_cont.in_id AND ds_cont.out_id=ds_use.ds_id AND
+           ds_use.proc_id=proc.id AND a_ds_n.name=$1
   UNION
-    SELECT process.workflow_id, annot_ds_num.name, annot_ds_num.value 
-    FROM   process, ds_usage, annot_ds_num
-    WHERE  process.id=ds_usage.process_id and ds_usage.dataset_id=annot_ds_num.id and
-           annot_ds_num.name=$1
+    SELECT proc.run_id, a_ds_n.name, a_ds_n.value 
+    FROM   proc, ds_use, a_ds_n
+    WHERE  proc.id=ds_use.proc_id and ds_use.ds_id=a_ds_n.ds_id and
+           a_ds_n.name=$1
   UNION
-    SELECT process.workflow_id, annot_p_num.name, annot_p_num.value 
-    FROM   process, annot_p_num
-    WHERE  process.id=annot_p_num.id and annot_p_num.name=$1
+    SELECT proc.run_id, a_proc_n.name, a_proc_n.value 
+    FROM   proc, a_proc_n
+    WHERE  proc.id=a_proc_n.proc_id and a_proc_n.name=$1
   UNION
-    SELECT workflow.id as workflow_id, annot_wf_num.name, annot_wf_num.value 
-    FROM   workflow, annot_wf_num
-    WHERE  workflow.id=annot_wf_num.id and annot_wf_num.name=$1
+    SELECT run.id as run_id, a_run_n.name, a_run_n.value 
+    FROM   run, a_run_n
+    WHERE  run.id=a_run_n.run_id and a_run_n.name=$1
 $$ LANGUAGE SQL;
 
+DROP TYPE compare_run_by_annot_txt_type;
+CREATE TYPE compare_run_by_annot_txt_type as (run_id VARCHAR, name VARCHAR, value VARCHAR);
 
 CREATE OR REPLACE FUNCTION compare_run_by_annot_txt(name VARCHAR)
-RETURNS TABLE (
-  workflow_id VARCHAR, 
-  name VARCHAR, 
-  value VARCHAR) 
+RETURNS SETOF compare_run_by_annot_txt_type
 AS $$
-    SELECT   process.workflow_id, annot_ds_txt.name, annot_ds_txt.value 
-    FROM     process, ds_usage, annot_ds_txt
-    WHERE    process.id=ds_usage.process_id and ds_usage.dataset_id=annot_ds_txt.id and
-             annot_ds_txt.name=$1
+    SELECT proc.run_id, a_ds_t.name, a_ds_t.value
+    FROM   a_ds_t,ds_use,ds_cont,proc
+    WHERE  a_ds_t.ds_id=ds_cont.in_id AND ds_cont.out_id=ds_use.ds_id AND
+           ds_use.proc_id=proc.id AND a_ds_t.name=$1
   UNION
-    SELECT   process.workflow_id, annot_p_txt.name, annot_p_txt.value 
-    FROM     process, annot_p_txt
-    WHERE    process.id=annot_p_txt.id and annot_p_txt.name=$1
+    SELECT proc.run_id, a_ds_t.name, a_ds_t.value 
+    FROM   proc, ds_use, a_ds_t
+    WHERE  proc.id=ds_use.proc_id and ds_use.ds_id=a_ds_t.ds_id and
+           a_ds_t.name=$1
   UNION
-    SELECT   workflow.id as workflow_id, annot_wf_txt.name, annot_wf_txt.value 
-    FROM     workflow, annot_wf_txt
-    WHERE    workflow.id=annot_wf_txt.id and annot_wf_txt.name=$1
+    SELECT proc.run_id, a_proc_t.name, a_proc_t.value 
+    FROM   proc, a_proc_t
+    WHERE  proc.id=a_proc_t.proc_id and a_proc_t.name=$1
+  UNION
+    SELECT run.id as run_id, a_run_t.name, a_run_t.value 
+    FROM   run, a_run_t
+    WHERE  run.id=a_run_t.run_id and a_run_t.name=$1
 $$ LANGUAGE SQL;
 
 
+-- CREATE OR REPLACE FUNCTION compare_run_by_annot_num(name VARCHAR)
+-- RETURNS TABLE (
+--   workflow_id VARCHAR, 
+--   name VARCHAR, 
+--   value NUMERIC
+-- )
+-- AS $$
+--     SELECT process.workflow_id, annot_ds_num.name, annot_ds_num.value
+--     FROM   annot_ds_num,ds_usage,ds_containment,process
+--     WHERE  annot_ds_num.id=ds_containment.in_id AND ds_containment.out_id=ds_usage.dataset_id AND
+--            ds_usage.process_id=process.id AND annot_ds_num.name=$1
+--   UNION
+--     SELECT process.workflow_id, annot_ds_num.name, annot_ds_num.value 
+--     FROM   process, ds_usage, annot_ds_num
+--     WHERE  process.id=ds_usage.process_id and ds_usage.dataset_id=annot_ds_num.id and
+--            annot_ds_num.name=$1
+--   UNION
+--     SELECT process.workflow_id, annot_p_num.name, annot_p_num.value 
+--     FROM   process, annot_p_num
+--     WHERE  process.id=annot_p_num.id and annot_p_num.name=$1
+--   UNION
+--     SELECT workflow.id as workflow_id, annot_wf_num.name, annot_wf_num.value 
+--     FROM   workflow, annot_wf_num
+--     WHERE  workflow.id=annot_wf_num.id and annot_wf_num.name=$1
+-- $$ LANGUAGE SQL;
+
+
+-- CREATE OR REPLACE FUNCTION compare_run_by_annot_txt(name VARCHAR)
+-- RETURNS TABLE (
+--   workflow_id VARCHAR, 
+--   name VARCHAR, 
+--   value VARCHAR) 
+-- AS $$
+--     SELECT   process.workflow_id, annot_ds_txt.name, annot_ds_txt.value 
+--     FROM     process, ds_usage, annot_ds_txt
+--     WHERE    process.id=ds_usage.process_id and ds_usage.dataset_id=annot_ds_txt.id and
+--              annot_ds_txt.name=$1
+--   UNION
+--     SELECT   process.workflow_id, annot_p_txt.name, annot_p_txt.value 
+--     FROM     process, annot_p_txt
+--     WHERE    process.id=annot_p_txt.id and annot_p_txt.name=$1
+--   UNION
+--     SELECT   workflow.id as workflow_id, annot_wf_txt.name, annot_wf_txt.value 
+--     FROM     workflow, annot_wf_txt
+--     WHERE    workflow.id=annot_wf_txt.id and annot_wf_txt.name=$1
+-- $$ LANGUAGE SQL;
+
+
 CREATE OR REPLACE FUNCTION compare_run_by_annot_bool(name VARCHAR)
 RETURNS TABLE (
   workflow_id VARCHAR,
@@ -262,16 +278,20 @@
 $$ LANGUAGE SQL;
 
 -- recursive query to find ancestor entities in a provenance graph
-CREATE OR REPLACE FUNCTION ancestors(varchar) RETURNS SETOF varchar AS $$
-       WITH RECURSIVE anc(ancestor,descendant) AS
-         (    
-              SELECT parent AS ancestor, child AS descendant FROM parent_of WHERE child=$1
-            UNION
-              SELECT parent_of.parent AS ancestor, anc.descendant AS descendant
-              FROM   anc,parent_of
-              WHERE  anc.ancestor=parent_of.child
-         )
-       SELECT ancestor FROM anc
+CREATE OR REPLACE FUNCTION ancestors(varchar) 
+RETURNS SETOF varchar AS $$
+  WITH RECURSIVE anc(ancestor,descendant) AS
+  (    
+       SELECT parent AS ancestor, child AS descendant 
+       FROM   parent_of 
+       WHERE child=$1
+     UNION
+       SELECT parent_of.parent AS ancestor, 
+              anc.descendant AS descendant
+       FROM   anc,parent_of
+       WHERE  anc.ancestor=parent_of.child
+  )
+  SELECT ancestor FROM anc
 $$ LANGUAGE SQL;
 
 

Modified: provenancedb/prov-init.sql
===================================================================
--- provenancedb/prov-init.sql	2011-08-09 11:16:34 UTC (rev 4967)
+++ provenancedb/prov-init.sql	2011-08-09 13:16:29 UTC (rev 4968)
@@ -42,7 +42,7 @@
 -- has an entry for this process.
 -- process types: internal, rootthread, execute, function, compound, scope, operator
 create table proc
-    (id     varchar(256) primary key, 
+           (id     varchar(256) primary key, 
      type   varchar(16),
      name   varchar(256), -- in the case of an execute this refers to the transformation name in tc.data
      run_id varchar(256) references run (id) on delete cascade   -- normalize: workflow_id of sub-procedure determined
@@ -187,11 +187,17 @@
      primary key (run_id, name)
    );
 
+create table iq
+   ( idx      serial primary key,
+     q        varchar(2048)
+   );
+
+drop view pgraph_edge;
 create view pgraph_edge as 
        select proc_id as parent,ds_id as child from ds_out
-       union
+       union all
        select ds_id as parent,proc_id as child from ds_in
-       union 
+       union all
        select out_id as parent,in_id as child from ds_cont;
 
 -- continue renaming from here

Added: provenancedb/prov-to-datalog.sh
===================================================================
--- provenancedb/prov-to-datalog.sh	                        (rev 0)
+++ provenancedb/prov-to-datalog.sh	2011-08-09 13:16:29 UTC (rev 4968)
@@ -0,0 +1,186 @@
+#!/bin/bash
+
+export RUNID=$(basename $1 .log)
+
+export WFID="execute:${RUNID}:"
+
+# TODO is there already a URI form for identifying workflows?
+export WF="${RUNID}"
+
+echo Generating Datalog for $RUNID
+
+# this gives a distinction between the root process for a workflow and the
+# workflow itself. perhaps better to model the workflow as a process
+echo "isProcess('${WFID}0')." > tmp.datalog
+echo "hasType('${WFID}0', 'rootthread')." >> tmp.datalog
+echo "hasName('${WFID}0', 'name')." >> tmp.datalog
+echo "isInScriptRun('${WFID}0', '$WF')." >> tmp.datalog
+
+while read time duration thread localthread endstate tr_name scratch; do
+    echo "isProcess('$thread')." >> tmp.datalog
+    echo "hasType('$thread', 'execute')." >> tmp.datalog
+    echo "hasName('$thread', '$tr_name')." >> tmp.datalog
+    echo "hasStartTime('$thread', $time)." >> tmp.datalog
+    echo "hasDuration('$thread', $duration)." >> tmp.datalog
+    echo "hasFinalState('$thread', '$endstate')." >> tmp.datalog
+done < execute.global.event
+
+while read start_time duration globalid id endstate thread site scratch; do
+    # cut off the last component of the thread, so that we end up at the
+    # parent thread id which should correspond with the execute-level ID
+    inv_id="$WFID$(echo $thread | sed 's/-[^-]*$//')"  >> tmp.datalog
+    echo "isExecutionAttempt('$globalid')."  >> tmp.datalog
+    echo "attemptsToExecute('$globalid', '$inv_id')."  >> tmp.datalog
+    echo "hasStartTime('$globalid', $start_time)."  >> tmp.datalog
+    echo "hasDuration('$globalid', $duration)."  >> tmp.datalog
+    echo "hasFinalState('$globalid', '$endstate')."  >> tmp.datalog 
+    echo "executedAtSite('$globalid', '$site')." >> tmp.datalog
+done < execute2.global.event
+
+while read col1 col2 col3 col4 col5 thread name lhs rhs result; do
+    thread=$(echo $thread | awk 'BEGIN { FS = "=" }; {print $2}')
+    name=$(echo $name | awk 'BEGIN { FS = "=" }; {print $2}')
+    lhs=$(echo $lhs | awk 'BEGIN { FS = "=" }; {print $2}')
+    rhs=$(echo $rhs | awk 'BEGIN { FS = "=" }; {print $2}')
+    result=$(echo $result | awk 'BEGIN { FS = "=" }; {print $2}')
+    
+    operatorid="${WFID}operator:$thread"
+    
+    if [ $version -le 3726 ]; then
+	lhs=$(echo $lhs | sed -e 's/tag:benc@ci.uchicago.edu,2008:swift://g')
+	rhs=$(echo $rhs | sed -e 's/tag:benc@ci.uchicago.edu,2008:swift://g')
+	result=$(echo $result | sed -e 's/tag:benc@ci.uchicago.edu,2008:swift://g')
+    fi
+    
+    echo "isDataset('$lhs')." >> tmp.datalog
+    echo "isDataset('$rhs')." >> tmp.datalog
+    echo "isDataset('$result')." >> tmp.datalog
+    echo "isProcess('$operatorid')." >> tmp.datalog
+    echo "hasType('$operatorid', 'operator')." >> tmp.datalog
+    echo "hasName('$operatorid', '$name')." >> tmp.datalog
+    echo "used('$operatorid', '$lhs', 'lhs')." >> tmp.datalog
+    echo "used('$operatorid', '$rhs', 'rhs')." >> tmp.datalog
+    echo "wasGeneratedBy('$result', '$operatorid', 'result')." >> tmp.datalog
+done < operators.txt
+
+while read id name output; do
+    echo "isDataset('$output')." >> tmp.datalog
+    echo "isProcess('$id')." >> tmp.datalog
+    echo "hasType('$id', 'function')." >> tmp.datalog
+    echo "hasName('$id', '$name')." >> tmp.datalog
+    echo "isInScriptRun('$id', '$WF')." >> tmp.datalog
+    echo "wasGeneratedBy('$output', '$id', 'result')." >> tmp.datalog
+done < functions.txt
+
+while read id value; do
+    # TODO need ordering/naming
+    echo "isDataset('$value')." >> tmp.datalog
+    echo "wasGeneratedBy('$value', '$id', 'undefined')." >> tmp.datalog
+done < function-inputs.txt
+
+
+while read thread appname; do
+    echo "hasName('$thread', '$appname')." >> tmp.datalog
+done < invocation-procedure-names.txt
+
+while read outer inner; do
+    echo "isDataset('$outer')." >> tmp.datalog
+    echo "isDataset('$inner')." >> tmp.datalog
+    echo "isContainedIn('$inner', '$outer')." >> tmp.datalog
+done < tie-containers.txt
+
+while read dataset filename; do
+
+    if [ $version -le 3726 ]; then
+	dataset=$(echo $dataset | sed -e 's/tag:benc@ci.uchicago.edu,2008:swift://g')
+    fi
+    echo "isDataset('$dataset')."  >> tmp.datalog
+    echo "hasFileName('$dataset', '$filename')." >> tmp.datalog
+done < dataset-filenames.txt
+
+while read dataset idtype equal value rest; do
+
+    if [ $version -le 3726 ]; then
+	dataset=$(echo $dataset | sed -e 's/tag:benc@ci.uchicago.edu,2008:swift://g')
+    fi
+
+    echo "isDataset('$dataset')."  >> tmp.datalog
+    echo "hasValue('$dataset', '$value')." >> tmp.datalog
+done < dataset-values.txt
+
+while read start duration wfid rest; do
+    echo "hasStartTime('$WF', $start)."  >> tmp.datalog
+    echo "hasDuration('$WF', $duration)." >> tmp.datalog
+done < workflow.event
+
+
+# TODO this could merge with other naming tables
+while read start duration thread final_state procname ; do
+    if [ "$duration" != "last-event-line" ]; then
+	compoundid=$WFID$thread
+	echo "isProcess('$compoundid')." >> tmp.datalog
+	echo "hasType('compound')." >> tmp.datalog
+	echo "hasName('$procname')."  >> tmp.datalog
+	echo "isInScriptRun('$compoundid', '$WF')." >> tmp.datalog
+    fi
+done < compound.event
+
+while read start duration thread final_state procname ; do
+    if [ "$duration" != "last-event-line" ]; then
+	fqid=$WFID$thread
+	echo "isProcess('$fqid')." >> tmp.datalog
+	echo "hasType('internal')." >> tmp.datalog
+	echo "hasName('$procname')."  >> tmp.datalog
+	echo "isInScriptRun('$fqid', '$WF')." >> tmp.datalog
+    fi	
+done < internalproc.event
+
+while read t ; do 
+    thread="${WFID}$t"
+    echo "isProcess('$thread')." >> tmp.datalog
+    echo "hasType('scope')." >> tmp.datalog
+    echo "hasName('scope')."  >> tmp.datalog
+    echo "isInScriptRun('$thread', '$WF')." >> tmp.datalog    
+done < scopes.txt
+
+while read thread direction dataset variable rest; do 
+
+    dataset=$(echo $dataset | sed -e 's/tag:benc@ci.uchicago.edu,2008:swift://g')
+    echo "isDataset('$dataset')." >> tmp.datalog 
+
+    if [ "$direction" == "input" ] ; then
+	echo "used('$thread', '$dataset', '$variable')." >> tmp.datalog
+    else
+	echo "wasGeneratedBy('$dataset', '$thread', '$variable')." >> tmp.datalog
+    fi
+    
+done < tie-data-invocs.txt
+
+if [ -f extrainfo.txt ]; then
+    while read execute2_id extrainfo; do
+	echo $extrainfo | awk -F ";"  '{ for (i = 1; i <= NF; i++)
+                                               print $i
+                                         }' | awk -F "=" '{ print $1 " " $2 }' | awk -F ":" '{ print $1 " " $2 }' > fields.txt
+	id=$($SQLCMD --tuples-only -c "select app_inv_id from app_exec where id='$execute2_id';" | awk '{print $1}')
+	while read name type value; do
+	    # TODO: check types
+	    echo "hasAnnotation('$id', '$name', '$value')." >> tmp.datalog
+	done < fields.txt 
+    done < extrainfo.txt
+fi
+
+if [ -f runtime.txt ]; then
+    while read execute2_id runtime; do
+	timestamp=$(echo $runtime | awk -F "," '{print $1}' | awk -F ":" '{print $2}')
+	cpu_usage=$(echo $runtime | awk -F "," '{print $2}' | awk -F ":" '{print $2}')
+	max_phys_mem=$(echo $runtime | awk -F "," '{print $3}' | awk -F ":" '{print $2}')
+	max_virtual_mem=$(echo $runtime | awk -F "," '{print $4}' | awk -F ":" '{print $2}')
+	io_read_bytes=$(echo $runtime | awk -F "," '{print $5}' | awk -F ":" '{print $2}')
+	io_write_bytes=$(echo $runtime | awk -F "," '{print $6}' | awk -F ":" '{print $2}')
+	echo "hasRuntimeInfo('$execute2_id', $timestamp, $cpu_usage, $max_phys_mem, $max_virtual_mem, $io_read_bytes, $io_write_bytes)." >> tmp.datalog
+    done < runtime.txt
+fi
+
+cat tmp.datalog | sort | uniq >> provenancedb.datalog
+
+echo Finished writing Datalog.
\ No newline at end of file


Property changes on: provenancedb/prov-to-datalog.sh
___________________________________________________________________
Added: svn:executable
   + *

Modified: provenancedb/prov-to-sql.sh
===================================================================
--- provenancedb/prov-to-sql.sh	2011-08-09 11:16:34 UTC (rev 4967)
+++ provenancedb/prov-to-sql.sh	2011-08-09 13:16:29 UTC (rev 4968)
@@ -39,7 +39,7 @@
 	rhs=$(echo $rhs | sed -e 's/tag:benc at ci.uchicago.edu,2008:swift://g')
 	result=$(echo $result | sed -e 's/tag:benc at ci.uchicago.edu,2008:swift://g')
     fi
-    
+  
     $SQLCMD -c  "INSERT INTO ds (id) VALUES ('$lhs');"
     $SQLCMD -c  "INSERT INTO ds (id) VALUES ('$rhs');"
     $SQLCMD -c  "INSERT INTO ds (id) VALUES ('$result');"
@@ -96,7 +96,7 @@
     $SQLCMD -c "INSERT INTO file (id, name) VALUES ('$dataset', '$filename');"
 done < dataset-filenames.txt
 
-while read dataset value; do
+while read dataset idtype equal value rest; do
 
     if [ $version -le 3726 ]; then
 	dataset=$(echo $dataset | sed -e 's/tag:benc at ci.uchicago.edu,2008:swift://g')

Deleted: provenancedb/prov-to-swipl.sh
===================================================================
--- provenancedb/prov-to-swipl.sh	2011-08-09 11:16:34 UTC (rev 4967)
+++ provenancedb/prov-to-swipl.sh	2011-08-09 13:16:29 UTC (rev 4968)
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-rm -f tmp-import.pl
-
-while read time duration thread endstate app; do
-  echo "execute('$thread', $time, $duration, '$endstate', '$app')." >> tmp-import.pl
-done < $LOGDIR/execute.event
-
-while read thread direction dataset variable value rest; do 
-  if [ "$direction" == "input" ] ; then
-    dir=I
-  else
-    dir=O
-  fi
-  echo "dataset_usage('$thread', '$dir', '$dataset', '$variable', '$value')." >> tmp-import.pl
-done < $LOGDIR/tie-data-invocs.txt
-
-while read thread appname; do
-  echo "invocation_procedure_names('$thread', '$appname')." >> tmp-import.pl
-
-done < $LOGDIR/invocation-procedure-names.txt
-
-while read outer inner; do
-  echo "dataset_containment('$outer', '$inner')." >> tmp-import.pl
-done < $LOGDIR/tie-containers.txt
-
-while read dataset filename; do
-  echo "dataset_filenames('$dataset', '$filename')." >> tmp-import.pl
-done < $LOGDIR/dataset-filenames.txt
-
-swipl -g "['tmp-import.pl']"
-

Added: provenancedb/provenancedb-rules.datalog
===================================================================
--- provenancedb/provenancedb-rules.datalog	                        (rev 0)
+++ provenancedb/provenancedb-rules.datalog	2011-08-09 13:16:29 UTC (rev 4968)
@@ -0,0 +1,45 @@
+isArtifact(X) :- isDataset(X).
+
+%% used(Process, Dataset, Role).
+%% wasGeneratedBy(Dataset, Process, Role).
+isProcess(X) :- used(X,_,_).
+isProcess(Y) :- wasGeneratedBy(_,Y,_).
+
+isDataset(Y) :- used(_,Y,_).
+isDataset(X) :- wasGeneratedBy(X,_,_).
+
+:- table isAncestor/2.
+isAncestor(X,Y) :- used(Y,X,_).
+isAncestor(X,Y) :- used(Y,Z,_),isAncestor(X,Z).
+isAncestor(X,Y) :- wasGeneratedBy(Y,X,_).
+isAncestor(X,Y) :- wasGeneratedBy(Y,Z,_),isAncestor(X,Z).
+%%isAncestor(X,Y) :- isContainedIn(X,Y).
+%%isAncestor(X,Y) :- isContainedIn(Z,Y),isAncestor(X,Z).
+
+:- table wasDerivedFrom/2.
+wasDerivedFrom(X,Y) :- used(Z,Y,_),wasGeneratedBy(X,Z,_).
+
+:- table wasTriggeredBy/2.
+wasTriggeredBy(X,Y) :- wasGeneratedBy(Z,Y,_),used(X,Z,_).
+
+:- table isContainedInTC/2.
+isContainedInTC(X,Y) :- isContainedIn(X,Y).
+isContainedInTC(X,Y) :- isContainedIn(X,Z),isContainedInTC(Z,Y).
+
+:- table wasDerivedFromTC/2.
+wasDerivedFromTC(X,Y) :- wasDerivedFrom(X,Y).
+wasDerivedFromTC(X,Y) :- wasDerivedFrom(X,Z),wasDerivedFromTC(Z,Y).
+
+:- table wasTriggeredByTC/2.
+wasTriggeredByTC(X,Y) :- wasTriggeredBy(X,Y).
+wasTriggeredByTC(X,Y) :- wasTriggeredBy(X,Z),wasTriggeredByTC(Z,Y).
+
+%% NOTE(review): base isInRun facts (process-to-run membership) appear to be
+%% asserted by the importer -- confirm; only propagation rules are given here.
+isInRun(D,X) :- wasGeneratedBy(D,P,_),isInRun(P,X).
+isInRun(D,X) :- used(P,D,_),isInRun(P,X).
+
+%% NOTE(review): L is a singleton in [N|L]; confirm intended list semantics.
+runCorrelation(R,D,[N|L],V) :- isRun(R),isDataset(D),isInRun(D,R),hasAnnotation(D,N,V).
+runCorrelation(R,P,[N|L],V) :- isRun(R),isProcess(P),isInRun(P,R),hasAnnotation(P,N,V).
+%% (removed a truncated duplicate fragment of the process-correlation rule)

Modified: provenancedb/swift-prov-import-all-logs
===================================================================
--- provenancedb/swift-prov-import-all-logs	2011-08-09 11:16:34 UTC (rev 4967)
+++ provenancedb/swift-prov-import-all-logs	2011-08-09 13:16:29 UTC (rev 4968)
@@ -13,6 +13,7 @@
 
 # this generates a file with tuples like:
 # <starttime> <swift version> <logfilename> 
+# This is where Swift's version is collected.
 swift-plot-log $LOGREPO everylog-vs-versions.data
 
 if [ "$?" != "0" ]; then

Added: provenancedb/swift-prov-import-all-logs-datalog
===================================================================
--- provenancedb/swift-prov-import-all-logs-datalog	                        (rev 0)
+++ provenancedb/swift-prov-import-all-logs-datalog	2011-08-09 13:16:29 UTC (rev 4968)
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+PROVDIR=$(dirname $0)
+pushd $PROVDIR
+PROVDIR=$(pwd)
+popd
+
+# we need to keep this out of the log-processing dir because import
+# of individual runs will clean other files.
+
+source $PROVDIR/etc/provenance.config
+export PATH=$PROVDIR:$PATH
+
+# this generates a file with tuples like:
+# <starttime> <swift version> <logfilename> 
+# This is where Swift's version is collected.
+swift-plot-log $LOGREPO everylog-vs-versions.data
+
+if [ "$?" != "0" ]; then
+    echo swift-plot-log failed when building everylog-vs-versions.data
+    exit 1
+fi
+
+# TODO better tmp handling than always using the same name in a shared
+# directory
+cp everylog-vs-versions.data /tmp/
+
+echo first commandline param is $1
+if [ "$1" == "rebuild" ]; then
+    echo CLEANING DATABASE
+    cat $PROVDIR/provenancedb-rules.datalog > provenancedb.datalog
+fi
+
+while read start version filename; do
+    
+    export IDIR=$(echo $filename | sed 's/\.log$/.d/')
+    echo IDIR=$IDIR
+    if [ $version -ge 1538 ]; then
+	echo -n "Log: $filename ... "
+	# A run is already imported iff a hasLogFilename/2 fact names its log.
+	EXISTING=$(grep "hasLogFilename('.*', '$filename')." provenancedb.datalog)
+	
+	if [ -z "$EXISTING" ];  then
+	    PROV_ENABLED=$(grep provenanceid $filename | wc -l)
+	    if [ $PROV_ENABLED -gt 0 ]; then
+		echo IMPORTING
+		
+		if grep --silent "Loader Swift finished with no errors" $filename; then
+		    wfstatus="SUCCESS"
+    		else
+		    wfstatus="FAIL"
+		fi
+		
+		export RUNID=$(basename $filename .log)
+		
+		export WF="${RUNID}"
+		echo WP1
+		echo "isRun('$WF')." >> provenancedb.datalog
+		echo "hasLogFilename('$WF', '$filename')." >> provenancedb.datalog
+		echo "hasSwiftVersion('$WF', '$version')." >> provenancedb.datalog
+		echo "hasFinalState('$WF', '$wfstatus')." >> provenancedb.datalog
+
+		echo version $version in log file $filename
+		echo ============= will import =============
+		prepare-for-import $filename
+		if [ "$?" != "0" ]; then
+		    echo prepare-for-import failed
+		    exit 2
+		fi
+		version=$version import-run-to-datalog $filename
+		if [ "$?" != "0" ]; then
+		    echo import-run-to-datalog failed
+		    exit 3
+		fi
+	    else 
+		echo SKIP: provenance.log not set to true in etc/swift.properties
+	    fi
+	else
+	    echo SKIP: Already known in workflow
+	fi
+    fi
+done < /tmp/everylog-vs-versions.data


Property changes on: provenancedb/swift-prov-import-all-logs-datalog
___________________________________________________________________
Added: svn:executable
   + *

Deleted: provenancedb/xq.xq
===================================================================
--- provenancedb/xq.xq	2011-08-09 11:16:34 UTC (rev 4967)
+++ provenancedb/xq.xq	2011-08-09 13:16:29 UTC (rev 4968)
@@ -1,6 +0,0 @@
-for $t in //tie
-  let $dataset := //dataset[@identifier=$t/dataset]
-  let $exec := //execute[thread=$t/thread]
-  where $t/direction="input"
-  return <r>An invocation of {$exec/trname} took input {$dataset/filename}</r>
-




More information about the Swift-commit mailing list