[Swift-commit] r2894 - provenancedb

noreply at svn.ci.uchicago.edu noreply at svn.ci.uchicago.edu
Wed Apr 29 06:39:28 CDT 2009


Author: benc
Date: 2009-04-29 06:39:27 -0500 (Wed, 29 Apr 2009)
New Revision: 2894

Added:
   provenancedb/db-to-opm.sh
Modified:
   provenancedb/prov-to-opm.sh
Log:
database-backed opm exporter (rather than using the log-processing
based OPM exporter). this is part of a shift towards using the
db as the canonical source of provenance information, rather than
log-processing output files.

Added: provenancedb/db-to-opm.sh
===================================================================
--- provenancedb/db-to-opm.sh	                        (rev 0)
+++ provenancedb/db-to-opm.sh	2009-04-29 11:39:27 UTC (rev 2894)
@@ -0,0 +1,125 @@
+#!/bin/bash
+
+echo Generating OPM for entire sqlite3 database
+
+rm -f ids.txt
+touch ids.txt
+
+mkid() {
+  if ! grep --silent "^$1\$" ids.txt  ; then
+    echo $1 >> ids.txt
+  fi
+  echo -n x
+  grep -n "^$1\$" ids.txt | cut -f 1 -d ':'
+}
+
+rm -f opm.xml
+
+echo "<opmGraph xmlns=\"http://openprovenance.org/model/v1.01.a\" xmlns:swift=\"tag:benc@ci.uchicago.edu,2008:swift:opm:20090419\">" > opm.xml
+
+# TODO - there are actually many accounts here, if compound procedure
+# nesting is regarded as presenting multiple accounts.
+# For now, emit everything into a single account, which probably
+# violates some (explicit or implicit) integrity rules.
+echo "<accounts><account id=\"base\"><value /></account></accounts>" >> opm.xml
+
+echo "<processes>" >> opm.xml
+
+sqlite3 -separator ' ' -batch provdb "select * from processes;"  |
+  while read id type ; do
+    flatid=$(mkid $id)
+    echo "  <process id=\"$flatid\">"
+    echo "    <account id=\"base\" />"
+    echo "    <value>"
+    echo "    <swift:type>$type</swift:type>"
+    echo "    <swift:uri>$id</swift:uri>"
+
+
+    if [ "$type" == "execute" ]; then
+      sqlite3 -separator ' ' -batch provdb "select * from executes where id='$id';"  | ( read  id starttime duration finalstate app scratch; echo "    <swift:executeinfo starttime=\"$starttime\" duration=\"$duration\" endstate=\"$finalstate\" app=\"$app\" scratch=\"$scratch\"/>" )
+    fi
+
+    sqlite3 -separator ' ' -batch provdb "select procedure_name from invocation_procedure_names where execute_id='$id';" | ( read pn ; echo "    <swift:name>$pn</swift:name>")
+
+   # TODO type handling for other types
+
+    echo "    </value>"
+    echo "  </process>"
+  done >> opm.xml
+
+echo "</processes>" >> opm.xml
+
+echo "<artifacts>" >> opm.xml
+
+# we need a list of all artifacts here. for now, take everything we can
+# find in the tie-data-invocs and containment tables, uniquefied.
+# This is probably the wrong thing to do?
+
+sqlite3 -separator ' ' -batch provdb "select outer_dataset_id from dataset_containment;" > tmp-dshandles.txt
+sqlite3 -separator ' ' -batch provdb "select inner_dataset_id from dataset_containment;" >> tmp-dshandles.txt
+sqlite3 -separator ' ' -batch provdb "select dataset_id from dataset_usage;" >> tmp-dshandles.txt
+
+cat tmp-dshandles.txt | sort | uniq > tmp-dshandles2.txt
+
+while read artifact ; do
+artifactid=$(mkid $artifact)
+echo "  <artifact id=\"$artifactid\">"
+echo "    <value>"
+echo "    <swift:uri>$artifact</swift:uri>"
+
+sqlite3 -separator ' ' -batch provdb "select inner_dataset_id from dataset_containment where outer_dataset_id='$artifact';" | while read innerartifact ; do
+  innerflat=$(mkid $innerartifact)
+  echo "<swift:contains ref=\"$innerflat\" />"
+ done
+
+sqlite3 -separator ' ' -batch provdb "select filename from dataset_filenames where dataset_id='$artifact';" | while read fn ; do
+  echo "<swift:filename>$fn</swift:filename>"
+ done
+
+sqlite3 -separator ' ' -batch provdb "select value from dataset_values where dataset_id='$artifact';" | while read value ; do
+  echo "<swift:value>$value</swift:value>"
+ done
+
+echo "    </value>"
+echo "    <account id=\"base\" />"
+echo "  </artifact>"
+done < tmp-dshandles2.txt >> opm.xml
+
+echo "</artifacts>" >> opm.xml
+
+echo "<causalDependencies>" >> opm.xml
+
+# other stuff can do this in any order, but here we must probably do it
+# in two passes, one for each relation, in order to satisfy schema.
+# but for now do it in a single pass...
+
+sqlite3 -separator ' ' -batch provdb "select * from dataset_usage;" |
+ while read thread direction dataset variable rest; do 
+  datasetid=$(mkid $dataset)
+  threadid=$(mkid $thread)
+  if [ "$direction" == "I" ] ; then
+    echo "  <used>"
+    echo "    <effect id=\"$threadid\" />"
+    echo "    <role value=\"$variable\" />"
+    echo "    <cause id=\"$datasetid\" />"
+    echo "    <account id=\"base\" />"
+    echo "  </used>"
+  elif [ "$direction" == "O" ] ; then
+    echo "  <wasGeneratedBy>"
+    echo "    <effect id=\"$datasetid\" />"
+    echo "    <role value=\"$variable\" />"
+    echo "    <cause id=\"$threadid\" />"
+    echo "    <account id=\"base\" />"
+    echo "  </wasGeneratedBy>"
+  else
+    echo ERROR: unknown dataset usage direction: $direction
+  fi
+done  >> opm.xml
+
+
+
+echo "</causalDependencies>" >> opm.xml
+
+echo "</opmGraph>" >> opm.xml
+echo Finished generating OPM, in opm.xml
+


Property changes on: provenancedb/db-to-opm.sh
___________________________________________________________________
Name: svn:executable
   + *

Modified: provenancedb/prov-to-opm.sh
===================================================================
--- provenancedb/prov-to-opm.sh	2009-04-29 10:21:39 UTC (rev 2893)
+++ provenancedb/prov-to-opm.sh	2009-04-29 11:39:27 UTC (rev 2894)
@@ -23,7 +23,7 @@
 # really know how should be mapped from Swift
 echo "  </process>"
 
-done < $LOGDIR/execute.global.event >> opm.xml
+done < execute.global.event >> opm.xml
 
 echo "</processes>" >> opm.xml
 
@@ -38,11 +38,11 @@
 while read outer inner; do
   echo $input
   echo $output
-done < $LOGDIR/tie-containers.txt > tmp-dshandles.txt
+done < tie-containers.txt > tmp-dshandles.txt
 
 while read t d dataset rest ; do
   echo $dataset
-done < $LOGDIR/tie-data-invocs.txt >> tmp-dshandles.txt
+done < tie-data-invocs.txt >> tmp-dshandles.txt
 
 cat tmp-dshandles.txt | sort | uniq > tmp-dshandles2.txt
 
@@ -81,7 +81,7 @@
     echo "    <account id=\"base\" />"
     echo "  </wasGeneratedBy>"
   fi
-done < $LOGDIR/tie-data-invocs.txt >> opm.xml
+done < tie-data-invocs.txt >> opm.xml
 
 
 




More information about the Swift-commit mailing list