[Swift-commit] r2849 - usertools/cio/science/blast

noreply at svn.ci.uchicago.edu noreply at svn.ci.uchicago.edu
Thu Apr 9 14:25:10 CDT 2009


Author: aespinosa
Date: 2009-04-09 14:25:09 -0500 (Thu, 09 Apr 2009)
New Revision: 2849

Added:
   usertools/cio/science/blast/tc.bgp
Removed:
   usertools/cio/science/blast/tc.data
Modified:
   usertools/cio/science/blast/blast.swift
   usertools/cio/science/blast/mapper-in.sh
   usertools/cio/science/blast/mapper-out.sh
   usertools/cio/science/blast/measure-runblast.sh
   usertools/cio/science/blast/mockblast.sh
   usertools/cio/science/blast/readseq.rb
   usertools/cio/science/blast/runblast.sh
Log:
Test framework for MTDM and vanilla+bgp_enhancement swift

Modified: usertools/cio/science/blast/blast.swift
===================================================================
--- usertools/cio/science/blast/blast.swift	2009-04-08 22:38:45 UTC (rev 2848)
+++ usertools/cio/science/blast/blast.swift	2009-04-09 19:25:09 UTC (rev 2849)
@@ -24,13 +24,23 @@
   cat @filenames(out.out) stdout=@sum;
 }
 
-BlastDatabase pir[] <filesys_mapper;location="common", pattern="UNIPROT_for_blast_14.0.seq*">;
-BlastResult out[] <ext;exec="mapper-out.sh">;
-BlastQuery input[] <ext;exec="mapper-in.sh">;
+string db=@arg("db");
+string db_prefix=@arg("db_pref");
+string start=@arg("start");
+string end=@arg("end");
+string limit=@arg("limit");
+string runid=@arg("run");
 
+BlastDatabase pir[] <filesys_mapper;location=db_prefix,
+    pattern=@strcat(db, "*")>;
+BlastResult out[] <ext;exec="mapper-out.sh", a=start, b=end,
+    d=runid, l=limit>;
+BlastQuery input[] <ext;exec="mapper-in.sh", a=start, b=end,
+    d=runid, l=limit>;
+
 foreach data,i in input {
   (out[i]) = blastall(data, pir);
 }
 
-BlastSummary sum <"summary.out">;
+BlastSummary sum <single_file_mapper;file=@strcat(runid,"/summary.out")>;
 sum = summarize(out);

Modified: usertools/cio/science/blast/mapper-in.sh
===================================================================
--- usertools/cio/science/blast/mapper-in.sh	2009-04-08 22:38:45 UTC (rev 2848)
+++ usertools/cio/science/blast/mapper-in.sh	2009-04-09 19:25:09 UTC (rev 2849)
@@ -1,18 +1,21 @@
 #!/bin/bash
 
-#while getopts ":a:b:d:" options; do
-#  case $options in
-#    a) export a=$OPTARG ;;
-#	b) export b=$OPTARG ;;
-#	d) export d=$OPTARG ;;
-#	*) exit 1;;
-#  esac
-#done
+while getopts ":a:b:d:l:" options; do
+  case $options in
+    a) export a=$OPTARG ;;
+	b) export b=$OPTARG ;;
+	d) export d=$OPTARG ;;
+    l) export l=$OPTARG ;;
+	*) exit 1;;
+  esac
+done
 
-export a=1
-export b=2
-export d=0000
-
-ls /disks/ci-gpfs/swift/blast/pir/input/$d/*.qry | \
-  perl -wnl -e '$. >= $ENV{"a"} and $. <= $ENV{"b"}
-    and print "[", $.-$ENV{"a"}, "] $_";'
+for (( i = a; i <= b; i++ )); do
+  seqid=`printf %07d $i`
+  if [[ $l == "l" ]]; then
+    prefix=$d/`printf %04d $(( i / 1000 ))`
+  else
+    prefix=$d
+  fi
+  echo [$(($i-a))] $prefix/SEQ$seqid\_*.qry
+done

Modified: usertools/cio/science/blast/mapper-out.sh
===================================================================
--- usertools/cio/science/blast/mapper-out.sh	2009-04-08 22:38:45 UTC (rev 2848)
+++ usertools/cio/science/blast/mapper-out.sh	2009-04-09 19:25:09 UTC (rev 2849)
@@ -1,22 +1,26 @@
 #!/bin/bash
 
-#while getopts ":a:b:d:" options; do
-#  case $options in
-#    a) export a=$OPTARG ;;
-#	b) export b=$OPTARG ;;
-#	d) export d=$OPTARG ;;
-#	*) exit 1;;
-#  esac
-#done
-export a=1
-export b=2
-export d=0000
+while getopts ":a:b:d:l:" options; do
+  case $options in
+    a) export a=$OPTARG ;;
+	b) export b=$OPTARG ;;
+	d) export d=$OPTARG ;;
+    l) export l=$OPTARG ;;
+	*) exit 1;;
+  esac
+done
 
-ls /disks/ci-gpfs/swift/blast/pir/input/$d/*.qry | \
-  perl -wnl -e '$. >= $ENV{"a"} and $. <= $ENV{"b"}
-    and s/qry/out/ and s/input/output/
-    and print "[", $.-$ENV{"a"}, "].out $_";' 
-ls /disks/ci-gpfs/swift/blast/pir/input/$d/*.qry | \
-  perl -wnl -e '$. >= $ENV{"a"} and $. <= $ENV{"b"}
-    and s/qry/err/ and s/input/error/
-    and print "[", $.-$ENV{"a"}, "].err $_";' 
+
+for (( i = a; i <= b; i++ )); do
+  seqid=`printf %07d $i`
+  if [[ $l == "l" ]]; then
+    prefix=$d/`printf %04d $(( i / 1000 ))`
+  else
+    prefix=$d
+  fi
+  if [ ! -d $prefix ]; then
+    mkdir -p $prefix
+  fi
+  echo [$(($i-a))].out $prefix/SEQ$seqid.out
+  echo [$(($i-a))].err $prefix/SEQ$seqid.err
+done

Modified: usertools/cio/science/blast/measure-runblast.sh
===================================================================
--- usertools/cio/science/blast/measure-runblast.sh	2009-04-08 22:38:45 UTC (rev 2848)
+++ usertools/cio/science/blast/measure-runblast.sh	2009-04-09 19:25:09 UTC (rev 2849)
@@ -1 +1,34 @@
-./swiftblast.sh - UNIPROT_for_blast_14.0.seq 100001 100010
+#!/bin/bash
+
+PATH=$PATH:$CIOROOT/bin
+
+set_vanilla()
+{
+  cat > ~/bin/swift << EOF
+#!/bin/bash
+/home/falkon/swift_vanilla/cog/modules/swift/dist/swift-svn/bin/swift $@
+EOF
+}
+
+set_mtdm()
+{
+  cat > ~/bin/swift << EOF
+#!/bin/bash
+/home/falkon/swift_working/cog/modules/swift/dist/swift-svn/bin/swift $@
+EOF
+}
+
+NSIZE="64 128 256 512 1024" 
+for i in $NSIZE; do
+  $CIOROOT/bin/falkon-start.sh default $i 60 4 1 4
+  set_vanilla
+  cobalt_id=`tail -1 $HOME/.falkonjobs | awk '{print $1}'`
+  ./runblast.sh . common/UNIPROT_for_blast_14.0.seq \
+      100001 104000 l out.run_`printf %03d $i`
+  set_mtdm
+  falkon_id=`tail -1 $HOME/.falkonjobs | awk '{print $2}'`
+  /home/zzhang/DHT/bin/treebroadcast.sh $falkon_id common/common.tar
+  ./runblast.sh . common/UNIPROT_for_blast_14.0.seq \
+      100001 104000 l out.run_`printf %03d $i`
+  qdel $cobalt_id
+done

Modified: usertools/cio/science/blast/mockblast.sh
===================================================================
--- usertools/cio/science/blast/mockblast.sh	2009-04-08 22:38:45 UTC (rev 2848)
+++ usertools/cio/science/blast/mockblast.sh	2009-04-09 19:25:09 UTC (rev 2849)
@@ -21,21 +21,24 @@
 
 BLOCKS=1310272
 
-dd if=$query of=/dev/null bs=$BLOCKS #2> /dev/null
+dd=/fuse/bin/dd
+ls=/fuse/bin/ls
 
+$dd if=$query of=/dev/null bs=$BLOCKS #2> /dev/null
+
 for i in $db; do
   if [ -h $i ]; then
     truefile=`readlink $i`
   else
     truefile=$i
   fi
-  size=`ls --block-size=$BLOCKS -s $truefile | cut -f 1 -d ' '`
+  size=`$ls --block-size=$BLOCKS -s $truefile | cut -f 1 -d ' '`
   count=$(( size / 4))
   if [ $count -eq 0 ]; then
      count=$size
   fi
   echo $count
-  dd if=$i of=/dev/null bs=$BLOCKS count=$count  #2> /dev/null
+  $dd if=$i of=/dev/null bs=$BLOCKS count=$count  #2> /dev/null
 done
 sleep 41
-dd if=/dev/zero of=$output bs=15812 count=1 #2> /dev/null
+$dd if=/dev/zero of=$output bs=15812 count=1 #2> /dev/null

Modified: usertools/cio/science/blast/readseq.rb
===================================================================
--- usertools/cio/science/blast/readseq.rb	2009-04-08 22:38:45 UTC (rev 2848)
+++ usertools/cio/science/blast/readseq.rb	2009-04-09 19:25:09 UTC (rev 2849)
@@ -21,7 +21,7 @@
   x =~ />(.*)\n/
   seqname = $1
   if seq_start <= fasta_db.lineno
-    dir = pad == "l" ? prefix +
+    dir = pad == "l" ? prefix + 
 	    sprintf("/%04d", fasta_db.lineno / 1000) : prefix
     fname = sprintf "SEQ%07d_%s.qry", fasta_db.lineno, seqname
 	FileUtils.mkdir_p dir

Modified: usertools/cio/science/blast/runblast.sh
===================================================================
--- usertools/cio/science/blast/runblast.sh	2009-04-08 22:38:45 UTC (rev 2848)
+++ usertools/cio/science/blast/runblast.sh	2009-04-09 19:25:09 UTC (rev 2849)
@@ -1,18 +1,25 @@
 #!/bin/bash
 
-# Script: swiftblast.sh [blast_db] [start_seq] [end_seq] [outdir]
+# Script: swiftblast.sh [falkon_id] [blast_db] [start_seq] [end_seq] [outdir]
 #         Invokes the swift workflow of reciprocal blast
 
 if [ $# -lt 3 ]; then
   cat << EOF
 ERROR   :  too few arguments
-Usage   :  $0 [blast_db] [start_seq] [end_seq] [outdir]
+Usage   :  $0 [falkon_id][blast_db] [start_seq] [end_seq] [outdir]
 Example :  Reciprocal blast on the 100th to 200th sequences
 of UNIPRT.seq and dump results to run_100:
 $0 UNIPROT.seq 100 200 run_100
 EOF
 fi
 
+if [ -z $CIOROOT ]; then
+  echo "ERROR: CIOROOT not defined"
+  exit 1
+fi
+
+PATH=$PATH:$CIOROOT/bin
+
 FALKON_ID=$1
 BLAST_DB=$2
 START_SEQ=$3
@@ -20,18 +27,49 @@
 
 # Limit to 1k sequences per dir "l"
 LIMIT=$5
+RUNID=$6 # Default output directory of everything!
 
 BLASTROOT=$CIOROOT/science/blast
 
+if [ ! -d $RUNID ]; then
+  mkdir -p $RUNID
+fi
+
 # Extract sequences
-ruby readseq.rb $BLAST_DB $START_SEQ $END_SEQ seqs
 
-if [ $FALKON_ID != "-" ]; then # Real Falkon job
-  sleep 0 
-  # Regenerate tc.data and sites.xml @BGP@
+ruby readseq.rb `dirname $BLAST_DB`/fasta/`basename $BLAST_DB` \
+    $START_SEQ $END_SEQ $RUNID $LIMIT
+
+if [ $FALKON_ID == "." ]; then
+  FALKON_ID=`tail -1 $HOME/.falkonjobs | awk '{print $2}'`
+fi 
+
+if [ $FALKON_ID == "-" ]; then # Real Falkon job
+  tcfile=tc.local
+  cp $tcfile tc.data
 else
-  sleep 0 
+  tcfile=tc.bgp
+  gentcdata 32 < $tcfile > tc.data
 fi
 
-# Run swift:
-#swift -sites.file sites.xml -tc.file tc.data $BLASTROOT/blast.swift
+
+if [ $FALKON_ID == "-" ]; then 
+  echo "local"
+  swift -sites.file ./sites.xml -tc.file tc.data blast.swift \
+      -db=`basename $BLAST_DB` -db_pref=`dirname $BLAST_DB`\
+      -limit=$LIMIT -start=$START_SEQ -end=$END_SEQ \
+      -run=$RUNID
+else # Real Falkon job
+  fjob=`echo $FALKON_ID|awk '{printf "%.4d",$1}'`
+  nodes=`cat $HOME/.falkonjobs | awk "\\$2==\"$fjob\" {print \\$4}"`
+  fdir=/home/falkon/users/$USER
+  if [ ! -d $fdir/$fjob ]; then
+    echo $0: Falkon job id $fjob does not exist in $fdir. Last 5 jobs there are:
+    (cd $fdir; ls -1 | tail -5)
+    exit 1
+  fi
+  /usr/bin/time -p swift_bgp.sh $fjob $nodes $BLASTROOT/blast.swift \
+      -db=`basename $BLAST_DB` -db_pref=`dirname $BLAST_DB`\
+      -limit=$LIMIT -start=$START_SEQ -end=$END_SEQ \
+      -run=$RUNID >&swift_out.$RUNID
+fi

Copied: usertools/cio/science/blast/tc.bgp (from rev 2835, usertools/cio/science/blast/tc.data)
===================================================================
--- usertools/cio/science/blast/tc.bgp	                        (rev 0)
+++ usertools/cio/science/blast/tc.bgp	2009-04-09 19:25:09 UTC (rev 2849)
@@ -0,0 +1,24 @@
+#This is the transformation catalog.
+#
+#It comes pre-configured with a number of simple transformations with
+#paths that are likely to work on a linux box. However, on some systems,
+#the paths to these executables will be different (for example, sometimes
+#some of these programs are found in /usr/bin rather than in /bin)
+#
+#NOTE WELL: fields in this file must be separated by tabs, not spaces; and
+#there must be no trailing whitespace at the end of each line.
+#
+# sitename  transformation  path   INSTALLED  platform  profiles
+localhost 	echo 		/bin/echo	INSTALLED	INTEL32::LINUX	null
+localhost 	cat 		/bin/cat	INSTALLED	INTEL32::LINUX	null
+localhost 	ls 		/bin/ls		INSTALLED	INTEL32::LINUX	null
+localhost 	grep 		/bin/grep	INSTALLED	INTEL32::LINUX	null
+localhost 	sort 		/bin/sort	INSTALLED	INTEL32::LINUX	null
+localhost 	paste 		/bin/paste	INSTALLED	INTEL32::LINUX	null
+
+# BLAST binaries
+communicado	blastall	/home/aespinosa/science/blast/ncbi.communicado/bin/blastall	INSTALLED	INTEL32::LINUX	null
+communicado	mockblast	/home/aespinosa/cio/science/blast/mockblast.sh	INSTALLED	INTEL32::LINUX	null
+@BGP@	blastall	/home/espinosa/blast/ncbi/bin/blastall	INSTALLED	INTEL32::LINUX	null
+@BGP@	mockblast	/home/espinosa/cio/science/blast/mockblast.sh	INSTALLED	INTEL32::LINUX	null
+@BGP@ 	cat 		/bin/cat	INSTALLED	INTEL32::LINUX	null

Deleted: usertools/cio/science/blast/tc.data
===================================================================
--- usertools/cio/science/blast/tc.data	2009-04-08 22:38:45 UTC (rev 2848)
+++ usertools/cio/science/blast/tc.data	2009-04-09 19:25:09 UTC (rev 2849)
@@ -1,23 +0,0 @@
-#This is the transformation catalog.
-#
-#It comes pre-configured with a number of simple transformations with
-#paths that are likely to work on a linux box. However, on some systems,
-#the paths to these executables will be different (for example, sometimes
-#some of these programs are found in /usr/bin rather than in /bin)
-#
-#NOTE WELL: fields in this file must be separated by tabs, not spaces; and
-#there must be no trailing whitespace at the end of each line.
-#
-# sitename  transformation  path   INSTALLED  platform  profiles
-localhost 	echo 		/bin/echo	INSTALLED	INTEL32::LINUX	null
-localhost 	cat 		/bin/cat	INSTALLED	INTEL32::LINUX	null
-localhost 	ls 		/bin/ls		INSTALLED	INTEL32::LINUX	null
-localhost 	grep 		/bin/grep	INSTALLED	INTEL32::LINUX	null
-localhost 	sort 		/bin/sort	INSTALLED	INTEL32::LINUX	null
-localhost 	paste 		/bin/paste	INSTALLED	INTEL32::LINUX	null
-
-# BLAST binaries
-communicado	blastall	/home/aespinosa/science/blast/ncbi.communicado/bin/blastall	INSTALLED	INTEL32::LINUX	null
-communicado	mockblast	/home/aespinosa/cio/science/blast/mockblast.sh	INSTALLED	INTEL32::LINUX	null
-surveyor	blastall	/home/espinosa/science/blast/ncbi.communicado/bin/blastall	INSTALLED	INTEL32::LINUX	null
-surveyor	mockblast	/home/espinosa/cio/science/blast/mockblast.sh	INSTALLED	INTEL32::LINUX	null




More information about the Swift-commit mailing list