[Swift-commit] r4562 - in SwiftApps/SwiftR/Swift: R exec

tga at ci.uchicago.edu tga at ci.uchicago.edu
Fri Jun 3 15:27:02 CDT 2011


Author: tga
Date: 2011-06-03 15:27:01 -0500 (Fri, 03 Jun 2011)
New Revision: 4562

Modified:
   SwiftApps/SwiftR/Swift/R/Workers.R
   SwiftApps/SwiftR/Swift/exec/configure-server-cobalt
   SwiftApps/SwiftR/Swift/exec/configure-server-crayxt
   SwiftApps/SwiftR/Swift/exec/configure-server-local
   SwiftApps/SwiftR/Swift/exec/configure-server-pbs
   SwiftApps/SwiftR/Swift/exec/configure-server-pbsauto
   SwiftApps/SwiftR/Swift/exec/configure-server-sge
   SwiftApps/SwiftR/Swift/exec/configure-server-ssh
   SwiftApps/SwiftR/Swift/exec/rserver.swift
   SwiftApps/SwiftR/Swift/exec/start-swift
Log:
Added configurable number of task retries.
Found out that qsub sometimes returns an exit code of 0 even on failure; added an extra check to catch this case.


Modified: SwiftApps/SwiftR/Swift/R/Workers.R
===================================================================
--- SwiftApps/SwiftR/Swift/R/Workers.R	2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/R/Workers.R	2011-06-03 20:27:01 UTC (rev 4562)
@@ -52,6 +52,7 @@
                     kernel=getOption("swift.kernel"), 
                     workmode=getOption("swift.workmode"),
                     throttle=getOption("swift.throttle"), 
+                    retries=getOption("swift.retries"),
                     queue=getOption("swift.queue"),
                     rcmd=getOption("swift.rcmd"), time=getOption("swift.time"),
                     workerLogging=getOption("swift.workerLogging"),
@@ -138,6 +139,9 @@
     if(! is.null(workmode) )  {
         cmdString <- paste(cmdString, "-m", shQuote(workmode)) 
     }
+    if (! is.null(retries) ) {
+        cmdString <- paste(cmdString, "-retries", shQuote(retries))
+    }
 
     if (server == "local")
         nodes <- 1

Modified: SwiftApps/SwiftR/Swift/exec/configure-server-cobalt
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-cobalt	2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-cobalt	2011-06-03 20:27:01 UTC (rev 4562)
@@ -27,7 +27,7 @@
 cat >cf <<END
 wrapperlog.always.transfer=false
 sitedir.keep=false
-execution.retries=0
+execution.retries=$num_retries
 lazy.errors=false
 status.mode=provider
 use.provider.staging=true

Modified: SwiftApps/SwiftR/Swift/exec/configure-server-crayxt
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-crayxt	2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-crayxt	2011-06-03 20:27:01 UTC (rev 4562)
@@ -40,7 +40,7 @@
 cat >cf <<END
 wrapperlog.always.transfer=false
 sitedir.keep=false
-execution.retries=0
+execution.retries=$num_retries
 lazy.errors=false
 status.mode=provider
 use.provider.staging=true

Modified: SwiftApps/SwiftR/Swift/exec/configure-server-local
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-local	2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-local	2011-06-03 20:27:01 UTC (rev 4562)
@@ -58,7 +58,7 @@
 cat >cf <<END
 wrapperlog.always.transfer=false
 sitedir.keep=false
-execution.retries=0
+execution.retries=$num_retries
 lazy.errors=false
 status.mode=provider
 use.provider.staging=false

Modified: SwiftApps/SwiftR/Swift/exec/configure-server-pbs
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-pbs	2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-pbs	2011-06-03 20:27:01 UTC (rev 4562)
@@ -29,7 +29,7 @@
 cat >cf <<END
 wrapperlog.always.transfer=false
 sitedir.keep=false
-execution.retries=0
+execution.retries=$num_retries
 lazy.errors=false
 status.mode=provider
 use.provider.staging=true

Modified: SwiftApps/SwiftR/Swift/exec/configure-server-pbsauto
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-pbsauto	2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-pbsauto	2011-06-03 20:27:01 UTC (rev 4562)
@@ -71,7 +71,7 @@
 cat >cf <<END
 wrapperlog.always.transfer=false
 sitedir.keep=false
-execution.retries=0
+execution.retries=$num_retries
 lazy.errors=false
 status.mode=provider
 use.provider.staging=true

Modified: SwiftApps/SwiftR/Swift/exec/configure-server-sge
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-sge	2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-sge	2011-06-03 20:27:01 UTC (rev 4562)
@@ -32,7 +32,7 @@
 cat >cf <<END
 wrapperlog.always.transfer=false
 sitedir.keep=false
-execution.retries=0
+execution.retries=$num_retries
 lazy.errors=false
 status.mode=provider
 use.provider.staging=true

Modified: SwiftApps/SwiftR/Swift/exec/configure-server-ssh
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-ssh	2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-ssh	2011-06-03 20:27:01 UTC (rev 4562)
@@ -25,7 +25,7 @@
 cat >cf <<END
 wrapperlog.always.transfer=false
 sitedir.keep=false
-execution.retries=0
+execution.retries=$num_retries
 lazy.errors=false
 status.mode=provider
 use.provider.staging=true

Modified: SwiftApps/SwiftR/Swift/exec/rserver.swift
===================================================================
--- SwiftApps/SwiftR/Swift/exec/rserver.swift	2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/exec/rserver.swift	2011-06-03 20:27:01 UTC (rev 4562)
@@ -54,13 +54,18 @@
 
 iterate serially {
   boolean done;
-  string dir;
+  // array in case we get multiple requests in one read
+  string dirs[];
 
   trace("top of loop: rserver waiting for input on", requestPipeName);
-  dir = readData(requestPipeName); # Reads direct from this local pipe. Assumes Swift started in right dir.
-  if (dir=="done") { done=true; } else { done=false;}
-  trace("rserver: got dir", dir);
-
-  # fork off thread
-  process_async(dir);
+  dirs = readData(requestPipeName); # Reads direct from this local pipe. Assumes Swift started in right dir.
+  done = false;
+//  if (dir=="done") { done=true; } else { done=false;}
+  foreach dir, i in dirs {
+      trace("rserver: got dir", dir);
+      if (dir != "") {
+          # fork off thread
+          process_async(dir);
+      }
+  }
 } until (done);

Modified: SwiftApps/SwiftR/Swift/exec/start-swift
===================================================================
--- SwiftApps/SwiftR/Swift/exec/start-swift	2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/exec/start-swift	2011-06-03 20:27:01 UTC (rev 4562)
@@ -379,11 +379,26 @@
   else
     qsub $SUBMIT_FILE > $jobidfile
     succ=$?
+
   fi
 
   if [ $succ -eq 0 ]
   then
-    echo Started workers from batch job $(cat $jobidfile)
+    
+    # torque's error code is 0 sometimes when failed: make sure there 
+    # is something in job id file
+    jobid=$(cat $jobidfile)
+    jobid=`echo $jobid`
+    if [ -z "$jobid" ]; then
+        echo Batch queue submission failed, exiting.
+        if [ "$doack" = TRUE ]; then
+          echo "Error: batch submission failed: qsub returned $succ" > ackfifo
+        fi
+        stdcleanup_start
+        stdcleanup_end
+        exit 1
+    fi
+    echo Started workers from batch job $jobid
     if [ "$doack" = TRUE ]; then
       echo ok > ackfifo
     fi
@@ -531,6 +546,7 @@
 defaultSshCores=4
 defaultClusterCores=8
 throttle=10
+num_retries=0
 hosts=no-hosts-specified
 queue=NONE
 project=NONE
@@ -571,6 +587,7 @@
     --tc.file) tc_file=$2; shift ;;
     --sites.file) sites_file=$2; shift ;;
     --cf.file) cf_file=$2; shift ;;
+    -retries) num_retries=$2; verify-is-numeric num_retries $num_retries; shift ;;
     *)  usage; exit 1 ;;
   esac
   shift




More information about the Swift-commit mailing list