[Swift-commit] r4562 - in SwiftApps/SwiftR/Swift: R exec
tga at ci.uchicago.edu
tga at ci.uchicago.edu
Fri Jun 3 15:27:02 CDT 2011
Author: tga
Date: 2011-06-03 15:27:01 -0500 (Fri, 03 Jun 2011)
New Revision: 4562
Modified:
SwiftApps/SwiftR/Swift/R/Workers.R
SwiftApps/SwiftR/Swift/exec/configure-server-cobalt
SwiftApps/SwiftR/Swift/exec/configure-server-crayxt
SwiftApps/SwiftR/Swift/exec/configure-server-local
SwiftApps/SwiftR/Swift/exec/configure-server-pbs
SwiftApps/SwiftR/Swift/exec/configure-server-pbsauto
SwiftApps/SwiftR/Swift/exec/configure-server-sge
SwiftApps/SwiftR/Swift/exec/configure-server-ssh
SwiftApps/SwiftR/Swift/exec/rserver.swift
SwiftApps/SwiftR/Swift/exec/start-swift
Log:
Added configurable number of task retries.
Found out that qsub sometimes returns an exit code of 0 even on failure; added extra code to catch this case.
Modified: SwiftApps/SwiftR/Swift/R/Workers.R
===================================================================
--- SwiftApps/SwiftR/Swift/R/Workers.R 2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/R/Workers.R 2011-06-03 20:27:01 UTC (rev 4562)
@@ -52,6 +52,7 @@
kernel=getOption("swift.kernel"),
workmode=getOption("swift.workmode"),
throttle=getOption("swift.throttle"),
+ retries=getOption("swift.retries"),
queue=getOption("swift.queue"),
rcmd=getOption("swift.rcmd"), time=getOption("swift.time"),
workerLogging=getOption("swift.workerLogging"),
@@ -138,6 +139,9 @@
if(! is.null(workmode) ) {
cmdString <- paste(cmdString, "-m", shQuote(workmode))
}
+ if (! is.null(retries) ) {
+ cmdString <- paste(cmdString, "-retries", shQuote(retries))
+ }
if (server == "local")
nodes <- 1
Modified: SwiftApps/SwiftR/Swift/exec/configure-server-cobalt
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-cobalt 2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-cobalt 2011-06-03 20:27:01 UTC (rev 4562)
@@ -27,7 +27,7 @@
cat >cf <<END
wrapperlog.always.transfer=false
sitedir.keep=false
-execution.retries=0
+execution.retries=$num_retries
lazy.errors=false
status.mode=provider
use.provider.staging=true
Modified: SwiftApps/SwiftR/Swift/exec/configure-server-crayxt
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-crayxt 2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-crayxt 2011-06-03 20:27:01 UTC (rev 4562)
@@ -40,7 +40,7 @@
cat >cf <<END
wrapperlog.always.transfer=false
sitedir.keep=false
-execution.retries=0
+execution.retries=$num_retries
lazy.errors=false
status.mode=provider
use.provider.staging=true
Modified: SwiftApps/SwiftR/Swift/exec/configure-server-local
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-local 2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-local 2011-06-03 20:27:01 UTC (rev 4562)
@@ -58,7 +58,7 @@
cat >cf <<END
wrapperlog.always.transfer=false
sitedir.keep=false
-execution.retries=0
+execution.retries=$num_retries
lazy.errors=false
status.mode=provider
use.provider.staging=false
Modified: SwiftApps/SwiftR/Swift/exec/configure-server-pbs
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-pbs 2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-pbs 2011-06-03 20:27:01 UTC (rev 4562)
@@ -29,7 +29,7 @@
cat >cf <<END
wrapperlog.always.transfer=false
sitedir.keep=false
-execution.retries=0
+execution.retries=$num_retries
lazy.errors=false
status.mode=provider
use.provider.staging=true
Modified: SwiftApps/SwiftR/Swift/exec/configure-server-pbsauto
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-pbsauto 2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-pbsauto 2011-06-03 20:27:01 UTC (rev 4562)
@@ -71,7 +71,7 @@
cat >cf <<END
wrapperlog.always.transfer=false
sitedir.keep=false
-execution.retries=0
+execution.retries=$num_retries
lazy.errors=false
status.mode=provider
use.provider.staging=true
Modified: SwiftApps/SwiftR/Swift/exec/configure-server-sge
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-sge 2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-sge 2011-06-03 20:27:01 UTC (rev 4562)
@@ -32,7 +32,7 @@
cat >cf <<END
wrapperlog.always.transfer=false
sitedir.keep=false
-execution.retries=0
+execution.retries=$num_retries
lazy.errors=false
status.mode=provider
use.provider.staging=true
Modified: SwiftApps/SwiftR/Swift/exec/configure-server-ssh
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-ssh 2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-ssh 2011-06-03 20:27:01 UTC (rev 4562)
@@ -25,7 +25,7 @@
cat >cf <<END
wrapperlog.always.transfer=false
sitedir.keep=false
-execution.retries=0
+execution.retries=$num_retries
lazy.errors=false
status.mode=provider
use.provider.staging=true
Modified: SwiftApps/SwiftR/Swift/exec/rserver.swift
===================================================================
--- SwiftApps/SwiftR/Swift/exec/rserver.swift 2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/exec/rserver.swift 2011-06-03 20:27:01 UTC (rev 4562)
@@ -54,13 +54,18 @@
iterate serially {
boolean done;
- string dir;
+ // array in case we get multiple requests in one read
+ string dirs[];
trace("top of loop: rserver waiting for input on", requestPipeName);
- dir = readData(requestPipeName); # Reads direct from this local pipe. Assumes Swift started in right dir.
- if (dir=="done") { done=true; } else { done=false;}
- trace("rserver: got dir", dir);
-
- # fork off thread
- process_async(dir);
+ dirs = readData(requestPipeName); # Reads direct from this local pipe. Assumes Swift started in right dir.
+ done = false;
+// if (dir=="done") { done=true; } else { done=false;}
+ foreach dir, i in dirs {
+ trace("rserver: got dir", dir);
+ if (dir != "") {
+ # fork off thread
+ process_async(dir);
+ }
+ }
} until (done);
Modified: SwiftApps/SwiftR/Swift/exec/start-swift
===================================================================
--- SwiftApps/SwiftR/Swift/exec/start-swift 2011-06-03 20:07:13 UTC (rev 4561)
+++ SwiftApps/SwiftR/Swift/exec/start-swift 2011-06-03 20:27:01 UTC (rev 4562)
@@ -379,11 +379,26 @@
else
qsub $SUBMIT_FILE > $jobidfile
succ=$?
+
fi
if [ $succ -eq 0 ]
then
- echo Started workers from batch job $(cat $jobidfile)
+
+ # torque's error code is 0 sometimes when failed: make sure there
+ # is something in job id file
+ jobid=$(cat $jobidfile)
+ jobid=`echo $jobid`
+ if [ -z "$jobid" ]; then
+ echo Batch queue submission failed, exiting.
+ if [ "$doack" = TRUE ]; then
+ echo "Error: batch submission failed: qsub returned $succ" > ackfifo
+ fi
+ stdcleanup_start
+ stdcleanup_end
+ exit 1
+ fi
+ echo Started workers from batch job $jobid
if [ "$doack" = TRUE ]; then
echo ok > ackfifo
fi
@@ -531,6 +546,7 @@
defaultSshCores=4
defaultClusterCores=8
throttle=10
+num_retries=0
hosts=no-hosts-specified
queue=NONE
project=NONE
@@ -571,6 +587,7 @@
--tc.file) tc_file=$2; shift ;;
--sites.file) sites_file=$2; shift ;;
--cf.file) cf_file=$2; shift ;;
+ -retries) num_retries=$2; verify-is-numeric num_retries $num_retries; shift ;;
*) usage; exit 1 ;;
esac
shift
More information about the Swift-commit mailing list