[Swift-commit] r4459 - in SwiftApps/SwiftR/Swift: R exec
tga at ci.uchicago.edu
tga at ci.uchicago.edu
Mon May 9 13:48:40 CDT 2011
Author: tga
Date: 2011-05-09 13:48:40 -0500 (Mon, 09 May 2011)
New Revision: 4459
Modified:
SwiftApps/SwiftR/Swift/R/Workers.R
SwiftApps/SwiftR/Swift/exec/start-swift
SwiftApps/SwiftR/Swift/exec/start-swift-daemon
Log:
Ironing out some rough edges with ssh.
Modified: SwiftApps/SwiftR/Swift/R/Workers.R
===================================================================
--- SwiftApps/SwiftR/Swift/R/Workers.R 2011-05-09 18:05:15 UTC (rev 4458)
+++ SwiftApps/SwiftR/Swift/R/Workers.R 2011-05-09 18:48:40 UTC (rev 4459)
@@ -202,16 +202,15 @@
"--tc.file", shQuote(tc.file), "--sites.file", shQuote(sites.file))
}
- # launch asynchronously
- # for now, we will rely on the shell script's output to inform
- # the user if there was a problem with the workers
+ # launch server.
out <- system(cmdString, intern=TRUE)
- if (length(out) != 2)
+ if (length(out) != 3 && length(out) != 2) {
stop(paste("Unexpected output from start-swift: '",
paste(out, collapse="\n"), "'", "Launching may have failed"))
+
+ }
pid <- out[[1]]
workdir <- out[[2]]
- cat("Started worker manager with pid ", pid, "\n")
output <- list()
output$pid <- pid
@@ -219,18 +218,29 @@
output$workdir <- workdir
output$cores <- cores
output$nodes <- nodes
+
+ if (length(out) == 2) {
+ # didn't get status message
+ killWorkerProcess(output, quiet=T)
+ stop("swiftInit failed: no status message from start-swift\n")
+ }
+ stat <- out[[3]]
+ if (stat == "ok") {
+ cat("Started worker manager with pid ", pid, "\n")
+ # store worker info
+ .swift.workers[[length(.swift.workers) + 1]] <<- output
+ # add hook to ensure child process will be killed when
+ # this process exits
+ addHook()
- # store worker info
- .swift.workers[[length(.swift.workers) + 1]] <<- output
+ # Sleep to give start-swift time to set up fifos,etc
- # add hook to ensure child process will be killed when
- # this process exits
- addHook()
-
- # Sleep to give start-swift time to set up fifos,etc
- #Sys.sleep(2)
-
- return (invisible(output))
+ return (invisible(output))
+ }
+ else {
+ killWorkerProcess(output, quiet=T)
+ stop(paste("swiftInit failed with error:", stat))
+ }
}
swiftShutdown <- function(handle=NULL, all=FALSE) {
@@ -263,12 +273,18 @@
}
# shut down all worker processes using kill
for (worker in workers) {
+ killWorkerProcess(worker)
+ }
+}
+
+killWorkerProcess <- function (worker, quiet=F) {
+ if (! quiet) {
cat(paste("Terminating worker", worker$pid, "of type",
- worker$server, "\n"))
- cmdString <- file.path(.find.package("Swift"), "exec/killtree &> /dev/null ")
- killCmd <- paste(cmdString, worker$pid)
- system(killCmd, wait=FALSE)
+ worker$server, "\n"))
}
+ cmdString <- file.path(.find.package("Swift"), "exec/killtree &> /dev/null ")
+ killCmd <- paste(cmdString, worker$pid)
+ system(killCmd, wait=FALSE)
}
workerCount <- function (server) {
Modified: SwiftApps/SwiftR/Swift/exec/start-swift
===================================================================
--- SwiftApps/SwiftR/Swift/exec/start-swift 2011-05-09 18:05:15 UTC (rev 4458)
+++ SwiftApps/SwiftR/Swift/exec/start-swift 2011-05-09 18:48:40 UTC (rev 4459)
@@ -2,6 +2,9 @@
export TRAPEVENTS="EXIT 1 2 3 15" # Signals and conditions to trap
+# * expands to nothing if no match
+shopt -s nullglob
+
# Set the umask to prevent any access by other users:
# there is no reason why any other user should need to look at
# the temporary files, etc that we create
@@ -44,7 +47,7 @@
# mkdir -p $LOGDIR # is done with the ssh command, below
IDLETIMEOUT=$((60*60*240)) # 10 days: FIXME: make this a command line arg
-
+ sshpids=
for host in $(echo $hosts); do
timestamp=$(date "+%Y.%m%d.%H%M%S")
random=$(awk "BEGIN {printf \"%0.5d\", $RANDOM}")
@@ -71,12 +74,17 @@
done
-
- echo Started workers from ssh processes $sshpids
- echo $sshpids > $sshpidfile
- if [ "$doack" = TRUE ]; then
- echo done > ackfifo
- doack=FALSE
+ if [ "$sshpids" = "" ]; then
+ echo No ssh workers successfully launched
+ if [ "$doack" = TRUE ]; then
+ echo 'Error: no ssh workers launched' > ackfifo
+ fi
+ else
+ echo Started workers from ssh processes $sshpids
+ echo $sshpids > $sshpidfile
+ if [ "$doack" = TRUE ]; then
+ echo ok > ackfifo
+ fi
fi
}
@@ -374,11 +382,13 @@
then
echo Started workers from batch job $(cat $jobidfile)
if [ "$doack" = TRUE ]; then
- echo done > ackfifo
- doack=FALSE
+ echo ok > ackfifo
fi
else
echo Batch queue submission failed, exiting.
+ if [ "$doack" = TRUE ]; then
+ echo 'Error: no ssh workers launched' > ackfifo
+ fi
stdcleanup_start
stdcleanup_end
exit 1
@@ -575,7 +585,6 @@
# Standard cleanup actions
function stdcleanup_start {
# don't accept any more requests: unlink fifo from filesystem
- echo stdcleanup_start 1&>2
if [ -p requestpipe ]; then
rm requestpipe
fi
@@ -591,10 +600,6 @@
rm $SUBMIT_FILE
fi
fi
- if [ "$doack" = TRUE ]; then
- echo done > ackfifo
- doack=FALSE
- fi
}
echo Running in $trundir "(linked to $rundir)"
@@ -627,6 +632,9 @@
}
trap onexit $TRAPEVENTS
exitcmd=onexit
+ if [ "$doack" = TRUE ]; then
+ echo ok > ackfifo
+ fi
elif [ $server = local ]; then
if [ $cores -eq 0 ]; then
@@ -656,8 +664,7 @@
trap onexit $TRAPEVENTS
exitcmd=onexit
if [ "$doack" = TRUE ]; then
- echo done > ackfifo
- doack=FALSE
+ echo ok > ackfifo
fi
elif [ $server = ssh ]; then
@@ -675,9 +682,10 @@
stdcleanup_start
coasterservicepid="" # null: saved in case we go back to using coaster servers
trap - $TRAPEVENTS
- sshpids=$(cat $sshpidfile)
+ sshpids=$(cat $sshpidfile 2> /dev/null)
+
# echo Terminating worker processes $sshpids, starter $starterpid
- for rpfile in $(ls -1 remotepid.*); do
+ for rpfile in $(echo remotepid.*); do
rpgid=$(grep PGID= $rpfile | sed -e 's/PGID=//')
rhost=$(echo $rpfile | sed -e 's/remotepid.//')
#echo Based on $rpfile: terminating process process group $rpgid on $rhost
Modified: SwiftApps/SwiftR/Swift/exec/start-swift-daemon
===================================================================
--- SwiftApps/SwiftR/Swift/exec/start-swift-daemon 2011-05-09 18:05:15 UTC (rev 4458)
+++ SwiftApps/SwiftR/Swift/exec/start-swift-daemon 2011-05-09 18:48:40 UTC (rev 4459)
@@ -33,5 +33,12 @@
echo ${workdir}
# Wait for subprocess to let us know its ready
-cat $ackfifo > /dev/null
+res=`cat $ackfifo`
rm $ackfifo
+if [ "$res" = ok ]; then
+ echo $res
+ exit 0;
+else
+ echo $res
+ exit 1
+fi
More information about the Swift-commit
mailing list