[Swift-commit] r4459 - in SwiftApps/SwiftR/Swift: R exec

tga at ci.uchicago.edu tga at ci.uchicago.edu
Mon May 9 13:48:40 CDT 2011


Author: tga
Date: 2011-05-09 13:48:40 -0500 (Mon, 09 May 2011)
New Revision: 4459

Modified:
   SwiftApps/SwiftR/Swift/R/Workers.R
   SwiftApps/SwiftR/Swift/exec/start-swift
   SwiftApps/SwiftR/Swift/exec/start-swift-daemon
Log:
Ironing out some rough edges with ssh.


Modified: SwiftApps/SwiftR/Swift/R/Workers.R
===================================================================
--- SwiftApps/SwiftR/Swift/R/Workers.R	2011-05-09 18:05:15 UTC (rev 4458)
+++ SwiftApps/SwiftR/Swift/R/Workers.R	2011-05-09 18:48:40 UTC (rev 4459)
@@ -202,16 +202,15 @@
                 "--tc.file", shQuote(tc.file), "--sites.file", shQuote(sites.file))
     }
 
-    # launch asynchronously
-    # for now, we will rely on the shell script's output to inform
-    #  the user if there was a problem with the workers
+    # Launch the server and block until start-swift prints its pid, workdir and (normally) a status line.
     out <- system(cmdString, intern=TRUE)
-    if (length(out) != 2) 
+    if (length(out) != 3 && length(out) != 2) {
         stop(paste("Unexpected output from start-swift: '", 
             paste(out, collapse="\n"), "'", "Launching may have failed"))
+
+    }
     pid <- out[[1]]
     workdir <- out[[2]]
-    cat("Started worker manager with pid ", pid, "\n")
     
     output <- list()
     output$pid <- pid
@@ -219,18 +218,29 @@
     output$workdir <- workdir
     output$cores <- cores
     output$nodes <- nodes
+    
+    if (length(out) == 2) {
+        # didn't get status message
+        killWorkerProcess(output, quiet=T)
+        stop("swiftInit failed: no status message from start-swift\n")
+    }
+    stat <- out[[3]]
+    if (stat == "ok") {
+        cat("Started worker manager with pid ", pid, "\n")
+        # store worker info
+        .swift.workers[[length(.swift.workers) + 1]] <<- output
+        # add hook to ensure child process will be killed when 
+        # this process exits
+        addHook()
 
-    # store worker info
-    .swift.workers[[length(.swift.workers) + 1]] <<- output
+        # Sleep to give start-swift time to set up fifos,etc
 
-    # add hook to ensure child process will be killed when 
-    # this process exits
-    addHook()
-
-    # Sleep to give start-swift time to set up fifos,etc
-    #Sys.sleep(2)
-
-    return (invisible(output))
+        return (invisible(output))
+    }
+    else {
+        killWorkerProcess(output, quiet=T)
+        stop(paste("swiftInit failed with error:", stat))
+    }
 }
 
 swiftShutdown <- function(handle=NULL, all=FALSE) {
@@ -263,12 +273,18 @@
     }
     # shut down all worker processes using kill
     for (worker in workers) {
+        killWorkerProcess(worker)
+    }
+}
+
+killWorkerProcess <- function (worker, quiet=F) {
+    if (! quiet) {
         cat(paste("Terminating worker", worker$pid, "of type", 
-                    worker$server, "\n"))
-        cmdString <- file.path(.find.package("Swift"), "exec/killtree &> /dev/null ")
-        killCmd <- paste(cmdString, worker$pid)
-        system(killCmd, wait=FALSE)
+                worker$server, "\n"))
     }
+    cmdString <- file.path(.find.package("Swift"), "exec/killtree &> /dev/null ")
+    killCmd <- paste(cmdString, worker$pid)
+    system(killCmd, wait=FALSE)
 }
 
 workerCount <- function (server) {

Modified: SwiftApps/SwiftR/Swift/exec/start-swift
===================================================================
--- SwiftApps/SwiftR/Swift/exec/start-swift	2011-05-09 18:05:15 UTC (rev 4458)
+++ SwiftApps/SwiftR/Swift/exec/start-swift	2011-05-09 18:48:40 UTC (rev 4459)
@@ -2,6 +2,9 @@
 
 export TRAPEVENTS="EXIT 1 2 3 15"  # Signals and conditions to trap
 
+# Make a glob pattern (e.g. remotepid.*) expand to nothing when it matches no files
+shopt -s nullglob
+
 # Set the umask to prevent any access by other users:
 # there is no reason why any other user should need to look at
 # the temporary files, etc that we create 
@@ -44,7 +47,7 @@
   #  mkdir -p $LOGDIR # is done with the ssh command, below
 
   IDLETIMEOUT=$((60*60*240)) # 10 days: FIXME: make this a command line arg
-
+  sshpids=
   for host in $(echo $hosts); do
     timestamp=$(date "+%Y.%m%d.%H%M%S")
     random=$(awk "BEGIN {printf \"%0.5d\", $RANDOM}")
@@ -71,12 +74,17 @@
 
 
   done
-
-  echo Started workers from ssh processes $sshpids
-  echo $sshpids > $sshpidfile
-  if [ "$doack" = TRUE ]; then
-    echo done > ackfifo
-    doack=FALSE
+  if [ "$sshpids" = "" ]; then
+      echo No ssh workers successfully launched
+      if [ "$doack" = TRUE ]; then
+        echo 'Error: no ssh workers launched' > ackfifo
+      fi
+  else
+      echo Started workers from ssh processes $sshpids
+      echo $sshpids > $sshpidfile
+      if [ "$doack" = TRUE ]; then
+        echo ok > ackfifo
+      fi
   fi
 }
 
@@ -374,11 +382,13 @@
   then
     echo Started workers from batch job $(cat $jobidfile)
     if [ "$doack" = TRUE ]; then
-      echo done > ackfifo
-      doack=FALSE
+      echo ok > ackfifo
     fi
   else
     echo Batch queue submission failed, exiting.
+    if [ "$doack" = TRUE ]; then
+      echo 'Error: no ssh workers launched' > ackfifo
+    fi
     stdcleanup_start
     stdcleanup_end
     exit 1
@@ -575,7 +585,6 @@
 # Standard clenuup actions
 function stdcleanup_start {
     # don't accept any more requests: unlink fifo from filesystem
-    echo stdcleanup_start 1&>2
     if [ -p requestpipe ]; then
         rm requestpipe 
     fi
@@ -591,10 +600,6 @@
             rm $SUBMIT_FILE
         fi 
     fi
-    if [ "$doack" = TRUE ]; then
-        echo done > ackfifo
-        doack=FALSE
-    fi
 }
 
 echo Running in $trundir "(linked to $rundir)"
@@ -627,6 +632,9 @@
     }
     trap onexit $TRAPEVENTS
     exitcmd=onexit
+    if [ "$doack" = TRUE ]; then
+      echo ok > ackfifo
+    fi
 elif [ $server = local ]; then
 
   if [ $cores -eq 0 ]; then
@@ -656,8 +664,7 @@
   trap onexit $TRAPEVENTS
   exitcmd=onexit
   if [ "$doack" = TRUE ]; then
-    echo done > ackfifo
-    doack=FALSE
+    echo ok > ackfifo
   fi
 elif [ $server = ssh ]; then
   
@@ -675,9 +682,10 @@
     stdcleanup_start
     coasterservicepid="" # null: saved in case we go back to using coaster servers
     trap - $TRAPEVENTS
-    sshpids=$(cat $sshpidfile)
+    sshpids=$(cat $sshpidfile 2> /dev/null)
+
 #    echo Terminating worker processes $sshpids, starter $starterpid
-    for rpfile in $(ls -1 remotepid.*); do
+    for rpfile in $(echo remotepid.*); do
       rpgid=$(grep PGID= $rpfile | sed -e 's/PGID=//')
       rhost=$(echo $rpfile | sed -e 's/remotepid.//')
       #echo Based on $rpfile: terminating process process group $rpgid on $rhost

Modified: SwiftApps/SwiftR/Swift/exec/start-swift-daemon
===================================================================
--- SwiftApps/SwiftR/Swift/exec/start-swift-daemon	2011-05-09 18:05:15 UTC (rev 4458)
+++ SwiftApps/SwiftR/Swift/exec/start-swift-daemon	2011-05-09 18:48:40 UTC (rev 4459)
@@ -33,5 +33,12 @@
 echo ${workdir}
 
 # Wait for subprocess to let us know its ready
-cat $ackfifo > /dev/null
+res=`cat $ackfifo`
 rm $ackfifo
+if [ "$res" = ok ]; then
+    echo $res
+    exit 0;
+else
+    echo $res
+    exit 1
+fi




More information about the Swift-commit mailing list