[Swift-commit] r4458 - in SwiftApps/SwiftR: . Swift/R Swift/exec perftools

tga at ci.uchicago.edu tga at ci.uchicago.edu
Mon May 9 13:05:15 CDT 2011


Author: tga
Date: 2011-05-09 13:05:15 -0500 (Mon, 09 May 2011)
New Revision: 4458

Modified:
   SwiftApps/SwiftR/IMMEDIATE-TODO
   SwiftApps/SwiftR/Swift/R/Workers.R
   SwiftApps/SwiftR/Swift/exec/start-swift
   SwiftApps/SwiftR/Swift/exec/start-swift-daemon
   SwiftApps/SwiftR/perftools/parselog.py
Log:
Changed swiftInit so that it does not return until the workers have been launched successfully.  This should allow better error handling, and should also give users the chance to type their ssh passwords.

Still to be done: test more thoroughly, and throw an error from swiftInit if the workers do not start up successfully.



Modified: SwiftApps/SwiftR/IMMEDIATE-TODO
===================================================================
--- SwiftApps/SwiftR/IMMEDIATE-TODO	2011-05-09 14:27:44 UTC (rev 4457)
+++ SwiftApps/SwiftR/IMMEDIATE-TODO	2011-05-09 18:05:15 UTC (rev 4458)
@@ -16,6 +16,10 @@
         startup code for one of libraries
 
 HIGH:
+-- sometimes the server crashes after servicing first request(
+    only observer when running locally and on sge)
+
+HIGH:
 -- Benchmark ideas
     - Beagle
     - Sarah Kenny FMRI
@@ -47,8 +51,10 @@
 
 MED:
 -- Support generic swift sites.xml and tc.data files for power users
+-- Note: support is added, but need to consider either:
+    a) guidelines for how to write the file
+    b) templates or auto-generation.
 
-
 MED:
 -- automated tests
 

Modified: SwiftApps/SwiftR/Swift/R/Workers.R
===================================================================
--- SwiftApps/SwiftR/Swift/R/Workers.R	2011-05-09 14:27:44 UTC (rev 4457)
+++ SwiftApps/SwiftR/Swift/R/Workers.R	2011-05-09 18:05:15 UTC (rev 4458)
@@ -207,8 +207,8 @@
     #  the user if there was a problem with the workers
     out <- system(cmdString, intern=TRUE)
     if (length(out) != 2) 
-        stop(paste("Unexpected output from start-swift: '", out, "'",
-            "Launching may have failed"))
+        stop(paste("Unexpected output from start-swift: '", 
+            paste(out, collapse="\n"), "'", "Launching may have failed"))
     pid <- out[[1]]
     workdir <- out[[2]]
     cat("Started worker manager with pid ", pid, "\n")
@@ -228,7 +228,7 @@
     addHook()
 
     # Sleep to give start-swift time to set up fifos,etc
-    Sys.sleep(2)
+    #Sys.sleep(2)
 
     return (invisible(output))
 }

Modified: SwiftApps/SwiftR/Swift/exec/start-swift
===================================================================
--- SwiftApps/SwiftR/Swift/exec/start-swift	2011-05-09 14:27:44 UTC (rev 4457)
+++ SwiftApps/SwiftR/Swift/exec/start-swift	2011-05-09 18:05:15 UTC (rev 4458)
@@ -45,11 +45,11 @@
 
   IDLETIMEOUT=$((60*60*240)) # 10 days: FIXME: make this a command line arg
 
-  echo "Starting to launch ssh workers on hosts: $hosts"
   for host in $(echo $hosts); do
     timestamp=$(date "+%Y.%m%d.%H%M%S")
     random=$(awk "BEGIN {printf \"%0.5d\", $RANDOM}")
     ID=$timestamp.$random
+    echo "Starting to launch worker on host: $host"
        # FIXME: make logging an argument; set false by default
        # fixme:send worker.pl to remote host via stdin or scp.
        if ssh $host /bin/sh -c \'"mkdir -p $LOGDIR"\'
@@ -66,7 +66,7 @@
                echo "Error sending file to $host"
            fi
       else
-          echo "Error contacting $host"
+          echo "Error contacting $host or creating directory $LOGDIR on host"
       fi
 
 
@@ -74,6 +74,10 @@
 
   echo Started workers from ssh processes $sshpids
   echo $sshpids > $sshpidfile
+  if [ "$doack" = TRUE ]; then
+    echo done > ackfifo
+    doack=FALSE
+  fi
 }
 
 
@@ -369,6 +373,10 @@
   if [ $succ -eq 0 ]
   then
     echo Started workers from batch job $(cat $jobidfile)
+    if [ "$doack" = TRUE ]; then
+      echo done > ackfifo
+      doack=FALSE
+    fi
   else
     echo Batch queue submission failed, exiting.
     stdcleanup_start
@@ -525,6 +533,7 @@
 # Setup a working directory
 if [ "$workdir" = NONE ]
 then
+    doack=FALSE
     trundir=$(mktemp -d $rundir.XXXX) # FIXME: check success
     if [ "$?" != "0" ]
     then
@@ -532,6 +541,7 @@
         exit 1
     fi
 else 
+    doack=TRUE # let -daemon script know when we are done
     echo Working in $workdir
     trundir=$workdir
     mkdir -p $workdir
@@ -581,6 +591,10 @@
             rm $SUBMIT_FILE
         fi 
     fi
+    if [ "$doack" = TRUE ]; then
+        echo done > ackfifo
+        doack=FALSE
+    fi
 }
 
 echo Running in $trundir "(linked to $rundir)"
@@ -641,7 +655,10 @@
 
   trap onexit $TRAPEVENTS
   exitcmd=onexit
-
+  if [ "$doack" = TRUE ]; then
+    echo done > ackfifo
+    doack=FALSE
+  fi
 elif [ $server = ssh ]; then
   
   if [ $cores -eq 0 ]; then
@@ -666,6 +683,7 @@
       #echo Based on $rpfile: terminating process process group $rpgid on $rhost
       echo Shutting down worker processes on $rhost
       ssh $rhost sh -c \'"kill -s TERM -- -$rpgid &>/dev/null"\' 
+      echo Shut down worker process on $rhost
     done
     if [ "_$sshpids$starterpid$coasterservicepid" != _ ]; then
       echo kill $sshpids $starterpid $coasterservicepid >& /dev/null

Modified: SwiftApps/SwiftR/Swift/exec/start-swift-daemon
===================================================================
--- SwiftApps/SwiftR/Swift/exec/start-swift-daemon	2011-05-09 14:27:44 UTC (rev 4457)
+++ SwiftApps/SwiftR/Swift/exec/start-swift-daemon	2011-05-09 18:05:15 UTC (rev 4458)
@@ -13,6 +13,10 @@
 
 mkdir -p $tmp/$USER/SwiftR
 workdir=$(mktemp -d $tmp/$USER/SwiftR/swift.XXXX) 
+
+ackfifo=$workdir/ackfifo
+mkfifo $ackfifo
+
 if [ "$?" != "0" ]
 then
     echo "Could not create temporary directory under $tmp/$USER/SwiftR"
@@ -25,6 +29,9 @@
 $ssscript "$@" -d $workdir 1>&2 &
 childpid=$!
 
-
 echo ${childpid}
 echo ${workdir}
+
+# Wait for subprocess to let us know its ready
+cat $ackfifo > /dev/null
+rm $ackfifo

Modified: SwiftApps/SwiftR/perftools/parselog.py
===================================================================
--- SwiftApps/SwiftR/perftools/parselog.py	2011-05-09 14:27:44 UTC (rev 4457)
+++ SwiftApps/SwiftR/perftools/parselog.py	2011-05-09 18:05:15 UTC (rev 4458)
@@ -109,7 +109,8 @@
 exec_events = log_iter("vdl:execute")
 
 
-basetime = datetime.datetime(2011, 3, 18, 13, 22, 52, 231*1000)
+#basetime = datetime.datetime(2011, 3, 18, 13, 22, 52, 231*1000)
+basetime = datetime.datetime(2011, 3, 28, 19, 39, 25, 707*1000)
 exec_events = list(time_converted(exec_events, basetime))
 
 starts = [(t, priority, type, message, message.split()[1])
@@ -125,11 +126,26 @@
     print s, e
 
 def to_s(td):
-    return float(td.seconds) + float(td.microseconds) / 1000000.0
+    return td.days * 24 * 60 * 60 + float(td.seconds) + float(td.microseconds) / 1000000.0
 
-paired = [(i, s[0], e[0]) for i,s,e in zip(range(len(starts)), starts, ends)]
+paired = [(s[0], e[0], to_s(e[0]) - to_s(s[0])) 
+        for s,e in zip(starts, ends)
+        if to_s(s[0]) > 20600]
+        #if to_s(s[0]) > 300]
+paired.sort(key=itemgetter(2))
+paired.reverse()
+paired = [(i, s, e, d ) for i, (s,e,d) in zip(range(len(paired)), paired)]
+print paired
 import pylab
-for i, s, e in paired:
+for i, s, e, diff in paired:
     pylab.plot((to_s(s), to_s(e)), (i, i), 'r')
 
+pylab.xlabel("time (s)")
+pylab.ylabel("number of active bootstrap tasks.")
+
 pylab.show()
+
+pylab.hist([t[3] for t in paired], bins=20)
+pylab.xlabel("time (s)")
+pylab.ylabel("# tasks")
+pylab.show()




More information about the Swift-commit mailing list