[Swift-commit] r4458 - in SwiftApps/SwiftR: . Swift/R Swift/exec perftools
tga at ci.uchicago.edu
tga at ci.uchicago.edu
Mon May 9 13:05:15 CDT 2011
Author: tga
Date: 2011-05-09 13:05:15 -0500 (Mon, 09 May 2011)
New Revision: 4458
Modified:
SwiftApps/SwiftR/IMMEDIATE-TODO
SwiftApps/SwiftR/Swift/R/Workers.R
SwiftApps/SwiftR/Swift/exec/start-swift
SwiftApps/SwiftR/Swift/exec/start-swift-daemon
SwiftApps/SwiftR/perftools/parselog.py
Log:
Changing swiftInit so that it doesn't return until the workers have been launched ok. This should allow better error handling, and should allow users to type their ssh passwords.
Still to be done: test more thoroughly, and throw an error from swiftInit if the workers don't start up ok.
Modified: SwiftApps/SwiftR/IMMEDIATE-TODO
===================================================================
--- SwiftApps/SwiftR/IMMEDIATE-TODO 2011-05-09 14:27:44 UTC (rev 4457)
+++ SwiftApps/SwiftR/IMMEDIATE-TODO 2011-05-09 18:05:15 UTC (rev 4458)
@@ -16,6 +16,10 @@
startup code for one of libraries
HIGH:
+-- sometimes the server crashes after servicing first request(
+ only observed when running locally and on sge)
+
+HIGH:
-- Benchmark ideas
- Beagle
- Sarah Kenny FMRI
@@ -47,8 +51,10 @@
MED:
-- Support generic swift sites.xml and tc.data files for power users
+-- Note: support is added, but need to consider either:
+ a) guidelines for how to write the file
+ b) templates or auto-generation.
-
MED:
-- automated tests
Modified: SwiftApps/SwiftR/Swift/R/Workers.R
===================================================================
--- SwiftApps/SwiftR/Swift/R/Workers.R 2011-05-09 14:27:44 UTC (rev 4457)
+++ SwiftApps/SwiftR/Swift/R/Workers.R 2011-05-09 18:05:15 UTC (rev 4458)
@@ -207,8 +207,8 @@
# the user if there was a problem with the workers
out <- system(cmdString, intern=TRUE)
if (length(out) != 2)
- stop(paste("Unexpected output from start-swift: '", out, "'",
- "Launching may have failed"))
+ stop(paste("Unexpected output from start-swift: '",
+ paste(out, collapse="\n"), "'", "Launching may have failed"))
pid <- out[[1]]
workdir <- out[[2]]
cat("Started worker manager with pid ", pid, "\n")
@@ -228,7 +228,7 @@
addHook()
# Sleep to give start-swift time to set up fifos,etc
- Sys.sleep(2)
+ #Sys.sleep(2)
return (invisible(output))
}
Modified: SwiftApps/SwiftR/Swift/exec/start-swift
===================================================================
--- SwiftApps/SwiftR/Swift/exec/start-swift 2011-05-09 14:27:44 UTC (rev 4457)
+++ SwiftApps/SwiftR/Swift/exec/start-swift 2011-05-09 18:05:15 UTC (rev 4458)
@@ -45,11 +45,11 @@
IDLETIMEOUT=$((60*60*240)) # 10 days: FIXME: make this a command line arg
- echo "Starting to launch ssh workers on hosts: $hosts"
for host in $(echo $hosts); do
timestamp=$(date "+%Y.%m%d.%H%M%S")
random=$(awk "BEGIN {printf \"%0.5d\", $RANDOM}")
ID=$timestamp.$random
+ echo "Starting to launch worker on host: $host"
# FIXME: make logging an argument; set false by default
# fixme:send worker.pl to remote host via stdin or scp.
if ssh $host /bin/sh -c \'"mkdir -p $LOGDIR"\'
@@ -66,7 +66,7 @@
echo "Error sending file to $host"
fi
else
- echo "Error contacting $host"
+ echo "Error contacting $host or creating directory $LOGDIR on host"
fi
@@ -74,6 +74,10 @@
echo Started workers from ssh processes $sshpids
echo $sshpids > $sshpidfile
+ if [ "$doack" = TRUE ]; then
+ echo done > ackfifo
+ doack=FALSE
+ fi
}
@@ -369,6 +373,10 @@
if [ $succ -eq 0 ]
then
echo Started workers from batch job $(cat $jobidfile)
+ if [ "$doack" = TRUE ]; then
+ echo done > ackfifo
+ doack=FALSE
+ fi
else
echo Batch queue submission failed, exiting.
stdcleanup_start
@@ -525,6 +533,7 @@
# Setup a working directory
if [ "$workdir" = NONE ]
then
+ doack=FALSE
trundir=$(mktemp -d $rundir.XXXX) # FIXME: check success
if [ "$?" != "0" ]
then
@@ -532,6 +541,7 @@
exit 1
fi
else
+ doack=TRUE # let -daemon script know when we are done
echo Working in $workdir
trundir=$workdir
mkdir -p $workdir
@@ -581,6 +591,10 @@
rm $SUBMIT_FILE
fi
fi
+ if [ "$doack" = TRUE ]; then
+ echo done > ackfifo
+ doack=FALSE
+ fi
}
echo Running in $trundir "(linked to $rundir)"
@@ -641,7 +655,10 @@
trap onexit $TRAPEVENTS
exitcmd=onexit
-
+ if [ "$doack" = TRUE ]; then
+ echo done > ackfifo
+ doack=FALSE
+ fi
elif [ $server = ssh ]; then
if [ $cores -eq 0 ]; then
@@ -666,6 +683,7 @@
#echo Based on $rpfile: terminating process process group $rpgid on $rhost
echo Shutting down worker processes on $rhost
ssh $rhost sh -c \'"kill -s TERM -- -$rpgid &>/dev/null"\'
+ echo Shut down worker process on $rhost
done
if [ "_$sshpids$starterpid$coasterservicepid" != _ ]; then
echo kill $sshpids $starterpid $coasterservicepid >& /dev/null
Modified: SwiftApps/SwiftR/Swift/exec/start-swift-daemon
===================================================================
--- SwiftApps/SwiftR/Swift/exec/start-swift-daemon 2011-05-09 14:27:44 UTC (rev 4457)
+++ SwiftApps/SwiftR/Swift/exec/start-swift-daemon 2011-05-09 18:05:15 UTC (rev 4458)
@@ -13,6 +13,10 @@
mkdir -p $tmp/$USER/SwiftR
workdir=$(mktemp -d $tmp/$USER/SwiftR/swift.XXXX)
+
+ackfifo=$workdir/ackfifo
+mkfifo $ackfifo
+
if [ "$?" != "0" ]
then
echo "Could not create temporary directory under $tmp/$USER/SwiftR"
@@ -25,6 +29,9 @@
$ssscript "$@" -d $workdir 1>&2 &
childpid=$!
-
echo ${childpid}
echo ${workdir}
+
+# Wait for subprocess to let us know it's ready
+cat $ackfifo > /dev/null
+rm $ackfifo
Modified: SwiftApps/SwiftR/perftools/parselog.py
===================================================================
--- SwiftApps/SwiftR/perftools/parselog.py 2011-05-09 14:27:44 UTC (rev 4457)
+++ SwiftApps/SwiftR/perftools/parselog.py 2011-05-09 18:05:15 UTC (rev 4458)
@@ -109,7 +109,8 @@
exec_events = log_iter("vdl:execute")
-basetime = datetime.datetime(2011, 3, 18, 13, 22, 52, 231*1000)
+#basetime = datetime.datetime(2011, 3, 18, 13, 22, 52, 231*1000)
+basetime = datetime.datetime(2011, 3, 28, 19, 39, 25, 707*1000)
exec_events = list(time_converted(exec_events, basetime))
starts = [(t, priority, type, message, message.split()[1])
@@ -125,11 +126,26 @@
print s, e
def to_s(td):
- return float(td.seconds) + float(td.microseconds) / 1000000.0
+ return td.days * 24 * 60 * 60 + float(td.seconds) + float(td.microseconds) / 1000000.0
-paired = [(i, s[0], e[0]) for i,s,e in zip(range(len(starts)), starts, ends)]
+paired = [(s[0], e[0], to_s(e[0]) - to_s(s[0]))
+ for s,e in zip(starts, ends)
+ if to_s(s[0]) > 20600]
+ #if to_s(s[0]) > 300]
+paired.sort(key=itemgetter(2))
+paired.reverse()
+paired = [(i, s, e, d ) for i, (s,e,d) in zip(range(len(paired)), paired)]
+print paired
import pylab
-for i, s, e in paired:
+for i, s, e, diff in paired:
pylab.plot((to_s(s), to_s(e)), (i, i), 'r')
+pylab.xlabel("time (s)")
+pylab.ylabel("number of active bootstrap tasks.")
+
pylab.show()
+
+pylab.hist([t[3] for t in paired], bins=20)
+pylab.xlabel("time (s)")
+pylab.ylabel("# tasks")
+pylab.show()
More information about the Swift-commit
mailing list