[Swift-commit] r3681 - in SwiftApps/SwiftR/Swift: R exec man
noreply at svn.ci.uchicago.edu
Thu Oct 14 08:47:37 CDT 2010
Author: wilde
Date: 2010-10-14 08:47:37 -0500 (Thu, 14 Oct 2010)
New Revision: 3681
Added:
SwiftApps/SwiftR/Swift/exec/configure-server-local
SwiftApps/SwiftR/Swift/exec/configure-server-pbsauto
SwiftApps/SwiftR/Swift/exec/configure-server-ssh
SwiftApps/SwiftR/Swift/exec/start-merlot
Removed:
SwiftApps/SwiftR/Swift/exec/configure-site-local
SwiftApps/SwiftR/Swift/exec/configure-site-pbs
Modified:
SwiftApps/SwiftR/Swift/R/Swift.R
SwiftApps/SwiftR/Swift/exec/SwiftRServer.sh
SwiftApps/SwiftR/Swift/exec/start-swift
SwiftApps/SwiftR/Swift/man/Swift-package.Rd
Log:
Added the ability to connect PBS workers through a firewall; a temporary version of this, developed in start-merlot, has been integrated into start-swift. Also added sections to the Swift documentation page.
Modified: SwiftApps/SwiftR/Swift/R/Swift.R
===================================================================
--- SwiftApps/SwiftR/Swift/R/Swift.R 2010-10-12 03:34:27 UTC (rev 3680)
+++ SwiftApps/SwiftR/Swift/R/Swift.R 2010-10-14 13:47:37 UTC (rev 3681)
@@ -2,7 +2,7 @@
swiftserver=NULL,
callsperbatch=NULL,
runmode=NULL,
- initialize=NULL,
+ initialexpr=NULL,
workerhosts=NULL,
keepwork=NULL,
tmpdir=NULL )
@@ -27,10 +27,10 @@
# script: run swift for each request, via RunSwiftScript.sh (currently broken)
# manual: for testing, let user run remote R server manually
- if(is.null(initialize))
- initialize <- getOption("swift.initialize")
- if(is.null(initialize))
- initialize <- "";
+ if(is.null(initialexpr))
+ initialexpr <- getOption("swift.initialexpr")
+ if(is.null(initialexpr))
+ initialexpr <- "";
if(is.null(workerhosts))
workerhosts <- getOption("swift.workerhosts")
@@ -55,7 +55,7 @@
cat(" runmode =", runmode,"\n")
cat(" tmpdir =", tmpdir,"\n")
cat(" workerhosts =", workerhosts,"\n")
- cat(" initialize =", initialize,"\n\n")
+ cat(" initialexpr =", initialexpr,"\n\n")
user <- Sys.info()[["user"]]
@@ -99,7 +99,7 @@
arglistbatch[[i]] <- arglists[[arglist]]
arglist <- arglist +1
}
- rcall <- list(initializer=initialize,func=func,arglistbatch=arglistbatch)
+ rcall <- list(initializer=initialexpr,func=func,arglistbatch=arglistbatch)
save(rcall,file=paste(reqdir,"/cbatch.",as.character(batch),".Rdata",sep=""))
batch <- batch + 1;
}
@@ -190,7 +190,7 @@
options(swift.site="service")
options(swift.keepwork=TRUE)
initcmds <- "initVar1 <<- 19; initVar2 <<- sqrt(400)+3"
- options(swift.initialize=initcmds) # Set here; used in test group 4
+ options(swift.initialexpr=initcmds) # Set here; used in test group 4
}
swiftTest_1.1 <- function()
@@ -252,7 +252,7 @@
swiftTest_4.2 <- function()
{
- options(swift.initialize="initVar3 <<- 123; initVar4 <<- 100");
+ options(swift.initialexpr="initVar3 <<- 123; initVar4 <<- 100");
mulivars <- function() { initVar3*initVar4 }
@@ -535,7 +535,7 @@
failures=failures+1
}
-##### Test Group 4 # test initializer string
+##### Test Group 4 # test initialexpr string
cat("\n*** Starting test group 4 - test remote R service initialization string ***\n")
Modified: SwiftApps/SwiftR/Swift/exec/SwiftRServer.sh
===================================================================
--- SwiftApps/SwiftR/Swift/exec/SwiftRServer.sh 2010-10-12 03:34:27 UTC (rev 3680)
+++ SwiftApps/SwiftR/Swift/exec/SwiftRServer.sh 2010-10-14 13:47:37 UTC (rev 3681)
@@ -1,5 +1,6 @@
#! /usr/bin/env Rscript
+require(methods)
argv = commandArgs(TRUE)
fifoDir = argv[1]; # FIXME: test for valid arguments
@@ -50,10 +51,12 @@
result <- list()
initializer <- rcall$initializer;
+ # print(sprintf("received initializer=%s latestInitializer=%s\n",initializer, latestInitializer));
if( initializer != latestInitializer) {
initialExpr <- parse(text=initializer)
eval(initialExpr)
latestInitializer <<- initializer
+ # print(sprintf("after eval: latestInitializer=%s\n", latestInitializer));
}
for(c in 1:length(rcall$arglistbatch)) {
# FIXME: run this under try/catch and save error status in results object (need to make it a list: rval + error status)
Copied: SwiftApps/SwiftR/Swift/exec/configure-server-local (from rev 3660, SwiftApps/SwiftR/Swift/exec/configure-site-local)
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-local (rev 0)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-local 2010-10-14 13:47:37 UTC (rev 3681)
@@ -0,0 +1,69 @@
+#! /bin/bash
+
+throttleOneCore="0.00" # FIXME: test if new swft fix makes zero OK rather than -0.001
+
+cores=$1
+
+if [ -r /proc/cpuinfo ]; then
+ localcores=$(grep '^processor' /proc/cpuinfo | wc -l)
+else
+ localcores=1
+fi
+
+if [ $cores -eq 0 ]; then
+ cores=$localcores
+fi
+
+cat >tc <<END
+fork bashlocal /bin/bash null null null
+END
+
+cat >sites.xml <<END
+<config>
+ <pool handle="fork">
+ <execution provider="local" url="none" />
+ <profile key="jobThrottle" namespace="karajan">0.15</profile>
+ <profile namespace="karajan" key="initialScore">10000</profile>
+ <filesystem provider="local" url="none" />
+ <workdirectory>$(pwd)/swiftwork</workdirectory>
+ </pool>
+END
+
+for i in `seq -w 0 $((cores-1))`; do
+
+# FIXME: how to get the right PATH for R in here?
+# use /bin/sh and count on users PATH?
+# then add ENV::PATH as an option, e.g., from options(swift.remotepath and swift.remotepath.sitename)
+
+cat >>tc <<END
+local${i} bash /bin/bash null null ENV::SWIFTR_TMP="$SWIFTR_TMP";ENV::PATH="$PATH";ENV::SWIFT_JOB_SLOT="${i}";ENV::SWIFT_WORKER_PID="$$"
+END
+
+cat >>sites.xml <<END
+ <pool handle="local${i}">
+ <execution provider="local" url="none" />
+ <profile key="jobThrottle" namespace="karajan">$throttleOneCore</profile>
+ <profile namespace="karajan" key="initialScore">10000</profile>
+ <filesystem provider="local" url="none" />
+ <workdirectory>$(pwd)/swiftwork</workdirectory>
+ </pool>
+END
+
+done
+
+# <profile namespace="karajan" key="jobsPerCpu">1</profile>
+
+echo '</config>' >>sites.xml
+
+cat >cf <<END
+wrapperlog.always.transfer=true
+sitedir.keep=true
+execution.retries=0
+lazy.errors=false
+status.mode=provider
+use.provider.staging=false
+provider.staging.pin.swiftfiles=false
+throttle.host.submit=1
+END
+
+
Copied: SwiftApps/SwiftR/Swift/exec/configure-server-pbsauto (from rev 3660, SwiftApps/SwiftR/Swift/exec/configure-site-pbs)
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-pbsauto (rev 0)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-pbsauto 2010-10-14 13:47:37 UTC (rev 3681)
@@ -0,0 +1,61 @@
+#! /bin/bash
+
+throttlePBS=.31 # FIXME: parameterize thsi and several other variables, below.
+
+cat >tc <<END
+fork bashlocal /bin/bash null null null
+pbscoast bash /bin/bash null null ENV::PATH="$PATH";GLOBUS::maxwalltime="00:10:00"
+END
+
+cat >sites.xml <<END
+<config>
+ <pool handle="fork">
+ <execution provider="local" url="none" />
+ <profile key="jobThrottle" namespace="karajan">0.15</profile>
+ <profile namespace="karajan" key="initialScore">10000</profile>
+ <filesystem provider="local" url="none" />
+ <workdirectory>$(pwd)/swiftwork</workdirectory>
+ </pool>
+
+ <pool handle="pbsdirect">
+ <execution provider="pbs" url="none" />
+ <profile namespace="globus" key="queue">fast</profile>
+ <profile namespace="globus" key="maxwalltime">00:59:00</profile>
+ <profile key="jobThrottle" namespace="karajan">$throttlePBS</profile>
+ <profile namespace="karajan" key="initialScore">10000</profile>
+ <filesystem provider="local" url="none" />
+ <workdirectory>$HOME/swiftwork</workdirectory>
+ </pool>
+
+ <pool handle="pbscoast">
+ <execution provider="coaster" url="none" jobmanager="local:pbs"/>
+ <profile namespace="globus" key="queue">short</profile>
+ <profile namespace="globus" key="maxTime">12000</profile>
+ <profile namespace="globus" key="slots">32</profile>
+ <profile namespace="globus" key="nodeGranularity">1</profile>
+ <profile namespace="globus" key="maxNodes">1</profile>
+ <profile namespace="globus" key="workersPerNode">1</profile>
+ <profile namespace="karajan" key="jobThrottle">2.55</profile>
+ <profile namespace="karajan" key="initialScore">10000</profile>
+
+ <filesystem provider="local" url="none"/>
+ <workdirectory>$HOME/swiftwork</workdirectory>
+ </pool>
+</config>
+END
+
+# <profile namespace="globus" key="maxWallTime">00:00:01</profile>
+# <profile namespace="globus" key="queue">fast</profile>
+
+cat >cf <<END
+wrapperlog.always.transfer=true
+sitedir.keep=true
+execution.retries=0
+lazy.errors=false
+status.mode=provider
+use.provider.staging=false
+provider.staging.pin.swiftfiles=false
+#throttle.host.submit=1
+END
+
+
Added: SwiftApps/SwiftR/Swift/exec/configure-server-ssh
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-ssh (rev 0)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-ssh 2010-10-14 13:47:37 UTC (rev 3681)
@@ -0,0 +1,60 @@
+#! /bin/bash
+
+cores=$1
+time=$2
+
+cat >tc <<END
+fork bashlocal /bin/bash null null null
+sshcoast bash /bin/bash null null ENV::PATH="$PATH";GLOBUS::maxwalltime="$time"
+END
+
+cat >sites.xml <<END
+<config>
+ <pool handle="fork">
+ <execution provider="local" url="none" />
+ <profile key="jobThrottle" namespace="karajan">0.15</profile>
+ <profile namespace="karajan" key="initialScore">10000</profile>
+ <filesystem provider="local" url="none" />
+ <workdirectory>$(pwd)/swiftwork</workdirectory>
+ </pool>
+
+ <pool handle="sshcoast">
+ <execution provider="coaster" url="none" jobmanager="local:NA"/>
+ <profile namespace="globus" key="workerManager">passive</profile>
+ <profile namespace="globus" key="workersPerNode">$cores</profile>
+ <profile namespace="karajan" key="jobThrottle">2.55</profile>
+ <profile namespace="karajan" key="initialScore">10000</profile>
+ <filesystem provider="local" url="none"/>
+ <workdirectory>$HOME/swiftwork</workdirectory>
+ </pool>
+
+</config>
+END
+
+cat >cf <<END
+wrapperlog.always.transfer=true
+sitedir.keep=true
+execution.retries=0
+lazy.errors=false
+status.mode=provider
+use.provider.staging=false
+provider.staging.pin.swiftfiles=false
+#throttle.host.submit=1
+END
+
+true to save <<END
+
+# <profile namespace="globus" key="maxWallTime">00:00:01</profile>
+# <profile namespace="globus" key="queue">fast</profile>
+
+ <pool handle="persistent-coasters-model">
+ <!-- <execution provider="coaster-persistent" url="http://$(hostname -f):${SERVICEPORT}" jobmanager="local:local"/> -->
+ <execution provider="coaster" jobmanager="local:local"/>
+ <profile namespace="globus" key="workerManager">passive</profile>
+ <profile namespace="globus" key="workersPerNode">4</profile>
+ <profile key="jobThrottle" namespace="karajan">.03</profile>
+ <profile namespace="karajan" key="initialScore">10000</profile>
+ <filesystem provider="local" url="none" />
+ <workdirectory>$HOME/swiftwork</workdirectory>
+ </pool>
+END
Deleted: SwiftApps/SwiftR/Swift/exec/configure-site-local
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-site-local 2010-10-12 03:34:27 UTC (rev 3680)
+++ SwiftApps/SwiftR/Swift/exec/configure-site-local 2010-10-14 13:47:37 UTC (rev 3681)
@@ -1,67 +0,0 @@
-#! /bin/bash
-
-throttleOneCore="-0.001"
-throttleOneCore="0.00" # FIXME: test if new swft fix makes zero OK rather than -0.001
-
-if [ -r /proc/cpuinfo ]; then
- localcores=$(grep '^processor' /proc/cpuinfo | wc -l)
-else
- localcores=4
-fi
-
-#### DBDBDBDB vvvvvv
-#localcores=1
-
-cat >tc <<END
-fork bashlocal /bin/bash null null null
-END
-
-cat >sites.xml <<END
-<config>
- <pool handle="fork">
- <execution provider="local" url="none" />
- <profile key="jobThrottle" namespace="karajan">0.15</profile>
- <profile namespace="karajan" key="initialScore">10000</profile>
- <filesystem provider="local" url="none" />
- <workdirectory>$(pwd)/swiftwork</workdirectory>
- </pool>
-END
-
-for i in `seq -w 0 $((localcores-1))`; do
-
-# FIXME: how to get the right PATH for R in here?
-# use /bin/sh and count on users PATH?
-# then add ENV::PATH as an option, e.g., from options(swift.remotepath and swift.remotepath.sitename)
-
-cat >>tc <<END
-local${i} bash /bin/bash null null ENV::SWIFTR_TMP="$SWIFTR_TMP";ENV::PATH="$PATH";ENV::SWIFT_JOB_SLOT="${i}";ENV::SWIFT_WORKER_PID="$$"
-END
-
-cat >>sites.xml <<END
- <pool handle="local${i}">
- <execution provider="local" url="none" />
- <profile key="jobThrottle" namespace="karajan">$throttleOneCore</profile>
- <profile namespace="karajan" key="initialScore">10000</profile>
- <filesystem provider="local" url="none" />
- <workdirectory>$(pwd)/swiftwork</workdirectory>
- </pool>
-END
-
-done
-
-# <profile namespace="karajan" key="jobsPerCpu">1</profile>
-
-echo '</config>' >>sites.xml
-
-cat >cf <<END
-wrapperlog.always.transfer=true
-sitedir.keep=true
-execution.retries=0
-lazy.errors=false
-status.mode=provider
-use.provider.staging=false
-provider.staging.pin.swiftfiles=false
-throttle.host.submit=1
-END
-
-
Deleted: SwiftApps/SwiftR/Swift/exec/configure-site-pbs
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-site-pbs 2010-10-12 03:34:27 UTC (rev 3680)
+++ SwiftApps/SwiftR/Swift/exec/configure-site-pbs 2010-10-14 13:47:37 UTC (rev 3681)
@@ -1,61 +0,0 @@
-#! /bin/bash
-
-throttlePBS=.31 # FIXME: parameterize thsi and several other variables, below.
-
-cat >tc <<END
-fork bashlocal /bin/bash null null null
-pbscoast bash /bin/bash null null ENV::PATH="$PATH";GLOBUS::maxwalltime="00:10:00"
-END
-
-cat >sites.xml <<END
-<config>
- <pool handle="fork">
- <execution provider="local" url="none" />
- <profile key="jobThrottle" namespace="karajan">0.15</profile>
- <profile namespace="karajan" key="initialScore">10000</profile>
- <filesystem provider="local" url="none" />
- <workdirectory>$(pwd)/swiftwork</workdirectory>
- </pool>
-
- <pool handle="pbsdirect">
- <execution provider="pbs" url="none" />
- <profile namespace="globus" key="queue">fast</profile>
- <profile namespace="globus" key="maxwalltime">00:59:00</profile>
- <profile key="jobThrottle" namespace="karajan">$throttlePBS</profile>
- <profile namespace="karajan" key="initialScore">10000</profile>
- <filesystem provider="local" url="none" />
- <workdirectory>$HOME/swiftwork</workdirectory>
- </pool>
-
- <pool handle="pbscoast">
- <execution provider="coaster" url="none" jobmanager="local:pbs"/>
- <profile namespace="globus" key="queue">short</profile>
- <profile namespace="globus" key="maxTime">12000</profile>
- <profile namespace="globus" key="slots">32</profile>
- <profile namespace="globus" key="nodeGranularity">1</profile>
- <profile namespace="globus" key="maxNodes">1</profile>
- <profile namespace="globus" key="workersPerNode">1</profile>
- <profile namespace="karajan" key="jobThrottle">2.55</profile>
- <profile namespace="karajan" key="initialScore">10000</profile>
-
- <filesystem provider="local" url="none"/>
- <workdirectory>$HOME/swiftwork</workdirectory>
- </pool>
-</config>
-END
-
-# <profile namespace="globus" key="maxWallTime">00:00:01</profile>
-# <profile namespace="globus" key="queue">fast</profile>
-
-cat >cf <<END
-wrapperlog.always.transfer=true
-sitedir.keep=true
-execution.retries=0
-lazy.errors=false
-status.mode=provider
-use.provider.staging=false
-provider.staging.pin.swiftfiles=false
-#throttle.host.submit=1
-END
-
-
Added: SwiftApps/SwiftR/Swift/exec/start-merlot
===================================================================
--- SwiftApps/SwiftR/Swift/exec/start-merlot (rev 0)
+++ SwiftApps/SwiftR/Swift/exec/start-merlot 2010-10-14 13:47:37 UTC (rev 3681)
@@ -0,0 +1,293 @@
+#! /bin/bash
+
+# Define internal functions
+
+export TRAPEVENTS="EXIT 1 2 3 15" # Signals and conditions to trap
+
+get-contact()
+{
+ # Look for:
+ # Passive queue processor initialized. Callback URI is http://140.221.8.62:55379
+
+ for try in $(seq 1 20); do
+ uriline=$(grep "Passive queue processor initialized. Callback URI is" $out 2> /dev/null)
+ if [ "_$uriline" = _ ]; then
+ sleep 1
+ else
+ break;
+ fi
+ done
+
+ if [ "_$uriline" = _ ]; then
+ echo "$0: No passive state message from Swift - exiting."
+ exit 1
+ fi
+
+ CONTACT=$(echo $uriline | sed -e 's/^.*http:/http:/')
+ echo Coaster service contact URI: $CONTACT
+}
+
+function wait-and-start-ssh-workers
+{
+ get-contact
+ LOGDIR=$(pwd)/swiftworkerlogs # full path. FIXME: Generate this with remote-side paths if not shared dir env?
+ LOGDIR=/tmp/$USER/SwiftR/swiftworkerlogs # FIXME: left this in /tmp so it works on any host. Better way?
+
+ # mkdir -p $LOGDIR # is done with the ssh command, below
+
+ IDLETIMEOUT=$((60*60*240)) # 10 days: FIXME: make this a command line arg
+
+ rm -rf remotepid.* # FIXME: should not be needed if we start in a new dir each time
+ for host in $(echo $hosts); do
+ timestamp=$(date "+%Y.%m%d.%H%M%S")
+ random=$(awk "BEGIN {printf \"%0.5d\", $RANDOM}")
+ ID=$timestamp.$random
+ # FIXME: make logging an argument; set false by default
+ # fixme:send worker.pl to remote host via stdin or scp.
+ ssh $host /bin/sh -c \'"mkdir -p $LOGDIR"\'
+ scp $SWIFTBIN/worker.pl $host:$LOGDIR
+ ssh $host '/bin/sh -c '\'"WORKER_LOGGING_ENABLED=true $LOGDIR/worker.pl $CONTACT $ID $LOGDIR $IDLETIMEOUT 2>&1 & echo PID=\$!"\' >remotepid.$host </dev/null &
+ sshpids="$sshpids $!"
+ done
+
+ echo Started workers from ssh processes $sshpids
+ echo $sshpids > $sshpidfile
+ wait
+}
+
+make-pbs-submit-file()
+{
+ if [ $queue != default ]; then
+ queueDirective="#PBS -q $queue"
+ else
+ queueDirective=""
+ fi
+cat >pbs.sub <<END
+#PBS -S /bin/sh
+#PBS -N SwiftR-workers
+#PBS -m n
+#PBS -l nodes=$nodes:ppn=$cores
+#PBS -l walltime=$time
+#PBS -o $HOME/mw/work/pbs.stdout
+#PBS -e $HOME/mw/work/pbs.stderr
+$queueDirective
+WORKER_LOGGING_ENABLED=true # FIXME: parameterize; fix w PBS -v
+HOST=\$(echo $CONTACT | sed -e 's,^http://,,' -e 's/:.*//')
+PORT=\$(echo $CONTACT | sed -e 's,^.*:,,')
+CONTACT=http://localhost:\$PORT
+echo '***' PBS_NODEFILE file is \$PBS_NODEFILE
+cat \$PBS_NODEFILE
+echo '***' unique nodes are:
+sort < \$PBS_NODEFILE|uniq
+for h in \$(sort < \$PBS_NODEFILE|uniq); do
+ ssh \$h "echo Swift R startup running on host; hostname; echo HOST=\$HOST PORT=\$PORT CONTACT=\$CONTACT; cd /; ( ssh -N -L \$PORT:\$HOST:\$PORT \$HOST & sleep 3; /usr/bin/perl $SWIFTBIN/worker.pl \$CONTACT SwiftR-\$h $HOME/.globus/coasters $IDLETIMEOUT ; wait)" &
+done
+
+ontrap()
+{
+ echo in ontrap
+ # Kill our processes on each node; do first node (on which this is running) last
+ for h in \$(sort < \$PBS_NODEFILE|uniq | sort -r); do
+ echo killing processes on host \$h
+ ssh \$h killall -u \$USER
+ done;
+ killall -u \$USER
+}
+
+trap ontrap $TRAPEVENTS
+wait
+
+END
+}
+
+function wait-and-start-pbs-workers
+{
+ get-contact
+ LOGDIR=$(pwd)/swiftworkerlogs # full path. FIXME: Generate this with remote-side paths if not shared dir env?
+ LOGDIR=/tmp/$USER/SwiftR/swiftworkerlogs # FIXME: left this in /tmp so it works on any host. Better way?
+
+ mkdir -p $LOGDIR
+
+ IDLETIMEOUT=$((60*60*240)) # 10 days: FIXME: make this a command line arg
+
+ # FIXME: set up for capturing pbs job id: rm -rf remotepid.* # FIXME: should not be needed if we start in a new dir each time
+ make-pbs-submit-file
+ qsub pbs.sub>$pbsjobidfile
+
+ echo Started workers from PBS job $(cat $pbsjobidfile)
+}
+
+usage()
+{
+ echo >&2 "usage: $0 -c cores -h 'host1 ... hostN' -n nodes -q queue -s server -t throttle"
+ echo >&2 " valid servers: local, ssh, pbs"
+ echo >&2 " defaults: cores=2 nodes=1 queue=none server=local throttle=10"
+}
+
+verify-is-one-of()
+{
+ argname=$1
+ arg=$2
+ shift 2
+ for v in $*; do
+ if [ $arg = $v ]; then
+ return 0
+ fi
+ done
+ echo $0: value for argument $argname was $arg - must be one of: $*
+ usage
+ exit 1
+}
+
+verify-is-numeric()
+{
+ argname=$1; shift
+ if test $1 -ge 0 2>/dev/null; then
+ return 0
+ else
+ echo $0: value for $argname must be a positive integer, was: $1
+ usage
+ exit 1
+ fi
+}
+
+verify-not-null()
+{
+ argname=$1; shift
+ if [ _$1 != _ ]; then
+ return 0
+ else
+ echo $0: value for $argname can not be null
+ usage
+ exit 1
+ fi
+}
+
+# main script
+
+tmp=${SWIFTR_TMP:-/tmp}
+
+# Process command line args
+
+server=local
+time="00:30:00"
+nodes=1
+queue=short
+#throttleOneCore="-0.001" FIXME: Remove
+#throttleOneCore="0.00" FIXME: Remove
+localcores=5
+cores=0
+defaultCores=4
+throttle=10
+hosts=no-hosts-specified
+queue=default
+
+while [ $# -gt 0 ]
+do
+ case "$1" in
+ -c) cores=$2; verify-is-numeric cores $cores; shift ;;
+ -h) hosts=$2; verify-not-null hosts $hosts; shift ;;
+ -n) nodes=$2; verify-is-numeric nodes $nodes; shift ;;
+ -p) throttle=$2; verify-is-numeric throttle $throttle; shift ;;
+ -q) queue=$2; verify-not-null queue $queue; shift ;;
+ -s) server=$2; verify-is-one-of server $server local ssh pbs; shift ;;
+ -t) time=$2; verify-is-not-null time $time; shift ;;
+ *) usage; exit 1 ;;
+ esac
+ shift
+done
+
+echo cores=$cores nodes=$nodes queue=$queue server=$server throttle=$throttle
+
+SWIFTRBIN=$(cd $(dirname $0); pwd)
+SWIFTBIN=$SWIFTRBIN/../swift/bin # This depends on ~/SwiftR/Swift/swift being a symlink to swift in RLibrary/Swift
+
+rundir=$tmp/$USER/SwiftR/swift.$server # rundir prefix # FIXME: handle multiple concurent independent swift servers per user
+mkdir -p $(dirname $rundir)
+trundir=$(mktemp -d $rundir.XXXX) # FIXME: check success
+rm -rf $rundir
+ln -s $trundir $rundir
+cd $rundir
+
+echo Running in $trundir "(linked to $rundir)"
+
+script=$SWIFTRBIN/rserver.swift
+#cp $script $SWIFTRBIN/passive-coaster-swift $SWIFTRBIN/swift.properties $rundir
+cp $script .
+script=$(basename $script)
+cp $SWIFTRBIN/{EvalRBatchPersistent.sh,SwiftRServer.sh} .
+
+# DONE: FIXME: rework this script to transfer all shells and rscripts
+# needed, and to copy in the R prelude for the R server processes (to
+# include for example the OpenMx library) NOTE: Both were done in older version of this script.
+
+# rm -f requestpipe resultpipe
+mkfifo requestpipe resultpipe
+
+out=swift.stdouterr
+touch $out
+
+if [ $server = local ]; then
+
+ source $SWIFTRBIN/configure-server-local $cores
+
+elif [ $server = ssh ]; then
+
+ if [ $cores -eq 0 ]; then
+ cores = $defaultRemoteCores
+ fi
+
+ source $SWIFTRBIN/configure-server-ssh $cores $time
+
+ sshpidfile=${out/stdouterr/workerpids}
+
+ function onexit {
+ coasterservicepid="" # null: saved in case we go back to using coaster servers
+ trap - $TRAPEVENTS
+ sshpids=$(cat $sshpidfile)
+ echo Terminating worker processes $sshpids, starter $starterpid
+ for rpfile in $(ls -1 remotepid.*); do
+ rpid=$(grep PID= $rpfile | sed -e 's/PID=//')
+ rhost=$(echo $rpfile | sed -e 's/remotepid.//')
+ echo Based on $rpfile: terminating process group of process $rpid on $rhost
+ ssh $rhost sh -c \''PGID=$(ps -p '$rpid' -o pgid --no-headers|sed -e "s/ //g"); kill -s TERM -- -$PGID'\'
+ done
+ if [ "_$sshpids$starterpid$coasterservicepid" != _ ]; then
+ echo kill $sshpids $starterpid $coasterservicepid >& /dev/null
+ fi
+ kill 0 # Kill all procs in current process group # FIXME: what was this for????
+ }
+
+ trap onexit $TRAPEVENTS
+
+ wait-and-start-ssh-workers &
+ starterpid=$!
+
+elif [ $server = pbs ]; then
+
+ source $SWIFTRBIN/configure-server-pbs $cores
+
+ pbsjobidfile=${out/stdouterr/pbsjobid}
+
+ function onexit {
+ coasterservicepid="" # null: saved in case we go back to using coaster servers
+ trap - $TRAPEVENTS
+ pbsjobid=$(cat $pbsjobidfile)
+ echo Terminating worker processes starter $starterpid and PBS job $pbsjobid
+ if [ "_$starterpid != _ ]; then
+ kill $starterpid
+ fi
+ if [ "_$pbsjobid != _ ]; then
+ qdel $pbsjobid
+ fi
+ kill 0 # Kill all procs in current process group # FIXME: what was this for????
+ }
+
+ trap onexit $TRAPEVENTS
+
+ wait-and-start-pbs-workers &
+ starterpid=$!
+
+fi
+
+$SWIFTRBIN/../swift/bin/swift -config cf -tc.file tc -sites.file sites.xml $script -pipedir=$(pwd) >& $out </dev/null
Property changes on: SwiftApps/SwiftR/Swift/exec/start-merlot
___________________________________________________________________
Name: svn:executable
+ *
Modified: SwiftApps/SwiftR/Swift/exec/start-swift
===================================================================
--- SwiftApps/SwiftR/Swift/exec/start-swift 2010-10-12 03:34:27 UTC (rev 3680)
+++ SwiftApps/SwiftR/Swift/exec/start-swift 2010-10-14 13:47:37 UTC (rev 3681)
@@ -1,5 +1,7 @@
#! /bin/bash
+export TRAPEVENTS="EXIT 1 2 3 15" # Signals and conditions to trap
+
# Define internal functions
get-contact()
@@ -36,7 +38,7 @@
IDLETIMEOUT=$((60*60*240)) # 10 days: FIXME: make this a command line arg
rm -rf remotepid.* # FIXME: should not be needed if we start in a new dir each time
- for host in $(echo $COMPUTEHOSTS); do
+ for host in $(echo $hosts); do
timestamp=$(date "+%Y.%m%d.%H%M%S")
random=$(awk "BEGIN {printf \"%0.5d\", $RANDOM}")
ID=$timestamp.$random
@@ -54,20 +56,75 @@
make-pbs-submit-file()
{
+ if [ $queue != default ]; then
+ queueDirective="#PBS -q $queue"
+ else
+ queueDirective=""
+ fi
cat >pbs.sub <<END
#PBS -S /bin/sh
#PBS -N SwiftR-workers
#PBS -m n
#PBS -l nodes=$nodes
-#PBS -l walltime=$walltime
-#PBS -q $queue
+#PBS -l walltime=$time
#PBS -o pbs.stdout
#PBS -e pbs.stderr
-WORKER_LOGGING_ENABLED=true
+$queueDirective
+WORKER_LOGGING_ENABLED=true # FIXME: parameterize; fix w PBS -v
cd / && /usr/bin/perl $SWIFTBIN/worker.pl $CONTACT SwiftR-workers $HOME/.globus/coasters $IDLETIMEOUT
END
}
+make-pbsf-submit-file()
+{
+ if [ _$GLOBUS_HOSTNAME = _ ]; then
+ echo GLOBUS_HOSTNAME must be set to worker-reachable address of submit host for pbsf server mode.
+ usage
+ exit 1
+ fi
+ if [ $queue != default ]; then
+ queueDirective="#PBS -q $queue"
+ else
+ queueDirective=""
+ fi
+cat >pbs.sub <<END
+#PBS -S /bin/sh
+#PBS -N SwiftR-workers
+#PBS -m n
+#PBS -l nodes=$nodes:ppn=$cores
+#PBS -l walltime=$time
+#PBS -o $HOME/mw/work/pbs.stdout
+#PBS -e $HOME/mw/work/pbs.stderr
+$queueDirective
+WORKER_LOGGING_ENABLED=true # FIXME: parameterize; fix w PBS -v
+HOST=\$(echo $CONTACT | sed -e 's,^http://,,' -e 's/:.*//')
+PORT=\$(echo $CONTACT | sed -e 's,^.*:,,')
+CONTACT=http://localhost:\$PORT
+echo '***' PBS_NODEFILE file is \$PBS_NODEFILE
+cat \$PBS_NODEFILE
+echo '***' unique nodes are:
+sort < \$PBS_NODEFILE|uniq
+for h in \$(sort < \$PBS_NODEFILE|uniq); do
+ ssh \$h "echo Swift R startup running on host; hostname; echo HOST=\$HOST PORT=\$PORT CONTACT=\$CONTACT; cd /; ( ssh -N -L \$PORT:\$HOST:\$PORT \$HOST & sleep 3; /usr/bin/perl $SWIFTBIN/worker.pl \$CONTACT SwiftR-\$h $HOME/.globus/coasters $IDLETIMEOUT ; wait)" &
+done
+
+ontrap()
+{
+ echo in ontrap
+ # Kill our processes on each node; do first node (on which this is running) last
+ for h in \$(sort < \$PBS_NODEFILE|uniq | sort -r); do
+ echo killing processes on host \$h
+ ssh \$h killall -u \$USER
+ done;
+ killall -u \$USER
+}
+
+trap ontrap $TRAPEVENTS
+wait
+
+END
+}
+
function wait-and-start-pbs-workers
{
get-contact
@@ -79,28 +136,98 @@
IDLETIMEOUT=$((60*60*240)) # 10 days: FIXME: make this a command line arg
# FIXME: set up for capturing pbs job id: rm -rf remotepid.* # FIXME: should not be needed if we start in a new dir each time
- make-pbs-submit-file
+ make-${server}-submit-file
qsub pbs.sub>$pbsjobidfile
echo Started workers from PBS job $(cat $pbsjobidfile)
}
-# main script
+usage()
+{
+ echo >&2 "usage: $0 -c cores -h 'host1 ... hostN' -n nodes -q queue -s server -t throttle"
+ echo >&2 " valid servers: local, ssh, pbs, pbsf (pbs with firewalled workers)"
+ echo >&2 " defaults: cores=2 nodes=1 queue=none server=local throttle=10"
+}
-site=$1 # local, ssh, pbsauto, pbsman ...
+verify-is-one-of()
+{
+ argname=$1
+ arg=$2
+ shift 2
+ for v in $*; do
+ if [ $arg = $v ]; then
+ return 0
+ fi
+ done
+ echo $0: value for argument $argname was $arg - must be one of: $*
+ usage
+ exit 1
+}
-# FIXME: check args and use better arg parsing
+verify-is-numeric()
+{
+ argname=$1; shift
+ if test $1 -ge 0 2>/dev/null; then
+ return 0
+ else
+ echo $0: value for $argname must be a positive integer, was: $1
+ usage
+ exit 1
+ fi
+}
+verify-not-null()
+{
+ argname=$1; shift
+ if [ _$1 != _ ]; then
+ return 0
+ else
+ echo $0: value for $argname can not be null
+ usage
+ exit 1
+ fi
+}
+
+# main script
+
tmp=${SWIFTR_TMP:-/tmp}
-throttleOneCore="-0.001"
-throttleOneCore="0.00"
-localcores=5 # FIXME: parameterize: localthreads=N
+# Process command line args
+server=local
+time="00:30:00"
+nodes=1
+queue=short
+#throttleOneCore="-0.001" FIXME: Remove
+#throttleOneCore="0.00" FIXME: Remove
+localcores=5
+cores=0
+defaultCores=4
+throttle=10
+hosts=no-hosts-specified
+queue=default
+
+while [ $# -gt 0 ]
+do
+ case "$1" in
+ -c) cores=$2; verify-is-numeric cores $cores; shift ;;
+ -h) hosts=$2; verify-not-null hosts $hosts; shift ;;
+ -n) nodes=$2; verify-is-numeric nodes $nodes; shift ;;
+ -p) throttle=$2; verify-is-numeric throttle $throttle; shift ;;
+ -q) queue=$2; verify-not-null queue $queue; shift ;;
+ -s) server=$2; verify-is-one-of server $server local ssh pbs pbsf; shift ;;
+ -t) time=$2; verify-not-null time $time; shift ;;
+ *) usage; exit 1 ;;
+ esac
+ shift
+done
+
+echo cores=$cores nodes=$nodes queue=$queue server=$server throttle=$throttle
+
SWIFTRBIN=$(cd $(dirname $0); pwd)
SWIFTBIN=$SWIFTRBIN/../swift/bin # This depends on ~/SwiftR/Swift/swift being a symlink to swift in RLibrary/Swift
-rundir=$tmp/$USER/SwiftR/swift.$site # rundir prefix # FIXME: handle multiple concurent independent swift servers per user
+rundir=$tmp/$USER/SwiftR/swift.$server # rundir prefix # FIXME: handle multiple concurent independent swift servers per user
mkdir -p $(dirname $rundir)
trundir=$(mktemp -d $rundir.XXXX) # FIXME: check success
rm -rf $rundir
@@ -122,23 +249,28 @@
# rm -f requestpipe resultpipe
mkfifo requestpipe resultpipe
-source $SWIFTRBIN/configure-site-$1
-
out=swift.stdouterr
touch $out
-if [ $site = ssh ]; then
+if [ $server = local ]; then
- shift
- COMPUTEHOSTS=$*
+ source $SWIFTRBIN/configure-server-local $cores
+elif [ $server = ssh ]; then
+
+ if [ $cores -eq 0 ]; then
+ cores = $defaultRemoteCores
+ fi
+
+ source $SWIFTRBIN/configure-server-ssh $cores $time
+
sshpidfile=${out/stdouterr/workerpids}
- TRAPS="EXIT 1 2 3 15" # Signals and conditions to trap
+ TRAPEVENTS="EXIT 1 2 3 15" # Signals and conditions to trap
function onexit {
coasterservicepid="" # null: saved in case we go back to using coaster servers
- trap - $TRAPS
+ trap - $TRAPEVENTS
sshpids=$(cat $sshpidfile)
echo Terminating worker processes $sshpids, starter $starterpid
for rpfile in $(ls -1 remotepid.*); do
@@ -153,26 +285,22 @@
kill 0 # Kill all procs in current process group # FIXME: what was this for????
}
- trap onexit $TRAPS
+ trap onexit $TRAPEVENTS
wait-and-start-ssh-workers &
starterpid=$!
-elif [ $site = pbsman ]; then
+elif [ \( $server = pbs \) -o \( $server = pbsf \) ]; then
- # FIXME: Parameterize:
+ source $SWIFTRBIN/configure-server-pbs $cores
- walltime="01:00:00"
- nodes=1
- queue=short
-
pbsjobidfile=${out/stdouterr/pbsjobid}
- TRAPS="EXIT 1 2 3 15" # Signals and conditions to trap
+ TRAPEVENTS="EXIT 1 2 3 15" # Signals and conditions to trap
function onexit {
coasterservicepid="" # null: saved in case we go back to using coaster servers
- trap - $TRAPS
+ trap - $TRAPEVENTS
pbsjobid=$(cat $pbsjobidfile)
echo Terminating worker processes starter $starterpid and PBS job $pbsjobid
if [ "_$starterpid != _ ]; then
@@ -184,7 +312,7 @@
kill 0 # Kill all procs in current process group # FIXME: what was this for????
}
- trap onexit $TRAPS
+ trap onexit $TRAPEVENTS
wait-and-start-pbs-workers &
starterpid=$!
Modified: SwiftApps/SwiftR/Swift/man/Swift-package.Rd
===================================================================
--- SwiftApps/SwiftR/Swift/man/Swift-package.Rd 2010-10-12 03:34:27 UTC (rev 3680)
+++ SwiftApps/SwiftR/Swift/man/Swift-package.Rd 2010-10-14 13:47:37 UTC (rev 3681)
@@ -39,8 +39,10 @@
options(swift.site=sitename) # sitename = "local" to run on the
current host and "pbs" to submit to a local PBS cluster.
-PREREQUISITES
+}
+\section{PREREQUISITES}{
+
1) Sun Java 1.4 or higher (preferably 1.6) installed and in your PATH
Download the appropriate Java for Linux at:
@@ -78,8 +80,9 @@
In configurations (b) and (c) Swift will launch its own workers, and
then communicate using its own TCP protocol.
+}
-INSTALL
+\section{INSTALLATION}{
mkdir ~/RPackages ~/RLibrary # if not already created
cd ~/RPackages
@@ -87,8 +90,11 @@
R CMS INSTALL -l ~/RLibrary Swift_0.1.tar.gz
export R_LIBS=~/RLibrary
-QUICK START
+export GLOBUS_HOSTNAME=10.0.0.200 # Eg for Merlot: internal address of the login node
+}
+\section{QUICK START}{
+
In a shell (outside of R) start the local Swift server:
$HOME/RLibrary/Swift/exec/swift-start local
@@ -100,7 +106,7 @@
These will produce output similar to:
-\\\\\\VERBATIM
+\preformatted{
> require(Swift)
Loading required package: Swift
@@ -127,13 +133,14 @@
[[1]]
[1] 4505
-
==> test 1.1 passed
>
-////////////// VERBATIM
+}
-CONFIGURE SERVERS
+}
+\section{CONFIGURE SERVERS}{
+
edit configure-site-NAME in exec/
can put local cores into an ssh pool
@@ -143,8 +150,9 @@
access remote systems via ssh
Export SWIFTR_TMP in your environment
+}
-START SERVERS
+\section{START SERVERS}{
# do this outside of R - BEFORE trying to run R Swift functions
@@ -163,15 +171,17 @@
These Swift servers can be started and left running, across R runs
options(swift.server="local") # or "pbsman" or "ssh"
+}
-HELLO WORLD TEST
+\section{HELLO WORLD TEST}{
# Start swift local server as above
require(Swift)
basicSwiftTest()
+}
-RUN FULL TEST
+\section{RUN FULL TEST}{
As a regular user:
@@ -180,9 +190,9 @@
# Then
-n=10 # 1o times through full test loop
+n=10 # 10 times through full test loop
-swiftTestLoop(n)
+testLoop(n)
Testing from the source tree:
@@ -190,10 +200,10 @@
source("Swift/tests/TestSwift.R")
or R CMD TEST etc? FIXME
+}
+\section{STOPPING SWIFT SERVERS}{
-STOPPING SWIFT SERVERS
-
The following ps command is useful for displaying the many background
swift processes. I keep this aliased as "mp" (my processes):
@@ -217,8 +227,10 @@
Occasionally a killall R and/or killall java is required
-USAGE
+}
+\section{USAGE}{
+
Swift returns Error object when remote side fails.
swiftapply( )
@@ -242,22 +254,17 @@
To be developed: swiftSapply, ...
-TESTS AND EXAMPLES
+}
-basicSwiftTest()
+\section{OPENMX EXAMPLES}{
-runAllSwiftTests()
-
-testloop(n)
-
-
-OPENMX EXAMPLES
-
This section is specific to users of the OpenMX R package for
structural equation modeling.
-USING ADDITIONAL PARALLEL ENVIRONMENTS
+}
+\section{USING ADDITIONAL PARALLEL ENVIRONMENTS}{
+
3) ssh configured for password-free login (to run on remote worker nodes)
Ability to ssh to server machines (without password: agents, master
@@ -272,14 +279,16 @@
(document ssh tricks here for pw-less access)
+}
+\section{DIRECTORY STRUCTURE USED FOR SWIFT RUNTIME}{
+tbd
+}
-DIRECTORY STRUCTURE USED FOR SWIFT RUNTIME
+\section{PROCESS STRUCTURE OF SWIFT RUNTIME}{
-PROCEESS STRUCTURE USE FOR SWIFT RUNTIME
+\preformatted{
-vvvv VERBATIM
-
vanquish$ mp
UID PID PPID PGID SID C STIME TTY TIME CMD
wilde 3621 3553 3553 3553 0 19:17 ? 00:00:00 sshd: wilde at pts/1
@@ -299,12 +308,11 @@
wilde 4455 1 3726 3622 0 19:38 pts/1 00:00:00 /usr/lib64/R/bin/exec/R --slave --no-restore --file=./SwiftRServer.sh --ar
wilde 4270 1 3726 3622 0 19:38 pts/1 00:00:00 /usr/lib64/R/bin/exec/R --slave --no-restore --file=./SwiftRServer.sh --ar
wilde 4160 1 3726 3622 0 19:36 pts/1 00:00:00 /usr/lib64/R/bin/exec/R --slave --no-restore --file=./SwiftRServer.sh --ar
-vanquish$
+vanquish$ }
-^^^^^ VERBATIM
+}
+\section{DEBUGGING AND TROUBLESHOOTING}{
-DEBUGGING AND TROUBLESHOOTING
-
* manual mode
* logs to look at
@@ -315,23 +323,22 @@
You should see periodic status update lines such as the following:
-vvvvvvvvv VERBATIM
+\preformatted{
+tbd}
-
-
-
-^^^^^^^^^ VERBATIM
-
* reporting bugs: what to send (FIXME: need swiftsnapshot script)
* setting Swift worker logging with $HOME/.globus/coasters/loglevel
file. This file should contain a single text integer: 0=most detailed,
4=least detaild, 5=off. This is an interim log control mechanism and
may be deprecated in the future.
+}
-CHECKOUT AND BUILD SWIFT R PACKAGE FROM SVN
-WITH COMPILED SWIFT BINARY RELEASE (TRUNK) FROM SVN
+\section{CHECKOUT AND BUILD SWIFT R PACKAGE FROM SVN}{
+(with compiled swift binary release (trunk) from svn)
+
+\preformatted{
cd ~
svn checkout https://svn.ci.uchicago.edu/svn/vdl2/SwiftApps/SwiftR
cd SwiftR/Swift
@@ -340,10 +347,12 @@
wget http://www.ci.uchicago.edu/~wilde/swift.rNNNN.cog.rNNNN.tar.gz
cd ~/SwiftR
-./install.sh # generates a .gz package in ~/public_html/*.gz
+./install.sh # generates a .gz package in ~/public_html/*.gz}
-CAVEATS
+}
+\section{CAVEATS}{
+
Swift requires Sun Java 1.4 or above; preferably 1.6. It will not run
under gcj (GNU Java) although it is getting closer to being able to
and may work - to some extent in in some settings. You need to ensure
@@ -352,8 +361,27 @@
In addition, the environment variable CLASSPATH should not be set.
Variables set in the initialze script must typically be set in global
-environment ( var <<- value );
+environment ( var <<- value ); These conventions may need to be
+revisited. The initialexpr script is passed in the same saved/loaded R
+object as R functions and arguments. Thus if the expr needs to be run
+before these objects can be loaded, then an alternate method of
+initialization needs to be used. (E.g. the original method of passing
+the initvar as a separate file).
+All Swift servers do a \code{require(methods)} call when they
+start. It's not clear if this is universally desired. It was not done by
+default in Rserver but seems to be in interactive R.
+
+Running on systems like Merlot with very restrictive firewalls requires
+that the user first export the env var GLOBUS_HOSTNAME, set to the IP
+address of the network interface by which the worker nodes can connect
+to the login host on which the R client and start-swift commands are
+running.
+
+pbs and pbsf servers try to clean up all worker nodes used with a
+killall -u $USER. This can be made more precise to avoid killing jobs
+on shared worker nodes.
+
The following caveats are high prioiry on the FIXME list:
You MUST start the Swift server before running a swiftapply() call
@@ -385,13 +413,13 @@
\author{
-Swift R package developed by Michael Wilde
-
Swift was developed by: Mihael Hategan, Ben Clifford, Justin Wozniak,
Yong Zhao, Ian Foster, and Michael Wilde with contributions from Sarah
Kenny, Ioan Raicu, Luiz Gadelha, Allan Espinosa, Zhao Zhang, David
Kelly, Jon Monette, Glen Hocky, Tom Uram, Wenjun Wu, and other users.
+Swift R package developed by Michael Wilde and the OpenMx project
+
Maintainer: Michael Wilde <wilde at mcs.anl.gov>
}
More information about the Swift-commit
mailing list