[Swift-commit] r4090 - in SwiftApps/SwiftR/Swift: R exec man
noreply at svn.ci.uchicago.edu
Mon Feb 14 15:58:55 CST 2011
Author: tga
Date: 2011-02-14 15:58:55 -0600 (Mon, 14 Feb 2011)
New Revision: 4090
Added:
SwiftApps/SwiftR/Swift/exec/configure-server-cobalt
Modified:
SwiftApps/SwiftR/Swift/R/Workers.R
SwiftApps/SwiftR/Swift/exec/configure-server-pbs
SwiftApps/SwiftR/Swift/exec/start-swift
SwiftApps/SwiftR/Swift/man/swiftInit.Rd
Log:
Adding a backend for Cobalt.
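For context, a minimal sketch of how the new backend might be driven from R, assuming the package is loaded as library(Swift); the kernel, project, and queue values are illustrative, not defaults:

    library(Swift)
    # Start Swift workers through Cobalt: 4 nodes, 8 cores per node,
    # 30-minute walltime; kernel/project/queue are site-specific examples.
    swiftInit(server = "cobalt", nodes = 4, cores = 8, time = "00:30:00",
              queue = "default", project = "MyProject", kernel = "ZeptoOS")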
Modified: SwiftApps/SwiftR/Swift/R/Workers.R
===================================================================
--- SwiftApps/SwiftR/Swift/R/Workers.R 2011-02-14 18:42:48 UTC (rev 4089)
+++ SwiftApps/SwiftR/Swift/R/Workers.R 2011-02-14 21:58:55 UTC (rev 4090)
@@ -2,7 +2,7 @@
swiftInit <- function( cores=NULL, server=NULL,
hosts=NULL, nodes=NULL, project=NULL,
- parEnv=NULL, workmode=NULL,
+ parEnv=NULL, kernel=NULL, workmode=NULL,
throttle=NULL, queue=NULL,
rcmd=NULL, time=NULL,
workerLogging=NULL,keepworkdir=NULL)
@@ -87,6 +87,13 @@
if(! is.null(parEnv) ) {
cmdString <- paste(cmdString, "-e", shQuote(parEnv))
}
+
+ if(is.null(kernel))
+ kernel <- getOption("swift.kernel")
+
+ if(! is.null(kernel) ) {
+ cmdString <- paste(cmdString, "-kernel", shQuote(kernel))
+ }
if(is.null(workmode))
workmode <- getOption("swift.workmode")
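The kernel argument follows the same option-fallback pattern as the other swiftInit parameters: when it is not supplied, getOption("swift.kernel") is consulted, so a session-wide default can be set once. A small sketch (the kernel name is an assumed example, not a default):

    options(swift.kernel = "ZeptoOS")
    swiftInit(server = "cobalt", nodes = 2)   # picks up swift.kernel automatically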
Added: SwiftApps/SwiftR/Swift/exec/configure-server-cobalt
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-cobalt (rev 0)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-cobalt 2011-02-14 21:58:55 UTC (rev 4090)
@@ -0,0 +1,50 @@
+#! /bin/bash
+
+# configuration for cobalt with manually-started Swift workers (passive coasters)
+
+cores=$1
+
+throttle=5.0 # allow approximately 500 concurrent jobs
+
+cat >tc <<END
+fork bashlocal /bin/bash null null GLOBUS::maxwalltime="00:00:10"
+cobalt bash /bin/bash null null ENV::PATH="$PATH";GLOBUS::maxwalltime="00:01:00"
+END
+
+
+cat >sites.xml <<END
+<config>
+
+ <pool handle="fork">
+ <execution provider="local" url="none" />
+ <profile key="jobThrottle" namespace="karajan">0.15</profile>
+ <profile namespace="karajan" key="initialScore">10000</profile>
+ <filesystem provider="local" url="none" />
+ <workdirectory>$(pwd)/swiftwork</workdirectory>
+ </pool>
+
+ <pool handle="cobalt">
+ <execution provider="coaster" url="none" jobmanager="local:NA"/>
+ <profile namespace="globus" key="workerManager">passive</profile>
+ <profile namespace="globus" key="workersPerNode">$cores</profile>
+ <profile namespace="karajan" key="jobThrottle">$throttle</profile>
+ <profile namespace="karajan" key="initialScore">10000</profile>
+ <filesystem provider="local" url="none"/>
+ <workdirectory>$HOME/swiftwork</workdirectory>
+ </pool>
+
+</config>
+END
+
+cat >cf <<END
+wrapperlog.always.transfer=false
+sitedir.keep=true
+execution.retries=0
+lazy.errors=false
+status.mode=provider
+use.provider.staging=false
+provider.staging.pin.swiftfiles=false
+#throttle.host.submit=1
+END
+
+
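(For reference, Swift's karajan jobThrottle t allows roughly 100*t + 1 concurrent jobs per site, so the 5.0 above gives about 100*5.0 + 1 = 501 jobs on the cobalt pool, and the 0.15 on the fork pool about 16; this reflects the usual Swift scheduler behaviour and is worth confirming against the Swift release in use.)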
Modified: SwiftApps/SwiftR/Swift/exec/configure-server-pbs
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-pbs 2011-02-14 18:42:48 UTC (rev 4089)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-pbs 2011-02-14 21:58:55 UTC (rev 4090)
@@ -39,7 +39,7 @@
END
cat >cf <<END
-wrapperlog.always.transfer=true
+wrapperlog.always.transfer=false
sitedir.keep=true
execution.retries=0
lazy.errors=false
Modified: SwiftApps/SwiftR/Swift/exec/start-swift
===================================================================
--- SwiftApps/SwiftR/Swift/exec/start-swift 2011-02-14 18:42:48 UTC (rev 4089)
+++ SwiftApps/SwiftR/Swift/exec/start-swift 2011-02-14 21:58:55 UTC (rev 4090)
@@ -1,6 +1,6 @@
#! /bin/bash
-#set -x
+set -x
export TRAPEVENTS="EXIT 1 2 3 15" # Signals and conditions to trap
@@ -77,8 +77,8 @@
echo $sshpids > $sshpidfile
}
+
# FIXME: does PBS need same workers-per-node logic as SGE?
-
make-pbs-submit-file()
{
if [ $queue != NONE ]; then
@@ -254,15 +254,14 @@
IDLETIMEOUT=$((60*60*240)) # 10 days: FIXME: make this a command line arg
# FIXME: set up for capturing batch job id: rm -rf remotepid.* # FIXME: should not be needed if we start in a new dir each time
- make-${server}-submit-file
+ if [ "$server" != "cobalt" ]; then
+ make-${server}-submit-file
+ fi
#FIXME: doesn't work for SGE on IBI cluster as there is additional text
# returned by qsub
- if [ "${server}" != "sge" ]
+ if [ "${server}" = "sge" ]
then
- qsub batch.sub >$jobidfile
- succ=$?
- else
# Sun Grid Engine inconveniently returns a bunch of text surrounding
# the job id. There is no documented way to obtain the job number
# directly from qsub. We will parse out the first number in this text
@@ -270,6 +269,38 @@
# this was tested on).
qsub batch.sub | sed 's/[^0-9 ]//g' | awk '{ print $1 }' > $jobidfile
succ=$?
+ elif [ "${server}" = "cobalt" ]
+ then
+ # cobalt arguments are specified through the command line rather than
+ # through a submit file
+ # cobalt qsub statement with relevant parameters
+ # queue name
+ if [ "$queue" != NONE ]; then
+ queueDirective="-q $queue"
+ else
+ queueDirective=""
+ fi
+ if [ "$kernel" != NONE ]; then
+ kernDirective="--kernel $kernel"
+ else
+ kernDirective=""
+ fi
+ if [ "$project" != NONE ]; then
+ projDirective="-A $project"
+ else
+ projDirective=""
+ fi
+ # for now, rely on swift worker to fork off worker processes
+ # so we have multiple workers per node. In future could
+ # add support for virtual node mode, etc
+ qsub -t "$time" -n $nodes $queueDirective $kernDirective \
+ $projDirective -O SwiftR-workers \
+ --env WORKER_LOGGING_LEVEL=$workerLogging \
+ /usr/bin/perl $SWIFTBIN/worker.pl $CONTACT \
+ $HOME/.globus/coasters $IDLETIMEOUT
+ else
+ qsub batch.sub >$jobidfile
+ succ=$?
fi
if [ $? -eq 0 ]
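For illustration, with example values nodes=4, time=00:30:00, queue=default, kernel=ZeptoOS, project=MyProject, and the default worker logging level, the cobalt branch above boils down to roughly:

    qsub -t 00:30:00 -n 4 -q default --kernel ZeptoOS -A MyProject \
        -O SwiftR-workers --env WORKER_LOGGING_LEVEL=ERROR \
        /usr/bin/perl $SWIFTBIN/worker.pl $CONTACT \
        $HOME/.globus/coasters $IDLETIMEOUT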
@@ -336,13 +367,13 @@
-c cores 2,4,5 >= 1 (default is: local 2; ssh 4; cluster 8)
-e parEnv site specific, SGE only
-h hosts 1 list of hosts, quoted as one argument, space separated
- -m workmode node node: start one worker for all slots on a node; slot (one worker on each slot)
+ -m workmode node node: start one worker for all slots on a node; slot (one worker on each slot) (Currently ignored)
-n nodes 1
-p throttle 10 >= 1
- -q queue site speific (PBS, SGE)
+ -q queue site specific (PBS, SGE, Cobalt)
-r rcmd ssh site specific, SGE only, typically ssh. qrsh for siraf cluster
- -s server local local, pbs, sge, ssh, pbsf (for firewalled worker nodes)
- -t time 00:30:00 hh:mm:ss, for PBS and SGE only
+ -s server local local, pbs, sge, ssh, pbsf (for firewalled worker nodes), cobalt
+ -t time 00:30:00 hh:mm:ss, for PBS, Cobalt and SGE only
-w wkloglvl NONE NONE, ERROR, WARN, INFO, DEBUG, TRACE
-k keepdir No argument, if flag is set, will keep working directory
@@ -378,6 +409,7 @@
queue=NONE
project=NONE
parEnv=NONE
+kernel=NONE
workdir=NONE
workerLogging=ERROR
keepdir=FALSE
@@ -391,13 +423,14 @@
-A) project=$2; verify-not-null project $project; shift ;;
-c) cores=$2; verify-is-numeric cores $cores; shift ;;
-e) parEnv=$2; verify-not-null parEnv $parEnv; shift ;;
+ -kernel) kernel=$2; verify-not-null kernel $kernel; shift ;;
-h) hosts=$2; verify-not-null hosts $hosts; shift ;;
-m) workmode=$2; verify-is-one-of workmode $workmode slot node; shift ;;
-n) nodes=$2; verify-is-numeric nodes $nodes; shift ;;
-p) throttle=$2; verify-is-numeric throttle $throttle; shift ;;
-q) queue=$2; verify-not-null queue $queue; shift ;;
-r) rcmd=$2; verify-is-one-of rcmd $rcmd ssh qrsh; shift ;;
- -s) server=$2; verify-is-one-of server $server local ssh pbs pbsf sge; shift ;;
+ -s) server=$2; verify-is-one-of server $server local ssh pbs pbsf sge cobalt; shift ;;
-t) time=$2; verify-not-null time $time; shift ;;
-w) workerLogging=$2; verify-is-one-of workerLoggingLevel $workerLogging NONE ERROR WARN INFO DEBUG TRACE; shift ;;
-k) keepdir=TRUE ;;
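Putting the new flag together with the existing ones, a start-swift invocation on a Cobalt-managed machine might look like this (queue, kernel, and project values are illustrative):

    ./start-swift -s cobalt -n 4 -c 8 -t 00:30:00 \
        -q default -kernel ZeptoOS -A MyProject -w INFO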
@@ -541,7 +574,8 @@
wait-and-start-ssh-workers &
starterpid=$!
-elif [ \( $server = pbs \) -o \( $server = pbsf \) -o \( $server = sge \) ]; then
+elif [ \( $server = pbs \) -o \( $server = pbsf \) -o \( $server = sge \) \
+ -o \( $server = cobalt \) ]; then
if [ $cores -eq 0 ]; then
cores=$defaultClusterCores
Modified: SwiftApps/SwiftR/Swift/man/swiftInit.Rd
===================================================================
--- SwiftApps/SwiftR/Swift/man/swiftInit.Rd 2011-02-14 18:42:48 UTC (rev 4089)
+++ SwiftApps/SwiftR/Swift/man/swiftInit.Rd 2011-02-14 21:58:55 UTC (rev 4090)
@@ -26,7 +26,7 @@
The number of cores per host. The default values vary from 2 to 8 depending on the server type.
}
\item{server}{
- One of: "local", "ssh", "pbs", "sge", "pbsf".
+ One of: "local", "ssh", "pbs", "sge", "pbsf", "cobalt".
How Swift will run the jobs: for example, if "local" is chosen, they
will be run on the local machine, or if "pbs" is chosen, they will be
run through the pbs scheduler.
@@ -40,10 +40,11 @@
}
\item{nodes}{
The number of nodes to request from the batch scheduler. This only
- is used if server is "pbs", "sge" or "pbsf".
+ is used if server is "pbs", "sge", "pbsf", or "cobalt".
}
\item{project}{
- The project name passed to the PBS or SGE batch scheduler. Site-specific.
+ The project name passed to the PBS, Cobalt or SGE batch scheduler.
+ Site-specific.
}
\item{parEnv}{
SGE only. This is the parallel environment setting passed to the
@@ -51,6 +52,9 @@
multi-node jobs with SwiftR on Sun Grid Engine sites. The "mpi"
environment is often a suitable choice.
}
+ \item{kernel}{
+ Cobalt only: the kernel to run on the compute nodes.
+ }
\item{workmode}{
Can be "node" or "slot".
If "node", one worker is started for all slots on a node.