[Swift-commit] r4090 - in SwiftApps/SwiftR/Swift: R exec man

noreply at svn.ci.uchicago.edu noreply at svn.ci.uchicago.edu
Mon Feb 14 15:58:55 CST 2011

Author: tga
Date: 2011-02-14 15:58:55 -0600 (Mon, 14 Feb 2011)
New Revision: 4090

Adding in a backend for cobalt.  

--THIS line, and those below, will be ignored--

M    Swift/R/Workers.R
A    Swift/exec/configure-server-cobalt
M    Swift/exec/start-swift
M    Swift/exec/configure-server-pbs
M    Swift/man/swiftInit.Rd

Modified: SwiftApps/SwiftR/Swift/R/Workers.R
--- SwiftApps/SwiftR/Swift/R/Workers.R	2011-02-14 18:42:48 UTC (rev 4089)
+++ SwiftApps/SwiftR/Swift/R/Workers.R	2011-02-14 21:58:55 UTC (rev 4090)
@@ -2,7 +2,7 @@
 swiftInit <- function( cores=NULL, server=NULL, 
                     hosts=NULL, nodes=NULL, project=NULL, 
-                    parEnv=NULL, workmode=NULL,
+                    parEnv=NULL, kernel=NULL, workmode=NULL,
                     throttle=NULL, queue=NULL,
                     rcmd=NULL, time=NULL,
@@ -87,6 +87,13 @@
     if(! is.null(parEnv) )  {
         cmdString <- paste(cmdString, "-e", shQuote(parEnv)) 
+    if(is.null(kernel))
+        kernel <- getOption("swift.kernel")
+    if(! is.null(kernel) )  {
+        cmdString <- paste(cmdString, "-kernel", shQuote(kernel)) 
+    }
         workmode <- getOption("swift.workmode")

Added: SwiftApps/SwiftR/Swift/exec/configure-server-cobalt
--- SwiftApps/SwiftR/Swift/exec/configure-server-cobalt	                        (rev 0)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-cobalt	2011-02-14 21:58:55 UTC (rev 4090)
@@ -0,0 +1,50 @@
+#! /bin/bash
+# configuration for cobalt with manually-started Swift workers (passive coasters)
+throttle=5.0 # allow approximately 500 concurrent jobs
+cat >tc <<END
+fork      bashlocal /bin/bash null null GLOBUS::maxwalltime="00:00:10"
+cobalt       bash      /bin/bash null null ENV::PATH="$PATH";GLOBUS::maxwalltime="00:01:00"
+cat >sites.xml <<END
+  <pool handle="fork">
+    <execution provider="local" url="none" />
+    <profile key="jobThrottle" namespace="karajan">0.15</profile>
+    <profile namespace="karajan" key="initialScore">10000</profile>
+    <filesystem provider="local" url="none" />
+    <workdirectory>$(pwd)/swiftwork</workdirectory>
+  </pool>
+  <pool handle="cobalt">
+    <execution provider="coaster" url="none" jobmanager="local:NA"/>
+    <profile namespace="globus" key="workerManager">passive</profile>
+    <profile namespace="globus" key="workersPerNode">$cores</profile>
+    <profile namespace="karajan" key="jobThrottle">$throttle</profile>
+    <profile namespace="karajan" key="initialScore">10000</profile>
+    <filesystem provider="local" url="none"/>
+    <workdirectory>$HOME/swiftwork</workdirectory>
+  </pool>
+cat >cf <<END

Modified: SwiftApps/SwiftR/Swift/exec/configure-server-pbs
--- SwiftApps/SwiftR/Swift/exec/configure-server-pbs	2011-02-14 18:42:48 UTC (rev 4089)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-pbs	2011-02-14 21:58:55 UTC (rev 4090)
@@ -39,7 +39,7 @@
 cat >cf <<END

Modified: SwiftApps/SwiftR/Swift/exec/start-swift
--- SwiftApps/SwiftR/Swift/exec/start-swift	2011-02-14 18:42:48 UTC (rev 4089)
+++ SwiftApps/SwiftR/Swift/exec/start-swift	2011-02-14 21:58:55 UTC (rev 4090)
@@ -1,6 +1,6 @@
 #! /bin/bash
-#set -x
+set -x
 export TRAPEVENTS="EXIT 1 2 3 15"  # Signals and conditions to trap
@@ -77,8 +77,8 @@
   echo $sshpids > $sshpidfile
 # FIXME: does PBS need same workers-per-node logic as SGE?
   if [ $queue != NONE ]; then
@@ -254,15 +254,14 @@
   IDLETIMEOUT=$((60*60*240)) # 10 days: FIXME: make this a command line arg
   # FIXME: set up for capturing batch job id: rm -rf remotepid.* # FIXME: should not be needed if we start in a new dir each time
-  make-${server}-submit-file
+  if [ "$server" != "cobalt" ]; then
+    make-${server}-submit-file
+  fi
   #FIXME: doesn't work for SGE on IBI cluster as there is additional text
   # returned by qsub
-  if [ "${server}" != "sge" ]
+  if [ "${server}" = "sge" ]
-    qsub batch.sub >$jobidfile
-    succ=$?
-  else
     # Sun Grid Engine inconveniently returns a bunch of text surrounding
     # the job id.  There is no documented way to obtain the job number
     # directly from qsub.  We will parse out the first number in this text
@@ -270,6 +269,38 @@
     # this was tested on).
     qsub batch.sub | sed 's/[^0-9 ]//g' | awk '{ print $1 }' > $jobidfile
+  elif [ "${server}" = "cobalt" ]
+  then
+    # cobalt arguments are specified through command line rather than
+    # through a submit file
+    #cobalt qsub statement with relevant parameters
+    # queue name
+    if [ "$queue" != NONE ]; then
+        queueDirective="-q $queue"
+    else
+        queueDirective=""
+    fi
+    if [ "$kernel" != NONE ]; then
+        kernDirective="--kernel $kernel"
+    else
+        kernDirective=""
+    fi
+    if [ "$project" != NONE ]; then
+        projDirective="-A $project"
+    else
+        projDirective=""
+    fi
+    # for now, rely on swift worker to fork off worker processes 
+    # so we have multiple workers per node.  In future could
+    # add support for virtual node mode, etc
+    qsub -t "$time" -n $nodes $queueDirective $kernDirective \
+	$projDirective -O SwiftR-workers \
+	--env WORKER_LOGGING_LEVEL=$workerLogging \
+	 /usr/bin/perl $SWIFTBIN/worker.pl $CONTACT \
+         $HOME/.globus/coasters $IDLETIMEOUT
+  else
+    qsub batch.sub >$jobidfile
+    succ=$?
   if [ $? -eq 0 ]
@@ -336,13 +367,13 @@
    -c cores    2,4,5       >= 1 (default is: local 2; ssh 4; cluster 8)
    -e parEnv               site specific, SGE only
    -h hosts    1           list of hosts, quoted as one argument, space separated
-   -m workmode node        node: start one worker for all slots on a node; slot (one worker on each slot)
+   -m workmode node        node: start one worker for all slots on a node; slot (one worker on each slot) (Currently ignored)
    -n nodes    1
    -p throttle 10          >= 1
-   -q queue                site speific (PBS, SGE)
+   -q queue                site specific (PBS, SGE, Cobalt)
    -r rcmd     ssh         site specific, SGE only, typically ssh. qrsh for siraf cluster
-   -s server   local       local, pbs, sge, ssh, pbsf (for firewalled worker nodes)
-   -t time     00:30:00    hh:mm:ss, for PBS and SGE only
+   -s server   local       local, pbs, sge, ssh, pbsf (for firewalled worker nodes), cobalt
+   -t time     00:30:00    hh:mm:ss, for PBS, Cobalt and SGE only
    -w wkloglvl NONE        NONE, ERROR, WARN, INFO, DEBUG, TRACE
    -k keepdir              No argument, if flag is set, will keep working directory
@@ -378,6 +409,7 @@
@@ -391,13 +423,14 @@
     -A) project=$2; verify-not-null project $project; shift ;;
     -c) cores=$2; verify-is-numeric cores $cores; shift ;;
     -e) parEnv=$2; verify-not-null parEnv $parEnv; shift ;; 
+    -kernel) kernel=$2; verify-not-null kernel $kernel; shift ;; 
     -h) hosts=$2; verify-not-null hosts $hosts; shift ;; 
     -m) workmode=$2; verify-is-one-of workmode $workmode slot node; shift ;; 
     -n) nodes=$2; verify-is-numeric nodes $nodes; shift ;;
     -p) throttle=$2; verify-is-numeric throttle $throttle; shift ;;
     -q) queue=$2; verify-not-null queue $queue; shift ;;
     -r) rcmd=$2; verify-is-one-of rcmd $rcmd ssh qrsh; shift ;;
-    -s) server=$2; verify-is-one-of server $server local ssh pbs pbsf sge; shift ;;
+    -s) server=$2; verify-is-one-of server $server local ssh pbs pbsf sge cobalt; shift ;;
     -t) time=$2; verify-not-null time $time; shift ;;
     -w) workerLogging=$2; verify-is-one-of workerLoggingLevel $workerLogging NONE ERROR WARN INFO DEBUG TRACE; shift ;;
     -k) keepdir=TRUE ;;
@@ -541,7 +574,8 @@
   wait-and-start-ssh-workers &
-elif [ \( $server = pbs \) -o \( $server = pbsf \) -o \( $server = sge \) ]; then
+elif [ \( $server = pbs \) -o \( $server = pbsf \) -o \( $server = sge \) \
+        -o \( $server = cobalt \) ]; then
   if [ $cores -eq 0 ]; then

Modified: SwiftApps/SwiftR/Swift/man/swiftInit.Rd
--- SwiftApps/SwiftR/Swift/man/swiftInit.Rd	2011-02-14 18:42:48 UTC (rev 4089)
+++ SwiftApps/SwiftR/Swift/man/swiftInit.Rd	2011-02-14 21:58:55 UTC (rev 4090)
@@ -26,7 +26,7 @@
     The number of cores per host.  The default values vary from 2 to 8 depending on the server type.
-    One of: "local", "ssh", "pbs", "sge", "pbsf".
+    One of: "local", "ssh", "pbs", "sge", "pbsf", "cobalt".
     How Swift will run the jobs: for example, if "local" is chosen, they
     will be run on the local machine, or if "pbs" is chosen, they will be
     run through the pbs scheduler.  
@@ -40,10 +40,11 @@
   The number of nodes to request from the batch scheduler.  This only
-  is used if server is "pbs", "sge" or "pbsf".
+  is used if server is "pbs", "sge", "pbsf" or "cobalt".
-  The project name passed to the PBS or SGE batch scheduler.  Site-specific.
+  The project name passed to the PBS, Cobalt or SGE batch scheduler.  
+	Site-specific.
   SGE only.  This is the parallel environment setting passed to the
@@ -51,6 +52,9 @@
     multi-node jobs with SwiftR on Sun Grid Engine sites.  The "mpi"
     environment is often a suitable choice.
+  \item{kernel} {
+  Cobalt only - the kernel to run on the compute nodes.
+  }
   Can be "node" or "slot".
   If "node", one worker is started for all slots on a node.  

