[Swift-commit] r5197 - in SwiftApps/SwiftR/Swift: exec man

tga at ci.uchicago.edu
Thu Sep 29 19:16:08 CDT 2011


Author: tga
Date: 2011-09-29 19:16:08 -0500 (Thu, 29 Sep 2011)
New Revision: 5197

Added:
   SwiftApps/SwiftR/Swift/exec/configure-server-crayxtauto
Modified:
   SwiftApps/SwiftR/Swift/exec/configure-server-pbsauto
   SwiftApps/SwiftR/Swift/exec/start-swift
   SwiftApps/SwiftR/Swift/man/swiftInit.Rd
Log:
Added crayxtauto server

Added: SwiftApps/SwiftR/Swift/exec/configure-server-crayxtauto
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-crayxtauto	                        (rev 0)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-crayxtauto	2011-09-30 00:16:08 UTC (rev 5197)
@@ -0,0 +1,87 @@
+#! /usr/bin/env bash
+
+# TODO: is this the best way to do this?
+
+# We assume that $time_secs specifies the length of the jobs that should be
+# submitted to PBS.  We don't really know the expected duration of the R
+# jobs, so we come up with a sensible value based on $time_secs.
+#
+# Swift kills jobs that run for double maxwalltime, so maxwalltime should be
+# > 0.5 * $time_secs.  Ideally we want to run multiple jobs in each batch
+# allocation, so we don't want coasters to mistakenly decide that a job
+# cannot be scheduled when most of the batch allocation is still left.  So
+# we really want maxwalltime to be just over 0.5 * $time_secs, once both are
+# rounded to minutes.
+maxwalltime=$(( ($time_secs + 120) / 120 ))
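+# e.g. time_secs=3600 (a one-hour -t value) gives
+# maxwalltime = (3600 + 120) / 120 = 31 minutes, just over half of the
+# 60-minute allocation.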
+
+
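+# Generate the Swift transformation catalog (tc): site handle, transformation
+# name, executable path, and profile entries for the "pbscoast" pool below.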
+cat >tc <<END
+pbscoast  bash      /bin/bash null null ENV::PATH="$PATH";GLOBUS::maxwalltime="$maxwalltime"
+END
+
+cat >sites.xml <<END
+<config>
+
+
+  <pool handle="pbscoast">
+    <execution provider="coaster" url="none" jobmanager="local:pbs"/>
+    <filesystem provider="local" url="none"/>
+    <profile namespace="env" key="R_LIBS_USER">$R_LIBS_USER</profile>
+    <profile namespace="globus" key="providerAttributes">pbs.aprun;pbs.mpp;depth=$cores</profile>
+    
+    <profile namespace="karajan" key="jobThrottle">$throttle</profile>
+    <profile namespace="karajan" key="initialScore">10000</profile>
+
+    <!-- max number of cores in total -->
+    <profile namespace="globus" key="slots">$nodes</profile>
+    <profile namespace="globus" key="workersPerNode">$cores</profile>
+    <profile namespace="globus" key="ppn">$cores:cray:pack</profile>
+    
+    <!-- these settings control the size of the request blocks
+        put through the batch system -->
+    <profile namespace="globus" key="maxNodes">1</profile>
+    <profile namespace="globus" key="nodeGranularity">1</profile>
+    <profile namespace="globus" key="lowOverallocation">2</profile>
+    <profile namespace="globus" key="highOverallocation">2</profile>
+    <profile namespace="env" key="SWIFTR_TMP">/dev/shm/$USER/</profile>
+    <profile namespace="env" key="TMPDIR">/dev/shm</profile>
+    <workdirectory>/dev/shm/$USER/swiftwork</workdirectory>
+    <scratch>/dev/shm/$USER/swiftscratch</scratch>
+
+END
+if [ "$project" != NONE]; then
+    cat >> sites.xml <<END
+    <profile namespace="globus" key="project">$project</profile>
+END
+fi 
+if [ "$queue" != NONE ]; then
+    cat >> sites.xml <<END
+    <profile namespace="globus" key="queue">$queue</profile>
+END
+fi
+
+if [ "$time" != NONE ]; then
+    # Hack: add 60 seconds to the time so that coasters are convinced to
+    # request blocks for the full time.
+    cat >> sites.xml <<END
+    <profile namespace="globus" key="maxTime">$((time_secs + 60))</profile>
+END
+fi
+
+cat >> sites.xml <<END
+  </pool>
+</config>
+END
+
+cat >cf <<END
+wrapperlog.always.transfer=false
+sitedir.keep=false
+execution.retries=$num_retries
+lazy.errors=false
+status.mode=provider
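+# Provider staging ships job input/output files through the coaster service
+# itself rather than relying on a shared filesystem; pinning keeps Swift's
+# own wrapper files cached on the workers.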
+use.provider.staging=true
+provider.staging.pin.swiftfiles=true
+#throttle.host.submit=1
+END
+

Modified: SwiftApps/SwiftR/Swift/exec/configure-server-pbsauto
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-pbsauto	2011-09-29 20:05:16 UTC (rev 5196)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-pbsauto	2011-09-30 00:16:08 UTC (rev 5197)
@@ -49,7 +49,6 @@
 if [ "$queue" != NONE ]; then
     #TODO: error handling
     # assume time in H:M:S format
-    t
     cat >> sites.xml <<END
     <profile namespace="globus" key="queue">$queue</profile>
 END

Modified: SwiftApps/SwiftR/Swift/exec/start-swift
===================================================================
--- SwiftApps/SwiftR/Swift/exec/start-swift	2011-09-29 20:05:16 UTC (rev 5196)
+++ SwiftApps/SwiftR/Swift/exec/start-swift	2011-09-30 00:16:08 UTC (rev 5197)
@@ -88,7 +88,34 @@
   fi
 }
 
+function setup-crayxt-lustre {
+    # Set up working directories on lustre file system
+    #FIXME: beagle-specific code
+    LUSTRE_TMPROOT=/lustre/beagle/$USER/swiftRtmp
+    if ! mkdir -p $LUSTRE_TMPROOT; then
+        echo "Could not create temporary directory $LUSTRE_TMPROOT"
+        stdcleanup_start
+        stdcleanup_end
+        exit 1
+    fi
 
+
+    while true
+    do
+        LUSTRE_TMPSESSION=$LUSTRE_TMPROOT/$RANDOM
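+        # mkdir (without -p) fails if the directory already exists, so we
+        # retry with fresh $RANDOM names until we claim an unused one
+        # (an atomic test-and-create)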
+        if mkdir $LUSTRE_TMPSESSION; then
+            echo "Temporary files will be stored in $LUSTRE_TMPSESSION" 1>&2
+            break
+        fi
+    done
+    # Cray XT cluster nodes don't have local writable tmp storage
+    export LUSTRE_TMP=$LUSTRE_TMPSESSION
+    export LUSTRE_RTMP=$LUSTRE_TMPSESSION/Rtmp
+    mkdir -p $LUSTRE_RTMP
+}
+
 make-pbs-submit-file()
 {
   SUBMIT_FILE=$1
@@ -512,7 +539,7 @@
    -r rcmd     ssh         site specific, SGE only, typically ssh. 
                                     qrsh for siraf cluster
    -s server   local       local, pbs, sge, ssh, pbsf (for firewalled workers)
-                            ,cobalt,crayxt,custom, pbsauto
+                            , cobalt, crayxt, custom, pbsauto, crayxtauto
    -t time     00:30:00    hh:mm:ss, for PBS, Cobalt and SGE only
    -w wkloglvl NONE        NONE, ERROR, WARN, INFO, DEBUG, TRACE
    -k keepdir              No argument, if flag is set, will keep working 
@@ -577,7 +604,7 @@
     -p) throttle=$2; verify-is-numeric throttle $throttle; shift ;;
     -q) queue=$2; verify-not-null queue $queue; shift ;;
     -r) rcmd=$2; verify-is-one-of rcmd $rcmd ssh qrsh; shift ;;
-    -s) server=$2; verify-is-one-of server $server local ssh pbs pbsf sge cobalt crayxt custom pbsauto; shift ;;
+    -s) server=$2; verify-is-one-of server $server local ssh pbs pbsf sge cobalt crayxt custom pbsauto crayxtauto; shift ;;
     -t) time=$2; verify-is-time time $time; shift ;;
     -w) workerLogging=$2; verify-is-one-of workerLoggingLevel $workerLogging NONE ERROR WARN INFO DEBUG TRACE; shift ;;
     -L) swiftLoggingFlag="" ;; # swift default is lots of logging
@@ -699,6 +726,12 @@
 # Function to run on termination of swift
 exitcmd=""
 
+# Set up working directories on crayxt
+if [ $server = crayxt -o $server = crayxtauto ] ; then
+    setup-crayxt-lustre
+fi
+
+
 if [ $server = custom ]; then
     warmupjob=false
     # have already set up tc.data and sites.xml files, just set
@@ -775,14 +808,14 @@
   wait-and-start-ssh-workers &
   starterpid=$!
 
-elif [ \( $server = pbsauto \) ]; then
+elif [ \( $server = pbsauto \) -o \( $server = crayxtauto \) ]; then
     warmupjob=false
     # Systems where Swift manages workers
   if [ $cores -le 0 ]; then
     cores=$defaultClusterCores
   fi
   echo server=$server project=$project cores=$cores nodes=$nodes queue=$queue 
-  source $SWIFTRBIN/configure-server-pbsauto
+  source $SWIFTRBIN/configure-server-$server
 
   function onexit {
     stdcleanup_start
@@ -811,30 +844,6 @@
 
   DIRS_TO_DELETE=
   if [ $server = crayxt ]; then
-    #FIXME: beagle-specific code
-    LUSTRE_TMPROOT=/lustre/beagle/$USER/swiftRtmp
-    if mkdir -p $LUSTRE_TMPROOT; then
-        :
-    else 
-        echo "Could not create temporary directory $LUSTRE_TMPROOT"
-        stdcleanup_start
-        stdcleanup_end
-        exit 1
-    fi
-
-
-    while true
-    do
-        LUSTRE_TMPSESSION=$LUSTRE_TMPROOT/$RANDOM
-        if mkdir $LUSTRE_TMPSESSION; then
-            echo "Temporary files will be stored in $LUSTRE_TMPSESSION" 1>&2
-            break
-        fi
-    done
-    # Cray XT cluster nodes don't have local writable tmp storage
-    export LUSTRE_TMP=$LUSTRE_TMPSESSION
-    export LUSTRE_RTMP=$LUSTRE_TMPSESSION/Rtmp
-    mkdir -p $LUSTRE_RTMP
     source $SWIFTRBIN/configure-server-crayxt 
   elif [ $server = pbsf ]; then
     source $SWIFTRBIN/configure-server-pbs

Modified: SwiftApps/SwiftR/Swift/man/swiftInit.Rd
===================================================================
--- SwiftApps/SwiftR/Swift/man/swiftInit.Rd	2011-09-29 20:05:16 UTC (rev 5196)
+++ SwiftApps/SwiftR/Swift/man/swiftInit.Rd	2011-09-30 00:16:08 UTC (rev 5197)
@@ -25,7 +25,8 @@
     The number of cores per host.  The default values vary from 2 to 8 depending on the server type.
 }
   \item{server}{
-    One of: "local", "ssh", "pbs", "sge", "pbsf", "cobalt".
+    One of: "local", "ssh", "pbs", "sge", "pbsf", "cobalt", "crayxt",
+    "pbsauto", "crayxtauto".
     How Swift will run the jobs: for example, if "local" is chosen, they
     will be run on the local machine, or if "pbs" is chosen, they will be
     run through the pbs scheduler.  
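
A minimal usage sketch against this man page (hypothetical R session; only
the new server value is shown, all other swiftInit arguments keep their
documented defaults):

    library(Swift)
    swiftInit(server="crayxtauto")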
