[Swift-commit] r3768 - in SwiftApps/SwiftR: . Swift/exec

noreply at svn.ci.uchicago.edu noreply at svn.ci.uchicago.edu
Sun Dec 12 22:45:17 CST 2010


Author: wilde
Date: 2010-12-12 22:45:17 -0600 (Sun, 12 Dec 2010)
New Revision: 3768

Modified:
   SwiftApps/SwiftR/Swift/exec/configure-server-pbs
   SwiftApps/SwiftR/Swift/exec/start-swift
   SwiftApps/SwiftR/install.sh
Log:
initial changes to handle SGE. May need to re-adjust PBS changes.

Modified: SwiftApps/SwiftR/Swift/exec/configure-server-pbs
===================================================================
--- SwiftApps/SwiftR/Swift/exec/configure-server-pbs	2010-12-12 23:47:03 UTC (rev 3767)
+++ SwiftApps/SwiftR/Swift/exec/configure-server-pbs	2010-12-13 04:45:17 UTC (rev 3768)
@@ -50,7 +50,7 @@
 END
 
 
-true to save <<END
+: SAVE FOR REFERENCE <<END
 
 #     <profile namespace="globus" key="maxWallTime">00:00:01</profile>
 #     <profile namespace="globus" key="queue">fast</profile>

Modified: SwiftApps/SwiftR/Swift/exec/start-swift
===================================================================
--- SwiftApps/SwiftR/Swift/exec/start-swift	2010-12-12 23:47:03 UTC (rev 3767)
+++ SwiftApps/SwiftR/Swift/exec/start-swift	2010-12-13 04:45:17 UTC (rev 3768)
@@ -1,5 +1,7 @@
 #! /bin/bash
 
+set -x
+
 export TRAPEVENTS="EXIT 1 2 3 15"  # Signals and conditions to trap
 
 # Define internal functions
@@ -54,6 +56,8 @@
   echo $sshpids > $sshpidfile
 }
 
+# FIXME: does PBS need same workers-per-node logic as SGE?
+
 make-pbs-submit-file()
 {
   if [ $queue != default ]; then
@@ -61,7 +65,7 @@
   else
     queueDirective=""
   fi
-cat >pbs.sub <<END
+cat >batch.sub <<END
 #PBS -S /bin/sh
 #PBS -N SwiftR-workers
 #PBS -m n
@@ -85,6 +89,8 @@
 END
 }
 
+# Submit file for PBS systems with firewall restrictions: specifically, Merlot at VCU
+
 make-pbsf-submit-file()
 {
   if [ _$GLOBUS_HOSTNAME = _ ]; then
@@ -97,7 +103,7 @@
   else
     queueDirective=""
   fi
-cat >pbs.sub <<END
+cat >batch.sub <<END
 #PBS -S /bin/sh
 #PBS -N SwiftR-workers
 #PBS -m n
@@ -105,37 +111,99 @@
 #PBS -l walltime=$time
 #PBS -o $HOME
 #PBS -e $HOME
-$queueDirective
-WORKER_LOGGING_ENABLED=true # FIXME: parameterize; fix w PBS -v
-HOST=\$(echo $CONTACT | sed -e 's,^http://,,' -e 's/:.*//')
-PORT=\$(echo $CONTACT | sed -e 's,^.*:,,')
-CONTACT=http://localhost:\$PORT
-echo '***' PBS_NODEFILE file: \$PBS_NODEFILE CONTACT:$CONTACT
-cat \$PBS_NODEFILE
-echo '***' unique nodes are:
-sort < \$PBS_NODEFILE|uniq
-for h in \$(sort < \$PBS_NODEFILE|uniq); do
-  ssh \$h "echo Swift R startup running on host; hostname; echo HOST=\$HOST PORT=\$PORT CONTACT=\$CONTACT; cd /; ( ssh -N -L \$PORT:\$HOST:\$PORT \$HOST & sleep 3; /usr/bin/perl $SWIFTBIN/worker.pl \$CONTACT SwiftR-\$h $HOME/.globus/coasters $IDLETIMEOUT ; wait)" &
-done
+  $queueDirective
+  WORKER_LOGGING_ENABLED=true # FIXME: parameterize; fix w PBS -v
+  HOST=\$(echo $CONTACT | sed -e 's,^http://,,' -e 's/:.*//')
+  PORT=\$(echo $CONTACT | sed -e 's,^.*:,,')
+  CONTACT=http://localhost:\$PORT
+  echo '***' PBS_NODEFILE file: \$PBS_NODEFILE CONTACT:$CONTACT
+  cat \$PBS_NODEFILE
+  echo '***' unique nodes are:
+  sort < \$PBS_NODEFILE|uniq
+  for h in \$(sort < \$PBS_NODEFILE|uniq); do
+    ssh \$h "echo Swift R startup running on host; hostname; echo HOST=\$HOST PORT=\$PORT CONTACT=\$CONTACT; cd /; ( ssh -N -L \$PORT:\$HOST:\$PORT \$HOST & sleep 3; /usr/bin/perl $SWIFTBIN/worker.pl \$CONTACT SwiftR-\$h $HOME/.globus/coasters $IDLETIMEOUT ; wait)" &
+  done
+#
+  ontrap()
+  {
+    echo in ontrap
+    # Kill our processes on each node; do first node (on which this is running) last
+    for h in \$(sort < \$PBS_NODEFILE|uniq | sort -r); do
+      echo killing processes on host \$h
+      ssh \$h killall -u \$USER
+    done;
+    killall -u \$USER
+  }
+#
+  trap ontrap $TRAPEVENTS
+  wait
+#
+END
+}
 
-ontrap()
+# FIXME: for big systems like Ranger, need to use ssh_tree to avoid socket FD exhaustion?
+
+echo about to define make-sge
+
+make-sge-submit-file()
 {
-  echo in ontrap
-  # Kill our processes on each node; do first node (on which this is running) last
-  for h in \$(sort < \$PBS_NODEFILE|uniq | sort -r); do
-    echo killing processes on host \$h
-    ssh \$h killall -u \$USER
-  done;
-  killall -u \$USER
-}
+echo in $0
+  if [ $queue != default ]; then
+    queueDirective="#$ -q $queue"
+  else
+    queueDirective=""
+  fi
+  if [ $project != default ]; then
+    projectDirective="#$ -A $project"
+  else
+    projectDirective=""
+  fi
+  rcmd="qrsh" # FIXME - need to set on system basis; qrsh works for siraf
+  
+cat >batch.sub <<END
+#!/bin/bash
+#$ -S /bin/bash
+#$ -o $HOME
+#$ -e $HOME
+#$ -N SwiftR
+#$ -l h_rt=$time
+# #$ -v WORKER_LOGGING_LEVEL=NONE
+#$ -V
 
-trap ontrap $TRAPEVENTS
-wait
+# Siraf Site-specific:
+#$ -pe openmpi $(($nodes*$cores))
+$queueDirective
+#  -A ???
+#$projectDirective
 
+# Ranger Site-specific:
+# $ -pe 16way 256
+# $ -q development
+# $ -A TG-DBS080004N
+
+  cd / && NODES=\`cat \$PE_HOSTFILE | awk '{ for(i=0;i<\$2;i++){print \$1} }'\`
+
+  # -or-  cd / && NODES=`cat $PE_HOSTFILE | awk '{print $1}'` # Better for Ranger
+
+  # WORKER_LOGGING_ENABLED=true # FIXME: parameterize; fix w PBS -v
+  #cd / && /usr/bin/perl $SWIFTBIN/worker.pl $CONTACT SwiftR-workers $HOME/.globus/coasters $IDLETIMEOUT
+  HOST=\$(echo $CONTACT | sed -e 's,^http://,,' -e 's/:.*//')
+  PORT=\$(echo $CONTACT | sed -e 's,^.*:,,')
+  echo '***' PE_HOSTFILE file: \$PE_HOSTFILE CONTACT:$CONTACT
+
+  for h in \$NODES; do
+    workerCmd="echo Swift R startup running on host; hostname; cd /; WORKER_LOGGING_LEVEL=NONE /usr/bin/perl $SWIFTBIN/worker.pl $CONTACT SwiftR-\$h $HOME/.globus/coasters $IDLETIMEOUT"
+    if [ $rcmd = ssh ]; then
+      ssh \$h "\$workerCmd" &
+    else
+      qrsh -nostdin -l hostname=\$h "\$workerCmd" &
+    fi
+  done
+  wait
 END
 }
 
-function wait-and-start-pbs-workers
+function wait-and-start-batch-workers
 {
   get-contact
   LOGDIR=$(pwd)/swiftworkerlogs # full path. FIXME: Generate this with remote-side paths if not shared dir env?
@@ -145,11 +213,11 @@
 
   IDLETIMEOUT=$((60*60*240)) # 10 days: FIXME: make this a command line arg
 
-  # FIXME: set up for capturing pbs job id: rm -rf remotepid.* # FIXME: should not be needed if we start in a new dir each time
+  # FIXME: set up for capturing batch job id: rm -rf remotepid.* # FIXME: should not be needed if we start in a new dir each time
   make-${server}-submit-file
-  qsub pbs.sub>$pbsjobidfile
+  qsub batch.sub >$jobidfile
 
-  echo Started workers from PBS job $(cat $pbsjobidfile)
+  echo Started workers from batch job $(cat $jobidfile)
 }
 
 usage()
@@ -216,6 +284,7 @@
 throttle=10
 hosts=no-hosts-specified
 queue=default
+project=default
 
 while [ $# -gt 0 ]
 do
@@ -225,7 +294,8 @@
     -n) nodes=$2; verify-is-numeric nodes $nodes; shift ;;
     -p) throttle=$2; verify-is-numeric throttle $throttle; shift ;;
     -q) queue=$2; verify-not-null queue $queue; shift ;;
-    -s) server=$2; verify-is-one-of server $server local ssh pbs pbsf; shift ;;
+    -A) project=$2; verify-not-null project $project; shift ;;
+    -s) server=$2; verify-is-one-of server $server local ssh pbs pbsf sge; shift ;;
     -t) time=$2; verify-not-null time $time; shift ;;
     *)  usage; exit 1 ;;
   esac
@@ -300,31 +370,31 @@
   wait-and-start-ssh-workers &
   starterpid=$!
 
-elif [ \( $server = pbs \) -o \( $server = pbsf \) ]; then
+elif [ \( $server = pbs \) -o \( $server = pbsf \) -o \( $server = sge \) ]; then
 
-  source $SWIFTRBIN/configure-server-pbs $cores
+  source $SWIFTRBIN/configure-server-${server} $cores
 
-  pbsjobidfile=${out/stdouterr/pbsjobid}
+  jobidfile=${out/stdouterr/jobid}
 
   TRAPEVENTS="EXIT 1 2 3 15"  # Signals and conditions to trap
 
   function onexit {
     coasterservicepid="" # null: saved in case we go back to using coaster servers
     trap - $TRAPEVENTS
-    pbsjobid=$(cat $pbsjobidfile)
-    echo Terminating worker processes starter $starterpid and PBS job $pbsjobid
+    jobid=$(cat $jobidfile)
+    echo Terminating worker processes starter $starterpid and batch job $jobid
     if [ "_$starterpid != _ ]; then
       kill $starterpid
     fi
-    if [ "_$pbsjobid != _ ]; then
-      qdel $pbsjobid
+    if [ "_$jobid != _ ]; then
+      qdel $jobid
     fi
     kill 0 # Kill all procs in current process group # FIXME: what was this for????
   }
 
   trap onexit $TRAPEVENTS
 
-  wait-and-start-pbs-workers &
+  wait-and-start-batch-workers &
   starterpid=$!
 
 fi

Modified: SwiftApps/SwiftR/install.sh
===================================================================
--- SwiftApps/SwiftR/install.sh	2010-12-12 23:47:03 UTC (rev 3767)
+++ SwiftApps/SwiftR/install.sh	2010-12-13 04:45:17 UTC (rev 3768)
@@ -1,4 +1,8 @@
 ver=0.1
+rm -rf Swift/inst/swift/*
+mkdir -p Swift/inst/swift
+SWIFTREL=$(cd $(dirname $(which swift))/..; pwd)
+cp -pr $SWIFTREL/* Swift/inst/swift
 R CMD build Swift
 R CMD INSTALL Swift_${ver}.tar.gz
 cp Swift_${ver}.tar.gz ~/public_html




More information about the Swift-commit mailing list