[Swift-commit] r4956 - trunk/bin/grid

wilde at ci.uchicago.edu
Sat Aug 6 17:40:14 CDT 2011


Author: wilde
Date: 2011-08-06 17:40:14 -0500 (Sat, 06 Aug 2011)
New Revision: 4956

Added:
   trunk/bin/grid/start-grid-service
Modified:
   trunk/bin/grid/TODO
   trunk/bin/grid/start-ranger-service
   trunk/bin/grid/start-ranger-service~
   trunk/bin/grid/start-swift-service
   trunk/bin/grid/swift-workers
Log:
commit snapshot of working version - work in progress

Modified: trunk/bin/grid/TODO
===================================================================
--- trunk/bin/grid/TODO	2011-08-06 05:45:15 UTC (rev 4955)
+++ trunk/bin/grid/TODO	2011-08-06 22:40:14 UTC (rev 4956)
@@ -1,4 +1,8 @@
+BUGS TO INVESTIGATE AND/OR FILE
 
+- why are there two logs from the coaster service: a UUID-named log and swift.log?
+
+
 EXTENCI APPLICATION WORK
 
 create modft install & test file; test under fork and work
@@ -14,11 +18,13 @@
 
 TO RESOLVE
 
-- how to set swift throttles to handle a varying number of coaster workers per site?
+- how to set swift throttles to handle a varying number of coaster
+workers per site?
 
 - why did Allan set exceptions in workdir names, eg for BNL?
 
-- how to dynamically grow/shrink pool and add/remove sites; dynamically take coaster services in and out of service.
+- how to dynamically grow/shrink pool and add/remove sites;
+dynamically take coaster services in and out of service.
 
 - settings for retry and replication
 
@@ -27,13 +33,26 @@
 
 - Add site selection option to foreachsite
 
+- Add test of larger/variable-size data transfer, and test of data
+transfer speed
+
+It should be easy to set the data size.
+
+It will be a bit harder to test the data transfer *speed*,
+though. That could perhaps be tested with a run against the fork
+jobmanager, where we can set a bound on the expected delay
+time. Otherwise the delay from the time the file staging starts to the
+time the app runs is hard to determine.
+
+
+
 CLEANUP
 
-- Find all interim tools under swift/lab/osg and place under grid/ for development
+- Find all interim tools under swift/lab/osg and place under grid/ for
+development
 
 ENHANCEMENTS
 
 - Find Glen's tgsites command and integrate
 
 - incorporate gstar (would be a good Globus Online feature)
-
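
A sketch of what the data-transfer test mentioned above could look like
(stage-test.swift and the SIZE_MB variable are placeholders, not tools in this
commit): generate an input file of a chosen size, then time a run against the
local/fork jobmanager so that the elapsed wall time bounds the staging delay.

    # hypothetical sketch; stage-test.swift would stage one input file and run a trivial app on it
    SIZE_MB=${SIZE_MB:-64}
    dd if=/dev/zero of=testdata.in bs=1M count=$SIZE_MB   # the data size is easy to vary
    time swift stage-test.swift                           # against fork, wall time bounds the staging delay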

Added: trunk/bin/grid/start-grid-service
===================================================================
--- trunk/bin/grid/start-grid-service	                        (rev 0)
+++ trunk/bin/grid/start-grid-service	2011-08-06 22:40:14 UTC (rev 4956)
@@ -0,0 +1,70 @@
+#! /bin/bash
+
+# FIXME: improve arg parsing / checking / optionals
+
+function usage ()
+{
+       echo "Usage:"
+       echo " $0 --throttle 0.01 --loglevel INFO|DEBUG|TRACE --jobspernode 1"
+}
+
+if [ $# -ne 6 ]
+then
+    usage
+    exit 1
+fi
+
+LOGLEVEL=INFO # INFO, DEBUG, TRACE for increasing detail
+THROTTLE=0.09
+
+while test "$1" != "" ; do
+    case $1 in
+        --jobspernode|-j)
+                JOBSPERNODE="$2"
+                shift
+        ;;
+        --loglevel|-l)
+                LOGLEVEL="$2"
+                shift
+        ;;
+        --throttle|-t)
+                THROTTLE="$2"
+                shift
+        ;;
+        -*)
+                echo "Error: no such option $1"
+                usage
+                exit 1
+        ;;
+    esac
+    shift
+done
+
+BIN=$(cd $(dirname $0); pwd)
+
+echo THROTTLE=$THROTTLE LOGLEVEL=$LOGLEVEL
+
+start-swift-service 1 &
+sleep 5
+SPORT=$(cat service.sports)
+cat >sites.grid-ps.xml <<EOF
+  <config>
+    <pool handle="localhost">
+      <execution provider="coaster-persistent" url="http://localhost:$SPORT" jobmanager="local:local"/>
+      <profile namespace="globus" key="workerManager">passive</profile>
+      <profile namespace="globus" key="jobsPerNode">$JOBSPERNODE</profile>
+      <profile key="jobThrottle" namespace="karajan">$THROTTLE</profile>
+      <profile namespace="karajan" key="initialScore">10000</profile>
+      <!-- <filesystem provider="local" url="none" /> -->
+      <profile namespace="swift" key="stagingMethod">proxy</profile>
+      <workdirectory>/tmp/$USER</workdirectory>
+    </pool>
+  </config>
+EOF
+
+WPORT=$(cat service.wports)
+SERVICE_URL=http://$(hostname -f):$WPORT
+echo swift service started - SPORT=$(cat service.sports) WPORT=$WPORT SERVICE_URL=$SERVICE_URL
+
+# SERVICE_URL=$SERVICE_URL WORKER_LOGLEVEL=$LOGLEVEL
+


Property changes on: trunk/bin/grid/start-grid-service
___________________________________________________________________
Added: svn:executable
   + *
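
For reference, a minimal invocation sketch of the new start-grid-service under
its current argument checking (exactly the three options shown in usage() are
required; the values below are only illustrative). The script starts one local
coaster service, reads its ports from service.sports/service.wports, and writes
sites.grid-ps.xml into the current directory:

    cd /path/to/run-dir                  # port files and sites.grid-ps.xml land here
    start-grid-service --throttle 0.09 --loglevel INFO --jobspernode 2
    cat service.sports service.wports    # service and worker ports reported by the coaster service
    cat sites.grid-ps.xml                # generated site definition pointing at that service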

Modified: trunk/bin/grid/start-ranger-service
===================================================================
--- trunk/bin/grid/start-ranger-service	2011-08-06 05:45:15 UTC (rev 4955)
+++ trunk/bin/grid/start-ranger-service	2011-08-06 22:40:14 UTC (rev 4956)
@@ -3,15 +3,25 @@
 function usage ()
 {
        echo "Usage:"
-       echo " $0 --nodes nnodes --walltime hh:mm:ss --project proj-name --queue q-name --user user-name"
+       echo " $0 --nodes nnodes --walltime hh:mm:ss --project proj-name --queue q-name --user user-name --startservice true"
 }
 
-if [ $# -ne 10 ]
+if [ $# -ne 12 ]
 then
     usage
     exit 1
 fi
 
+
+# NODES=${1:-1}
+# WALLTIME=${2:-00:10:00}
+# PROJECT=${3:-TG-DBS080004N}
+# QUEUE=${4:-development}
+# REMOTE_USER=${5:-$USER}
+# STARTSERVICE=false
+
+
+
 while test "$1" != "" ; do
     case $1 in
         --nodes|-n)
@@ -34,6 +44,10 @@
                 REMOTE_USER="$2"
                 shift
         ;;
+        --startservice|-s)
+                STARTSERVICE="$2"
+                shift
+        ;;
         -*)
                 echo "Error: no such option $1"
                 usage
@@ -43,19 +57,10 @@
     shift
 done
 
-
-
-#NODES=${1:-1}
-#WALLTIME=${2:-00:10:00}
-#PROJECT=${3:-TG-DBS080004N}
-#QUEUE=${4:-development}
-#REMOTE_USER=${5:-$USER}
-
-STARTSERVICE=true
 HOST=tg-login.ranger.tacc.teragrid.org
 BIN=$(cd $(dirname $0); pwd)
 
-echo NODES=$NODES WALLTIME=$WALLTIME PROJECT=$PROJECT REMOTE_USER=$REMOTE_USER QUEUE=$QUEUE
+echo NODES=$NODES WALLTIME=$WALLTIME PROJECT=$PROJECT REMOTE_USER=$REMOTE_USER QUEUE=$QUEUE STARTSERVICE=$STARTSERVICE
 LOGLEVEL=INFO # INFO, DEBUG, TRACE for increasing detail
 
 CORESPERNODE=16
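
With the new --startservice option, start-ranger-service now expects six
options (twelve arguments). An invocation sketch; the project and queue values
are the defaults that appear elsewhere in these scripts and are used here
purely as examples:

    start-ranger-service --nodes 2 --walltime 00:30:00 --project TG-DBS080004N \
                         --queue development --user $USER --startservice true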

Modified: trunk/bin/grid/start-ranger-service~
===================================================================
--- trunk/bin/grid/start-ranger-service~	2011-08-06 05:45:15 UTC (rev 4955)
+++ trunk/bin/grid/start-ranger-service~	2011-08-06 22:40:14 UTC (rev 4956)
@@ -1,18 +1,61 @@
 #! /bin/bash
 
-# FIXME: make these commandline keyword arguments, eg --nodes=
+function usage ()
+{
+       echo "Usage:"
+       echo " $0 --nodes nnodes --walltime hh:mm:ss --project proj-name --queue q-name --user user-name"
+}
 
-NODES=${1:-1}
-WALLTIME=${2:-00:10:00}
-PROJECT=${3:-TG-DBS080004N}
-QUEUE=${4:-development}
-REMOTE_USER=${5:-tg455797}
+if [ $# -ne 10 ]
+then
+    usage
+    exit 1
+fi
 
+while test "$1" != "" ; do
+    case $1 in
+        --nodes|-n)
+                NODES="$2"
+                shift
+        ;;
+        --walltime|-t)
+                WALLTIME="$2"
+                shift
+        ;;
+        --project|-p)
+                PROJECT="$2"
+                shift
+        ;;
+        --queue|-q)
+                QUEUE="$2"
+                shift
+        ;;
+        --user|-u)
+                REMOTE_USER="$2"
+                shift
+        ;;
+        -*)
+                echo "Error: no such option $1"
+                usage
+                exit 1
+        ;;
+    esac
+    shift
+done
+
+
+
+#NODES=${1:-1}
+#WALLTIME=${2:-00:10:00}
+#PROJECT=${3:-TG-DBS080004N}
+#QUEUE=${4:-development}
+#REMOTE_USER=${5:-$USER}
+
 STARTSERVICE=true
 HOST=tg-login.ranger.tacc.teragrid.org
 BIN=$(cd $(dirname $0); pwd)
 
-echo NODES=$NODES WALLTIME=$WALLTIME PROJECT=$PROJECT REMOTE_USER=$REMOTE_USER
+echo NODES=$NODES WALLTIME=$WALLTIME PROJECT=$PROJECT REMOTE_USER=$REMOTE_USER QUEUE=$QUEUE
 LOGLEVEL=INFO # INFO, DEBUG, TRACE for increasing detail
 
 CORESPERNODE=16
@@ -39,7 +82,7 @@
       <profile namespace="karajan" key="initialScore">10000</profile>
       <!-- <filesystem provider="local" url="none" /> -->
       <profile namespace="swift" key="stagingMethod">proxy</profile>
-      <workdirectory>/tmp/wilde</workdirectory>
+      <workdirectory>/tmp/$USER</workdirectory>
     </pool>
   </config>
 EOF
@@ -59,12 +102,17 @@
   exit 1
 fi
 
-echo Created remote dir
+echo Created remote dir: $rdir
 
 scp $BIN/{worker.pl,workers.ranger.sh,workers.ranger.sub} $REMOTE_USER@$HOST:$rdir
 
-echo Copied grid tools to remote dir
+echo Copied grid tools to remote dir: $rdir
 
-ssh $REMOTE_USER@$HOST qsub -A $PROJECT -N runworkers -pe 16way $(($NODES * 16)) -l h_rt=$WALLTIME -q $QUEUE -v SERVICE_URL=$SERVICE_URL,WORKER_LOGLEVEL=$LOGLEVEL $rdir/workers.ranger.sub
+echo Submitting ...
 
+echo "ssh $REMOTE_USER@$HOST qsub -A $PROJECT -N runworkers -pe 16way $(($NODES * 16)) -q $QUEUE -l h_rt=$WALLTIME -v SERVICE_URL=$SERVICE_URL,WORKER_LOGLEVEL=$LOGLEVEL $rdir/workers.ranger.sub"
+
+ssh $REMOTE_USER@$HOST qsub -A $PROJECT -N runworkers -pe 16way $(($NODES * 16)) -q $QUEUE -l h_rt=$WALLTIME -v SERVICE_URL=$SERVICE_URL,WORKER_LOGLEVEL=$LOGLEVEL $rdir/workers.ranger.sub
+
 echo Submitted remote worker launching script
+

Modified: trunk/bin/grid/start-swift-service
===================================================================
--- trunk/bin/grid/start-swift-service	2011-08-06 05:45:15 UTC (rev 4955)
+++ trunk/bin/grid/start-swift-service	2011-08-06 22:40:14 UTC (rev 4956)
@@ -1,36 +1,57 @@
 #!/bin/bash
 
-NSERVICES=$1
+NSERVICES=${1:-1}
 SERVICE=coaster-service  # found via PATH
 
-ontrap()  # FIXME: Not needed?
+echo $0: starting $NSERVICES services
+
+ontrap() 
 {
-  echo '====>' in ontrap
+  echo $0: Received signal, killing coaster services
   trap - 1 2 3 15
   echo start_service: trapping exit or signal
-  kill $(cat service-*.pid)
+  kill 0 # $(cat service-*.pid)
 }
 
-# trap ontrap 1 2 3 15  # FIXME: Not needed?
+trap ontrap 1 2 3 15  # FIXME: Not needed?
 
+# Launch the requested number of services
+
 rm -f service.sports service.wports
 for i in `seq -w 0 $((NSERVICES - 1))`; do
-  rm -f service-$i.{sport,wport,pid,log}
-  $SERVICE -nosec -passive -portfile service-$i.sport -localportfile service-$i.wport &> service-$i.log  &
+  rm -f service-$i.{sport,wport,pid,out}
+  $SERVICE -nosec -passive -portfile service-$i.sport -localportfile service-$i.wport &> service-$i.out  &
   echo $! >service-$i.pid
-  sleep 3
-  if [ -s service-$i.sport ]; then
-    echo $(cat service-$i.sport) >> service.sports
-  else
-    echo service-$i.sport does not exist or is empty. exiting.
-    exit 1
+done
+
+# Wait (a bit) for all services to report their port numbers
+# Record missing ones as "-"
+
+maxtries=10
+
+for (( tries=0; tries < $maxtries; tries++ )); do
+  sleep 1
+  rm -f service.sports service.wports  # start each try with clean port lists
+  errors=0
+  for i in `seq -w 0 $((NSERVICES - 1))`; do
+    if [ -s service-$i.sport -a -s service-$i.wport ]; then
+      echo $(cat service-$i.sport) >> service.sports
+      echo $(cat service-$i.wport) >> service.wports
+    else
+      echo 0 >> service.sports
+      echo 0 >> service.wports
+      errors=$((errors+1))
+    fi
+  done
+  if [ $errors = 0 ]; then
+    break
   fi
-  if [ -s service-$i.wport ]; then
-    echo $(cat service-$i.wport) >> service.wports
-  else
-    echo service-$i.wport does not exist or is empty. exiting.
-    exit 1
-  fi
 done
 
-wait
+if [ $errors != 0 ]; then
+  echo $0: $errors services failed to report their port numbers
+fi
+echo $0: $((NSERVICES-errors)) services started successfully
+
+wait # Wait on all the started services; stop them all if signalled (ontrap)
+
+
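
As revised, start-swift-service launches all requested coaster services in
parallel and then polls for up to ten seconds for each one's port files,
recording 0 for any service that never reports. A usage sketch mirroring how
start-grid-service drives it (the sleep length is illustrative):

    start-swift-service 4 &
    sleep 12                               # give the poll loop (up to ~10s) time to finish
    paste service.sports service.wports    # one line per service; 0 marks one that failed to report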

Modified: trunk/bin/grid/swift-workers
===================================================================
--- trunk/bin/grid/swift-workers	2011-08-06 05:45:15 UTC (rev 4955)
+++ trunk/bin/grid/swift-workers	2011-08-06 22:40:14 UTC (rev 4956)
@@ -1,5 +1,11 @@
-#!/usr/bin/env ruby
+#! /usr/bin/env ruby
 
+$stdout.sync = true
+
+# FIXME: Is this the best way to get lib functions from bin/grid into RUBYLIB ?
+
+$: << File.dirname($0)
+
 require 'mk_catalog'
 require 'etc'
 
@@ -14,6 +20,10 @@
 #      executable = /home/wilde/swift/src/0.92/cog/modules/swift/dist/swift-svn/bin/worker.pl
 #      arguments = http://128.135.125.17:<%= port %> <%= name %> /tmp 14400
 
+# WORKER_LOGGING_LEVEL=$LOGLEVEL $HOME/swift_gridtools/worker.pl $SERVICEURL swork${worker} $LOGDIR >& /dev/null &
+
+# a mod
+
   def gen_submit(count = 1)
     job = %q[
       universe = grid
@@ -33,11 +43,38 @@
       <% } %>
     ]
 
-    ERB.new(job.gsub(/^\s+/, ""), 0, "%<>", "@submit_file").result(binding)
+    ov=$VERBOSE
+    $VERBOSE=nil
+    workerExecutable = `which worker.pl`.chomp  # chomp: drop trailing newline before embedding in the submit file
+    $VERBOSE=ov
+#    workerContact = "http://communicado.ci.uchicago.edu:36906"
+    workerContact = ARGV[2]
+
+    newjob = %Q[
+      universe = grid
+      stream_output = False
+      stream_error = False
+      transfer_executable = true
+      periodic_remove = JobStatus == 5
+      notification = Never
+
+      globus_rsl = (maxwalltime=240)
+      grid_resource = <%= @grid_resource %>
+      executable = #{workerExecutable}
+      arguments = #{workerContact} swork /tmp
+      environment = WORKER_LOGGING_LEVEL=INFO
+      log = condor.log
+
+      <% count.times { %>queue
+      <% } %>
+    ]
+
+    ERB.new(newjob.gsub(/^\s+/, ""), 0, "%<>", "@submit_file").result(binding)
   end
 
   def submit_job(count)
-#    puts "Submitting #{@name} #{count} jobs"
+    puts "submit_job: Submitting #{@name} #{count} jobs"
+    count = count.to_i
     output = ""
 #return output
     submitfile = gen_submit(count)
@@ -50,12 +87,18 @@
   end
 
   def queued
+    ov=$VERBOSE
+    $VERBOSE=nil
     jobs = `condor_q  #{$username} -const 'GridResource == \"#{@grid_resource}\" && JobStatus == 1' -format \"%s \" GlobalJobId`
+    $VERBOSE=ov
     jobs.split(" ").size
   end
 
   def running
+    ov=$VERBOSE
+    $VERBOSE=nil
     jobs = `condor_q #{$username} -const 'GridResource == \"#{@grid_resource}\" && JobStatus == 2' -format \"%s \" GlobalJobId`
+    $VERBOSE=ov
     jobs.split(" ").size
   end
 
@@ -75,13 +118,13 @@
 =end
 
 if __FILE__ == $0
-  raise "No whitelist file" if !ARGV[0]
+  raise "No greenlist file" if !ARGV[0]
 
   start_port = 61100 # FIXME
   ctr        = 0
   threads    = []
   ARGV[1]    = "scec" if !ARGV[1]
-  whitelist  = IO.readlines(ARGV[0]).map { |line| line.chomp! }
+  greenlist  = IO.readlines(ARGV[0]).map { |line| line.chomp! }
   $username = Etc.getlogin
 
   puts "Username = #{$username}"
@@ -93,7 +136,7 @@
   totalRunning = 0
 
   ress_parse(ARGV[1]) do |name, value|
-    next if not whitelist.index(name) and not whitelist.empty?
+    next if not greenlist.index(name) and not greenlist.empty?
     totalCores += (value.throttle * 100 + 2).to_i
   end
   puts "totalCores for green sites = #{totalCores}"
@@ -105,14 +148,16 @@
       # swiftDemand = IO.read("swiftDemand")  # Replace this with sensor of Swift demand
       swiftDemand = 15
       paddedDemand = (swiftDemand * 1.2).to_i
+      ov=$VERBOSE;$VERBOSE=nil
       totalRunning = `condor_q #{$username} -const 'JobStatus == 2' -format \"%s \" GlobalJobId`.split(" ").size
+      $VERBOSE=ov
       puts "*** demandThread: swiftDemand=#{swiftDemand} paddedDemand=#{paddedDemand} totalRunning=#{totalRunning}"
       sleep 60
     end
   end
 
   ress_parse(ARGV[1]) do |name, value|
-    next if not whitelist.index(name) and not whitelist.empty?
+    next if not greenlist.index(name) and not greenlist.empty?
     site               = Site.new
     site.name          = name
     site.grid_resource = "gt2 #{value.url}/jobmanager-#{value.jm}"
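
From the argument handling above, swift-workers now takes a greenlist file, a
second argument passed to ress_parse (defaulting to "scec"), and the coaster
worker contact URL as its third argument; worker.pl must be on PATH, since the
generated Condor submit file transfers it to the site. A hypothetical
invocation, reusing the worker URL that start-grid-service reports (the file
name and URL construction are placeholders):

    which worker.pl                        # must resolve; the submit file transfers this executable
    swift-workers greenlist.txt scec http://$(hostname -f):$(head -1 service.wports)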



