[Swift-commit] r4956 - trunk/bin/grid
wilde at ci.uchicago.edu
wilde at ci.uchicago.edu
Sat Aug 6 17:40:14 CDT 2011
Author: wilde
Date: 2011-08-06 17:40:14 -0500 (Sat, 06 Aug 2011)
New Revision: 4956
Added:
trunk/bin/grid/start-grid-service
Modified:
trunk/bin/grid/TODO
trunk/bin/grid/start-ranger-service
trunk/bin/grid/start-ranger-service~
trunk/bin/grid/start-swift-service
trunk/bin/grid/swift-workers
Log:
commit snapshot of working version - work in progress
Modified: trunk/bin/grid/TODO
===================================================================
--- trunk/bin/grid/TODO 2011-08-06 05:45:15 UTC (rev 4955)
+++ trunk/bin/grid/TODO 2011-08-06 22:40:14 UTC (rev 4956)
@@ -1,4 +1,8 @@
+BUGS TO INVESTIGATE AND/OR FILE
+- why are there two logs from the coaster service: a uuid-named log and swift.log?
+
+
EXTENCI APPLICATION WORK
create modft install & test file; test under fork and work
@@ -14,11 +18,13 @@
TO RESOLVE
-- how to set swift throttles to handle a varying number of coaster workers per site?
+- how to set swift throttles to handle a varying number of coaster
+workers per site?
- why did Allan set exceptions in workdir names, eg for BNL?
-- how to dynamically grow/shrink pool and add/remove sites; dynamically take coaster services in and out of service.
+- how to dynamically grow/shrink pool and add/remove sites;
+dynamically take coaster services in and out of service.
- settings for retry and replication
@@ -27,13 +33,26 @@
- Add site selection option to foreachsite
+- Add test of larger/variable-size data transfer, and test of data
+transfer speed
+
+It should be easy to set the data size.
+
+It will be a bit harder to test the data transfer *speed*,
+though. That could perhaps be tested with a run against the fork
+jobmanager, where we can set a bound on the expected delay
+time. Otherwise the delay from the time the file staging starts to the
+time the app runs is hard to determine.
+
+
+
CLEANUP
-- Find all interim tools under swift/lab/osg and place under grid/ for development
+- Find all interim tools under swift/lab/osg and place under grid/ for
+development
ENHANCEMENTS
- Find Glen's tgsites command and integrate
- incorporate gstar (would be a good Globus Online feature)
-
Added: trunk/bin/grid/start-grid-service
===================================================================
--- trunk/bin/grid/start-grid-service (rev 0)
+++ trunk/bin/grid/start-grid-service 2011-08-06 22:40:14 UTC (rev 4956)
@@ -0,0 +1,70 @@
+#! /bin/bash
+ # Launches start-swift-service in the background and generates sites.grid-ps.xml for a coaster-persistent pool on localhost.
+# FIXME: improve arg parsing / checking / optionals
+
+function usage ()
+{
+ echo "Usage:"
+ echo " $0 --throttle 0.01 --loglevel INFO|DEBUG|TRACE --jobspernode 1"
+}
+ # Exactly six arguments are required (three option/value pairs); anything else prints usage and exits 1.
+if [ $# -ne 6 ]
+then
+ usage
+ exit 1
+fi
+ # Initial values; JOBSPERNODE has none and stays unset unless --jobspernode is given.
+LOGLEVEL=INFO # INFO, DEBUG, TRACE for increasing detail
+THROTTLE=0.09 # substituted into the jobThrottle profile of the generated sites file
+ # Consume option/value pairs until $1 is empty; words not starting with '-' match no case branch and are skipped.
+while test "$1" != "" ; do
+ case $1 in
+ --jobspernode|-j)
+ JOBSPERNODE="$2"
+ shift # drops the option name; the shift after esac then drops its value
+ ;;
+ --loglevel|-l)
+ LOGLEVEL="$2"
+ shift
+ ;;
+ --throttle|-t)
+ THROTTLE="$2"
+ shift
+ ;;
+ -*)
+ echo "Error: no such option $1"
+ usage
+ exit 1
+ ;;
+ esac
+ shift
+done
+ # Absolute directory containing this script (BIN is not referenced again in this script).
+BIN=$(cd $(dirname $0); pwd)
+
+echo THROTTLE=$THROTTLE LOGLEVEL=$LOGLEVEL
+ # Start the service in the background, then wait a fixed 5 seconds before reading service.sports from the current directory.
+start-swift-service 1 &
+sleep 5
+SPORT=$(cat service.sports) # port used in the execution provider url below
+cat >sites.grid-ps.xml <<EOF
+ <config>
+ <pool handle="localhost">
+ <execution provider="coaster-persistent" url="http://localhost:$SPORT" jobmanager="local:local"/>
+ <profile namespace="globus" key="workerManager">passive</profile>
+ <profile namespace="globus" key="jobsPerNode">$JOBSPERNODE</profile>
+ <profile key="jobThrottle" namespace="karajan">$THROTTLE</profile>
+ <profile namespace="karajan" key="initialScore">10000</profile>
+ <!-- <filesystem provider="local" url="none" /> -->
+ <profile namespace="swift" key="stagingMethod">proxy</profile>
+ <workdirectory>/tmp/$USER</workdirectory>
+ </pool>
+ </config>
+EOF
+ # NOTE(review): SPORT and WPORT end up empty if the port files do not exist yet after the 5-second sleep - confirm this wait is sufficient.
+WPORT=$(cat service.wports)
+SERVICE_URL=http://$(hostname -f):$WPORT # built from this host's fully qualified name and the port read from service.wports
+echo swift service started - SPORT=$(cat service.sports) WPORT=$WPORT SERVICE_URL=$SERVICE_URL
+ # The line below is commented out; apart from it, LOGLEVEL is only echoed by this script.
+# SERVICE_URL=$SERVICE_URL WORKER_LOGLEVEL=$LOGLEVEL
+
Property changes on: trunk/bin/grid/start-grid-service
___________________________________________________________________
Added: svn:executable
+ *
Modified: trunk/bin/grid/start-ranger-service
===================================================================
--- trunk/bin/grid/start-ranger-service 2011-08-06 05:45:15 UTC (rev 4955)
+++ trunk/bin/grid/start-ranger-service 2011-08-06 22:40:14 UTC (rev 4956)
@@ -3,15 +3,25 @@
function usage ()
{
echo "Usage:"
- echo " $0 --nodes nnodes --walltime hh:mm:ss --project proj-name --queue q-name --user user-name"
+ echo " $0 --nodes nnodes --walltime hh:mm:ss --project proj-name --queue q-name --user user-name" --startservice true
}
-if [ $# -ne 10 ]
+if [ $# -ne 12 ]
then
usage
exit 1
fi
+
+# NODES=${1:-1}
+# WALLTIME=${2:-00:10:00}
+# PROJECT=${3:-TG-DBS080004N}
+# QUEUE=${4:-development}
+# REMOTE_USER=${5:-$USER}
+# STARTSERVICE=false
+
+
+
while test "$1" != "" ; do
case $1 in
--nodes|-n)
@@ -34,6 +44,10 @@
REMOTE_USER="$2"
shift
;;
+ --startservice|-)
+ STARTSERVICE="$2"
+ shift
+ ;;
-*)
echo "Error: no such option $1"
usage
@@ -43,19 +57,10 @@
shift
done
-
-
-#NODES=${1:-1}
-#WALLTIME=${2:-00:10:00}
-#PROJECT=${3:-TG-DBS080004N}
-#QUEUE=${4:-development}
-#REMOTE_USER=${5:-$USER}
-
-STARTSERVICE=true
HOST=tg-login.ranger.tacc.teragrid.org
BIN=$(cd $(dirname $0); pwd)
-echo NODES=$NODES WALLTIME=$WALLTIME PROJECT=$PROJECT REMOTE_USER=$REMOTE_USER QUEUE=$QUEUE
+echo NODES=$NODES WALLTIME=$WALLTIME PROJECT=$PROJECT REMOTE_USER=$REMOTE_USER QUEUE=$QUEUE STARTSERVICE=$STARTSERVICE
LOGLEVEL=INFO # INFO, DEBUG, TRACE for increasing detail
CORESPERNODE=16
Modified: trunk/bin/grid/start-ranger-service~
===================================================================
--- trunk/bin/grid/start-ranger-service~ 2011-08-06 05:45:15 UTC (rev 4955)
+++ trunk/bin/grid/start-ranger-service~ 2011-08-06 22:40:14 UTC (rev 4956)
@@ -1,18 +1,61 @@
#! /bin/bash
-# FIXME: make these commandline keyword arguments, eg --nodes=
+function usage ()
+{
+ echo "Usage:"
+ echo " $0 --nodes nnodes --walltime hh:mm:ss --project proj-name --queue q-name --user user-name"
+}
-NODES=${1:-1}
-WALLTIME=${2:-00:10:00}
-PROJECT=${3:-TG-DBS080004N}
-QUEUE=${4:-development}
-REMOTE_USER=${5:-tg455797}
+if [ $# -ne 10 ]
+then
+ usage
+ exit 1
+fi
+while test "$1" != "" ; do
+ case $1 in
+ --nodes|-n)
+ NODES="$2"
+ shift
+ ;;
+ --walltime|-t)
+ WALLTIME="$2"
+ shift
+ ;;
+ --project|-p)
+ PROJECT="$2"
+ shift
+ ;;
+ --queue|-q)
+ QUEUE="$2"
+ shift
+ ;;
+ --user|-u)
+ REMOTE_USER="$2"
+ shift
+ ;;
+ -*)
+ echo "Error: no such option $1"
+ usage
+ exit 1
+ ;;
+ esac
+ shift
+done
+
+
+
+#NODES=${1:-1}
+#WALLTIME=${2:-00:10:00}
+#PROJECT=${3:-TG-DBS080004N}
+#QUEUE=${4:-development}
+#REMOTE_USER=${5:-$USER}
+
STARTSERVICE=true
HOST=tg-login.ranger.tacc.teragrid.org
BIN=$(cd $(dirname $0); pwd)
-echo NODES=$NODES WALLTIME=$WALLTIME PROJECT=$PROJECT REMOTE_USER=$REMOTE_USER
+echo NODES=$NODES WALLTIME=$WALLTIME PROJECT=$PROJECT REMOTE_USER=$REMOTE_USER QUEUE=$QUEUE
LOGLEVEL=INFO # INFO, DEBUG, TRACE for increasing detail
CORESPERNODE=16
@@ -39,7 +82,7 @@
<profile namespace="karajan" key="initialScore">10000</profile>
<!-- <filesystem provider="local" url="none" /> -->
<profile namespace="swift" key="stagingMethod">proxy</profile>
- <workdirectory>/tmp/wilde</workdirectory>
+ <workdirectory>/tmp/$USER</workdirectory>
</pool>
</config>
EOF
@@ -59,12 +102,17 @@
exit 1
fi
-echo Created remote dir
+echo Created remote dir: $rdir
scp $BIN/{worker.pl,workers.ranger.sh,workers.ranger.sub} $REMOTE_USER@$HOST:$rdir
-echo Copied grid tools to remote dir
+echo Copied grid tools to remote dir: $rdir
-ssh $REMOTE_USER@$HOST qsub -A $PROJECT -N runworkers -pe 16way $(($NODES * 16)) -l h_rt=$WALLTIME -q $QUEUE -v SERVICE_URL=$SERVICE_URL,WORKER_LOGLEVEL=$LOGLEVEL $rdir/workers.ranger.sub
+echo Submitting ...
+echo "ssh $REMOTE_USER@$HOST qsub -A $PROJECT -N runworkers -pe 16way $(($NODES * 16)) -q $QUEUE -l h_rt=$WALLTIME -v SERVICE_URL=$SERVICE_URL, WORKER_LOGLEVEL=$LOGLEVEL $rdir/workers.ranger.sub"
+
+ssh $REMOTE_USER@$HOST qsub -A $PROJECT -N runworkers -pe 16way $(($NODES * 16)) -q $QUEUE -l h_rt=$WALLTIME -v SERVICE_URL=$SERVICE_URL,WORKER_LOGLEVEL=$LOGLEVEL $rdir/workers.ranger.sub
+
echo Submitted remote worker launching script
+
Modified: trunk/bin/grid/start-swift-service
===================================================================
--- trunk/bin/grid/start-swift-service 2011-08-06 05:45:15 UTC (rev 4955)
+++ trunk/bin/grid/start-swift-service 2011-08-06 22:40:14 UTC (rev 4956)
@@ -1,36 +1,57 @@
#!/bin/bash
-NSERVICES=$1
+NSERVICES=${1:-1}
SERVICE=coaster-service # found via PATH
-ontrap() # FIXME: Not needed?
+echo $0: starting $NSERVICES services
+
+ontrap()
{
- echo '====>' in ontrap
+ echo $0: Received signal, killing coaster services
trap - 1 2 3 15
echo start_service: trapping exit or signal
- kill $(cat service-*.pid)
+ kill 0 # $(cat service-*.pid)
}
-# trap ontrap 1 2 3 15 # FIXME: Not needed?
+trap ontrap 1 2 3 15 # FIXME: Not needed?
+# Launch the requested number of services
+
rm -f service.sports service.wports
for i in `seq -w 0 $((NSERVICES - 1))`; do
- rm -f service-$i.{sport,wport,pid,log}
- $SERVICE -nosec -passive -portfile service-$i.sport -localportfile service-$i.wport &> service-$i.log &
+ rm -f service-$i.{sport,wport,pid,out}
+ $SERVICE -nosec -passive -portfile service-$i.sport -localportfile service-$i.wport &> service-$i.out &
echo $! >service-$i.pid
- sleep 3
- if [ -s service-$i.sport ]; then
- echo $(cat service-$i.sport) >> service.sports
- else
- echo service-$i.sport does not exist or is empty. exiting.
- exit 1
+done
+
+# Wait (a bit) for all services to report their port numbers
+# Record missing ones as "-"
+
+maxtries=10
+
+for (( tries=0; tries < $maxtries; tries++ )); do
+ sleep 1
+ errors=0
+ for i in `seq -w 0 $((NSERVICES - 1))`; do
+ if [ -s service-$i.sport -a -s service-$i.wport ]; then
+ echo $(cat service-$i.sport) >> service.sports
+ echo $(cat service-$i.wport) >> service.wports
+ else
+ echo 0 >> service.sports
+ echo 0 >> service.wports
+ errors=$((errors+1))
+ fi
+ done
+ if [ $errors = 0 ]; then
+ break
fi
- if [ -s service-$i.wport ]; then
- echo $(cat service-$i.wport) >> service.wports
- else
- echo service-$i.wport does not exist or is empty. exiting.
- exit 1
- fi
done
-wait
+if [ $errors != 0 ]; then
+ echo $0: $errors services failesd to report their port numbers
+fi
+echo $0: $((NSERVICES-errors)) services started successfully
+
+wait # Wait on all the started services; stop them all if signalled (ontrap)
+
+
Modified: trunk/bin/grid/swift-workers
===================================================================
--- trunk/bin/grid/swift-workers 2011-08-06 05:45:15 UTC (rev 4955)
+++ trunk/bin/grid/swift-workers 2011-08-06 22:40:14 UTC (rev 4956)
@@ -1,5 +1,11 @@
-#!/usr/bin/env ruby
+#! /usr/bin/env ruby
+$stdout.sync = true
+
+# FIXME: Is this the best way to get lib functions from bin/grid into RUBYLIB ?
+
+$:[$:.length] = File.dirname($0)
+
require 'mk_catalog'
require 'etc'
@@ -14,6 +20,10 @@
# executable = /home/wilde/swift/src/0.92/cog/modules/swift/dist/swift-svn/bin/worker.pl
# arguments = http://128.135.125.17:<%= port %> <%= name %> /tmp 14400
+# WORKER_LOGGING_LEVEL=$LOGLEVEL $HOME/swift_gridtools/worker.pl $SERVICEURL swork${worker} $LOGDIR >& /dev/null &
+
+# a mod
+
def gen_submit(count = 1)
job = %q[
universe = grid
@@ -33,11 +43,38 @@
<% } %>
]
- ERB.new(job.gsub(/^\s+/, ""), 0, "%<>", "@submit_file").result(binding)
+ ov=$VERBOSE
+ $VERBOSE=nil
+ workerExecutable = `which worker.pl`
+ $VERBOSE=ov
+# workerContact = "http://communicado.ci.uchicago.edu:36906"
+ workerContact = ARGV[2]
+
+ newjob = %Q[
+ universe = grid
+ stream_output = False
+ stream_error = False
+ transfer_executable = true
+ periodic_remove = JobStatus == 5
+ notification = Never
+
+ globus_rsl = (maxwalltime=240)
+ grid_resource = <%= @grid_resource %>
+ executable = #{workerExecutable}
+ arguments = #{workerContact} swork /tmp
+ environment = WORKER_LOGGING_LEVEL=INFO
+ log = condor.log
+
+ <% count.times { %>queue
+ <% } %>
+ ]
+
+ ERB.new(newjob.gsub(/^\s+/, ""), 0, "%<>", "@submit_file").result(binding)
end
def submit_job(count)
-# puts "Submitting #{@name} #{count} jobs"
+ puts "submit_job: Submitting #{@name} #{count} jobs"
+ count = count.to_i
output = ""
#return output
submitfile = gen_submit(count)
@@ -50,12 +87,18 @@
end
def queued
+ ov=$VERBOSE
+ $VERBOSE=nil
jobs = `condor_q #{$username} -const 'GridResource == \"#{@grid_resource}\" && JobStatus == 1' -format \"%s \" GlobalJobId`
+ $VERBOSE=ov
jobs.split(" ").size
end
def running
+ ov=$VERBOSE
+ $VERBOSE=nil
jobs = `condor_q #{$username} -const 'GridResource == \"#{@grid_resource}\" && JobStatus == 2' -format \"%s \" GlobalJobId`
+ $VERBOSE=ov
jobs.split(" ").size
end
@@ -75,13 +118,13 @@
=end
if __FILE__ == $0
- raise "No whitelist file" if !ARGV[0]
+ raise "No greenlist file" if !ARGV[0]
start_port = 61100 # FIXME
ctr = 0
threads = []
ARGV[1] = "scec" if !ARGV[1]
- whitelist = IO.readlines(ARGV[0]).map { |line| line.chomp! }
+ greenlist = IO.readlines(ARGV[0]).map { |line| line.chomp! }
$username = Etc.getlogin
puts "Username = #{$username}"
@@ -93,7 +136,7 @@
totalRunning = 0
ress_parse(ARGV[1]) do |name, value|
- next if not whitelist.index(name) and not whitelist.empty?
+ next if not greenlist.index(name) and not greenlist.empty?
totalCores += (value.throttle * 100 + 2).to_i
end
puts "totalCores for green sites = #{totalCores}"
@@ -105,14 +148,16 @@
# swiftDemand = IO.read("swiftDemand") # Replace this with sensor of Swift demand
swiftDemand = 15
paddedDemand = (swiftDemand * 1.2).to_i
+ ov=$VERBOSE;$VERBOSE=nil
totalRunning = `condor_q #{$username} -const 'JobStatus == 2' -format \"%s \" GlobalJobId`.split(" ").size
+ $VERBOSE=ov
puts "*** demandThread: swiftDemand=#{swiftDemand} paddedDemand=#{paddedDemand} totalRunning=#{totalRunning}"
sleep 60
end
end
ress_parse(ARGV[1]) do |name, value|
- next if not whitelist.index(name) and not whitelist.empty?
+ next if not greenlist.index(name) and not greenlist.empty?
site = Site.new
site.name = name
site.grid_resource = "gt2 #{value.url}/jobmanager-#{value.jm}"
More information about the Swift-commit
mailing list