[Swift-commit] cog r3838
swift at ci.uchicago.edu
swift at ci.uchicago.edu
Fri Nov 22 22:30:04 CST 2013
------------------------------------------------------------------------
r3838 | hategan | 2013-11-22 22:27:56 -0600 (Fri, 22 Nov 2013) | 1 line
added a softImage setting that instructs the worker to unpack a tar.gz file before the first job is run on a node
------------------------------------------------------------------------
Index: modules/provider-coaster/src/org/globus/cog/abstraction/impl/execution/coaster/SubmitJobCommand.java
===================================================================
--- modules/provider-coaster/src/org/globus/cog/abstraction/impl/execution/coaster/SubmitJobCommand.java (revision 3837)
+++ modules/provider-coaster/src/org/globus/cog/abstraction/impl/execution/coaster/SubmitJobCommand.java (working copy)
@@ -116,6 +116,9 @@
if (spec.getAttribute("tracePerformance") != null) {
add(sb, "attr", "tracePerformance=" + spec.getAttribute("tracePerformance"));
}
+ if (spec.getAttribute("softImage") != null) {
+ add(sb, "attr", "softImage=" + spec.getAttribute("softImage"));
+ }
}
else {
for (String name : spec.getAttributeNames())
Index: modules/provider-coaster/resources/worker.pl
===================================================================
--- modules/provider-coaster/resources/worker.pl (revision 3837)
+++ modules/provider-coaster/resources/worker.pl (working copy)
@@ -134,6 +134,8 @@
my $LAST_JOB_CHECK_TIME = 0;
my $JOB_COUNT = 0;
+my $SOFT_IMAGE_DST;
+
use constant BUFSZ => 2048;
use constant IOBUFSZ => 32768;
use constant IOBLOCKSZ => 8;
@@ -1088,6 +1090,7 @@
sub dieNicely {
my ($msg) = @_;
+ cleanSoftImage();
wlog ERROR, "$msg\n";
if ($CONNECTED) {
$CONNECTED = 0; # avoid recursive calls to this method
@@ -1121,6 +1124,7 @@
wlog DEBUG, "Shutdown command received\n";
queueReply($tag, ("OK"));
sendQueued();
+ cleanSoftImage();
wlog INFO, "Acknowledged shutdown.\n";
wlog INFO, "Ran a total of $JOB_COUNT jobs\n";
if ($PROFILE) {
@@ -1790,7 +1794,120 @@
return substr($fn, 0, 1) eq "/";
}
+sub readUInt32 {
+ my ($fd) = @_;
+ seek($fd, 0, 0) or wlog ERROR, "Seek failed: $!\n";
+ my $n;
+ read($fd, $n, 4) or wlog ERROR, "Read failed: $!\n";
+ return unpack("V", $n);
+}
+sub writeUInt32 {
+ my ($fd, $n) = @_;
+
+ seek($fd, 0, 0) or wlog ERROR, "Seek failed: $!\n";
+ if (!(print {$fd} pack("V", $n))) {
+ wlog ERROR, "Write failed: $!\n";
+ }
+
+ return $n;
+}
+
+# Ensures that each job has the $src tar.gz file unpacked in
+# the directory pointed to by $dst. In addition, ensure that
+# the unpacking is only done if necessary so that it can be
+# shared between: 1. workers running concurrently on this node
+# (if $dst points to a local disk) and 2. subsequent jobs
+# running in this worker.
+#
+# If the image contains commonly used binaries and libraries,
+# it can be combined with a $PATH and $LD_LIBRARY_PATH settings
+# to minimize access to network file systems that would otherwise
+# be the sources for those binaries/libraries.
+sub prepareSoftImage {
+ my ($src, $dst) = @_;
+ my $lock;
+ my $counter;
+ mkpath($dst);
+ if (!open($lock, ">>$dst/.lock")) {
+ die "Cannot open lock file: $!";
+ }
+ # start critical section
+ if (!flock($lock, 2)) { # 2 - exclusive lock
+ die "Cannot get exclusive lock on soft image directory: $!";
+ }
+
+ if (! -f "$dst/.count") {
+ open($counter, "+>$dst/.count");
+ }
+ else {
+ open($counter, "+<$dst/.count");
+ }
+
+ seek($counter, 0, 2);
+ my $pos = tell($counter);
+ wlog DEBUG, "Counter file pos: $pos\n";
+ if ($pos == 0) {
+ wlog INFO, "Lead process. Uncompressing image $src to $dst\n";
+ wlog DEBUG, "Running tar -xzf $src -C $dst\n";
+ my $out;
+ $out = qx/tar -xzf $src -C $dst 2>&1/;
+ if ($? != 0) {
+ die "Cannot create soft image: $!\n$out";
+ }
+ $ENV{SOFTIMAGE} = $dst;
+ if (-x "$dst/start") {
+ wlog DEBUG, "Running $dst/start\n";
+ $out = qx/$dst\/start 2>&1/;
+ if ($? != 0) {
+ die "Error running soft image startup: $!\n$out";
+ }
+ }
+
+ writeUInt32($counter, 1);
+ wlog DEBUG, "Soft image use count updated: 1\n";
+ wlog DEBUG, "Soft image initialized\n";
+ }
+ else {
+ wlog DEBUG, "Not lead process\n";
+ if (! -f "$dst/.l$BLOCKID") {
+ my $n = writeUInt32($counter, readUInt32($counter) + 1);
+ wlog DEBUG, "Soft image use count updated: $n\n";
+ my $rl;
+ open($rl, ">$dst/.l$BLOCKID");
+ close($rl);
+ }
+ }
+ close($counter);
+ close($lock);
+ # end critical section
+}
+
+sub cleanSoftImage() {
+ my $lock;
+ my $counter;
+ open($lock, ">>$SOFT_IMAGE_DST/.lock");
+ if (!flock($lock, 2)) {
+ dieNicely("Cannot get exclusive lock on soft image directory: $!");
+ }
+
+ open($counter, "+<$SOFT_IMAGE_DST/.count");
+ if (writeUInt32($counter, readUInt32($counter) - 1) == 0) {
+ wlog INFO, "Tail process. Removing image from $SOFT_IMAGE_DST\n";
+
+ if (-x "$SOFT_IMAGE_DST/stop") {
+ my $out = qx/$SOFT_IMAGE_DST\/stop 2>&1/;
+ if ($? != 0) {
+ die "Error running soft image shutdown: $!\n$out";
+ }
+ }
+
+ rmtree($SOFT_IMAGE_DST, 0, 0);
+ }
+ close($counter);
+ close($lock);
+}
+
sub submitjob {
my ($tag, $timeout, $msgs) = @_;
my $desc = $$msgs[0];
@@ -1799,6 +1916,7 @@
my $JOBID = undef;
my $MAXWALLTIME = 600;
my $PERFTRACE = 0;
+ my $SOFTIMAGE;
my %JOB = ();
my @JOBARGS = ();
my %JOBENV = ();
@@ -1833,6 +1951,11 @@
# as long as the job is alive, enable debugging
$LOGLEVEL = DEBUG;
}
+ elsif ($ap[0] eq "softImage") {
+ $SOFTIMAGE = $ap[1];
+ my @SS = split(/ /, $SOFTIMAGE);
+ $SOFT_IMAGE_DST = $SS[1];
+ }
else {
wlog WARN, "Ignoring attribute $ap[0] = $ap[1]\n";
}
@@ -1886,6 +2009,7 @@
cleanup => \@CLEANUP,
maxwalltime => $MAXWALLTIME,
perftrace => $PERFTRACE,
+ softimage => $SOFTIMAGE,
};
stagein($JOBID);
@@ -1962,10 +2086,10 @@
if ($pid == 0) {
close $PARENT_R;
if ($JOBDATA{$JOBID}{"perftrace"} != 0) {
- stracerunjob($CHILD_W, $JOB, $JOBARGS, $JOBENV, $JOBSLOT, $WORKERPID, $JOBDATA{$JOBID}{"perftrace"});
+ stracerunjob($CHILD_W, $JOB, $JOBARGS, $JOBENV, $JOBSLOT, $WORKERPID, $JOBDATA{$JOBID});
}
else {
- runjob($CHILD_W, $JOB, $JOBARGS, $JOBENV, $JOBSLOT, $WORKERPID);
+ runjob($CHILD_W, $JOB, $JOBARGS, $JOBENV, $JOBSLOT, $WORKERPID, $JOBDATA{$JOBID});
}
close $CHILD_W;
}
@@ -2062,7 +2186,6 @@
return 0;
}
else {
-
if ($JOBWAITDATA{$JOBID}{"walltimeexceeded"}) {
wlog DEBUG, "Walltime exceeded. The status is $?\n";
$status = ERROR_PROCESS_WALLTIME_EXCEEDED;
@@ -2107,10 +2230,15 @@
}
sub runjob {
- my ($WR, $JOB, $JOBARGS, $JOBENV, $JOBSLOT, $WORKERPID) = @_;
+ my ($WR, $JOB, $JOBARGS, $JOBENV, $JOBSLOT, $WORKERPID, $JOBDATA) = @_;
my $executable = $$JOB{"executable"};
my $sout = $$JOB{"stdout"};
my $serr = $$JOB{"stderr"};
+
+ my $softImage = $$JOBDATA{"softimage"};
+ if (defined $softImage) {
+ prepareSoftImage(split(/ /, $softImage));
+ }
my $cwd = getcwd();
# wlog DEBUG, "CWD: $cwd\n";
@@ -2149,9 +2277,10 @@
}
sub stracerunjob {
- my ($WR, $JOB, $JOBARGS, $JOBENV, $JOBSLOT, $WORKERPID, $LOGID) = @_;
+ my ($WR, $JOB, $JOBARGS, $JOBENV, $JOBSLOT, $WORKERPID, $JOBDATA) = @_;
my $executable = $$JOB{"executable"};
+ my $LOGID = $$JOBDATA{"perftrace"};
$$JOB{"executable"} = "strace";
unshift @$JOBARGS, $executable;
unshift @$JOBARGS, "$LOGDIR/$BLOCKID-$ID-$LOGID.perf";
@@ -2160,7 +2289,7 @@
unshift @$JOBARGS, "-f";
unshift @$JOBARGS, "-T";
- runjob($WR, $JOB, $JOBARGS, $JOBENV, $JOBSLOT, $WORKERPID);
+ runjob($WR, $JOB, $JOBARGS, $JOBENV, $JOBSLOT, $WORKERPID, $JOBDATA);
}
More information about the Swift-commit
mailing list