[Swift-commit] cog r3907
swift at ci.uchicago.edu
swift at ci.uchicago.edu
Wed May 28 23:15:08 CDT 2014
------------------------------------------------------------------------
r3907 | hategan | 2014-05-28 23:14:07 -0500 (Wed, 28 May 2014) | 1 line
Fixed a bug where multiple stageout notifications were sent per job; stageouts are now aborted and the job status is properly sent if a stageout encounters an I/O error.
------------------------------------------------------------------------
Index: modules/provider-coaster/resources/worker.pl
===================================================================
--- modules/provider-coaster/resources/worker.pl (revision 3906)
+++ modules/provider-coaster/resources/worker.pl (working copy)
@@ -81,7 +81,7 @@
ERROR_STAGEIN_FILE_WRITE => 522,
ERROR_STAGEIN_COPY => 524,
ERROR_STAGEIN_REQUEST => 525,
- ERROR_STAGEOUT_COPY => 528,
+ ERROR_STAGEOUT_IO => 528,
ERROR_STAGEOUT_SEND => 515,
ERROR_STAGEOUT_TIMEOUT => 516,
ERROR_PROCESS_FORK => 512,
@@ -667,12 +667,20 @@
my $buffer;
my $sz = read($handle, $buffer, IOBUFSZ);
if (!defined $sz) {
- wlog INFO, "$tag Failed to read data from file: $!\n";
- return (FINAL_FLAG + ERROR_FLAG, "$!", CONTINUE);
+ my $err = "Failed to read data from file: $!";
+ my $jobid = $$state{"jobid"};
+ wlog INFO, "$tag $err\n";
+ abortStageouts($jobid);
+ queueJobStatusCmd($jobid, FAILED, ERROR_STAGEOUT_IO, $err);
+ return (FINAL_FLAG + ERROR_FLAG, "$err", CONTINUE);
}
elsif ($sz == 0 && $$state{"sent"} < $$state{"size"}) {
- wlog INFO, "$tag File size mismatch. $$state{'size'} vs. $$state{'sent'}\n";
- return (FINAL_FLAG + ERROR_FLAG, "File size mismatch. Expected $$state{'size'}, got $$state{'sent'}", CONTINUE);
+ my $err = "File size mismatch. Expected $$state{'size'}, got $$state{'sent'}";
+ my $jobid = $$state{"jobid"};
+ wlog INFO, "$tag $err\n";
+ abortStageouts($jobid);
+ queueJobStatusCmd($jobid, FAILED, ERROR_STAGEOUT_IO, $err);
+ return (FINAL_FLAG + ERROR_FLAG, $err, CONTINUE);
}
$$state{"sent"} += $sz;
wlog DEBUG, "$tag size: $$state{'size'}, sent: $$state{'sent'}\n";
@@ -691,7 +699,7 @@
}
sub fileData {
- my ($cmd, $lname, $rname) = @_;
+ my ($cmd, $jobid, $lname, $rname) = @_;
my $desc;
if (!open($desc, "<", "$lname")) {
@@ -700,6 +708,7 @@
}
return {
"cmd" => $cmd,
+ "jobid" => $jobid,
"state" => 0,
"handle" => $desc,
"nextData" => \&nextFileData,
@@ -1519,6 +1528,13 @@
push(@$waiting, $jobid);
}
+sub abortStageouts {
+ my ($jobid) = @_;
+
+ # something larger than the number of actual stageouts
+ $JOBDATA{$jobid}{"stageindex"} = 1000000;
+}
+
sub stageout {
my ($jobid) = @_;
@@ -1558,7 +1574,7 @@
if (!defined($JOBDATA{$jobid}{"stagoutStatusSent"})) {
wlog DEBUG, "$jobid Sending STAGEOUT status\n";
queueJobStatusCmd($jobid, STAGEOUT, 0, "workerid=$ID");
- $JOBDATA{$jobid}{"jobStatusSent"} = 1;
+ $JOBDATA{$jobid}{"stageoutStatusSent"} = 1;
}
my $rfile = $$STAGED[$STAGEINDEX];
$JOBDATA{$jobid}{"stageindex"} = $STAGEINDEX + 1;
@@ -1572,13 +1588,13 @@
$JOBDATA{$jobid}{"stageoutCount"} += 1;
wlog DEBUG, "$jobid Stagecount is $JOBDATA{$jobid}{stageoutCount}\n";
- queueCmdCustomDataHandling(putFileCB($jobid), fileData("PUT", $lfile, $rfile));
+ queueCmdCustomDataHandling(putFileCB($jobid), fileData("PUT", $jobid, $lfile, $rfile));
}
elsif ($protocol eq "sfs") {
mkfdir($jobid, $path);
if (!copy($lfile, $path)) {
wlog DEBUG, "$jobid Error staging out $lfile to $path: $!\n";
- queueJobStatusCmd($jobid, FAILED, ERROR_STAGEOUT_COPY, "$!");
+ queueJobStatusCmd($jobid, FAILED, ERROR_STAGEOUT_IO, "$!");
return;
}
else {
More information about the Swift-commit
mailing list