[Swift-commit] cog r3897

swift at ci.uchicago.edu swift at ci.uchicago.edu
Wed May 7 15:30:03 CDT 2014


------------------------------------------------------------------------
r3897 | hategan | 2014-05-07 15:29:40 -0500 (Wed, 07 May 2014) | 1 line

deal better with errors in the lead job when a soft image is requested (i.e. signal all workers and jobs that a failure has occurred)
------------------------------------------------------------------------
Index: modules/provider-coaster/resources/worker.pl
===================================================================
--- modules/provider-coaster/resources/worker.pl	(revision 3896)
+++ modules/provider-coaster/resources/worker.pl	(working copy)
@@ -145,8 +145,14 @@
 my $SOFT_IMAGE_CREATE_LOCK;
 my $SOFT_IMAGE_USE_LOCK;
 my $SOFT_IMAGE_DIR;
+# true if this is the first worker on a node
 my $SOFT_IMAGE_LEAD_PROCESS = 0;
+# true if this is the first job in this worker
 my $SOFT_IMAGE_FIRST_IN_PROCESS = 1;
+# keep track of the job that stages in the soft image;
+# any error that occurs in this job should cause the worker
+# to signal all other workers on this node to fail and then quit
+my $SOFT_IMAGE_JOB_ID;
 
 use constant BUFSZ => 2048;
 use constant IOBUFSZ => 32768;
@@ -1088,6 +1094,9 @@
 sub queueJobStatusCmd {
 	my ($jobid, $statusCode, $errorCode, $msg) = @_;
 	
+	if ($statusCode == FAILED) {
+		checkSoftimageJobFailure($jobid, $msg);
+	} 
 	queueCmd((nullCB(), "JOBSTATUS", $jobid, 
 			encodeInt($statusCode), encodeInt($errorCode), $msg, NULL_TIMESTAMP));
 }
@@ -1095,6 +1104,9 @@
 sub queueJobStatusCmdExt {
 	my ($jobid, $statusCode, $errorCode, $msg, $out, $err) = @_;
 	
+	if ($statusCode == FAILED) {
+		checkSoftimageJobFailure($jobid, $msg);
+	}
 	queueCmd((nullCB(), "JOBSTATUS", $jobid, 
 			encodeInt($statusCode), encodeInt($errorCode), $msg, NULL_TIMESTAMP, $out, $err));
 }
@@ -1989,6 +2001,12 @@
 		$SOFT_IMAGE_CREATE_LOCK = writeLock("$SOFT_IMAGE_DIR/.create");
 		unlock($SOFT_IMAGE_USE_LOCK);
 		$SOFT_IMAGE_USE_LOCK = readLock("$SOFT_IMAGE_DIR/.use");
+		
+		# make sure no errors from previous runs are there
+		if (-f "$SOFT_IMAGE_DIR/.error") {
+			unlink("$SOFT_IMAGE_DIR/.error");
+		}
+		
 		return 1;
 	}
 	else {
@@ -2044,6 +2062,17 @@
 	}
 }
 
+# If $JOBID is the job staging the soft image, write $err to .error and release the create lock.
+sub checkSoftimageJobFailure {
+	my ($JOBID, $err) = @_;
+	return unless defined($JOBID) && defined($SOFT_IMAGE_JOB_ID) && $JOBID eq $SOFT_IMAGE_JOB_ID;
+	$SOFT_IMAGE_JOB_ID = -1; # make sure the failure is only recorded once
+	if (open(my $ERRF, ">", "$SOFT_IMAGE_DIR/.error")) {
+		print $ERRF $err;
+		close($ERRF);
+	}
+	unlock($SOFT_IMAGE_CREATE_LOCK); # release the lock even if .error could not be written
+}
 
 sub cleanSoftImage {
 	if (!defined $SOFT_IMAGE_DIR) {
@@ -2137,6 +2166,7 @@
 						push @STAGEIND, $dest;
 						$SOFTIMAGE = $dest;
 					}
+					$SOFT_IMAGE_JOB_ID = $JOBID;
 				}
 				else {
 					# prevent job from trying to unpack the image
@@ -2182,7 +2212,8 @@
 			$JOB{$pair[0]} = $pair[1];
 		}
 	}
-	if (checkJob($tag, $JOBID, \%JOB)) {
+	my $err = checkJob($tag, $JOBID, \%JOB);
+	if ($err eq "") {
 		$JOBDATA{$JOBID} = {
 			stagein => \@STAGEIN,
 			stageind => \@STAGEIND,
@@ -2201,6 +2232,10 @@
 
 		stagein($JOBID);
 	}
+	else {
+		queueError($tag, ($err));
+		checkSoftimageJobFailure($JOBID, $err);
+	}
 }
 
 sub checkJob() {
@@ -2214,12 +2249,10 @@
 
 		wlog DEBUG, "$JOBID Job details $ds\n";
 
-		queueError($tag, ("Missing job identity"));
-		return 0;
+		return "Missing job identity";
 	}
 	elsif (!(defined $executable)) {
-		queueError($tag, ("Missing executable"));
-		return 0;
+		return "Missing executable";
 	}
 	else {
 		my $dir = $$JOB{directory};
@@ -2231,8 +2264,7 @@
 		my $c;
 		foreach $c (@$cleanup) {
 			if (substr($c, 0, $dirlen) ne $dir) {
-				queueError($tag, ("Cannot clean up outside of the job directory (cleanup: $c, jobdir: $dir)"));
-				return 0;
+				return "Cannot clean up outside of the job directory (cleanup: $c, jobdir: $dir)";
 			}
 		}
 		chdir $dir;
@@ -2240,7 +2272,7 @@
 		wlog DEBUG, "$JOBID Sending submit reply (tag=$tag)\n";
 		queueReply($tag, ("OK"));
 		wlog DEBUG, "$JOBID Submit reply sent (tag=$tag)\n";
-		return 1;
+		return "";
 	}
 }
 
@@ -2438,13 +2470,20 @@
 		wlog DEBUG, "Got soft image\n";
 		# no need to hold lock after that
 		unlock($createLock);
+		if (-f "$SOFT_IMAGE_DIR/.error") {
+			open(my $ERRF, "<$SOFT_IMAGE_DIR/.error");
+			my $err = "";
+			while (<$ERRF>) {
+				$err .= $_;
+			}
+			dieNicely("Soft image deployment failed: $err");
+		}
 		
 		$ENV{SOFTIMAGE} = $SOFT_IMAGE_DIR;
 	}
 	
 	my $cwd = getcwd();
-	# wlog DEBUG, "CWD: $cwd\n";
-	# wlog DEBUG, "Running $executable\n";
+	
 	my $ename;
 	foreach $ename (keys %$JOBENV) {
 		$ENV{$ename} = $$JOBENV{$ename};
@@ -2473,7 +2512,10 @@
 		open STDERR, ">$serr" or dieNicely("Cannot redirect STDERR");
 	}
 	close STDIN;
-
+	
+	#wlog DEBUG, "CWD: $cwd\n";
+	#wlog DEBUG, "Running $executable\n";
+	
 	exec { $executable } @$JOBARGS or print $WR "Could not execute $executable: $!\n";
 	die "Could not execute $executable: $!";
 }



More information about the Swift-commit mailing list