[mpich2-commits] r7801 - mpich2/trunk/src/pm/hydra/pm/pmiserv
balaji at mcs.anl.gov
balaji at mcs.anl.gov
Thu Jan 20 22:16:08 CST 2011
Author: balaji
Date: 2011-01-20 22:16:08 -0600 (Thu, 20 Jan 2011)
New Revision: 7801
Modified:
mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip.c
mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip_cb.c
mpich2/trunk/src/pm/hydra/pm/pmiserv/pmiserv_cb.c
Log:
Handle the case where the PMI connection is abnormally broken, but the
application returns a zero exit code.
Modified: mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip.c 2011-01-21 04:16:05 UTC (rev 7800)
+++ mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip.c 2011-01-21 04:16:08 UTC (rev 7801)
@@ -232,7 +232,13 @@
if (pid > 0)
for (i = 0; i < HYD_pmcd_pmip.local.proxy_process_count; i++)
if (HYD_pmcd_pmip.downstream.pid[i] == pid) {
- HYD_pmcd_pmip.downstream.exit_status[i] = ret_status;
+ /* We store the new return status if either the
+ * exit status is uninitialized, or if the return
+ * status is non-zero. If the return status is
+ * zero, and the exit status has already been set
+ * to a different value, we use that. */
+ if (ret_status || HYD_pmcd_pmip.downstream.exit_status[i] ==-1)
+ HYD_pmcd_pmip.downstream.exit_status[i] = ret_status;
done++;
}
Modified: mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip_cb.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip_cb.c 2011-01-21 04:16:05 UTC (rev 7800)
+++ mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip_cb.c 2011-01-21 04:16:08 UTC (rev 7801)
@@ -253,16 +253,23 @@
status = HYDT_ftb_publish("FTB_MPI_PROCS_DEAD", ftb_event_payload);
HYDU_ERR_POP(status, "FTB publish failed\n");
+ /* Store a temporary erroneous exit status. In case the
+ * application does not return a non-zero exit status, we
+ * will use this. */
+ HYD_pmcd_pmip.downstream.exit_status[pid] = 1;
+
+ /* Deregister failed socket */
+ status = HYDT_dmx_deregister_fd(fd);
+ HYDU_ERR_POP(status, "unable to deregister fd\n");
+ close(fd);
+
if (HYD_pmcd_pmip.user_global.auto_cleanup) {
HYD_pmcd_pmip_kill_localprocs();
}
else {
/* If the user doesn't want to automatically cleanup,
- * deregister the socket, signal the remaining
- * processes, and send this information upstream */
- status = HYDT_dmx_deregister_fd(fd);
- HYDU_ERR_POP(status, "unable to deregister fd\n");
- close(fd);
+ * signal the remaining processes, and send this
+ * information upstream */
/* FIXME: This code needs to change from sending the
* SIGUSR1 signal to a PMI-2 notification message. */
Modified: mpich2/trunk/src/pm/hydra/pm/pmiserv/pmiserv_cb.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/pmiserv/pmiserv_cb.c 2011-01-21 04:16:05 UTC (rev 7800)
+++ mpich2/trunk/src/pm/hydra/pm/pmiserv/pmiserv_cb.c 2011-01-21 04:16:08 UTC (rev 7801)
@@ -191,6 +191,7 @@
if (HYD_server_info.user_global.auto_cleanup) {
for (i = 0; i < proxy->proxy_process_count; i++) {
if (proxy->exit_status[i]) {
+ HYDU_dump(stdout, "ONE OF THE PROCESSES TERMINATED BADLY: CLEANING UP\n");
status = HYD_pmcd_pmiserv_cleanup_pg(proxy->pg);
HYDU_ERR_POP(status, "unable to cleanup processes\n");
break;
More information about the mpich2-commits mailing list