[mpich2-commits] r7801 - mpich2/trunk/src/pm/hydra/pm/pmiserv

balaji at mcs.anl.gov balaji at mcs.anl.gov
Thu Jan 20 22:16:08 CST 2011


Author: balaji
Date: 2011-01-20 22:16:08 -0600 (Thu, 20 Jan 2011)
New Revision: 7801

Modified:
   mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip.c
   mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip_cb.c
   mpich2/trunk/src/pm/hydra/pm/pmiserv/pmiserv_cb.c
Log:
Handle the case where the PMI connection is abnormally broken, but the
application returns a zero exit code.

Modified: mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip.c	2011-01-21 04:16:05 UTC (rev 7800)
+++ mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip.c	2011-01-21 04:16:08 UTC (rev 7801)
@@ -232,7 +232,13 @@
         if (pid > 0)
             for (i = 0; i < HYD_pmcd_pmip.local.proxy_process_count; i++)
                 if (HYD_pmcd_pmip.downstream.pid[i] == pid) {
-                    HYD_pmcd_pmip.downstream.exit_status[i] = ret_status;
+                    /* Store the new return status if the exit status
+                     * is still uninitialized (-1) or the return status
+                     * is non-zero. If the return status is zero and an
+                     * exit status has already been recorded, keep the
+                     * recorded value. */
+                    if (ret_status || HYD_pmcd_pmip.downstream.exit_status[i] ==-1)
+                        HYD_pmcd_pmip.downstream.exit_status[i] = ret_status;
                     done++;
                 }
 

Modified: mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip_cb.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip_cb.c	2011-01-21 04:16:05 UTC (rev 7800)
+++ mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip_cb.c	2011-01-21 04:16:08 UTC (rev 7801)
@@ -253,16 +253,23 @@
             status = HYDT_ftb_publish("FTB_MPI_PROCS_DEAD", ftb_event_payload);
             HYDU_ERR_POP(status, "FTB publish failed\n");
 
+            /* Store a temporary erroneous exit status. In case the
+             * application does not return a non-zero exit status, we
+             * will use this. */
+            HYD_pmcd_pmip.downstream.exit_status[pid] = 1;
+
+            /* Deregister failed socket */
+            status = HYDT_dmx_deregister_fd(fd);
+            HYDU_ERR_POP(status, "unable to deregister fd\n");
+            close(fd);
+
             if (HYD_pmcd_pmip.user_global.auto_cleanup) {
                 HYD_pmcd_pmip_kill_localprocs();
             }
             else {
                 /* If the user doesn't want to automatically cleanup,
-                 * deregister the socket, signal the remaining
-                 * processes, and send this information upstream */
-                status = HYDT_dmx_deregister_fd(fd);
-                HYDU_ERR_POP(status, "unable to deregister fd\n");
-                close(fd);
+                 * signal the remaining processes, and send this
+                 * information upstream */
 
                 /* FIXME: This code needs to change from sending the
                  * SIGUSR1 signal to a PMI-2 notification message. */

Modified: mpich2/trunk/src/pm/hydra/pm/pmiserv/pmiserv_cb.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/pmiserv/pmiserv_cb.c	2011-01-21 04:16:05 UTC (rev 7800)
+++ mpich2/trunk/src/pm/hydra/pm/pmiserv/pmiserv_cb.c	2011-01-21 04:16:08 UTC (rev 7801)
@@ -191,6 +191,7 @@
         if (HYD_server_info.user_global.auto_cleanup) {
             for (i = 0; i < proxy->proxy_process_count; i++) {
                 if (proxy->exit_status[i]) {
+                    HYDU_dump(stdout, "ONE OF THE PROCESSES TERMINATED BADLY: CLEANING UP\n");
                     status = HYD_pmcd_pmiserv_cleanup_pg(proxy->pg);
                     HYDU_ERR_POP(status, "unable to cleanup processes\n");
                     break;



More information about the mpich2-commits mailing list