[mpich2-commits] r4019 - in mpich2/trunk/src/pm/hydra: bootstrap/include bootstrap/ssh bootstrap/utils control/consys control/utils demux include launcher/mpiexec pm/central pm/include utils/signals utils/sock

balaji at mcs.anl.gov balaji at mcs.anl.gov
Thu Mar 12 00:28:34 CDT 2009


Author: balaji
Date: 2009-03-12 00:28:34 -0500 (Thu, 12 Mar 2009)
New Revision: 4019

Removed:
   mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu_signal.c
Modified:
   mpich2/trunk/src/pm/hydra/bootstrap/include/bsci.h
   mpich2/trunk/src/pm/hydra/bootstrap/ssh/ssh_launch.c
   mpich2/trunk/src/pm/hydra/bootstrap/utils/Makefile.sm
   mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu.h
   mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu_wait.c
   mpich2/trunk/src/pm/hydra/control/consys/Makefile.sm
   mpich2/trunk/src/pm/hydra/control/consys/consys_close.c
   mpich2/trunk/src/pm/hydra/control/consys/consys_finalize.c
   mpich2/trunk/src/pm/hydra/control/consys/consys_wait.c
   mpich2/trunk/src/pm/hydra/control/utils/Makefile.sm
   mpich2/trunk/src/pm/hydra/demux/demux.c
   mpich2/trunk/src/pm/hydra/include/hydra.h
   mpich2/trunk/src/pm/hydra/include/hydra_base.h
   mpich2/trunk/src/pm/hydra/include/hydra_utils.h
   mpich2/trunk/src/pm/hydra/launcher/mpiexec/mpiexec.c
   mpich2/trunk/src/pm/hydra/launcher/mpiexec/utils.c
   mpich2/trunk/src/pm/hydra/pm/central/central.h
   mpich2/trunk/src/pm/hydra/pm/central/central_cb.c
   mpich2/trunk/src/pm/hydra/pm/central/central_finalize.c
   mpich2/trunk/src/pm/hydra/pm/central/central_launch.c
   mpich2/trunk/src/pm/hydra/pm/central/proxy.c
   mpich2/trunk/src/pm/hydra/pm/central/proxy.h
   mpich2/trunk/src/pm/hydra/pm/central/proxy_cb.c
   mpich2/trunk/src/pm/hydra/pm/central/proxy_utils.c
   mpich2/trunk/src/pm/hydra/pm/include/pmci.h
   mpich2/trunk/src/pm/hydra/utils/signals/signals.c
   mpich2/trunk/src/pm/hydra/utils/sock/sock.c
Log:
Added a connection setup between the job launcher and the proxies in
case of an abnormal exit, so that the runaway processes can be cleaned
up properly (based on the PID, instead of the executable name).

Algorithm: Each proxy keeps track of the PIDs of the processes it
launches and listens on a socket for incoming connections from the job
launcher. If the exit is clean, this socket is not used at all. However,
if the job launcher wants to kill the application (due to a timeout,
or an abort by another process in the application), a connection is
established on this socket and a message is sent to the proxy to kill its
corresponding processes. We only support one command right now (KILLALL).

This should resolve ticket #447.


Modified: mpich2/trunk/src/pm/hydra/bootstrap/include/bsci.h
===================================================================
--- mpich2/trunk/src/pm/hydra/bootstrap/include/bsci.h	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/bootstrap/include/bsci.h	2009-03-12 05:28:34 UTC (rev 4019)
@@ -10,7 +10,6 @@
 #include "hydra.h"
 
 HYD_Status HYD_BSCI_Launch_procs(void);
-HYD_Status HYD_BSCI_Cleanup_procs(void);
 HYD_Status HYD_BSCI_Get_universe_size(int *size);
 HYD_Status HYD_BSCI_Wait_for_completion(void);
 HYD_Status HYD_BSCI_Finalize(void);

Modified: mpich2/trunk/src/pm/hydra/bootstrap/ssh/ssh_launch.c
===================================================================
--- mpich2/trunk/src/pm/hydra/bootstrap/ssh/ssh_launch.c	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/bootstrap/ssh/ssh_launch.c	2009-03-12 05:28:34 UTC (rev 4019)
@@ -27,12 +27,6 @@
 
     HYDU_FUNC_ENTER();
 
-    status = HYD_BSCU_Set_common_signals(HYD_BSCU_Signal_handler);
-    if (status != HYD_SUCCESS) {
-        HYDU_Error_printf("signal utils returned error when trying to set signal\n");
-        goto fn_fail;
-    }
-
     /* FIXME: Instead of directly reading from the HYD_Handle
      * structure, the upper layers should be able to pass what exactly
      * they want launched. Without this functionality, the proxy
@@ -97,56 +91,3 @@
   fn_fail:
     goto fn_exit;
 }
-
-
-HYD_Status HYD_BSCI_Cleanup_procs(void)
-{
-    struct HYD_Proc_params *proc_params;
-    struct HYD_Partition_list *partition;
-    char *client_arg[HYD_EXEC_ARGS], *execname;
-    int arg;
-    HYD_Status status = HYD_SUCCESS;
-
-    HYDU_FUNC_ENTER();
-
-    for (proc_params = handle.proc_params; proc_params; proc_params = proc_params->next) {
-        for (partition = proc_params->partition; partition; partition = partition->next) {
-            /* Setup the executable arguments */
-            arg = 0;
-            client_arg[arg++] = MPIU_Strdup("/usr/bin/ssh");
-            client_arg[arg++] = MPIU_Strdup("-x");
-
-            /* ssh does not support any partition names other than host names */
-            client_arg[arg++] = MPIU_Strdup(partition->name);
-            client_arg[arg++] = NULL;
-
-            for (arg = 0; client_arg[arg]; arg++);
-            client_arg[arg++] = MPIU_Strdup("killall");
-
-            execname = strrchr(proc_params->exec[0], '/');
-            if (!execname)
-                execname = proc_params->exec[0];
-            else
-                execname++;
-
-            client_arg[arg++] = MPIU_Strdup(execname);
-            client_arg[arg++] = NULL;
-
-            status = HYDU_Create_process(client_arg, NULL, NULL, NULL, NULL);
-            if (status != HYD_SUCCESS) {
-                HYDU_Error_printf("bootstrap spawn process returned error\n");
-                goto fn_fail;
-            }
-
-            for (arg = 0; client_arg[arg]; arg++)
-                HYDU_FREE(client_arg[arg]);
-        }
-    }
-
-  fn_exit:
-    HYDU_FUNC_EXIT();
-    return status;
-
-  fn_fail:
-    goto fn_exit;
-}

Modified: mpich2/trunk/src/pm/hydra/bootstrap/utils/Makefile.sm
===================================================================
--- mpich2/trunk/src/pm/hydra/bootstrap/utils/Makefile.sm	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/bootstrap/utils/Makefile.sm	2009-03-12 05:28:34 UTC (rev 4019)
@@ -7,7 +7,7 @@
 HYDRA_LIB_PATH = ../../lib
 
 libhydra_a_DIR = ${HYDRA_LIB_PATH}
-libhydra_a_SOURCES = bscu_wait.c bscu_signal.c
+libhydra_a_SOURCES = bscu_wait.c
 INCLUDES = -I${abs_srcdir}/../../include \
 	-I${abs_srcdir}/../../../../include \
 	-I../../include \

Modified: mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu.h
===================================================================
--- mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu.h	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu.h	2009-03-12 05:28:34 UTC (rev 4019)
@@ -12,7 +12,5 @@
 #include "bsci.h"
 
 HYD_Status HYD_BSCU_Wait_for_completion(void);
-HYD_Status HYD_BSCU_Set_common_signals(void (*handler) (int));
-void HYD_BSCU_Signal_handler(int signal);
 
 #endif /* BSCI_H_INCLUDED */

Deleted: mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu_signal.c
===================================================================
--- mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu_signal.c	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu_signal.c	2009-03-12 05:28:34 UTC (rev 4019)
@@ -1,83 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/*
- *  (C) 2008 by Argonne National Laboratory.
- *      See COPYRIGHT in top-level directory.
- */
-
-#include "hydra.h"
-#include "hydra_utils.h"
-#include "bsci.h"
-#include "bscu.h"
-
-HYD_Status HYD_BSCU_Set_common_signals(void (*handler) (int))
-{
-    HYD_Status status = HYD_SUCCESS;
-
-    HYDU_FUNC_ENTER();
-
-    status = HYDU_Set_signal(SIGINT, handler);
-    if (status != HYD_SUCCESS) {
-        HYDU_Error_printf("signal utils returned error when trying to set SIGINT signal\n");
-        goto fn_fail;
-    }
-
-    status = HYDU_Set_signal(SIGQUIT, handler);
-    if (status != HYD_SUCCESS) {
-        HYDU_Error_printf("signal utils returned error when trying to set SIGQUIT signal\n");
-        goto fn_fail;
-    }
-
-    status = HYDU_Set_signal(SIGTERM, handler);
-    if (status != HYD_SUCCESS) {
-        HYDU_Error_printf("signal utils returned error when trying to set SIGTERM signal\n");
-        goto fn_fail;
-    }
-
-#if defined SIGSTOP
-    status = HYDU_Set_signal(SIGSTOP, handler);
-    if (status != HYD_SUCCESS) {
-        HYDU_Error_printf("signal utils returned error when trying to set SIGSTOP signal\n");
-        goto fn_fail;
-    }
-#endif /* SIGSTOP */
-
-#if defined SIGCONT
-    status = HYDU_Set_signal(SIGCONT, handler);
-    if (status != HYD_SUCCESS) {
-        HYDU_Error_printf("signal utils returned error when trying to set SIGCONT signal\n");
-        goto fn_fail;
-    }
-#endif /* SIGCONT */
-
-  fn_exit:
-    HYDU_FUNC_EXIT();
-    return status;
-
-  fn_fail:
-    goto fn_exit;
-}
-
-
-void HYD_BSCU_Signal_handler(int signal)
-{
-    HYDU_FUNC_ENTER();
-
-    if (signal == SIGINT || signal == SIGQUIT || signal == SIGTERM
-#if defined SIGSTOP
-        || signal == SIGSTOP
-#endif /* SIGSTOP */
-#if defined SIGCONT
-        || signal == SIGCONT
-#endif /* SIGSTOP */
-) {
-        /* There's nothing we can do with the return value for now. */
-        HYD_BSCI_Cleanup_procs();
-        exit(-1);
-    }
-    else {
-        /* Ignore other signals for now */
-    }
-
-    HYDU_FUNC_EXIT();
-    return;
-}

Modified: mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu_wait.c
===================================================================
--- mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu_wait.c	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu_wait.c	2009-03-12 05:28:34 UTC (rev 4019)
@@ -63,13 +63,8 @@
          * and should not be used. */
     } while (not_completed > 0);
 
-    if (not_completed) {
-        status = HYD_BSCI_Cleanup_procs();
-        if (status != HYD_SUCCESS) {
-            HYDU_Error_printf("bootstrap process cleanup failed\n");
-            goto fn_fail;
-        }
-    }
+    if (not_completed)
+        status = HYD_INTERNAL_ERROR;
 
   fn_exit:
     HYDU_FUNC_EXIT();

Modified: mpich2/trunk/src/pm/hydra/control/consys/Makefile.sm
===================================================================
--- mpich2/trunk/src/pm/hydra/control/consys/Makefile.sm	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/control/consys/Makefile.sm	2009-03-12 05:28:34 UTC (rev 4019)
@@ -16,5 +16,4 @@
 	-I${abs_srcdir}/../include \
 	-I${abs_srcdir}/../utils \
 	-I${abs_srcdir}/../../pm/include \
-	-I${abs_srcdir}/../../bootstrap/include \
 	-I${abs_srcdir}/../../demux

Modified: mpich2/trunk/src/pm/hydra/control/consys/consys_close.c
===================================================================
--- mpich2/trunk/src/pm/hydra/control/consys/consys_close.c	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/control/consys/consys_close.c	2009-03-12 05:28:34 UTC (rev 4019)
@@ -8,7 +8,6 @@
 #include "hydra_utils.h"
 #include "csi.h"
 #include "pmci.h"
-#include "bsci.h"
 #include "demux.h"
 
 HYD_Handle handle;

Modified: mpich2/trunk/src/pm/hydra/control/consys/consys_finalize.c
===================================================================
--- mpich2/trunk/src/pm/hydra/control/consys/consys_finalize.c	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/control/consys/consys_finalize.c	2009-03-12 05:28:34 UTC (rev 4019)
@@ -7,7 +7,6 @@
 #include "hydra.h"
 #include "csi.h"
 #include "pmci.h"
-#include "bsci.h"
 #include "demux.h"
 
 HYD_Status HYD_CSI_Finalize(void)
@@ -22,12 +21,6 @@
         goto fn_fail;
     }
 
-    status = HYD_BSCI_Finalize();
-    if (status != HYD_SUCCESS) {
-        HYDU_Error_printf("bootstrap server finalize returned an error\n");
-        goto fn_fail;
-    }
-
     status = HYD_DMX_Finalize();
     if (status != HYD_SUCCESS) {
         HYDU_Error_printf("demux engine finalize returned an error\n");

Modified: mpich2/trunk/src/pm/hydra/control/consys/consys_wait.c
===================================================================
--- mpich2/trunk/src/pm/hydra/control/consys/consys_wait.c	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/control/consys/consys_wait.c	2009-03-12 05:28:34 UTC (rev 4019)
@@ -8,7 +8,6 @@
 #include "csi.h"
 #include "csiu.h"
 #include "pmci.h"
-#include "bsci.h"
 #include "demux.h"
 
 HYD_Handle handle;
@@ -47,11 +46,11 @@
         if (sockets_open && HYDU_Time_left(handle.start, handle.timeout))
             continue;
 
-        /* Make sure all the processes have terminated. The bootstrap
-         * control device will take care of that. */
-        status = HYD_BSCI_Wait_for_completion();
+        /* Make sure all the processes have terminated. The process
+         * manager control device will take care of that. */
+        status = HYD_PMCI_Wait_for_completion();
         if (status != HYD_SUCCESS) {
-            HYDU_Error_printf("bootstrap server returned error when waiting for completion\n");
+            HYDU_Error_printf("process manager returned error when waiting for completion\n");
             goto fn_fail;
         }
 

Modified: mpich2/trunk/src/pm/hydra/control/utils/Makefile.sm
===================================================================
--- mpich2/trunk/src/pm/hydra/control/utils/Makefile.sm	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/control/utils/Makefile.sm	2009-03-12 05:28:34 UTC (rev 4019)
@@ -15,5 +15,4 @@
 	-I${abs_srcdir}/../../launcher/utils \
 	-I${abs_srcdir}/../include \
 	-I${abs_srcdir}/../../pm/include \
-	-I${abs_srcdir}/../../bootstrap/include \
 	-I${abs_srcdir}/../../demux

Modified: mpich2/trunk/src/pm/hydra/demux/demux.c
===================================================================
--- mpich2/trunk/src/pm/hydra/demux/demux.c	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/demux/demux.c	2009-03-12 05:28:34 UTC (rev 4019)
@@ -137,8 +137,11 @@
         ret = poll(pollfds, total_fds, time);
         if (ret < 0) {
             if (errno == EINTR) {
-                /* We were interrupted by a system call; loop back */
-                continue;
+                /* poll() was interrupted by a signal; this is not an
+                 * error in the regular sense, but the upper layer
+                 * needs to gracefully clean up the processes. */
+                status = HYD_SUCCESS;
+                goto fn_exit;
             }
             HYDU_Error_printf("poll error (errno: %d)\n", errno);
             status = HYD_SOCK_ERROR;

Modified: mpich2/trunk/src/pm/hydra/include/hydra.h
===================================================================
--- mpich2/trunk/src/pm/hydra/include/hydra.h	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/include/hydra.h	2009-03-12 05:28:34 UTC (rev 4019)
@@ -12,12 +12,13 @@
 #include "hydra_utils.h"
 
 struct HYD_Handle_ {
+    char *base_path;
+    int proxy_port;
+    char *boot_server;
+
     int debug;
     int enablex;
     char *wdir;
-
-    char *base_path;
-
     char *host_file;
 
     /* Global environment */

Modified: mpich2/trunk/src/pm/hydra/include/hydra_base.h
===================================================================
--- mpich2/trunk/src/pm/hydra/include/hydra_base.h	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/include/hydra_base.h	2009-03-12 05:28:34 UTC (rev 4019)
@@ -51,6 +51,8 @@
 extern char **environ;
 #endif /* MANUAL_EXTERN_ENVIRON */
 
+#define HYD_DEFAULT_PROXY_PORT 9899
+
 typedef enum {
     HYD_SUCCESS = 0,
     HYD_NO_MEM,

Modified: mpich2/trunk/src/pm/hydra/include/hydra_utils.h
===================================================================
--- mpich2/trunk/src/pm/hydra/include/hydra_utils.h	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/include/hydra_utils.h	2009-03-12 05:28:34 UTC (rev 4019)
@@ -116,6 +116,7 @@
 #endif
 
 HYD_Status HYDU_Set_signal(int signum, void (*handler) (int));
+HYD_Status HYDU_Set_common_signals(void (*handler) (int));
 
 
 /* Timer utilities */

Modified: mpich2/trunk/src/pm/hydra/launcher/mpiexec/mpiexec.c
===================================================================
--- mpich2/trunk/src/pm/hydra/launcher/mpiexec/mpiexec.c	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/launcher/mpiexec/mpiexec.c	2009-03-12 05:28:34 UTC (rev 4019)
@@ -25,6 +25,7 @@
     printf("\t-genvnone                        [Do not pass any environment variables]\n");
     printf("\t-genvall                         [Pass all environment variables (default)]\n");
     printf("\t-f {name}                        [File containing the host names]\n");
+    printf("\t--proxy-port                     [Port on which proxies can listen]\n");
 
     printf("\n");
 

Modified: mpich2/trunk/src/pm/hydra/launcher/mpiexec/utils.c
===================================================================
--- mpich2/trunk/src/pm/hydra/launcher/mpiexec/utils.c	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/launcher/mpiexec/utils.c	2009-03-12 05:28:34 UTC (rev 4019)
@@ -108,6 +108,7 @@
     handle.enablex = -1;
     handle.wdir = NULL;
     handle.host_file = NULL;
+    handle.proxy_port = -1;
 
     status = HYDU_Get_base_path(argv[0], &handle.base_path);
     if (status != HYD_SUCCESS) {
@@ -175,6 +176,22 @@
             continue;
         }
 
+        /* Check if the proxy port is set */
+        if (!strcmp(*argv, "--proxy-port")) {
+            CHECK_LOCAL_PARAM_START(local_params_started, status);
+            CHECK_NEXT_ARG_VALID(status);
+
+            if (handle.proxy_port != -1) {
+                HYDU_Error_printf("Duplicate proxy port setting; previously set to %d\n",
+                                  handle.proxy_port);
+                status = HYD_INTERNAL_ERROR;
+                goto fn_fail;
+            }
+
+            handle.proxy_port = atoi(*argv);
+            continue;
+        }
+
         /* Check what all environment variables need to be propagated */
         if (!strcmp(*argv, "-genvall") || !strcmp(*argv, "-genvnone") ||
             !strcmp(*argv, "-genvlist")) {
@@ -420,6 +437,11 @@
         proc_params = proc_params->next;
     }
 
+    /* If the proxy port is not set, set it to the default value */
+    if (handle.proxy_port == -1) {
+        handle.proxy_port = HYD_DEFAULT_PROXY_PORT;
+    }
+
   fn_exit:
     HYDU_FUNC_EXIT();
     return status;

Modified: mpich2/trunk/src/pm/hydra/pm/central/central.h
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/central/central.h	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/pm/central/central.h	2009-03-12 05:28:34 UTC (rev 4019)
@@ -15,5 +15,7 @@
 extern int HYD_PMCD_Central_listenfd;
 
 HYD_Status HYD_PMCD_Central_cb(int fd, HYD_Event_t events);
+HYD_Status HYD_PMCD_Central_cleanup(void);
+void HYD_PMCD_Central_signal_cb(int signal);
 
 #endif /* CENTRAL_H_INCLUDED */

Modified: mpich2/trunk/src/pm/hydra/pm/central/central_cb.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/central/central_cb.c	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/pm/central/central_cb.c	2009-03-12 05:28:34 UTC (rev 4019)
@@ -13,6 +13,7 @@
 #include "central.h"
 
 int HYD_PMCD_Central_listenfd;
+HYD_Handle handle;
 
 /*
  * HYD_PMCD_Central_cb: This is the core PMI server part of the
@@ -58,6 +59,14 @@
             goto fn_fail;
         }
 
+        /* Make this socket non-blocking as we should not keep waiting
+         * for data on the PMI connections. */
+        status = HYDU_Sock_set_nonblock(accept_fd);
+        if (status != HYD_SUCCESS) {
+            HYDU_Error_printf("sock utils returned error setting socket to non-blocking\n");
+            goto fn_fail;
+        }
+
         status = HYD_DMX_Register_fd(1, &accept_fd, HYD_STDOUT, HYD_PMCD_Central_cb);
         if (status != HYD_SUCCESS) {
             HYDU_Error_printf("demux engine returned error when registering fd\n");
@@ -75,7 +84,7 @@
             /* This is not a clean close. If a finalize was called, we
              * would have deregistered this socket. The application
              * might have aborted. Just cleanup all the processes */
-            status = HYD_BSCI_Cleanup_procs();
+            status = HYD_PMCD_Central_cleanup();
             if (status != HYD_SUCCESS) {
                 HYDU_Error_printf("bootstrap server returned error cleaning up processes\n");
                 goto fn_fail;
@@ -151,8 +160,7 @@
             /* Cleanup all the processes and return. We don't need to
              * check the return status since we are anyway returning
              * an error */
-            HYD_BSCI_Cleanup_procs();
-
+            HYD_PMCD_Central_cleanup();
             status = HYD_INTERNAL_ERROR;
             goto fn_fail;
         }
@@ -171,3 +179,68 @@
   fn_fail:
     goto fn_exit;
 }
+
+
+HYD_Status HYD_PMCD_Central_cleanup(void)
+{
+    struct HYD_Proc_params *proc_params;
+    struct HYD_Partition_list *partition;
+    int fd;
+    enum HYD_Proxy_cmds cmd;
+    HYD_Status status = HYD_SUCCESS, overall_status = HYD_SUCCESS;
+
+    HYDU_FUNC_ENTER();
+
+    /* Connect to every proxy and send a KILLALL_PROCS command. Failures
+     * are logged and reported as HYD_INTERNAL_ERROR after all are tried.
+     * FIXME: fork a bunch of processes to do this instead of doing it here. */
+    cmd = KILLALL_PROCS;
+    for (proc_params = handle.proc_params; proc_params; proc_params = proc_params->next) {
+        for (partition = proc_params->partition; partition; partition = partition->next) {
+            status = HYDU_Sock_connect(partition->name, handle.proxy_port, &fd);
+            if (status != HYD_SUCCESS) {
+                overall_status = HYD_INTERNAL_ERROR;
+                HYDU_Error_printf("unable to connect to the proxy on %s\n", partition->name);
+                continue;       /* Move on to the next proxy */
+            }
+
+            status = HYDU_Sock_write(fd, &cmd, sizeof(cmd));
+            if (status != HYD_SUCCESS) {
+                overall_status = HYD_INTERNAL_ERROR;
+                HYDU_Error_printf("unable to send data to the proxy on %s\n", partition->name);
+                /* No continue here: fall through so fd is closed, not leaked */
+            }
+
+            close(fd);
+        }
+    }
+
+    HYDU_FUNC_EXIT();
+
+    return overall_status;
+}
+
+
+void HYD_PMCD_Central_signal_cb(int signal)
+{
+    HYDU_FUNC_ENTER();
+
+    if (signal == SIGINT || signal == SIGQUIT || signal == SIGTERM
+#if defined SIGSTOP
+        || signal == SIGSTOP
+#endif /* SIGSTOP */
+#if defined SIGCONT
+        || signal == SIGCONT
+#endif /* SIGCONT */
+) {
+        /* Tell the proxies to kill their processes; the result is ignored. */
+        HYD_PMCD_Central_cleanup();
+        exit(-1);
+    }
+    else {
+        /* Ignore other signals for now */
+    }
+
+    HYDU_FUNC_EXIT();
+    return;
+}

Modified: mpich2/trunk/src/pm/hydra/pm/central/central_finalize.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/central/central_finalize.c	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/pm/central/central_finalize.c	2009-03-12 05:28:34 UTC (rev 4019)
@@ -36,6 +36,12 @@
         goto fn_fail;
     }
 
+    status = HYD_BSCI_Finalize();
+    if (status != HYD_SUCCESS) {
+        HYDU_Error_printf("unable to finalize the bootstrap server\n");
+        goto fn_fail;
+    }
+
   fn_exit:
     HYDU_FUNC_EXIT();
     return status;

Modified: mpich2/trunk/src/pm/hydra/pm/central/central_launch.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/central/central_launch.c	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/pm/central/central_launch.c	2009-03-12 05:28:34 UTC (rev 4019)
@@ -49,6 +49,12 @@
 
     HYDU_FUNC_ENTER();
 
+    status = HYDU_Set_common_signals(HYD_PMCD_Central_signal_cb);
+    if (status != HYD_SUCCESS) {
+        HYDU_Error_printf("signal utils returned error when trying to set signal\n");
+        goto fn_fail;
+    }
+
     /* Check if the user wants us to use a port within a certain
      * range. */
     port_range = getenv("MPIEXEC_PORTRANGE");
@@ -58,6 +64,7 @@
         port_range = getenv("MPICH_PORT_RANGE");
 
     /* Listen on a port in the port range */
+    port = 0;
     status = HYDU_Sock_listen(&HYD_PMCD_Central_listenfd, port_range, &port);
     if (status != HYD_SUCCESS) {
         HYDU_Error_printf("sock utils returned listen error\n");
@@ -65,8 +72,8 @@
     }
 
     /* Register the listening socket with the demux engine */
-    status =
-        HYD_DMX_Register_fd(1, &HYD_PMCD_Central_listenfd, HYD_STDOUT, HYD_PMCD_Central_cb);
+    status = HYD_DMX_Register_fd(1, &HYD_PMCD_Central_listenfd, HYD_STDOUT,
+                                 HYD_PMCD_Central_cb);
     if (status != HYD_SUCCESS) {
         HYDU_Error_printf("demux engine returned error for registering fd\n");
         goto fn_fail;
@@ -173,6 +180,16 @@
             partition->args[arg++] = MPIU_Strdup(str);
             HYDU_FREE(str);
 
+            status = HYDU_String_int_to_str(handle.proxy_port, &str);
+            if (status != HYD_SUCCESS) {
+                HYDU_Error_printf
+                    ("String utils returned error while converting int to string\n");
+                goto fn_fail;
+            }
+            partition->args[arg++] = MPIU_Strdup("--proxy-port");
+            partition->args[arg++] = MPIU_Strdup(str);
+            HYDU_FREE(str);
+
             i = 0;
             for (env = handle.system_env; env; env = env->next)
                 i++;
@@ -214,3 +231,27 @@
   fn_fail:
     goto fn_exit;
 }
+
+
+HYD_Status HYD_PMCI_Wait_for_completion(void)
+{
+    HYD_Status status = HYD_SUCCESS;
+
+    HYDU_FUNC_ENTER();
+
+    status = HYD_BSCI_Wait_for_completion();
+    if (status != HYD_SUCCESS) {
+        status = HYD_PMCD_Central_cleanup();
+        if (status != HYD_SUCCESS) {
+            HYDU_Error_printf("process manager device returned error for process cleanup\n");
+            goto fn_fail;
+        }
+    }
+
+  fn_exit:
+    HYDU_FUNC_EXIT();
+    return status;
+
+  fn_fail:
+    goto fn_exit;
+}

Modified: mpich2/trunk/src/pm/hydra/pm/central/proxy.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/central/proxy.c	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/pm/central/proxy.c	2009-03-12 05:28:34 UTC (rev 4019)
@@ -15,9 +15,8 @@
 int main(int argc, char **argv)
 {
     int i, j, arg, sockets_open;
-    uint16_t port;
-    int HYD_Proxy_listenfd, stdin_fd, timeout;
-    char *port_range, *str, *timeout_str;
+    int stdin_fd, timeout;
+    char *str, *timeout_str;
     HYD_Env_t *env, pmi;
     char *client_args[HYD_EXEC_ARGS];
     HYD_Status status = HYD_SUCCESS;
@@ -28,16 +27,19 @@
         goto fn_fail;
     }
 
-    /* Check if the user wants us to use a port within a certain
-     * range. */
-    port_range = getenv("MPIEXEC_PORTRANGE");
-    if (!port_range)
-        port_range = getenv("MPIEXEC_PORT_RANGE");
-    if (!port_range)
-        port_range = getenv("MPICH_PORT_RANGE");
+    /* We don't know if the bootstrap server will automatically
+     * forward the signals or not. We have our signal handlers for the
+     * case where it does. For when it doesn't, we also open a listen
+     * port where an explicit kill request can be sent */
+    status = HYDU_Set_common_signals(HYD_Proxy_signal_cb);
+    if (status != HYD_SUCCESS) {
+        HYDU_Error_printf("signal utils returned error when trying to set signal\n");
+        goto fn_fail;
+    }
 
     /* Listen on a port in the port range */
-    status = HYDU_Sock_listen(&HYD_Proxy_listenfd, port_range, &port);
+    status = HYDU_Sock_listen(&HYD_Proxy_listenfd, NULL,
+                              (uint16_t *) & HYD_Proxy_params.proxy_port);
     if (status != HYD_SUCCESS) {
         HYDU_Error_printf("sock utils returned listen error\n");
         goto fn_fail;

Modified: mpich2/trunk/src/pm/hydra/pm/central/proxy.h
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/central/proxy.h	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/pm/central/proxy.h	2009-03-12 05:28:34 UTC (rev 4019)
@@ -7,12 +7,14 @@
 #ifndef PROXY_H_INCLUDED
 #define PROXY_H_INCLUDED
 
-#include "hydra.h"
+#include "hydra_base.h"
+#include "hydra_utils.h"
 
 struct HYD_Proxy_params {
     HYD_Env_t *global_env;
     HYD_Env_t *env_list;
     int proc_count;
+    int proxy_port;
     int pmi_id;
     char *args[HYD_EXEC_ARGS];
     struct HYD_Partition_list *partition;
@@ -35,5 +37,6 @@
 HYD_Status HYD_Proxy_stdout_cb(int fd, HYD_Event_t events);
 HYD_Status HYD_Proxy_stderr_cb(int fd, HYD_Event_t events);
 HYD_Status HYD_Proxy_stdin_cb(int fd, HYD_Event_t events);
+void HYD_Proxy_signal_cb(int signal);
 
 #endif /* PROXY_H_INCLUDED */

Modified: mpich2/trunk/src/pm/hydra/pm/central/proxy_cb.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/central/proxy_cb.c	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/pm/central/proxy_cb.c	2009-03-12 05:28:34 UTC (rev 4019)
@@ -54,12 +54,20 @@
                 HYDU_Error_printf("demux engine returned error when deregistering fd\n");
                 goto fn_fail;
             }
+            close(fd);
             goto fn_exit;
         }
 
         if (cmd == KILLALL_PROCS) {     /* Got the killall command */
             for (i = 0; i < HYD_Proxy_params.proc_count; i++)
                 kill(HYD_Proxy_params.pid[i], SIGKILL);
+
+            status = HYD_DMX_Deregister_fd(fd);
+            if (status != HYD_SUCCESS) {
+                HYDU_Error_printf("demux engine returned error when deregistering fd\n");
+                goto fn_fail;
+            }
+            close(fd);
         }
         else {
             HYDU_Error_printf("got unrecognized command from mpiexec\n");
@@ -183,3 +191,31 @@
   fn_fail:
     goto fn_exit;
 }
+
+
+void HYD_Proxy_signal_cb(int signal)
+{
+    int i;
+
+    HYDU_FUNC_ENTER();
+
+    if (signal == SIGINT || signal == SIGQUIT || signal == SIGTERM
+#if defined SIGSTOP
+        || signal == SIGSTOP
+#endif /* SIGSTOP */
+#if defined SIGCONT
+        || signal == SIGCONT
+#endif /* SIGCONT */
+) {
+        /* SIGKILL every recorded PID; kill() results are not checked. */
+        for (i = 0; i < HYD_Proxy_params.proc_count; i++)
+            kill(HYD_Proxy_params.pid[i], SIGKILL);
+        exit(-1);
+    }
+    else {
+        /* Ignore other signals for now */
+    }
+
+    HYDU_FUNC_EXIT();
+    return;
+}

Modified: mpich2/trunk/src/pm/hydra/pm/central/proxy_utils.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/central/proxy_utils.c	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/pm/central/proxy_utils.c	2009-03-12 05:28:34 UTC (rev 4019)
@@ -39,6 +39,13 @@
             continue;
         }
 
+        /* Proxy port */
+        if (!strcmp(*argv, "--proxy-port")) {
+            argv++;
+            HYD_Proxy_params.proxy_port = atoi(*argv);
+            continue;
+        }
+
         /* PMI_ID: This is the PMI_ID for the first process;
          * everything else is incremented from here. */
         if (!strcmp(*argv, "--pmi-id")) {

Modified: mpich2/trunk/src/pm/hydra/pm/include/pmci.h
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/include/pmci.h	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/pm/include/pmci.h	2009-03-12 05:28:34 UTC (rev 4019)
@@ -8,6 +8,7 @@
 #define PMCI_H_INCLUDED
 
 HYD_Status HYD_PMCI_Launch_procs(void);
+HYD_Status HYD_PMCI_Wait_for_completion(void);
 HYD_Status HYD_PMCI_Finalize(void);
 
 #endif /* PMCI_H_INCLUDED */

Modified: mpich2/trunk/src/pm/hydra/utils/signals/signals.c
===================================================================
--- mpich2/trunk/src/pm/hydra/utils/signals/signals.c	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/utils/signals/signals.c	2009-03-12 05:28:34 UTC (rev 4019)
@@ -36,3 +36,52 @@
     HYDU_FUNC_EXIT();
     return status;
 }
+
+
+/*
+ * HYDU_Set_common_signals - install a single handler for the signals
+ * that the launcher and proxies treat as "terminate/clean up" events.
+ *
+ * handler: function invoked (with the signal number) on delivery of
+ *          SIGINT, SIGQUIT, SIGTERM and, where defined, SIGCONT.
+ *
+ * Returns HYD_SUCCESS when every registration succeeded; otherwise
+ * prints an error naming the signal and returns the status reported
+ * by HYDU_Set_signal for the first registration that failed.
+ *
+ * SIGSTOP is intentionally not registered: POSIX does not allow it
+ * (or SIGKILL) to be caught, so an attempt to install a handler is
+ * rejected by the system and would only make this function fail
+ * before reaching the remaining signals.
+ */
+HYD_Status HYDU_Set_common_signals(void (*handler) (int))
+{
+    HYD_Status status = HYD_SUCCESS;
+
+    HYDU_FUNC_ENTER();
+
+    status = HYDU_Set_signal(SIGINT, handler);
+    if (status != HYD_SUCCESS) {
+        HYDU_Error_printf("signal utils returned error when trying to set SIGINT signal\n");
+        goto fn_fail;
+    }
+
+    status = HYDU_Set_signal(SIGQUIT, handler);
+    if (status != HYD_SUCCESS) {
+        HYDU_Error_printf("signal utils returned error when trying to set SIGQUIT signal\n");
+        goto fn_fail;
+    }
+
+    status = HYDU_Set_signal(SIGTERM, handler);
+    if (status != HYD_SUCCESS) {
+        HYDU_Error_printf("signal utils returned error when trying to set SIGTERM signal\n");
+        goto fn_fail;
+    }
+
+    /* SIGCONT is not available on every platform, so guard it. */
+#if defined SIGCONT
+    status = HYDU_Set_signal(SIGCONT, handler);
+    if (status != HYD_SUCCESS) {
+        HYDU_Error_printf("signal utils returned error when trying to set SIGCONT signal\n");
+        goto fn_fail;
+    }
+#endif /* SIGCONT */
+
+  fn_exit:
+    HYDU_FUNC_EXIT();
+    return status;
+
+  fn_fail:
+    goto fn_exit;
+}

Modified: mpich2/trunk/src/pm/hydra/utils/sock/sock.c
===================================================================
--- mpich2/trunk/src/pm/hydra/utils/sock/sock.c	2009-03-11 20:51:47 UTC (rev 4018)
+++ mpich2/trunk/src/pm/hydra/utils/sock/sock.c	2009-03-12 05:28:34 UTC (rev 4019)
@@ -20,6 +20,9 @@
     low_port = 0;
     high_port = 0;
     if (port_range) {
+        /* If port range is set, we always pick from there */
+        *port = 0;
+
         port_str = strtok(port_range, ":");
         if (port_str == NULL) {
             HYDU_Error_printf("error parsing port range string\n");
@@ -42,6 +45,12 @@
             goto fn_fail;
         }
     }
+    else {
+        /* If port range is NULL and a port is already provided, we
+         * pick that. Otherwise, we search for an available port. */
+        low_port = *port;
+        high_port = *port;
+    }
 
     *listen_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
     if (*listen_fd < 0) {
@@ -50,8 +59,7 @@
         goto fn_fail;
     }
 
-    if (setsockopt(*listen_fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(int))
-        < 0) {
+    if (setsockopt(*listen_fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(int)) < 0) {
         HYDU_Error_printf("unable to set the TCP_NODELAY socket option (errno: %d)\n", errno);
         status = HYD_SOCK_ERROR;
         goto fn_fail;
@@ -63,6 +71,16 @@
         sa.sin_port = htons(i);
         sa.sin_addr.s_addr = INADDR_ANY;
 
+        /* The sockets standard does not guarantee that a successful
+         * return here means that this is set. However, REUSEADDR not
+         * being set is not a fatal error, so we ignore that
+         * case. However, we do check for error cases, which means
+         * that something bad has happened. */
+        if (setsockopt(*listen_fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0) {
+            HYDU_Error_printf("unable to set socket option to SO_REUSEADDR\n");
+            goto fn_fail;
+        }
+
         if (bind(*listen_fd, (struct sockaddr *) &sa, sizeof(sa)) < 0) {
             close(*listen_fd);
             /* If the address is in use, we should try the next
@@ -77,10 +95,16 @@
         else    /* We got a port */
             break;
     }
+
     *port = i;
+    if (*port > high_port) {
+        HYDU_Error_printf("unable to find an appropriate port to bind to\n");
+        status = HYD_SOCK_ERROR;
+        goto fn_fail;
+    }
 
     if (listen(*listen_fd, -1) < 0) {
-        HYDU_Error_printf("listen error (errno: %d)\n", errno);
+        HYDU_Error_printf("listen error on fd %d (errno: %d)\n", *listen_fd, errno);
         status = HYD_SOCK_ERROR;
         goto fn_fail;
     }
@@ -136,25 +160,13 @@
     memcpy(&sa.sin_addr, ht->h_addr_list[0], ht->h_length);
 
     /* Create a socket and set the required options */
-    *fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+    *fd = socket(AF_INET, SOCK_STREAM, 0);
     if (*fd < 0) {
         HYDU_Error_printf("unable to create a stream socket (errno: %d)\n", errno);
         status = HYD_SOCK_ERROR;
         goto fn_fail;
     }
 
-    if (setsockopt(*fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(int)) < 0) {
-        HYDU_Error_printf("unable to set the SO_REUSEADDR socket option (errno: %d)\n", errno);
-        status = HYD_SOCK_ERROR;
-        goto fn_fail;
-    }
-
-    status = HYDU_Sock_set_nonblock(*fd);
-    if (status != HYD_SUCCESS) {
-        HYDU_Error_printf("unable to set fd %d as non-blocking\n", *fd);
-        goto fn_fail;
-    }
-
     status = HYDU_Sock_set_cloexec(*fd);
     if (status != HYD_SUCCESS) {
         HYDU_Error_printf("unable to set fd %d to close on exec\n", *fd);
@@ -189,12 +201,6 @@
         goto fn_fail;
     }
 
-    status = HYDU_Sock_set_nonblock(*fd);
-    if (status != HYD_SUCCESS) {
-        HYDU_Error_printf("unable to set fd %d as non-blocking\n", *fd);
-        goto fn_fail;
-    }
-
     status = HYDU_Sock_set_cloexec(*fd);
     if (status != HYD_SUCCESS) {
         HYDU_Error_printf("unable to set fd %d to close on exec\n", *fd);



More information about the mpich2-commits mailing list