[mpich2-commits] r4049 - mpich2/trunk/src/pm/hydra/pm/central
balaji at mcs.anl.gov
balaji at mcs.anl.gov
Thu Mar 12 20:52:57 CDT 2009
Author: balaji
Date: 2009-03-12 20:52:57 -0500 (Thu, 12 Mar 2009)
New Revision: 4049
Modified:
mpich2/trunk/src/pm/hydra/pm/central/proxy.c
mpich2/trunk/src/pm/hydra/pm/central/proxy.h
mpich2/trunk/src/pm/hydra/pm/central/proxy_cb.c
Log:
This fixes a big mistake in the proxy implementation: it was completely
ignoring the return statuses of the processes and returning success to
the launcher every time. This commit should fix that, along with some
other minor bugs in the process cleanup.
Modified: mpich2/trunk/src/pm/hydra/pm/central/proxy.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/central/proxy.c 2009-03-12 23:01:22 UTC (rev 4048)
+++ mpich2/trunk/src/pm/hydra/pm/central/proxy.c 2009-03-13 01:52:57 UTC (rev 4049)
@@ -14,7 +14,7 @@
int main(int argc, char **argv)
{
- int i, j, arg, sockets_open;
+ int i, j, arg, count, pid, ret_status;
int stdin_fd, timeout;
char *str, *timeout_str;
char *client_args[HYD_EXEC_ARGS];
@@ -27,16 +27,6 @@
goto fn_fail;
}
- /* We don't know if the bootstrap server will automatically
- * forward the signals or not. We have our signal handlers for the
- * case where it does. For when it doesn't, we also open a listen
- * port where an explicit kill request can be sent */
- status = HYDU_Set_common_signals(HYD_Proxy_signal_cb);
- if (status != HYD_SUCCESS) {
- HYDU_Error_printf("signal utils returned error when trying to set signal\n");
- goto fn_fail;
- }
-
/* Listen on a port in the port range */
status = HYDU_Sock_listen(&HYD_Proxy_listenfd, NULL,
(uint16_t *) & HYD_Proxy_params.proxy_port);
@@ -66,7 +56,13 @@
HYD_Proxy_params.proc_count * sizeof(int), status);
HYDU_MALLOC(HYD_Proxy_params.pid, int *,
HYD_Proxy_params.proc_count * sizeof(int), status);
+ HYDU_MALLOC(HYD_Proxy_params.exit_status, int *,
+ HYD_Proxy_params.proc_count * sizeof(int), status);
+ /* Initialize the exit status */
+ for (i = 0; i < HYD_Proxy_params.proc_count; i++)
+ HYD_Proxy_params.exit_status[i] = -1;
+
/* Spawn the processes */
for (i = 0; i < HYD_Proxy_params.proc_count; i++) {
@@ -171,21 +167,64 @@
/* Check to see if there's any open read socket left; if there
* are, we will just wait for more events. */
- sockets_open = 0;
+ count = 0;
for (i = 0; i < HYD_Proxy_params.proc_count; i++) {
if (HYD_Proxy_params.out[i] != -1 || HYD_Proxy_params.err[i] != -1) {
- sockets_open++;
+ count++;
break;
}
}
/* We are done */
- if (!sockets_open)
+ if (!count)
break;
}
+ /* FIXME: If we did not break out yet, add a small usleep to yield
+ * CPU here. We can not just sleep for the remaining time, as the
+ * timeout value might be large and the application might exit
+ * much quicker. Note that the sched_yield() call is broken on
+ * newer linux kernel versions and should not be used. */
+ /* Once all the sockets are closed, wait for all the processes to
+ * finish. We poll here, but hopefully not for too long. */
+ do {
+ pid = waitpid(-1, &ret_status, WNOHANG);
+
+ /* Find the pid and mark it as complete. */
+ if (pid > 0)
+ for (i = 0; i < HYD_Proxy_params.proc_count; i++)
+ if (HYD_Proxy_params.pid[i] == pid)
+ HYD_Proxy_params.exit_status[i] = WEXITSTATUS(ret_status);
+
+ /* Check how many more processes are pending */
+ count = 0;
+ for (i = 0; i < HYD_Proxy_params.proc_count; i++) {
+ if (HYD_Proxy_params.exit_status[i] == -1) {
+ count++;
+ break;
+ }
+ }
+
+ if (count == 0)
+ break;
+
+ /* Check if there are any messages from the launcher */
+ status = HYD_DMX_Wait_for_event(0);
+ if (status != HYD_SUCCESS) {
+ HYDU_Error_printf("demux engine returned error when waiting for event\n");
+ goto fn_fail;
+ }
+ } while (1);
+
+ ret_status = 0;
+ for (i = 0; i < HYD_Proxy_params.proc_count; i++)
+ ret_status |= HYD_Proxy_params.exit_status[i];
+
fn_exit:
- return status;
+ if (status != HYD_SUCCESS)
+ return -1;
+ else
+ return ret_status;
fn_fail:
goto fn_exit;
Modified: mpich2/trunk/src/pm/hydra/pm/central/proxy.h
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/central/proxy.h 2009-03-12 23:01:22 UTC (rev 4048)
+++ mpich2/trunk/src/pm/hydra/pm/central/proxy.h 2009-03-13 01:52:57 UTC (rev 4049)
@@ -23,6 +23,7 @@
int *pid;
int *out;
int *err;
+ int *exit_status;
int in;
int stdin_buf_offset;
@@ -38,6 +39,5 @@
HYD_Status HYD_Proxy_stdout_cb(int fd, HYD_Event_t events);
HYD_Status HYD_Proxy_stderr_cb(int fd, HYD_Event_t events);
HYD_Status HYD_Proxy_stdin_cb(int fd, HYD_Event_t events);
-void HYD_Proxy_signal_cb(int signal);
#endif /* PROXY_H_INCLUDED */
Modified: mpich2/trunk/src/pm/hydra/pm/central/proxy_cb.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/central/proxy_cb.c 2009-03-12 23:01:22 UTC (rev 4048)
+++ mpich2/trunk/src/pm/hydra/pm/central/proxy_cb.c 2009-03-13 01:52:57 UTC (rev 4049)
@@ -60,7 +60,8 @@
if (cmd == KILLALL_PROCS) { /* Got the killall command */
for (i = 0; i < HYD_Proxy_params.proc_count; i++)
- kill(HYD_Proxy_params.pid[i], SIGKILL);
+ if (HYD_Proxy_params.pid[i] != -1)
+ kill(HYD_Proxy_params.pid[i], SIGKILL);
status = HYD_DMX_Deregister_fd(fd);
if (status != HYD_SUCCESS) {
@@ -191,31 +192,3 @@
fn_fail:
goto fn_exit;
}
-
-
-void HYD_Proxy_signal_cb(int signal)
-{
- int i;
-
- HYDU_FUNC_ENTER();
-
- if (signal == SIGINT || signal == SIGQUIT || signal == SIGTERM
-#if defined SIGSTOP
- || signal == SIGSTOP
-#endif /* SIGSTOP */
-#if defined SIGCONT
- || signal == SIGCONT
-#endif /* SIGSTOP */
-) {
- /* There's nothing we can do with the return value for now. */
- for (i = 0; i < HYD_Proxy_params.proc_count; i++)
- kill(HYD_Proxy_params.pid[i], SIGKILL);
- exit(-1);
- }
- else {
- /* Ignore other signals for now */
- }
-
- HYDU_FUNC_EXIT();
- return;
-}
More information about the mpich2-commits
mailing list