[mpich2-commits] r4162 - mpich2/trunk/src/pm/hydra/pm/pmiserv

balaji at mcs.anl.gov balaji at mcs.anl.gov
Sun Mar 22 18:09:38 CDT 2009


Author: balaji
Date: 2009-03-22 18:09:37 -0500 (Sun, 22 Mar 2009)
New Revision: 4162

Modified:
   mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy.c
   mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy.h
   mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy_cb.c
   mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy_utils.c
   mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_serv_launch.c
Log:
Fix for a major problem with the PMI_ID calculation that crept in with
r4156, when we added support for proxies to have non-contiguous
PMI_IDs.


Modified: mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy.c	2009-03-22 20:25:45 UTC (rev 4161)
+++ mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy.c	2009-03-22 23:09:37 UTC (rev 4162)
@@ -15,12 +15,13 @@
 int main(int argc, char **argv)
 {
     int i, j, arg, count, pid, ret_status;
-    int stdin_fd, timeout, process_id, core;
+    int stdin_fd, timeout, process_id, core, pmi_id, rem;
     char *str, *timeout_str;
     char *client_args[HYD_EXEC_ARGS];
     char *tmp[HYDU_NUM_JOIN_STR];
     HYD_Env_t *env;
     struct HYD_Partition_exec *exec;
+    struct HYD_Partition_segment *segment;
     HYD_Status status = HYD_SUCCESS;
 
     status = HYD_PMCD_pmi_proxy_get_params(argc, argv);
@@ -44,20 +45,24 @@
      * hierarchy of proxies. */
 
     HYD_PMCD_pmi_proxy_params.partition_proc_count = 0;
+    for (segment = HYD_PMCD_pmi_proxy_params.segment_list; segment; segment = segment->next)
+        HYD_PMCD_pmi_proxy_params.partition_proc_count += segment->proc_count;
+
+    HYD_PMCD_pmi_proxy_params.exec_proc_count = 0;
     for (exec = HYD_PMCD_pmi_proxy_params.exec_list; exec; exec = exec->next)
-        HYD_PMCD_pmi_proxy_params.partition_proc_count += exec->proc_count;
+        HYD_PMCD_pmi_proxy_params.exec_proc_count += exec->proc_count;
 
     HYDU_MALLOC(HYD_PMCD_pmi_proxy_params.out, int *,
-                HYD_PMCD_pmi_proxy_params.partition_proc_count * sizeof(int), status);
+                HYD_PMCD_pmi_proxy_params.exec_proc_count * sizeof(int), status);
     HYDU_MALLOC(HYD_PMCD_pmi_proxy_params.err, int *,
-                HYD_PMCD_pmi_proxy_params.partition_proc_count * sizeof(int), status);
+                HYD_PMCD_pmi_proxy_params.exec_proc_count * sizeof(int), status);
     HYDU_MALLOC(HYD_PMCD_pmi_proxy_params.pid, int *,
-                HYD_PMCD_pmi_proxy_params.partition_proc_count * sizeof(int), status);
+                HYD_PMCD_pmi_proxy_params.exec_proc_count * sizeof(int), status);
     HYDU_MALLOC(HYD_PMCD_pmi_proxy_params.exit_status, int *,
-                HYD_PMCD_pmi_proxy_params.partition_proc_count * sizeof(int), status);
+                HYD_PMCD_pmi_proxy_params.exec_proc_count * sizeof(int), status);
 
     /* Initialize the exit status */
-    for (i = 0; i < HYD_PMCD_pmi_proxy_params.partition_proc_count; i++)
+    for (i = 0; i < HYD_PMCD_pmi_proxy_params.exec_proc_count; i++)
         HYD_PMCD_pmi_proxy_params.exit_status[i] = -1;
 
     /* For local spawning, set the global environment here itself */
@@ -73,7 +78,21 @@
     for (exec = HYD_PMCD_pmi_proxy_params.exec_list; exec; exec = exec->next) {
         for (i = 0; i < exec->proc_count; i++) {
 
-            str = HYDU_int_to_str(HYD_PMCD_pmi_proxy_params.pmi_id + process_id);
+            pmi_id = ((process_id / HYD_PMCD_pmi_proxy_params.partition_proc_count) *
+                      HYD_PMCD_pmi_proxy_params.one_pass_count);
+            rem = (process_id % HYD_PMCD_pmi_proxy_params.partition_proc_count);
+
+            for (segment = HYD_PMCD_pmi_proxy_params.segment_list; segment;
+                 segment = segment->next) {
+                if (rem >= segment->proc_count)
+                    rem -= segment->proc_count;
+                else {
+                    pmi_id += segment->start_pid + rem;
+                    break;
+                }
+            }
+
+            str = HYDU_int_to_str(pmi_id);
             status = HYDU_env_create(&env, "PMI_ID", str);
             HYDU_ERR_POP(status, "unable to create env\n");
             HYDU_FREE(str);
@@ -89,13 +108,25 @@
             client_args[arg++] = NULL;
 
             core = HYDU_next_core(core, HYD_PMCD_pmi_proxy_params.binding);
-            if ((process_id + HYD_PMCD_pmi_proxy_params.pmi_id) == 0) {
+            if (pmi_id == 0) {
                 status = HYDU_create_process(client_args, exec->prop_env,
                                              &HYD_PMCD_pmi_proxy_params.in,
                                              &HYD_PMCD_pmi_proxy_params.out[process_id],
                                              &HYD_PMCD_pmi_proxy_params.err[process_id],
                                              &HYD_PMCD_pmi_proxy_params.pid[process_id],
                                              core);
+
+                status = HYDU_sock_set_nonblock(HYD_PMCD_pmi_proxy_params.in);
+                HYDU_ERR_POP(status, "unable to set socket as non-blocking\n");
+
+                stdin_fd = 0;
+                status = HYDU_sock_set_nonblock(stdin_fd);
+                HYDU_ERR_POP(status, "unable to set socket as non-blocking\n");
+
+                HYD_PMCD_pmi_proxy_params.stdin_buf_offset = 0;
+                HYD_PMCD_pmi_proxy_params.stdin_buf_count = 0;
+                status = HYD_DMX_register_fd(1, &stdin_fd, HYD_STDIN, HYD_PMCD_pmi_proxy_stdin_cb);
+                HYDU_ERR_POP(status, "unable to register fd\n");
             }
             else {
                 status = HYDU_create_process(client_args, exec->prop_env,
@@ -112,45 +143,25 @@
     }
 
     /* Everything is spawned, now wait for I/O */
-    status = HYD_DMX_register_fd(HYD_PMCD_pmi_proxy_params.partition_proc_count,
+    status = HYD_DMX_register_fd(HYD_PMCD_pmi_proxy_params.exec_proc_count,
                                  HYD_PMCD_pmi_proxy_params.out,
                                  HYD_STDOUT, HYD_PMCD_pmi_proxy_stdout_cb);
     HYDU_ERR_POP(status, "unable to register fd\n");
 
-    status = HYD_DMX_register_fd(HYD_PMCD_pmi_proxy_params.partition_proc_count,
+    status = HYD_DMX_register_fd(HYD_PMCD_pmi_proxy_params.exec_proc_count,
                                  HYD_PMCD_pmi_proxy_params.err,
                                  HYD_STDOUT, HYD_PMCD_pmi_proxy_stderr_cb);
     HYDU_ERR_POP(status, "unable to register fd\n");
 
-    if (HYD_PMCD_pmi_proxy_params.pmi_id == 0) {
-        status = HYDU_sock_set_nonblock(HYD_PMCD_pmi_proxy_params.in);
-        HYDU_ERR_POP(status, "unable to set socket as non-blocking\n");
-
-        stdin_fd = 0;
-        status = HYDU_sock_set_nonblock(stdin_fd);
-        HYDU_ERR_POP(status, "unable to set socket as non-blocking\n");
-
-        HYD_PMCD_pmi_proxy_params.stdin_buf_offset = 0;
-        HYD_PMCD_pmi_proxy_params.stdin_buf_count = 0;
-        status = HYD_DMX_register_fd(1, &stdin_fd, HYD_STDIN, HYD_PMCD_pmi_proxy_stdin_cb);
-        HYDU_ERR_POP(status, "unable to register fd\n");
-    }
-
-    timeout_str = getenv("MPIEXEC_TIMEOUT");
-    if (timeout_str)
-        timeout = atoi(timeout_str);
-    else
-        timeout = -1;
-
     while (1) {
         /* Wait for some event to occur */
-        status = HYD_DMX_wait_for_event(timeout);
+        status = HYD_DMX_wait_for_event(-1);
         HYDU_ERR_POP(status, "demux engine error waiting for event\n");
 
         /* Check to see if there's any open read socket left; if there
          * are, we will just wait for more events. */
         count = 0;
-        for (i = 0; i < HYD_PMCD_pmi_proxy_params.partition_proc_count; i++) {
+        for (i = 0; i < HYD_PMCD_pmi_proxy_params.exec_proc_count; i++) {
             if (HYD_PMCD_pmi_proxy_params.out[i] != -1 ||
                 HYD_PMCD_pmi_proxy_params.err[i] != -1) {
                 count++;
@@ -175,13 +186,13 @@
 
         /* Find the pid and mark it as complete. */
         if (pid > 0)
-            for (i = 0; i < HYD_PMCD_pmi_proxy_params.partition_proc_count; i++)
+            for (i = 0; i < HYD_PMCD_pmi_proxy_params.exec_proc_count; i++)
                 if (HYD_PMCD_pmi_proxy_params.pid[i] == pid)
                     HYD_PMCD_pmi_proxy_params.exit_status[i] = WEXITSTATUS(ret_status);
 
         /* Check how many more processes are pending */
         count = 0;
-        for (i = 0; i < HYD_PMCD_pmi_proxy_params.partition_proc_count; i++) {
+        for (i = 0; i < HYD_PMCD_pmi_proxy_params.exec_proc_count; i++) {
             if (HYD_PMCD_pmi_proxy_params.exit_status[i] == -1) {
                 count++;
                 break;
@@ -197,7 +208,7 @@
     } while (1);
 
     ret_status = 0;
-    for (i = 0; i < HYD_PMCD_pmi_proxy_params.partition_proc_count; i++)
+    for (i = 0; i < HYD_PMCD_pmi_proxy_params.exec_proc_count; i++)
         ret_status |= HYD_PMCD_pmi_proxy_params.exit_status[i];
 
   fn_exit:

Modified: mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy.h
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy.h	2009-03-22 20:25:45 UTC (rev 4161)
+++ mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy.h	2009-03-22 23:09:37 UTC (rev 4162)
@@ -12,16 +12,16 @@
 
 struct HYD_PMCD_pmi_proxy_params {
     int proxy_port;
-    int pmi_id;
     char *wdir;
     HYD_Binding binding;
     HYD_Env_t *global_env;
 
     int one_pass_count;
     int partition_proc_count;
+    int exec_proc_count;
 
     /* Process segmentation information for this partition */
-    struct HYD_Partition_segment segment;
+    struct HYD_Partition_segment *segment_list;
     struct HYD_Partition_exec *exec_list;
 
     int *pid;

Modified: mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy_cb.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy_cb.c	2009-03-22 20:25:45 UTC (rev 4161)
+++ mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy_cb.c	2009-03-22 23:09:37 UTC (rev 4162)
@@ -46,7 +46,7 @@
         }
 
         if (cmd == KILLALL_PROCS) {     /* Got the killall command */
-            for (i = 0; i < HYD_PMCD_pmi_proxy_params.partition_proc_count; i++)
+            for (i = 0; i < HYD_PMCD_pmi_proxy_params.exec_proc_count; i++)
                 if (HYD_PMCD_pmi_proxy_params.pid[i] != -1)
                     kill(HYD_PMCD_pmi_proxy_params.pid[i], SIGKILL);
 
@@ -84,7 +84,7 @@
         status = HYD_DMX_deregister_fd(fd);
         HYDU_ERR_POP(status, "unable to deregister fd\n");
 
-        for (i = 0; i < HYD_PMCD_pmi_proxy_params.partition_proc_count; i++)
+        for (i = 0; i < HYD_PMCD_pmi_proxy_params.exec_proc_count; i++)
             if (HYD_PMCD_pmi_proxy_params.out[i] == fd)
                 HYD_PMCD_pmi_proxy_params.out[i] = -1;
     }
@@ -113,7 +113,7 @@
         status = HYD_DMX_deregister_fd(fd);
         HYDU_ERR_POP(status, "unable to deregister fd\n");
 
-        for (i = 0; i < HYD_PMCD_pmi_proxy_params.partition_proc_count; i++)
+        for (i = 0; i < HYD_PMCD_pmi_proxy_params.exec_proc_count; i++)
             if (HYD_PMCD_pmi_proxy_params.err[i] == fd)
                 HYD_PMCD_pmi_proxy_params.err[i] = -1;
     }

Modified: mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy_utils.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy_utils.c	2009-03-22 20:25:45 UTC (rev 4161)
+++ mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy_utils.c	2009-03-22 23:09:37 UTC (rev 4162)
@@ -14,15 +14,19 @@
     int arg, i, count;
     HYD_Env_t *env;
     struct HYD_Partition_exec *exec = NULL;
+    struct HYD_Partition_segment *segment = NULL;
     HYD_Status status = HYD_SUCCESS;
 
     HYDU_FUNC_ENTER();
 
     HYD_PMCD_pmi_proxy_params.exec_list = NULL;
+    HYD_PMCD_pmi_proxy_params.segment_list = NULL;
     HYD_PMCD_pmi_proxy_params.global_env = NULL;
 
     while (*argv) {
         ++argv;
+        if (*argv == NULL)
+            break;
 
         /* Proxy port */
         if (!strcmp(*argv, "--proxy-port")) {
@@ -74,14 +78,39 @@
             continue;
         }
 
-        /* PMI_ID: This is the PMI_ID for the first process;
-         * everything else is incremented from here. */
-        if (!strcmp(*argv, "--pmi-id")) {
+        /* New segment */
+        if (!strcmp(*argv, "--segment")) {
+            if (HYD_PMCD_pmi_proxy_params.segment_list == NULL) {
+                status = HYDU_alloc_partition_segment(&HYD_PMCD_pmi_proxy_params.segment_list);
+                HYDU_ERR_POP(status, "unable to allocate partition segment\n");
+            }
+            else {
+                for (segment = HYD_PMCD_pmi_proxy_params.segment_list; segment->next;
+                     segment = segment->next);
+                status = HYDU_alloc_partition_segment(&segment->next);
+                HYDU_ERR_POP(status, "unable to allocate partition segment\n");
+            }
+            continue;
+        }
+
+        /* Process count */
+        if (!strcmp(*argv, "--segment-proc-count")) {
             argv++;
-            HYD_PMCD_pmi_proxy_params.pmi_id = atoi(*argv);
+            for (segment = HYD_PMCD_pmi_proxy_params.segment_list; segment->next;
+                 segment = segment->next);
+            segment->proc_count = atoi(*argv);
             continue;
         }
 
+        /* Starting PMI ID of the segment */
+        if (!strcmp(*argv, "--segment-start-pid")) {
+            argv++;
+            for (segment = HYD_PMCD_pmi_proxy_params.segment_list; segment->next;
+                 segment = segment->next);
+            segment->start_pid = atoi(*argv);
+            continue;
+        }
+
         /* New executable */
         if (!strcmp(*argv, "--exec")) {
             if (HYD_PMCD_pmi_proxy_params.exec_list == NULL) {
@@ -98,7 +127,7 @@
         }
 
         /* Process count */
-        if (!strcmp(*argv, "--proc-count")) {
+        if (!strcmp(*argv, "--exec-proc-count")) {
             argv++;
             for (exec = HYD_PMCD_pmi_proxy_params.exec_list; exec->next; exec = exec->next);
             exec->proc_count = atoi(*argv);
@@ -106,7 +135,7 @@
         }
 
         /* Local env */
-        if (!strcmp(*argv, "--local-env")) {
+        if (!strcmp(*argv, "--exec-local-env")) {
             argv++;
             count = atoi(*argv);
             for (i = 0; i < count; i++) {

Modified: mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_serv_launch.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_serv_launch.c	2009-03-22 20:25:45 UTC (rev 4161)
+++ mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_serv_launch.c	2009-03-22 23:09:37 UTC (rev 4162)
@@ -44,6 +44,7 @@
     char *path_str[HYDU_NUM_JOIN_STR];
     struct HYD_Partition *partition;
     struct HYD_Partition_exec *exec;
+    struct HYD_Partition_segment *segment;
     HYD_Status status = HYD_SUCCESS;
 
     HYDU_FUNC_ENTER();
@@ -135,22 +136,30 @@
         HYDU_list_append_env_to_str(handle.system_env, partition->proxy_args);
         HYDU_list_append_env_to_str(handle.prop_env, partition->proxy_args);
 
-        arg = HYDU_strlist_lastidx(partition->proxy_args);
-        partition->proxy_args[arg++] = MPIU_Strdup("--pmi-id");
-        partition->proxy_args[arg++] = HYDU_int_to_str(process_id);;
-        partition->proxy_args[arg++] = NULL;
+        /* Pass the segment information */
+        for (segment = partition->segment_list; segment; segment = segment->next) {
+            arg = HYDU_strlist_lastidx(partition->proxy_args);
+            partition->proxy_args[arg++] = MPIU_Strdup("--segment");
 
+            partition->proxy_args[arg++] = MPIU_Strdup("--segment-start-pid");
+            partition->proxy_args[arg++] = HYDU_int_to_str(segment->start_pid);
+
+            partition->proxy_args[arg++] = MPIU_Strdup("--segment-proc-count");
+            partition->proxy_args[arg++] = HYDU_int_to_str(segment->proc_count);
+            partition->proxy_args[arg++] = NULL;
+        }
+
         /* Now pass the local executable information */
         for (exec = partition->exec_list; exec; exec = exec->next) {
             arg = HYDU_strlist_lastidx(partition->proxy_args);
             partition->proxy_args[arg++] = MPIU_Strdup("--exec");
 
-            partition->proxy_args[arg++] = MPIU_Strdup("--proc-count");
+            partition->proxy_args[arg++] = MPIU_Strdup("--exec-proc-count");
             partition->proxy_args[arg++] = HYDU_int_to_str(exec->proc_count);
             partition->proxy_args[arg++] = NULL;
 
             arg = HYDU_strlist_lastidx(partition->proxy_args);
-            partition->proxy_args[arg++] = MPIU_Strdup("--local-env");
+            partition->proxy_args[arg++] = MPIU_Strdup("--exec-local-env");
             for (i = 0, env = exec->prop_env; env; env = env->next, i++);
             HYDU_ERR_POP(status, "unable to convert int to string\n");
             partition->proxy_args[arg++] = HYDU_int_to_str(i);



More information about the mpich2-commits mailing list