[mpich2-commits] r3970 - in mpich2/trunk/src/pm/hydra: bootstrap/ssh bootstrap/utils control/consys include launcher/mpiexec launcher/utils pm/utils

balaji at mcs.anl.gov balaji at mcs.anl.gov
Sat Mar 7 21:44:48 CST 2009


Author: balaji
Date: 2009-03-07 21:44:47 -0600 (Sat, 07 Mar 2009)
New Revision: 3970

Modified:
   mpich2/trunk/src/pm/hydra/bootstrap/ssh/ssh_launch.c
   mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu_init.c
   mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu_wait.c
   mpich2/trunk/src/pm/hydra/control/consys/consys_close.c
   mpich2/trunk/src/pm/hydra/control/consys/consys_launch.c
   mpich2/trunk/src/pm/hydra/control/consys/consys_wait.c
   mpich2/trunk/src/pm/hydra/include/hydra.h
   mpich2/trunk/src/pm/hydra/launcher/mpiexec/mpiexec.c
   mpich2/trunk/src/pm/hydra/launcher/mpiexec/utils.c
   mpich2/trunk/src/pm/hydra/launcher/utils/lchu.c
   mpich2/trunk/src/pm/hydra/pm/utils/pmi.c
Log:
1. Use partitions instead of hostnames. This will allow non-ssh bootstrap
servers to launch processes more naturally.

2. Get rid of wrap-around reading of hostfiles. It makes the
code unnecessarily complicated without adding much utility. If it is
needed, it can be added back later.


Modified: mpich2/trunk/src/pm/hydra/bootstrap/ssh/ssh_launch.c
===================================================================
--- mpich2/trunk/src/pm/hydra/bootstrap/ssh/ssh_launch.c	2009-03-07 23:43:00 UTC (rev 3969)
+++ mpich2/trunk/src/pm/hydra/bootstrap/ssh/ssh_launch.c	2009-03-08 03:44:47 UTC (rev 3970)
@@ -22,8 +22,9 @@
 HYD_Status HYD_BSCI_Launch_procs(void)
 {
     struct HYD_Proc_params *proc_params;
-    char *client_arg[HYD_EXEC_ARGS], *hostname = NULL, **proc_list = NULL;
-    int i, arg, process_id, host_id, host_id_max;
+    struct HYD_Partition_list *partition;
+    char *client_arg[HYD_EXEC_ARGS];
+    int i, arg, process_id;
     HYD_Status status = HYD_SUCCESS;
 
     HYDU_FUNC_ENTER();
@@ -46,70 +47,59 @@
         goto fn_fail;
     }
 
-    proc_params = handle.proc_params;
     process_id = 0;
-    while (proc_params) {
-        if (proc_params->host_file != NULL) {   /* We got a new host file */
-            host_id = 0;
-            host_id_max = proc_params->total_num_procs;
-            proc_list = proc_params->total_proc_list;
-        }
+    for (proc_params = handle.proc_params; proc_params; proc_params = proc_params->next) {
+        for (partition = proc_params->partition; partition; partition = partition->next) {
+            for (i = 0; i < partition->proc_count; i++) {
+                /* Setup the executable arguments */
+                arg = 0;
+                client_arg[arg++] = MPIU_Strdup("/usr/bin/ssh");
 
-        for (i = 0; i < proc_params->user_num_procs; i++) {
-            /* Setup the executable arguments */
-            arg = 0;
-            client_arg[arg++] = MPIU_Strdup("/usr/bin/ssh");
+                /* Allow X forwarding only if explicitly requested */
+                if (handle.enablex == 1)
+                    client_arg[arg++] = MPIU_Strdup("-X");
+                else if (handle.enablex == 0)
+                    client_arg[arg++] = MPIU_Strdup("-x");
+                else        /* default mode is disable X */
+                    client_arg[arg++] = MPIU_Strdup("-x");
 
-            /* Allow X forwarding only if explicitly requested */
-            if (handle.enablex == 1)
-                client_arg[arg++] = MPIU_Strdup("-X");
-            else if (handle.enablex == 0)
-                client_arg[arg++] = MPIU_Strdup("-x");
-            else        /* default mode is disable X */
-                client_arg[arg++] = MPIU_Strdup("-x");
+                /* ssh does not support any partition names other than host names */
+                client_arg[arg++] = MPIU_Strdup(partition->name);
 
-            if (host_id == host_id_max)
-                host_id = 0;
-            hostname = proc_list[host_id];
-            host_id++;
+                client_arg[arg++] = MPIU_Strdup("sh");
+                client_arg[arg++] = MPIU_Strdup("-c");
+                client_arg[arg++] = MPIU_Strdup("\"");
+                client_arg[arg++] = NULL;
 
-            client_arg[arg++] = MPIU_Strdup(hostname);
+                HYDU_Append_env(handle.system_env, client_arg, process_id);
+                HYDU_Append_env(proc_params->prop_env, client_arg, process_id);
+                HYDU_Append_wdir(client_arg);
+                HYDU_Append_exec(proc_params->exec, client_arg);
 
-            client_arg[arg++] = MPIU_Strdup("sh");
-            client_arg[arg++] = MPIU_Strdup("-c");
-            client_arg[arg++] = MPIU_Strdup("\"");
-            client_arg[arg++] = NULL;
+                for (arg = 0; client_arg[arg]; arg++);
+                client_arg[arg++] = MPIU_Strdup("\"");
+                client_arg[arg++] = NULL;
 
-            HYDU_Append_env(handle.system_env, client_arg, process_id);
-            HYDU_Append_env(proc_params->prop_env, client_arg, process_id);
-            HYDU_Append_wdir(client_arg);
-            HYDU_Append_exec(proc_params->exec, client_arg);
+                /* The stdin pointer will be some value for process_id
+                 * 0; for everyone else, it's NULL. */
+                status = HYDU_Create_process(client_arg, (process_id == 0 ? &handle.in : NULL),
+                                             &proc_params->out[i], &proc_params->err[i],
+                                             &proc_params->pid[i]);
+                if (status != HYD_SUCCESS) {
+                    HYDU_Error_printf("bootstrap spawn process returned error\n");
+                    goto fn_fail;
+                }
 
-            for (arg = 0; client_arg[arg]; arg++);
-            client_arg[arg++] = MPIU_Strdup("\"");
-            client_arg[arg++] = NULL;
+                for (arg = 0; client_arg[arg]; arg++)
+                    HYDU_FREE(client_arg[arg]);
 
-            /* The stdin pointer will be some value for process_id 0;
-             * for everyone else, it's NULL. */
-            status = HYDU_Create_process(client_arg, (process_id == 0 ? &handle.in : NULL),
-                                         &proc_params->out[i], &proc_params->err[i],
-                                         &proc_params->pid[i]);
-            if (status != HYD_SUCCESS) {
-                HYDU_Error_printf("bootstrap spawn process returned error\n");
-                goto fn_fail;
-            }
+                /* For the remaining processes, set the stdin fd to -1 */
+                if (process_id != 0)
+                    handle.in = -1;
 
-            for (arg = 0; client_arg[arg]; arg++)
-                HYDU_FREE(client_arg[arg]);
-
-            /* For the remaining processes, set the stdin fd to -1 */
-            if (process_id != 0)
-                handle.in = -1;
-
-            process_id++;
+                process_id++;
+            }
         }
-
-        proc_params = proc_params->next;
     }
 
   fn_exit:
@@ -124,36 +114,23 @@
 HYD_Status HYD_BSCI_Cleanup_procs(void)
 {
     struct HYD_Proc_params *proc_params;
+    struct HYD_Partition_list *partition;
     char *client_arg[HYD_EXEC_ARGS], *hostname, **proc_list, *execname;
     int i, arg, host_id, host_id_max;
     HYD_Status status = HYD_SUCCESS;
 
     HYDU_FUNC_ENTER();
 
-    proc_params = handle.proc_params;
-    while (proc_params) {
-        for (i = 0; i < proc_params->user_num_procs; i++) {
+    for (proc_params = handle.proc_params; proc_params; proc_params = proc_params->next) {
+        for (partition = proc_params->partition; partition; partition = partition->next) {
             /* Setup the executable arguments */
             arg = 0;
             client_arg[arg++] = MPIU_Strdup("/usr/bin/ssh");
             client_arg[arg++] = MPIU_Strdup("-x");
 
-            if (proc_params->host_file != NULL) {       /* We got a new host file */
-                host_id = 0;
-                host_id_max = proc_params->total_num_procs;
-                proc_list = proc_params->total_proc_list;
-            }
-            else if (host_id == host_id_max) {
-                host_id = 0;
-            }
-            hostname = proc_list[host_id];
-            host_id++;
+            /* ssh does not support any partition names other than host names */
+            client_arg[arg++] = MPIU_Strdup(partition->name);
 
-            client_arg[arg++] = MPIU_Strdup(hostname);
-            client_arg[arg++] = NULL;
-
-            HYDU_Append_wdir(client_arg);
-
             for (arg = 0; client_arg[arg]; arg++);
             client_arg[arg++] = MPIU_Strdup("killall");
 
@@ -175,8 +152,6 @@
             for (arg = 0; client_arg[arg]; arg++)
                 HYDU_FREE(client_arg[arg]);
         }
-
-        proc_params = proc_params->next;
     }
 
   fn_exit:

Modified: mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu_init.c
===================================================================
--- mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu_init.c	2009-03-07 23:43:00 UTC (rev 3969)
+++ mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu_init.c	2009-03-08 03:44:47 UTC (rev 3970)
@@ -24,11 +24,12 @@
      * the same loop. */
     proc_params = handle.proc_params;
     while (proc_params) {
-        HYDU_MALLOC(proc_params->pid, int *, proc_params->user_num_procs * sizeof(int), status);
-        HYDU_MALLOC(proc_params->exit_status, int *, proc_params->user_num_procs * sizeof(int), status);
-        HYDU_MALLOC(proc_params->exit_status_valid, int *, proc_params->user_num_procs * sizeof(int),
+        HYDU_MALLOC(proc_params->pid, int *, proc_params->exec_proc_count * sizeof(int), status);
+        HYDU_MALLOC(proc_params->exit_status, int *, proc_params->exec_proc_count * sizeof(int),
                     status);
-        for (i = 0; i < proc_params->user_num_procs; i++)
+        HYDU_MALLOC(proc_params->exit_status_valid, int *, proc_params->exec_proc_count * sizeof(int),
+                    status);
+        for (i = 0; i < proc_params->exec_proc_count; i++)
             proc_params->exit_status_valid[i] = 0;
         proc_params = proc_params->next;
     }
@@ -51,8 +52,8 @@
 
     proc_params = handle.proc_params;
     while (proc_params) {
-        HYDU_MALLOC(proc_params->out, int *, proc_params->user_num_procs * sizeof(int), status);
-        HYDU_MALLOC(proc_params->err, int *, proc_params->user_num_procs * sizeof(int), status);
+        HYDU_MALLOC(proc_params->out, int *, proc_params->exec_proc_count * sizeof(int), status);
+        HYDU_MALLOC(proc_params->err, int *, proc_params->exec_proc_count * sizeof(int), status);
         proc_params = proc_params->next;
     }
 

Modified: mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu_wait.c
===================================================================
--- mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu_wait.c	2009-03-07 23:43:00 UTC (rev 3969)
+++ mpich2/trunk/src/pm/hydra/bootstrap/utils/bscu_wait.c	2009-03-08 03:44:47 UTC (rev 3970)
@@ -28,7 +28,7 @@
     not_completed = 0;
     proc_params = handle.proc_params;
     while (proc_params) {
-        for (i = 0; i < proc_params->user_num_procs; i++)
+        for (i = 0; i < proc_params->exec_proc_count; i++)
             if (proc_params->exit_status_valid[i] == 0)
                 not_completed++;
         proc_params = proc_params->next;
@@ -45,7 +45,7 @@
             /* Find the pid and mark it as complete. */
             proc_params = handle.proc_params;
             while (proc_params) {
-                for (i = 0; i < proc_params->user_num_procs; i++) {
+                for (i = 0; i < proc_params->exec_proc_count; i++) {
                     if (proc_params->pid[i] == pid) {
                         proc_params->exit_status[i] = WEXITSTATUS(ret_status);
                         proc_params->exit_status_valid[i] = 1;

Modified: mpich2/trunk/src/pm/hydra/control/consys/consys_close.c
===================================================================
--- mpich2/trunk/src/pm/hydra/control/consys/consys_close.c	2009-03-07 23:43:00 UTC (rev 3969)
+++ mpich2/trunk/src/pm/hydra/control/consys/consys_close.c	2009-03-08 03:44:47 UTC (rev 3970)
@@ -33,7 +33,7 @@
     /* Find the FD in the handle and remove it. */
     proc_params = handle.proc_params;
     while (proc_params) {
-        for (i = 0; i < proc_params->user_num_procs; i++) {
+        for (i = 0; i < proc_params->exec_proc_count; i++) {
             if (proc_params->out[i] == fd) {
                 proc_params->out[i] = -1;
                 goto fn_exit;

Modified: mpich2/trunk/src/pm/hydra/control/consys/consys_launch.c
===================================================================
--- mpich2/trunk/src/pm/hydra/control/consys/consys_launch.c	2009-03-07 23:43:00 UTC (rev 3969)
+++ mpich2/trunk/src/pm/hydra/control/consys/consys_launch.c	2009-03-08 03:44:47 UTC (rev 3970)
@@ -28,14 +28,14 @@
 
     proc_params = handle.proc_params;
     while (proc_params) {
-        status = HYD_DMX_Register_fd(proc_params->user_num_procs, proc_params->out,
+        status = HYD_DMX_Register_fd(proc_params->exec_proc_count, proc_params->out,
                                      HYD_STDOUT, proc_params->stdout_cb);
         if (status != HYD_SUCCESS) {
             HYDU_Error_printf("demux engine returned error when registering fd\n");
             goto fn_fail;
         }
 
-        status = HYD_DMX_Register_fd(proc_params->user_num_procs, proc_params->err,
+        status = HYD_DMX_Register_fd(proc_params->exec_proc_count, proc_params->err,
                                      HYD_STDOUT, proc_params->stderr_cb);
         if (status != HYD_SUCCESS) {
             HYDU_Error_printf("demux engine returned error when registering fd\n");

Modified: mpich2/trunk/src/pm/hydra/control/consys/consys_wait.c
===================================================================
--- mpich2/trunk/src/pm/hydra/control/consys/consys_wait.c	2009-03-07 23:43:00 UTC (rev 3969)
+++ mpich2/trunk/src/pm/hydra/control/consys/consys_wait.c	2009-03-08 03:44:47 UTC (rev 3970)
@@ -33,7 +33,7 @@
         proc_params = handle.proc_params;
         sockets_open = 0;
         while (proc_params) {
-            for (i = 0; i < proc_params->user_num_procs; i++) {
+            for (i = 0; i < proc_params->exec_proc_count; i++) {
                 if (proc_params->out[i] != -1 || proc_params->err[i] != -1) {
                     sockets_open++;
                     break;

Modified: mpich2/trunk/src/pm/hydra/include/hydra.h
===================================================================
--- mpich2/trunk/src/pm/hydra/include/hydra.h	2009-03-07 23:43:00 UTC (rev 3969)
+++ mpich2/trunk/src/pm/hydra/include/hydra.h	2009-03-08 03:44:47 UTC (rev 3970)
@@ -94,6 +94,8 @@
     int enablex;
     char *wdir;
 
+    char *host_file;
+
     HYD_Env_t *global_env;
     HYD_Env_t *system_env;
     HYD_Env_t *user_env;
@@ -112,13 +114,14 @@
     /* Each structure will contain all hosts/cores that use the same
      * executable and environment. */
     struct HYD_Proc_params {
-        int user_num_procs;
-        int total_num_procs;
-        char **total_proc_list;
-        int *total_core_list;
+        int  exec_proc_count;
+        struct HYD_Partition_list {
+            char  * name;
+            int     proc_count;
+            char ** mapping; /* Can be core IDs or something else */
+            struct HYD_Partition_list *next;
+        } *partition;
 
-        char *host_file;
-
         char *exec[HYD_EXEC_ARGS];
         HYD_Env_t *user_env;
         HYD_Env_prop_t prop;

Modified: mpich2/trunk/src/pm/hydra/launcher/mpiexec/mpiexec.c
===================================================================
--- mpich2/trunk/src/pm/hydra/launcher/mpiexec/mpiexec.c	2009-03-07 23:43:00 UTC (rev 3969)
+++ mpich2/trunk/src/pm/hydra/launcher/mpiexec/mpiexec.c	2009-03-08 03:44:47 UTC (rev 3970)
@@ -112,7 +112,7 @@
     proc_params = handle.proc_params;
     exit_status = 0;
     while (proc_params) {
-        for (i = 0; i < proc_params->user_num_procs; i++)
+        for (i = 0; i < proc_params->exec_proc_count; i++)
             exit_status |= proc_params->exit_status[i];
         proc_params = proc_params->next;
     }

Modified: mpich2/trunk/src/pm/hydra/launcher/mpiexec/utils.c
===================================================================
--- mpich2/trunk/src/pm/hydra/launcher/mpiexec/utils.c	2009-03-07 23:43:00 UTC (rev 3969)
+++ mpich2/trunk/src/pm/hydra/launcher/mpiexec/utils.c	2009-03-08 03:44:47 UTC (rev 3970)
@@ -42,13 +42,9 @@
 
     HYDU_MALLOC(proc_params, struct HYD_Proc_params *, sizeof(struct HYD_Proc_params), status);
 
-    proc_params->user_num_procs = 0;
-    proc_params->total_num_procs = 0;
-    proc_params->total_proc_list = NULL;
-    proc_params->total_core_list = NULL;
+    proc_params->exec_proc_count = 0;
+    proc_params->partition = NULL;
 
-    proc_params->host_file = NULL;
-
     proc_params->exec[0] = NULL;
     proc_params->user_env = NULL;
     proc_params->prop = HYD_ENV_PROP_UNSET;
@@ -103,7 +99,7 @@
 
 HYD_Status HYD_LCHI_Get_parameters(int t_argc, char **t_argv)
 {
-    int argc = t_argc, i, got_hostfile;
+    int argc = t_argc, i;
     char **argv = t_argv;
     int local_params_started;
     char *arg;
@@ -117,6 +113,7 @@
     handle.debug = -1;
     handle.enablex = -1;
     handle.wdir = NULL;
+    handle.host_file = NULL;
 
     status = HYDU_Env_global_list(&handle.global_env);
     if (status != HYD_SUCCESS) {
@@ -304,6 +301,7 @@
             CHECK_LOCAL_PARAM_START(local_params_started, status);
             CHECK_NEXT_ARG_VALID(status);
             handle.wdir = MPIU_Strdup(*argv);
+            continue;
         }
 
         if (!strcmp(*argv, "-n") || !strcmp(*argv, "-np")) {
@@ -317,37 +315,22 @@
             }
 
             /* Num_procs already set */
-            if (proc_params->user_num_procs != 0) {
+            if (proc_params->exec_proc_count != 0) {
                 HYDU_Error_printf("Duplicate setting for number of processes; previously set to %d\n",
-                                  proc_params->user_num_procs);
+                                  proc_params->exec_proc_count);
                 status = HYD_INTERNAL_ERROR;
                 goto fn_fail;
             }
 
-            proc_params->user_num_procs = atoi(*argv);
+            proc_params->exec_proc_count = atoi(*argv);
 
             continue;
         }
 
         if (!strcmp(*argv, "-f")) {
-            local_params_started = 1;
+            CHECK_LOCAL_PARAM_START(local_params_started, status);
             CHECK_NEXT_ARG_VALID(status);
-
-            status = get_current_proc_params(&proc_params);
-            if (status != HYD_SUCCESS) {
-                HYDU_Error_printf("get_current_proc_params returned error\n");
-                goto fn_fail;
-            }
-
-            /* host_file already set */
-            if (proc_params->host_file != NULL) {
-                HYDU_Error_printf("Duplicate setting for host file; previously set to %s\n",
-                                  proc_params->host_file);
-                status = HYD_INTERNAL_ERROR;
-                goto fn_fail;
-            }
-
-            proc_params->host_file = MPIU_Strdup(*argv);
+            handle.host_file = MPIU_Strdup(*argv);
             continue;
         }
 
@@ -405,8 +388,23 @@
         }
     }
 
+    /*
+     * We use the following priority order to specify the host file:
+     *    1. Specified to mpiexec using -f
+     *    2. Specified through the environment HYDRA_HOST_FILE
+     *    3. Specified through the environment HYDRA_USE_LOCALHOST
+     */
+    if (handle.host_file == NULL && getenv("HYDRA_HOST_FILE"))
+        handle.host_file = MPIU_Strdup(getenv("HYDRA_HOST_FILE"));
+    if (handle.host_file == NULL && getenv("HYDRA_USE_LOCALHOST"))
+        handle.host_file = MPIU_Strdup("HYDRA_USE_LOCALHOST");
+    if (handle.host_file == NULL) {
+        HYDU_Error_printf("Host file not specified\n");
+        status = HYD_INTERNAL_ERROR;
+        goto fn_fail;
+    }
+
     proc_params = handle.proc_params;
-    got_hostfile = 0;
     while (proc_params) {
         if (proc_params->exec[0] == NULL) {
             HYDU_Error_printf("no executable specified\n");
@@ -414,31 +412,12 @@
             goto fn_fail;
         }
 
-        if (proc_params->user_num_procs == 0)
-            proc_params->user_num_procs = 1;
+        if (proc_params->exec_proc_count == 0)
+            proc_params->exec_proc_count = 1;
 
-        /*
-         * We use the following priority order to specify the host file:
-         *    1. Specified to mpiexec using -f
-         *    2. Specified through the environment HYDRA_HOST_FILE
-         *    3. Specified through the environment HYDRA_USE_LOCALHOST
-         */
-        if (proc_params->host_file == NULL && got_hostfile == 0 && getenv("HYDRA_HOST_FILE"))
-            proc_params->host_file = MPIU_Strdup(getenv("HYDRA_HOST_FILE"));
-        if (proc_params->host_file == NULL && got_hostfile == 0 && getenv("HYDRA_USE_LOCALHOST"))
-            proc_params->host_file = MPIU_Strdup("HYDRA_USE_LOCALHOST");
-        if (proc_params->host_file != NULL)
-            got_hostfile = 1;
-
         proc_params = proc_params->next;
     }
 
-    if (got_hostfile == 0) {
-        HYDU_Error_printf("Host file not specified\n");
-        status = HYD_INTERNAL_ERROR;
-        goto fn_fail;
-    }
-
   fn_exit:
     HYDU_FUNC_EXIT();
     return status;

Modified: mpich2/trunk/src/pm/hydra/launcher/utils/lchu.c
===================================================================
--- mpich2/trunk/src/pm/hydra/launcher/utils/lchu.c	2009-03-07 23:43:00 UTC (rev 3969)
+++ mpich2/trunk/src/pm/hydra/launcher/utils/lchu.c	2009-03-08 03:44:47 UTC (rev 3970)
@@ -11,110 +11,89 @@
 
 HYD_Status HYD_LCHU_Create_host_list(void)
 {
-    FILE *fp;
-    char line[2 * MAX_HOSTNAME_LEN], *hostfile, *hostname, *procs;
+    FILE *fp = NULL;
+    char line[2 * MAX_HOSTNAME_LEN], *hostname, *procs;
     struct HYD_Proc_params *proc_params;
-    int i, j, num_procs;
+    struct HYD_Partition_list *partition;
+    int num_procs, total_procs;
     HYD_Status status = HYD_SUCCESS;
 
     HYDU_FUNC_ENTER();
 
-    /* FIXME: We need a better approach than this -- we make two
-     * passes for the total host list, one to find the number of
-     * hosts, and another to read the actual hosts. */
-    proc_params = handle.proc_params;
-    while (proc_params) {
-        if (proc_params->host_file != NULL) {
-            if (!strcmp(proc_params->host_file, "HYDRA_USE_LOCALHOST")) {
-                proc_params->total_num_procs++;
-            }
-            else {
-                fp = fopen(proc_params->host_file, "r");
-                if (fp == NULL) {
-                    HYDU_Error_printf("unable to open host file %s\n", proc_params->host_file);
-                    status = HYD_INTERNAL_ERROR;
-                    goto fn_fail;
-                }
-
-                proc_params->total_num_procs = 0;
-                while (!feof(fp)) {
-                    if ((fscanf(fp, "%s", line) < 0) && errno) {
-                        HYDU_Error_printf("unable to read input line (errno: %d)\n", errno);
-                        status = HYD_INTERNAL_ERROR;
-                        goto fn_fail;
-                    }
-                    if (feof(fp))
-                        break;
-
-                    hostname = strtok(line, ":");
-                    procs = strtok(NULL, ":");
-                    if (procs)
-                        num_procs = atoi(procs);
-                    else
-                        num_procs = 1;
-
-                    proc_params->total_num_procs += num_procs;
-                }
-
-                fclose(fp);
-            }
+    if (strcmp(handle.host_file, "HYDRA_USE_LOCALHOST")) {
+        fp = fopen(handle.host_file, "r");
+        if (fp == NULL) {
+            HYDU_Error_printf("unable to open host file %s\n", handle.host_file);
+            status = HYD_INTERNAL_ERROR;
+            goto fn_fail;
         }
-        proc_params = proc_params->next;
     }
 
     proc_params = handle.proc_params;
     while (proc_params) {
-        if (proc_params->host_file != NULL) {
+        if (!strcmp(handle.host_file, "HYDRA_USE_LOCALHOST")) {
+            HYDU_MALLOC(proc_params->partition, struct HYD_Partition_list *,
+                        sizeof(struct HYD_Partition_list), status);
 
-            HYDU_MALLOC(proc_params->total_proc_list, char **,
-                        proc_params->total_num_procs * sizeof(char *), status);
-            HYDU_MALLOC(proc_params->total_core_list, int *,
-                        proc_params->total_num_procs * sizeof(int), status);
-
-            if (!strcmp(proc_params->host_file, "HYDRA_USE_LOCALHOST")) {
-                proc_params->total_proc_list[0] = MPIU_Strdup("localhost");
-                proc_params->total_core_list[0] = -1;
-            }
-            else {
-                fp = fopen(proc_params->host_file, "r");
-                if (fp == NULL) {
-                    HYDU_Error_printf("unable to open host file %s\n", proc_params->host_file);
+            proc_params->partition->name = MPIU_Strdup("localhost");
+            proc_params->partition->proc_count = proc_params->exec_proc_count;
+            proc_params->partition->mapping = NULL;
+            proc_params->partition->next = NULL;
+            total_procs = proc_params->exec_proc_count;
+        }
+        else {
+            total_procs = 0;
+            while (!feof(fp)) {
+                if ((fscanf(fp, "%s", line) < 0) && errno) {
+                    HYDU_Error_printf("unable to read input line (errno: %d)\n", errno);
                     status = HYD_INTERNAL_ERROR;
                     goto fn_fail;
                 }
+                if (feof(fp))
+                    break;
 
-                i = 0;
-                while (!feof(fp)) {
-                    if ((fscanf(fp, "%s", line) < 0) && errno) {
-                        HYDU_Error_printf("unable to read input line (errno: %d)\n", errno);
-                        status = HYD_INTERNAL_ERROR;
-                        goto fn_fail;
-                    }
-                    if (feof(fp))
-                        break;
+                hostname = strtok(line, ":");
+                procs = strtok(NULL, ":");
 
-                    hostname = strtok(line, ":");
-                    procs = strtok(NULL, ":");
+                num_procs = procs ? atoi(procs) : 1;
+                if (num_procs > (proc_params->exec_proc_count - total_procs))
+                    num_procs = (proc_params->exec_proc_count - total_procs);
 
-                    if (procs)
-                        num_procs = atoi(procs);
-                    else
-                        num_procs = 1;
-
-                    for (j = 0; j < num_procs; j++) {
-                        proc_params->total_proc_list[i] = MPIU_Strdup(hostname);
-                        proc_params->total_core_list[i] = -1;
-                        i++;
-                    }
+                if (proc_params->partition) {
+                    for (partition = proc_params->partition; partition->next;
+                         partition = partition->next);
+                    HYDU_MALLOC(partition->next, struct HYD_Partition_list *,
+                                sizeof(struct HYD_Partition_list), status);
+                    partition = partition->next;
                 }
 
-                fclose(fp);
+                partition->name = MPIU_Strdup(hostname);
+
+                /* FIXME: We don't support mappings yet */
+                partition->mapping = NULL;
+                partition->proc_count = num_procs;
+                partition->next = NULL;
+
+                total_procs += num_procs;
+                if (total_procs == proc_params->exec_proc_count)
+                    break;
             }
         }
+
+        if (total_procs != proc_params->exec_proc_count)
+            break;
         proc_params = proc_params->next;
     }
 
+    if (proc_params) {
+        HYDU_Error_printf("Not enough number of hosts in host file: %s\n", handle.host_file);
+        status = HYD_INTERNAL_ERROR;
+        goto fn_fail;
+    }
+
   fn_exit:
+    if (fp)
+        fclose(fp);
     HYDU_FUNC_EXIT();
     return status;
 
@@ -126,20 +105,23 @@
 HYD_Status HYD_LCHU_Free_host_list(void)
 {
     struct HYD_Proc_params *proc_params;
+    struct HYD_Partition_list *partition;
     int i;
     HYD_Status status = HYD_SUCCESS;
 
     HYDU_FUNC_ENTER();
 
-    proc_params = handle.proc_params;
-    while (proc_params) {
-        for (i = 0; i < proc_params->total_num_procs; i++)
-            HYDU_FREE(proc_params->total_proc_list[i]);
-        HYDU_FREE(proc_params->total_proc_list);
-        HYDU_FREE(proc_params->total_core_list);
-        HYDU_FREE(proc_params->host_file);
-        proc_params = proc_params->next;
+    for (proc_params = handle.proc_params; proc_params; proc_params = proc_params->next) {
+        for (partition = proc_params->partition; partition; partition = partition->next) {
+            HYDU_FREE(partition->name);
+            if (partition->mapping) {
+                if (partition->mapping[i])
+                    HYDU_FREE(partition->mapping[i]);
+                HYDU_FREE(partition->mapping);
+            }
+        }
     }
+    HYDU_FREE(handle.host_file);
 
     HYDU_FUNC_EXIT();
     return status;

Modified: mpich2/trunk/src/pm/hydra/pm/utils/pmi.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/utils/pmi.c	2009-03-07 23:43:00 UTC (rev 3969)
+++ mpich2/trunk/src/pm/hydra/pm/utils/pmi.c	2009-03-08 03:44:47 UTC (rev 3970)
@@ -104,7 +104,7 @@
     num_procs = 0;
     proc_params = handle.proc_params;
     while (proc_params) {
-        num_procs += proc_params->user_num_procs;
+        num_procs += proc_params->exec_proc_count;
         proc_params = proc_params->next;
     }
 
@@ -140,7 +140,7 @@
     size = 0;
     proc_params = handle.proc_params;
     while (proc_params) {
-        size += proc_params->user_num_procs;
+        size += proc_params->exec_proc_count;
         proc_params = proc_params->next;
     }
     debug = handle.debug;



More information about the mpich2-commits mailing list