[mpich2-commits] r4162 - mpich2/trunk/src/pm/hydra/pm/pmiserv
balaji at mcs.anl.gov
balaji at mcs.anl.gov
Sun Mar 22 18:09:38 CDT 2009
Author: balaji
Date: 2009-03-22 18:09:37 -0500 (Sun, 22 Mar 2009)
New Revision: 4162
Modified:
mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy.c
mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy.h
mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy_cb.c
mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy_utils.c
mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_serv_launch.c
Log:
Fix for a major problem with the PMI_ID calculation that crept in with
r4156 when we added support to allow proxies to have non-contiguous
PMI_IDs.
Modified: mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy.c 2009-03-22 20:25:45 UTC (rev 4161)
+++ mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy.c 2009-03-22 23:09:37 UTC (rev 4162)
@@ -15,12 +15,13 @@
int main(int argc, char **argv)
{
int i, j, arg, count, pid, ret_status;
- int stdin_fd, timeout, process_id, core;
+ int stdin_fd, timeout, process_id, core, pmi_id, rem;
char *str, *timeout_str;
char *client_args[HYD_EXEC_ARGS];
char *tmp[HYDU_NUM_JOIN_STR];
HYD_Env_t *env;
struct HYD_Partition_exec *exec;
+ struct HYD_Partition_segment *segment;
HYD_Status status = HYD_SUCCESS;
status = HYD_PMCD_pmi_proxy_get_params(argc, argv);
@@ -44,20 +45,24 @@
* hierarchy of proxies. */
HYD_PMCD_pmi_proxy_params.partition_proc_count = 0;
+ for (segment = HYD_PMCD_pmi_proxy_params.segment_list; segment; segment = segment->next)
+ HYD_PMCD_pmi_proxy_params.partition_proc_count += segment->proc_count;
+
+ HYD_PMCD_pmi_proxy_params.exec_proc_count = 0;
for (exec = HYD_PMCD_pmi_proxy_params.exec_list; exec; exec = exec->next)
- HYD_PMCD_pmi_proxy_params.partition_proc_count += exec->proc_count;
+ HYD_PMCD_pmi_proxy_params.exec_proc_count += exec->proc_count;
HYDU_MALLOC(HYD_PMCD_pmi_proxy_params.out, int *,
- HYD_PMCD_pmi_proxy_params.partition_proc_count * sizeof(int), status);
+ HYD_PMCD_pmi_proxy_params.exec_proc_count * sizeof(int), status);
HYDU_MALLOC(HYD_PMCD_pmi_proxy_params.err, int *,
- HYD_PMCD_pmi_proxy_params.partition_proc_count * sizeof(int), status);
+ HYD_PMCD_pmi_proxy_params.exec_proc_count * sizeof(int), status);
HYDU_MALLOC(HYD_PMCD_pmi_proxy_params.pid, int *,
- HYD_PMCD_pmi_proxy_params.partition_proc_count * sizeof(int), status);
+ HYD_PMCD_pmi_proxy_params.exec_proc_count * sizeof(int), status);
HYDU_MALLOC(HYD_PMCD_pmi_proxy_params.exit_status, int *,
- HYD_PMCD_pmi_proxy_params.partition_proc_count * sizeof(int), status);
+ HYD_PMCD_pmi_proxy_params.exec_proc_count * sizeof(int), status);
/* Initialize the exit status */
- for (i = 0; i < HYD_PMCD_pmi_proxy_params.partition_proc_count; i++)
+ for (i = 0; i < HYD_PMCD_pmi_proxy_params.exec_proc_count; i++)
HYD_PMCD_pmi_proxy_params.exit_status[i] = -1;
/* For local spawning, set the global environment here itself */
@@ -73,7 +78,21 @@
for (exec = HYD_PMCD_pmi_proxy_params.exec_list; exec; exec = exec->next) {
for (i = 0; i < exec->proc_count; i++) {
- str = HYDU_int_to_str(HYD_PMCD_pmi_proxy_params.pmi_id + process_id);
+ pmi_id = ((process_id / HYD_PMCD_pmi_proxy_params.partition_proc_count) *
+ HYD_PMCD_pmi_proxy_params.one_pass_count);
+ rem = (process_id % HYD_PMCD_pmi_proxy_params.partition_proc_count);
+
+ for (segment = HYD_PMCD_pmi_proxy_params.segment_list; segment;
+ segment = segment->next) {
+ if (rem >= segment->proc_count)
+ rem -= segment->proc_count;
+ else {
+ pmi_id += segment->start_pid + rem;
+ break;
+ }
+ }
+
+ str = HYDU_int_to_str(pmi_id);
status = HYDU_env_create(&env, "PMI_ID", str);
HYDU_ERR_POP(status, "unable to create env\n");
HYDU_FREE(str);
@@ -89,13 +108,25 @@
client_args[arg++] = NULL;
core = HYDU_next_core(core, HYD_PMCD_pmi_proxy_params.binding);
- if ((process_id + HYD_PMCD_pmi_proxy_params.pmi_id) == 0) {
+ if (pmi_id == 0) {
status = HYDU_create_process(client_args, exec->prop_env,
&HYD_PMCD_pmi_proxy_params.in,
&HYD_PMCD_pmi_proxy_params.out[process_id],
&HYD_PMCD_pmi_proxy_params.err[process_id],
&HYD_PMCD_pmi_proxy_params.pid[process_id],
core);
+
+ status = HYDU_sock_set_nonblock(HYD_PMCD_pmi_proxy_params.in);
+ HYDU_ERR_POP(status, "unable to set socket as non-blocking\n");
+
+ stdin_fd = 0;
+ status = HYDU_sock_set_nonblock(stdin_fd);
+ HYDU_ERR_POP(status, "unable to set socket as non-blocking\n");
+
+ HYD_PMCD_pmi_proxy_params.stdin_buf_offset = 0;
+ HYD_PMCD_pmi_proxy_params.stdin_buf_count = 0;
+ status = HYD_DMX_register_fd(1, &stdin_fd, HYD_STDIN, HYD_PMCD_pmi_proxy_stdin_cb);
+ HYDU_ERR_POP(status, "unable to register fd\n");
}
else {
status = HYDU_create_process(client_args, exec->prop_env,
@@ -112,45 +143,25 @@
}
/* Everything is spawned, now wait for I/O */
- status = HYD_DMX_register_fd(HYD_PMCD_pmi_proxy_params.partition_proc_count,
+ status = HYD_DMX_register_fd(HYD_PMCD_pmi_proxy_params.exec_proc_count,
HYD_PMCD_pmi_proxy_params.out,
HYD_STDOUT, HYD_PMCD_pmi_proxy_stdout_cb);
HYDU_ERR_POP(status, "unable to register fd\n");
- status = HYD_DMX_register_fd(HYD_PMCD_pmi_proxy_params.partition_proc_count,
+ status = HYD_DMX_register_fd(HYD_PMCD_pmi_proxy_params.exec_proc_count,
HYD_PMCD_pmi_proxy_params.err,
HYD_STDOUT, HYD_PMCD_pmi_proxy_stderr_cb);
HYDU_ERR_POP(status, "unable to register fd\n");
- if (HYD_PMCD_pmi_proxy_params.pmi_id == 0) {
- status = HYDU_sock_set_nonblock(HYD_PMCD_pmi_proxy_params.in);
- HYDU_ERR_POP(status, "unable to set socket as non-blocking\n");
-
- stdin_fd = 0;
- status = HYDU_sock_set_nonblock(stdin_fd);
- HYDU_ERR_POP(status, "unable to set socket as non-blocking\n");
-
- HYD_PMCD_pmi_proxy_params.stdin_buf_offset = 0;
- HYD_PMCD_pmi_proxy_params.stdin_buf_count = 0;
- status = HYD_DMX_register_fd(1, &stdin_fd, HYD_STDIN, HYD_PMCD_pmi_proxy_stdin_cb);
- HYDU_ERR_POP(status, "unable to register fd\n");
- }
-
- timeout_str = getenv("MPIEXEC_TIMEOUT");
- if (timeout_str)
- timeout = atoi(timeout_str);
- else
- timeout = -1;
-
while (1) {
/* Wait for some event to occur */
- status = HYD_DMX_wait_for_event(timeout);
+ status = HYD_DMX_wait_for_event(-1);
HYDU_ERR_POP(status, "demux engine error waiting for event\n");
/* Check to see if there's any open read socket left; if there
* are, we will just wait for more events. */
count = 0;
- for (i = 0; i < HYD_PMCD_pmi_proxy_params.partition_proc_count; i++) {
+ for (i = 0; i < HYD_PMCD_pmi_proxy_params.exec_proc_count; i++) {
if (HYD_PMCD_pmi_proxy_params.out[i] != -1 ||
HYD_PMCD_pmi_proxy_params.err[i] != -1) {
count++;
@@ -175,13 +186,13 @@
/* Find the pid and mark it as complete. */
if (pid > 0)
- for (i = 0; i < HYD_PMCD_pmi_proxy_params.partition_proc_count; i++)
+ for (i = 0; i < HYD_PMCD_pmi_proxy_params.exec_proc_count; i++)
if (HYD_PMCD_pmi_proxy_params.pid[i] == pid)
HYD_PMCD_pmi_proxy_params.exit_status[i] = WEXITSTATUS(ret_status);
/* Check how many more processes are pending */
count = 0;
- for (i = 0; i < HYD_PMCD_pmi_proxy_params.partition_proc_count; i++) {
+ for (i = 0; i < HYD_PMCD_pmi_proxy_params.exec_proc_count; i++) {
if (HYD_PMCD_pmi_proxy_params.exit_status[i] == -1) {
count++;
break;
@@ -197,7 +208,7 @@
} while (1);
ret_status = 0;
- for (i = 0; i < HYD_PMCD_pmi_proxy_params.partition_proc_count; i++)
+ for (i = 0; i < HYD_PMCD_pmi_proxy_params.exec_proc_count; i++)
ret_status |= HYD_PMCD_pmi_proxy_params.exit_status[i];
fn_exit:
Modified: mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy.h
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy.h 2009-03-22 20:25:45 UTC (rev 4161)
+++ mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy.h 2009-03-22 23:09:37 UTC (rev 4162)
@@ -12,16 +12,16 @@
struct HYD_PMCD_pmi_proxy_params {
int proxy_port;
- int pmi_id;
char *wdir;
HYD_Binding binding;
HYD_Env_t *global_env;
int one_pass_count;
int partition_proc_count;
+ int exec_proc_count;
/* Process segmentation information for this partition */
- struct HYD_Partition_segment segment;
+ struct HYD_Partition_segment *segment_list;
struct HYD_Partition_exec *exec_list;
int *pid;
Modified: mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy_cb.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy_cb.c 2009-03-22 20:25:45 UTC (rev 4161)
+++ mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy_cb.c 2009-03-22 23:09:37 UTC (rev 4162)
@@ -46,7 +46,7 @@
}
if (cmd == KILLALL_PROCS) { /* Got the killall command */
- for (i = 0; i < HYD_PMCD_pmi_proxy_params.partition_proc_count; i++)
+ for (i = 0; i < HYD_PMCD_pmi_proxy_params.exec_proc_count; i++)
if (HYD_PMCD_pmi_proxy_params.pid[i] != -1)
kill(HYD_PMCD_pmi_proxy_params.pid[i], SIGKILL);
@@ -84,7 +84,7 @@
status = HYD_DMX_deregister_fd(fd);
HYDU_ERR_POP(status, "unable to deregister fd\n");
- for (i = 0; i < HYD_PMCD_pmi_proxy_params.partition_proc_count; i++)
+ for (i = 0; i < HYD_PMCD_pmi_proxy_params.exec_proc_count; i++)
if (HYD_PMCD_pmi_proxy_params.out[i] == fd)
HYD_PMCD_pmi_proxy_params.out[i] = -1;
}
@@ -113,7 +113,7 @@
status = HYD_DMX_deregister_fd(fd);
HYDU_ERR_POP(status, "unable to deregister fd\n");
- for (i = 0; i < HYD_PMCD_pmi_proxy_params.partition_proc_count; i++)
+ for (i = 0; i < HYD_PMCD_pmi_proxy_params.exec_proc_count; i++)
if (HYD_PMCD_pmi_proxy_params.err[i] == fd)
HYD_PMCD_pmi_proxy_params.err[i] = -1;
}
Modified: mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy_utils.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy_utils.c 2009-03-22 20:25:45 UTC (rev 4161)
+++ mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_proxy_utils.c 2009-03-22 23:09:37 UTC (rev 4162)
@@ -14,15 +14,19 @@
int arg, i, count;
HYD_Env_t *env;
struct HYD_Partition_exec *exec = NULL;
+ struct HYD_Partition_segment *segment = NULL;
HYD_Status status = HYD_SUCCESS;
HYDU_FUNC_ENTER();
HYD_PMCD_pmi_proxy_params.exec_list = NULL;
+ HYD_PMCD_pmi_proxy_params.segment_list = NULL;
HYD_PMCD_pmi_proxy_params.global_env = NULL;
while (*argv) {
++argv;
+ if (*argv == NULL)
+ break;
/* Proxy port */
if (!strcmp(*argv, "--proxy-port")) {
@@ -74,14 +78,39 @@
continue;
}
- /* PMI_ID: This is the PMI_ID for the first process;
- * everything else is incremented from here. */
- if (!strcmp(*argv, "--pmi-id")) {
+ /* New segment */
+ if (!strcmp(*argv, "--segment")) {
+ if (HYD_PMCD_pmi_proxy_params.segment_list == NULL) {
+ status = HYDU_alloc_partition_segment(&HYD_PMCD_pmi_proxy_params.segment_list);
+ HYDU_ERR_POP(status, "unable to allocate partition segment\n");
+ }
+ else {
+ for (segment = HYD_PMCD_pmi_proxy_params.segment_list; segment->next;
+ segment = segment->next);
+ status = HYDU_alloc_partition_segment(&segment->next);
+ HYDU_ERR_POP(status, "unable to allocate partition segment\n");
+ }
+ continue;
+ }
+
+ /* Process count */
+ if (!strcmp(*argv, "--segment-proc-count")) {
argv++;
- HYD_PMCD_pmi_proxy_params.pmi_id = atoi(*argv);
+ for (segment = HYD_PMCD_pmi_proxy_params.segment_list; segment->next;
+ segment = segment->next);
+ segment->proc_count = atoi(*argv);
continue;
}
+ /* Process count */
+ if (!strcmp(*argv, "--segment-start-pid")) {
+ argv++;
+ for (segment = HYD_PMCD_pmi_proxy_params.segment_list; segment->next;
+ segment = segment->next);
+ segment->start_pid = atoi(*argv);
+ continue;
+ }
+
/* New executable */
if (!strcmp(*argv, "--exec")) {
if (HYD_PMCD_pmi_proxy_params.exec_list == NULL) {
@@ -98,7 +127,7 @@
}
/* Process count */
- if (!strcmp(*argv, "--proc-count")) {
+ if (!strcmp(*argv, "--exec-proc-count")) {
argv++;
for (exec = HYD_PMCD_pmi_proxy_params.exec_list; exec->next; exec = exec->next);
exec->proc_count = atoi(*argv);
@@ -106,7 +135,7 @@
}
/* Local env */
- if (!strcmp(*argv, "--local-env")) {
+ if (!strcmp(*argv, "--exec-local-env")) {
argv++;
count = atoi(*argv);
for (i = 0; i < count; i++) {
Modified: mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_serv_launch.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_serv_launch.c 2009-03-22 20:25:45 UTC (rev 4161)
+++ mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_serv_launch.c 2009-03-22 23:09:37 UTC (rev 4162)
@@ -44,6 +44,7 @@
char *path_str[HYDU_NUM_JOIN_STR];
struct HYD_Partition *partition;
struct HYD_Partition_exec *exec;
+ struct HYD_Partition_segment *segment;
HYD_Status status = HYD_SUCCESS;
HYDU_FUNC_ENTER();
@@ -135,22 +136,30 @@
HYDU_list_append_env_to_str(handle.system_env, partition->proxy_args);
HYDU_list_append_env_to_str(handle.prop_env, partition->proxy_args);
- arg = HYDU_strlist_lastidx(partition->proxy_args);
- partition->proxy_args[arg++] = MPIU_Strdup("--pmi-id");
- partition->proxy_args[arg++] = HYDU_int_to_str(process_id);;
- partition->proxy_args[arg++] = NULL;
+ /* Pass the segment information */
+ for (segment = partition->segment_list; segment; segment = segment->next) {
+ arg = HYDU_strlist_lastidx(partition->proxy_args);
+ partition->proxy_args[arg++] = MPIU_Strdup("--segment");
+ partition->proxy_args[arg++] = MPIU_Strdup("--segment-start-pid");
+ partition->proxy_args[arg++] = HYDU_int_to_str(segment->start_pid);
+
+ partition->proxy_args[arg++] = MPIU_Strdup("--segment-proc-count");
+ partition->proxy_args[arg++] = HYDU_int_to_str(segment->proc_count);
+ partition->proxy_args[arg++] = NULL;
+ }
+
/* Now pass the local executable information */
for (exec = partition->exec_list; exec; exec = exec->next) {
arg = HYDU_strlist_lastidx(partition->proxy_args);
partition->proxy_args[arg++] = MPIU_Strdup("--exec");
- partition->proxy_args[arg++] = MPIU_Strdup("--proc-count");
+ partition->proxy_args[arg++] = MPIU_Strdup("--exec-proc-count");
partition->proxy_args[arg++] = HYDU_int_to_str(exec->proc_count);
partition->proxy_args[arg++] = NULL;
arg = HYDU_strlist_lastidx(partition->proxy_args);
- partition->proxy_args[arg++] = MPIU_Strdup("--local-env");
+ partition->proxy_args[arg++] = MPIU_Strdup("--exec-local-env");
for (i = 0, env = exec->prop_env; env; env = env->next, i++);
HYDU_ERR_POP(status, "unable to convert int to string\n");
partition->proxy_args[arg++] = HYDU_int_to_str(i);
More information about the mpich2-commits
mailing list