[mpich2-commits] r5602 - mpich2/trunk/src/pm/hydra/pm/pmiserv

balaji at mcs.anl.gov balaji at mcs.anl.gov
Wed Oct 28 15:31:10 CDT 2009


Author: balaji
Date: 2009-10-28 15:31:10 -0500 (Wed, 28 Oct 2009)
New Revision: 5602

Modified:
   mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_handle_v2.c
Log:
The case where a kvs get should return an error was not correctly
implemented. We were waiting for all processes to reach a consistent
epoch. Instead, we should wait for all processes to be in a higher or
equal epoch number as this process.

Modified: mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_handle_v2.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_handle_v2.c	2009-10-28 20:24:01 UTC (rev 5601)
+++ mpich2/trunk/src/pm/hydra/pm/pmiserv/pmi_handle_v2.c	2009-10-28 20:31:10 UTC (rev 5602)
@@ -174,6 +174,19 @@
     goto fn_exit;
 }
 
+static void print_req_list(void)
+{
+    struct reqs *req;
+
+    if (pending_reqs)
+        HYDU_dump_noprefix(stdout, "( ");
+    for (req = pending_reqs; req; req = req->next)
+        HYDU_dump_noprefix(stdout, "%s ",
+                           (req->type == NODE_ATTR_GET) ? "NODE_ATTR_GET" : "KVS_GET");
+    if (pending_reqs)
+        HYDU_dump_noprefix(stdout, ")\n");
+}
+
 static HYD_status fn_fullinit(int fd, char *args[])
 {
     int id, rank, i;
@@ -387,34 +400,7 @@
         }
     }
 
-    if (!found) {       /* We need to decide whether to return not found or queue up */
-        if (waitval && !strcmp(waitval, "TRUE")) {
-            /* queue up */
-            status = queue_req(fd, NODE_ATTR_GET, args);
-            HYDU_ERR_POP(status, "unable to queue request\n");
-        }
-        else {
-            /* Tell the client that we can't find the attribute */
-            i = 0;
-            tmp[i++] = HYDU_strdup("cmd=info-getnodeattr-response;");
-            if (thrid) {
-                tmp[i++] = HYDU_strdup("thrid=");
-                tmp[i++] = HYDU_strdup(thrid);
-                tmp[i++] = HYDU_strdup(";");
-            }
-            tmp[i++] = HYDU_strdup("found=FALSE;rc=0;");
-            tmp[i++] = NULL;
-
-            status = HYDU_str_alloc_and_join(tmp, &cmd);
-            HYDU_ERR_POP(status, "unable to join strings\n");
-            HYDU_free_strlist(tmp);
-
-            status = send_command(fd, cmd);
-            HYDU_ERR_POP(status, "send command failed\n");
-            HYDU_FREE(cmd);
-        }
-    }
-    else {      /* We found the attribute */
+    if (found) {      /* We found the attribute */
         i = 0;
         tmp[i++] = HYDU_strdup("cmd=info-getnodeattr-response;");
         if (thrid) {
@@ -434,10 +420,43 @@
         status = send_command(fd, cmd);
         HYDU_ERR_POP(status, "send command failed\n");
         HYDU_FREE(cmd);
+    }
+    else if (waitval && !strcmp(waitval, "TRUE")) {
+        /* The client wants to wait for a response; queue up the request */
+        status = queue_req(fd, NODE_ATTR_GET, args);
+        HYDU_ERR_POP(status, "unable to queue request\n");
 
-        req_complete = 1;
+        goto fn_exit;
     }
+    else {
+        /* Tell the client that we can't find the attribute */
+        i = 0;
+        tmp[i++] = HYDU_strdup("cmd=info-getnodeattr-response;");
+        if (thrid) {
+            tmp[i++] = HYDU_strdup("thrid=");
+            tmp[i++] = HYDU_strdup(thrid);
+            tmp[i++] = HYDU_strdup(";");
+        }
+        tmp[i++] = HYDU_strdup("found=FALSE;rc=0;");
+        tmp[i++] = NULL;
 
+        status = HYDU_str_alloc_and_join(tmp, &cmd);
+        HYDU_ERR_POP(status, "unable to join strings\n");
+        HYDU_free_strlist(tmp);
+
+        status = send_command(fd, cmd);
+        HYDU_ERR_POP(status, "send command failed\n");
+        HYDU_FREE(cmd);
+    }
+
+    /* Mark the global completion variable, in case the progress
+     * engine is monitoring. */
+    /* FIXME: This should be an output parameter. We need to change
+     * the structure of the PMI function table to be able to take
+     * additional arguments, and not just the ones passed on the
+     * wire. */
+    req_complete = 1;
+
   fn_exit:
     HYDU_FUNC_EXIT();
     return status;
@@ -614,14 +633,14 @@
 
 static HYD_status fn_kvs_get(int fd, char *args[])
 {
-    int i, found, node_count;
+    int i, found, barrier, process_count;
     HYD_pmcd_pmi_process_t *process, *prun;
     HYD_pmcd_pmi_node_t *node;
     HYD_pmcd_pmi_kvs_pair_t *run;
     char *key, *thrid;
     char *tmp[HYD_NUM_TMP_STRINGS], *cmd;
     struct token *tokens;
-    int token_count, consistent_epoch;
+    int token_count;
     HYD_status status = HYD_SUCCESS;
 
     HYDU_FUNC_ENTER();
@@ -649,22 +668,22 @@
     }
 
     if (!found) {
-        consistent_epoch = 1;
-        node_count = 0;
+        barrier = 1;
+        process_count = 0;
         for (node = process->node->pg->node_list; node; node = node->next) {
-            node_count++;
             for (prun = node->process_list; prun; prun = prun->next) {
-                if (prun->epoch != process->epoch) {
-                    /* The epochs are not consistent */
-                    consistent_epoch = 0;
+                process_count++;
+                if (prun->epoch < process->epoch) {
+                    barrier = 0;
                     break;
                 }
             }
+            if (!barrier)
+                break;
         }
 
-        if (consistent_epoch == 0 ||
-            ((process->epoch > 0) && (node_count != HYD_pg_list->num_procs))) {
-            /* queue up */
+        if (!barrier || process_count < HYD_pg_list->num_procs) {
+            /* We haven't reached a barrier yet; queue up request */
             status = queue_req(fd, KVS_GET, args);
             HYDU_ERR_POP(status, "unable to queue request\n");
 
@@ -672,8 +691,6 @@
             goto fn_exit;
         }
     }
-    else
-        req_complete = 1;
 
     i = 0;
     tmp[i++] = HYDU_strdup("cmd=kvs-get-response;");
@@ -701,6 +718,7 @@
     HYDU_ERR_POP(status, "send command failed\n");
 
     HYDU_FREE(cmd);
+    req_complete = 1;
 
   fn_exit:
     HYDU_FUNC_EXIT();



More information about the mpich2-commits mailing list