[mpich2-commits] r6716 - in mpich2/trunk/src/pm/hydra: pm/pmiserv tools/ckpoint tools/ckpoint/blcr

buntinas at mcs.anl.gov buntinas at mcs.anl.gov
Mon May 24 13:31:50 CDT 2010


Author: buntinas
Date: 2010-05-24 13:31:50 -0500 (Mon, 24 May 2010)
New Revision: 6716

Modified:
   mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip_cb.c
   mpich2/trunk/src/pm/hydra/tools/ckpoint/blcr/ckpoint_blcr.c
   mpich2/trunk/src/pm/hydra/tools/ckpoint/blcr/ckpoint_blcr.h
   mpich2/trunk/src/pm/hydra/tools/ckpoint/ckpoint.c
   mpich2/trunk/src/pm/hydra/tools/ckpoint/ckpoint.h
Log:
Added a "-<pgid>-<proxy id>" suffix to the checkpoint context file name (now "context-<pgid>-<id>") to keep multiple proxies from clobbering each other's files.

Modified: mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip_cb.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip_cb.c	2010-05-24 18:30:32 UTC (rev 6715)
+++ mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip_cb.c	2010-05-24 18:31:50 UTC (rev 6716)
@@ -510,7 +510,8 @@
         HYDU_ERR_POP(status, "unable to create env\n");
 
         /* Restart the proxy.  Specify stdin fd only if pmi_rank 0 is in this proxy. */
-        status = HYDT_ckpoint_restart(env, HYD_pmcd_pmip.local.proxy_process_count,
+        status = HYDT_ckpoint_restart(HYD_pmcd_pmip.local.pgid, HYD_pmcd_pmip.local.id,
+                                      env, HYD_pmcd_pmip.local.proxy_process_count,
                                       pmi_ranks,
                                       pmi_ranks[0] ? NULL :
                                       HYD_pmcd_pmip.system_global.enable_stdin ?
@@ -892,7 +893,7 @@
     }
     else if (cmd == CKPOINT) {
         HYD_pmcd_pmi_proxy_dump(status, STDOUT_FILENO, "requesting checkpoint\n");
-        status = HYDT_ckpoint_suspend();
+        status = HYDT_ckpoint_suspend(HYD_pmcd_pmip.local.pgid, HYD_pmcd_pmip.local.id);
         HYDU_ERR_POP(status, "checkpoint suspend failed\n");
         HYD_pmcd_pmi_proxy_dump(status, STDOUT_FILENO, "checkpoint completed\n");
     }

Modified: mpich2/trunk/src/pm/hydra/tools/ckpoint/blcr/ckpoint_blcr.c
===================================================================
--- mpich2/trunk/src/pm/hydra/tools/ckpoint/blcr/ckpoint_blcr.c	2010-05-24 18:30:32 UTC (rev 6715)
+++ mpich2/trunk/src/pm/hydra/tools/ckpoint/blcr/ckpoint_blcr.c	2010-05-24 18:31:50 UTC (rev 6716)
@@ -130,7 +130,7 @@
     goto fn_exit;
 }
 
-HYD_status HYDT_ckpoint_blcr_suspend(const char *prefix)
+HYD_status HYDT_ckpoint_blcr_suspend(const char *prefix, int pgid, int id)
 {
     HYD_status status = HYD_SUCCESS;
     int ret;
@@ -142,7 +142,7 @@
     HYDU_FUNC_ENTER();
 
     /* build the checkpoint filename */
-    snprintf(filename, sizeof(filename), "%s/context", prefix);
+    snprintf(filename, sizeof(filename), "%s/context-%d-%d", prefix, pgid, id);
 
     /* remove existing checkpoint file, if any */
     (void) unlink(filename);
@@ -201,7 +201,7 @@
     goto fn_exit;
 }
 
-HYD_status HYDT_ckpoint_blcr_restart(const char *prefix, struct HYD_env *envlist,
+HYD_status HYDT_ckpoint_blcr_restart(const char *prefix, int pgid, int id, struct HYD_env *envlist,
                                      int num_ranks, int ranks[], int *in, int *out, int *err)
 {
     HYD_status status = HYD_SUCCESS;
@@ -224,7 +224,7 @@
     if (status)
         HYDU_ERR_POP(status, "blcr restart\n");
 
-    snprintf(filename, sizeof(filename), "%s/context", prefix);
+    snprintf(filename, sizeof(filename), "%s/context-%d-%d", prefix, pgid, id);
 
     context_fd = open(filename, O_RDONLY /* | O_LARGEFILE */);
     HYDU_ERR_CHKANDJUMP(status, context_fd < 0, HYD_INTERNAL_ERROR, "open failed, %s\n",

Modified: mpich2/trunk/src/pm/hydra/tools/ckpoint/blcr/ckpoint_blcr.h
===================================================================
--- mpich2/trunk/src/pm/hydra/tools/ckpoint/blcr/ckpoint_blcr.h	2010-05-24 18:30:32 UTC (rev 6715)
+++ mpich2/trunk/src/pm/hydra/tools/ckpoint/blcr/ckpoint_blcr.h	2010-05-24 18:31:50 UTC (rev 6716)
@@ -8,8 +8,8 @@
 #define CKPOINT_BLCR_H_INCLUDED
 
 HYD_status HYDT_ckpoint_blcr_init(void);
-HYD_status HYDT_ckpoint_blcr_suspend(const char *prefix);
-HYD_status HYDT_ckpoint_blcr_restart(const char *prefix, struct HYD_env *envlist,
+HYD_status HYDT_ckpoint_blcr_suspend(const char *prefix, int pgid, int id);
+HYD_status HYDT_ckpoint_blcr_restart(const char *prefix, int pgid, int id, struct HYD_env *envlist,
                                      int num_ranks, int ranks[], int *in, int *out, int *err);
 
 #endif /* CKPOINT_BLCR_H_INCLUDED */

Modified: mpich2/trunk/src/pm/hydra/tools/ckpoint/ckpoint.c
===================================================================
--- mpich2/trunk/src/pm/hydra/tools/ckpoint/ckpoint.c	2010-05-24 18:30:32 UTC (rev 6715)
+++ mpich2/trunk/src/pm/hydra/tools/ckpoint/ckpoint.c	2010-05-24 18:31:50 UTC (rev 6716)
@@ -50,7 +50,7 @@
 #endif /* HAVE_BLCR */
 }
 
-HYD_status HYDT_ckpoint_suspend(void)
+HYD_status HYDT_ckpoint_suspend(int pgid, int id)
 {
     HYD_status status = HYD_SUCCESS;
 
@@ -61,7 +61,7 @@
 
 #if defined HAVE_BLCR
     if (!strcmp(HYDT_ckpoint_info.ckpointlib, "blcr")) {
-        status = HYDT_ckpoint_blcr_suspend(HYDT_ckpoint_info.ckpoint_prefix);
+        status = HYDT_ckpoint_blcr_suspend(HYDT_ckpoint_info.ckpoint_prefix, pgid, id);
         HYDU_ERR_POP(status, "blcr checkpoint returned error\n");
     }
 #endif /* HAVE_BLCR */
@@ -74,8 +74,7 @@
     goto fn_exit;
 }
 
-HYD_status HYDT_ckpoint_restart(struct HYD_env *envlist, int num_ranks, int ranks[], int *in,
-                                int *out, int *err)
+HYD_status HYDT_ckpoint_restart(int pgid, int id, struct HYD_env *envlist, int num_ranks, int ranks[], int *in, int *out, int *err)
 {
     HYD_status status = HYD_SUCCESS;
 
@@ -87,8 +86,7 @@
 #if defined HAVE_BLCR
     if (!strcmp(HYDT_ckpoint_info.ckpointlib, "blcr")) {
         status =
-            HYDT_ckpoint_blcr_restart(HYDT_ckpoint_info.ckpoint_prefix, envlist, num_ranks,
-                                      ranks, in, out, err);
+            HYDT_ckpoint_blcr_restart(HYDT_ckpoint_info.ckpoint_prefix, pgid, id, envlist, num_ranks, ranks, in, out, err);
         HYDU_ERR_POP(status, "blcr checkpoint returned error\n");
     }
 #endif /* HAVE_BLCR */

Modified: mpich2/trunk/src/pm/hydra/tools/ckpoint/ckpoint.h
===================================================================
--- mpich2/trunk/src/pm/hydra/tools/ckpoint/ckpoint.h	2010-05-24 18:30:32 UTC (rev 6715)
+++ mpich2/trunk/src/pm/hydra/tools/ckpoint/ckpoint.h	2010-05-24 18:31:50 UTC (rev 6716)
@@ -49,15 +49,20 @@
 /**
  * \brief HYDT_ckpoint_suspend - Initiate suspend of child processes
  *
+ * \param[in] pgid  process group id
+ * \param[in] id    proxy id
+ *
  * This function is called by a proxy to suspend all of its child
  * processes.
  */
-HYD_status HYDT_ckpoint_suspend(void);
+HYD_status HYDT_ckpoint_suspend(int pgid, int id);
 
 
 /**
  * \brief HYDT_ckpoint_restart - Restart child processes
  *
+ * \param[in] pgid       process group id
+ * \param[in] id         proxy id
  * \param[in] envlist    Environment setup from before the checkpoint
  * \param[in] num_ranks  Number of child processes to restart
  * \param[in] ranks      Array of ranks of the child processes
@@ -70,7 +75,7 @@
  * reestablished. The environment passed in this list is resetup for
  * each process.
  */
-HYD_status HYDT_ckpoint_restart(struct HYD_env *envlist, int num_ranks, int ranks[], int *in,
+HYD_status HYDT_ckpoint_restart(int pgid, int id, struct HYD_env *envlist, int num_ranks, int ranks[], int *in,
                                 int *out, int *err);
 
 /*!



More information about the mpich2-commits mailing list