[mpich2-commits] r6716 - in mpich2/trunk/src/pm/hydra: pm/pmiserv tools/ckpoint tools/ckpoint/blcr
buntinas at mcs.anl.gov
buntinas at mcs.anl.gov
Mon May 24 13:31:50 CDT 2010
Author: buntinas
Date: 2010-05-24 13:31:50 -0500 (Mon, 24 May 2010)
New Revision: 6716
Modified:
mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip_cb.c
mpich2/trunk/src/pm/hydra/tools/ckpoint/blcr/ckpoint_blcr.c
mpich2/trunk/src/pm/hydra/tools/ckpoint/blcr/ckpoint_blcr.h
mpich2/trunk/src/pm/hydra/tools/ckpoint/ckpoint.c
mpich2/trunk/src/pm/hydra/tools/ckpoint/ckpoint.h
Log:
Added a "-<pgid>-<proxy id>" suffix to the checkpoint context file name to keep multiple proxies from clobbering each other's files.
Modified: mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip_cb.c
===================================================================
--- mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip_cb.c 2010-05-24 18:30:32 UTC (rev 6715)
+++ mpich2/trunk/src/pm/hydra/pm/pmiserv/pmip_cb.c 2010-05-24 18:31:50 UTC (rev 6716)
@@ -510,7 +510,8 @@
HYDU_ERR_POP(status, "unable to create env\n");
/* Restart the proxy. Specify stdin fd only if pmi_rank 0 is in this proxy. */
- status = HYDT_ckpoint_restart(env, HYD_pmcd_pmip.local.proxy_process_count,
+ status = HYDT_ckpoint_restart(HYD_pmcd_pmip.local.pgid, HYD_pmcd_pmip.local.id,
+ env, HYD_pmcd_pmip.local.proxy_process_count,
pmi_ranks,
pmi_ranks[0] ? NULL :
HYD_pmcd_pmip.system_global.enable_stdin ?
@@ -892,7 +893,7 @@
}
else if (cmd == CKPOINT) {
HYD_pmcd_pmi_proxy_dump(status, STDOUT_FILENO, "requesting checkpoint\n");
- status = HYDT_ckpoint_suspend();
+ status = HYDT_ckpoint_suspend(HYD_pmcd_pmip.local.pgid, HYD_pmcd_pmip.local.id);
HYDU_ERR_POP(status, "checkpoint suspend failed\n");
HYD_pmcd_pmi_proxy_dump(status, STDOUT_FILENO, "checkpoint completed\n");
}
Modified: mpich2/trunk/src/pm/hydra/tools/ckpoint/blcr/ckpoint_blcr.c
===================================================================
--- mpich2/trunk/src/pm/hydra/tools/ckpoint/blcr/ckpoint_blcr.c 2010-05-24 18:30:32 UTC (rev 6715)
+++ mpich2/trunk/src/pm/hydra/tools/ckpoint/blcr/ckpoint_blcr.c 2010-05-24 18:31:50 UTC (rev 6716)
@@ -130,7 +130,7 @@
goto fn_exit;
}
-HYD_status HYDT_ckpoint_blcr_suspend(const char *prefix)
+HYD_status HYDT_ckpoint_blcr_suspend(const char *prefix, int pgid, int id)
{
HYD_status status = HYD_SUCCESS;
int ret;
@@ -142,7 +142,7 @@
HYDU_FUNC_ENTER();
/* build the checkpoint filename */
- snprintf(filename, sizeof(filename), "%s/context", prefix);
+ snprintf(filename, sizeof(filename), "%s/context-%d-%d", prefix, pgid, id);
/* remove existing checkpoint file, if any */
(void) unlink(filename);
@@ -201,7 +201,7 @@
goto fn_exit;
}
-HYD_status HYDT_ckpoint_blcr_restart(const char *prefix, struct HYD_env *envlist,
+HYD_status HYDT_ckpoint_blcr_restart(const char *prefix, int pgid, int id, struct HYD_env *envlist,
int num_ranks, int ranks[], int *in, int *out, int *err)
{
HYD_status status = HYD_SUCCESS;
@@ -224,7 +224,7 @@
if (status)
HYDU_ERR_POP(status, "blcr restart\n");
- snprintf(filename, sizeof(filename), "%s/context", prefix);
+ snprintf(filename, sizeof(filename), "%s/context-%d-%d", prefix, pgid, id);
context_fd = open(filename, O_RDONLY /* | O_LARGEFILE */);
HYDU_ERR_CHKANDJUMP(status, context_fd < 0, HYD_INTERNAL_ERROR, "open failed, %s\n",
Modified: mpich2/trunk/src/pm/hydra/tools/ckpoint/blcr/ckpoint_blcr.h
===================================================================
--- mpich2/trunk/src/pm/hydra/tools/ckpoint/blcr/ckpoint_blcr.h 2010-05-24 18:30:32 UTC (rev 6715)
+++ mpich2/trunk/src/pm/hydra/tools/ckpoint/blcr/ckpoint_blcr.h 2010-05-24 18:31:50 UTC (rev 6716)
@@ -8,8 +8,8 @@
#define CKPOINT_BLCR_H_INCLUDED
HYD_status HYDT_ckpoint_blcr_init(void);
-HYD_status HYDT_ckpoint_blcr_suspend(const char *prefix);
-HYD_status HYDT_ckpoint_blcr_restart(const char *prefix, struct HYD_env *envlist,
+HYD_status HYDT_ckpoint_blcr_suspend(const char *prefix, int pgid, int id);
+HYD_status HYDT_ckpoint_blcr_restart(const char *prefix, int pgid, int id, struct HYD_env *envlist,
int num_ranks, int ranks[], int *in, int *out, int *err);
#endif /* CKPOINT_BLCR_H_INCLUDED */
Modified: mpich2/trunk/src/pm/hydra/tools/ckpoint/ckpoint.c
===================================================================
--- mpich2/trunk/src/pm/hydra/tools/ckpoint/ckpoint.c 2010-05-24 18:30:32 UTC (rev 6715)
+++ mpich2/trunk/src/pm/hydra/tools/ckpoint/ckpoint.c 2010-05-24 18:31:50 UTC (rev 6716)
@@ -50,7 +50,7 @@
#endif /* HAVE_BLCR */
}
-HYD_status HYDT_ckpoint_suspend(void)
+HYD_status HYDT_ckpoint_suspend(int pgid, int id)
{
HYD_status status = HYD_SUCCESS;
@@ -61,7 +61,7 @@
#if defined HAVE_BLCR
if (!strcmp(HYDT_ckpoint_info.ckpointlib, "blcr")) {
- status = HYDT_ckpoint_blcr_suspend(HYDT_ckpoint_info.ckpoint_prefix);
+ status = HYDT_ckpoint_blcr_suspend(HYDT_ckpoint_info.ckpoint_prefix, pgid, id);
HYDU_ERR_POP(status, "blcr checkpoint returned error\n");
}
#endif /* HAVE_BLCR */
@@ -74,8 +74,7 @@
goto fn_exit;
}
-HYD_status HYDT_ckpoint_restart(struct HYD_env *envlist, int num_ranks, int ranks[], int *in,
- int *out, int *err)
+HYD_status HYDT_ckpoint_restart(int pgid, int id, struct HYD_env *envlist, int num_ranks, int ranks[], int *in, int *out, int *err)
{
HYD_status status = HYD_SUCCESS;
@@ -87,8 +86,7 @@
#if defined HAVE_BLCR
if (!strcmp(HYDT_ckpoint_info.ckpointlib, "blcr")) {
status =
- HYDT_ckpoint_blcr_restart(HYDT_ckpoint_info.ckpoint_prefix, envlist, num_ranks,
- ranks, in, out, err);
+ HYDT_ckpoint_blcr_restart(HYDT_ckpoint_info.ckpoint_prefix, pgid, id, envlist, num_ranks, ranks, in, out, err);
HYDU_ERR_POP(status, "blcr checkpoint returned error\n");
}
#endif /* HAVE_BLCR */
Modified: mpich2/trunk/src/pm/hydra/tools/ckpoint/ckpoint.h
===================================================================
--- mpich2/trunk/src/pm/hydra/tools/ckpoint/ckpoint.h 2010-05-24 18:30:32 UTC (rev 6715)
+++ mpich2/trunk/src/pm/hydra/tools/ckpoint/ckpoint.h 2010-05-24 18:31:50 UTC (rev 6716)
@@ -49,15 +49,20 @@
/**
* \brief HYDT_ckpoint_suspend - Initiate suspend of child processes
*
+ * \param[in] pgid process group id
+ * \param[in] id proxy id
+ *
* This function is called by a proxy to suspend all of its child
* processes.
*/
-HYD_status HYDT_ckpoint_suspend(void);
+HYD_status HYDT_ckpoint_suspend(int pgid, int id);
/**
* \brief HYDT_ckpoint_restart - Restart child processes
*
+ * \param[in] pgid process group id
+ * \param[in] id proxy id
* \param[in] envlist Environment setup from before the checkpoint
* \param[in] num_ranks Number of child processes to restart
* \param[in] ranks Array of ranks of the child processes
@@ -70,7 +75,7 @@
* reestablished. The environment passed in this list is resetup for
* each process.
*/
-HYD_status HYDT_ckpoint_restart(struct HYD_env *envlist, int num_ranks, int ranks[], int *in,
+HYD_status HYDT_ckpoint_restart(int pgid, int id, struct HYD_env *envlist, int num_ranks, int ranks[], int *in,
int *out, int *err);
/*!
More information about the mpich2-commits mailing list