[mpich2-commits] r7728 - in mpich2/trunk/src/mpid/ch3: include src
buntinas at mcs.anl.gov
buntinas at mcs.anl.gov
Thu Jan 13 17:30:20 CST 2011
Author: buntinas
Date: 2011-01-13 17:30:20 -0600 (Thu, 13 Jan 2011)
New Revision: 7728
Modified:
mpich2/trunk/src/mpid/ch3/include/mpidimpl.h
mpich2/trunk/src/mpid/ch3/src/ch3u_handle_connection.c
mpich2/trunk/src/mpid/ch3/src/mpid_finalize.c
mpich2/trunk/src/mpid/ch3/src/mpid_init.c
Log:
Added an attribute to COMM_WORLD that lists failed processes. The attribute is called MPICH_ATTR_FAILED_PROCESSES, and it should be declared 'extern' in the application. (I'm not sure about the ABI implications of adding a symbol to mpi.h.)
Modified: mpich2/trunk/src/mpid/ch3/include/mpidimpl.h
===================================================================
--- mpich2/trunk/src/mpid/ch3/include/mpidimpl.h 2011-01-13 22:59:25 UTC (rev 7727)
+++ mpich2/trunk/src/mpid/ch3/include/mpidimpl.h 2011-01-13 23:30:20 UTC (rev 7728)
@@ -1429,7 +1429,10 @@
MPID_Comm_connect and accept */
int MPIDI_CH3_Connect_to_root(const char *, MPIDI_VC_t **);
+/* keyval for COMM_WORLD attribute holding list of failed processes */
+extern int MPICH_ATTR_FAILED_PROCESSES;
+
/*
* Channel utility prototypes
*/
Modified: mpich2/trunk/src/mpid/ch3/src/ch3u_handle_connection.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/src/ch3u_handle_connection.c 2011-01-13 22:59:25 UTC (rev 7727)
+++ mpich2/trunk/src/mpid/ch3/src/ch3u_handle_connection.c 2011-01-13 23:30:20 UTC (rev 7728)
@@ -396,6 +396,8 @@
++c; \
} while (0)
+#define ALLOC_STEP 10
+
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3U_Check_for_failed_procs
#undef FCNAME
@@ -405,10 +407,13 @@
int mpi_errno = MPI_SUCCESS;
int pmi_errno;
char *val;
+ int *attr_val = NULL, *ret;
char *c;
int len;
char *kvsname;
int rank, rank_hi;
+ int i;
+ int alloc_len;
MPIU_CHKLMEM_DECL(1);
MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_CHECK_FOR_FAILED_PROCS);
@@ -424,12 +429,22 @@
MPIU_DBG_MSG_S(CH3_DISCONNECT, TYPICAL, "Received proc fail notification: %s", val);
- if (*val == '\0')
+ if (*val == '\0') {
/* there are no failed processes */
+ attr_val = MPIU_Malloc(sizeof(int));
+ if (!attr_val) { MPIU_CHKMEM_SETERR(mpi_errno, sizeof(int), "attr_val"); goto fn_fail; }
+ *attr_val = MPI_PROC_NULL;
+ mpi_errno = MPIR_Comm_set_attr_impl(MPIR_Process.comm_world, MPICH_ATTR_FAILED_PROCESSES, attr_val, MPIR_ATTR_PTR);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
goto fn_exit;
+ }
+ attr_val = MPIU_Malloc(sizeof(int) * ALLOC_STEP);
+ alloc_len = ALLOC_STEP;
+
/* parse list of failed processes. This is a comma separated list
of ranks or ranges of ranks (e.g., "1, 3-5, 11") */
+ i = 0;
c = val;
while(1) {
parse_rank(&rank);
@@ -440,9 +455,19 @@
rank_hi = rank;
while (rank <= rank_hi) {
MPIDI_VC_t *vc;
+ /* terminate the VC */
MPIDI_PG_Get_vc(MPIDI_Process.my_pg, rank, &vc);
mpi_errno = MPIU_CALL(MPIDI_CH3,Connection_terminate(vc));
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ /* update the dead node attribute list */
+ if (alloc_len <= i) {
+ /* allocate more space */
+ ret = MPIU_Realloc(attr_val, sizeof(int) * (alloc_len + ALLOC_STEP));
+ if (!ret) { MPIU_CHKMEM_SETERR(mpi_errno, sizeof(int) * (alloc_len + ALLOC_STEP), "attr_val"); goto fn_fail; }
+ attr_val = ret;
+ }
+ attr_val[i] = rank;
+ ++i;
++rank;
}
MPIU_ERR_CHKINTERNAL(*c != ',' && *c != '\0', mpi_errno, "error parsing failed process list");
@@ -450,12 +475,25 @@
break;
++c; /* skip ',' */
}
+ /* terminate dead node attribute list with an MPI_PROC_NULL */
+ if (alloc_len <= i) {
+ /* allocate more space */
+ ret = MPIU_Realloc(attr_val, alloc_len + ALLOC_STEP);
+ if (!ret) { MPIU_CHKMEM_SETERR(mpi_errno, alloc_len + ALLOC_STEP, attr_val); goto fn_fail; }
+ attr_val = ret;
+ }
+ attr_val[i] = MPI_PROC_NULL;
+ mpi_errno = MPIR_Comm_set_attr_impl(MPIR_Process.comm_world, MPICH_ATTR_FAILED_PROCESSES, attr_val, MPIR_ATTR_PTR);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
fn_exit:
MPIU_CHKLMEM_FREEALL();
MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_CHECK_FOR_FAILED_PROCS);
return mpi_errno;
fn_fail:
+ if (attr_val)
+ MPIU_Free(attr_val);
goto fn_exit;
}
Modified: mpich2/trunk/src/mpid/ch3/src/mpid_finalize.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/src/mpid_finalize.c 2011-01-13 22:59:25 UTC (rev 7727)
+++ mpich2/trunk/src/mpid/ch3/src/mpid_finalize.c 2011-01-13 23:30:20 UTC (rev 7728)
@@ -93,7 +93,9 @@
* cancel it, in which case an error shouldn't be generated.
*/
-
+ MPIR_Comm_free_keyval_impl(MPICH_ATTR_FAILED_PROCESSES);
+ MPICH_ATTR_FAILED_PROCESSES = MPI_KEYVAL_INVALID;
+
#ifdef MPID_NEEDS_ICOMM_WORLD
mpi_errno = MPIR_Comm_release_always(MPIR_Process.icomm_world, 0);
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
Modified: mpich2/trunk/src/mpid/ch3/src/mpid_init.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/src/mpid_init.c 2011-01-13 22:59:25 UTC (rev 7727)
+++ mpich2/trunk/src/mpid/ch3/src/mpid_init.c 2011-01-13 23:30:20 UTC (rev 7728)
@@ -36,6 +36,9 @@
static int MPIDI_CH3I_PG_Compare_ids(void * id1, void * id2);
static int MPIDI_CH3I_PG_Destroy(MPIDI_PG_t * pg );
+int MPICH_ATTR_FAILED_PROCESSES = MPI_KEYVAL_INVALID;
+static int failed_procs_delete_fn(MPI_Comm comm, int keyval, void *attr_val, void *extra_data);
+
MPIDI_Process_t MPIDI_Process = { NULL };
MPIDI_CH3U_SRBuf_element_t * MPIDI_CH3U_SRBuf_pool = NULL;
@@ -53,6 +56,7 @@
int pg_size;
MPID_Comm * comm;
int p;
+ int *attr_val = NULL;
MPIDI_STATE_DECL(MPID_STATE_MPID_INIT);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_INIT);
@@ -276,6 +280,17 @@
MPICH_THREAD_LEVEL : requested;
}
+ /* create attribute to list failed processes */
+ mpi_errno = MPIR_Comm_create_keyval_impl(MPI_COMM_NULL_COPY_FN,
+ failed_procs_delete_fn,
+ &MPICH_ATTR_FAILED_PROCESSES, 0);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ attr_val = MPIU_Malloc(sizeof(int));
+ if (!attr_val) { MPIU_CHKMEM_SETERR(mpi_errno, sizeof(int), "attr_val"); goto fn_fail; }
+ *attr_val = MPI_PROC_NULL;
+ mpi_errno = MPIR_Comm_set_attr_impl(MPIR_Process.comm_world, MPICH_ATTR_FAILED_PROCESSES, attr_val, MPIR_ATTR_PTR);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_INIT);
return mpi_errno;
@@ -546,3 +561,17 @@
return MPI_SUCCESS;
}
+/* Delete callback registered for the MPICH_ATTR_FAILED_PROCESSES keyval: frees the attribute value and returns MPI_SUCCESS. */
+static int failed_procs_delete_fn(MPI_Comm comm ATTRIBUTE((unused)),
+ int keyval ATTRIBUTE((unused)),
+ void *attr_val, /* value stored under the keyval; released below */
+ void *extra_data ATTRIBUTE((unused)))
+{
+ MPIU_UNREFERENCED_ARG(comm);
+ MPIU_UNREFERENCED_ARG(keyval);
+ MPIU_UNREFERENCED_ARG(extra_data);
+ /* the values set for this keyval in this patch are obtained from MPIU_Malloc/MPIU_Realloc */
+ MPIU_Free(attr_val);
+ return MPI_SUCCESS;
+}
+
More information about the mpich2-commits mailing list