[mpich2-commits] r7728 - in mpich2/trunk/src/mpid/ch3: include src

buntinas at mcs.anl.gov buntinas at mcs.anl.gov
Thu Jan 13 17:30:20 CST 2011


Author: buntinas
Date: 2011-01-13 17:30:20 -0600 (Thu, 13 Jan 2011)
New Revision: 7728

Modified:
   mpich2/trunk/src/mpid/ch3/include/mpidimpl.h
   mpich2/trunk/src/mpid/ch3/src/ch3u_handle_connection.c
   mpich2/trunk/src/mpid/ch3/src/mpid_finalize.c
   mpich2/trunk/src/mpid/ch3/src/mpid_init.c
Log:
Added attribute to COMM_WORLD to list failed processes.  The attribute is called MPICH_ATTR_FAILED_PROCESSES, and should be declared 'extern' in the application. (I'm not sure about the implications of adding a symbol to mpi.h on the ABI.)

Modified: mpich2/trunk/src/mpid/ch3/include/mpidimpl.h
===================================================================
--- mpich2/trunk/src/mpid/ch3/include/mpidimpl.h	2011-01-13 22:59:25 UTC (rev 7727)
+++ mpich2/trunk/src/mpid/ch3/include/mpidimpl.h	2011-01-13 23:30:20 UTC (rev 7728)
@@ -1429,7 +1429,10 @@
    MPID_Comm_connect and accept */
 int MPIDI_CH3_Connect_to_root(const char *, MPIDI_VC_t **);
 
+/* keyval for COMM_WORLD attribute holding list of failed processes */
+extern int MPICH_ATTR_FAILED_PROCESSES;
 
+
 /*
  * Channel utility prototypes
  */

Modified: mpich2/trunk/src/mpid/ch3/src/ch3u_handle_connection.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/src/ch3u_handle_connection.c	2011-01-13 22:59:25 UTC (rev 7727)
+++ mpich2/trunk/src/mpid/ch3/src/ch3u_handle_connection.c	2011-01-13 23:30:20 UTC (rev 7728)
@@ -396,6 +396,8 @@
             ++c;                                                                                \
     } while (0)
 
+#define ALLOC_STEP 10 /* initial capacity, and growth increment, of the attr_val rank list; used as a count of int elements */
+
 #undef FUNCNAME
 #define FUNCNAME MPIDI_CH3U_Check_for_failed_procs
 #undef FCNAME
@@ -405,10 +407,13 @@
     int mpi_errno = MPI_SUCCESS;
     int pmi_errno;
     char *val;
+    int *attr_val = NULL, *ret;
     char *c;
     int len;
     char *kvsname;
     int rank, rank_hi;
+    int i;
+    int alloc_len;
     MPIU_CHKLMEM_DECL(1);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_CHECK_FOR_FAILED_PROCS);
 
@@ -424,12 +429,22 @@
 
     MPIU_DBG_MSG_S(CH3_DISCONNECT, TYPICAL, "Received proc fail notification: %s", val);
     
-    if (*val == '\0')
+    if (*val == '\0') {
         /* there are no failed processes */
+        attr_val = MPIU_Malloc(sizeof(int));
+        if (!attr_val) { MPIU_CHKMEM_SETERR(mpi_errno, sizeof(int), "attr_val"); goto fn_fail; }
+        *attr_val = MPI_PROC_NULL;
+        mpi_errno = MPIR_Comm_set_attr_impl(MPIR_Process.comm_world, MPICH_ATTR_FAILED_PROCESSES, attr_val, MPIR_ATTR_PTR);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
         goto fn_exit;
+    }
 
+    attr_val = MPIU_Malloc(sizeof(int) * ALLOC_STEP);
+    alloc_len = ALLOC_STEP;
+    
     /* parse list of failed processes.  This is a comma separated list
        of ranks or ranges of ranks (e.g., "1, 3-5, 11") */
+    i = 0;
     c = val;
     while(1) {
         parse_rank(&rank);
@@ -440,9 +455,19 @@
             rank_hi = rank;
         while (rank <= rank_hi) {
             MPIDI_VC_t *vc;
+            /* terminate the VC */
             MPIDI_PG_Get_vc(MPIDI_Process.my_pg, rank, &vc);
             mpi_errno = MPIU_CALL(MPIDI_CH3,Connection_terminate(vc));
             if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            /* update the dead node attribute list */
+            if (alloc_len <= i) {
+                /* allocate more space */
+                ret = MPIU_Realloc(attr_val, sizeof(int) * (alloc_len + ALLOC_STEP));
+                if (!ret) { MPIU_CHKMEM_SETERR(mpi_errno, sizeof(int) * (alloc_len + ALLOC_STEP), "attr_val"); goto fn_fail; }
+                attr_val = ret;
+            }
+            attr_val[i] = rank;
+            ++i;
             ++rank;
         }
         MPIU_ERR_CHKINTERNAL(*c != ',' && *c != '\0', mpi_errno, "error parsing failed process list");
@@ -450,12 +475,25 @@
             break;
         ++c; /* skip ',' */
     }
+    /* terminate dead node attribute list with an MPI_PROC_NULL */
+    if (alloc_len <= i) {
+        /* allocate more space */
+        ret = MPIU_Realloc(attr_val, alloc_len + ALLOC_STEP);
+        if (!ret) { MPIU_CHKMEM_SETERR(mpi_errno, alloc_len + ALLOC_STEP, attr_val); goto fn_fail; }
+        attr_val = ret;
+    }
+    attr_val[i] = MPI_PROC_NULL;
 
+    mpi_errno = MPIR_Comm_set_attr_impl(MPIR_Process.comm_world, MPICH_ATTR_FAILED_PROCESSES, attr_val, MPIR_ATTR_PTR);
+    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
  fn_exit:
     MPIU_CHKLMEM_FREEALL();
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_CHECK_FOR_FAILED_PROCS);
     return mpi_errno;
  fn_fail:
+    if (attr_val)
+        MPIU_Free(attr_val);
     goto fn_exit;
 }
 

Modified: mpich2/trunk/src/mpid/ch3/src/mpid_finalize.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/src/mpid_finalize.c	2011-01-13 22:59:25 UTC (rev 7727)
+++ mpich2/trunk/src/mpid/ch3/src/mpid_finalize.c	2011-01-13 23:30:20 UTC (rev 7728)
@@ -93,7 +93,9 @@
       *    cancel it, in which case an error shouldn't be generated.
       */
     
-
+    MPIR_Comm_free_keyval_impl(MPICH_ATTR_FAILED_PROCESSES);
+    MPICH_ATTR_FAILED_PROCESSES = MPI_KEYVAL_INVALID;
+    
 #ifdef MPID_NEEDS_ICOMM_WORLD
     mpi_errno = MPIR_Comm_release_always(MPIR_Process.icomm_world, 0);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);

Modified: mpich2/trunk/src/mpid/ch3/src/mpid_init.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/src/mpid_init.c	2011-01-13 22:59:25 UTC (rev 7727)
+++ mpich2/trunk/src/mpid/ch3/src/mpid_init.c	2011-01-13 23:30:20 UTC (rev 7728)
@@ -36,6 +36,9 @@
 static int MPIDI_CH3I_PG_Compare_ids(void * id1, void * id2);
 static int MPIDI_CH3I_PG_Destroy(MPIDI_PG_t * pg );
 
+int MPICH_ATTR_FAILED_PROCESSES = MPI_KEYVAL_INVALID; /* keyval of the COMM_WORLD attribute listing failed processes; invalid until the keyval is created further down in this file */
+static int failed_procs_delete_fn(MPI_Comm comm, int keyval, void *attr_val, void *extra_data); /* delete callback passed when that keyval is created; defined at the end of this file */
+
 MPIDI_Process_t MPIDI_Process = { NULL };
 MPIDI_CH3U_SRBuf_element_t * MPIDI_CH3U_SRBuf_pool = NULL;
 
@@ -53,6 +56,7 @@
     int pg_size;
     MPID_Comm * comm;
     int p;
+    int *attr_val = NULL;
     MPIDI_STATE_DECL(MPID_STATE_MPID_INIT);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_INIT);
@@ -276,6 +280,17 @@
 	    MPICH_THREAD_LEVEL : requested;
     }
 
+    /* create attribute to list failed processes */
+    mpi_errno = MPIR_Comm_create_keyval_impl(MPI_COMM_NULL_COPY_FN,
+                                             failed_procs_delete_fn,
+                                             &MPICH_ATTR_FAILED_PROCESSES, 0);
+    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    attr_val = MPIU_Malloc(sizeof(int));
+    if (!attr_val) { MPIU_CHKMEM_SETERR(mpi_errno, sizeof(int), "attr_val"); goto fn_fail; }
+    *attr_val = MPI_PROC_NULL;
+    mpi_errno = MPIR_Comm_set_attr_impl(MPIR_Process.comm_world, MPICH_ATTR_FAILED_PROCESSES, attr_val, MPIR_ATTR_PTR);
+    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    
   fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_INIT);
     return mpi_errno;
@@ -546,3 +561,17 @@
     
     return MPI_SUCCESS;
 }
+
+static int failed_procs_delete_fn(MPI_Comm comm ATTRIBUTE((unused)), /* delete callback registered with the MPICH_ATTR_FAILED_PROCESSES keyval; comm is ignored */
+                                  int keyval ATTRIBUTE((unused)), /* ignored */
+                                  void *attr_val, /* attribute value to release: the int list stored under the keyval */
+                                  void *extra_data ATTRIBUTE((unused))) /* ignored */
+{
+    MPIU_UNREFERENCED_ARG(comm);
+    MPIU_UNREFERENCED_ARG(keyval);
+    MPIU_UNREFERENCED_ARG(extra_data);
+
+    MPIU_Free(attr_val); /* every value set for this attribute in this commit comes from MPIU_Malloc/MPIU_Realloc, so it is released with MPIU_Free */
+    return MPI_SUCCESS; /* unconditionally reports success */
+}
+



More information about the mpich2-commits mailing list