[mpich2-commits] r7819 - in mpich2/trunk/src: mpi/coll util/param

buntinas at mcs.anl.gov buntinas at mcs.anl.gov
Mon Jan 24 17:15:45 CST 2011


Author: buntinas
Date: 2011-01-24 17:15:45 -0600 (Mon, 24 Jan 2011)
New Revision: 7819

Modified:
   mpich2/trunk/src/mpi/coll/helper_fns.c
   mpich2/trunk/src/util/param/params.yml
Log:
Added a parameter to enable collective error returns; it is disabled by default.

Modified: mpich2/trunk/src/mpi/coll/helper_fns.c
===================================================================
--- mpich2/trunk/src/mpi/coll/helper_fns.c	2011-01-24 20:59:23 UTC (rev 7818)
+++ mpich2/trunk/src/mpi/coll/helper_fns.c	2011-01-24 23:15:45 UTC (rev 7819)
@@ -584,7 +584,7 @@
 
     MPIU_DBG_MSG_S(PT2PT, TYPICAL, "IN: errflag = %s", *errflag?"TRUE":"FALSE");
 
-    if (*errflag)
+    if (*errflag && MPIR_PARAM_ENABLE_COLL_FT_RET)
         mpi_errno = MPIC_Send(buf, count, datatype, dest, MPIR_ERROR_TAG, comm);
     else
         mpi_errno = MPIC_Send(buf, count, datatype, dest, tag, comm);
@@ -611,6 +611,11 @@
 
     MPIU_DBG_MSG_S(PT2PT, TYPICAL, "IN: errflag = %s", *errflag?"TRUE":"FALSE");
 
+    if (!MPIR_PARAM_ENABLE_COLL_FT_RET) {
+            mpi_errno = MPIC_Recv(buf, count, datatype, source, tag, comm, status);
+            goto fn_exit;
+    }
+    
     if (status == MPI_STATUS_IGNORE)
         status = &mystatus;
     
@@ -650,7 +655,7 @@
 
     MPIU_DBG_MSG_S(PT2PT, TYPICAL, "IN: errflag = %s", *errflag?"TRUE":"FALSE");
     
-    if (*errflag)
+    if (*errflag && MPIR_PARAM_ENABLE_COLL_FT_RET)
         mpi_errno = MPIC_Ssend(buf, count, datatype, dest, MPIR_ERROR_TAG, comm);
     else
         mpi_errno = MPIC_Ssend(buf, count, datatype, dest, tag, comm);
@@ -679,6 +684,13 @@
 
     MPIU_DBG_MSG_S(PT2PT, TYPICAL, "IN: errflag = %s", *errflag?"TRUE":"FALSE");
 
+    if (!MPIR_PARAM_ENABLE_COLL_FT_RET) {
+        mpi_errno = MPIC_Sendrecv(sendbuf, sendcount, sendtype, dest, sendtag,
+                                  recvbuf, recvcount, recvtype, source, recvtag,
+                                  comm, status);
+        goto fn_exit;
+    }
+    
     if (status == MPI_STATUS_IGNORE)
         status = &mystatus;
     
@@ -728,13 +740,21 @@
 
     MPIU_DBG_MSG_S(PT2PT, TYPICAL, "IN: errflag = %s", *errflag?"TRUE":"FALSE");
 
+    if (!MPIR_PARAM_ENABLE_COLL_FT_RET) {
+        mpi_errno = MPIC_Sendrecv_replace(buf, count, datatype,
+                                          dest, sendtag,
+                                          source, recvtag,
+                                          comm, status);
+        goto fn_exit;
+    }
+
     if (status == MPI_STATUS_IGNORE)
         status = &mystatus;
     
     if (*errflag) {
         mpi_errno = MPIC_Sendrecv_replace(buf, count, datatype,
                                           dest, MPIR_ERROR_TAG,
-                                          source, recvtag,
+                                          source, MPI_ANY_TAG,
                                           comm, status);
         goto fn_exit;
     }
@@ -775,7 +795,7 @@
 
     MPIU_DBG_MSG_S(PT2PT, TYPICAL, "IN: errflag = %s", *errflag?"TRUE":"FALSE");
 
-    if (*errflag)
+    if (*errflag && MPIR_PARAM_ENABLE_COLL_FT_RET)
         mpi_errno = MPIC_Isend(buf, count, datatype, dest, MPIR_ERROR_TAG, comm, request);
     else
         mpi_errno = MPIC_Isend(buf, count, datatype, dest, tag, comm, request);
@@ -799,7 +819,10 @@
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIC_IRECV_FT);
 
-    mpi_errno = MPIC_Irecv(buf, count, datatype, source, MPI_ANY_TAG, comm, request);
+    if (MPIR_PARAM_ENABLE_COLL_FT_RET)
+        mpi_errno = MPIC_Irecv(buf, count, datatype, source, MPI_ANY_TAG, comm, request);
+    else
+        mpi_errno = MPIC_Irecv(buf, count, datatype, source, tag, comm, request);
 
  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIC_IRECV_FT);
@@ -828,7 +851,7 @@
     mpi_errno = MPIR_Waitall_impl(numreq, requests, statuses);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
-    if (*errflag)
+    if (*errflag || !MPIR_PARAM_ENABLE_COLL_FT_RET)
         goto fn_exit;
 
     for (i = 0; i < numreq; ++i) {

Modified: mpich2/trunk/src/util/param/params.yml
===================================================================
--- mpich2/trunk/src/util/param/params.yml	2011-01-24 20:59:23 UTC (rev 7818)
+++ mpich2/trunk/src/util/param/params.yml	2011-01-24 23:15:45 UTC (rev 7819)
@@ -32,6 +32,8 @@
       description : parameters relevant to the "MPIR" debugger interface
     - name        : checkpointing
       description : parameters relevant to checkpointing
+    - name        : fault_tolerance
+      description : parameters that control fault tolerance behavior
     - name        : threads
       description : multi-threading parameters
     - name        : nemesis
@@ -276,6 +278,19 @@
         checkpointing library cannot be initialized.
 
   ##############################################################
+    # fault-tolerance parameters
+    - category    : fault_tolerance
+      name        : ENABLE_COLL_FT_RET
+      type        : boolean
+      default     : false
+      description : >-
+        Collectives called on a communicator with a failed process
+        should not hang, however the result of the operation may be
+        invalid even though the function returns MPI_SUCCESS.  This
+        option enables an experimental feature that will return an
+        error if the result of the collective is invalid.
+
+  ##############################################################
     # memory parameters
     - category    : memory
       name        : ABORT_ON_LEAKED_HANDLES



More information about the mpich2-commits mailing list