[mpich2-commits] r7723 - in mpich2/branches/release/mpich2-1.3.x: . confdb maint src/include src/mpi/coll src/mpi/errhan src/mpid src/mpid/ch3/channels/nemesis/include src/mpid/ch3/channels/nemesis/nemesis/include src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp src/mpid/ch3/channels/nemesis/nemesis/src src/mpid/ch3/channels/nemesis/src src/mpid/ch3/channels/sctp/include src/mpid/ch3/channels/sock/include src/mpid/ch3/include src/mpid/ch3/src src/mpl/src src/pm/hydra src/pm/hydra/examples src/pm/hydra/include src/pm/hydra/pm src/pm/hydra/tools src/pm/hydra/tools/bootstrap/external src/pm/hydra/tools/bootstrap/src src/pm/hydra/tools/bootstrap/utils src/pm/hydra/ui src/pm/hydra/utils

buntinas at mcs.anl.gov buntinas at mcs.anl.gov
Thu Jan 13 16:17:25 CST 2011


Author: buntinas
Date: 2011-01-13 16:17:24 -0600 (Thu, 13 Jan 2011)
New Revision: 7723

Modified:
   mpich2/branches/release/mpich2-1.3.x/
   mpich2/branches/release/mpich2-1.3.x/confdb/
   mpich2/branches/release/mpich2-1.3.x/maint/Version
   mpich2/branches/release/mpich2-1.3.x/src/include/mpiimpl.h
   mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allgather.c
   mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allgatherv.c
   mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allreduce.c
   mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/alltoall.c
   mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/alltoallw.c
   mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/barrier.c
   mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/bcast.c
   mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/exscan.c
   mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/gather.c
   mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/gatherv.c
   mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/red_scat.c
   mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/red_scat_block.c
   mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/reduce.c
   mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scan.c
   mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scatter.c
   mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scatterv.c
   mpich2/branches/release/mpich2-1.3.x/src/mpi/errhan/errnames.txt
   mpich2/branches/release/mpich2-1.3.x/src/mpid/
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_post.h
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_pre.h
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/include/mpid_nem_inline.h
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/socksm.c
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_impl.h
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_init.c
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_send.c
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/src/mpid_nem_ckpt.c
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_isend.c
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_isendv.c
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_istartmsg.c
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_istartmsgv.c
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_progress.c
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/sctp/include/mpidi_ch3_pre.h
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/sock/include/mpidi_ch3_pre.h
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/include/mpidimpl.h
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/include/mpidpost.h
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/ch3u_handle_connection.c
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/ch3u_recvq.c
   mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/mpid_vc.c
   mpich2/branches/release/mpich2-1.3.x/src/mpl/src/mplstr.c
   mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/
   mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/Makefile.am
   mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/README
   mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/autogen.sh
   mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/configure.in
   mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/examples/
   mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/hydra-doxygen.cfg.in
   mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/include/
   mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/mpich2prereq
   mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/pm/
   mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/tools/
   mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/tools/bootstrap/external/slurm_query_proxy_id.c
   mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/tools/bootstrap/src/bsci_query_proxy_id.c
   mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/tools/bootstrap/utils/bscu_query_proxy_id.c
   mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/ui/
   mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/utils/
   mpich2/branches/release/mpich2-1.3.x/winconfigure.wsf
Log:
merged changesets 7604, 7671, 7674, 7683, 7685, 7687, 7701, 7702, 7720 and 7722 from trunk into 1.3.x


Property changes on: mpich2/branches/release/mpich2-1.3.x
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ckpt:5050
/mpich2/branches/dev/ckpt2:5057-6537
/mpich2/branches/dev/ftb:5661-5730
/mpich2/branches/dev/lapi:5817
/mpich2/branches/dev/wintcp_async_progress:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2:5406
/mpich2/trunk:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/ckpt:5050
/mpich2/branches/dev/ckpt2:5057-6537
/mpich2/branches/dev/error-return:7662-7670
/mpich2/branches/dev/ftb:5661-5730
/mpich2/branches/dev/lapi:5817
/mpich2/branches/dev/wintcp_async_progress:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2:5406
/mpich2/trunk:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722


Property changes on: mpich2/branches/release/mpich2-1.3.x/confdb
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ckpt2/confdb:5180,5182,5196,5198
/mpich2/branches/dev/ftb/confdb:5661-5730
/mpich2/branches/dev/lapi/confdb:5817
/mpich2/branches/dev/wintcp_async_progress/confdb:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/confdb:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/confdb:5406
/mpich2/trunk/confdb:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/ckpt2/confdb:5180,5182,5196,5198
/mpich2/branches/dev/error-return/confdb:7662-7670
/mpich2/branches/dev/ftb/confdb:5661-5730
/mpich2/branches/dev/lapi/confdb:5817
/mpich2/branches/dev/wintcp_async_progress/confdb:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/confdb:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/confdb:5406
/mpich2/trunk/confdb:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722


Property changes on: mpich2/branches/release/mpich2-1.3.x/maint/Version
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ckpt/maint/Version:5050
/mpich2/branches/dev/ckpt2/maint/Version:5057-6537
/mpich2/branches/dev/ftb/maint/Version:5661-5730
/mpich2/branches/dev/lapi/maint/Version:5817
/mpich2/branches/dev/wintcp_async_progress/maint/Version:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/maint/Version:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/maint/Version:5406
/mpich2/trunk/maint/Version:7422-7425,7429-7433,7435,7437-7438,7442-7447,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
/mpich2/trunk/src/pm/hydra/VERSION:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7447
   + /mpich2/branches/dev/ckpt/maint/Version:5050
/mpich2/branches/dev/ckpt2/maint/Version:5057-6537
/mpich2/branches/dev/ftb/maint/Version:5661-5730
/mpich2/branches/dev/lapi/maint/Version:5817
/mpich2/branches/dev/wintcp_async_progress/maint/Version:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/maint/Version:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/maint/Version:5406
/mpich2/trunk/maint/Version:7422-7425,7429-7433,7435,7437-7438,7442-7447,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
/mpich2/trunk/src/pm/hydra/VERSION:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7447,7604,7671,7674,7683,7685,7687,7701-7702,7720,7722

Modified: mpich2/branches/release/mpich2-1.3.x/src/include/mpiimpl.h
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/include/mpiimpl.h	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/include/mpiimpl.h	2011-01-13 22:17:24 UTC (rev 7723)
@@ -3116,6 +3116,11 @@
 int MPID_VCRT_Get_ptr(MPID_VCRT vcrt, MPID_VCR **vc_pptr);
 
 /*@
+  MPID_VCRT_Contains_failed_vc - returns TRUE iff a VC in this VCRT is in MORIBUND state
+  @*/
+int MPID_VCRT_Contains_failed_vc(MPID_VCRT vcrt);
+
+/*@
   MPID_VCR_Dup - Create a duplicate reference to a virtual connection
   @*/
 int MPID_VCR_Dup(MPID_VCR orig_vcr, MPID_VCR * new_vcr);

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allgather.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allgather.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allgather.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -83,11 +83,12 @@
     MPI_Datatype recvtype, 
     MPID_Comm *comm_ptr )
 {
-    int        comm_size, rank;
-    int        mpi_errno = MPI_SUCCESS;
+    int comm_size, rank;
+    int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     MPI_Aint   recvtype_extent, tot_bytes;
     MPI_Aint recvtype_true_extent, recvbuf_extent, recvtype_true_lb;
-    int        j, i, pof2, src, rem;
+    int j, i, pof2, src, rem;
     void *tmp_buf = NULL;
     int curr_cnt, dst, type_size, left, right, jnext;
     MPI_Comm comm;
@@ -173,11 +174,13 @@
 					      (comm_size-dst_tree_root)*recvcount,
                                               recvtype, dst,
                                               MPIR_ALLGATHER_TAG, comm, &status);
-		    if (mpi_errno) { 
-			MPIU_ERR_POP(mpi_errno);
-		    }
-                    
-                    MPIR_Get_count_impl(&status, recvtype, &last_recv_cnt);
+		    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                        last_recv_cnt = 0;
+		    } else
+                        MPIR_Get_count_impl(&status, recvtype, &last_recv_cnt);
                     curr_cnt += last_recv_cnt;
                 }
                 
@@ -235,9 +238,11 @@
                             /* last_recv_cnt was set in the previous
                                receive. that's the amount of data to be
                                sent now. */
-			    if (mpi_errno) { 
-				MPIU_ERR_POP(mpi_errno);
-			    }
+                            if (mpi_errno) {
+                                /* for communication errors, just record the error but continue */
+                                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                            }
                         }
                         /* recv only if this proc. doesn't have data and sender
                            has data */
@@ -251,10 +256,13 @@
                                                   comm, &status); 
                             /* nprocs_completed is also equal to the
                                no. of processes whose data we don't have */
-			    if (mpi_errno) { 
-				MPIU_ERR_POP(mpi_errno);
-			    }
-                            MPIR_Get_count_impl(&status, recvtype, &last_recv_cnt);
+                            if (mpi_errno) {
+                                /* for communication errors, just record the error but continue */
+                                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                                last_recv_cnt = 0;
+                            } else
+                                MPIR_Get_count_impl(&status, recvtype, &last_recv_cnt);
                             curr_cnt += last_recv_cnt;
                         }
                         tmp_mask >>= 1;
@@ -331,11 +339,13 @@
 					      tmp_buf_size - recv_offset,
                                               MPI_BYTE, dst,
                                               MPIR_ALLGATHER_TAG, comm, &status);
-		    if (mpi_errno) { 
-			MPIU_ERR_POP(mpi_errno);
-		    }
-                    
-                    MPIR_Get_count_impl(&status, MPI_BYTE, &last_recv_cnt);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                        last_recv_cnt = 0;
+                    } else
+                        MPIR_Get_count_impl(&status, MPI_BYTE, &last_recv_cnt);
                     curr_cnt += last_recv_cnt;
                 }
                 
@@ -382,8 +392,12 @@
                             mpi_errno = MPIC_Send(((char *)tmp_buf + offset),
                                                   last_recv_cnt, MPI_BYTE,
                                                   dst, MPIR_ALLGATHER_TAG,
-                                                  comm);  
-                            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                                                  comm);
+                            if (mpi_errno) {
+                                /* for communication errors, just record the error but continue */
+                                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                            }
                             /* last_recv_cnt was set in the previous
                                receive. that's the amount of data to be
                                sent now. */
@@ -398,10 +412,15 @@
                                                   MPI_BYTE, dst,
                                                   MPIR_ALLGATHER_TAG,
                                                   comm, &status); 
-                            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                             /* nprocs_completed is also equal to the
                                no. of processes whose data we don't have */
-                            MPIR_Get_count_impl(&status, MPI_BYTE, &last_recv_cnt);
+                            if (mpi_errno) {
+                                /* for communication errors, just record the error but continue */
+                                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                                last_recv_cnt = 0;
+                            } else
+                                MPIR_Get_count_impl(&status, MPI_BYTE, &last_recv_cnt);
                             curr_cnt += last_recv_cnt;
                         }
                         tmp_mask >>= 1;
@@ -469,10 +488,11 @@
                                       curr_cnt, recvtype,
                                       src, MPIR_ALLGATHER_TAG, comm,
                                       MPI_STATUS_IGNORE);
-	    if (mpi_errno) { 
-		MPIU_ERR_POP(mpi_errno);
-	    }
-
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
             curr_cnt *= 2;
             pof2 *= 2;
         }
@@ -490,9 +510,11 @@
                                       rem * recvcount, recvtype,
                                       src, MPIR_ALLGATHER_TAG, comm,
                                       MPI_STATUS_IGNORE);
-	    if (mpi_errno) { 
-		MPIU_ERR_POP(mpi_errno);
-	    }
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
 
         /* Rotate blocks in tmp_buf down by (rank) blocks and store
@@ -549,9 +571,11 @@
                                       recvcount, recvtype, left, 
                                       MPIR_ALLGATHER_TAG, comm,
                                       MPI_STATUS_IGNORE);
-	    if (mpi_errno) { 
-		MPIU_ERR_POP(mpi_errno);
-	    }
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
             j	    = jnext;
             jnext = (comm_size + jnext - 1) % comm_size;
         }
@@ -560,8 +584,10 @@
  fn_exit:
     MPIU_CHKLMEM_FREEALL();
     /* check if multiple threads are calling this collective function */
-    MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );    
-    return (mpi_errno);
+    MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
+    return mpi_errno;
 
  fn_fail:
     goto fn_exit;
@@ -590,6 +616,7 @@
     */
 
     int rank, local_size, remote_size, mpi_errno = MPI_SUCCESS, root;
+    int mpi_errno_ret = MPI_SUCCESS;
     MPI_Aint true_extent, true_lb = 0, extent, send_extent;
     void *tmp_buf=NULL;
     MPID_Comm *newcomm_ptr = NULL;
@@ -624,9 +651,11 @@
     if (sendcount != 0) {
         mpi_errno = MPIR_Gather_impl(sendbuf, sendcount, sendtype, tmp_buf, sendcount,
                                      sendtype, 0, newcomm_ptr);
-	if (mpi_errno) { 
-	    MPIU_ERR_POP(mpi_errno);
-	}
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
 
     /* first broadcast from left to right group, then from right to
@@ -637,9 +666,11 @@
             root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
             mpi_errno = MPIR_Bcast_inter(tmp_buf, sendcount*local_size,
                                          sendtype, root, comm_ptr);
-	    if (mpi_errno) { 
-		MPIU_ERR_POP(mpi_errno);
-	    }
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
 
         /* receive bcast from right */
@@ -647,9 +678,11 @@
             root = 0;
             mpi_errno = MPIR_Bcast_inter(recvbuf, recvcount*remote_size,
                                          recvtype, root, comm_ptr);
-	    if (mpi_errno) { 
-		MPIU_ERR_POP(mpi_errno);
-	    }
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
     }
     else {
@@ -658,9 +691,11 @@
             root = 0;
             mpi_errno = MPIR_Bcast_inter(recvbuf, recvcount*remote_size,
                                          recvtype, root, comm_ptr);
-	    if (mpi_errno) { 
-		MPIU_ERR_POP(mpi_errno);
-	    }
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
 
         /* bcast to left */
@@ -668,14 +703,18 @@
             root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
             mpi_errno = MPIR_Bcast_inter(tmp_buf, sendcount*local_size,
                                          sendtype, root, comm_ptr);
-	    if (mpi_errno) { 
-		MPIU_ERR_POP(mpi_errno);
-	    }
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
     }
 
-  fn_exit:    
+  fn_exit:
     MPIU_CHKLMEM_FREEALL();
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
 
   fn_fail:

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allgatherv.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allgatherv.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allgatherv.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -79,7 +79,8 @@
 {
     MPI_Comm comm;
     int        comm_size, rank, j, i, left, right;
-    int        mpi_errno = MPI_SUCCESS;
+    int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     MPI_Status status;
     MPI_Aint recvbuf_extent, recvtype_extent, recvtype_true_extent, 
 	recvtype_true_lb;
@@ -191,11 +192,15 @@
                                               total_count - recv_offset, recvtype, dst,
                                               MPIR_ALLGATHERV_TAG,
                                               comm, &status);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-                    /* for convenience, recv is posted for a bigger amount
-                       than will be sent */ 
-                    
-                    MPIR_Get_count_impl(&status, recvtype, &last_recv_cnt);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                        last_recv_cnt = 0;
+                    } else
+                        /* for convenience, recv is posted for a bigger amount
+                           than will be sent */
+                        MPIR_Get_count_impl(&status, recvtype, &last_recv_cnt);
                     curr_cnt += last_recv_cnt;
                 }
                 
@@ -254,7 +259,11 @@
                                                   last_recv_cnt,
                                                   recvtype, dst,
                                                   MPIR_ALLGATHERV_TAG, comm);
-                            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                            if (mpi_errno) {
+                                /* for communication errors, just record the error but continue */
+                                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                            }
                             /* last_recv_cnt was set in the previous
                                receive. that's the amount of data to be
                                sent now. */
@@ -273,11 +282,15 @@
                                                   total_count - offset, recvtype,
                                                   dst, MPIR_ALLGATHERV_TAG,
                                                   comm, &status);
-                            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-                            /* for convenience, recv is posted for a
-                               bigger amount than will be sent */ 
-                            
-                            MPIR_Get_count_impl(&status, recvtype, &last_recv_cnt);
+                            if (mpi_errno) {
+                                /* for communication errors, just record the error but continue */
+                                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                                last_recv_cnt = 0;
+                            } else
+                                /* for convenience, recv is posted for a
+                                   bigger amount than will be sent */
+                                MPIR_Get_count_impl(&status, recvtype, &last_recv_cnt);
                             curr_cnt += last_recv_cnt;
                         }
                         tmp_mask >>= 1;
@@ -377,11 +390,15 @@
                                               ((char *)tmp_buf + recv_offset),
                                               tmp_buf_size-recv_offset, MPI_BYTE, dst,
                                               MPIR_ALLGATHERV_TAG, comm, &status);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-                    /* for convenience, recv is posted for a bigger amount
-                       than will be sent */ 
-                    
-                    MPIR_Get_count_impl(&status, MPI_BYTE, &last_recv_cnt);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                        last_recv_cnt = 0;
+                    } else
+                        /* for convenience, recv is posted for a bigger amount
+                           than will be sent */
+                        MPIR_Get_count_impl(&status, MPI_BYTE, &last_recv_cnt);
                     curr_cnt += last_recv_cnt;
                 }
                 
@@ -432,7 +449,11 @@
                                                   last_recv_cnt, MPI_BYTE,
                                                   dst, MPIR_ALLGATHERV_TAG,
                                                   comm);
-                            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                            if (mpi_errno) {
+                                /* for communication errors, just record the error but continue */
+                                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                            }
                             /* last_recv_cnt was set in the previous
                                receive. that's the amount of data to be
                                sent now. */
@@ -447,10 +468,15 @@
                                                   dst,
                                                   MPIR_ALLGATHERV_TAG,
                                                   comm, &status);
-                            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-                            /* for convenience, recv is posted for a bigger amount
-                               than will be sent */ 
-                            MPIR_Get_count_impl(&status, MPI_BYTE, &last_recv_cnt);
+                            if (mpi_errno) {
+                                /* for communication errors, just record the error but continue */
+                                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                                last_recv_cnt = 0;
+                            } else
+                                /* for convenience, recv is posted for a bigger amount
+                                   than will be sent */ 
+                                MPIR_Get_count_impl(&status, MPI_BYTE, &last_recv_cnt);
                             curr_cnt += last_recv_cnt;
                         }
                         tmp_mask >>= 1;
@@ -523,9 +549,13 @@
                                   ((char *)tmp_buf + curr_cnt*recvtype_extent),
                                       total_count - curr_cnt, recvtype,
                                       src, MPIR_ALLGATHERV_TAG, comm, &status);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
-            MPIR_Get_count_impl(&status, recvtype, &recv_cnt);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                recv_cnt = 0;
+            } else
+                MPIR_Get_count_impl(&status, recvtype, &recv_cnt);
             curr_cnt += recv_cnt;
 
             pof2 *= 2;
@@ -548,7 +578,11 @@
                                       total_count - curr_cnt, recvtype,
                                       src, MPIR_ALLGATHERV_TAG, comm,
                                       MPI_STATUS_IGNORE);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
 
         /* Rotate blocks in tmp_buf down by (rank) blocks and store
@@ -631,19 +665,31 @@
 	    }
 	    else if (!sendnow) { /* If there's no data to send, just do a recv call */
 		mpi_errno = MPIC_Recv(rbuf, recvnow, recvtype, left, MPIR_ALLGATHERV_TAG, comm, &status);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
 		torecv -= recvnow;
 	    }
 	    else if (!recvnow) { /* If there's no data to receive, just do a send call */
 		mpi_errno = MPIC_Send(sbuf, sendnow, recvtype, right, MPIR_ALLGATHERV_TAG, comm);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
 		tosend -= sendnow;
 	    }
 	    else { /* There's data to be sent and received */
 		mpi_errno = MPIC_Sendrecv(sbuf, sendnow, recvtype, right, MPIR_ALLGATHERV_TAG, 
 					  rbuf, recvnow, recvtype, left, MPIR_ALLGATHERV_TAG,
 					  comm, &status);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
 		tosend -= sendnow;
 		torecv -= recvnow;
 	    }
@@ -665,6 +711,8 @@
     MPIU_CHKLMEM_FREEALL();
   /* check if multiple threads are calling this collective function */
     MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
  fn_fail:
     goto fn_exit;
@@ -696,6 +744,7 @@
    and then does an intracommunicator broadcast. 
 */
     int remote_size, mpi_errno, root, rank;
+    int mpi_errno_ret = MPI_SUCCESS;
     MPID_Comm *newcomm_ptr = NULL;
     MPI_Datatype newtype = MPI_DATATYPE_NULL;
 
@@ -710,13 +759,21 @@
         mpi_errno = MPIR_Gatherv_impl(sendbuf, sendcount, sendtype, recvbuf,
                                       recvcounts, displs, recvtype, root,
                                       comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
         /* gatherv to right group */
         root = 0;
         mpi_errno = MPIR_Gatherv_impl(sendbuf, sendcount, sendtype, recvbuf,
                                       recvcounts, displs, recvtype, root,
                                       comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
     else {
         /* gatherv to left group  */
@@ -724,13 +781,21 @@
         mpi_errno = MPIR_Gatherv_impl(sendbuf, sendcount, sendtype, recvbuf,
                                       recvcounts, displs, recvtype, root,
                                       comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
         /* gatherv from left group */
         root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
         mpi_errno = MPIR_Gatherv_impl(sendbuf, sendcount, sendtype, recvbuf,
                                       recvcounts, displs, recvtype, root,
                                       comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
 
     /* now do an intracommunicator broadcast within each group. we use
@@ -751,11 +816,17 @@
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
     mpi_errno = MPIR_Bcast_intra(recvbuf, 1, newtype, 0, newcomm_ptr);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno) {
+        /* for communication errors, just record the error but continue */
+        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+    }
 
     MPIR_Type_free_impl(&newtype);
 
  fn_exit:
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
  fn_fail:
     /* --BEGIN ERROR HANDLING-- */

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allreduce.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allreduce.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allreduce.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -131,7 +131,8 @@
     int rc;
 #endif
     int        comm_size, rank, type_size;
-    int        mpi_errno = MPI_SUCCESS;
+    int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     int mask, dst, is_commutative, pof2, newrank, rem, newdst, i,
         send_idx, recv_idx, last_idx, send_cnt, recv_cnt, *cnts, *disps; 
     MPI_Aint true_extent, true_lb, extent;
@@ -174,10 +175,18 @@
                    allreduce is in recvbuf. Pass that as the sendbuf to reduce. */
 			
                 mpi_errno = MPIR_Reduce_impl(recvbuf, NULL, count, datatype, op, 0, comm_ptr->node_comm);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
             } else {
                 mpi_errno = MPIR_Reduce_impl(sendbuf, recvbuf, count, datatype, op, 0, comm_ptr->node_comm);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
             }
         } else {
             /* only one process on the node. copy sendbuf to recvbuf */
@@ -190,13 +199,21 @@
         /* now do an IN_PLACE allreduce among the local roots of all nodes */
         if (comm_ptr->node_roots_comm != NULL) {
             mpi_errno = allreduce_intra_or_coll_fn(MPI_IN_PLACE, recvbuf, count, datatype, op, comm_ptr->node_roots_comm);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
 
         /* now broadcast the result among local processes */
         if (comm_ptr->node_comm != NULL) {
             mpi_errno = MPIR_Bcast_impl(recvbuf, count, datatype, 0, comm_ptr->node_comm);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
         goto fn_exit;
     }
@@ -215,17 +232,18 @@
            do a reduce to 0 and then broadcast. */
         mpi_errno = MPIR_Reduce_impl ( sendbuf, recvbuf, count, datatype,
                                        op, 0, comm_ptr );
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-	/* FIXME: mpi_errno is error CODE, not necessarily the error
-	   class MPI_ERR_OP.  In MPICH2, we can get the error class 
-	   with 
-	       errorclass = mpi_errno & ERROR_CLASS_MASK;
-	*/
-        if (mpi_errno == MPI_ERR_OP || mpi_errno == MPI_SUCCESS) {
-	    /* Allow MPI_ERR_OP since we can continue from this error */
-            rc = MPIR_Bcast_impl( recvbuf, count, datatype, 0, comm_ptr );
-            if (rc) mpi_errno = rc;
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
         }
+
+        mpi_errno = MPIR_Bcast_impl( recvbuf, count, datatype, 0, comm_ptr );
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
     else 
 #endif /* MPID_HAS_HETERO */
@@ -299,7 +317,11 @@
                 mpi_errno = MPIC_Send(recvbuf, count, 
                                       datatype, rank+1,
                                       MPIR_ALLREDUCE_TAG, comm);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
                 
                 /* temporarily set the rank to -1 so that this
                    process does not participate in recursive
@@ -311,7 +333,11 @@
                                       datatype, rank-1,
                                       MPIR_ALLREDUCE_TAG, comm,
                                       MPI_STATUS_IGNORE);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
 
                 /* do the reduction on received data. since the
                    ordering is right, it doesn't matter whether
@@ -360,7 +386,11 @@
                                               count, datatype, dst,
                                               MPIR_ALLREDUCE_TAG, comm,
                                               MPI_STATUS_IGNORE);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
                     
                     /* tmp_buf contains data received in this step.
                        recvbuf contains data accumulated so far */
@@ -456,7 +486,11 @@
                                               recv_cnt, datatype, dst,
                                               MPIR_ALLREDUCE_TAG, comm,
                                               MPI_STATUS_IGNORE);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
                     
                     /* tmp_buf contains data received in this step.
                        recvbuf contains data accumulated so far */
@@ -516,7 +550,11 @@
                                               recv_cnt, datatype, dst,
                                               MPIR_ALLREDUCE_TAG, comm,
                                               MPI_STATUS_IGNORE);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
 
                     if (newrank > newdst) send_idx = recv_idx;
 
@@ -538,7 +576,11 @@
                                       datatype, rank+1,
                                       MPIR_ALLREDUCE_TAG, comm,
                                       MPI_STATUS_IGNORE);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
 
         if (MPIU_THREADPRIV_FIELD(op_errno)) 
@@ -550,6 +592,8 @@
     MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
 
     MPIU_CHKLMEM_FREEALL();
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return (mpi_errno);
 
   fn_fail:
@@ -580,6 +624,7 @@
    broadcasts because it would require allocation of a temporary buffer. 
 */
     int rank, mpi_errno, root;
+    int mpi_errno_ret = MPI_SUCCESS;
     MPID_Comm *newcomm_ptr = NULL;
     
     rank = comm_ptr->rank;
@@ -591,26 +636,42 @@
         root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
         mpi_errno = MPIR_Reduce_inter(sendbuf, recvbuf, count, datatype, op,
 				      root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
 
         /* reduce to rank 0 of right group */
         root = 0;
         mpi_errno = MPIR_Reduce_inter(sendbuf, recvbuf, count, datatype, op,
 				      root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
     else {
         /* reduce to rank 0 of left group */
         root = 0;
         mpi_errno = MPIR_Reduce_inter(sendbuf, recvbuf, count, datatype, op,
 				      root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
 
         /* reduce from right group to rank 0 */
         root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
         mpi_errno = MPIR_Reduce_inter(sendbuf, recvbuf, count, datatype, op,
 				      root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
 
     /* Get the local intracommunicator */
@@ -620,9 +681,15 @@
     newcomm_ptr = comm_ptr->local_comm;
 
     mpi_errno = MPIR_Bcast_impl(recvbuf, count, datatype, 0, newcomm_ptr);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno) {
+        /* for communication errors, just record the error but continue */
+        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+    }
 
   fn_exit:
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
 
   fn_fail:

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/alltoall.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/alltoall.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/alltoall.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -88,6 +88,7 @@
     MPI_Aint     sendtype_extent, recvtype_extent;
     MPI_Aint recvtype_true_extent, recvbuf_extent, recvtype_true_lb;
     int mpi_errno=MPI_SUCCESS, src, dst, rank, nbytes;
+    int mpi_errno_ret = MPI_SUCCESS;
     MPI_Status status;
     int sendtype_size, pack_size, block, position, *displs, count;
     MPI_Datatype newtype = MPI_DATATYPE_NULL;
@@ -138,7 +139,11 @@
                                                       j, MPIR_ALLTOALL_TAG,
                                                       j, MPIR_ALLTOALL_TAG,
                                                       comm, &status);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
                 }
                 else if (rank == j) {
                     /* same as above with i/j args reversed */
@@ -147,7 +152,11 @@
                                                       i, MPIR_ALLTOALL_TAG,
                                                       i, MPIR_ALLTOALL_TAG,
                                                       comm, &status);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
                 }
             }
         }
@@ -216,7 +225,11 @@
                                       MPIR_ALLTOALL_TAG, recvbuf, 1, newtype,
                                       src, MPIR_ALLTOALL_TAG, comm,
                                       MPI_STATUS_IGNORE);
-	    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
 
             MPIR_Type_free_impl(&newtype);
 
@@ -302,11 +315,15 @@
 					  sendbuf_extent*(comm_size-dst_tree_root),
                                           sendtype, dst, MPIR_ALLTOALL_TAG, 
                                           comm, &status);
-		if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
-                
-                /* in case of non-power-of-two nodes, less data may be
-                   received than specified */
-                MPIR_Get_count_impl(&status, sendtype, &last_recv_cnt);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    last_recv_cnt = 0;
+                } else
+                    /* in case of non-power-of-two nodes, less data may be
+                       received than specified */
+                    MPIR_Get_count_impl(&status, sendtype, &last_recv_cnt);
                 curr_cnt += last_recv_cnt;
             }
             
@@ -351,7 +368,11 @@
                                               last_recv_cnt, sendtype,
                                               dst, MPIR_ALLTOALL_TAG,
                                               comm);  
-			if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+                        if (mpi_errno) {
+                            /* for communication errors, just record the error but continue */
+                            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                        }
                     }
                     /* recv only if this proc. doesn't have data and sender
                        has data */
@@ -364,8 +385,13 @@
                                               sendtype,   
                                               dst, MPIR_ALLTOALL_TAG,
                                               comm, &status); 
-			if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
-                        MPIR_Get_count_impl(&status, sendtype, &last_recv_cnt);
+                        if (mpi_errno) {
+                            /* for communication errors, just record the error but continue */
+                            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                            last_recv_cnt = 0;
+                        } else
+                            MPIR_Get_count_impl(&status, sendtype, &last_recv_cnt);
                         curr_cnt += last_recv_cnt;
                     }
                     tmp_mask >>= 1;
@@ -430,7 +456,7 @@
                                        recvcount, recvtype, dst,
                                        MPIR_ALLTOALL_TAG, comm,
                                        &reqarray[i]);
-                if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
             }
 
             for ( i=0; i<ss; i++ ) { 
@@ -440,7 +466,7 @@
                                        sendcount, sendtype, dst,
                                        MPIR_ALLTOALL_TAG, comm,
                                        &reqarray[i+ss]);
-                if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
             }
   
             /* ... then wait for them to finish: */
@@ -452,7 +478,11 @@
                 for (j=0; j<2*ss; j++) {
                     if (starray[j].MPI_ERROR != MPI_SUCCESS) {
                         mpi_errno = starray[j].MPI_ERROR;
-                        MPIU_ERR_POP(mpi_errno);
+                        if (mpi_errno) {
+                            /* for communication errors, just record the error but continue */
+                            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                        }
                     }
                 }
             }
@@ -502,7 +532,11 @@
                                        src*recvcount*recvtype_extent),
                                       recvcount, recvtype, src,
                                       MPIR_ALLTOALL_TAG, comm, &status);
-	    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
     }
 
@@ -510,7 +544,9 @@
     MPIU_CHKLMEM_FREEALL();
     /* check if multiple threads are calling this collective function */
     MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
-    return (mpi_errno);
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
+    return mpi_errno;
  fn_fail:
     if (newtype != MPI_DATATYPE_NULL)
         MPIR_Type_free_impl(&newtype);
@@ -544,7 +580,8 @@
 */
     int          local_size, remote_size, max_size, i;
     MPI_Aint     sendtype_extent, recvtype_extent;
-    int          mpi_errno = MPI_SUCCESS;
+    int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     MPI_Status status;
     int src, dst, rank;
     char *sendaddr, *recvaddr;
@@ -590,13 +627,19 @@
                                   MPIR_ALLTOALL_TAG, recvaddr,
                                   recvcount, recvtype, src,
                                   MPIR_ALLTOALL_TAG, comm, &status);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
 
  fn_exit:
     /* check if multiple threads are calling this collective function */
     MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
-    return (mpi_errno);
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
+    return mpi_errno;
  fn_fail:
     goto fn_exit;
 }

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/alltoallw.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/alltoallw.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/alltoallw.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -62,7 +62,8 @@
 	MPID_Comm *comm_ptr )
 {
     int        comm_size, i, j;
-    int        mpi_errno = MPI_SUCCESS;
+    int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     MPI_Status status;
     MPI_Status *starray;
     MPI_Request *reqarray;
@@ -100,7 +101,11 @@
                                                       j, MPIR_ALLTOALL_TAG,
                                                       j, MPIR_ALLTOALL_TAG,
                                                       comm, &status);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
                 }
                 else if (rank == j) {
                     /* same as above with i/j args reversed */
@@ -109,7 +114,11 @@
                                                       i, MPIR_ALLTOALL_TAG,
                                                       i, MPIR_ALLTOALL_TAG,
                                                       comm, &status);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
                 }
             }
         }
@@ -167,7 +176,11 @@
                 for (i=0; i<outstanding_requests; i++) {
                     if (starray[i].MPI_ERROR != MPI_SUCCESS) {
                         mpi_errno = starray[i].MPI_ERROR;
-                        MPIU_ERR_POP(mpi_errno);
+                        if (mpi_errno) {
+                            /* for communication errors, just record the error but continue */
+                            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                        }
                     }
                 }
             }
@@ -193,7 +206,11 @@
                                       ((char *)recvbuf+rdispls[src]), 
                                       recvcnts[src], recvtypes[dst], src,
                                       MPIR_ALLTOALLW_TAG, comm, &status);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
 #endif
     }
@@ -202,7 +219,9 @@
   fn_exit:
     MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );  
     MPIU_CHKLMEM_FREEALL();
-    return (mpi_errno);
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
+    return mpi_errno;
 
   fn_fail:
     goto fn_exit;
@@ -238,6 +257,7 @@
 */
     int local_size, remote_size, max_size, i;
     int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     MPI_Status status;
     int src, dst, rank, sendcount, recvcount;
     char *sendaddr, *recvaddr;
@@ -284,13 +304,19 @@
                                   dst, MPIR_ALLTOALLW_TAG, recvaddr, 
                                   recvcount, recvtype, src,
                                   MPIR_ALLTOALLW_TAG, comm, &status);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
     
  fn_exit:
     /* check if multiple threads are calling this collective function */
     MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
-    return (mpi_errno);
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
+    return mpi_errno;
  fn_fail:
     goto fn_exit;
 }

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/barrier.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/barrier.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/barrier.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -55,6 +55,7 @@
 int MPIR_Barrier_intra( MPID_Comm *comm_ptr )
 {
     int size, rank, src, dst, mask, mpi_errno=MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     MPI_Comm comm;
 
     /* Only one collective operation per communicator can be active at any
@@ -76,13 +77,18 @@
                                   MPIR_BARRIER_TAG, NULL, 0, MPI_BYTE,
                                   src, MPIR_BARRIER_TAG, comm,
                                   MPI_STATUS_IGNORE);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-        
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
         mask <<= 1;
     }
 
  fn_exit:
     MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
  fn_fail:
     goto fn_exit;
@@ -128,6 +134,7 @@
 int MPIR_Barrier_inter( MPID_Comm *comm_ptr )
 {
     int rank, mpi_errno = MPI_SUCCESS, root;
+    int mpi_errno_ret = MPI_SUCCESS;
     int i = 0;
     MPID_Comm *newcomm_ptr = NULL;
 
@@ -143,7 +150,11 @@
 
     /* do a barrier on the local intracommunicator */
     mpi_errno = MPIR_Barrier_intra(newcomm_ptr);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno) {
+        /* for communication errors, just record the error but continue */
+        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+    }
 
     /* rank 0 on each group does an intercommunicator broadcast to the
        remote group to indicate that all processes in the local group
@@ -156,23 +167,41 @@
         /* bcast to right*/
         root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
         mpi_errno = MPIR_Bcast_inter(&i, 1, MPI_BYTE, root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
         /* receive bcast from right */
         root = 0;
         mpi_errno = MPIR_Bcast_inter(&i, 1, MPI_BYTE, root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
     else {
         /* receive bcast from left */
         root = 0;
         mpi_errno = MPIR_Bcast_inter(&i, 1, MPI_BYTE, root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
         /* bcast to left */
         root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
         mpi_errno = MPIR_Bcast_inter(&i, 1, MPI_BYTE, root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
  fn_exit:
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
  fn_fail:
     goto fn_exit;
@@ -217,6 +246,7 @@
 int MPIR_Barrier_impl(MPID_Comm *comm_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     
     if (comm_ptr->coll_fns != NULL && comm_ptr->coll_fns->Barrier != NULL)
     {
@@ -233,13 +263,21 @@
                 if (comm_ptr->node_comm != NULL)
                 {
                     mpi_errno = MPIR_Barrier_or_coll_fn(comm_ptr->node_comm);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
                 }
 
                 /* do the barrier across roots of all nodes */
                 if (comm_ptr->node_roots_comm != NULL) {
                     mpi_errno = MPIR_Barrier_or_coll_fn(comm_ptr->node_roots_comm);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
                 }
 
                 /* release the local processes on each node with a 1-byte broadcast
@@ -248,7 +286,11 @@
                 {
 		    int i=0;
                     mpi_errno = MPIR_Bcast_impl(&i, 1, MPI_BYTE, 0, comm_ptr->node_comm);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
                 }
             }
             else {
@@ -268,6 +310,8 @@
     }
         
  fn_exit:
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
  fn_fail:
     goto fn_exit;

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/bcast.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/bcast.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/bcast.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -58,7 +58,8 @@
 {
     int        rank, comm_size, src, dst;
     int        relative_rank, mask;
-    int        mpi_errno = MPI_SUCCESS;
+    int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     int nbytes=0;
     int type_size, is_contig, is_homogeneous;
     int position;
@@ -154,7 +155,11 @@
             else
                 mpi_errno = MPIC_Recv(buffer,count,datatype,src,
                                       MPIR_BCAST_TAG,comm,MPI_STATUS_IGNORE);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
             break;
         }
         mask <<= 1;
@@ -184,7 +189,11 @@
             else
                 mpi_errno = MPIC_Send(buffer,count,datatype,dst,
                                       MPIR_BCAST_TAG,comm); 
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
         mask >>= 1;
     }
@@ -203,6 +212,8 @@
 
 fn_exit:
     MPIU_CHKLMEM_FREEALL();
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
 fn_fail:
     goto fn_exit;
@@ -236,7 +247,8 @@
     MPI_Status status;
     int        rank, comm_size, src, dst;
     int        relative_rank, mask;
-    int        mpi_errno = MPI_SUCCESS;
+    int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     int scatter_size, curr_size, recv_size = 0, send_size;
     MPI_Comm comm;
 
@@ -283,10 +295,14 @@
                                        relative_rank*scatter_size),
                                       recv_size, MPI_BYTE, src,
                                       MPIR_BCAST_TAG, comm, &status);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
-                /* query actual size of data received */
-                MPIR_Get_count_impl(&status, MPI_BYTE, &curr_size);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    curr_size = 0;
+                } else
+                    /* query actual size of data received */
+                    MPIR_Get_count_impl(&status, MPI_BYTE, &curr_size);
             }
             break;
         }
@@ -314,7 +330,11 @@
                                         scatter_size*(relative_rank+mask)),
                                        send_size, MPI_BYTE, dst,
                                        MPIR_BCAST_TAG, comm);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
 
                 curr_size -= send_size;
             }
@@ -323,6 +343,8 @@
     }
 
 fn_exit:
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
 fn_fail:
     goto fn_exit;
@@ -363,6 +385,7 @@
     int rank, comm_size, dst;
     int relative_rank, mask;
     int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     int scatter_size, nbytes=0, curr_size, recv_size = 0;
     int type_size, j, k, i, tmp_mask, is_contig, is_homogeneous;
     int relative_dst, dst_tree_root, my_tree_root, send_offset;
@@ -436,7 +459,11 @@
 
     mpi_errno = scatter_for_bcast(buffer, count, datatype, root, comm_ptr,
                                   nbytes, tmp_buf, is_contig, is_homogeneous);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno) {
+        /* for communication errors, just record the error but continue */
+        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+    }
 
     /* medium size allgather and pof2 comm_size. use recurive doubling. */
 
@@ -470,9 +497,13 @@
                                       ((char *)tmp_buf + recv_offset),
                                       (nbytes-recv_offset < 0 ? 0 : nbytes-recv_offset), 
                                       MPI_BYTE, dst, MPIR_BCAST_TAG, comm, &status);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
-            MPIR_Get_count_impl(&status, MPI_BYTE, &recv_size);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                recv_size = 0;
+            } else
+                MPIR_Get_count_impl(&status, MPI_BYTE, &recv_size);
             curr_size += recv_size;
         }
 
@@ -540,7 +571,11 @@
                     /* recv_size was set in the previous
                        receive. that's the amount of data to be
                        sent now. */
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
                 }
                 /* recv only if this proc. doesn't have data and sender
                    has data */
@@ -556,9 +591,13 @@
                                           comm, &status); 
                     /* nprocs_completed is also equal to the no. of processes
                        whose data we don't have */
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
-                    MPIR_Get_count_impl(&status, MPI_BYTE, &recv_size);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                        recv_size = 0;
+                    } else
+                        MPIR_Get_count_impl(&status, MPI_BYTE, &recv_size);
                     curr_size += recv_size;
                     /* printf("Rank %d, recv from %d, offset %d, size %d\n", rank, dst, offset, recv_size);
                        fflush(stdout);*/
@@ -586,6 +625,8 @@
 
 fn_exit:
     MPIU_CHKLMEM_FREEALL();
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
 fn_fail:
     goto fn_exit;
@@ -621,6 +662,7 @@
     int rank, comm_size;
     int relative_rank;
     int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     int scatter_size, nbytes;
     int type_size, j, i, is_contig, is_homogeneous;
     int position;
@@ -690,7 +732,11 @@
 
     mpi_errno = scatter_for_bcast(buffer, count, datatype, root, comm_ptr,
                                   nbytes, tmp_buf, is_contig, is_homogeneous);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno) {
+        /* for communication errors, just record the error but continue */
+        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+    }
 
     /* long-message allgather or medium-size but non-power-of-two. use ring algorithm. */ 
 
@@ -727,7 +773,11 @@
                           recvcnts[(jnext-root+comm_size)%comm_size],  
                           MPI_BYTE, left,   
                           MPIR_BCAST_TAG, comm, MPI_STATUS_IGNORE);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
 
         j     = jnext;
         jnext = (comm_size + jnext - 1) % comm_size;
@@ -746,6 +796,8 @@
 
 fn_exit:
     MPIU_CHKLMEM_FREEALL();
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
 fn_fail:
     goto fn_exit;
@@ -769,7 +821,11 @@
         {                                                                                        \
             mpi_errno_ = bcast_fn_(buffer_, count_, datatype_, root_, comm_ptr_);                \
         }                                                                                        \
-        if (mpi_errno_) MPIU_ERR_POP(mpi_errno_);                                                \
+        if (mpi_errno) {                                                                         \
+            /* for communication errors, just record the error but continue */                   \
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");                                    \
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);                                              \
+        }                                                                                        \
     } while (0)
 
 /* FIXME This function uses some heuristsics based off of some testing on a
@@ -786,6 +842,7 @@
         MPID_Comm *comm_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     int type_size, is_homogeneous;
     int nbytes=0;
 
@@ -828,7 +885,11 @@
                 mpi_errno = MPIC_Recv(buffer,count,datatype,MPIU_Get_intranode_rank(comm_ptr, root),
                                       MPIR_BCAST_TAG,comm_ptr->node_comm->handle,MPI_STATUS_IGNORE);
             }
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
 
         /* perform the internode broadcast */
@@ -905,11 +966,17 @@
                algorithm that (at least approximately) minimized internode
                communication. */
             mpi_errno = MPIR_Bcast_scatter_ring_allgather(buffer, count, datatype, root, comm_ptr);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
     }
 
 fn_exit:
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
 fn_fail:
     goto fn_exit;
@@ -971,6 +1038,7 @@
         MPID_Comm *comm_ptr )
 {
     int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     int comm_size;
     int nbytes=0;
     int type_size, is_homogeneous;
@@ -987,7 +1055,11 @@
 #if defined(USE_SMP_COLLECTIVES)
     if (MPIR_Comm_is_node_aware(comm_ptr)) {
         mpi_errno = MPIR_SMP_Bcast(buffer, count, datatype, root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
         goto fn_exit;
     }
 #endif
@@ -1018,14 +1090,22 @@
     if ((nbytes < MPIR_PARAM_BCAST_SHORT_MSG_SIZE) || (comm_size < MPIR_PARAM_BCAST_MIN_PROCS))
     {
         mpi_errno = MPIR_Bcast_binomial(buffer, count, datatype, root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
     else /* (nbytes >= MPIR_PARAM_BCAST_SHORT_MSG_SIZE) && (comm_size >= MPIR_PARAM_BCAST_MIN_PROCS) */
     {
         if ((nbytes < MPIR_PARAM_BCAST_LONG_MSG_SIZE) && (MPIU_is_pof2(comm_size, NULL)))
         {
             mpi_errno = MPIR_Bcast_scatter_doubling_allgather(buffer, count, datatype, root, comm_ptr);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
         else /* (nbytes >= MPIR_PARAM_BCAST_LONG_MSG_SIZE) || !(comm_size_is_pof2) */
         {
@@ -1033,7 +1113,11 @@
                topologically aware communicator.  Doing inter/intra-node
                communication phases breaks the pipelining of the algorithm.  */
             mpi_errno = MPIR_Bcast_scatter_ring_allgather(buffer, count, datatype, root, comm_ptr);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
     }
 
@@ -1043,6 +1127,8 @@
 
     MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_BCAST);
 
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
 fn_fail:
     goto fn_exit;
@@ -1067,6 +1153,7 @@
     intracommunicator broadcast.
 */
     int rank, mpi_errno;
+    int mpi_errno_ret = MPI_SUCCESS;
     MPI_Status status;
     MPID_Comm *newcomm_ptr = NULL;
     MPI_Comm comm;
@@ -1087,7 +1174,11 @@
         MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );
         mpi_errno =  MPIC_Send(buffer, count, datatype, 0,
                                MPIR_BCAST_TAG, comm); 
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
         MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
     }
     else
@@ -1100,7 +1191,11 @@
         {
             mpi_errno = MPIC_Recv(buffer, count, datatype, root,
                                   MPIR_BCAST_TAG, comm, &status);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
         
         /* Get the local intracommunicator */
@@ -1112,11 +1207,17 @@
         /* now do the usual broadcast on this intracommunicator
            with rank 0 as root. */
         mpi_errno = MPIR_Bcast_intra(buffer, count, datatype, 0, newcomm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
 
 fn_fail:
     MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_BCAST_INTER);
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
 }
 

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/exscan.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/exscan.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/exscan.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -106,7 +106,8 @@
 {
     MPI_Status status;
     int        rank, comm_size;
-    int        mpi_errno = MPI_SUCCESS;
+    int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     int mask, dst, is_commutative, flag; 
     MPI_Aint true_extent, true_lb, extent;
     void *partial_scan, *tmp_buf;
@@ -187,7 +188,11 @@
                                       count, datatype, dst,
                                       MPIR_EXSCAN_TAG, comm,
                                       &status);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
 
             if (rank > dst) {
                 call_uop(tmp_buf, partial_scan, count, datatype);
@@ -237,7 +242,9 @@
 
 fn_exit:
     MPIU_CHKLMEM_FREEALL();
-    return (mpi_errno);
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
+    return mpi_errno;
 fn_fail:
     goto fn_exit;
 }

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/gather.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/gather.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/gather.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -61,7 +61,8 @@
 	MPID_Comm *comm_ptr )
 {
     int        comm_size, rank;
-    int        mpi_errno = MPI_SUCCESS;
+    int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     int curr_cnt=0, relative_rank, nbytes, is_homogeneous;
     int mask, sendtype_size, recvtype_size, src, dst, relative_src;
     int recvblks;
@@ -194,7 +195,11 @@
 						  recvblks * recvcnt, recvtype, src,
 						  MPIR_GATHER_TAG, comm,
 						  &status);
-                            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                            if (mpi_errno) {
+                                /* for communication errors, just record the error but continue */
+                                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                            }
 			}
 			else if (nbytes < MPIR_PARAM_GATHER_VSMALL_MSG_SIZE) {
 			    mpi_errno = MPIC_Recv(tmp_buf, recvblks * nbytes, MPI_BYTE,
@@ -218,8 +223,9 @@
 			    mpi_errno = MPIC_Recv(recvbuf, 1, tmp_type, src,
 						  MPIR_GATHER_TAG, comm, &status);
                             if (mpi_errno) {
-                                MPIR_Type_free_impl(&tmp_type);
-                                MPIU_ERR_POP(mpi_errno);
+                                /* for communication errors, just record the error but continue */
+                                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
                             }
 
 			    MPIR_Type_free_impl(&tmp_type);
@@ -243,7 +249,11 @@
 					      recvblks * nbytes, MPI_BYTE, src,
 					      MPIR_GATHER_TAG, comm,
 					      &status);
-                        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                        if (mpi_errno) {
+                            /* for communication errors, just record the error but continue */
+                            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                        }
 			curr_cnt += (recvblks * nbytes);
                     }
                 }
@@ -258,12 +268,20 @@
                     /* leaf nodes send directly from sendbuf */
                     mpi_errno = MPIC_Send(sendbuf, sendcnt, sendtype, dst,
                                           MPIR_GATHER_TAG, comm);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
                 }
                 else if (nbytes < MPIR_PARAM_GATHER_VSMALL_MSG_SIZE) {
 		    mpi_errno = MPIC_Send(tmp_buf, curr_cnt, MPI_BYTE, dst,
 					  MPIR_GATHER_TAG, comm);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
 		}
 		else {
 		    blocks[0] = sendcnt;
@@ -282,8 +300,9 @@
 		    mpi_errno = MPIC_Send(MPI_BOTTOM, 1, tmp_type, dst,
 					  MPIR_GATHER_TAG, comm);
                     if (mpi_errno) {
-                        MPIR_Type_free_impl(&tmp_type);
-                        MPIU_ERR_POP(mpi_errno);
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
                     }
 		    MPIR_Type_free_impl(&tmp_type);
 		}
@@ -352,10 +371,15 @@
                                           tmp_buf_size-curr_cnt, MPI_BYTE, src,
                                           MPIR_GATHER_TAG, comm, 
                                           &status);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-                    /* the recv size is larger than what may be sent in
-                       some cases. query amount of data actually received */
-                    MPIR_Get_count_impl(&status, MPI_BYTE, &recv_size);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                        recv_size = 0;
+                    } else
+                        /* the recv size is larger than what may be sent in
+                           some cases. query amount of data actually received */
+                        MPIR_Get_count_impl(&status, MPI_BYTE, &recv_size);
                     curr_cnt += recv_size;
                 }
             }
@@ -365,7 +389,11 @@
                 dst = (dst + root) % comm_size;
                 mpi_errno = MPIC_Send(tmp_buf, curr_cnt, MPI_BYTE, dst,
                                       MPIR_GATHER_TAG, comm);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
                 break;
             }
             mask <<= 1;
@@ -404,6 +432,8 @@
     MPIU_CHKLMEM_FREEALL();
     /* check if multiple threads are calling this collective function */
     MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
  fn_fail:
     goto fn_exit;
@@ -438,6 +468,7 @@
 */
 
     int rank, local_size, remote_size, mpi_errno=MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     int i, nbytes, sendtype_size, recvtype_size;
     MPI_Status status;
     MPI_Aint extent, true_extent, true_lb = 0;
@@ -478,8 +509,11 @@
             mpi_errno = MPIC_Recv(recvbuf, recvcnt*remote_size,
                                   recvtype, 0, MPIR_GATHER_TAG, comm,
                                   &status);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-            
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
         else
 	{
@@ -513,14 +547,22 @@
             mpi_errno = MPIR_Gather_impl(sendbuf, sendcnt, sendtype,
                                          tmp_buf, sendcnt, sendtype, 0,
                                          newcomm_ptr);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
             
             if (rank == 0)
 	    {
                 mpi_errno = MPIC_Send(tmp_buf, sendcnt*local_size,
                                       sendtype, root,
                                       MPIR_GATHER_TAG, comm);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
             }
         }
     }
@@ -538,20 +580,30 @@
                 mpi_errno = MPIC_Recv(((char *)recvbuf+recvcnt*i*extent), 
                                       recvcnt, recvtype, i,
                                       MPIR_GATHER_TAG, comm, &status);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
             }
         }
         else
 	{
             mpi_errno = MPIC_Send(sendbuf,sendcnt,sendtype,root,
                                   MPIR_GATHER_TAG,comm);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
     }
 
  fn_exit:
     MPIU_CHKLMEM_FREEALL();
     MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
  fn_fail:
     goto fn_exit;

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/gatherv.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/gatherv.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/gatherv.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -61,6 +61,7 @@
 {
     int        comm_size, rank;
     int        mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     MPI_Comm comm;
     MPI_Aint       extent;
     int            i, reqs;
@@ -120,7 +121,11 @@
             for (i = 0; i < reqs; i++) {
                 if (starray[i].MPI_ERROR != MPI_SUCCESS) {
                     mpi_errno = starray[i].MPI_ERROR;
-                    MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
                 }
             }
         }
@@ -143,12 +148,20 @@
             if (comm_size >= min_procs) {
                 mpi_errno = MPIC_Ssend(sendbuf, sendcnt, sendtype, root, 
                                        MPIR_GATHERV_TAG, comm);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
             }
             else {
                 mpi_errno = MPIC_Send(sendbuf, sendcnt, sendtype, root, 
                                       MPIR_GATHERV_TAG, comm);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
             }
         }
     }
@@ -158,6 +171,8 @@
     /* check if multiple threads are calling this collective function */
     MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
     MPIU_CHKLMEM_FREEALL();
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
 fn_fail:
     goto fn_exit;

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/red_scat.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/red_scat.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/red_scat.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -83,6 +83,7 @@
     MPID_Comm *comm_ptr )
 {
     int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     int comm_size = comm_ptr->local_size;
     int rank = comm_ptr->rank;
     int pof2;
@@ -191,7 +192,11 @@
                                   incoming_data + recv_offset*true_extent,
                                   size, datatype, peer, MPIR_REDUCE_SCATTER_TAG,
                                   comm, MPI_STATUS_IGNORE);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
         /* always perform the reduction at recv_offset, the data at send_offset
            is now our peer's responsibility */
         if (rank > peer) {
@@ -222,6 +227,8 @@
                                recvbuf, size, datatype);
 fn_exit:
     MPIU_CHKLMEM_FREEALL();
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
 fn_fail:
     goto fn_exit;
@@ -292,7 +299,8 @@
     MPI_Aint extent, true_extent, true_lb; 
     int  *disps;
     void *tmp_recvbuf, *tmp_results;
-    int   mpi_errno = MPI_SUCCESS;
+    int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     int type_size, dis[2], blklens[2], total_count, nbytes, src, dst;
     int mask, dst_tree_root, my_tree_root, j, k;
     int *newcnts, *newdisps, rem, newdst, send_idx, recv_idx,
@@ -408,7 +416,11 @@
                 mpi_errno = MPIC_Send(tmp_results, total_count, 
                                       datatype, rank+1,
                                       MPIR_REDUCE_SCATTER_TAG, comm);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
                 
                 /* temporarily set the rank to -1 so that this
                    process does not pariticipate in recursive
@@ -420,7 +432,11 @@
                                       datatype, rank-1,
                                       MPIR_REDUCE_SCATTER_TAG, comm,
                                       MPI_STATUS_IGNORE);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
                 
                 /* do the reduction on received data. since the
                    ordering is right, it doesn't matter whether
@@ -519,7 +535,11 @@
                                           dst, MPIR_REDUCE_SCATTER_TAG,
                                           comm);  
 
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
                 
                 /* tmp_recvbuf contains data received in this step.
                    tmp_results contains data accumulated so far */
@@ -567,7 +587,11 @@
                                       disps[rank-1]*extent, recvcnts[rank-1],
                                       datatype, rank-1,
                                       MPIR_REDUCE_SCATTER_TAG, comm);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
                 }
             }
             else  {   /* even */
@@ -576,7 +600,11 @@
                                       datatype, rank+1,
                                       MPIR_REDUCE_SCATTER_TAG, comm,
                                       MPI_STATUS_IGNORE);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
                 }
             }
         }
@@ -621,7 +649,11 @@
                                           MPIR_REDUCE_SCATTER_TAG, comm,
                                           MPI_STATUS_IGNORE);
             
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
             
             if (is_commutative || (src < rank)) {
                 if (sendbuf != MPI_IN_PLACE) {
@@ -818,7 +850,11 @@
                                               MPIR_REDUCE_SCATTER_TAG, comm,
                                               MPI_STATUS_IGNORE); 
                     received = 1;
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
                 }
 
                 /* if some processes in this process's subtree in this step
@@ -871,7 +907,11 @@
                                                   MPIR_REDUCE_SCATTER_TAG,
                                                   comm, MPI_STATUS_IGNORE); 
                             received = 1;
-                            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                            if (mpi_errno) {
+                                /* for communication errors, just record the error but continue */
+                                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                            }
                         }
                         tmp_mask >>= 1;
                         k--;
@@ -959,7 +999,9 @@
     if (MPIU_THREADPRIV_FIELD(op_errno)) 
 	mpi_errno = MPIU_THREADPRIV_FIELD(op_errno);
 
-    return (mpi_errno);
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
+    return mpi_errno;
 fn_fail:
     goto fn_exit;
 }
@@ -986,6 +1028,7 @@
 */
     
     int rank, mpi_errno, root, local_size, total_count, i;
+    int mpi_errno_ret = MPI_SUCCESS;
     MPI_Aint true_extent, true_lb = 0, extent;
     void *tmp_buf=NULL;
     int *disps=NULL;
@@ -1026,26 +1069,42 @@
         root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
         mpi_errno = MPIR_Reduce_inter(sendbuf, tmp_buf, total_count, datatype, op,
                                 root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
 
         /* reduce to rank 0 of right group */
         root = 0;
         mpi_errno = MPIR_Reduce_inter(sendbuf, tmp_buf, total_count, datatype, op,
                                 root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
     else {
         /* reduce to rank 0 of left group */
         root = 0;
         mpi_errno = MPIR_Reduce_inter(sendbuf, tmp_buf, total_count, datatype, op,
                                 root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
 
         /* reduce from right group to rank 0 */
         root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
         mpi_errno = MPIR_Reduce_inter(sendbuf, tmp_buf, total_count, datatype, op,
                                 root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
 
     /* Get the local intracommunicator */
@@ -1058,10 +1117,16 @@
 
     mpi_errno = MPIR_Scatterv(tmp_buf, recvcnts, disps, datatype, recvbuf,
                               recvcnts[rank], datatype, 0, newcomm_ptr);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno) {
+        /* for communication errors, just record the error but continue */
+        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+    }
     
  fn_exit:
     MPIU_CHKLMEM_FREEALL();
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
  fn_fail:
     goto fn_exit;
@@ -1073,7 +1138,7 @@
    implementations of reduce_scatter.  In all other cases
    MPIR_Reduce_Scatter_impl should be used. */
 #undef FUNCNAME
-#define FUNCNAME MPIR_Reduce_scatter_impl
+#define FUNCNAME MPIR_Reduce_scatter
 #undef FCNAME
 #define FCNAME MPIU_QUOTE(FUNCNAME)
 int MPIR_Reduce_scatter(void *sendbuf, void *recvbuf, int *recvcnts,

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/red_scat_block.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/red_scat_block.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/red_scat_block.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -89,6 +89,7 @@
     MPID_Comm *comm_ptr )
 {
     int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     int comm_size = comm_ptr->local_size;
     int rank = comm_ptr->rank;
     int pof2;
@@ -193,7 +194,11 @@
                                   incoming_data + recv_offset*true_extent,
                                   size, datatype, peer, MPIR_REDUCE_SCATTER_BLOCK_TAG,
                                   comm, MPI_STATUS_IGNORE);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
         /* always perform the reduction at recv_offset, the data at send_offset
            is now our peer's responsibility */
         if (rank > peer) {
@@ -226,6 +231,8 @@
     
 fn_exit:
     MPIU_CHKLMEM_FREEALL();
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
 fn_fail:
     goto fn_exit;
@@ -296,7 +303,8 @@
     MPI_Aint extent, true_extent, true_lb; 
     int  *disps;
     void *tmp_recvbuf, *tmp_results;
-    int   mpi_errno = MPI_SUCCESS;
+    int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     int type_size, dis[2], blklens[2], total_count, nbytes, src, dst;
     int mask, dst_tree_root, my_tree_root, j, k;
     int *newcnts, *newdisps, rem, newdst, send_idx, recv_idx,
@@ -411,7 +419,11 @@
                 mpi_errno = MPIC_Send(tmp_results, total_count, 
                                       datatype, rank+1,
                                       MPIR_REDUCE_SCATTER_BLOCK_TAG, comm);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
                 
                 /* temporarily set the rank to -1 so that this
                    process does not pariticipate in recursive
@@ -423,7 +435,11 @@
                                       datatype, rank-1,
                                       MPIR_REDUCE_SCATTER_BLOCK_TAG, comm,
                                       MPI_STATUS_IGNORE);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
                 
                 /* do the reduction on received data. since the
                    ordering is right, it doesn't matter whether
@@ -522,7 +538,11 @@
                                           dst, MPIR_REDUCE_SCATTER_BLOCK_TAG,
                                           comm);  
 
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
                 
                 /* tmp_recvbuf contains data received in this step.
                    tmp_results contains data accumulated so far */
@@ -573,7 +593,11 @@
                                       MPIR_REDUCE_SCATTER_BLOCK_TAG, comm,
                                       MPI_STATUS_IGNORE); 
             }
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
     }
     
@@ -616,7 +640,11 @@
                                           MPIR_REDUCE_SCATTER_BLOCK_TAG, comm,
                                           MPI_STATUS_IGNORE);
             
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
             
             if (is_commutative || (src < rank)) {
                 if (sendbuf != MPI_IN_PLACE) {
@@ -803,7 +831,11 @@
                                               MPIR_REDUCE_SCATTER_BLOCK_TAG, comm,
                                               MPI_STATUS_IGNORE); 
                     received = 1;
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
                 }
 
                 /* if some processes in this process's subtree in this step
@@ -845,7 +877,11 @@
                             mpi_errno = MPIC_Send(tmp_recvbuf, 1, recvtype,
                                                   dst, MPIR_REDUCE_SCATTER_BLOCK_TAG,
                                                   comm);  
-                            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                            if (mpi_errno) {
+                                /* for communication errors, just record the error but continue */
+                                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                            }
                         }
                         /* recv only if this proc. doesn't have data and sender
                            has data */
@@ -856,7 +892,11 @@
                                                   MPIR_REDUCE_SCATTER_BLOCK_TAG,
                                                   comm, MPI_STATUS_IGNORE); 
                             received = 1;
-                            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                            if (mpi_errno) {
+                                /* for communication errors, just record the error but continue */
+                                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                            }
                         }
                         tmp_mask >>= 1;
                         k--;
@@ -944,7 +984,9 @@
     if (MPIU_THREADPRIV_FIELD(op_errno)) 
 	mpi_errno = MPIU_THREADPRIV_FIELD(op_errno);
 
-    return (mpi_errno);
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
+    return mpi_errno;
 fn_fail:
     goto fn_exit;
 }
@@ -971,6 +1013,7 @@
 */
     
     int rank, mpi_errno, root, local_size, total_count;
+    int mpi_errno_ret = MPI_SUCCESS;
     MPI_Aint true_extent, true_lb = 0, extent;
     void *tmp_buf=NULL;
     MPID_Comm *newcomm_ptr = NULL;
@@ -1001,26 +1044,42 @@
         root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
         mpi_errno = MPIR_Reduce_inter(sendbuf, tmp_buf, total_count, datatype, op,
                                 root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
         
         /* reduce to rank 0 of right group */
         root = 0;
         mpi_errno = MPIR_Reduce_inter(sendbuf, tmp_buf, total_count, datatype, op,
                                 root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
     else {
         /* reduce to rank 0 of left group */
         root = 0;
         mpi_errno = MPIR_Reduce_inter(sendbuf, tmp_buf, total_count, datatype, op,
                                 root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
 
         /* reduce from right group to rank 0 */
         root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
         mpi_errno = MPIR_Reduce_inter(sendbuf, tmp_buf, total_count, datatype, op,
                                 root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
 
     /* Get the local intracommunicator */
@@ -1031,10 +1090,16 @@
 
     mpi_errno = MPIR_Scatter_impl(tmp_buf, recvcount, datatype, recvbuf,
                                   recvcount, datatype, 0, newcomm_ptr);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno) {
+        /* for communication errors, just record the error but continue */
+        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+    }
     
  fn_exit:
     MPIU_CHKLMEM_FREEALL();
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
  fn_fail:
     goto fn_exit;

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/reduce.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/reduce.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/reduce.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -41,6 +41,7 @@
     MPID_Comm *comm_ptr )
 {
     int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     MPI_Status status;
     int comm_size, rank, is_commutative, type_size;
     int mask, relrank, source, lroot;
@@ -168,7 +169,11 @@
                 source = (source + lroot) % comm_size;
                 mpi_errno = MPIC_Recv (tmp_buf, count, datatype, source, 
                                        MPIR_REDUCE_TAG, comm, &status);
-                if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
 
                 /* The sender is above us, so the received buffer must be
                    the second argument (in the noncommutative case). */
@@ -203,7 +208,11 @@
             source = ((relrank & (~ mask)) + lroot) % comm_size;
             mpi_errno  = MPIC_Send( recvbuf, count, datatype, 
                                     source, MPIR_REDUCE_TAG, comm );
-            if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
             break;
         }
         mask <<= 1;
@@ -221,7 +230,11 @@
             mpi_errno = MPIC_Recv ( recvbuf, count, datatype, 0, 
                                     MPIR_REDUCE_TAG, comm, &status);
         }
-        if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
 
     /* FIXME does this need to be checked after each uop invocation for
@@ -235,6 +248,8 @@
 
 fn_exit:
     MPIU_CHKLMEM_FREEALL();
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
 fn_fail:
     goto fn_exit;
@@ -281,6 +296,7 @@
     MPID_Comm *comm_ptr )
 {
     int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     int comm_size, rank, is_commutative, type_size, pof2, rem, newrank;
     int mask, *cnts, *disps, i, j, send_idx=0;
     int recv_idx, last_idx=0, newdst;
@@ -389,7 +405,11 @@
             mpi_errno = MPIC_Send(recvbuf, count, 
                                   datatype, rank-1,
                                   MPIR_REDUCE_TAG, comm);
-            if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
             
             /* temporarily set the rank to -1 so that this
                process does not pariticipate in recursive
@@ -401,7 +421,11 @@
                                   datatype, rank+1,
                                   MPIR_REDUCE_TAG, comm,
                                   MPI_STATUS_IGNORE);
-            if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
             
             /* do the reduction on received data. */
             /* This algorithm is used only for predefined ops
@@ -480,7 +504,11 @@
                                       recv_cnt, datatype, dst,
                                       MPIR_REDUCE_TAG, comm,
                                       MPI_STATUS_IGNORE);
-            if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
             
             /* tmp_buf contains data received in this step.
                recvbuf contains data accumulated so far */
@@ -534,7 +562,11 @@
                 mpi_errno = MPIC_Recv(recvbuf, cnts[0], datatype,  
                                       0, MPIR_REDUCE_TAG, comm,
                                       MPI_STATUS_IGNORE);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
                 newrank = 0;
                 send_idx = 0;
                 last_idx = 2;
@@ -542,7 +574,11 @@
             else if (newrank == 0) {  /* send */
                 mpi_errno = MPIC_Send(recvbuf, cnts[0], datatype,  
                                       root, MPIR_REDUCE_TAG, comm);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
                 newrank = -1;
             }
             newroot = 0;
@@ -611,7 +647,11 @@
                                       send_cnt, datatype,  
                                       dst, MPIR_REDUCE_TAG, 
                                       comm);
-                if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
                 break;
             }
             else {
@@ -623,7 +663,11 @@
                                       recv_cnt, datatype, dst,
                                       MPIR_REDUCE_TAG, comm,
                                       MPI_STATUS_IGNORE);
-                if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
             }
             
             if (newrank > newdst) send_idx = recv_idx;
@@ -644,6 +688,8 @@
 
 fn_exit:
     MPIU_CHKLMEM_FREEALL();
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
 fn_fail:
     goto fn_exit;
@@ -720,6 +766,7 @@
     MPID_Comm *comm_ptr )
 {
     int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     int comm_size, is_commutative, type_size, pof2;
     MPID_Op *op_ptr;
 #if defined(USE_SMP_COLLECTIVES)
@@ -762,7 +809,11 @@
             MPIU_Get_intranode_rank(comm_ptr, root) == -1) {
             mpi_errno = MPIR_Reduce_impl(sendbuf, tmp_buf, count, datatype,
                                          op, 0, comm_ptr->node_comm);
-            if (mpi_errno) goto fn_fail;
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
 
         /* do the internode reduce to the root's node */
@@ -774,7 +825,11 @@
                 mpi_errno = MPIR_Reduce_impl(buf, NULL, count, datatype,
                                              op, MPIU_Get_internode_rank(comm_ptr, root),
                                              comm_ptr->node_roots_comm);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
             }
             else { /* I am on root's node. I have not participated in the earlier reduce. */
                 if (comm_ptr->rank != root) {
@@ -784,7 +839,11 @@
                     mpi_errno = MPIR_Reduce_impl(sendbuf, tmp_buf, count, datatype,
                                                  op, MPIU_Get_internode_rank(comm_ptr, root),
                                                  comm_ptr->node_roots_comm);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
 
                     /* point sendbuf at tmp_buf to make final intranode reduce easy */
                     sendbuf = tmp_buf;
@@ -795,7 +854,11 @@
                     mpi_errno = MPIR_Reduce_impl(sendbuf, recvbuf, count, datatype,
                                                  op, MPIU_Get_internode_rank(comm_ptr, root),
                                                  comm_ptr->node_roots_comm);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
 
                     /* set sendbuf to MPI_IN_PLACE to make final intranode reduce easy. */
                     sendbuf = MPI_IN_PLACE;
@@ -810,7 +873,11 @@
             mpi_errno = MPIR_Reduce_impl(sendbuf, recvbuf, count, datatype,
                                          op, MPIU_Get_intranode_rank(comm_ptr, root),
                                          comm_ptr->node_comm);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
         
         goto fn_exit;
@@ -842,12 +909,20 @@
         (HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) && (count >= pof2)) {
         /* do a reduce-scatter followed by gather to root. */
         mpi_errno = MPIR_Reduce_redscat_gather(sendbuf, recvbuf, count, datatype, op, root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
     else {
         /* use a binomial tree algorithm */ 
         mpi_errno = MPIR_Reduce_binomial(sendbuf, recvbuf, count, datatype, op, root, comm_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
         
 
@@ -857,6 +932,8 @@
 #if defined(USE_SMP_COLLECTIVES)
     MPIU_CHKLMEM_FREEALL();
 #endif
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
   fn_fail:
     goto fn_exit;
@@ -886,6 +963,7 @@
 */
 
     int rank, mpi_errno;
+    int mpi_errno_ret = MPI_SUCCESS;
     MPI_Status status;
     MPI_Aint true_extent, true_lb, extent;
     void *tmp_buf=NULL;
@@ -906,7 +984,11 @@
         /* root receives data from rank 0 on remote group */
         mpi_errno = MPIC_Recv(recvbuf, count, datatype, 0,
                               MPIR_REDUCE_TAG, comm, &status);
-	if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
     else {
         /* remote group. Rank 0 allocates temporary buffer, does
@@ -939,19 +1021,29 @@
         /* now do a local reduce on this intracommunicator */
         mpi_errno = MPIR_Reduce_intra(sendbuf, tmp_buf, count, datatype,
                                       op, 0, newcomm_ptr);
-	if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
 
         if (rank == 0)
 	{
             mpi_errno = MPIC_Send(tmp_buf, count, datatype, root,
                                   MPIR_REDUCE_TAG, comm); 
-	    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
     }
 
   fn_exit:
     MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr ); 
     MPIU_CHKLMEM_FREEALL();
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
 
   fn_fail:

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scan.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scan.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scan.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -75,7 +75,8 @@
 {
     MPI_Status status;
     int        rank, comm_size;
-    int        mpi_errno = MPI_SUCCESS;
+    int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     int mask, dst, is_commutative; 
     MPI_Aint true_extent, true_lb, extent;
     void *partial_scan, *tmp_buf;
@@ -171,7 +172,11 @@
                                       count, datatype, dst,
                                       MPIR_SCAN_TAG, comm,
                                       &status);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
             
             if (rank > dst) {
 #ifdef HAVE_CXX_BINDING
@@ -228,7 +233,9 @@
      /* check if multiple threads are calling this collective function */
     MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
     
-   return (mpi_errno);
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
+    return mpi_errno;
  fn_fail:
     goto fn_exit;
 }
@@ -252,6 +259,7 @@
     MPID_Comm *comm_ptr )
 {
     int mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     MPIU_CHKLMEM_DECL(3);
     MPIU_THREADPRIV_DECL;
     int rank = comm_ptr->rank;
@@ -303,7 +311,11 @@
     {
         mpi_errno = MPIR_Scan_impl(sendbuf, recvbuf, count, datatype, 
                                    op, comm_ptr->node_comm);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
     else if (sendbuf != MPI_IN_PLACE)
     {
@@ -321,7 +333,11 @@
         mpi_errno = MPIC_Recv(localfulldata, count, datatype, 
                               comm_ptr->node_comm->local_size - 1, MPIR_SCAN_TAG, 
                               comm_ptr->node_comm->handle, &status);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
     else if (comm_ptr->node_roots_comm == NULL && 
              comm_ptr->node_comm != NULL && 
@@ -329,7 +345,11 @@
     {
         mpi_errno = MPIC_Send(recvbuf, count, datatype,
                               0, MPIR_SCAN_TAG, comm_ptr->node_comm->handle);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
     else if (comm_ptr->node_roots_comm != NULL)
     {
@@ -344,7 +364,11 @@
     {
         mpi_errno = MPIR_Scan_impl(localfulldata, prefulldata, count, datatype,
                                    op, comm_ptr->node_roots_comm);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
 
         if (MPIU_Get_internode_rank(comm_ptr, rank) != 
             comm_ptr->node_roots_comm->local_size-1)
@@ -352,7 +376,11 @@
             mpi_errno = MPIC_Send(prefulldata, count, datatype,
                                   MPIU_Get_internode_rank(comm_ptr, rank) + 1,
                                   MPIR_SCAN_TAG, comm_ptr->node_roots_comm->handle);
-            if(mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
         if (MPIU_Get_internode_rank(comm_ptr, rank) != 0)
         {
@@ -361,7 +389,11 @@
                                   MPIR_SCAN_TAG, comm_ptr->node_roots_comm->handle, 
                                   &status);
             noneed = 0;
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
     }
 
@@ -373,7 +405,11 @@
 
     if (comm_ptr->node_comm != NULL) {
         mpi_errno = MPIR_Bcast_impl(&noneed, 1, MPI_INT, 0, comm_ptr->node_comm);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) {
+            /* for communication errors, just record the error but continue */
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+        }
     }
 
     if (noneed == 0) {
@@ -382,7 +418,11 @@
 #endif
         if (comm_ptr->node_comm != NULL) {
             mpi_errno = MPIR_Bcast_impl(tempbuf, count, datatype, 0, comm_ptr->node_comm);
-            if(mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
 
         /* do reduce on tempbuf and recvbuf, finish scan. */
@@ -420,6 +460,8 @@
 
   fn_exit:
     MPIU_CHKLMEM_FREEALL();
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
 
   fn_fail:

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scatter.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scatter.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scatter.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -68,6 +68,7 @@
     int tmp_buf_size = 0;
     void *tmp_buf=NULL;
     int        mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     MPI_Comm comm;
     MPIU_CHKLMEM_DECL(4);
     
@@ -171,16 +172,24 @@
                     mpi_errno = MPIC_Recv(recvbuf, recvcnt, recvtype,
                                           src, MPIR_SCATTER_TAG, comm, 
                                           &status);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
                 }
                 else {
                     mpi_errno = MPIC_Recv(tmp_buf, tmp_buf_size, MPI_BYTE, src,
                                           MPIR_SCATTER_TAG, comm, &status);
-                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
-		    /* the recv size is larger than what may be sent in
-                       some cases. query amount of data actually received */
-                    MPIR_Get_count_impl(&status, MPI_BYTE, &curr_cnt);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                        curr_cnt = 0;
+                    } else
+                        /* the recv size is larger than what may be sent in
+                           some cases. query amount of data actually received */
+                        MPIR_Get_count_impl(&status, MPI_BYTE, &curr_cnt);
                 }
                 break;
             }
@@ -218,7 +227,11 @@
                                            MPI_BYTE, dst,
                                            MPIR_SCATTER_TAG, comm);
                 }
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
                 curr_cnt -= send_subtree_cnt;
             }
             mask >>= 1;
@@ -319,10 +332,15 @@
                 
                 mpi_errno = MPIC_Recv(tmp_buf, tmp_buf_size, MPI_BYTE, src,
                                      MPIR_SCATTER_TAG, comm, &status);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-                /* the recv size is larger than what may be sent in
-                   some cases. query amount of data actually received */
-                MPIR_Get_count_impl(&status, MPI_BYTE, &curr_cnt);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    curr_cnt = 0;
+                } else
+                    /* the recv size is larger than what may be sent in
+                       some cases. query amount of data actually received */
+                    MPIR_Get_count_impl(&status, MPI_BYTE, &curr_cnt);
                 break;
             }
             mask <<= 1;
@@ -344,7 +362,11 @@
                 mpi_errno = MPIC_Send (((char *)tmp_buf + nbytes*mask),
                                       send_subtree_cnt, MPI_BYTE, dst,
                                       MPIR_SCATTER_TAG, comm);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
                 curr_cnt -= send_subtree_cnt;
             }
             mask >>= 1;
@@ -364,6 +386,8 @@
     MPIU_CHKLMEM_FREEALL();
     /* check if multiple threads are calling this collective function */
     MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
  fn_fail:
     goto fn_exit;
@@ -396,6 +420,7 @@
 */
 
     int rank, local_size, remote_size, mpi_errno=MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     int i, nbytes, sendtype_size, recvtype_size;
     MPI_Status status;
     MPI_Aint extent, true_extent, true_lb = 0;
@@ -429,7 +454,11 @@
             /* root sends all data to rank 0 on remote group and returns */
             mpi_errno = MPIC_Send(sendbuf, sendcnt*remote_size,
                                   sendtype, 0, MPIR_SCATTER_TAG, comm);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
             goto fn_exit;
         }
         else {
@@ -454,7 +483,11 @@
                 mpi_errno = MPIC_Recv(tmp_buf, recvcnt*local_size,
                                       recvtype, root,
                                       MPIR_SCATTER_TAG, comm, &status);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
             }
             
             /* Get the local intracommunicator */
@@ -467,7 +500,11 @@
             mpi_errno = MPIR_Scatter_impl(tmp_buf, recvcnt, recvtype,
                                           recvbuf, recvcnt, recvtype, 0,
                                           newcomm_ptr);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
     }
     else {
@@ -478,13 +515,21 @@
                 mpi_errno = MPIC_Send(((char *)sendbuf+sendcnt*i*extent), 
                                       sendcnt, sendtype, i,
                                       MPIR_SCATTER_TAG, comm);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno) {
+                    /* for communication errors, just record the error but continue */
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                    MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                }
             }
         }
         else {
             mpi_errno = MPIC_Recv(recvbuf,recvcnt,recvtype,root,
                                   MPIR_SCATTER_TAG,comm,&status);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
     }
 
@@ -492,6 +537,8 @@
     MPIU_CHKLMEM_FREEALL();
     /* check if multiple threads are calling this collective function */
     MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
  fn_fail:
     goto fn_exit;

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scatterv.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scatterv.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scatterv.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -60,6 +60,7 @@
 	MPID_Comm *comm_ptr )
 {
     int rank, comm_size, mpi_errno = MPI_SUCCESS;
+    int mpi_errno_ret = MPI_SUCCESS;
     MPI_Comm comm;
     MPI_Aint extent;
     int      i, reqs;
@@ -120,7 +121,11 @@
             for (i = 0; i < reqs; i++) {
                 if (starray[i].MPI_ERROR != MPI_SUCCESS) {
                     mpi_errno = starray[i].MPI_ERROR;
-                    MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno) {
+                        /* for communication errors, just record the error but continue */
+                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+                    }
                 }
             }
         }
@@ -131,7 +136,11 @@
         if (recvcnt) {
             mpi_errno = MPIC_Recv(recvbuf,recvcnt,recvtype,root,
                                   MPIR_SCATTERV_TAG,comm,MPI_STATUS_IGNORE);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                /* for communication errors, just record the error but continue */
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+                MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+            }
         }
     }
     
@@ -140,6 +149,8 @@
     /* check if multiple threads are calling this collective function */
     MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
     MPIU_CHKLMEM_FREEALL();
+    if (mpi_errno_ret)
+        mpi_errno = mpi_errno_ret;
     return mpi_errno;
 fn_fail:
     goto fn_exit;

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/errhan/errnames.txt
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/errhan/errnames.txt	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/errhan/errnames.txt	2011-01-13 22:17:24 UTC (rev 7723)
@@ -883,6 +883,7 @@
 
 **signal:signal() failed
 **signal %s:signal() failed: %s
+**sigusr1:This version of MPICH requires the SIGUSR1 signal, but the application has already installed a handler
 
 #
 # mpi functions


Property changes on: mpich2/branches/release/mpich2-1.3.x/src/mpid
___________________________________________________________________
Added: svn:mergeinfo
   + /mpich2/branches/dev/ckpt/src/mpid:5050
/mpich2/branches/dev/ckpt2/src/mpid:5057-6537
/mpich2/branches/dev/error-return/src/mpid:7405-7603,7662-7670
/mpich2/branches/dev/ftb/src/mpid:5661-5730
/mpich2/branches/dev/lapi/src/mpid:5817
/mpich2/branches/dev/wintcp_async_progress/src/mpid:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/mpid:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/mpid:5406
/mpich2/trunk/src/mpid:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_post.h
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_post.h	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_post.h	2011-01-13 22:17:24 UTC (rev 7723)
@@ -10,12 +10,8 @@
 /* #define MPIDI_CH3_EAGER_MAX_MSG_SIZE (1500 - sizeof(MPIDI_CH3_Pkt_t)) */
 #define MPIDI_CH3_EAGER_MAX_MSG_SIZE   (128*1024)
 
-#define MPIDI_CH3_Progress_start(progress_state_)                                       \
-do {                                                                                    \
-    MPIU_THREAD_CS_ENTER(COMPLETION,);                                                  \
-    (progress_state_)->ch.completion_count = MPIDI_CH3I_progress_completion_count;      \
-    MPIU_THREAD_CS_EXIT(COMPLETION,);                                                   \
-} while (0)
+#define MPIDI_CH3_Progress_start(progress_state_)                                                       \
+        (progress_state_)->ch.completion_count = OPA_load_int(&MPIDI_CH3I_progress_completion_count);
 #define MPIDI_CH3_Progress_end(progress_state_)
 
 enum {

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_pre.h
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_pre.h	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_pre.h	2011-01-13 22:17:24 UTC (rev 7723)
@@ -14,6 +14,7 @@
     #include <winsock2.h>
     #include <windows.h>
 #endif
+#include "opa_primitives.h"
 
 /*#define MPID_USE_SEQUENCE_NUMBERS*/
 /*#define HAVE_CH3_PRE_INIT*/
@@ -130,5 +131,14 @@
 
 #define MPIDI_CH3_PROGRESS_STATE_DECL MPIDI_CH3I_Progress_state ch;
 
+extern OPA_int_t MPIDI_CH3I_progress_completion_count;
+#define MPIDI_CH3I_INCR_PROGRESS_COMPLETION_COUNT do {                                  \
+        OPA_write_barrier();                                                            \
+        OPA_incr_int(&MPIDI_CH3I_progress_completion_count);                            \
+        MPIU_DBG_MSG_D(CH3_PROGRESS,VERBOSE,                                            \
+                       "just incremented MPIDI_CH3I_progress_completion_count=%d",      \
+                       OPA_load_int(&MPIDI_CH3I_progress_completion_count));            \
+    } while(0)
+
 #endif /* !defined(MPICH_MPIDI_CH3_PRE_H_INCLUDED) */
 

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/include/mpid_nem_inline.h
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/include/mpid_nem_inline.h	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/include/mpid_nem_inline.h	2011-01-13 22:17:24 UTC (rev 7723)
@@ -23,7 +23,7 @@
 static inline int MPID_nem_mpich2_sendv_header (MPID_IOV **iov, int *n_iov, MPIDI_VC_t *vc, int *again);
 static inline int MPID_nem_recv_seqno_matches (MPID_nem_queue_ptr_t qhead);
 static inline int MPID_nem_mpich2_test_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox, int in_blocking_progress);
-static inline int MPID_nem_mpich2_blocking_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox);
+static inline int MPID_nem_mpich2_blocking_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox, int completions);
 static inline int MPID_nem_mpich2_test_recv_wait (MPID_nem_cell_ptr_t *cell, int *in_fbox, int timeout);
 static inline int MPID_nem_mpich2_release_cell (MPID_nem_cell_ptr_t cell, MPIDI_VC_t *vc);
 static inline void MPID_nem_mpich2_send_seg_header (MPID_Segment *segment, MPIDI_msg_sz_t *segment_first,
@@ -863,10 +863,9 @@
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 static inline int
-MPID_nem_mpich2_blocking_recv(MPID_nem_cell_ptr_t *cell, int *in_fbox)
+MPID_nem_mpich2_blocking_recv(MPID_nem_cell_ptr_t *cell, int *in_fbox, int completions)
 {
     int mpi_errno = MPI_SUCCESS;
-    unsigned completions = MPIDI_CH3I_progress_completion_count;
 #ifndef ENABLE_NO_YIELD
     int pollcount = 0;
 #endif
@@ -905,7 +904,7 @@
 	    mpi_errno = MPID_nem_network_poll(TRUE /* blocking */);
             if (mpi_errno) MPIU_ERR_POP (mpi_errno);
 
-            if (completions != MPIDI_CH3I_progress_completion_count || MPID_nem_local_lmt_pending || MPIDI_CH3I_active_send[CH3_NORMAL_QUEUE]
+            if (MPID_nem_local_lmt_pending || MPIDI_CH3I_active_send[CH3_NORMAL_QUEUE]
                 || MPIDI_CH3I_SendQ_head(CH3_NORMAL_QUEUE))
             {
                 *cell = NULL;
@@ -921,6 +920,12 @@
 	}
 	++pollcount;
 #endif
+
+        if (completions != OPA_load_int(&MPIDI_CH3I_progress_completion_count)) {
+            *cell = NULL;
+            *in_fbox = 0;
+            goto exit_l;
+        }
     }
 
     MPID_nem_queue_dequeue (MPID_nem_mem_region.my_recvQ, cell);

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/socksm.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/socksm.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/socksm.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -117,7 +117,7 @@
 static int find_free_entry(int *index);
 static int close_cleanup_and_free_sc_plfd(sockconn_t *const sc);
 static int cleanup_and_free_sc_plfd(sockconn_t *const sc);
-static int error_closed(struct MPIDI_VC *const vc);
+static int error_closed(struct MPIDI_VC *const vc, int req_errno);
 
 #undef FUNCNAME
 #define FUNCNAME is_same_connection
@@ -584,10 +584,8 @@
         *got_sc_eof = 1;
         goto fn_exit;
     }
-    if (nread == -1 && errno != EAGAIN) {
-        MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**read", "**read %s", MPIU_Strerror(errno));
-    }
-    MPIU_ERR_CHKANDJUMP1(nread != hdr_len, mpi_errno, MPI_ERR_OTHER, "**read", "**read %s", MPIU_Strerror(errno));  /* FIXME-Z1 */
+    MPIU_ERR_CHKANDJUMP1(nread == -1 && errno != EAGAIN, mpi_errno, MPI_ERR_OTHER, "**read", "**read %s", MPIU_Strerror(errno));
+    MPIU_ERR_CHKANDJUMP(nread != hdr_len, mpi_errno, MPI_ERR_OTHER, "**read");  /* FIXME-Z1 */
     MPIU_Assert(hdr.pkt_type == MPIDI_NEM_TCP_SOCKSM_PKT_ID_INFO ||
 		hdr.pkt_type == MPIDI_NEM_TCP_SOCKSM_PKT_TMPVC_INFO);
     MPIU_Assert(hdr.datalen != 0);
@@ -603,10 +601,8 @@
 	    ++iov_cnt;
 	} 
 	CHECK_EINTR (nread, readv(sc->fd, iov, iov_cnt));
-        if (nread == -1 && errno != EAGAIN) {
-            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**read", "**read %s", MPIU_Strerror(errno));
-        }
-	MPIU_ERR_CHKANDJUMP1(nread != hdr.datalen, mpi_errno, MPI_ERR_OTHER, "**read", "**read %s", MPIU_Strerror(errno)); /* FIXME-Z1 */
+        MPIU_ERR_CHKANDJUMP1(nread == -1 && errno != EAGAIN, mpi_errno, MPI_ERR_OTHER, "**read", "**read %s", MPIU_Strerror(errno));
+	MPIU_ERR_CHKANDJUMP(nread != hdr.datalen, mpi_errno, MPI_ERR_OTHER, "**read"); /* FIXME-Z1 */
 	if (pg_id_len == 0) {
 	    sc->is_same_pg = TRUE;
             mpi_errno = MPID_nem_tcp_get_vc_from_conninfo (MPIDI_Process.my_pg->id,
@@ -665,10 +661,8 @@
         iov[0].iov_len = sizeof(sc->vc->port_name_tag);
 
         CHECK_EINTR (nread, readv(sc->fd, iov, iov_cnt));
-        if (nread == -1 && errno != EAGAIN) {
-            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**read", "**read %s", MPIU_Strerror(errno));
-        }
-        MPIU_ERR_CHKANDJUMP1(nread != hdr.datalen, mpi_errno, MPI_ERR_OTHER, "**read", "**read %s", MPIU_Strerror(errno)); /* FIXME-Z1 */
+        MPIU_ERR_CHKANDJUMP1(nread == -1 && errno != EAGAIN, mpi_errno, MPI_ERR_OTHER, "**read", "**read %s", MPIU_Strerror(errno));
+        MPIU_ERR_CHKANDJUMP(nread != hdr.datalen, mpi_errno, MPI_ERR_OTHER, "**read"); /* FIXME-Z1 */
         sc->is_same_pg = FALSE;
         sc->pg_id = NULL;
         sc->is_tmpvc = TRUE;
@@ -800,14 +794,10 @@
         int rc = 0;
 
         if (vc_tcp->connect_retry_count > MPIDI_NEM_TCP_MAX_CONNECT_RETRIES) {
-            int mpi_errno2 = MPI_SUCCESS;
             MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "exceeded retries, closing sc");
-            mpi_errno2 = error_closed(vc);
-            if (mpi_errno2) {
-                MPIU_ERR_SET(mpi_errno2, MPI_ERR_OTHER, "**tcp_cleanup_fail");
-                if (mpi_errno2) MPIU_ERR_ADD(mpi_errno, mpi_errno2);
-            }
             MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**exceeded_connect_tries", "**exceeded_connect_tries %d", vc->pg_rank);
+            mpi_errno = error_closed(vc, mpi_errno);
+            MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**tcp_cleanup_fail");
             goto fn_fail;
         }
         
@@ -987,6 +977,7 @@
 int close_cleanup_and_free_sc_plfd(sockconn_t *const sc)
 {
     int mpi_errno = MPI_SUCCESS;
+    int mpi_errno2 = MPI_SUCCESS;
     int rc;
     MPIDI_VC_t *const sc_vc = sc->vc;
     MPIDI_STATE_DECL(MPID_STATE_CLOSE_CLEANUP_AND_FREE_SC_PLFD);
@@ -1002,8 +993,8 @@
     if (rc == -1 && errno != EAGAIN && errno != EBADF)
         MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**close", "**close %s", MPIU_Strerror(errno));
 
-    mpi_errno = cleanup_and_free_sc_plfd(sc);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    mpi_errno2 = cleanup_and_free_sc_plfd(sc);
+    if (mpi_errno2) MPIU_ERR_ADD(mpi_errno, mpi_errno2);
 
  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_CLOSE_CLEANUP_AND_FREE_SC_PLFD);
@@ -1545,7 +1536,9 @@
                        the other side performs a tcp close() before we do and we
                        blow up here. */
                     MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "other side closed, but we're shutting down, closing sc");
-                    mpi_errno = MPID_nem_tcp_cleanup_on_error(sc_vc);
+                    /* this is not really an error, but cleanup_on_error performs the
+                       teardown we need; MPI_SUCCESS is passed as the request error */
+                    mpi_errno = MPID_nem_tcp_cleanup_on_error(sc_vc, MPI_SUCCESS);
                     goto fn_exit;
                 }
                 else
@@ -1643,14 +1636,12 @@
     return mpi_errno;
 fn_fail: /* comm related failures jump here */
     {
-        int cleanup_errno = MPI_SUCCESS;
 
-        cleanup_errno = MPID_nem_tcp_cleanup_on_error(sc_vc); /* QUIESCENT */
-        if (cleanup_errno) {
-            MPIU_ERR_SET(cleanup_errno, MPI_ERR_OTHER, "**tcp_cleanup_fail");
-            MPIU_ERR_ADD(mpi_errno, cleanup_errno);
-        }
         MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", sc_vc->pg_rank);
+        mpi_errno = MPID_nem_tcp_cleanup_on_error(sc_vc, mpi_errno);
+        if (mpi_errno) {
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**tcp_cleanup_fail");
+        }
     }
 fn_noncomm_fail: /* NON-comm related failures jump here */
     goto fn_exit;
@@ -1792,9 +1783,7 @@
     num_skipped_polls = 0;
 
     CHECK_EINTR(n, poll(MPID_nem_tcp_plfd_tbl, num_polled, 0));
-    if (n == -1) {
-        MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**poll", "**poll %s", MPIU_Strerror(errno));
-    }
+    MPIU_ERR_CHKANDJUMP1(n == -1, mpi_errno, MPI_ERR_OTHER, "**poll", "**poll %s", MPIU_Strerror(errno));
     /* MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "some sc fd poll event")); */
     for(i = 0; i < num_polled; i++)
     {
@@ -1806,7 +1795,7 @@
             /* We could check for POLLHUP here, but HUP/HUP+EOF is not erroneous
              * on many platforms, including modern Linux. */
             if (it_plfd->revents & POLLERR || it_plfd->revents & POLLNVAL) {
-                int cleanup_errno = MPI_SUCCESS;
+                int req_errno = MPI_SUCCESS;
                 int rc;
                 char dummy;
                 const char *err_str = "UNKNOWN";
@@ -1819,24 +1808,15 @@
                 
                 MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "error polling fd, closing sc");
                 if (it_sc->vc) {
-#ifdef HAVE_ERROR_CHECKING
-                    int pg_rank = it_sc->vc->pg_rank; /* vc goes away on cleanup */
-#endif
-                    cleanup_errno = MPID_nem_tcp_cleanup_on_error(it_sc->vc);
-                    if (cleanup_errno) {
-                        MPIU_ERR_SET(cleanup_errno, MPI_ERR_OTHER, "**tcp_cleanup_fail");
-                        MPIU_ERR_ADD(mpi_errno, cleanup_errno);
-                    }
-                    MPIU_ERR_SET2(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d %s", pg_rank, err_str);
+                    MPIU_ERR_SET2(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d %s", it_sc->vc->pg_rank, err_str);
+                    mpi_errno = MPID_nem_tcp_cleanup_on_error(it_sc->vc, req_errno);
+                    MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**tcp_cleanup_fail");
                 } else {
-                    cleanup_errno = close_cleanup_and_free_sc_plfd(it_sc);
-                    if (cleanup_errno) {
-                        MPIU_ERR_SET(cleanup_errno, MPI_ERR_OTHER, "**tcp_cleanup_fail");
-                        MPIU_ERR_ADD(mpi_errno, cleanup_errno);
-                    }
-                    MPIU_ERR_SET2(mpi_errno, MPI_ERR_OTHER, "**comm_fail_conn", "**comm_fail_conn %s %s", CONN_STATE_STR[it_sc->state.cstate], err_str);
+                    MPIU_ERR_SET2(req_errno, MPI_ERR_OTHER, "**comm_fail_conn", "**comm_fail_conn %s %s", CONN_STATE_STR[it_sc->state.cstate], err_str);
+                    mpi_errno = close_cleanup_and_free_sc_plfd(it_sc);
+                    MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**tcp_cleanup_fail");
                 }
-                goto fn_fail;
+                continue;
             }
             
             mpi_errno = it_sc->handler(it_plfd, it_sc);
@@ -1900,13 +1880,14 @@
         len = sizeof(SA_IN);
         MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "before accept"));
         if ((connfd = accept(l_sc->fd, (SA *) &rmt_addr, &len)) < 0) {
-            MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "after accept, l_sc=%p lstnfd=%d connfd=%d, errno=%d:%s ", l_sc, l_sc->fd, connfd, errno, MPIU_Strerror(errno)));
+            int save_errno = errno;
+            MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "after accept, l_sc=%p lstnfd=%d connfd=%d, errno=%d:%s ", l_sc, l_sc->fd, connfd, errno, MPIU_Strerror(save_errno)));
             if (errno == EINTR) 
                 continue;
             else if (errno == EWOULDBLOCK || errno == EAGAIN)
                 break; /*  no connection in the listen queue. get out of here.(N1) */
 
-            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**sock_accept", "**sock_accept %s", MPIU_Strerror(errno));
+            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**sock_accept", "**sock_accept %s", MPIU_Strerror(save_errno));
         }
         else {
             int index = -1;
@@ -1945,7 +1926,7 @@
 #define FUNCNAME error_closed
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int error_closed(struct MPIDI_VC *const vc)
+static int error_closed(struct MPIDI_VC *const vc, int req_errno)
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_nem_tcp_vc_area * const vc_tcp = VC_TCP(vc);
@@ -1957,8 +1938,10 @@
 
     mpi_errno = MPIDI_CH3U_Handle_connection(vc, MPIDI_VC_EVENT_TERMINATED);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-    /* complete pending send/recv requests with error ??? */
 
+    mpi_errno = MPID_nem_tcp_error_out_send_queue(vc, req_errno);
+    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_ERROR_CLOSED);
     return mpi_errno;
@@ -1967,12 +1950,13 @@
 }
 
 /* This is called when an communication error has occurred on a VC to
-   close the VC and release associated resources. */
+   close the VC and release associated resources.
+   Any requests still outstanding on the VC (e.g. those on its send queues) are completed with MPI_ERROR set to req_errno. */
 #undef FUNCNAME
 #define FUNCNAME MPID_nem_tcp_cleanup_on_error
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_tcp_cleanup_on_error(MPIDI_VC_t *const vc)
+int MPID_nem_tcp_cleanup_on_error(MPIDI_VC_t *const vc, int req_errno)
 {
     int mpi_errno = MPI_SUCCESS;
     int mpi_errno2 = MPI_SUCCESS;
@@ -1983,7 +1967,7 @@
     mpi_errno = MPID_nem_tcp_cleanup(vc);
     /* not jumping on error, keep going */
     
-    mpi_errno2 = error_closed(vc);
+    mpi_errno2 = error_closed(vc, req_errno);
     if (mpi_errno2) MPIU_ERR_ADD(mpi_errno, mpi_errno2);
 
  fn_exit:

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_impl.h
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_impl.h	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_impl.h	2011-01-13 22:17:24 UTC (rev 7723)
@@ -93,7 +93,8 @@
 int MPID_nem_tcp_is_sock_connected(int fd);
 int MPID_nem_tcp_disconnect(struct MPIDI_VC *const vc);
 int MPID_nem_tcp_cleanup (struct MPIDI_VC *const vc);
-int MPID_nem_tcp_cleanup_on_error(MPIDI_VC_t *const vc);
+int MPID_nem_tcp_cleanup_on_error(MPIDI_VC_t *const vc, int req_errno);
+int MPID_nem_tcp_error_out_send_queue(struct MPIDI_VC *const vc, int req_errno);
 int MPID_nem_tcp_ckpt_cleanup(void);
 int MPID_nem_tcp_state_listening_handler(struct pollfd *const l_plfd, sockconn_t *const l_sc);
 int MPID_nem_tcp_send_queued(MPIDI_VC_t *vc, MPIDI_nem_tcp_request_queue_t *send_queue);

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_init.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_init.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_init.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -562,6 +562,7 @@
 int MPID_nem_tcp_vc_terminate (MPIDI_VC_t *vc)
 {
     int mpi_errno = MPI_SUCCESS;
+    int req_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_NEM_TCP_VC_TERMINATE);
 
     MPIDI_FUNC_ENTER(MPID_NEM_TCP_VC_TERMINATE);
@@ -569,6 +570,10 @@
     mpi_errno = MPID_nem_tcp_cleanup(vc);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
     
+    MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+    mpi_errno = MPID_nem_tcp_error_out_send_queue(vc, req_errno);
+    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
  fn_exit:
     MPIDI_FUNC_EXIT(MPID_NEM_TCP_VC_TERMINATE);
     return mpi_errno;

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_send.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_send.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_send.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -98,13 +98,13 @@
         
         CHECK_EINTR(offset, writev(vc_tcp->sc->fd, iov, sreq->dev.iov_count));
         if (offset == 0) {
-            int cleanup_errno = MPI_SUCCESS;
+            int req_errno = MPI_SUCCESS;
 
-            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
-            MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
-            cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
-            if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
-            goto fn_fail;
+            MPIU_ERR_SET(req_errno, MPI_ERR_OTHER, "**sock_closed");
+            MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+            mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            goto fn_exit; /* this vc is closed now, just bail out */
         }
         if (offset == -1)
         {
@@ -114,13 +114,12 @@
                 MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "EAGAIN");
                 break;
             } else {
-                int cleanup_errno = MPI_SUCCESS;
-
-                MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror (errno));
-                MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
-                cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
-                if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
-                goto fn_fail;
+                int req_errno = MPI_SUCCESS;
+                MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror(errno));
+                MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+                mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                goto fn_exit; /* this vc is closed now, just bail out */
             }
         }
         MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "write " MPIDI_MSG_SZ_FMT, offset);
@@ -264,12 +263,12 @@
                 
                 CHECK_EINTR(offset, writev(sc->fd, iov, 2));
                 if (offset == 0) {
-                    int cleanup_errno = MPI_SUCCESS;
+                    int req_errno = MPI_SUCCESS;
 
-                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
-                    MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
-                    cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
-                    if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+                    MPIU_ERR_SET(req_errno, MPI_ERR_OTHER, "**sock_closed");
+                    MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+                    mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                     goto fn_fail;
                 }
                 if (offset == -1)
@@ -277,12 +276,11 @@
                     if (errno == EAGAIN)
                         offset = 0;
                     else {
-                        int cleanup_errno = MPI_SUCCESS;
-
-                        MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror (errno));
-                        MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
-                        cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
-                        if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+                        int req_errno = MPI_SUCCESS;
+                        MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror(errno));
+                        MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+                        mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+                        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                         goto fn_fail;
                     }
                 }
@@ -404,12 +402,12 @@
                 
             CHECK_EINTR(offset, writev(sc->fd, iov, 2));
             if (offset == 0) {
-                int cleanup_errno = MPI_SUCCESS;
+                int req_errno = MPI_SUCCESS;
 
-                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
-                MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
-                cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
-                if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+                MPIU_ERR_SET(req_errno, MPI_ERR_OTHER, "**sock_closed");
+                MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+                mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                 goto fn_fail;
             }
             if (offset == -1)
@@ -417,12 +415,12 @@
                 if (errno == EAGAIN)
                     offset = 0;
                 else {
-                    int cleanup_errno = MPI_SUCCESS;
+                    int req_errno = MPI_SUCCESS;
+                    MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror(errno));
+                    MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
 
-                    MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror (errno));
-                    MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
-                    cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc); /* ignoring return code */
-                    if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+                    mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                     goto fn_fail;
                 }
             }
@@ -539,12 +537,12 @@
                 
                 CHECK_EINTR(offset, writev(sc->fd, iov, 2));
                 if (offset == 0) {
-                    int cleanup_errno = MPI_SUCCESS;
+                    int req_errno = MPI_SUCCESS;
 
-                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
-                    MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
-                    cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
-                    if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+                    MPIU_ERR_SET(req_errno, MPI_ERR_OTHER, "**sock_closed");
+                    MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+                    mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                     goto fn_fail;
                 }
                 if (offset == -1)
@@ -552,12 +550,11 @@
                     if (errno == EAGAIN)
                         offset = 0;
                     else {
-                        int cleanup_errno = MPI_SUCCESS;
-
-                        MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror (errno));
-                        MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
-                        cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
-                        if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+                        int req_errno = MPI_SUCCESS;
+                        MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror(errno));
+                        MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+                        mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+                        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                         goto fn_fail;
                     }
                 }
@@ -697,12 +694,12 @@
             {
                 CHECK_EINTR(offset, writev(vc_tcp->sc->fd, iov, iov_n));
                 if (offset == 0) {
-                    int cleanup_errno = MPI_SUCCESS;
+                    int req_errno = MPI_SUCCESS;
 
-                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
-                    MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
-                    cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
-                    if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+                    MPIU_ERR_SET(req_errno, MPI_ERR_OTHER, "**sock_closed");
+                    MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+                    mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                     goto fn_fail;
                 }
                 if (offset == -1)
@@ -710,12 +707,11 @@
                     if (errno == EAGAIN)
                         offset = 0;
                     else {
-                        int cleanup_errno = MPI_SUCCESS;
-
-                        MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror (errno));
-                        MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
-                        cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
-                        if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+                        int req_errno = MPI_SUCCESS;
+                        MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror(errno));
+                        MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+                        mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+                        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                         goto fn_fail;
                     }
                 }
@@ -814,3 +810,42 @@
     MPIDI_CH3_Request_destroy(sreq);
     goto fn_exit;
 }
+
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_tcp_error_out_send_queue
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int MPID_nem_tcp_error_out_send_queue(struct MPIDI_VC *const vc, int req_errno)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPID_Request *req;
+    MPID_nem_tcp_vc_area *const vc_tcp = VC_TCP(vc);
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_TCP_ERROR_OUT_SEND_QUEUE);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_TCP_ERROR_OUT_SEND_QUEUE);
+
+    /* we don't call onDataAvail or onFinal handlers because this is
+       an error condition and we just want to mark them as complete */
+
+    /* send queue */
+    while (!SENDQ_EMPTY(vc_tcp->send_queue)) {
+        SENDQ_DEQUEUE(&vc_tcp->send_queue, &req);
+        req->status.MPI_ERROR = req_errno;
+
+        MPIDI_CH3U_Request_complete(req);
+    }
+
+    /* paused send queue */
+    while (!SENDQ_EMPTY(vc_tcp->paused_send_queue)) {
+        SENDQ_DEQUEUE(&vc_tcp->paused_send_queue, &req);
+        req->status.MPI_ERROR = req_errno;
+
+        MPIDI_CH3U_Request_complete(req);
+    }
+
+ fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_TCP_ERROR_OUT_SEND_QUEUE);
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}


Property changes on: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ckpt/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5050
/mpich2/branches/dev/ckpt2/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5057-6537
/mpich2/branches/dev/ftb/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5661-5730
/mpich2/branches/dev/lapi/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5817
/mpich2/branches/dev/win_rrvm/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:6416,6428
/mpich2/branches/dev/wintcp_async_progress/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5406
/mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/ckpt/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5050
/mpich2/branches/dev/ckpt2/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5057-6537
/mpich2/branches/dev/error-return/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:7405-7603,7662-7670
/mpich2/branches/dev/ftb/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5661-5730
/mpich2/branches/dev/lapi/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5817
/mpich2/branches/dev/win_rrvm/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:6416,6428
/mpich2/branches/dev/wintcp_async_progress/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5406
/mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/src/mpid_nem_ckpt.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/src/mpid_nem_ckpt.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/src/mpid_nem_ckpt.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -68,9 +68,12 @@
     int rc, ret;
     const struct cr_restart_info* ri;
 
-    if (MPIDI_Process.my_pg_rank == 0)
+    if (MPIDI_Process.my_pg_rank == 0) {
         MPIDI_nem_ckpt_start_checkpoint = TRUE;
-
+        /* poke the progress engine in case we're waiting in a blocking recv */
+        MPIDI_CH3_Progress_signal_completion();
+    }
+    
     ret = sem_wait(&ckpt_sem);
     CHECK_ERR(ret, "sem_wait");
 

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_isend.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_isend.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_isend.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -24,6 +24,13 @@
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_ISEND);
 
+    if (vc->state == MPIDI_VC_STATE_MORIBUND) {
+        sreq->status.MPI_ERROR = MPI_SUCCESS;
+        MPIU_ERR_SET1(sreq->status.MPI_ERROR, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+        MPIDI_CH3U_Request_complete(sreq);
+        goto fn_fail;
+    }
+
     if (((MPIDI_CH3I_VC *)vc->channel_private)->iSendContig)
     {
         mpi_errno = ((MPIDI_CH3I_VC *)vc->channel_private)->iSendContig(vc, sreq, hdr, hdr_sz, NULL, 0);

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_isendv.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_isendv.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_isendv.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -28,6 +28,13 @@
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_ISENDV);
 
+    if (vc->state == MPIDI_VC_STATE_MORIBUND) {
+        sreq->status.MPI_ERROR = MPI_SUCCESS;
+        MPIU_ERR_SET1(sreq->status.MPI_ERROR, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+        MPIDI_CH3U_Request_complete(sreq);
+        goto fn_fail;
+    }
+
     if (vc_ch->iSendContig)
     {
         MPIU_Assert(n_iov > 0);

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_istartmsg.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_istartmsg.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_istartmsg.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -33,6 +33,8 @@
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_ISTARTMSG);
 
+    MPIU_ERR_CHKANDJUMP1(vc->state == MPIDI_VC_STATE_MORIBUND, mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+
     if (((MPIDI_CH3I_VC *)vc->channel_private)->iStartContigMsg)
     {
         mpi_errno = ((MPIDI_CH3I_VC *)vc->channel_private)->iStartContigMsg(vc, hdr, hdr_sz, NULL, 0, sreq_ptr);

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_istartmsgv.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_istartmsgv.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_istartmsgv.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -41,6 +41,8 @@
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_ISTARTMSGV);
 
+    MPIU_ERR_CHKANDJUMP1(vc->state == MPIDI_VC_STATE_MORIBUND, mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+
     if (((MPIDI_CH3I_VC *)vc->channel_private)->iStartContigMsg)
     {
         MPIU_Assert (n_iov > 0);

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_progress.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_progress.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_progress.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -11,6 +11,9 @@
 #if defined (MPID_NEM_INLINE) && MPID_NEM_INLINE
 #include "mpid_nem_inline.h"
 #endif
+#ifdef HAVE_SIGNAL_H
+#include <signal.h>
+#endif
 
 
 #define PKTARRAY_SIZE (MPIDI_NEM_PKT_END+1)
@@ -30,11 +33,7 @@
 extern MPID_Request ** const MPID_Recvq_unexpected_tail_ptr;
 #endif
 
-/* MT any races on this var reported by DRD/helgrind/TSan are probably bugs.
- * This var is protected by the COMPLETION critical section in non-global mode. */
-/* FIXME volatile is probably unnecessary, access is arbitrated entirely by
- * mutex, but the decl is shared among channels */
-volatile unsigned int MPIDI_CH3I_progress_completion_count = 0;
+OPA_int_t MPIDI_CH3I_progress_completion_count = OPA_INT_T_INITIALIZER(0);
 
 /* NEMESIS MULTITHREADING: Extra Data Structures Added */
 #ifdef MPICH_IS_THREADED
@@ -46,6 +45,9 @@
 #endif /* MPICH_IS_THREADED */
 /* NEMESIS MULTITHREADING - End block*/
 
+static volatile int sigusr1_count = 0;
+static int my_sigusr1_count = 0;
+
 struct MPID_Request *MPIDI_CH3I_sendq_head[CH3_NUM_QUEUES] = {0};
 struct MPID_Request *MPIDI_CH3I_sendq_tail[CH3_NUM_QUEUES] = {0};
 struct MPID_Request *MPIDI_CH3I_active_send[CH3_NUM_QUEUES] = {0};
@@ -66,6 +68,13 @@
 
 static qn_ent_t *qn_head = NULL;
 
+static void sigusr1_handler(int sig)
+{
+    ++sigusr1_count;
+    /* poke the progress engine in case we're waiting in a blocking recv */
+    MPIDI_CH3_Progress_signal_completion();
+}
+
 /* MPIDI_CH3I_Shm_send_progress() this function makes progress sending
    queued messages on the shared memory queues.  This function is
    nonblocking and does not call netmod functions..*/
@@ -241,6 +250,12 @@
         MPIU_Assert(progress_state != NULL);
     }
 
+    if (sigusr1_count > my_sigusr1_count) {
+        my_sigusr1_count = sigusr1_count;
+        mpi_errno = MPIDI_CH3U_Check_for_failed_procs();
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
+    
 #ifdef ENABLE_CHECKPOINTING
     if (MPIR_PARAM_ENABLE_CKPOINT) {
         if (MPIDI_nem_ckpt_start_checkpoint) {
@@ -326,7 +341,7 @@
 #endif
                 )
             {
-                mpi_errno = MPID_nem_mpich2_blocking_recv(&cell, &in_fbox);
+                mpi_errno = MPID_nem_mpich2_blocking_recv(&cell, &in_fbox, progress_state->ch.completion_count);
             }
             else
             {
@@ -431,19 +446,20 @@
 
         /* in the case of progress_wait, bail out if anything completed (CC-1) */
         if (is_blocking) {
-            int made_progress = FALSE;
-            MPIU_THREAD_CS_ENTER(COMPLETION,);
-            if (progress_state->ch.completion_count != MPIDI_CH3I_progress_completion_count) {
-                made_progress = TRUE;
+            int completion_count = OPA_load_int(&MPIDI_CH3I_progress_completion_count);
+            if (progress_state->ch.completion_count != completion_count) {
+                /* Read barrier to make sure no reads get values before the
+                   completion counter was incremented  */
+                OPA_read_barrier();
                 /* reset for the next iteration */
-                progress_state->ch.completion_count = MPIDI_CH3I_progress_completion_count;
+                progress_state->ch.completion_count = completion_count;
+                break;
             }
-            MPIU_THREAD_CS_EXIT(COMPLETION,);
-            if (made_progress) break;
         }
     }
     while (is_blocking);
 
+    
 #ifdef MPICH_IS_THREADED
     MPIU_THREAD_CHECK_BEGIN;
     {
@@ -481,15 +497,9 @@
     {
 	while (1)
 	{
-            /* we also currently hold the MPIDCOMM CS */
-            MPIU_THREAD_CS_ENTER(COMPLETION,);
-            if (completion_count != MPIDI_CH3I_progress_completion_count ||
+            if (completion_count != OPA_load_int(&MPIDI_CH3I_progress_completion_count) ||
                 MPIDI_CH3I_progress_blocked != TRUE)
-            {
-                MPIU_THREAD_CS_EXIT(COMPLETION,);
                 break;
-            }
-            MPIU_THREAD_CS_EXIT(COMPLETION,);
 	    MPID_Thread_cond_wait(&MPIDI_CH3I_progress_completion_cond, &MPIR_ThreadInfo.global_mutex/*MPIDCOMM*/);
 	}
     }
@@ -787,6 +797,18 @@
     /* other pkt handlers */
     pktArray[MPIDI_NEM_PKT_NETMOD] = pkt_NETMOD_handler;
    
+#ifdef HAVE_SIGNAL
+    {
+        /* install signal handler for process failure notifications from hydra */
+        void *ret;
+        
+        ret = signal(SIGUSR1, &sigusr1_handler);
+        MPIU_ERR_CHKANDJUMP1(ret == SIG_ERR, mpi_errno, MPI_ERR_OTHER, "**signal", "**signal %s", MPIU_Strerror(errno));
+        /* Error if the app set its own SIGUSR1 handler. */
+        MPIU_ERR_CHKANDJUMP(ret != SIG_DFL, mpi_errno, MPI_ERR_OTHER, "**sigusr1");
+    }
+#endif
+
  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PROGRESS_INIT);
     return mpi_errno;
@@ -826,6 +848,14 @@
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_CONNECTION_TERMINATE);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_CONNECTION_TERMINATE);
+
+    MPIU_DBG_MSG_D(CH3_DISCONNECT, TYPICAL, "Terminating VC %d", vc->pg_rank);
+
+    /* if this is already closed, exit */
+    if (vc->state == MPIDI_VC_STATE_MORIBUND ||
+        vc->state == MPIDI_VC_STATE_INACTIVE_CLOSED)
+        goto fn_exit;
+
     if (((MPIDI_CH3I_VC *)vc->channel_private)->is_local)
         mpi_errno = MPID_nem_vc_terminate(vc);
     else

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/sctp/include/mpidi_ch3_pre.h
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/sctp/include/mpidi_ch3_pre.h	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/sctp/include/mpidi_ch3_pre.h	2011-01-13 22:17:24 UTC (rev 7723)
@@ -229,4 +229,9 @@
 
 #define MPIDI_CH3_PROGRESS_STATE_DECL MPIDI_CH3I_Progress_state ch;
 
+
+/* This variable is used in the definitions of the MPID_Progress_xxx macros,
+   and must be available to the routines in src/mpi */
+extern volatile unsigned int MPIDI_CH3I_progress_completion_count;
+
 #endif /* !defined(MPICH_MPIDI_CH3_PRE_H_INCLUDED) */

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/sock/include/mpidi_ch3_pre.h
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/sock/include/mpidi_ch3_pre.h	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/sock/include/mpidi_ch3_pre.h	2011-01-13 22:17:24 UTC (rev 7723)
@@ -58,6 +58,11 @@
 
 #define MPIDI_CH3_PROGRESS_STATE_DECL MPIDI_CH3I_Progress_state ch;
 
+/* This variable is used in the definitions of the MPID_Progress_xxx macros,
+   and must be available to the routines in src/mpi */
+extern volatile unsigned int MPIDI_CH3I_progress_completion_count;
+
+
 /* MPICH_IS_THREADED isn't defined yet (handled by mpiimplthread.h) */
 #if (MPICH_THREAD_LEVEL == MPI_THREAD_MULTIPLE)
 #define MPIDI_CH3I_PROGRESS_WAKEUP                                                                \

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/include/mpidimpl.h
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/include/mpidimpl.h	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/include/mpidimpl.h	2011-01-13 22:17:24 UTC (rev 7723)
@@ -787,6 +787,8 @@
 typedef struct MPIDI_VC * MPID_VCR;
 #endif
 
+/* number of VCs that are in MORIBUND state */
+extern int MPIDI_Failed_vc_count;
 
 /* Initialize a new VC */
 int MPIDI_VC_Init( MPIDI_VC_t *, MPIDI_PG_t *, int );
@@ -1440,6 +1442,7 @@
 MPID_Request * MPIDI_CH3U_Recvq_FDP_or_AEU(MPIDI_Message_match * match, 
 					   int * found);
 int MPIDI_CH3U_Recvq_count_unexp(void);
+int MPIDI_CH3U_Complete_posted_with_error(MPIDI_VC_t *vc);
 
 
 int MPIDI_CH3U_Request_load_send_iov(MPID_Request * const sreq, 
@@ -1583,6 +1586,9 @@
 #else
 #define MPIDI_CH3_Channel_close( )   MPI_SUCCESS
 #endif
+/* MPIDI_CH3U_Check_for_failed_procs() reads PMI_dead_processes key
+   and marks VCs to those processes as failed */
+int MPIDI_CH3U_Check_for_failed_procs(void);
 
 /*@
   MPIDI_CH3_Pre_init - Allows the channel to initialize before PMI_init is 

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/include/mpidpost.h
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/include/mpidpost.h	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/include/mpidpost.h	2011-01-13 22:17:24 UTC (rev 7723)
@@ -159,10 +159,6 @@
 @*/
 void MPIDI_CH3U_Request_destroy(MPID_Request * req);
 
-/* This variable is used in the definitions of the MPID_Progress_xxx macros,
-   and must be available to the routines in src/mpi */
-extern volatile unsigned int MPIDI_CH3I_progress_completion_count;
-
 /* Include definitions from the channel which require items defined by this 
    file (mpidimpl.h) or the file it includes
    (mpiimpl.h). */

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/ch3u_handle_connection.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/ch3u_handle_connection.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/ch3u_handle_connection.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -5,11 +5,12 @@
  */
 
 #include "mpidimpl.h"
+#include "pmi.h"
 
 /* Count the number of outstanding close requests */
 static volatile int MPIDI_Outstanding_close_ops = 0;
+int MPIDI_Failed_vc_count = 0;
 
-
 #undef FUNCNAME
 #define FUNCNAME MPIDI_CH3U_Handle_connection
 #undef FCNAME
@@ -65,6 +66,19 @@
 
 		    break;
 
+                case MPIDI_VC_STATE_INACTIVE:
+                    /* VC was terminated before it was activated.
+                       This can happen if a failed process was
+                       detected before the process used the VC. */
+                    MPIU_DBG_MSG(CH3_DISCONNECT,TYPICAL, "VC terminated before it was activated.  We probably got a failed"
+                                 " process notification.");
+                    MPIDI_CH3U_Complete_posted_with_error(vc);
+                    ++MPIDI_Failed_vc_count;
+                    MPIDI_CHANGE_VC_STATE(vc, MORIBUND);
+
+                    break;
+
+                    
                 case MPIDI_VC_STATE_ACTIVE:
                 case MPIDI_VC_STATE_REMOTE_CLOSE:
                     /* This is a premature termination.  This process
@@ -74,6 +88,9 @@
                     
  		    MPIU_DBG_MSG(CH3_DISCONNECT,TYPICAL, "Connection closed prematurely.");
 
+                    MPIDI_CH3U_Complete_posted_with_error(vc);
+                    ++MPIDI_Failed_vc_count;
+                    
                     MPIDU_Ftb_publish_vc(MPIDU_FTB_EV_UNREACHABLE, vc);
                     MPIDI_CHANGE_VC_STATE(vc, MORIBUND);
 
@@ -98,6 +115,10 @@
 
  		    MPIU_DBG_MSG_D(CH3_DISCONNECT,TYPICAL, "Connection closed prematurely during close protocol.  "
                                    "Outstanding close operations = %d", MPIDI_Outstanding_close_ops);
+
+                    MPIDI_CH3U_Complete_posted_with_error(vc);
+                    ++MPIDI_Failed_vc_count;
+
                     MPIDU_Ftb_publish_vc(MPIDU_FTB_EV_UNREACHABLE, vc);
                     MPIDI_CHANGE_VC_STATE(vc, MORIBUND);
                     
@@ -118,7 +139,7 @@
 		    mpi_errno = MPIR_Err_create_code(
 			MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, 
                         MPI_ERR_INTERN, "**ch3|unhandled_connection_state",
-			"**ch3|unhandled_connection_state %p %d", vc, event);
+			"**ch3|unhandled_connection_state %p %d", vc, vc->state);
                     goto fn_fail;
 		    break;
 		}
@@ -366,3 +387,76 @@
     return mpi_errno;
 }
 
+#define parse_rank(r_p) do {                                                                    \
+        while (isspace((unsigned char)*c)) /* skip spaces; cast avoids UB on negative char */   \
+            ++c;                                                                                \
+        MPIU_ERR_CHKINTERNAL(!isdigit((unsigned char)*c), mpi_errno, "error parsing failed process list"); \
+        *(r_p) = strtol(c, &c, 0);                                                              \
+        while (isspace((unsigned char)*c)) /* skip spaces */                                    \
+            ++c;                                                                                \
+    } while (0)
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3U_Check_for_failed_procs
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int MPIDI_CH3U_Check_for_failed_procs(void)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int pmi_errno;
+    char *val;
+    char *c;
+    int len;
+    char *kvsname;
+    int rank, rank_hi;
+    MPIU_CHKLMEM_DECL(1);
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_CHECK_FOR_FAILED_PROCS);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_CHECK_FOR_FAILED_PROCS);
+    mpi_errno = MPIDI_PG_GetConnKVSname(&kvsname);
+    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    pmi_errno = PMI_KVS_Get_value_length_max(&len);
+    MPIU_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**pmi_kvs_get_value_length_max");
+    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    MPIU_CHKLMEM_MALLOC(val, char *, len, mpi_errno, "val");
+    pmi_errno = PMI_KVS_Get(kvsname, "PMI_dead_processes", val, len);
+    MPIU_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**pmi_kvs_get");
+
+    MPIU_DBG_MSG_S(CH3_DISCONNECT, TYPICAL, "Received proc fail notification: %s", val);
+    
+    if (*val == '\0')
+        /* there are no failed processes */
+        goto fn_exit;
+
+    /* parse list of failed processes.  This is a comma separated list
+       of ranks or ranges of ranks (e.g., "1, 3-5, 11") */
+    c = val;
+    while(1) {
+        parse_rank(&rank);
+        if (*c == '-') {
+            ++c; /* skip '-' */
+            parse_rank(&rank_hi);
+        } else
+            rank_hi = rank;
+        while (rank <= rank_hi) {
+            MPIDI_VC_t *vc;
+            MPIDI_PG_Get_vc(MPIDI_Process.my_pg, rank, &vc);
+            mpi_errno = MPIU_CALL(MPIDI_CH3,Connection_terminate(vc));
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            ++rank;
+        }
+        MPIU_ERR_CHKINTERNAL(*c != ',' && *c != '\0', mpi_errno, "error parsing failed process list");
+        if (*c == '\0')
+            break;
+        ++c; /* skip ',' */
+    }
+
+ fn_exit:
+    MPIU_CHKLMEM_FREEALL();
+    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_CHECK_FOR_FAILED_PROCS);
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
+

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/ch3u_recvq.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/ch3u_recvq.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/ch3u_recvq.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -344,9 +344,11 @@
     /* A matching request was not found in the unexpected queue, so we 
        need to allocate a new request and add it to the posted queue */
     {
-	int mpi_errno=0;
-	MPIDI_Request_create_rreq( rreq, mpi_errno, 
-				   found = FALSE;goto lock_exit );
+	int mpi_errno = MPI_SUCCESS;
+
+        found = FALSE;
+
+	MPIDI_Request_create_rreq( rreq, mpi_errno, goto lock_exit );
 	rreq->dev.match.parts.tag	   = tag;
 	rreq->dev.match.parts.rank	   = source;
 	rreq->dev.match.parts.context_id   = context_id;
@@ -368,7 +370,26 @@
         rreq->dev.user_buf         = user_buf;
         rreq->dev.user_count       = user_count;
         rreq->dev.datatype         = datatype;
-	rreq->dev.next		   = NULL;
+
+        /* check whether VC has failed, or this is an ANY_SOURCE in a
+           failed communicator */
+        if (source != MPI_ANY_SOURCE) {
+            MPIDI_VC_t *vc;
+            MPIDI_Comm_get_vc(comm, source, &vc);
+            if (vc->state == MPIDI_VC_STATE_MORIBUND) {
+                MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+                rreq->status.MPI_ERROR = mpi_errno;
+                MPIDI_CH3U_Request_complete(rreq);
+                goto lock_exit;
+            }
+        } else if (MPID_VCRT_Contains_failed_vc(comm->vcrt)) {
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**comm_fail");
+            rreq->status.MPI_ERROR = mpi_errno;
+            MPIDI_CH3U_Request_complete(rreq);
+            goto lock_exit;
+        }
+        
+	rreq->dev.next = NULL;
 	if (recvq_posted_tail != NULL) {
 	    recvq_posted_tail->dev.next = rreq;
 	}
@@ -379,8 +400,6 @@
 	MPIDI_POSTED_RECV_ENQUEUE_HOOK(rreq);
     }
     
-    found = FALSE;
-
   lock_exit:
 
     *foundp = found;
@@ -540,6 +559,121 @@
     return rreq;
 }
 
+/* returns TRUE iff vc is the VC that req's match rank refers to in req's communicator */
+static inline int req_uses_vc(const MPID_Request* req, const MPIDI_VC_t *vc)
+{
+    MPIDI_VC_t *vc1;
+    
+    MPIDI_Comm_get_vc(req->comm, req->dev.match.parts.rank, &vc1);
+    return vc == vc1;
+}
+
+/* returns TRUE iff the vc is part of the comm */
+static inline int is_vc_in_comm(const MPIDI_VC_t *vc, const MPID_Comm *comm)
+{
+    int i;
+
+    for (i = 0; i < comm->remote_size; ++i) {
+        MPIDI_VC_t *vc1;
+        MPIDI_Comm_get_vc(comm, i, &vc1);
+        if (vc == vc1)
+            return TRUE;
+    }
+    return FALSE;
+}
+
+#undef FUNCNAME
+#define FUNCNAME dequeue_and_set_error
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+/* This dequeues req from the posted recv queue, sets req's error code to comm_fail, and updates the req pointer.
+   Note that this creates a new error code if one hasn't already been created (i.e., if *error is MPI_SUCCESS). */
+static inline void dequeue_and_set_error(MPID_Request **req,  MPID_Request *prev_req, int *error, int rank)
+{
+    MPID_Request *next = (*req)->dev.next;
+
+    if (*error == MPI_SUCCESS)
+        MPIU_ERR_SET1(*error, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", rank);
+
+    /* remove from queue */
+    if (recvq_posted_head == *req)
+        recvq_posted_head = (*req)->dev.next;
+    else
+        prev_req->dev.next = (*req)->dev.next;
+    if (recvq_posted_tail == *req)
+        recvq_posted_tail = prev_req;
+    
+    /* set error and complete */
+    (*req)->status.MPI_ERROR = *error;
+    MPIDI_CH3U_Request_complete(*req);
+    *req = next;
+}
+
+
+/* FUNCNAME (and FCNAME, derived from it via MPIU_QUOTE) must name the function defined below. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3U_Complete_posted_with_error
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int MPIDI_CH3U_Complete_posted_with_error(MPIDI_VC_t *vc)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPID_Request *req, *prev_req;
+    int error = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_MPIDU_COMPLETE_POSTED_WITH_ERROR);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_MPIDU_COMPLETE_POSTED_WITH_ERROR);
+
+    MPIU_THREAD_CS_ENTER(MSGQUEUE,);
+
+    /* check each req to see if the VC is part of that communicator */
+    req = recvq_posted_head;
+    prev_req = NULL;
+    while (req) {
+        if (req->dev.match.parts.rank != MPI_ANY_SOURCE && req_uses_vc(req, vc)) {
+            /* this req is expected on the VC */
+            dequeue_and_set_error(&req, prev_req, &error, vc->pg_rank);
+        } else if (req->dev.match.parts.rank == MPI_ANY_SOURCE && is_vc_in_comm(vc, req->comm)) {
+            /* This req is an ANY_SOURCE and is expected on a communicator that includes the VC.
+               We need to dequeue all anysources posted in a communicator with a failed VC.  We
+               check whether the VC is in the communicator by iterating over the comm's VC table.
+               Since this may be expensive, now that we know the VC is in comm, we take the
+               opportunity to scan the rest of the posted recv queue for other anysources with
+               the same communicator.  Note that in the worst case this is O(N*M), where N is the
+               number of posted requests and M is the number of communicators.  This can happen
+               if every req is an anysource and uses a different communicator.  We can possibly
+               conditionally execute the optimization based on number of comms, number of posted
+               requests and communicator size. */
+            MPID_Request *as_req = req->dev.next;
+            MPID_Request *prev_as_req = req;
+            /* First remove any AS recvs on this comm that were posted AFTER this req */
+            while (as_req) {
+                if (as_req->comm == req->comm && as_req->dev.match.parts.rank == MPI_ANY_SOURCE) {
+                    dequeue_and_set_error(&as_req, prev_as_req, &error, vc->pg_rank);
+                } else {
+                    prev_as_req = as_req;
+                    as_req = as_req->dev.next;
+                }
+            }
+            /* Now remove this req.  We do this in this order to make it easier to keep track of
+               req and prev_req pointers */
+            dequeue_and_set_error(&req, prev_req, &error, vc->pg_rank);
+        } else {
+            prev_req = req;
+            req = req->dev.next;
+        }
+    }
+    
+ fn_exit:
+    MPIU_THREAD_CS_EXIT(MSGQUEUE,);
+
+    MPIDI_FUNC_EXIT(MPID_STATE_MPIDU_COMPLETE_POSTED_WITH_ERROR);
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
+
 /* --BEGIN ERROR HANDLING-- */
 /* pretty prints tag, returns out for calling convenience */
 static char *tag_val_to_str(int tag, char *out, int max)

Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/mpid_vc.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/mpid_vc.c	2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/mpid_vc.c	2011-01-13 22:17:24 UTC (rev 7723)
@@ -36,6 +36,8 @@
 typedef struct MPIDI_VCRT
 {
     MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
+    int contains_failed_vc;
+    int last_check_for_failed_vc;
     int size;
     MPIDI_VC_t * vcr_table[1];
 }
@@ -81,6 +83,8 @@
     MPIU_Object_set_ref(vcrt, 1);
     vcrt->size = size;
     *vcrt_ptr = vcrt;
+    vcrt->contains_failed_vc = FALSE;
+    vcrt->last_check_for_failed_vc = 0;
 
  fn_exit:
     MPIU_CHKPMEM_COMMIT();
@@ -255,6 +259,34 @@
 }
 
 /*@
+  MPID_VCRT_Contains_failed_vc - returns TRUE iff a VC in this VCRT is in MORIBUND state
+  @*/
+#undef FUNCNAME
+#define FUNCNAME MPID_VCRT_Contains_failed_vc
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int MPID_VCRT_Contains_failed_vc(MPID_VCRT vcrt)
+{
+    if (vcrt->contains_failed_vc) {
+        /* We have already determined that this VCRT has a dead VC */
+        return TRUE;
+    } else if (vcrt->last_check_for_failed_vc < MPIDI_Failed_vc_count) {
+        /* A VC has failed since the last time we checked for dead VCs
+           in this VCRT */
+        int i;
+        for (i = 0; i < vcrt->size; ++i) {
+            if (vcrt->vcr_table[i]->state == MPIDI_VC_STATE_MORIBUND) {
+                vcrt->contains_failed_vc = TRUE;
+                return TRUE;
+            }
+        }
+        vcrt->last_check_for_failed_vc = MPIDI_Failed_vc_count;
+    }
+    return FALSE;
+}
+
+
+/*@
   MPID_VCR_Dup - Duplicate a virtual connection reference 
 
   Notes:


Property changes on: mpich2/branches/release/mpich2-1.3.x/src/mpl/src/mplstr.c
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ckpt2/src/mpl/src/string/mplstr.c:5182,5196,5198
/mpich2/branches/dev/ftb/src/mpl/src/mplstr.c:5661-5730
/mpich2/branches/dev/lapi/src/mpl/src/mplstr.c:5817
/mpich2/branches/release/mpich2-1.1.1/src/mpl/src/string/mplstr.c:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/mpl/src/string/mplstr.c:5406
/mpich2/trunk/src/mpl/src/mplstr.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/ckpt2/src/mpl/src/string/mplstr.c:5182,5196,5198
/mpich2/branches/dev/error-return/src/mpl/src/mplstr.c:7662-7670
/mpich2/branches/dev/ftb/src/mpl/src/mplstr.c:5661-5730
/mpich2/branches/dev/lapi/src/mpl/src/mplstr.c:5817
/mpich2/branches/release/mpich2-1.1.1/src/mpl/src/string/mplstr.c:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/mpl/src/string/mplstr.c:5406
/mpich2/trunk/src/mpl/src/mplstr.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722


Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ckpt/src/pm/hydra:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra:5406
/mpich2/trunk/src/pm/hydra:7355-7359*,7366-7367*,7371-7402*,7406-7409*,7411-7416*,7419-7420*,7422-7425*,7429-7433*,7435*,7437-7438,7447-7448*,7462*,7470*,7473-7477*,7484-7485*,7488-7491*,7493-7502*,7504*,7507-7508*,7510-7517*,7519-7527*,7529-7530*,7532*,7536*,7538-7566*,7568*,7570*,7572*,7574*,7576*,7578*,7581*,7583*,7592*,7596*,7607-7622*,7624-7630*,7632-7635*,7637*,7639*,7641-7643*,7646-7649*,7651-7654*,7658-7659*,7663*,7665*,7668*,7676-7678*,7681*,7684*,7686*,7688*,7690-7692*,7694*,7696*,7700*,7705*,7707-7710*,7712*,7714*,7719*
   + /mpich2/branches/dev/ckpt/src/pm/hydra:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra:5406
/mpich2/trunk/src/pm/hydra:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671*,7674*,7676-7678,7681,7683*,7684,7685*,7686,7687*,7688,7690-7692,7694,7696,7700,7701-7702*,7705,7707-7710,7712,7714,7719,7720*,7722*


Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/Makefile.am
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ckpt/src/pm/hydra/Makefile.am:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/Makefile.am:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/Makefile.am:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/Makefile.am:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/Makefile.am:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/Makefile.am:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/Makefile.am:5406
/mpich2/trunk/src/pm/hydra/Makefile.am:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/ckpt/src/pm/hydra/Makefile.am:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/Makefile.am:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/Makefile.am:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/Makefile.am:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/Makefile.am:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/Makefile.am:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/Makefile.am:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/Makefile.am:5406
/mpich2/trunk/src/pm/hydra/Makefile.am:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722


Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/README
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ckpt/src/pm/hydra/README:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/README:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/README:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/README:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/README:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/README:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/README:5406
/mpich2/trunk/src/pm/hydra/README:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/ckpt/src/pm/hydra/README:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/README:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/README:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/README:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/README:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/README:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/README:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/README:5406
/mpich2/trunk/src/pm/hydra/README:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722


Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/autogen.sh
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ckpt/src/pm/hydra/autogen.sh:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/autogen.sh:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/autogen.sh:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/autogen.sh:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/autogen.sh:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/autogen.sh:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/autogen.sh:5406
/mpich2/trunk/src/pm/hydra/autogen.sh:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/ckpt/src/pm/hydra/autogen.sh:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/autogen.sh:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/autogen.sh:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/autogen.sh:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/autogen.sh:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/autogen.sh:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/autogen.sh:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/autogen.sh:5406
/mpich2/trunk/src/pm/hydra/autogen.sh:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722


Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/configure.in
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ckpt/src/pm/hydra/configure.in:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/configure.in:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/configure.in:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/configure.in:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/configure.in:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/configure.in:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/configure.in:5406
/mpich2/trunk/src/pm/hydra/configure.in:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/ckpt/src/pm/hydra/configure.in:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/configure.in:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/configure.in:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/configure.in:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/configure.in:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/configure.in:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/configure.in:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/configure.in:5406
/mpich2/trunk/src/pm/hydra/configure.in:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722


Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/examples
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ckpt/src/pm/hydra/examples:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/examples:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/examples:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/examples:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/examples:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/examples:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/examples:5406
/mpich2/trunk/src/pm/hydra/examples:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/ckpt/src/pm/hydra/examples:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/examples:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/examples:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/examples:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/examples:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/examples:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/examples:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/examples:5406
/mpich2/trunk/src/pm/hydra/examples:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722


Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/hydra-doxygen.cfg.in
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ckpt/src/pm/hydra/hydra-doxygen.cfg.in:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/hydra-doxygen.cfg.in:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/hydra-doxygen.cfg.in:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/hydra-doxygen.cfg.in:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/hydra-doxygen.cfg.in:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/hydra-doxygen.cfg.in:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/hydra-doxygen.cfg.in:5406
/mpich2/trunk/src/pm/hydra/hydra-doxygen.cfg.in:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/ckpt/src/pm/hydra/hydra-doxygen.cfg.in:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/hydra-doxygen.cfg.in:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/hydra-doxygen.cfg.in:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/hydra-doxygen.cfg.in:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/hydra-doxygen.cfg.in:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/hydra-doxygen.cfg.in:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/hydra-doxygen.cfg.in:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/hydra-doxygen.cfg.in:5406
/mpich2/trunk/src/pm/hydra/hydra-doxygen.cfg.in:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722


Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/include
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ckpt/src/pm/hydra/include:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/include:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/include:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/include:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/include:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/include:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/include:5406
/mpich2/trunk/src/pm/hydra/include:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/ckpt/src/pm/hydra/include:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/include:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/include:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/include:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/include:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/include:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/include:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/include:5406
/mpich2/trunk/src/pm/hydra/include:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722


Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/mpich2prereq
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ckpt/src/pm/hydra/mpich2prereq:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/mpich2prereq:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/mpich2prereq:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/mpich2prereq:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/mpich2prereq:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/mpich2prereq:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/mpich2prereq:5406
/mpich2/trunk/src/pm/hydra/mpich2prereq:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/ckpt/src/pm/hydra/mpich2prereq:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/mpich2prereq:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/mpich2prereq:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/mpich2prereq:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/mpich2prereq:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/mpich2prereq:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/mpich2prereq:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/mpich2prereq:5406
/mpich2/trunk/src/pm/hydra/mpich2prereq:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722


Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/pm
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ckpt/src/pm/hydra/pm:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/pm:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/pm:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/pm:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/pm:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/pm:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/pm:5406
/mpich2/trunk/src/pm/hydra/pm:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/ckpt/src/pm/hydra/pm:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/pm:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/pm:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/pm:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/pm:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/pm:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/pm:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/pm:5406
/mpich2/trunk/src/pm/hydra/pm:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722


Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/tools
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ckpt/src/pm/hydra/tools:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/tools:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/tools:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/tools:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/tools:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/tools:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/tools:5406
/mpich2/trunk/src/pm/hydra/tools:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/ckpt/src/pm/hydra/tools:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/tools:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/tools:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/tools:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/tools:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/tools:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/tools:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/tools:5406
/mpich2/trunk/src/pm/hydra/tools:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722


Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/tools/bootstrap/external/slurm_query_proxy_id.c
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ftb/src/pm/hydra/tools/bootstrap/slurm/slurm_query_proxy_id.c:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/tools/bootstrap/slurm/slurm_query_proxy_id.c:5817
/mpich2/trunk/src/pm/hydra/tools/bootstrap/external/slurm_query_proxy_id.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/error-return/src/pm/hydra/tools/bootstrap/external/slurm_query_proxy_id.c:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/tools/bootstrap/slurm/slurm_query_proxy_id.c:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/tools/bootstrap/slurm/slurm_query_proxy_id.c:5817
/mpich2/trunk/src/pm/hydra/tools/bootstrap/external/slurm_query_proxy_id.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722


Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/tools/bootstrap/src/bsci_query_proxy_id.c
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ftb/src/pm/hydra/tools/bootstrap/src/bsci_query_proxy_id.c:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/tools/bootstrap/src/bsci_query_proxy_id.c:5817
/mpich2/trunk/src/pm/hydra/tools/bootstrap/src/bsci_query_proxy_id.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/error-return/src/pm/hydra/tools/bootstrap/src/bsci_query_proxy_id.c:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/tools/bootstrap/src/bsci_query_proxy_id.c:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/tools/bootstrap/src/bsci_query_proxy_id.c:5817
/mpich2/trunk/src/pm/hydra/tools/bootstrap/src/bsci_query_proxy_id.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722


Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/tools/bootstrap/utils/bscu_query_proxy_id.c
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ftb/src/pm/hydra/tools/bootstrap/utils/bscu_query_proxy_id.c:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/tools/bootstrap/utils/bscu_query_proxy_id.c:5817
/mpich2/trunk/src/pm/hydra/tools/bootstrap/utils/bscu_query_proxy_id.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/error-return/src/pm/hydra/tools/bootstrap/utils/bscu_query_proxy_id.c:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/tools/bootstrap/utils/bscu_query_proxy_id.c:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/tools/bootstrap/utils/bscu_query_proxy_id.c:5817
/mpich2/trunk/src/pm/hydra/tools/bootstrap/utils/bscu_query_proxy_id.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722


Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/ui
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ckpt/src/pm/hydra/ui:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/ui:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/ui:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/ui:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/ui:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/ui:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/ui:5406
/mpich2/trunk/src/pm/hydra/ui:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/ckpt/src/pm/hydra/ui:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/ui:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/ui:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/ui:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/ui:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/ui:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/ui:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/ui:5406
/mpich2/trunk/src/pm/hydra/ui:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722


Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/utils
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ckpt/src/pm/hydra/utils:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/utils:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/utils:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/utils:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/utils:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/utils:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/utils:5406
/mpich2/trunk/src/pm/hydra/utils:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/ckpt/src/pm/hydra/utils:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/utils:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/utils:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/utils:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/utils:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/utils:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/utils:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/utils:5406
/mpich2/trunk/src/pm/hydra/utils:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722


Property changes on: mpich2/branches/release/mpich2-1.3.x/winconfigure.wsf
___________________________________________________________________
Modified: svn:mergeinfo
   - /mpich2/branches/dev/ckpt/winconfigure.wsf:5050
/mpich2/branches/dev/ckpt2/winconfigure.wsf:5057-6537
/mpich2/branches/dev/ftb/winconfigure.wsf:5661-5730
/mpich2/branches/dev/lapi/winconfigure.wsf:5817
/mpich2/branches/dev/win_rrvm/winconfigure.wsf:6404,6407-6408,6420,6422-6423
/mpich2/branches/dev/wintcp_async_progress/winconfigure.wsf:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/winconfigure.wsf:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/winconfigure.wsf:5406
/mpich2/trunk/winconfigure.wsf:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
   + /mpich2/branches/dev/ckpt/winconfigure.wsf:5050
/mpich2/branches/dev/ckpt2/winconfigure.wsf:5057-6537
/mpich2/branches/dev/error-return/winconfigure.wsf:7662-7670
/mpich2/branches/dev/ftb/winconfigure.wsf:5661-5730
/mpich2/branches/dev/lapi/winconfigure.wsf:5817
/mpich2/branches/dev/win_rrvm/winconfigure.wsf:6404,6407-6408,6420,6422-6423
/mpich2/branches/dev/wintcp_async_progress/winconfigure.wsf:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/winconfigure.wsf:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/winconfigure.wsf:5406
/mpich2/trunk/winconfigure.wsf:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722



More information about the mpich2-commits mailing list