[mpich2-commits] r7723 - in mpich2/branches/release/mpich2-1.3.x: . confdb maint src/include src/mpi/coll src/mpi/errhan src/mpid src/mpid/ch3/channels/nemesis/include src/mpid/ch3/channels/nemesis/nemesis/include src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp src/mpid/ch3/channels/nemesis/nemesis/src src/mpid/ch3/channels/nemesis/src src/mpid/ch3/channels/sctp/include src/mpid/ch3/channels/sock/include src/mpid/ch3/include src/mpid/ch3/src src/mpl/src src/pm/hydra src/pm/hydra/examples src/pm/hydra/include src/pm/hydra/pm src/pm/hydra/tools src/pm/hydra/tools/bootstrap/external src/pm/hydra/tools/bootstrap/src src/pm/hydra/tools/bootstrap/utils src/pm/hydra/ui src/pm/hydra/utils
buntinas at mcs.anl.gov
buntinas at mcs.anl.gov
Thu Jan 13 16:17:25 CST 2011
Author: buntinas
Date: 2011-01-13 16:17:24 -0600 (Thu, 13 Jan 2011)
New Revision: 7723
Modified:
mpich2/branches/release/mpich2-1.3.x/
mpich2/branches/release/mpich2-1.3.x/confdb/
mpich2/branches/release/mpich2-1.3.x/maint/Version
mpich2/branches/release/mpich2-1.3.x/src/include/mpiimpl.h
mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allgather.c
mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allgatherv.c
mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allreduce.c
mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/alltoall.c
mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/alltoallw.c
mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/barrier.c
mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/bcast.c
mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/exscan.c
mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/gather.c
mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/gatherv.c
mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/red_scat.c
mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/red_scat_block.c
mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/reduce.c
mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scan.c
mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scatter.c
mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scatterv.c
mpich2/branches/release/mpich2-1.3.x/src/mpi/errhan/errnames.txt
mpich2/branches/release/mpich2-1.3.x/src/mpid/
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_post.h
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_pre.h
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/include/mpid_nem_inline.h
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/socksm.c
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_impl.h
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_init.c
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_send.c
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/src/mpid_nem_ckpt.c
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_isend.c
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_isendv.c
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_istartmsg.c
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_istartmsgv.c
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_progress.c
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/sctp/include/mpidi_ch3_pre.h
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/sock/include/mpidi_ch3_pre.h
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/include/mpidimpl.h
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/include/mpidpost.h
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/ch3u_handle_connection.c
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/ch3u_recvq.c
mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/mpid_vc.c
mpich2/branches/release/mpich2-1.3.x/src/mpl/src/mplstr.c
mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/
mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/Makefile.am
mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/README
mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/autogen.sh
mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/configure.in
mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/examples/
mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/hydra-doxygen.cfg.in
mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/include/
mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/mpich2prereq
mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/pm/
mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/tools/
mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/tools/bootstrap/external/slurm_query_proxy_id.c
mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/tools/bootstrap/src/bsci_query_proxy_id.c
mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/tools/bootstrap/utils/bscu_query_proxy_id.c
mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/ui/
mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/utils/
mpich2/branches/release/mpich2-1.3.x/winconfigure.wsf
Log:
merged changesets 7604, 7671, 7674, 7683, 7685, 7687, 7701, 7702, 7720 and 7722 from trunk into 1.3.x
Property changes on: mpich2/branches/release/mpich2-1.3.x
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ckpt:5050
/mpich2/branches/dev/ckpt2:5057-6537
/mpich2/branches/dev/ftb:5661-5730
/mpich2/branches/dev/lapi:5817
/mpich2/branches/dev/wintcp_async_progress:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2:5406
/mpich2/trunk:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/ckpt:5050
/mpich2/branches/dev/ckpt2:5057-6537
/mpich2/branches/dev/error-return:7662-7670
/mpich2/branches/dev/ftb:5661-5730
/mpich2/branches/dev/lapi:5817
/mpich2/branches/dev/wintcp_async_progress:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2:5406
/mpich2/trunk:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Property changes on: mpich2/branches/release/mpich2-1.3.x/confdb
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ckpt2/confdb:5180,5182,5196,5198
/mpich2/branches/dev/ftb/confdb:5661-5730
/mpich2/branches/dev/lapi/confdb:5817
/mpich2/branches/dev/wintcp_async_progress/confdb:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/confdb:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/confdb:5406
/mpich2/trunk/confdb:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/ckpt2/confdb:5180,5182,5196,5198
/mpich2/branches/dev/error-return/confdb:7662-7670
/mpich2/branches/dev/ftb/confdb:5661-5730
/mpich2/branches/dev/lapi/confdb:5817
/mpich2/branches/dev/wintcp_async_progress/confdb:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/confdb:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/confdb:5406
/mpich2/trunk/confdb:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Property changes on: mpich2/branches/release/mpich2-1.3.x/maint/Version
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ckpt/maint/Version:5050
/mpich2/branches/dev/ckpt2/maint/Version:5057-6537
/mpich2/branches/dev/ftb/maint/Version:5661-5730
/mpich2/branches/dev/lapi/maint/Version:5817
/mpich2/branches/dev/wintcp_async_progress/maint/Version:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/maint/Version:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/maint/Version:5406
/mpich2/trunk/maint/Version:7422-7425,7429-7433,7435,7437-7438,7442-7447,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
/mpich2/trunk/src/pm/hydra/VERSION:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7447
+ /mpich2/branches/dev/ckpt/maint/Version:5050
/mpich2/branches/dev/ckpt2/maint/Version:5057-6537
/mpich2/branches/dev/ftb/maint/Version:5661-5730
/mpich2/branches/dev/lapi/maint/Version:5817
/mpich2/branches/dev/wintcp_async_progress/maint/Version:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/maint/Version:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/maint/Version:5406
/mpich2/trunk/maint/Version:7422-7425,7429-7433,7435,7437-7438,7442-7447,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
/mpich2/trunk/src/pm/hydra/VERSION:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7447,7604,7671,7674,7683,7685,7687,7701-7702,7720,7722
Modified: mpich2/branches/release/mpich2-1.3.x/src/include/mpiimpl.h
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/include/mpiimpl.h 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/include/mpiimpl.h 2011-01-13 22:17:24 UTC (rev 7723)
@@ -3116,6 +3116,11 @@
int MPID_VCRT_Get_ptr(MPID_VCRT vcrt, MPID_VCR **vc_pptr);
/*@
+ MPID_VCRT_Contains_failed_vc - returns TRUE iff a VC in this VCRT is in MORIBUND state
+ @*/
+int MPID_VCRT_Contains_failed_vc(MPID_VCRT vcrt);
+
+/*@
MPID_VCR_Dup - Create a duplicate reference to a virtual connection
@*/
int MPID_VCR_Dup(MPID_VCR orig_vcr, MPID_VCR * new_vcr);
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allgather.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allgather.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allgather.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -83,11 +83,12 @@
MPI_Datatype recvtype,
MPID_Comm *comm_ptr )
{
- int comm_size, rank;
- int mpi_errno = MPI_SUCCESS;
+ int comm_size, rank;
+ int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
MPI_Aint recvtype_extent, tot_bytes;
MPI_Aint recvtype_true_extent, recvbuf_extent, recvtype_true_lb;
- int j, i, pof2, src, rem;
+ int j, i, pof2, src, rem;
void *tmp_buf = NULL;
int curr_cnt, dst, type_size, left, right, jnext;
MPI_Comm comm;
@@ -173,11 +174,13 @@
(comm_size-dst_tree_root)*recvcount,
recvtype, dst,
MPIR_ALLGATHER_TAG, comm, &status);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
-
- MPIR_Get_count_impl(&status, recvtype, &last_recv_cnt);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ last_recv_cnt = 0;
+ } else
+ MPIR_Get_count_impl(&status, recvtype, &last_recv_cnt);
curr_cnt += last_recv_cnt;
}
@@ -235,9 +238,11 @@
/* last_recv_cnt was set in the previous
receive. that's the amount of data to be
sent now. */
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* recv only if this proc. doesn't have data and sender
has data */
@@ -251,10 +256,13 @@
comm, &status);
/* nprocs_completed is also equal to the
no. of processes whose data we don't have */
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
- MPIR_Get_count_impl(&status, recvtype, &last_recv_cnt);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ last_recv_cnt = 0;
+ } else
+ MPIR_Get_count_impl(&status, recvtype, &last_recv_cnt);
curr_cnt += last_recv_cnt;
}
tmp_mask >>= 1;
@@ -331,11 +339,13 @@
tmp_buf_size - recv_offset,
MPI_BYTE, dst,
MPIR_ALLGATHER_TAG, comm, &status);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
-
- MPIR_Get_count_impl(&status, MPI_BYTE, &last_recv_cnt);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ last_recv_cnt = 0;
+ } else
+ MPIR_Get_count_impl(&status, MPI_BYTE, &last_recv_cnt);
curr_cnt += last_recv_cnt;
}
@@ -382,8 +392,12 @@
mpi_errno = MPIC_Send(((char *)tmp_buf + offset),
last_recv_cnt, MPI_BYTE,
dst, MPIR_ALLGATHER_TAG,
- comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ comm);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* last_recv_cnt was set in the previous
receive. that's the amount of data to be
sent now. */
@@ -398,10 +412,15 @@
MPI_BYTE, dst,
MPIR_ALLGATHER_TAG,
comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
/* nprocs_completed is also equal to the
no. of processes whose data we don't have */
- MPIR_Get_count_impl(&status, MPI_BYTE, &last_recv_cnt);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ last_recv_cnt = 0;
+ } else
+ MPIR_Get_count_impl(&status, MPI_BYTE, &last_recv_cnt);
curr_cnt += last_recv_cnt;
}
tmp_mask >>= 1;
@@ -469,10 +488,11 @@
curr_cnt, recvtype,
src, MPIR_ALLGATHER_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
-
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
curr_cnt *= 2;
pof2 *= 2;
}
@@ -490,9 +510,11 @@
rem * recvcount, recvtype,
src, MPIR_ALLGATHER_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* Rotate blocks in tmp_buf down by (rank) blocks and store
@@ -549,9 +571,11 @@
recvcount, recvtype, left,
MPIR_ALLGATHER_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
j = jnext;
jnext = (comm_size + jnext - 1) % comm_size;
}
@@ -560,8 +584,10 @@
fn_exit:
MPIU_CHKLMEM_FREEALL();
/* check if multiple threads are calling this collective function */
- MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
- return (mpi_errno);
+ MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
+ return mpi_errno;
fn_fail:
goto fn_exit;
@@ -590,6 +616,7 @@
*/
int rank, local_size, remote_size, mpi_errno = MPI_SUCCESS, root;
+ int mpi_errno_ret = MPI_SUCCESS;
MPI_Aint true_extent, true_lb = 0, extent, send_extent;
void *tmp_buf=NULL;
MPID_Comm *newcomm_ptr = NULL;
@@ -624,9 +651,11 @@
if (sendcount != 0) {
mpi_errno = MPIR_Gather_impl(sendbuf, sendcount, sendtype, tmp_buf, sendcount,
sendtype, 0, newcomm_ptr);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* first broadcast from left to right group, then from right to
@@ -637,9 +666,11 @@
root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
mpi_errno = MPIR_Bcast_inter(tmp_buf, sendcount*local_size,
sendtype, root, comm_ptr);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* receive bcast from right */
@@ -647,9 +678,11 @@
root = 0;
mpi_errno = MPIR_Bcast_inter(recvbuf, recvcount*remote_size,
recvtype, root, comm_ptr);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
else {
@@ -658,9 +691,11 @@
root = 0;
mpi_errno = MPIR_Bcast_inter(recvbuf, recvcount*remote_size,
recvtype, root, comm_ptr);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* bcast to left */
@@ -668,14 +703,18 @@
root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
mpi_errno = MPIR_Bcast_inter(tmp_buf, sendcount*local_size,
sendtype, root, comm_ptr);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
- fn_exit:
+ fn_exit:
MPIU_CHKLMEM_FREEALL();
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allgatherv.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allgatherv.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allgatherv.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -79,7 +79,8 @@
{
MPI_Comm comm;
int comm_size, rank, j, i, left, right;
- int mpi_errno = MPI_SUCCESS;
+ int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
MPI_Status status;
MPI_Aint recvbuf_extent, recvtype_extent, recvtype_true_extent,
recvtype_true_lb;
@@ -191,11 +192,15 @@
total_count - recv_offset, recvtype, dst,
MPIR_ALLGATHERV_TAG,
comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
- /* for convenience, recv is posted for a bigger amount
- than will be sent */
-
- MPIR_Get_count_impl(&status, recvtype, &last_recv_cnt);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ last_recv_cnt = 0;
+ } else
+ /* for convenience, recv is posted for a bigger amount
+ than will be sent */
+ MPIR_Get_count_impl(&status, recvtype, &last_recv_cnt);
curr_cnt += last_recv_cnt;
}
@@ -254,7 +259,11 @@
last_recv_cnt,
recvtype, dst,
MPIR_ALLGATHERV_TAG, comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* last_recv_cnt was set in the previous
receive. that's the amount of data to be
sent now. */
@@ -273,11 +282,15 @@
total_count - offset, recvtype,
dst, MPIR_ALLGATHERV_TAG,
comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
- /* for convenience, recv is posted for a
- bigger amount than will be sent */
-
- MPIR_Get_count_impl(&status, recvtype, &last_recv_cnt);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ last_recv_cnt = 0;
+ } else
+ /* for convenience, recv is posted for a
+ bigger amount than will be sent */
+ MPIR_Get_count_impl(&status, recvtype, &last_recv_cnt);
curr_cnt += last_recv_cnt;
}
tmp_mask >>= 1;
@@ -377,11 +390,15 @@
((char *)tmp_buf + recv_offset),
tmp_buf_size-recv_offset, MPI_BYTE, dst,
MPIR_ALLGATHERV_TAG, comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
- /* for convenience, recv is posted for a bigger amount
- than will be sent */
-
- MPIR_Get_count_impl(&status, MPI_BYTE, &last_recv_cnt);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ last_recv_cnt = 0;
+ } else
+ /* for convenience, recv is posted for a bigger amount
+ than will be sent */
+ MPIR_Get_count_impl(&status, MPI_BYTE, &last_recv_cnt);
curr_cnt += last_recv_cnt;
}
@@ -432,7 +449,11 @@
last_recv_cnt, MPI_BYTE,
dst, MPIR_ALLGATHERV_TAG,
comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* last_recv_cnt was set in the previous
receive. that's the amount of data to be
sent now. */
@@ -447,10 +468,15 @@
dst,
MPIR_ALLGATHERV_TAG,
comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
- /* for convenience, recv is posted for a bigger amount
- than will be sent */
- MPIR_Get_count_impl(&status, MPI_BYTE, &last_recv_cnt);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ last_recv_cnt = 0;
+ } else
+ /* for convenience, recv is posted for a bigger amount
+ than will be sent */
+ MPIR_Get_count_impl(&status, MPI_BYTE, &last_recv_cnt);
curr_cnt += last_recv_cnt;
}
tmp_mask >>= 1;
@@ -523,9 +549,13 @@
((char *)tmp_buf + curr_cnt*recvtype_extent),
total_count - curr_cnt, recvtype,
src, MPIR_ALLGATHERV_TAG, comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
- MPIR_Get_count_impl(&status, recvtype, &recv_cnt);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ recv_cnt = 0;
+ } else
+ MPIR_Get_count_impl(&status, recvtype, &recv_cnt);
curr_cnt += recv_cnt;
pof2 *= 2;
@@ -548,7 +578,11 @@
total_count - curr_cnt, recvtype,
src, MPIR_ALLGATHERV_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* Rotate blocks in tmp_buf down by (rank) blocks and store
@@ -631,19 +665,31 @@
}
else if (!sendnow) { /* If there's no data to send, just do a recv call */
mpi_errno = MPIC_Recv(rbuf, recvnow, recvtype, left, MPIR_ALLGATHERV_TAG, comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
torecv -= recvnow;
}
else if (!recvnow) { /* If there's no data to receive, just do a send call */
mpi_errno = MPIC_Send(sbuf, sendnow, recvtype, right, MPIR_ALLGATHERV_TAG, comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
tosend -= sendnow;
}
else { /* There's data to be sent and received */
mpi_errno = MPIC_Sendrecv(sbuf, sendnow, recvtype, right, MPIR_ALLGATHERV_TAG,
rbuf, recvnow, recvtype, left, MPIR_ALLGATHERV_TAG,
comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
tosend -= sendnow;
torecv -= recvnow;
}
@@ -665,6 +711,8 @@
MPIU_CHKLMEM_FREEALL();
/* check if multiple threads are calling this collective function */
MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
@@ -696,6 +744,7 @@
and then does an intracommunicator broadcast.
*/
int remote_size, mpi_errno, root, rank;
+ int mpi_errno_ret = MPI_SUCCESS;
MPID_Comm *newcomm_ptr = NULL;
MPI_Datatype newtype = MPI_DATATYPE_NULL;
@@ -710,13 +759,21 @@
mpi_errno = MPIR_Gatherv_impl(sendbuf, sendcount, sendtype, recvbuf,
recvcounts, displs, recvtype, root,
comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* gatherv to right group */
root = 0;
mpi_errno = MPIR_Gatherv_impl(sendbuf, sendcount, sendtype, recvbuf,
recvcounts, displs, recvtype, root,
comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else {
/* gatherv to left group */
@@ -724,13 +781,21 @@
mpi_errno = MPIR_Gatherv_impl(sendbuf, sendcount, sendtype, recvbuf,
recvcounts, displs, recvtype, root,
comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* gatherv from left group */
root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
mpi_errno = MPIR_Gatherv_impl(sendbuf, sendcount, sendtype, recvbuf,
recvcounts, displs, recvtype, root,
comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* now do an intracommunicator broadcast within each group. we use
@@ -751,11 +816,17 @@
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
mpi_errno = MPIR_Bcast_intra(recvbuf, 1, newtype, 0, newcomm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
MPIR_Type_free_impl(&newtype);
fn_exit:
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
/* --BEGIN ERROR HANDLING-- */
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allreduce.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allreduce.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/allreduce.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -131,7 +131,8 @@
int rc;
#endif
int comm_size, rank, type_size;
- int mpi_errno = MPI_SUCCESS;
+ int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
int mask, dst, is_commutative, pof2, newrank, rem, newdst, i,
send_idx, recv_idx, last_idx, send_cnt, recv_cnt, *cnts, *disps;
MPI_Aint true_extent, true_lb, extent;
@@ -174,10 +175,18 @@
allreduce is in recvbuf. Pass that as the sendbuf to reduce. */
mpi_errno = MPIR_Reduce_impl(recvbuf, NULL, count, datatype, op, 0, comm_ptr->node_comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
} else {
mpi_errno = MPIR_Reduce_impl(sendbuf, recvbuf, count, datatype, op, 0, comm_ptr->node_comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
} else {
/* only one process on the node. copy sendbuf to recvbuf */
@@ -190,13 +199,21 @@
/* now do an IN_PLACE allreduce among the local roots of all nodes */
if (comm_ptr->node_roots_comm != NULL) {
mpi_errno = allreduce_intra_or_coll_fn(MPI_IN_PLACE, recvbuf, count, datatype, op, comm_ptr->node_roots_comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* now broadcast the result among local processes */
if (comm_ptr->node_comm != NULL) {
mpi_errno = MPIR_Bcast_impl(recvbuf, count, datatype, 0, comm_ptr->node_comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
goto fn_exit;
}
@@ -215,17 +232,18 @@
do a reduce to 0 and then broadcast. */
mpi_errno = MPIR_Reduce_impl ( sendbuf, recvbuf, count, datatype,
op, 0, comm_ptr );
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
- /* FIXME: mpi_errno is error CODE, not necessarily the error
- class MPI_ERR_OP. In MPICH2, we can get the error class
- with
- errorclass = mpi_errno & ERROR_CLASS_MASK;
- */
- if (mpi_errno == MPI_ERR_OP || mpi_errno == MPI_SUCCESS) {
- /* Allow MPI_ERR_OP since we can continue from this error */
- rc = MPIR_Bcast_impl( recvbuf, count, datatype, 0, comm_ptr );
- if (rc) mpi_errno = rc;
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
}
+
+ mpi_errno = MPIR_Bcast_impl( recvbuf, count, datatype, 0, comm_ptr );
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else
#endif /* MPID_HAS_HETERO */
@@ -299,7 +317,11 @@
mpi_errno = MPIC_Send(recvbuf, count,
datatype, rank+1,
MPIR_ALLREDUCE_TAG, comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* temporarily set the rank to -1 so that this
process does not participate in recursive
@@ -311,7 +333,11 @@
datatype, rank-1,
MPIR_ALLREDUCE_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* do the reduction on received data. since the
ordering is right, it doesn't matter whether
@@ -360,7 +386,11 @@
count, datatype, dst,
MPIR_ALLREDUCE_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* tmp_buf contains data received in this step.
recvbuf contains data accumulated so far */
@@ -456,7 +486,11 @@
recv_cnt, datatype, dst,
MPIR_ALLREDUCE_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* tmp_buf contains data received in this step.
recvbuf contains data accumulated so far */
@@ -516,7 +550,11 @@
recv_cnt, datatype, dst,
MPIR_ALLREDUCE_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
if (newrank > newdst) send_idx = recv_idx;
@@ -538,7 +576,11 @@
datatype, rank+1,
MPIR_ALLREDUCE_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
if (MPIU_THREADPRIV_FIELD(op_errno))
@@ -550,6 +592,8 @@
MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
MPIU_CHKLMEM_FREEALL();
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return (mpi_errno);
fn_fail:
@@ -580,6 +624,7 @@
broadcasts because it would require allocation of a temporary buffer.
*/
int rank, mpi_errno, root;
+ int mpi_errno_ret = MPI_SUCCESS;
MPID_Comm *newcomm_ptr = NULL;
rank = comm_ptr->rank;
@@ -591,26 +636,42 @@
root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
mpi_errno = MPIR_Reduce_inter(sendbuf, recvbuf, count, datatype, op,
root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* reduce to rank 0 of right group */
root = 0;
mpi_errno = MPIR_Reduce_inter(sendbuf, recvbuf, count, datatype, op,
root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else {
/* reduce to rank 0 of left group */
root = 0;
mpi_errno = MPIR_Reduce_inter(sendbuf, recvbuf, count, datatype, op,
root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* reduce from right group to rank 0 */
root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
mpi_errno = MPIR_Reduce_inter(sendbuf, recvbuf, count, datatype, op,
root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* Get the local intracommunicator */
@@ -620,9 +681,15 @@
newcomm_ptr = comm_ptr->local_comm;
mpi_errno = MPIR_Bcast_impl(recvbuf, count, datatype, 0, newcomm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
fn_exit:
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/alltoall.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/alltoall.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/alltoall.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -88,6 +88,7 @@
MPI_Aint sendtype_extent, recvtype_extent;
MPI_Aint recvtype_true_extent, recvbuf_extent, recvtype_true_lb;
int mpi_errno=MPI_SUCCESS, src, dst, rank, nbytes;
+ int mpi_errno_ret = MPI_SUCCESS;
MPI_Status status;
int sendtype_size, pack_size, block, position, *displs, count;
MPI_Datatype newtype = MPI_DATATYPE_NULL;
@@ -138,7 +139,11 @@
j, MPIR_ALLTOALL_TAG,
j, MPIR_ALLTOALL_TAG,
comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else if (rank == j) {
/* same as above with i/j args reversed */
@@ -147,7 +152,11 @@
i, MPIR_ALLTOALL_TAG,
i, MPIR_ALLTOALL_TAG,
comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
}
@@ -216,7 +225,11 @@
MPIR_ALLTOALL_TAG, recvbuf, 1, newtype,
src, MPIR_ALLTOALL_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
MPIR_Type_free_impl(&newtype);
@@ -302,11 +315,15 @@
sendbuf_extent*(comm_size-dst_tree_root),
sendtype, dst, MPIR_ALLTOALL_TAG,
comm, &status);
- if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
-
- /* in case of non-power-of-two nodes, less data may be
- received than specified */
- MPIR_Get_count_impl(&status, sendtype, &last_recv_cnt);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ last_recv_cnt = 0;
+ } else
+ /* in case of non-power-of-two nodes, less data may be
+ received than specified */
+ MPIR_Get_count_impl(&status, sendtype, &last_recv_cnt);
curr_cnt += last_recv_cnt;
}
@@ -351,7 +368,11 @@
last_recv_cnt, sendtype,
dst, MPIR_ALLTOALL_TAG,
comm);
- if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* recv only if this proc. doesn't have data and sender
has data */
@@ -364,8 +385,13 @@
sendtype,
dst, MPIR_ALLTOALL_TAG,
comm, &status);
- if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
- MPIR_Get_count_impl(&status, sendtype, &last_recv_cnt);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ last_recv_cnt = 0;
+ } else
+ MPIR_Get_count_impl(&status, sendtype, &last_recv_cnt);
curr_cnt += last_recv_cnt;
}
tmp_mask >>= 1;
@@ -430,7 +456,7 @@
recvcount, recvtype, dst,
MPIR_ALLTOALL_TAG, comm,
&reqarray[i]);
- if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
}
for ( i=0; i<ss; i++ ) {
@@ -440,7 +466,7 @@
sendcount, sendtype, dst,
MPIR_ALLTOALL_TAG, comm,
&reqarray[i+ss]);
- if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
}
/* ... then wait for them to finish: */
@@ -452,7 +478,11 @@
for (j=0; j<2*ss; j++) {
if (starray[j].MPI_ERROR != MPI_SUCCESS) {
mpi_errno = starray[j].MPI_ERROR;
- MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
}
@@ -502,7 +532,11 @@
src*recvcount*recvtype_extent),
recvcount, recvtype, src,
MPIR_ALLTOALL_TAG, comm, &status);
- if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
@@ -510,7 +544,9 @@
MPIU_CHKLMEM_FREEALL();
/* check if multiple threads are calling this collective function */
MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
- return (mpi_errno);
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
+ return mpi_errno;
fn_fail:
if (newtype != MPI_DATATYPE_NULL)
MPIR_Type_free_impl(&newtype);
@@ -544,7 +580,8 @@
*/
int local_size, remote_size, max_size, i;
MPI_Aint sendtype_extent, recvtype_extent;
- int mpi_errno = MPI_SUCCESS;
+ int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
MPI_Status status;
int src, dst, rank;
char *sendaddr, *recvaddr;
@@ -590,13 +627,19 @@
MPIR_ALLTOALL_TAG, recvaddr,
recvcount, recvtype, src,
MPIR_ALLTOALL_TAG, comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
fn_exit:
/* check if multiple threads are calling this collective function */
MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
- return (mpi_errno);
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
+ return mpi_errno;
fn_fail:
goto fn_exit;
}
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/alltoallw.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/alltoallw.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/alltoallw.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -62,7 +62,8 @@
MPID_Comm *comm_ptr )
{
int comm_size, i, j;
- int mpi_errno = MPI_SUCCESS;
+ int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
MPI_Status status;
MPI_Status *starray;
MPI_Request *reqarray;
@@ -100,7 +101,11 @@
j, MPIR_ALLTOALL_TAG,
j, MPIR_ALLTOALL_TAG,
comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else if (rank == j) {
/* same as above with i/j args reversed */
@@ -109,7 +114,11 @@
i, MPIR_ALLTOALL_TAG,
i, MPIR_ALLTOALL_TAG,
comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
}
@@ -167,7 +176,11 @@
for (i=0; i<outstanding_requests; i++) {
if (starray[i].MPI_ERROR != MPI_SUCCESS) {
mpi_errno = starray[i].MPI_ERROR;
- MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
}
@@ -193,7 +206,11 @@
((char *)recvbuf+rdispls[src]),
recvcnts[src], recvtypes[dst], src,
MPIR_ALLTOALLW_TAG, comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
#endif
}
@@ -202,7 +219,9 @@
fn_exit:
MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
MPIU_CHKLMEM_FREEALL();
- return (mpi_errno);
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
+ return mpi_errno;
fn_fail:
goto fn_exit;
@@ -238,6 +257,7 @@
*/
int local_size, remote_size, max_size, i;
int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
MPI_Status status;
int src, dst, rank, sendcount, recvcount;
char *sendaddr, *recvaddr;
@@ -284,13 +304,19 @@
dst, MPIR_ALLTOALLW_TAG, recvaddr,
recvcount, recvtype, src,
MPIR_ALLTOALLW_TAG, comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
fn_exit:
/* check if multiple threads are calling this collective function */
MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
- return (mpi_errno);
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
+ return mpi_errno;
fn_fail:
goto fn_exit;
}
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/barrier.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/barrier.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/barrier.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -55,6 +55,7 @@
int MPIR_Barrier_intra( MPID_Comm *comm_ptr )
{
int size, rank, src, dst, mask, mpi_errno=MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
MPI_Comm comm;
/* Only one collective operation per communicator can be active at any
@@ -76,13 +77,18 @@
MPIR_BARRIER_TAG, NULL, 0, MPI_BYTE,
src, MPIR_BARRIER_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
mask <<= 1;
}
fn_exit:
MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
@@ -128,6 +134,7 @@
int MPIR_Barrier_inter( MPID_Comm *comm_ptr )
{
int rank, mpi_errno = MPI_SUCCESS, root;
+ int mpi_errno_ret = MPI_SUCCESS;
int i = 0;
MPID_Comm *newcomm_ptr = NULL;
@@ -143,7 +150,11 @@
/* do a barrier on the local intracommunicator */
mpi_errno = MPIR_Barrier_intra(newcomm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* rank 0 on each group does an intercommunicator broadcast to the
remote group to indicate that all processes in the local group
@@ -156,23 +167,41 @@
/* bcast to right*/
root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
mpi_errno = MPIR_Bcast_inter(&i, 1, MPI_BYTE, root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* receive bcast from right */
root = 0;
mpi_errno = MPIR_Bcast_inter(&i, 1, MPI_BYTE, root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else {
/* receive bcast from left */
root = 0;
mpi_errno = MPIR_Bcast_inter(&i, 1, MPI_BYTE, root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* bcast to left */
root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
mpi_errno = MPIR_Bcast_inter(&i, 1, MPI_BYTE, root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
fn_exit:
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
@@ -217,6 +246,7 @@
int MPIR_Barrier_impl(MPID_Comm *comm_ptr)
{
int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
if (comm_ptr->coll_fns != NULL && comm_ptr->coll_fns->Barrier != NULL)
{
@@ -233,13 +263,21 @@
if (comm_ptr->node_comm != NULL)
{
mpi_errno = MPIR_Barrier_or_coll_fn(comm_ptr->node_comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* do the barrier across roots of all nodes */
if (comm_ptr->node_roots_comm != NULL) {
mpi_errno = MPIR_Barrier_or_coll_fn(comm_ptr->node_roots_comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* release the local processes on each node with a 1-byte broadcast
@@ -248,7 +286,11 @@
{
int i=0;
mpi_errno = MPIR_Bcast_impl(&i, 1, MPI_BYTE, 0, comm_ptr->node_comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
else {
@@ -268,6 +310,8 @@
}
fn_exit:
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/bcast.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/bcast.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/bcast.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -58,7 +58,8 @@
{
int rank, comm_size, src, dst;
int relative_rank, mask;
- int mpi_errno = MPI_SUCCESS;
+ int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
int nbytes=0;
int type_size, is_contig, is_homogeneous;
int position;
@@ -154,7 +155,11 @@
else
mpi_errno = MPIC_Recv(buffer,count,datatype,src,
MPIR_BCAST_TAG,comm,MPI_STATUS_IGNORE);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
break;
}
mask <<= 1;
@@ -184,7 +189,11 @@
else
mpi_errno = MPIC_Send(buffer,count,datatype,dst,
MPIR_BCAST_TAG,comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
mask >>= 1;
}
@@ -203,6 +212,8 @@
fn_exit:
MPIU_CHKLMEM_FREEALL();
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
@@ -236,7 +247,8 @@
MPI_Status status;
int rank, comm_size, src, dst;
int relative_rank, mask;
- int mpi_errno = MPI_SUCCESS;
+ int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
int scatter_size, curr_size, recv_size = 0, send_size;
MPI_Comm comm;
@@ -283,10 +295,14 @@
relative_rank*scatter_size),
recv_size, MPI_BYTE, src,
MPIR_BCAST_TAG, comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
- /* query actual size of data received */
- MPIR_Get_count_impl(&status, MPI_BYTE, &curr_size);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ curr_size = 0;
+ } else
+ /* query actual size of data received */
+ MPIR_Get_count_impl(&status, MPI_BYTE, &curr_size);
}
break;
}
@@ -314,7 +330,11 @@
scatter_size*(relative_rank+mask)),
send_size, MPI_BYTE, dst,
MPIR_BCAST_TAG, comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
curr_size -= send_size;
}
@@ -323,6 +343,8 @@
}
fn_exit:
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
@@ -363,6 +385,7 @@
int rank, comm_size, dst;
int relative_rank, mask;
int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
int scatter_size, nbytes=0, curr_size, recv_size = 0;
int type_size, j, k, i, tmp_mask, is_contig, is_homogeneous;
int relative_dst, dst_tree_root, my_tree_root, send_offset;
@@ -436,7 +459,11 @@
mpi_errno = scatter_for_bcast(buffer, count, datatype, root, comm_ptr,
nbytes, tmp_buf, is_contig, is_homogeneous);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* medium size allgather and pof2 comm_size. use recursive doubling. */
@@ -470,9 +497,13 @@
((char *)tmp_buf + recv_offset),
(nbytes-recv_offset < 0 ? 0 : nbytes-recv_offset),
MPI_BYTE, dst, MPIR_BCAST_TAG, comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
- MPIR_Get_count_impl(&status, MPI_BYTE, &recv_size);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ recv_size = 0;
+ } else
+ MPIR_Get_count_impl(&status, MPI_BYTE, &recv_size);
curr_size += recv_size;
}
@@ -540,7 +571,11 @@
/* recv_size was set in the previous
receive. that's the amount of data to be
sent now. */
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* recv only if this proc. doesn't have data and sender
has data */
@@ -556,9 +591,13 @@
comm, &status);
/* nprocs_completed is also equal to the no. of processes
whose data we don't have */
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
- MPIR_Get_count_impl(&status, MPI_BYTE, &recv_size);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ recv_size = 0;
+ } else
+ MPIR_Get_count_impl(&status, MPI_BYTE, &recv_size);
curr_size += recv_size;
/* printf("Rank %d, recv from %d, offset %d, size %d\n", rank, dst, offset, recv_size);
fflush(stdout);*/
@@ -586,6 +625,8 @@
fn_exit:
MPIU_CHKLMEM_FREEALL();
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
@@ -621,6 +662,7 @@
int rank, comm_size;
int relative_rank;
int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
int scatter_size, nbytes;
int type_size, j, i, is_contig, is_homogeneous;
int position;
@@ -690,7 +732,11 @@
mpi_errno = scatter_for_bcast(buffer, count, datatype, root, comm_ptr,
nbytes, tmp_buf, is_contig, is_homogeneous);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* long-message allgather or medium-size but non-power-of-two. use ring algorithm. */
@@ -727,7 +773,11 @@
recvcnts[(jnext-root+comm_size)%comm_size],
MPI_BYTE, left,
MPIR_BCAST_TAG, comm, MPI_STATUS_IGNORE);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
j = jnext;
jnext = (comm_size + jnext - 1) % comm_size;
@@ -746,6 +796,8 @@
fn_exit:
MPIU_CHKLMEM_FREEALL();
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
@@ -769,7 +821,11 @@
{ \
mpi_errno_ = bcast_fn_(buffer_, count_, datatype_, root_, comm_ptr_); \
} \
- if (mpi_errno_) MPIU_ERR_POP(mpi_errno_); \
+ if (mpi_errno) { \
+ /* for communication errors, just record the error but continue */ \
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); \
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); \
+ } \
} while (0)
/* FIXME This function uses some heuristics based off of some testing on a
@@ -786,6 +842,7 @@
MPID_Comm *comm_ptr)
{
int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
int type_size, is_homogeneous;
int nbytes=0;
@@ -828,7 +885,11 @@
mpi_errno = MPIC_Recv(buffer,count,datatype,MPIU_Get_intranode_rank(comm_ptr, root),
MPIR_BCAST_TAG,comm_ptr->node_comm->handle,MPI_STATUS_IGNORE);
}
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* perform the internode broadcast */
@@ -905,11 +966,17 @@
algorithm that (at least approximately) minimized internode
communication. */
mpi_errno = MPIR_Bcast_scatter_ring_allgather(buffer, count, datatype, root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
fn_exit:
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
@@ -971,6 +1038,7 @@
MPID_Comm *comm_ptr )
{
int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
int comm_size;
int nbytes=0;
int type_size, is_homogeneous;
@@ -987,7 +1055,11 @@
#if defined(USE_SMP_COLLECTIVES)
if (MPIR_Comm_is_node_aware(comm_ptr)) {
mpi_errno = MPIR_SMP_Bcast(buffer, count, datatype, root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
goto fn_exit;
}
#endif
@@ -1018,14 +1090,22 @@
if ((nbytes < MPIR_PARAM_BCAST_SHORT_MSG_SIZE) || (comm_size < MPIR_PARAM_BCAST_MIN_PROCS))
{
mpi_errno = MPIR_Bcast_binomial(buffer, count, datatype, root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else /* (nbytes >= MPIR_PARAM_BCAST_SHORT_MSG_SIZE) && (comm_size >= MPIR_PARAM_BCAST_MIN_PROCS) */
{
if ((nbytes < MPIR_PARAM_BCAST_LONG_MSG_SIZE) && (MPIU_is_pof2(comm_size, NULL)))
{
mpi_errno = MPIR_Bcast_scatter_doubling_allgather(buffer, count, datatype, root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else /* (nbytes >= MPIR_PARAM_BCAST_LONG_MSG_SIZE) || !(comm_size_is_pof2) */
{
@@ -1033,7 +1113,11 @@
topologically aware communicator. Doing inter/intra-node
communication phases breaks the pipelining of the algorithm. */
mpi_errno = MPIR_Bcast_scatter_ring_allgather(buffer, count, datatype, root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
@@ -1043,6 +1127,8 @@
MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_BCAST);
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
@@ -1067,6 +1153,7 @@
intracommunicator broadcast.
*/
int rank, mpi_errno;
+ int mpi_errno_ret = MPI_SUCCESS;
MPI_Status status;
MPID_Comm *newcomm_ptr = NULL;
MPI_Comm comm;
@@ -1087,7 +1174,11 @@
MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );
mpi_errno = MPIC_Send(buffer, count, datatype, 0,
MPIR_BCAST_TAG, comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
}
else
@@ -1100,7 +1191,11 @@
{
mpi_errno = MPIC_Recv(buffer, count, datatype, root,
MPIR_BCAST_TAG, comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* Get the local intracommunicator */
@@ -1112,11 +1207,17 @@
/* now do the usual broadcast on this intracommunicator
with rank 0 as root. */
mpi_errno = MPIR_Bcast_intra(buffer, count, datatype, 0, newcomm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
fn_fail:
MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_BCAST_INTER);
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
}
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/exscan.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/exscan.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/exscan.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -106,7 +106,8 @@
{
MPI_Status status;
int rank, comm_size;
- int mpi_errno = MPI_SUCCESS;
+ int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
int mask, dst, is_commutative, flag;
MPI_Aint true_extent, true_lb, extent;
void *partial_scan, *tmp_buf;
@@ -187,7 +188,11 @@
count, datatype, dst,
MPIR_EXSCAN_TAG, comm,
&status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
if (rank > dst) {
call_uop(tmp_buf, partial_scan, count, datatype);
@@ -237,7 +242,9 @@
fn_exit:
MPIU_CHKLMEM_FREEALL();
- return (mpi_errno);
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
+ return mpi_errno;
fn_fail:
goto fn_exit;
}
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/gather.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/gather.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/gather.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -61,7 +61,8 @@
MPID_Comm *comm_ptr )
{
int comm_size, rank;
- int mpi_errno = MPI_SUCCESS;
+ int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
int curr_cnt=0, relative_rank, nbytes, is_homogeneous;
int mask, sendtype_size, recvtype_size, src, dst, relative_src;
int recvblks;
@@ -194,7 +195,11 @@
recvblks * recvcnt, recvtype, src,
MPIR_GATHER_TAG, comm,
&status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else if (nbytes < MPIR_PARAM_GATHER_VSMALL_MSG_SIZE) {
mpi_errno = MPIC_Recv(tmp_buf, recvblks * nbytes, MPI_BYTE,
@@ -218,8 +223,9 @@
mpi_errno = MPIC_Recv(recvbuf, 1, tmp_type, src,
MPIR_GATHER_TAG, comm, &status);
if (mpi_errno) {
- MPIR_Type_free_impl(&tmp_type);
- MPIU_ERR_POP(mpi_errno);
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
}
MPIR_Type_free_impl(&tmp_type);
@@ -243,7 +249,11 @@
recvblks * nbytes, MPI_BYTE, src,
MPIR_GATHER_TAG, comm,
&status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
curr_cnt += (recvblks * nbytes);
}
}
@@ -258,12 +268,20 @@
/* leaf nodes send directly from sendbuf */
mpi_errno = MPIC_Send(sendbuf, sendcnt, sendtype, dst,
MPIR_GATHER_TAG, comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else if (nbytes < MPIR_PARAM_GATHER_VSMALL_MSG_SIZE) {
mpi_errno = MPIC_Send(tmp_buf, curr_cnt, MPI_BYTE, dst,
MPIR_GATHER_TAG, comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else {
blocks[0] = sendcnt;
@@ -282,8 +300,9 @@
mpi_errno = MPIC_Send(MPI_BOTTOM, 1, tmp_type, dst,
MPIR_GATHER_TAG, comm);
if (mpi_errno) {
- MPIR_Type_free_impl(&tmp_type);
- MPIU_ERR_POP(mpi_errno);
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
}
MPIR_Type_free_impl(&tmp_type);
}
@@ -352,10 +371,15 @@
tmp_buf_size-curr_cnt, MPI_BYTE, src,
MPIR_GATHER_TAG, comm,
&status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
- /* the recv size is larger than what may be sent in
- some cases. query amount of data actually received */
- MPIR_Get_count_impl(&status, MPI_BYTE, &recv_size);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ recv_size = 0;
+ } else
+ /* the recv size is larger than what may be sent in
+ some cases. query amount of data actually received */
+ MPIR_Get_count_impl(&status, MPI_BYTE, &recv_size);
curr_cnt += recv_size;
}
}
@@ -365,7 +389,11 @@
dst = (dst + root) % comm_size;
mpi_errno = MPIC_Send(tmp_buf, curr_cnt, MPI_BYTE, dst,
MPIR_GATHER_TAG, comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
break;
}
mask <<= 1;
@@ -404,6 +432,8 @@
MPIU_CHKLMEM_FREEALL();
/* check if multiple threads are calling this collective function */
MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
@@ -438,6 +468,7 @@
*/
int rank, local_size, remote_size, mpi_errno=MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
int i, nbytes, sendtype_size, recvtype_size;
MPI_Status status;
MPI_Aint extent, true_extent, true_lb = 0;
@@ -478,8 +509,11 @@
mpi_errno = MPIC_Recv(recvbuf, recvcnt*remote_size,
recvtype, 0, MPIR_GATHER_TAG, comm,
&status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else
{
@@ -513,14 +547,22 @@
mpi_errno = MPIR_Gather_impl(sendbuf, sendcnt, sendtype,
tmp_buf, sendcnt, sendtype, 0,
newcomm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
if (rank == 0)
{
mpi_errno = MPIC_Send(tmp_buf, sendcnt*local_size,
sendtype, root,
MPIR_GATHER_TAG, comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
}
@@ -538,20 +580,30 @@
mpi_errno = MPIC_Recv(((char *)recvbuf+recvcnt*i*extent),
recvcnt, recvtype, i,
MPIR_GATHER_TAG, comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
else
{
mpi_errno = MPIC_Send(sendbuf,sendcnt,sendtype,root,
MPIR_GATHER_TAG,comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
fn_exit:
MPIU_CHKLMEM_FREEALL();
MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/gatherv.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/gatherv.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/gatherv.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -61,6 +61,7 @@
{
int comm_size, rank;
int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
MPI_Comm comm;
MPI_Aint extent;
int i, reqs;
@@ -120,7 +121,11 @@
for (i = 0; i < reqs; i++) {
if (starray[i].MPI_ERROR != MPI_SUCCESS) {
mpi_errno = starray[i].MPI_ERROR;
- MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
}
@@ -143,12 +148,20 @@
if (comm_size >= min_procs) {
mpi_errno = MPIC_Ssend(sendbuf, sendcnt, sendtype, root,
MPIR_GATHERV_TAG, comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else {
mpi_errno = MPIC_Send(sendbuf, sendcnt, sendtype, root,
MPIR_GATHERV_TAG, comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
}
@@ -158,6 +171,8 @@
/* check if multiple threads are calling this collective function */
MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
MPIU_CHKLMEM_FREEALL();
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/red_scat.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/red_scat.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/red_scat.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -83,6 +83,7 @@
MPID_Comm *comm_ptr )
{
int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
int comm_size = comm_ptr->local_size;
int rank = comm_ptr->rank;
int pof2;
@@ -191,7 +192,11 @@
incoming_data + recv_offset*true_extent,
size, datatype, peer, MPIR_REDUCE_SCATTER_TAG,
comm, MPI_STATUS_IGNORE);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* always perform the reduction at recv_offset, the data at send_offset
is now our peer's responsibility */
if (rank > peer) {
@@ -222,6 +227,8 @@
recvbuf, size, datatype);
fn_exit:
MPIU_CHKLMEM_FREEALL();
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
@@ -292,7 +299,8 @@
MPI_Aint extent, true_extent, true_lb;
int *disps;
void *tmp_recvbuf, *tmp_results;
- int mpi_errno = MPI_SUCCESS;
+ int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
int type_size, dis[2], blklens[2], total_count, nbytes, src, dst;
int mask, dst_tree_root, my_tree_root, j, k;
int *newcnts, *newdisps, rem, newdst, send_idx, recv_idx,
@@ -408,7 +416,11 @@
mpi_errno = MPIC_Send(tmp_results, total_count,
datatype, rank+1,
MPIR_REDUCE_SCATTER_TAG, comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* temporarily set the rank to -1 so that this
process does not pariticipate in recursive
@@ -420,7 +432,11 @@
datatype, rank-1,
MPIR_REDUCE_SCATTER_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* do the reduction on received data. since the
ordering is right, it doesn't matter whether
@@ -519,7 +535,11 @@
dst, MPIR_REDUCE_SCATTER_TAG,
comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* tmp_recvbuf contains data received in this step.
tmp_results contains data accumulated so far */
@@ -567,7 +587,11 @@
disps[rank-1]*extent, recvcnts[rank-1],
datatype, rank-1,
MPIR_REDUCE_SCATTER_TAG, comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
else { /* even */
@@ -576,7 +600,11 @@
datatype, rank+1,
MPIR_REDUCE_SCATTER_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
}
@@ -621,7 +649,11 @@
MPIR_REDUCE_SCATTER_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
if (is_commutative || (src < rank)) {
if (sendbuf != MPI_IN_PLACE) {
@@ -818,7 +850,11 @@
MPIR_REDUCE_SCATTER_TAG, comm,
MPI_STATUS_IGNORE);
received = 1;
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* if some processes in this process's subtree in this step
@@ -871,7 +907,11 @@
MPIR_REDUCE_SCATTER_TAG,
comm, MPI_STATUS_IGNORE);
received = 1;
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
tmp_mask >>= 1;
k--;
@@ -959,7 +999,9 @@
if (MPIU_THREADPRIV_FIELD(op_errno))
mpi_errno = MPIU_THREADPRIV_FIELD(op_errno);
- return (mpi_errno);
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
+ return mpi_errno;
fn_fail:
goto fn_exit;
}
@@ -986,6 +1028,7 @@
*/
int rank, mpi_errno, root, local_size, total_count, i;
+ int mpi_errno_ret = MPI_SUCCESS;
MPI_Aint true_extent, true_lb = 0, extent;
void *tmp_buf=NULL;
int *disps=NULL;
@@ -1026,26 +1069,42 @@
root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
mpi_errno = MPIR_Reduce_inter(sendbuf, tmp_buf, total_count, datatype, op,
root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* reduce to rank 0 of right group */
root = 0;
mpi_errno = MPIR_Reduce_inter(sendbuf, tmp_buf, total_count, datatype, op,
root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else {
/* reduce to rank 0 of left group */
root = 0;
mpi_errno = MPIR_Reduce_inter(sendbuf, tmp_buf, total_count, datatype, op,
root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* reduce from right group to rank 0 */
root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
mpi_errno = MPIR_Reduce_inter(sendbuf, tmp_buf, total_count, datatype, op,
root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* Get the local intracommunicator */
@@ -1058,10 +1117,16 @@
mpi_errno = MPIR_Scatterv(tmp_buf, recvcnts, disps, datatype, recvbuf,
recvcnts[rank], datatype, 0, newcomm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
fn_exit:
MPIU_CHKLMEM_FREEALL();
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
@@ -1073,7 +1138,7 @@
implementations of reduce_scatter. In all other cases
MPIR_Reduce_Scatter_impl should be used. */
#undef FUNCNAME
-#define FUNCNAME MPIR_Reduce_scatter_impl
+#define FUNCNAME MPIR_Reduce_scatter
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
int MPIR_Reduce_scatter(void *sendbuf, void *recvbuf, int *recvcnts,
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/red_scat_block.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/red_scat_block.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/red_scat_block.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -89,6 +89,7 @@
MPID_Comm *comm_ptr )
{
int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
int comm_size = comm_ptr->local_size;
int rank = comm_ptr->rank;
int pof2;
@@ -193,7 +194,11 @@
incoming_data + recv_offset*true_extent,
size, datatype, peer, MPIR_REDUCE_SCATTER_BLOCK_TAG,
comm, MPI_STATUS_IGNORE);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* always perform the reduction at recv_offset, the data at send_offset
is now our peer's responsibility */
if (rank > peer) {
@@ -226,6 +231,8 @@
fn_exit:
MPIU_CHKLMEM_FREEALL();
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
@@ -296,7 +303,8 @@
MPI_Aint extent, true_extent, true_lb;
int *disps;
void *tmp_recvbuf, *tmp_results;
- int mpi_errno = MPI_SUCCESS;
+ int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
int type_size, dis[2], blklens[2], total_count, nbytes, src, dst;
int mask, dst_tree_root, my_tree_root, j, k;
int *newcnts, *newdisps, rem, newdst, send_idx, recv_idx,
@@ -411,7 +419,11 @@
mpi_errno = MPIC_Send(tmp_results, total_count,
datatype, rank+1,
MPIR_REDUCE_SCATTER_BLOCK_TAG, comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* temporarily set the rank to -1 so that this
process does not pariticipate in recursive
@@ -423,7 +435,11 @@
datatype, rank-1,
MPIR_REDUCE_SCATTER_BLOCK_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* do the reduction on received data. since the
ordering is right, it doesn't matter whether
@@ -522,7 +538,11 @@
dst, MPIR_REDUCE_SCATTER_BLOCK_TAG,
comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* tmp_recvbuf contains data received in this step.
tmp_results contains data accumulated so far */
@@ -573,7 +593,11 @@
MPIR_REDUCE_SCATTER_BLOCK_TAG, comm,
MPI_STATUS_IGNORE);
}
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
@@ -616,7 +640,11 @@
MPIR_REDUCE_SCATTER_BLOCK_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
if (is_commutative || (src < rank)) {
if (sendbuf != MPI_IN_PLACE) {
@@ -803,7 +831,11 @@
MPIR_REDUCE_SCATTER_BLOCK_TAG, comm,
MPI_STATUS_IGNORE);
received = 1;
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* if some processes in this process's subtree in this step
@@ -845,7 +877,11 @@
mpi_errno = MPIC_Send(tmp_recvbuf, 1, recvtype,
dst, MPIR_REDUCE_SCATTER_BLOCK_TAG,
comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* recv only if this proc. doesn't have data and sender
has data */
@@ -856,7 +892,11 @@
MPIR_REDUCE_SCATTER_BLOCK_TAG,
comm, MPI_STATUS_IGNORE);
received = 1;
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
tmp_mask >>= 1;
k--;
@@ -944,7 +984,9 @@
if (MPIU_THREADPRIV_FIELD(op_errno))
mpi_errno = MPIU_THREADPRIV_FIELD(op_errno);
- return (mpi_errno);
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
+ return mpi_errno;
fn_fail:
goto fn_exit;
}
@@ -971,6 +1013,7 @@
*/
int rank, mpi_errno, root, local_size, total_count;
+ int mpi_errno_ret = MPI_SUCCESS;
MPI_Aint true_extent, true_lb = 0, extent;
void *tmp_buf=NULL;
MPID_Comm *newcomm_ptr = NULL;
@@ -1001,26 +1044,42 @@
root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
mpi_errno = MPIR_Reduce_inter(sendbuf, tmp_buf, total_count, datatype, op,
root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* reduce to rank 0 of right group */
root = 0;
mpi_errno = MPIR_Reduce_inter(sendbuf, tmp_buf, total_count, datatype, op,
root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else {
/* reduce to rank 0 of left group */
root = 0;
mpi_errno = MPIR_Reduce_inter(sendbuf, tmp_buf, total_count, datatype, op,
root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* reduce from right group to rank 0 */
root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
mpi_errno = MPIR_Reduce_inter(sendbuf, tmp_buf, total_count, datatype, op,
root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* Get the local intracommunicator */
@@ -1031,10 +1090,16 @@
mpi_errno = MPIR_Scatter_impl(tmp_buf, recvcount, datatype, recvbuf,
recvcount, datatype, 0, newcomm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
fn_exit:
MPIU_CHKLMEM_FREEALL();
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/reduce.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/reduce.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/reduce.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -41,6 +41,7 @@
MPID_Comm *comm_ptr )
{
int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
MPI_Status status;
int comm_size, rank, is_commutative, type_size;
int mask, relrank, source, lroot;
@@ -168,7 +169,11 @@
source = (source + lroot) % comm_size;
mpi_errno = MPIC_Recv (tmp_buf, count, datatype, source,
MPIR_REDUCE_TAG, comm, &status);
- if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* The sender is above us, so the received buffer must be
the second argument (in the noncommutative case). */
@@ -203,7 +208,11 @@
source = ((relrank & (~ mask)) + lroot) % comm_size;
mpi_errno = MPIC_Send( recvbuf, count, datatype,
source, MPIR_REDUCE_TAG, comm );
- if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
break;
}
mask <<= 1;
@@ -221,7 +230,11 @@
mpi_errno = MPIC_Recv ( recvbuf, count, datatype, 0,
MPIR_REDUCE_TAG, comm, &status);
}
- if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* FIXME does this need to be checked after each uop invocation for
@@ -235,6 +248,8 @@
fn_exit:
MPIU_CHKLMEM_FREEALL();
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
@@ -281,6 +296,7 @@
MPID_Comm *comm_ptr )
{
int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
int comm_size, rank, is_commutative, type_size, pof2, rem, newrank;
int mask, *cnts, *disps, i, j, send_idx=0;
int recv_idx, last_idx=0, newdst;
@@ -389,7 +405,11 @@
mpi_errno = MPIC_Send(recvbuf, count,
datatype, rank-1,
MPIR_REDUCE_TAG, comm);
- if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* temporarily set the rank to -1 so that this
process does not pariticipate in recursive
@@ -401,7 +421,11 @@
datatype, rank+1,
MPIR_REDUCE_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* do the reduction on received data. */
/* This algorithm is used only for predefined ops
@@ -480,7 +504,11 @@
recv_cnt, datatype, dst,
MPIR_REDUCE_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* tmp_buf contains data received in this step.
recvbuf contains data accumulated so far */
@@ -534,7 +562,11 @@
mpi_errno = MPIC_Recv(recvbuf, cnts[0], datatype,
0, MPIR_REDUCE_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
newrank = 0;
send_idx = 0;
last_idx = 2;
@@ -542,7 +574,11 @@
else if (newrank == 0) { /* send */
mpi_errno = MPIC_Send(recvbuf, cnts[0], datatype,
root, MPIR_REDUCE_TAG, comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
newrank = -1;
}
newroot = 0;
@@ -611,7 +647,11 @@
send_cnt, datatype,
dst, MPIR_REDUCE_TAG,
comm);
- if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
break;
}
else {
@@ -623,7 +663,11 @@
recv_cnt, datatype, dst,
MPIR_REDUCE_TAG, comm,
MPI_STATUS_IGNORE);
- if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
if (newrank > newdst) send_idx = recv_idx;
@@ -644,6 +688,8 @@
fn_exit:
MPIU_CHKLMEM_FREEALL();
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
@@ -720,6 +766,7 @@
MPID_Comm *comm_ptr )
{
int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
int comm_size, is_commutative, type_size, pof2;
MPID_Op *op_ptr;
#if defined(USE_SMP_COLLECTIVES)
@@ -762,7 +809,11 @@
MPIU_Get_intranode_rank(comm_ptr, root) == -1) {
mpi_errno = MPIR_Reduce_impl(sendbuf, tmp_buf, count, datatype,
op, 0, comm_ptr->node_comm);
- if (mpi_errno) goto fn_fail;
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* do the internode reduce to the root's node */
@@ -774,7 +825,11 @@
mpi_errno = MPIR_Reduce_impl(buf, NULL, count, datatype,
op, MPIU_Get_internode_rank(comm_ptr, root),
comm_ptr->node_roots_comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else { /* I am on root's node. I have not participated in the earlier reduce. */
if (comm_ptr->rank != root) {
@@ -784,7 +839,11 @@
mpi_errno = MPIR_Reduce_impl(sendbuf, tmp_buf, count, datatype,
op, MPIU_Get_internode_rank(comm_ptr, root),
comm_ptr->node_roots_comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* point sendbuf at tmp_buf to make final intranode reduce easy */
sendbuf = tmp_buf;
@@ -795,7 +854,11 @@
mpi_errno = MPIR_Reduce_impl(sendbuf, recvbuf, count, datatype,
op, MPIU_Get_internode_rank(comm_ptr, root),
comm_ptr->node_roots_comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
/* set sendbuf to MPI_IN_PLACE to make final intranode reduce easy. */
sendbuf = MPI_IN_PLACE;
@@ -810,7 +873,11 @@
mpi_errno = MPIR_Reduce_impl(sendbuf, recvbuf, count, datatype,
op, MPIU_Get_intranode_rank(comm_ptr, root),
comm_ptr->node_comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
goto fn_exit;
@@ -842,12 +909,20 @@
(HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) && (count >= pof2)) {
/* do a reduce-scatter followed by gather to root. */
mpi_errno = MPIR_Reduce_redscat_gather(sendbuf, recvbuf, count, datatype, op, root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else {
/* use a binomial tree algorithm */
mpi_errno = MPIR_Reduce_binomial(sendbuf, recvbuf, count, datatype, op, root, comm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
@@ -857,6 +932,8 @@
#if defined(USE_SMP_COLLECTIVES)
MPIU_CHKLMEM_FREEALL();
#endif
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
@@ -886,6 +963,7 @@
*/
int rank, mpi_errno;
+ int mpi_errno_ret = MPI_SUCCESS;
MPI_Status status;
MPI_Aint true_extent, true_lb, extent;
void *tmp_buf=NULL;
@@ -906,7 +984,11 @@
/* root receives data from rank 0 on remote group */
mpi_errno = MPIC_Recv(recvbuf, count, datatype, 0,
MPIR_REDUCE_TAG, comm, &status);
- if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else {
/* remote group. Rank 0 allocates temporary buffer, does
@@ -939,19 +1021,29 @@
/* now do a local reduce on this intracommunicator */
mpi_errno = MPIR_Reduce_intra(sendbuf, tmp_buf, count, datatype,
op, 0, newcomm_ptr);
- if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
if (rank == 0)
{
mpi_errno = MPIC_Send(tmp_buf, count, datatype, root,
MPIR_REDUCE_TAG, comm);
- if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
fn_exit:
MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
MPIU_CHKLMEM_FREEALL();
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scan.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scan.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scan.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -75,7 +75,8 @@
{
MPI_Status status;
int rank, comm_size;
- int mpi_errno = MPI_SUCCESS;
+ int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
int mask, dst, is_commutative;
MPI_Aint true_extent, true_lb, extent;
void *partial_scan, *tmp_buf;
@@ -171,7 +172,11 @@
count, datatype, dst,
MPIR_SCAN_TAG, comm,
&status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
if (rank > dst) {
#ifdef HAVE_CXX_BINDING
@@ -228,7 +233,9 @@
/* check if multiple threads are calling this collective function */
MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
- return (mpi_errno);
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
+ return mpi_errno;
fn_fail:
goto fn_exit;
}
@@ -252,6 +259,7 @@
MPID_Comm *comm_ptr )
{
int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
MPIU_CHKLMEM_DECL(3);
MPIU_THREADPRIV_DECL;
int rank = comm_ptr->rank;
@@ -303,7 +311,11 @@
{
mpi_errno = MPIR_Scan_impl(sendbuf, recvbuf, count, datatype,
op, comm_ptr->node_comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else if (sendbuf != MPI_IN_PLACE)
{
@@ -321,7 +333,11 @@
mpi_errno = MPIC_Recv(localfulldata, count, datatype,
comm_ptr->node_comm->local_size - 1, MPIR_SCAN_TAG,
comm_ptr->node_comm->handle, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else if (comm_ptr->node_roots_comm == NULL &&
comm_ptr->node_comm != NULL &&
@@ -329,7 +345,11 @@
{
mpi_errno = MPIC_Send(recvbuf, count, datatype,
0, MPIR_SCAN_TAG, comm_ptr->node_comm->handle);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else if (comm_ptr->node_roots_comm != NULL)
{
@@ -344,7 +364,11 @@
{
mpi_errno = MPIR_Scan_impl(localfulldata, prefulldata, count, datatype,
op, comm_ptr->node_roots_comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
if (MPIU_Get_internode_rank(comm_ptr, rank) !=
comm_ptr->node_roots_comm->local_size-1)
@@ -352,7 +376,11 @@
mpi_errno = MPIC_Send(prefulldata, count, datatype,
MPIU_Get_internode_rank(comm_ptr, rank) + 1,
MPIR_SCAN_TAG, comm_ptr->node_roots_comm->handle);
- if(mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
if (MPIU_Get_internode_rank(comm_ptr, rank) != 0)
{
@@ -361,7 +389,11 @@
MPIR_SCAN_TAG, comm_ptr->node_roots_comm->handle,
&status);
noneed = 0;
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
@@ -373,7 +405,11 @@
if (comm_ptr->node_comm != NULL) {
mpi_errno = MPIR_Bcast_impl(&noneed, 1, MPI_INT, 0, comm_ptr->node_comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
if (noneed == 0) {
@@ -382,7 +418,11 @@
#endif
if (comm_ptr->node_comm != NULL) {
mpi_errno = MPIR_Bcast_impl(tempbuf, count, datatype, 0, comm_ptr->node_comm);
- if(mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* do reduce on tempbuf and recvbuf, finish scan. */
@@ -420,6 +460,8 @@
fn_exit:
MPIU_CHKLMEM_FREEALL();
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scatter.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scatter.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scatter.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -68,6 +68,7 @@
int tmp_buf_size = 0;
void *tmp_buf=NULL;
int mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
MPI_Comm comm;
MPIU_CHKLMEM_DECL(4);
@@ -171,16 +172,24 @@
mpi_errno = MPIC_Recv(recvbuf, recvcnt, recvtype,
src, MPIR_SCATTER_TAG, comm,
&status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
else {
mpi_errno = MPIC_Recv(tmp_buf, tmp_buf_size, MPI_BYTE, src,
MPIR_SCATTER_TAG, comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
- /* the recv size is larger than what may be sent in
- some cases. query amount of data actually received */
- MPIR_Get_count_impl(&status, MPI_BYTE, &curr_cnt);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ curr_cnt = 0;
+ } else
+ /* the recv size is larger than what may be sent in
+ some cases. query amount of data actually received */
+ MPIR_Get_count_impl(&status, MPI_BYTE, &curr_cnt);
}
break;
}
@@ -218,7 +227,11 @@
MPI_BYTE, dst,
MPIR_SCATTER_TAG, comm);
}
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
curr_cnt -= send_subtree_cnt;
}
mask >>= 1;
@@ -319,10 +332,15 @@
mpi_errno = MPIC_Recv(tmp_buf, tmp_buf_size, MPI_BYTE, src,
MPIR_SCATTER_TAG, comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
- /* the recv size is larger than what may be sent in
- some cases. query amount of data actually received */
- MPIR_Get_count_impl(&status, MPI_BYTE, &curr_cnt);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ curr_cnt = 0;
+ } else
+ /* the recv size is larger than what may be sent in
+ some cases. query amount of data actually received */
+ MPIR_Get_count_impl(&status, MPI_BYTE, &curr_cnt);
break;
}
mask <<= 1;
@@ -344,7 +362,11 @@
mpi_errno = MPIC_Send (((char *)tmp_buf + nbytes*mask),
send_subtree_cnt, MPI_BYTE, dst,
MPIR_SCATTER_TAG, comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
curr_cnt -= send_subtree_cnt;
}
mask >>= 1;
@@ -364,6 +386,8 @@
MPIU_CHKLMEM_FREEALL();
/* check if multiple threads are calling this collective function */
MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
@@ -396,6 +420,7 @@
*/
int rank, local_size, remote_size, mpi_errno=MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
int i, nbytes, sendtype_size, recvtype_size;
MPI_Status status;
MPI_Aint extent, true_extent, true_lb = 0;
@@ -429,7 +454,11 @@
/* root sends all data to rank 0 on remote group and returns */
mpi_errno = MPIC_Send(sendbuf, sendcnt*remote_size,
sendtype, 0, MPIR_SCATTER_TAG, comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
goto fn_exit;
}
else {
@@ -454,7 +483,11 @@
mpi_errno = MPIC_Recv(tmp_buf, recvcnt*local_size,
recvtype, root,
MPIR_SCATTER_TAG, comm, &status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
/* Get the local intracommunicator */
@@ -467,7 +500,11 @@
mpi_errno = MPIR_Scatter_impl(tmp_buf, recvcnt, recvtype,
recvbuf, recvcnt, recvtype, 0,
newcomm_ptr);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
else {
@@ -478,13 +515,21 @@
mpi_errno = MPIC_Send(((char *)sendbuf+sendcnt*i*extent),
sendcnt, sendtype, i,
MPIR_SCATTER_TAG, comm);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
else {
mpi_errno = MPIC_Recv(recvbuf,recvcnt,recvtype,root,
MPIR_SCATTER_TAG,comm,&status);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
@@ -492,6 +537,8 @@
MPIU_CHKLMEM_FREEALL();
/* check if multiple threads are calling this collective function */
MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scatterv.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scatterv.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/coll/scatterv.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -60,6 +60,7 @@
MPID_Comm *comm_ptr )
{
int rank, comm_size, mpi_errno = MPI_SUCCESS;
+ int mpi_errno_ret = MPI_SUCCESS;
MPI_Comm comm;
MPI_Aint extent;
int i, reqs;
@@ -120,7 +121,11 @@
for (i = 0; i < reqs; i++) {
if (starray[i].MPI_ERROR != MPI_SUCCESS) {
mpi_errno = starray[i].MPI_ERROR;
- MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
}
@@ -131,7 +136,11 @@
if (recvcnt) {
mpi_errno = MPIC_Recv(recvbuf,recvcnt,recvtype,root,
MPIR_SCATTERV_TAG,comm,MPI_STATUS_IGNORE);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ /* for communication errors, just record the error but continue */
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
+ MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
+ }
}
}
@@ -140,6 +149,8 @@
/* check if multiple threads are calling this collective function */
MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
MPIU_CHKLMEM_FREEALL();
+ if (mpi_errno_ret)
+ mpi_errno = mpi_errno_ret;
return mpi_errno;
fn_fail:
goto fn_exit;
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpi/errhan/errnames.txt
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpi/errhan/errnames.txt 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpi/errhan/errnames.txt 2011-01-13 22:17:24 UTC (rev 7723)
@@ -883,6 +883,7 @@
**signal:signal() failed
**signal %s:signal() failed: %s
+**sigusr1:This version of MPICH requires the SIGUSR1 signal, but the application has already installed a handler
#
# mpi functions
Property changes on: mpich2/branches/release/mpich2-1.3.x/src/mpid
___________________________________________________________________
Added: svn:mergeinfo
+ /mpich2/branches/dev/ckpt/src/mpid:5050
/mpich2/branches/dev/ckpt2/src/mpid:5057-6537
/mpich2/branches/dev/error-return/src/mpid:7405-7603,7662-7670
/mpich2/branches/dev/ftb/src/mpid:5661-5730
/mpich2/branches/dev/lapi/src/mpid:5817
/mpich2/branches/dev/wintcp_async_progress/src/mpid:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/mpid:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/mpid:5406
/mpich2/trunk/src/mpid:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_post.h
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_post.h 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_post.h 2011-01-13 22:17:24 UTC (rev 7723)
@@ -10,12 +10,8 @@
/* #define MPIDI_CH3_EAGER_MAX_MSG_SIZE (1500 - sizeof(MPIDI_CH3_Pkt_t)) */
#define MPIDI_CH3_EAGER_MAX_MSG_SIZE (128*1024)
-#define MPIDI_CH3_Progress_start(progress_state_) \
-do { \
- MPIU_THREAD_CS_ENTER(COMPLETION,); \
- (progress_state_)->ch.completion_count = MPIDI_CH3I_progress_completion_count; \
- MPIU_THREAD_CS_EXIT(COMPLETION,); \
-} while (0)
+#define MPIDI_CH3_Progress_start(progress_state_) \
+ (progress_state_)->ch.completion_count = OPA_load_int(&MPIDI_CH3I_progress_completion_count);
#define MPIDI_CH3_Progress_end(progress_state_)
enum {
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_pre.h
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_pre.h 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_pre.h 2011-01-13 22:17:24 UTC (rev 7723)
@@ -14,6 +14,7 @@
#include <winsock2.h>
#include <windows.h>
#endif
+#include "opa_primitives.h"
/*#define MPID_USE_SEQUENCE_NUMBERS*/
/*#define HAVE_CH3_PRE_INIT*/
@@ -130,5 +131,14 @@
#define MPIDI_CH3_PROGRESS_STATE_DECL MPIDI_CH3I_Progress_state ch;
+extern OPA_int_t MPIDI_CH3I_progress_completion_count;
+#define MPIDI_CH3I_INCR_PROGRESS_COMPLETION_COUNT do { \
+ OPA_write_barrier(); \
+ OPA_incr_int(&MPIDI_CH3I_progress_completion_count); \
+ MPIU_DBG_MSG_D(CH3_PROGRESS,VERBOSE, \
+ "just incremented MPIDI_CH3I_progress_completion_count=%d", \
+ OPA_load_int(&MPIDI_CH3I_progress_completion_count)); \
+ } while(0)
+
#endif /* !defined(MPICH_MPIDI_CH3_PRE_H_INCLUDED) */
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/include/mpid_nem_inline.h
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/include/mpid_nem_inline.h 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/include/mpid_nem_inline.h 2011-01-13 22:17:24 UTC (rev 7723)
@@ -23,7 +23,7 @@
static inline int MPID_nem_mpich2_sendv_header (MPID_IOV **iov, int *n_iov, MPIDI_VC_t *vc, int *again);
static inline int MPID_nem_recv_seqno_matches (MPID_nem_queue_ptr_t qhead);
static inline int MPID_nem_mpich2_test_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox, int in_blocking_progress);
-static inline int MPID_nem_mpich2_blocking_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox);
+static inline int MPID_nem_mpich2_blocking_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox, int completions);
static inline int MPID_nem_mpich2_test_recv_wait (MPID_nem_cell_ptr_t *cell, int *in_fbox, int timeout);
static inline int MPID_nem_mpich2_release_cell (MPID_nem_cell_ptr_t cell, MPIDI_VC_t *vc);
static inline void MPID_nem_mpich2_send_seg_header (MPID_Segment *segment, MPIDI_msg_sz_t *segment_first,
@@ -863,10 +863,9 @@
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static inline int
-MPID_nem_mpich2_blocking_recv(MPID_nem_cell_ptr_t *cell, int *in_fbox)
+MPID_nem_mpich2_blocking_recv(MPID_nem_cell_ptr_t *cell, int *in_fbox, int completions)
{
int mpi_errno = MPI_SUCCESS;
- unsigned completions = MPIDI_CH3I_progress_completion_count;
#ifndef ENABLE_NO_YIELD
int pollcount = 0;
#endif
@@ -905,7 +904,7 @@
mpi_errno = MPID_nem_network_poll(TRUE /* blocking */);
if (mpi_errno) MPIU_ERR_POP (mpi_errno);
- if (completions != MPIDI_CH3I_progress_completion_count || MPID_nem_local_lmt_pending || MPIDI_CH3I_active_send[CH3_NORMAL_QUEUE]
+ if (MPID_nem_local_lmt_pending || MPIDI_CH3I_active_send[CH3_NORMAL_QUEUE]
|| MPIDI_CH3I_SendQ_head(CH3_NORMAL_QUEUE))
{
*cell = NULL;
@@ -921,6 +920,12 @@
}
++pollcount;
#endif
+
+ if (completions != OPA_load_int(&MPIDI_CH3I_progress_completion_count)) {
+ *cell = NULL;
+ *in_fbox = 0;
+ goto exit_l;
+ }
}
MPID_nem_queue_dequeue (MPID_nem_mem_region.my_recvQ, cell);
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/socksm.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/socksm.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/socksm.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -117,7 +117,7 @@
static int find_free_entry(int *index);
static int close_cleanup_and_free_sc_plfd(sockconn_t *const sc);
static int cleanup_and_free_sc_plfd(sockconn_t *const sc);
-static int error_closed(struct MPIDI_VC *const vc);
+static int error_closed(struct MPIDI_VC *const vc, int req_errno);
#undef FUNCNAME
#define FUNCNAME is_same_connection
@@ -584,10 +584,8 @@
*got_sc_eof = 1;
goto fn_exit;
}
- if (nread == -1 && errno != EAGAIN) {
- MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**read", "**read %s", MPIU_Strerror(errno));
- }
- MPIU_ERR_CHKANDJUMP1(nread != hdr_len, mpi_errno, MPI_ERR_OTHER, "**read", "**read %s", MPIU_Strerror(errno)); /* FIXME-Z1 */
+ MPIU_ERR_CHKANDJUMP1(nread == -1 && errno != EAGAIN, mpi_errno, MPI_ERR_OTHER, "**read", "**read %s", MPIU_Strerror(errno));
+ MPIU_ERR_CHKANDJUMP(nread != hdr_len, mpi_errno, MPI_ERR_OTHER, "**read"); /* FIXME-Z1 */
MPIU_Assert(hdr.pkt_type == MPIDI_NEM_TCP_SOCKSM_PKT_ID_INFO ||
hdr.pkt_type == MPIDI_NEM_TCP_SOCKSM_PKT_TMPVC_INFO);
MPIU_Assert(hdr.datalen != 0);
@@ -603,10 +601,8 @@
++iov_cnt;
}
CHECK_EINTR (nread, readv(sc->fd, iov, iov_cnt));
- if (nread == -1 && errno != EAGAIN) {
- MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**read", "**read %s", MPIU_Strerror(errno));
- }
- MPIU_ERR_CHKANDJUMP1(nread != hdr.datalen, mpi_errno, MPI_ERR_OTHER, "**read", "**read %s", MPIU_Strerror(errno)); /* FIXME-Z1 */
+ MPIU_ERR_CHKANDJUMP1(nread == -1 && errno != EAGAIN, mpi_errno, MPI_ERR_OTHER, "**read", "**read %s", MPIU_Strerror(errno));
+ MPIU_ERR_CHKANDJUMP(nread != hdr.datalen, mpi_errno, MPI_ERR_OTHER, "**read"); /* FIXME-Z1 */
if (pg_id_len == 0) {
sc->is_same_pg = TRUE;
mpi_errno = MPID_nem_tcp_get_vc_from_conninfo (MPIDI_Process.my_pg->id,
@@ -665,10 +661,8 @@
iov[0].iov_len = sizeof(sc->vc->port_name_tag);
CHECK_EINTR (nread, readv(sc->fd, iov, iov_cnt));
- if (nread == -1 && errno != EAGAIN) {
- MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**read", "**read %s", MPIU_Strerror(errno));
- }
- MPIU_ERR_CHKANDJUMP1(nread != hdr.datalen, mpi_errno, MPI_ERR_OTHER, "**read", "**read %s", MPIU_Strerror(errno)); /* FIXME-Z1 */
+ MPIU_ERR_CHKANDJUMP1(nread == -1 && errno != EAGAIN, mpi_errno, MPI_ERR_OTHER, "**read", "**read %s", MPIU_Strerror(errno));
+ MPIU_ERR_CHKANDJUMP(nread != hdr.datalen, mpi_errno, MPI_ERR_OTHER, "**read"); /* FIXME-Z1 */
sc->is_same_pg = FALSE;
sc->pg_id = NULL;
sc->is_tmpvc = TRUE;
@@ -800,14 +794,10 @@
int rc = 0;
if (vc_tcp->connect_retry_count > MPIDI_NEM_TCP_MAX_CONNECT_RETRIES) {
- int mpi_errno2 = MPI_SUCCESS;
MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "exceeded retries, closing sc");
- mpi_errno2 = error_closed(vc);
- if (mpi_errno2) {
- MPIU_ERR_SET(mpi_errno2, MPI_ERR_OTHER, "**tcp_cleanup_fail");
- if (mpi_errno2) MPIU_ERR_ADD(mpi_errno, mpi_errno2);
- }
MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**exceeded_connect_tries", "**exceeded_connect_tries %d", vc->pg_rank);
+ mpi_errno = error_closed(vc, mpi_errno);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**tcp_cleanup_fail");
goto fn_fail;
}
@@ -987,6 +977,7 @@
int close_cleanup_and_free_sc_plfd(sockconn_t *const sc)
{
int mpi_errno = MPI_SUCCESS;
+ int mpi_errno2 = MPI_SUCCESS;
int rc;
MPIDI_VC_t *const sc_vc = sc->vc;
MPIDI_STATE_DECL(MPID_STATE_CLOSE_CLEANUP_AND_FREE_SC_PLFD);
@@ -1002,8 +993,8 @@
if (rc == -1 && errno != EAGAIN && errno != EBADF)
MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**close", "**close %s", MPIU_Strerror(errno));
- mpi_errno = cleanup_and_free_sc_plfd(sc);
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ mpi_errno2 = cleanup_and_free_sc_plfd(sc);
+ if (mpi_errno2) MPIU_ERR_ADD(mpi_errno, mpi_errno2);
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_CLOSE_CLEANUP_AND_FREE_SC_PLFD);
@@ -1545,7 +1536,9 @@
the other side performs a tcp close() before we do and we
blow up here. */
MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "other side closed, but we're shutting down, closing sc");
- mpi_errno = MPID_nem_tcp_cleanup_on_error(sc_vc);
+ /* it's really not an error, but we're calling
+ cleanup_on_error because it does what we want it to */
+ mpi_errno = MPID_nem_tcp_cleanup_on_error(sc_vc, MPI_SUCCESS);
goto fn_exit;
}
else
@@ -1643,14 +1636,12 @@
return mpi_errno;
fn_fail: /* comm related failures jump here */
{
- int cleanup_errno = MPI_SUCCESS;
- cleanup_errno = MPID_nem_tcp_cleanup_on_error(sc_vc); /* QUIESCENT */
- if (cleanup_errno) {
- MPIU_ERR_SET(cleanup_errno, MPI_ERR_OTHER, "**tcp_cleanup_fail");
- MPIU_ERR_ADD(mpi_errno, cleanup_errno);
- }
MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", sc_vc->pg_rank);
+ mpi_errno = MPID_nem_tcp_cleanup_on_error(sc_vc, mpi_errno);
+ if (mpi_errno) {
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**tcp_cleanup_fail");
+ }
}
fn_noncomm_fail: /* NON-comm related failures jump here */
goto fn_exit;
@@ -1792,9 +1783,7 @@
num_skipped_polls = 0;
CHECK_EINTR(n, poll(MPID_nem_tcp_plfd_tbl, num_polled, 0));
- if (n == -1) {
- MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**poll", "**poll %s", MPIU_Strerror(errno));
- }
+ MPIU_ERR_CHKANDJUMP1(n == -1, mpi_errno, MPI_ERR_OTHER, "**poll", "**poll %s", MPIU_Strerror(errno));
/* MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "some sc fd poll event")); */
for(i = 0; i < num_polled; i++)
{
@@ -1806,7 +1795,7 @@
/* We could check for POLLHUP here, but HUP/HUP+EOF is not erroneous
* on many platforms, including modern Linux. */
if (it_plfd->revents & POLLERR || it_plfd->revents & POLLNVAL) {
- int cleanup_errno = MPI_SUCCESS;
+ int req_errno = MPI_SUCCESS;
int rc;
char dummy;
const char *err_str = "UNKNOWN";
@@ -1819,24 +1808,15 @@
MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "error polling fd, closing sc");
if (it_sc->vc) {
-#ifdef HAVE_ERROR_CHECKING
- int pg_rank = it_sc->vc->pg_rank; /* vc goes away on cleanup */
-#endif
- cleanup_errno = MPID_nem_tcp_cleanup_on_error(it_sc->vc);
- if (cleanup_errno) {
- MPIU_ERR_SET(cleanup_errno, MPI_ERR_OTHER, "**tcp_cleanup_fail");
- MPIU_ERR_ADD(mpi_errno, cleanup_errno);
- }
- MPIU_ERR_SET2(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d %s", pg_rank, err_str);
+ MPIU_ERR_SET2(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d %s", it_sc->vc->pg_rank, err_str);
+ mpi_errno = MPID_nem_tcp_cleanup_on_error(it_sc->vc, req_errno);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**tcp_cleanup_fail");
} else {
- cleanup_errno = close_cleanup_and_free_sc_plfd(it_sc);
- if (cleanup_errno) {
- MPIU_ERR_SET(cleanup_errno, MPI_ERR_OTHER, "**tcp_cleanup_fail");
- MPIU_ERR_ADD(mpi_errno, cleanup_errno);
- }
- MPIU_ERR_SET2(mpi_errno, MPI_ERR_OTHER, "**comm_fail_conn", "**comm_fail_conn %s %s", CONN_STATE_STR[it_sc->state.cstate], err_str);
+ MPIU_ERR_SET2(req_errno, MPI_ERR_OTHER, "**comm_fail_conn", "**comm_fail_conn %s %s", CONN_STATE_STR[it_sc->state.cstate], err_str);
+ mpi_errno = close_cleanup_and_free_sc_plfd(it_sc);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**tcp_cleanup_fail");
}
- goto fn_fail;
+ continue;
}
mpi_errno = it_sc->handler(it_plfd, it_sc);
@@ -1900,13 +1880,14 @@
len = sizeof(SA_IN);
MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "before accept"));
if ((connfd = accept(l_sc->fd, (SA *) &rmt_addr, &len)) < 0) {
- MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "after accept, l_sc=%p lstnfd=%d connfd=%d, errno=%d:%s ", l_sc, l_sc->fd, connfd, errno, MPIU_Strerror(errno)));
+ int save_errno = errno;
+ MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "after accept, l_sc=%p lstnfd=%d connfd=%d, errno=%d:%s ", l_sc, l_sc->fd, connfd, errno, MPIU_Strerror(save_errno)));
if (errno == EINTR)
continue;
else if (errno == EWOULDBLOCK || errno == EAGAIN)
break; /* no connection in the listen queue. get out of here.(N1) */
- MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**sock_accept", "**sock_accept %s", MPIU_Strerror(errno));
+ MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**sock_accept", "**sock_accept %s", MPIU_Strerror(save_errno));
}
else {
int index = -1;
@@ -1945,7 +1926,7 @@
#define FUNCNAME error_closed
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int error_closed(struct MPIDI_VC *const vc)
+static int error_closed(struct MPIDI_VC *const vc, int req_errno)
{
int mpi_errno = MPI_SUCCESS;
MPID_nem_tcp_vc_area * const vc_tcp = VC_TCP(vc);
@@ -1957,8 +1938,10 @@
mpi_errno = MPIDI_CH3U_Handle_connection(vc, MPIDI_VC_EVENT_TERMINATED);
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
- /* complete pending send/recv requests with error ??? */
+ mpi_errno = MPID_nem_tcp_error_out_send_queue(vc, req_errno);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_ERROR_CLOSED);
return mpi_errno;
@@ -1967,12 +1950,13 @@
}
/* This is called when an communication error has occurred on a VC to
- close the VC and release associated resources. */
+ close the VC and release associated resources.
+ Any outstanding requests will have MPI_ERROR set to req_errno */
#undef FUNCNAME
#define FUNCNAME MPID_nem_tcp_cleanup_on_error
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_tcp_cleanup_on_error(MPIDI_VC_t *const vc)
+int MPID_nem_tcp_cleanup_on_error(MPIDI_VC_t *const vc, int req_errno)
{
int mpi_errno = MPI_SUCCESS;
int mpi_errno2 = MPI_SUCCESS;
@@ -1983,7 +1967,7 @@
mpi_errno = MPID_nem_tcp_cleanup(vc);
/* not jumping on error, keep going */
- mpi_errno2 = error_closed(vc);
+ mpi_errno2 = error_closed(vc, req_errno);
if (mpi_errno2) MPIU_ERR_ADD(mpi_errno, mpi_errno2);
fn_exit:
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_impl.h
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_impl.h 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_impl.h 2011-01-13 22:17:24 UTC (rev 7723)
@@ -93,7 +93,8 @@
int MPID_nem_tcp_is_sock_connected(int fd);
int MPID_nem_tcp_disconnect(struct MPIDI_VC *const vc);
int MPID_nem_tcp_cleanup (struct MPIDI_VC *const vc);
-int MPID_nem_tcp_cleanup_on_error(MPIDI_VC_t *const vc);
+int MPID_nem_tcp_cleanup_on_error(MPIDI_VC_t *const vc, int req_errno);
+int MPID_nem_tcp_error_out_send_queue(struct MPIDI_VC *const vc, int req_errno);
int MPID_nem_tcp_ckpt_cleanup(void);
int MPID_nem_tcp_state_listening_handler(struct pollfd *const l_plfd, sockconn_t *const l_sc);
int MPID_nem_tcp_send_queued(MPIDI_VC_t *vc, MPIDI_nem_tcp_request_queue_t *send_queue);
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_init.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_init.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_init.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -562,6 +562,7 @@
int MPID_nem_tcp_vc_terminate (MPIDI_VC_t *vc)
{
int mpi_errno = MPI_SUCCESS;
+ int req_errno = MPI_SUCCESS;
MPIDI_STATE_DECL(MPID_NEM_TCP_VC_TERMINATE);
MPIDI_FUNC_ENTER(MPID_NEM_TCP_VC_TERMINATE);
@@ -569,6 +570,10 @@
mpi_errno = MPID_nem_tcp_cleanup(vc);
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+ mpi_errno = MPID_nem_tcp_error_out_send_queue(vc, req_errno);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
fn_exit:
MPIDI_FUNC_EXIT(MPID_NEM_TCP_VC_TERMINATE);
return mpi_errno;
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_send.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_send.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_send.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -98,13 +98,13 @@
CHECK_EINTR(offset, writev(vc_tcp->sc->fd, iov, sreq->dev.iov_count));
if (offset == 0) {
- int cleanup_errno = MPI_SUCCESS;
+ int req_errno = MPI_SUCCESS;
- MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
- MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
- cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
- if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
- goto fn_fail;
+ MPIU_ERR_SET(req_errno, MPI_ERR_OTHER, "**sock_closed");
+ MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+ mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ goto fn_exit; /* this vc is closed now, just bail out */
}
if (offset == -1)
{
@@ -114,13 +114,12 @@
MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "EAGAIN");
break;
} else {
- int cleanup_errno = MPI_SUCCESS;
-
- MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror (errno));
- MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
- cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
- if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
- goto fn_fail;
+ int req_errno = MPI_SUCCESS;
+ MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror(errno));
+ MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+ mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ goto fn_exit; /* this vc is closed now, just bail out */
}
}
MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "write " MPIDI_MSG_SZ_FMT, offset);
@@ -264,12 +263,12 @@
CHECK_EINTR(offset, writev(sc->fd, iov, 2));
if (offset == 0) {
- int cleanup_errno = MPI_SUCCESS;
+ int req_errno = MPI_SUCCESS;
- MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
- MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
- cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
- if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+ MPIU_ERR_SET(req_errno, MPI_ERR_OTHER, "**sock_closed");
+ MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+ mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
goto fn_fail;
}
if (offset == -1)
@@ -277,12 +276,11 @@
if (errno == EAGAIN)
offset = 0;
else {
- int cleanup_errno = MPI_SUCCESS;
-
- MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror (errno));
- MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
- cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
- if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+ int req_errno = MPI_SUCCESS;
+ MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror(errno));
+ MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+ mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
goto fn_fail;
}
}
@@ -404,12 +402,12 @@
CHECK_EINTR(offset, writev(sc->fd, iov, 2));
if (offset == 0) {
- int cleanup_errno = MPI_SUCCESS;
+ int req_errno = MPI_SUCCESS;
- MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
- MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
- cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
- if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+ MPIU_ERR_SET(req_errno, MPI_ERR_OTHER, "**sock_closed");
+ MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+ mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
goto fn_fail;
}
if (offset == -1)
@@ -417,12 +415,12 @@
if (errno == EAGAIN)
offset = 0;
else {
- int cleanup_errno = MPI_SUCCESS;
+ int req_errno = MPI_SUCCESS;
+ MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror(errno));
+ MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
- MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror (errno));
- MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
- cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc); /* ignoring return code */
- if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+ mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
goto fn_fail;
}
}
@@ -539,12 +537,12 @@
CHECK_EINTR(offset, writev(sc->fd, iov, 2));
if (offset == 0) {
- int cleanup_errno = MPI_SUCCESS;
+ int req_errno = MPI_SUCCESS;
- MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
- MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
- cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
- if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+ MPIU_ERR_SET(req_errno, MPI_ERR_OTHER, "**sock_closed");
+ MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+ mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
goto fn_fail;
}
if (offset == -1)
@@ -552,12 +550,11 @@
if (errno == EAGAIN)
offset = 0;
else {
- int cleanup_errno = MPI_SUCCESS;
-
- MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror (errno));
- MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
- cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
- if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+ int req_errno = MPI_SUCCESS;
+ MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror(errno));
+ MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+ mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
goto fn_fail;
}
}
@@ -697,12 +694,12 @@
{
CHECK_EINTR(offset, writev(vc_tcp->sc->fd, iov, iov_n));
if (offset == 0) {
- int cleanup_errno = MPI_SUCCESS;
+ int req_errno = MPI_SUCCESS;
- MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
- MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
- cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
- if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+ MPIU_ERR_SET(req_errno, MPI_ERR_OTHER, "**sock_closed");
+ MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+ mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
goto fn_fail;
}
if (offset == -1)
@@ -710,12 +707,11 @@
if (errno == EAGAIN)
offset = 0;
else {
- int cleanup_errno = MPI_SUCCESS;
-
- MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror (errno));
- MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
- cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
- if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+ int req_errno = MPI_SUCCESS;
+ MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**writev", "**writev %s", MPIU_Strerror(errno));
+ MPIU_ERR_SET1(req_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+ mpi_errno = MPID_nem_tcp_cleanup_on_error(vc, req_errno);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
goto fn_fail;
}
}
@@ -814,3 +810,42 @@
MPIDI_CH3_Request_destroy(sreq);
goto fn_exit;
}
+
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_tcp_error_out_send_queue
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int MPID_nem_tcp_error_out_send_queue(struct MPIDI_VC *const vc, int req_errno)
+{
+ int mpi_errno = MPI_SUCCESS;
+ MPID_Request *req;
+ MPID_nem_tcp_vc_area *const vc_tcp = VC_TCP(vc);
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_TCP_ERROR_OUT_SEND_QUEUE);
+
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_TCP_ERROR_OUT_SEND_QUEUE);
+
+ /* we don't call onDataAvail or onFinal handlers because this is
+ an error condition and we just want to mark them as complete */
+
+ /* send queue */
+ while (!SENDQ_EMPTY(vc_tcp->send_queue)) {
+ SENDQ_DEQUEUE(&vc_tcp->send_queue, &req);
+ req->status.MPI_ERROR = req_errno;
+
+ MPIDI_CH3U_Request_complete(req);
+ }
+
+ /* paused send queue */
+ while (!SENDQ_EMPTY(vc_tcp->paused_send_queue)) {
+ SENDQ_DEQUEUE(&vc_tcp->paused_send_queue, &req);
+ req->status.MPI_ERROR = req_errno;
+
+ MPIDI_CH3U_Request_complete(req);
+ }
+
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_TCP_ERROR_OUT_SEND_QUEUE);
+ return mpi_errno;
+ fn_fail:
+ goto fn_exit;
+}
Property changes on: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ckpt/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5050
/mpich2/branches/dev/ckpt2/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5057-6537
/mpich2/branches/dev/ftb/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5661-5730
/mpich2/branches/dev/lapi/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5817
/mpich2/branches/dev/win_rrvm/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:6416,6428
/mpich2/branches/dev/wintcp_async_progress/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5406
/mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/ckpt/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5050
/mpich2/branches/dev/ckpt2/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5057-6537
/mpich2/branches/dev/error-return/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:7405-7603,7662-7670
/mpich2/branches/dev/ftb/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5661-5730
/mpich2/branches/dev/lapi/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5817
/mpich2/branches/dev/win_rrvm/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:6416,6428
/mpich2/branches/dev/wintcp_async_progress/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:5406
/mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/wintcp/socksm.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/src/mpid_nem_ckpt.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/src/mpid_nem_ckpt.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/nemesis/src/mpid_nem_ckpt.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -68,9 +68,12 @@
int rc, ret;
const struct cr_restart_info* ri;
- if (MPIDI_Process.my_pg_rank == 0)
+ if (MPIDI_Process.my_pg_rank == 0) {
MPIDI_nem_ckpt_start_checkpoint = TRUE;
-
+ /* poke the progress engine in case we're waiting in a blocking recv */
+ MPIDI_CH3_Progress_signal_completion();
+ }
+
ret = sem_wait(&ckpt_sem);
CHECK_ERR(ret, "sem_wait");
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_isend.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_isend.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_isend.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -24,6 +24,13 @@
MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_ISEND);
+ if (vc->state == MPIDI_VC_STATE_MORIBUND) {
+ sreq->status.MPI_ERROR = MPI_SUCCESS;
+ MPIU_ERR_SET1(sreq->status.MPI_ERROR, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+ MPIDI_CH3U_Request_complete(sreq);
+ goto fn_fail;
+ }
+
if (((MPIDI_CH3I_VC *)vc->channel_private)->iSendContig)
{
mpi_errno = ((MPIDI_CH3I_VC *)vc->channel_private)->iSendContig(vc, sreq, hdr, hdr_sz, NULL, 0);
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_isendv.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_isendv.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_isendv.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -28,6 +28,13 @@
MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_ISENDV);
+ if (vc->state == MPIDI_VC_STATE_MORIBUND) {
+ sreq->status.MPI_ERROR = MPI_SUCCESS;
+ MPIU_ERR_SET1(sreq->status.MPI_ERROR, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+ MPIDI_CH3U_Request_complete(sreq);
+ goto fn_fail;
+ }
+
if (vc_ch->iSendContig)
{
MPIU_Assert(n_iov > 0);
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_istartmsg.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_istartmsg.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_istartmsg.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -33,6 +33,8 @@
MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_ISTARTMSG);
+ MPIU_ERR_CHKANDJUMP1(vc->state == MPIDI_VC_STATE_MORIBUND, mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+
if (((MPIDI_CH3I_VC *)vc->channel_private)->iStartContigMsg)
{
mpi_errno = ((MPIDI_CH3I_VC *)vc->channel_private)->iStartContigMsg(vc, hdr, hdr_sz, NULL, 0, sreq_ptr);
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_istartmsgv.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_istartmsgv.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_istartmsgv.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -41,6 +41,8 @@
MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_ISTARTMSGV);
+ MPIU_ERR_CHKANDJUMP1(vc->state == MPIDI_VC_STATE_MORIBUND, mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+
if (((MPIDI_CH3I_VC *)vc->channel_private)->iStartContigMsg)
{
MPIU_Assert (n_iov > 0);
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_progress.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_progress.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/nemesis/src/ch3_progress.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -11,6 +11,9 @@
#if defined (MPID_NEM_INLINE) && MPID_NEM_INLINE
#include "mpid_nem_inline.h"
#endif
+#ifdef HAVE_SIGNAL_H
+#include <signal.h>
+#endif
#define PKTARRAY_SIZE (MPIDI_NEM_PKT_END+1)
@@ -30,11 +33,7 @@
extern MPID_Request ** const MPID_Recvq_unexpected_tail_ptr;
#endif
-/* MT any races on this var reported by DRD/helgrind/TSan are probably bugs.
- * This var is protected by the COMPLETION critical section in non-global mode. */
-/* FIXME volatile is probably unnecessary, access is arbitrated entirely by
- * mutex, but the decl is shared among channels */
-volatile unsigned int MPIDI_CH3I_progress_completion_count = 0;
+OPA_int_t MPIDI_CH3I_progress_completion_count = OPA_INT_T_INITIALIZER(0);
/* NEMESIS MULTITHREADING: Extra Data Structures Added */
#ifdef MPICH_IS_THREADED
@@ -46,6 +45,9 @@
#endif /* MPICH_IS_THREADED */
/* NEMESIS MULTITHREADING - End block*/
+static volatile int sigusr1_count = 0;
+static int my_sigusr1_count = 0;
+
struct MPID_Request *MPIDI_CH3I_sendq_head[CH3_NUM_QUEUES] = {0};
struct MPID_Request *MPIDI_CH3I_sendq_tail[CH3_NUM_QUEUES] = {0};
struct MPID_Request *MPIDI_CH3I_active_send[CH3_NUM_QUEUES] = {0};
@@ -66,6 +68,13 @@
static qn_ent_t *qn_head = NULL;
+static void sigusr1_handler(int sig)
+{
+ ++sigusr1_count;
+ /* poke the progress engine in case we're waiting in a blocking recv */
+ MPIDI_CH3_Progress_signal_completion();
+}
+
/* MPIDI_CH3I_Shm_send_progress() this function makes progress sending
queued messages on the shared memory queues. This function is
nonblocking and does not call netmod functions..*/
@@ -241,6 +250,12 @@
MPIU_Assert(progress_state != NULL);
}
+ if (sigusr1_count > my_sigusr1_count) {
+ my_sigusr1_count = sigusr1_count;
+ mpi_errno = MPIDI_CH3U_Check_for_failed_procs();
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ }
+
#ifdef ENABLE_CHECKPOINTING
if (MPIR_PARAM_ENABLE_CKPOINT) {
if (MPIDI_nem_ckpt_start_checkpoint) {
@@ -326,7 +341,7 @@
#endif
)
{
- mpi_errno = MPID_nem_mpich2_blocking_recv(&cell, &in_fbox);
+ mpi_errno = MPID_nem_mpich2_blocking_recv(&cell, &in_fbox, progress_state->ch.completion_count);
}
else
{
@@ -431,19 +446,20 @@
/* in the case of progress_wait, bail out if anything completed (CC-1) */
if (is_blocking) {
- int made_progress = FALSE;
- MPIU_THREAD_CS_ENTER(COMPLETION,);
- if (progress_state->ch.completion_count != MPIDI_CH3I_progress_completion_count) {
- made_progress = TRUE;
+ int completion_count = OPA_load_int(&MPIDI_CH3I_progress_completion_count);
+ if (progress_state->ch.completion_count != completion_count) {
+ /* Read barrier so that later reads cannot observe values from
+ before the completion counter was incremented */
+ OPA_read_barrier();
/* reset for the next iteration */
- progress_state->ch.completion_count = MPIDI_CH3I_progress_completion_count;
+ progress_state->ch.completion_count = completion_count;
+ break;
}
- MPIU_THREAD_CS_EXIT(COMPLETION,);
- if (made_progress) break;
}
}
while (is_blocking);
+
#ifdef MPICH_IS_THREADED
MPIU_THREAD_CHECK_BEGIN;
{
@@ -481,15 +497,9 @@
{
while (1)
{
- /* we also currently hold the MPIDCOMM CS */
- MPIU_THREAD_CS_ENTER(COMPLETION,);
- if (completion_count != MPIDI_CH3I_progress_completion_count ||
+ if (completion_count != OPA_load_int(&MPIDI_CH3I_progress_completion_count) ||
MPIDI_CH3I_progress_blocked != TRUE)
- {
- MPIU_THREAD_CS_EXIT(COMPLETION,);
break;
- }
- MPIU_THREAD_CS_EXIT(COMPLETION,);
MPID_Thread_cond_wait(&MPIDI_CH3I_progress_completion_cond, &MPIR_ThreadInfo.global_mutex/*MPIDCOMM*/);
}
}
@@ -787,6 +797,18 @@
/* other pkt handlers */
pktArray[MPIDI_NEM_PKT_NETMOD] = pkt_NETMOD_handler;
+#ifdef HAVE_SIGNAL
+ {
+ /* install signal handler for process failure notifications from hydra */
+ void *ret;
+
+ ret = signal(SIGUSR1, &sigusr1_handler);
+ MPIU_ERR_CHKANDJUMP1(ret == SIG_ERR, mpi_errno, MPI_ERR_OTHER, "**signal", "**signal %s", MPIU_Strerror(errno));
+ /* Error if the app set its own SIGUSR1 handler. */
+ MPIU_ERR_CHKANDJUMP(ret != SIG_DFL, mpi_errno, MPI_ERR_OTHER, "**sigusr1");
+ }
+#endif
+
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PROGRESS_INIT);
return mpi_errno;
@@ -826,6 +848,14 @@
MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_CONNECTION_TERMINATE);
MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_CONNECTION_TERMINATE);
+
+ MPIU_DBG_MSG_D(CH3_DISCONNECT, TYPICAL, "Terminating VC %d", vc->pg_rank);
+
+ /* if this is already closed, exit */
+ if (vc->state == MPIDI_VC_STATE_MORIBUND ||
+ vc->state == MPIDI_VC_STATE_INACTIVE_CLOSED)
+ goto fn_exit;
+
if (((MPIDI_CH3I_VC *)vc->channel_private)->is_local)
mpi_errno = MPID_nem_vc_terminate(vc);
else
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/sctp/include/mpidi_ch3_pre.h
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/sctp/include/mpidi_ch3_pre.h 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/sctp/include/mpidi_ch3_pre.h 2011-01-13 22:17:24 UTC (rev 7723)
@@ -229,4 +229,9 @@
#define MPIDI_CH3_PROGRESS_STATE_DECL MPIDI_CH3I_Progress_state ch;
+
+/* This variable is used in the definitions of the MPID_Progress_xxx macros,
+ and must be available to the routines in src/mpi */
+extern volatile unsigned int MPIDI_CH3I_progress_completion_count;
+
#endif /* !defined(MPICH_MPIDI_CH3_PRE_H_INCLUDED) */
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/sock/include/mpidi_ch3_pre.h
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/sock/include/mpidi_ch3_pre.h 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/channels/sock/include/mpidi_ch3_pre.h 2011-01-13 22:17:24 UTC (rev 7723)
@@ -58,6 +58,11 @@
#define MPIDI_CH3_PROGRESS_STATE_DECL MPIDI_CH3I_Progress_state ch;
+/* This variable is used in the definitions of the MPID_Progress_xxx macros,
+ and must be available to the routines in src/mpi */
+extern volatile unsigned int MPIDI_CH3I_progress_completion_count;
+
+
/* MPICH_IS_THREADED isn't defined yet (handled by mpiimplthread.h) */
#if (MPICH_THREAD_LEVEL == MPI_THREAD_MULTIPLE)
#define MPIDI_CH3I_PROGRESS_WAKEUP \
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/include/mpidimpl.h
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/include/mpidimpl.h 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/include/mpidimpl.h 2011-01-13 22:17:24 UTC (rev 7723)
@@ -787,6 +787,8 @@
typedef struct MPIDI_VC * MPID_VCR;
#endif
+/* number of VCs that are in MORIBUND state */
+extern int MPIDI_Failed_vc_count;
/* Initialize a new VC */
int MPIDI_VC_Init( MPIDI_VC_t *, MPIDI_PG_t *, int );
@@ -1440,6 +1442,7 @@
MPID_Request * MPIDI_CH3U_Recvq_FDP_or_AEU(MPIDI_Message_match * match,
int * found);
int MPIDI_CH3U_Recvq_count_unexp(void);
+int MPIDI_CH3U_Complete_posted_with_error(MPIDI_VC_t *vc);
int MPIDI_CH3U_Request_load_send_iov(MPID_Request * const sreq,
@@ -1583,6 +1586,9 @@
#else
#define MPIDI_CH3_Channel_close( ) MPI_SUCCESS
#endif
+/* MPIDI_CH3U_Check_for_failed_procs() reads PMI_dead_processes key
+ and marks VCs to those processes as failed */
+int MPIDI_CH3U_Check_for_failed_procs(void);
/*@
MPIDI_CH3_Pre_init - Allows the channel to initialize before PMI_init is
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/include/mpidpost.h
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/include/mpidpost.h 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/include/mpidpost.h 2011-01-13 22:17:24 UTC (rev 7723)
@@ -159,10 +159,6 @@
@*/
void MPIDI_CH3U_Request_destroy(MPID_Request * req);
-/* This variable is used in the definitions of the MPID_Progress_xxx macros,
- and must be available to the routines in src/mpi */
-extern volatile unsigned int MPIDI_CH3I_progress_completion_count;
-
/* Include definitions from the channel which require items defined by this
file (mpidimpl.h) or the file it includes
(mpiimpl.h). */
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/ch3u_handle_connection.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/ch3u_handle_connection.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/ch3u_handle_connection.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -5,11 +5,12 @@
*/
#include "mpidimpl.h"
+#include "pmi.h"
/* Count the number of outstanding close requests */
static volatile int MPIDI_Outstanding_close_ops = 0;
+int MPIDI_Failed_vc_count = 0;
-
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3U_Handle_connection
#undef FCNAME
@@ -65,6 +66,19 @@
break;
+ case MPIDI_VC_STATE_INACTIVE:
+ /* VC was terminated before it was activated.
+ This can happen if a failed process was
+ detected before the process used the VC. */
+ MPIU_DBG_MSG(CH3_DISCONNECT,TYPICAL, "VC terminated before it was activated. We probably got a failed"
+ " process notification.");
+ MPIDI_CH3U_Complete_posted_with_error(vc);
+ ++MPIDI_Failed_vc_count;
+ MPIDI_CHANGE_VC_STATE(vc, MORIBUND);
+
+ break;
+
+
case MPIDI_VC_STATE_ACTIVE:
case MPIDI_VC_STATE_REMOTE_CLOSE:
/* This is a premature termination. This process
@@ -74,6 +88,9 @@
MPIU_DBG_MSG(CH3_DISCONNECT,TYPICAL, "Connection closed prematurely.");
+ MPIDI_CH3U_Complete_posted_with_error(vc);
+ ++MPIDI_Failed_vc_count;
+
MPIDU_Ftb_publish_vc(MPIDU_FTB_EV_UNREACHABLE, vc);
MPIDI_CHANGE_VC_STATE(vc, MORIBUND);
@@ -98,6 +115,10 @@
MPIU_DBG_MSG_D(CH3_DISCONNECT,TYPICAL, "Connection closed prematurely during close protocol. "
"Outstanding close operations = %d", MPIDI_Outstanding_close_ops);
+
+ MPIDI_CH3U_Complete_posted_with_error(vc);
+ ++MPIDI_Failed_vc_count;
+
MPIDU_Ftb_publish_vc(MPIDU_FTB_EV_UNREACHABLE, vc);
MPIDI_CHANGE_VC_STATE(vc, MORIBUND);
@@ -118,7 +139,7 @@
mpi_errno = MPIR_Err_create_code(
MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__,
MPI_ERR_INTERN, "**ch3|unhandled_connection_state",
- "**ch3|unhandled_connection_state %p %d", vc, event);
+ "**ch3|unhandled_connection_state %p %d", vc, vc->state);
goto fn_fail;
break;
}
@@ -366,3 +387,76 @@
return mpi_errno;
}
+#define parse_rank(r_p) do { \
+ while (isspace(*c)) /* skip spaces */ \
+ ++c; \
+ MPIU_ERR_CHKINTERNAL(!isdigit(*c), mpi_errno, "error parsing failed process list"); \
+ *(r_p) = strtol(c, &c, 0); \
+ while (isspace(*c)) /* skip spaces */ \
+ ++c; \
+ } while (0)
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3U_Check_for_failed_procs
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int MPIDI_CH3U_Check_for_failed_procs(void)
+{
+ int mpi_errno = MPI_SUCCESS;
+ int pmi_errno;
+ char *val;
+ char *c;
+ int len;
+ char *kvsname;
+ int rank, rank_hi;
+ MPIU_CHKLMEM_DECL(1);
+ MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_CHECK_FOR_FAILED_PROCS);
+
+ MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_CHECK_FOR_FAILED_PROCS);
+ mpi_errno = MPIDI_PG_GetConnKVSname(&kvsname);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ pmi_errno = PMI_KVS_Get_value_length_max(&len);
+ MPIU_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**pmi_kvs_get_value_length_max");
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ MPIU_CHKLMEM_MALLOC(val, char *, len, mpi_errno, "val");
+ pmi_errno = PMI_KVS_Get(kvsname, "PMI_dead_processes", val, len);
+ MPIU_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**pmi_kvs_get");
+
+ MPIU_DBG_MSG_S(CH3_DISCONNECT, TYPICAL, "Received proc fail notification: %s", val);
+
+ if (*val == '\0')
+ /* there are no failed processes */
+ goto fn_exit;
+
+ /* parse list of failed processes. This is a comma separated list
+ of ranks or ranges of ranks (e.g., "1, 3-5, 11") */
+ c = val;
+ while(1) {
+ parse_rank(&rank);
+ if (*c == '-') {
+ ++c; /* skip '-' */
+ parse_rank(&rank_hi);
+ } else
+ rank_hi = rank;
+ while (rank <= rank_hi) {
+ MPIDI_VC_t *vc;
+ MPIDI_PG_Get_vc(MPIDI_Process.my_pg, rank, &vc);
+ mpi_errno = MPIU_CALL(MPIDI_CH3,Connection_terminate(vc));
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ ++rank;
+ }
+ MPIU_ERR_CHKINTERNAL(*c != ',' && *c != '\0', mpi_errno, "error parsing failed process list");
+ if (*c == '\0')
+ break;
+ ++c; /* skip ',' */
+ }
+
+ fn_exit:
+ MPIU_CHKLMEM_FREEALL();
+ MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_CHECK_FOR_FAILED_PROCS);
+ return mpi_errno;
+ fn_fail:
+ goto fn_exit;
+}
+
+
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/ch3u_recvq.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/ch3u_recvq.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/ch3u_recvq.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -344,9 +344,11 @@
/* A matching request was not found in the unexpected queue, so we
need to allocate a new request and add it to the posted queue */
{
- int mpi_errno=0;
- MPIDI_Request_create_rreq( rreq, mpi_errno,
- found = FALSE;goto lock_exit );
+ int mpi_errno = MPI_SUCCESS;
+
+ found = FALSE;
+
+ MPIDI_Request_create_rreq( rreq, mpi_errno, goto lock_exit );
rreq->dev.match.parts.tag = tag;
rreq->dev.match.parts.rank = source;
rreq->dev.match.parts.context_id = context_id;
@@ -368,7 +370,26 @@
rreq->dev.user_buf = user_buf;
rreq->dev.user_count = user_count;
rreq->dev.datatype = datatype;
- rreq->dev.next = NULL;
+
+ /* check whether VC has failed, or this is an ANY_SOURCE in a
+ failed communicator */
+ if (source != MPI_ANY_SOURCE) {
+ MPIDI_VC_t *vc;
+ MPIDI_Comm_get_vc(comm, source, &vc);
+ if (vc->state == MPIDI_VC_STATE_MORIBUND) {
+ MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+ rreq->status.MPI_ERROR = mpi_errno;
+ MPIDI_CH3U_Request_complete(rreq);
+ goto lock_exit;
+ }
+ } else if (MPID_VCRT_Contains_failed_vc(comm->vcrt)) {
+ MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**comm_fail");
+ rreq->status.MPI_ERROR = mpi_errno;
+ MPIDI_CH3U_Request_complete(rreq);
+ goto lock_exit;
+ }
+
+ rreq->dev.next = NULL;
if (recvq_posted_tail != NULL) {
recvq_posted_tail->dev.next = rreq;
}
@@ -379,8 +400,6 @@
MPIDI_POSTED_RECV_ENQUEUE_HOOK(rreq);
}
- found = FALSE;
-
lock_exit:
*foundp = found;
@@ -540,6 +559,121 @@
return rreq;
}
+/* returns TRUE iff the rank this request is matched against (in the request's communicator) maps to vc */
+static inline int req_uses_vc(const MPID_Request* req, const MPIDI_VC_t *vc)
+{
+ MPIDI_VC_t *vc1;
+
+ MPIDI_Comm_get_vc(req->comm, req->dev.match.parts.rank, &vc1);
+ return vc == vc1;
+}
+
+/* returns TRUE iff the vc is part of the comm */
+static inline int is_vc_in_comm(const MPIDI_VC_t *vc, const MPID_Comm *comm)
+{
+ int i;
+
+ for (i = 0; i < comm->remote_size; ++i) {
+ MPIDI_VC_t *vc1;
+ MPIDI_Comm_get_vc(comm, i, &vc1);
+ if (vc == vc1)
+ return TRUE;
+ }
+ return FALSE;
+}
+
+#undef FUNCNAME
+#define FUNCNAME dequeue_and_set_error
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+/* This dequeues req from the posted recv queue, sets req's error code to comm_fail, and updates the req pointer.
+ Note that this creates a new error code if one hasn't already been created (i.e., if *error is MPI_SUCCESS). */
+static inline void dequeue_and_set_error(MPID_Request **req, MPID_Request *prev_req, int *error, int rank)
+{
+ MPID_Request *next = (*req)->dev.next;
+
+ if (*error == MPI_SUCCESS)
+ MPIU_ERR_SET1(*error, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", rank);
+
+ /* remove from queue */
+ if (recvq_posted_head == *req)
+ recvq_posted_head = (*req)->dev.next;
+ else
+ prev_req->dev.next = (*req)->dev.next;
+ if (recvq_posted_tail == *req)
+ recvq_posted_tail = prev_req;
+
+ /* set error and complete */
+ (*req)->status.MPI_ERROR = *error;
+ MPIDI_CH3U_Request_complete(*req);
+ *req = next;
+}
+
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDU_Complete_posted_with_error
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int MPIDI_CH3U_Complete_posted_with_error(MPIDI_VC_t *vc)
+{
+ int mpi_errno = MPI_SUCCESS;
+ MPID_Request *req, *prev_req;
+ int error = MPI_SUCCESS;
+ MPIDI_STATE_DECL(MPID_STATE_MPIDU_COMPLETE_POSTED_WITH_ERROR);
+
+ MPIDI_FUNC_ENTER(MPID_STATE_MPIDU_COMPLETE_POSTED_WITH_ERROR);
+
+ MPIU_THREAD_CS_ENTER(MSGQUEUE,);
+
+ /* check each req to see if the VC is part of that communicator */
+ req = recvq_posted_head;
+ prev_req = NULL;
+ while (req) {
+ if (req->dev.match.parts.rank != MPI_ANY_SOURCE && req_uses_vc(req, vc)) {
+ /* this req is expected on the VC */
+ dequeue_and_set_error(&req, prev_req, &error, vc->pg_rank);
+ } else if (req->dev.match.parts.rank == MPI_ANY_SOURCE && is_vc_in_comm(vc, req->comm)) {
+ /* This req is an ANY_SOURCE and is expected on a communicator that includes the VC.
+ We need to dequeue all anysources posted in a communicator with a failed VC. We
+ check whether the VC is in the communicator by iterating over the comm's VC table.
+ Since this may be expensive, now that we know the VC is in comm, we take the
+ opportunity to scan the rest of the posted recv queue for other anysources with
+ the same communicator. Note that in the worst case this is O(N*M), where N is the
+ number of posted requests and M is the number of communicators. This can happen
+ if every req is an anysource and uses a different communicator. We can possibly
+ conditionally execute the optimization based on number of comms, number of posted
+ requests and communicator size. */
+ MPID_Request *as_req = req->dev.next;
+ MPID_Request *prev_as_req = req;
+ /* First remove any AS recvs on this comm that were posted AFTER this req */
+ while (as_req) {
+ if (as_req->comm == req->comm && as_req->dev.match.parts.rank == MPI_ANY_SOURCE) {
+ dequeue_and_set_error(&as_req, prev_as_req, &error, vc->pg_rank);
+ } else {
+ prev_as_req = as_req;
+ as_req = as_req->dev.next;
+ }
+ }
+ /* Now remove this req. We do this in this order to make it easier to keep track of
+ req and prev_req pointers */
+ dequeue_and_set_error(&req, prev_req, &error, vc->pg_rank);
+ } else {
+ prev_req = req;
+ req = req->dev.next;
+ }
+ }
+
+ fn_exit:
+ MPIU_THREAD_CS_EXIT(MSGQUEUE,);
+
+ MPIDI_FUNC_EXIT(MPID_STATE_MPIDU_COMPLETE_POSTED_WITH_ERROR);
+ return mpi_errno;
+ fn_fail:
+ goto fn_exit;
+}
+
+
/* --BEGIN ERROR HANDLING-- */
/* pretty prints tag, returns out for calling convenience */
static char *tag_val_to_str(int tag, char *out, int max)
Modified: mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/mpid_vc.c
===================================================================
--- mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/mpid_vc.c 2011-01-13 20:41:46 UTC (rev 7722)
+++ mpich2/branches/release/mpich2-1.3.x/src/mpid/ch3/src/mpid_vc.c 2011-01-13 22:17:24 UTC (rev 7723)
@@ -36,6 +36,8 @@
typedef struct MPIDI_VCRT
{
MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
+ int contains_failed_vc;
+ int last_check_for_failed_vc;
int size;
MPIDI_VC_t * vcr_table[1];
}
@@ -81,6 +83,8 @@
MPIU_Object_set_ref(vcrt, 1);
vcrt->size = size;
*vcrt_ptr = vcrt;
+ vcrt->contains_failed_vc = FALSE;
+ vcrt->last_check_for_failed_vc = 0;
fn_exit:
MPIU_CHKPMEM_COMMIT();
@@ -255,6 +259,34 @@
}
/*@
+ MPID_VCRT_Contains_failed_vc - returns TRUE iff a VC in this VCRT is in MORIBUND state
+ @*/
+#undef FUNCNAME
+#define FUNCNAME MPID_VCRT_Contains_failed_vc
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int MPID_VCRT_Contains_failed_vc(MPID_VCRT vcrt)
+{
+ if (vcrt->contains_failed_vc) {
+ /* We have already determined that this VCRT has a dead VC */
+ return TRUE;
+ } else if (vcrt->last_check_for_failed_vc < MPIDI_Failed_vc_count) {
+ /* A VC has failed since the last time we checked for dead VCs
+ in this VCRT */
+ int i;
+ for (i = 0; i < vcrt->size; ++i) {
+ if (vcrt->vcr_table[i]->state == MPIDI_VC_STATE_MORIBUND) {
+ vcrt->contains_failed_vc = TRUE;
+ return TRUE;
+ }
+ }
+ vcrt->last_check_for_failed_vc = MPIDI_Failed_vc_count;
+ }
+ return FALSE;
+}
+
+
+/*@
MPID_VCR_Dup - Duplicate a virtual connection reference
Notes:
Property changes on: mpich2/branches/release/mpich2-1.3.x/src/mpl/src/mplstr.c
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ckpt2/src/mpl/src/string/mplstr.c:5182,5196,5198
/mpich2/branches/dev/ftb/src/mpl/src/mplstr.c:5661-5730
/mpich2/branches/dev/lapi/src/mpl/src/mplstr.c:5817
/mpich2/branches/release/mpich2-1.1.1/src/mpl/src/string/mplstr.c:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/mpl/src/string/mplstr.c:5406
/mpich2/trunk/src/mpl/src/mplstr.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/ckpt2/src/mpl/src/string/mplstr.c:5182,5196,5198
/mpich2/branches/dev/error-return/src/mpl/src/mplstr.c:7662-7670
/mpich2/branches/dev/ftb/src/mpl/src/mplstr.c:5661-5730
/mpich2/branches/dev/lapi/src/mpl/src/mplstr.c:5817
/mpich2/branches/release/mpich2-1.1.1/src/mpl/src/string/mplstr.c:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/mpl/src/string/mplstr.c:5406
/mpich2/trunk/src/mpl/src/mplstr.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ckpt/src/pm/hydra:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra:5406
/mpich2/trunk/src/pm/hydra:7355-7359*,7366-7367*,7371-7402*,7406-7409*,7411-7416*,7419-7420*,7422-7425*,7429-7433*,7435*,7437-7438,7447-7448*,7462*,7470*,7473-7477*,7484-7485*,7488-7491*,7493-7502*,7504*,7507-7508*,7510-7517*,7519-7527*,7529-7530*,7532*,7536*,7538-7566*,7568*,7570*,7572*,7574*,7576*,7578*,7581*,7583*,7592*,7596*,7607-7622*,7624-7630*,7632-7635*,7637*,7639*,7641-7643*,7646-7649*,7651-7654*,7658-7659*,7663*,7665*,7668*,7676-7678*,7681*,7684*,7686*,7688*,7690-7692*,7694*,7696*,7700*,7705*,7707-7710*,7712*,7714*,7719*
+ /mpich2/branches/dev/ckpt/src/pm/hydra:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra:5406
/mpich2/trunk/src/pm/hydra:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671*,7674*,7676-7678,7681,7683*,7684,7685*,7686,7687*,7688,7690-7692,7694,7696,7700,7701-7702*,7705,7707-7710,7712,7714,7719,7720*,7722*
Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/Makefile.am
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ckpt/src/pm/hydra/Makefile.am:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/Makefile.am:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/Makefile.am:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/Makefile.am:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/Makefile.am:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/Makefile.am:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/Makefile.am:5406
/mpich2/trunk/src/pm/hydra/Makefile.am:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/ckpt/src/pm/hydra/Makefile.am:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/Makefile.am:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/Makefile.am:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/Makefile.am:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/Makefile.am:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/Makefile.am:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/Makefile.am:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/Makefile.am:5406
/mpich2/trunk/src/pm/hydra/Makefile.am:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/README
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ckpt/src/pm/hydra/README:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/README:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/README:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/README:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/README:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/README:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/README:5406
/mpich2/trunk/src/pm/hydra/README:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/ckpt/src/pm/hydra/README:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/README:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/README:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/README:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/README:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/README:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/README:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/README:5406
/mpich2/trunk/src/pm/hydra/README:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/autogen.sh
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ckpt/src/pm/hydra/autogen.sh:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/autogen.sh:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/autogen.sh:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/autogen.sh:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/autogen.sh:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/autogen.sh:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/autogen.sh:5406
/mpich2/trunk/src/pm/hydra/autogen.sh:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/ckpt/src/pm/hydra/autogen.sh:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/autogen.sh:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/autogen.sh:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/autogen.sh:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/autogen.sh:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/autogen.sh:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/autogen.sh:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/autogen.sh:5406
/mpich2/trunk/src/pm/hydra/autogen.sh:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/configure.in
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ckpt/src/pm/hydra/configure.in:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/configure.in:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/configure.in:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/configure.in:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/configure.in:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/configure.in:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/configure.in:5406
/mpich2/trunk/src/pm/hydra/configure.in:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/ckpt/src/pm/hydra/configure.in:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/configure.in:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/configure.in:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/configure.in:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/configure.in:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/configure.in:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/configure.in:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/configure.in:5406
/mpich2/trunk/src/pm/hydra/configure.in:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/examples
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ckpt/src/pm/hydra/examples:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/examples:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/examples:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/examples:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/examples:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/examples:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/examples:5406
/mpich2/trunk/src/pm/hydra/examples:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/ckpt/src/pm/hydra/examples:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/examples:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/examples:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/examples:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/examples:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/examples:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/examples:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/examples:5406
/mpich2/trunk/src/pm/hydra/examples:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/hydra-doxygen.cfg.in
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ckpt/src/pm/hydra/hydra-doxygen.cfg.in:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/hydra-doxygen.cfg.in:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/hydra-doxygen.cfg.in:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/hydra-doxygen.cfg.in:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/hydra-doxygen.cfg.in:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/hydra-doxygen.cfg.in:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/hydra-doxygen.cfg.in:5406
/mpich2/trunk/src/pm/hydra/hydra-doxygen.cfg.in:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/ckpt/src/pm/hydra/hydra-doxygen.cfg.in:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/hydra-doxygen.cfg.in:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/hydra-doxygen.cfg.in:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/hydra-doxygen.cfg.in:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/hydra-doxygen.cfg.in:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/hydra-doxygen.cfg.in:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/hydra-doxygen.cfg.in:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/hydra-doxygen.cfg.in:5406
/mpich2/trunk/src/pm/hydra/hydra-doxygen.cfg.in:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/include
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ckpt/src/pm/hydra/include:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/include:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/include:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/include:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/include:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/include:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/include:5406
/mpich2/trunk/src/pm/hydra/include:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/ckpt/src/pm/hydra/include:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/include:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/include:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/include:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/include:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/include:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/include:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/include:5406
/mpich2/trunk/src/pm/hydra/include:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/mpich2prereq
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ckpt/src/pm/hydra/mpich2prereq:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/mpich2prereq:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/mpich2prereq:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/mpich2prereq:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/mpich2prereq:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/mpich2prereq:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/mpich2prereq:5406
/mpich2/trunk/src/pm/hydra/mpich2prereq:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/ckpt/src/pm/hydra/mpich2prereq:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/mpich2prereq:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/mpich2prereq:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/mpich2prereq:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/mpich2prereq:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/mpich2prereq:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/mpich2prereq:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/mpich2prereq:5406
/mpich2/trunk/src/pm/hydra/mpich2prereq:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/pm
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ckpt/src/pm/hydra/pm:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/pm:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/pm:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/pm:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/pm:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/pm:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/pm:5406
/mpich2/trunk/src/pm/hydra/pm:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/ckpt/src/pm/hydra/pm:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/pm:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/pm:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/pm:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/pm:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/pm:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/pm:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/pm:5406
/mpich2/trunk/src/pm/hydra/pm:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/tools
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ckpt/src/pm/hydra/tools:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/tools:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/tools:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/tools:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/tools:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/tools:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/tools:5406
/mpich2/trunk/src/pm/hydra/tools:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/ckpt/src/pm/hydra/tools:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/tools:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/tools:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/tools:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/tools:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/tools:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/tools:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/tools:5406
/mpich2/trunk/src/pm/hydra/tools:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/tools/bootstrap/external/slurm_query_proxy_id.c
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ftb/src/pm/hydra/tools/bootstrap/slurm/slurm_query_proxy_id.c:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/tools/bootstrap/slurm/slurm_query_proxy_id.c:5817
/mpich2/trunk/src/pm/hydra/tools/bootstrap/external/slurm_query_proxy_id.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/error-return/src/pm/hydra/tools/bootstrap/external/slurm_query_proxy_id.c:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/tools/bootstrap/slurm/slurm_query_proxy_id.c:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/tools/bootstrap/slurm/slurm_query_proxy_id.c:5817
/mpich2/trunk/src/pm/hydra/tools/bootstrap/external/slurm_query_proxy_id.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/tools/bootstrap/src/bsci_query_proxy_id.c
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ftb/src/pm/hydra/tools/bootstrap/src/bsci_query_proxy_id.c:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/tools/bootstrap/src/bsci_query_proxy_id.c:5817
/mpich2/trunk/src/pm/hydra/tools/bootstrap/src/bsci_query_proxy_id.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/error-return/src/pm/hydra/tools/bootstrap/src/bsci_query_proxy_id.c:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/tools/bootstrap/src/bsci_query_proxy_id.c:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/tools/bootstrap/src/bsci_query_proxy_id.c:5817
/mpich2/trunk/src/pm/hydra/tools/bootstrap/src/bsci_query_proxy_id.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/tools/bootstrap/utils/bscu_query_proxy_id.c
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ftb/src/pm/hydra/tools/bootstrap/utils/bscu_query_proxy_id.c:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/tools/bootstrap/utils/bscu_query_proxy_id.c:5817
/mpich2/trunk/src/pm/hydra/tools/bootstrap/utils/bscu_query_proxy_id.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/error-return/src/pm/hydra/tools/bootstrap/utils/bscu_query_proxy_id.c:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/tools/bootstrap/utils/bscu_query_proxy_id.c:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/tools/bootstrap/utils/bscu_query_proxy_id.c:5817
/mpich2/trunk/src/pm/hydra/tools/bootstrap/utils/bscu_query_proxy_id.c:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/ui
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ckpt/src/pm/hydra/ui:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/ui:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/ui:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/ui:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/ui:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/ui:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/ui:5406
/mpich2/trunk/src/pm/hydra/ui:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/ckpt/src/pm/hydra/ui:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/ui:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/ui:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/ui:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/ui:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/ui:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/ui:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/ui:5406
/mpich2/trunk/src/pm/hydra/ui:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Property changes on: mpich2/branches/release/mpich2-1.3.x/src/pm/hydra/utils
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ckpt/src/pm/hydra/utils:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/utils:5057-6537
/mpich2/branches/dev/ftb/src/pm/hydra/utils:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/utils:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/utils:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/utils:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/utils:5406
/mpich2/trunk/src/pm/hydra/utils:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/ckpt/src/pm/hydra/utils:5050
/mpich2/branches/dev/ckpt2/src/pm/hydra/utils:5057-6537
/mpich2/branches/dev/error-return/src/pm/hydra/utils:7662-7670
/mpich2/branches/dev/ftb/src/pm/hydra/utils:5661-5730
/mpich2/branches/dev/lapi/src/pm/hydra/utils:5817
/mpich2/branches/dev/wintcp_async_progress/src/pm/hydra/utils:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/src/pm/hydra/utils:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/src/pm/hydra/utils:5406
/mpich2/trunk/src/pm/hydra/utils:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7447-7448,7462,7470,7473-7477,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
Property changes on: mpich2/branches/release/mpich2-1.3.x/winconfigure.wsf
___________________________________________________________________
Modified: svn:mergeinfo
- /mpich2/branches/dev/ckpt/winconfigure.wsf:5050
/mpich2/branches/dev/ckpt2/winconfigure.wsf:5057-6537
/mpich2/branches/dev/ftb/winconfigure.wsf:5661-5730
/mpich2/branches/dev/lapi/winconfigure.wsf:5817
/mpich2/branches/dev/win_rrvm/winconfigure.wsf:6404,6407-6408,6420,6422-6423
/mpich2/branches/dev/wintcp_async_progress/winconfigure.wsf:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/winconfigure.wsf:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/winconfigure.wsf:5406
/mpich2/trunk/winconfigure.wsf:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7676-7678,7681,7684,7686,7688,7690-7692,7694,7696,7700,7705,7707-7710,7712,7714,7719
+ /mpich2/branches/dev/ckpt/winconfigure.wsf:5050
/mpich2/branches/dev/ckpt2/winconfigure.wsf:5057-6537
/mpich2/branches/dev/error-return/winconfigure.wsf:7662-7670
/mpich2/branches/dev/ftb/winconfigure.wsf:5661-5730
/mpich2/branches/dev/lapi/winconfigure.wsf:5817
/mpich2/branches/dev/win_rrvm/winconfigure.wsf:6404,6407-6408,6420,6422-6423
/mpich2/branches/dev/wintcp_async_progress/winconfigure.wsf:5008-5009,5123,5555-5559,5561-5564,5566-5567,5570,5577-5581,5613-5616,5619
/mpich2/branches/release/mpich2-1.1.1/winconfigure.wsf:5022,5032,5110,5113,5140-5141
/mpich2/branches/release/mpich2-1.2/winconfigure.wsf:5406
/mpich2/trunk/winconfigure.wsf:7355-7359,7366-7367,7371-7402,7406-7409,7411-7416,7419-7420,7422-7425,7429-7433,7435,7437-7438,7442-7448,7459-7460,7462,7469-7470,7473-7478,7484-7485,7488-7491,7493-7502,7504,7507-7508,7510-7517,7519-7527,7529-7530,7532,7536,7538-7566,7568,7570,7572,7574,7576,7578,7581,7583,7592,7596,7604,7607-7622,7624-7630,7632-7635,7637,7639,7641-7643,7646-7649,7651-7654,7658-7659,7663,7665,7668,7671,7674,7676-7678,7681,7683-7688,7690-7692,7694,7696,7700-7702,7705,7707-7710,7712,7714,7719-7720,7722
More information about the mpich2-commits
mailing list