[mpich2-commits] r7702 - in mpich2/trunk/src/mpid/ch3/channels/nemesis: nemesis/include nemesis/src src
buntinas at mcs.anl.gov
buntinas at mcs.anl.gov
Wed Jan 12 15:24:31 CST 2011
Author: buntinas
Date: 2011-01-12 15:24:31 -0600 (Wed, 12 Jan 2011)
New Revision: 7702
Modified:
mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/include/mpid_nem_inline.h
mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/src/mpid_nem_ckpt.c
mpich2/trunk/src/mpid/ch3/channels/nemesis/src/ch3_progress.c
Log:
Make use of the atomic increment of the completion_count to bump the progress engine when a checkpoint is initiated or a failed process is detected. Also jump out of blocking_recv if the completion_count is bumped.
Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/include/mpid_nem_inline.h
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/include/mpid_nem_inline.h 2011-01-12 21:22:11 UTC (rev 7701)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/include/mpid_nem_inline.h 2011-01-12 21:24:31 UTC (rev 7702)
@@ -23,7 +23,7 @@
static inline int MPID_nem_mpich2_sendv_header (MPID_IOV **iov, int *n_iov, MPIDI_VC_t *vc, int *again);
static inline int MPID_nem_recv_seqno_matches (MPID_nem_queue_ptr_t qhead);
static inline int MPID_nem_mpich2_test_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox, int in_blocking_progress);
-static inline int MPID_nem_mpich2_blocking_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox);
+static inline int MPID_nem_mpich2_blocking_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox, int completions);
static inline int MPID_nem_mpich2_test_recv_wait (MPID_nem_cell_ptr_t *cell, int *in_fbox, int timeout);
static inline int MPID_nem_mpich2_release_cell (MPID_nem_cell_ptr_t cell, MPIDI_VC_t *vc);
static inline void MPID_nem_mpich2_send_seg_header (MPID_Segment *segment, MPIDI_msg_sz_t *segment_first,
@@ -863,10 +863,9 @@
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static inline int
-MPID_nem_mpich2_blocking_recv(MPID_nem_cell_ptr_t *cell, int *in_fbox)
+MPID_nem_mpich2_blocking_recv(MPID_nem_cell_ptr_t *cell, int *in_fbox, int completions)
{
int mpi_errno = MPI_SUCCESS;
- unsigned completions = OPA_load_int(&MPIDI_CH3I_progress_completion_count);
#ifndef ENABLE_NO_YIELD
int pollcount = 0;
#endif
@@ -905,8 +904,8 @@
mpi_errno = MPID_nem_network_poll(TRUE /* blocking */);
if (mpi_errno) MPIU_ERR_POP (mpi_errno);
- if (completions != OPA_load_int(&MPIDI_CH3I_progress_completion_count) || MPID_nem_local_lmt_pending
- || MPIDI_CH3I_active_send[CH3_NORMAL_QUEUE] || MPIDI_CH3I_SendQ_head(CH3_NORMAL_QUEUE))
+ if (MPID_nem_local_lmt_pending || MPIDI_CH3I_active_send[CH3_NORMAL_QUEUE]
+ || MPIDI_CH3I_SendQ_head(CH3_NORMAL_QUEUE))
{
*cell = NULL;
*in_fbox = 0;
@@ -921,6 +920,12 @@
}
++pollcount;
#endif
+
+ if (completions != OPA_load_int(&MPIDI_CH3I_progress_completion_count)) {
+ *cell = NULL;
+ *in_fbox = 0;
+ goto exit_l;
+ }
}
MPID_nem_queue_dequeue (MPID_nem_mem_region.my_recvQ, cell);
Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/src/mpid_nem_ckpt.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/src/mpid_nem_ckpt.c 2011-01-12 21:22:11 UTC (rev 7701)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/src/mpid_nem_ckpt.c 2011-01-12 21:24:31 UTC (rev 7702)
@@ -68,9 +68,12 @@
int rc, ret;
const struct cr_restart_info* ri;
- if (MPIDI_Process.my_pg_rank == 0)
+ if (MPIDI_Process.my_pg_rank == 0) {
MPIDI_nem_ckpt_start_checkpoint = TRUE;
-
+ /* poke the progress engine in case we're waiting in a blocking recv */
+ MPIDI_CH3_Progress_signal_completion();
+ }
+
ret = sem_wait(&ckpt_sem);
CHECK_ERR(ret, "sem_wait");
Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/src/ch3_progress.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/src/ch3_progress.c 2011-01-12 21:22:11 UTC (rev 7701)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/src/ch3_progress.c 2011-01-12 21:24:31 UTC (rev 7702)
@@ -71,6 +71,8 @@
static void sigusr1_handler(int sig)
{
++sigusr1_count;
+ /* poke the progress engine in case we're waiting in a blocking recv */
+ MPIDI_CH3_Progress_signal_completion();
}
/* MPIDI_CH3I_Shm_send_progress() this function makes progress sending
@@ -248,6 +250,12 @@
MPIU_Assert(progress_state != NULL);
}
+ if (sigusr1_count > my_sigusr1_count) {
+ my_sigusr1_count = sigusr1_count;
+ mpi_errno = MPIDI_CH3U_Check_for_failed_procs();
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ }
+
#ifdef ENABLE_CHECKPOINTING
if (MPIR_PARAM_ENABLE_CKPOINT) {
if (MPIDI_nem_ckpt_start_checkpoint) {
@@ -333,7 +341,7 @@
#endif
)
{
- mpi_errno = MPID_nem_mpich2_blocking_recv(&cell, &in_fbox);
+ mpi_errno = MPID_nem_mpich2_blocking_recv(&cell, &in_fbox, progress_state->ch.completion_count);
}
else
{
@@ -436,12 +444,6 @@
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
}
- if (sigusr1_count > my_sigusr1_count) {
- my_sigusr1_count = sigusr1_count;
- mpi_errno = MPIDI_CH3U_Check_for_failed_procs();
- if (mpi_errno) MPIU_ERR_POP(mpi_errno);
- }
-
/* in the case of progress_wait, bail out if anything completed (CC-1) */
if (is_blocking) {
int completion_count = OPA_load_int(&MPIDI_CH3I_progress_completion_count);
More information about the mpich2-commits
mailing list