[mpich2-commits] r7702 - in mpich2/trunk/src/mpid/ch3/channels/nemesis: nemesis/include nemesis/src src

buntinas at mcs.anl.gov buntinas at mcs.anl.gov
Wed Jan 12 15:24:31 CST 2011


Author: buntinas
Date: 2011-01-12 15:24:31 -0600 (Wed, 12 Jan 2011)
New Revision: 7702

Modified:
   mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/include/mpid_nem_inline.h
   mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/src/mpid_nem_ckpt.c
   mpich2/trunk/src/mpid/ch3/channels/nemesis/src/ch3_progress.c
Log:
Make use of the atomic increment of the completion_count to bump the progress engine when a checkpoint is initiated or a failed process is detected.  Also jump out of blocking_recv if the completion_count is bumped.

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/include/mpid_nem_inline.h
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/include/mpid_nem_inline.h	2011-01-12 21:22:11 UTC (rev 7701)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/include/mpid_nem_inline.h	2011-01-12 21:24:31 UTC (rev 7702)
@@ -23,7 +23,7 @@
 static inline int MPID_nem_mpich2_sendv_header (MPID_IOV **iov, int *n_iov, MPIDI_VC_t *vc, int *again);
 static inline int MPID_nem_recv_seqno_matches (MPID_nem_queue_ptr_t qhead);
 static inline int MPID_nem_mpich2_test_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox, int in_blocking_progress);
-static inline int MPID_nem_mpich2_blocking_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox);
+static inline int MPID_nem_mpich2_blocking_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox, int completions);
 static inline int MPID_nem_mpich2_test_recv_wait (MPID_nem_cell_ptr_t *cell, int *in_fbox, int timeout);
 static inline int MPID_nem_mpich2_release_cell (MPID_nem_cell_ptr_t cell, MPIDI_VC_t *vc);
 static inline void MPID_nem_mpich2_send_seg_header (MPID_Segment *segment, MPIDI_msg_sz_t *segment_first,
@@ -863,10 +863,9 @@
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 static inline int
-MPID_nem_mpich2_blocking_recv(MPID_nem_cell_ptr_t *cell, int *in_fbox)
+MPID_nem_mpich2_blocking_recv(MPID_nem_cell_ptr_t *cell, int *in_fbox, int completions)
 {
     int mpi_errno = MPI_SUCCESS;
-    unsigned completions = OPA_load_int(&MPIDI_CH3I_progress_completion_count);
 #ifndef ENABLE_NO_YIELD
     int pollcount = 0;
 #endif
@@ -905,8 +904,8 @@
 	    mpi_errno = MPID_nem_network_poll(TRUE /* blocking */);
             if (mpi_errno) MPIU_ERR_POP (mpi_errno);
 
-            if (completions != OPA_load_int(&MPIDI_CH3I_progress_completion_count) || MPID_nem_local_lmt_pending
-                || MPIDI_CH3I_active_send[CH3_NORMAL_QUEUE] || MPIDI_CH3I_SendQ_head(CH3_NORMAL_QUEUE))
+            if (MPID_nem_local_lmt_pending || MPIDI_CH3I_active_send[CH3_NORMAL_QUEUE]
+                || MPIDI_CH3I_SendQ_head(CH3_NORMAL_QUEUE))
             {
                 *cell = NULL;
                 *in_fbox = 0;
@@ -921,6 +920,12 @@
 	}
 	++pollcount;
 #endif
+
+        if (completions != OPA_load_int(&MPIDI_CH3I_progress_completion_count)) {
+            *cell = NULL;
+            *in_fbox = 0;
+            goto exit_l;
+        }
     }
 
     MPID_nem_queue_dequeue (MPID_nem_mem_region.my_recvQ, cell);

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/src/mpid_nem_ckpt.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/src/mpid_nem_ckpt.c	2011-01-12 21:22:11 UTC (rev 7701)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/src/mpid_nem_ckpt.c	2011-01-12 21:24:31 UTC (rev 7702)
@@ -68,9 +68,12 @@
     int rc, ret;
     const struct cr_restart_info* ri;
 
-    if (MPIDI_Process.my_pg_rank == 0)
+    if (MPIDI_Process.my_pg_rank == 0) {
         MPIDI_nem_ckpt_start_checkpoint = TRUE;
-
+        /* poke the progress engine in case we're waiting in a blocking recv */
+        MPIDI_CH3_Progress_signal_completion();
+    }
+    
     ret = sem_wait(&ckpt_sem);
     CHECK_ERR(ret, "sem_wait");
 

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/src/ch3_progress.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/src/ch3_progress.c	2011-01-12 21:22:11 UTC (rev 7701)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/src/ch3_progress.c	2011-01-12 21:24:31 UTC (rev 7702)
@@ -71,6 +71,8 @@
 static void sigusr1_handler(int sig)
 {
     ++sigusr1_count;
+    /* poke the progress engine in case we're waiting in a blocking recv */
+    MPIDI_CH3_Progress_signal_completion();
 }
 
 /* MPIDI_CH3I_Shm_send_progress() this function makes progress sending
@@ -248,6 +250,12 @@
         MPIU_Assert(progress_state != NULL);
     }
 
+    if (sigusr1_count > my_sigusr1_count) {
+        my_sigusr1_count = sigusr1_count;
+        mpi_errno = MPIDI_CH3U_Check_for_failed_procs();
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
+    
 #ifdef ENABLE_CHECKPOINTING
     if (MPIR_PARAM_ENABLE_CKPOINT) {
         if (MPIDI_nem_ckpt_start_checkpoint) {
@@ -333,7 +341,7 @@
 #endif
                 )
             {
-                mpi_errno = MPID_nem_mpich2_blocking_recv(&cell, &in_fbox);
+                mpi_errno = MPID_nem_mpich2_blocking_recv(&cell, &in_fbox, progress_state->ch.completion_count);
             }
             else
             {
@@ -436,12 +444,6 @@
             if (mpi_errno) MPIU_ERR_POP(mpi_errno);
         }
 
-        if (sigusr1_count > my_sigusr1_count) {
-            my_sigusr1_count = sigusr1_count;
-            mpi_errno = MPIDI_CH3U_Check_for_failed_procs();
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-        }
-    
         /* in the case of progress_wait, bail out if anything completed (CC-1) */
         if (is_blocking) {
             int completion_count = OPA_load_int(&MPIDI_CH3I_progress_completion_count);



More information about the mpich2-commits mailing list