[mpich2-commits] r8069 - in mpich2/trunk/src: include mpi/errhan mpi/rma mpid/ch3/src

gropp at mcs.anl.gov gropp at mcs.anl.gov
Mon Feb 28 09:51:54 CST 2011


Author: gropp
Date: 2011-02-28 09:51:54 -0600 (Mon, 28 Feb 2011)
New Revision: 8069

Modified:
   mpich2/trunk/src/include/mpiimpl.h
   mpich2/trunk/src/mpi/errhan/errnames.txt
   mpich2/trunk/src/mpi/rma/win_create.c
   mpich2/trunk/src/mpi/rma/win_free.c
   mpich2/trunk/src/mpi/rma/win_lock.c
   mpich2/trunk/src/mpi/rma/win_unlock.c
   mpich2/trunk/src/mpid/ch3/src/ch3u_rma_sync.c
Log:
Detect and report incorrect use of assert and/or rank in Win_lock and Win_unlock.  These detect an error in the current test suite that manifested itself as a storage leak.  Also corrected some man page information and instrumentation for RMA completion in non-fence modes 

Modified: mpich2/trunk/src/include/mpiimpl.h
===================================================================
--- mpich2/trunk/src/include/mpiimpl.h	2011-02-27 20:45:22 UTC (rev 8068)
+++ mpich2/trunk/src/include/mpiimpl.h	2011-02-28 15:51:54 UTC (rev 8069)
@@ -1554,6 +1554,8 @@
     MPID_Comm *comm_ptr;         /* Pointer to comm of window (dup) */
     int         myrank;          /* Rank of this process in comm (used to 
 				    detect operations on self) */
+    int lockRank;                /* If within an MPI_Win_lock epoch, 
+				    the rank that we locked */
 #ifdef USE_THREADED_WINDOW_CODE
     /* These were causing compilation errors.  We need to figure out how to
        integrate threads into MPICH2 before including these fields. */

Modified: mpich2/trunk/src/mpi/errhan/errnames.txt
===================================================================
--- mpich2/trunk/src/mpi/errhan/errnames.txt	2011-02-27 20:45:22 UTC (rev 8068)
+++ mpich2/trunk/src/mpi/errhan/errnames.txt	2011-02-28 15:51:54 UTC (rev 8069)
@@ -367,8 +367,23 @@
 **rmasize %d:Invalid size argument in RMA call (value is %d)
 **rmadisp:Invalid displacement argument in RMA call 
 **assert:Invalid assert argument
+**lockassertval:Invalid assert argument passed to MPI_Win_lock
+**lockassertval %d: Invalid assert argument (%d) passed to MPI_Win_lock
 **winunlockrank:Invalid rank argument
 **winunlockrank %d %d:Invalid rank argument %d, should be %d
+**mismatchedlockrank:Rank to unlock in MPI_Win_unlock was not locked with \
+ MPI_Win_lock
+**mismatchedlockrank %d %d:Rank to unlock (%d) in MPI_Win_unlock was not \
+ locked with MPI_Win_lock (rank %d was locked)
+**winunlockwithoutlock: Attempt to unlock in MPI_Win_unlock without \
+ first locking with MPI_Win_lock
+**winfreewhilelocked: MPI_Win_free called while holding an MPI_Win_lock on \
+ a window
+**winfreewhilelocked %d: MPI_Win_free called while holding an MPI_Win_lock on \
+ the process with rank %d
+**lockwhilelocked: Attempt to lock a process while holding a lock
+**lockwhilelocked %d: Attempt to lock a process while holding a lock on \
+ process %d
 **nomemreq:failure occurred while allocating memory for a request object
 **nomemuereq %d:Failed to allocate memory for an unexpected message. %d \
 unexpected messages queued.
@@ -395,6 +410,7 @@
 # Duplicates?
 #**argnull:Invalid null parameter
 #**argnull %s:Invalid null parameter %s
+**commstack:Internal overflow in stack used for MPI_Comm_split
 
 **cond_create:MPIU_Thread_cond_create failed
 **cond_create %s:MPIU_Thread_cond_create failed: %s

Modified: mpich2/trunk/src/mpi/rma/win_create.c
===================================================================
--- mpich2/trunk/src/mpi/rma/win_create.c	2011-02-27 20:45:22 UTC (rev 8068)
+++ mpich2/trunk/src/mpi/rma/win_create.c	2011-02-28 15:51:54 UTC (rev 8069)
@@ -47,9 +47,11 @@
 
 .N Errors
 .N MPI_SUCCESS
+.N MPI_ERR_ARG
 .N MPI_ERR_COMM
 .N MPI_ERR_INFO
 .N MPI_ERR_OTHER
+.N MPI_ERR_SIZE
 @*/
 int MPI_Win_create(void *base, MPI_Aint size, int disp_unit, MPI_Info info, 
 		   MPI_Comm comm, MPI_Win *win)
@@ -119,6 +121,7 @@
     /* Initialize a few fields that have specific defaults */
     win_ptr->name[0]    = 0;
     win_ptr->errhandler = 0;
+    win_ptr->lockRank   = -1;
 
     /* return the handle of the window object to the user */
     MPIU_OBJ_PUBLISH_HANDLE(*win, win_ptr->handle);

Modified: mpich2/trunk/src/mpi/rma/win_free.c
===================================================================
--- mpich2/trunk/src/mpi/rma/win_free.c	2011-02-27 20:45:22 UTC (rev 8068)
+++ mpich2/trunk/src/mpi/rma/win_free.c	2011-02-28 15:51:54 UTC (rev 8069)
@@ -45,6 +45,7 @@
 .N Errors
 .N MPI_SUCCESS
 .N MPI_ERR_WIN
+.N MPI_ERR_OTHER
 @*/
 int MPI_Win_free(MPI_Win *win)
 {
@@ -83,6 +84,12 @@
 	    /* If win_ptr is not valid, it will be reset to null */
 
             if (mpi_errno) goto fn_fail;
+	    /* Check for unterminated lock epoch */
+	    if (win_ptr->lockRank != -1) {
+		MPIU_ERR_SET1(mpi_errno,MPI_ERR_OTHER, 
+			     "**winfreewhilelocked",
+			     "**winfreewhilelocked %d", win_ptr->lockRank);
+	    }
         }
         MPID_END_ERROR_CHECKS;
     }

Modified: mpich2/trunk/src/mpi/rma/win_lock.c
===================================================================
--- mpich2/trunk/src/mpi/rma/win_lock.c	2011-02-27 20:45:22 UTC (rev 8068)
+++ mpich2/trunk/src/mpi/rma/win_lock.c	2011-02-28 15:51:54 UTC (rev 8069)
@@ -52,9 +52,9 @@
    be used portably only in such memory. 
 
    The 'assert' argument is used to indicate special conditions for the
-   fence that an implementation may use to optimize the 'MPI_Win_fence' 
+   fence that an implementation may use to optimize the 'MPI_Win_lock' 
    operation.  The value zero is always correct.  Other assertion values
-   may be or''ed together.  Assertions that are valid for 'MPI_Win_fence' are\:
+   may be or''ed together.  Assertions that are valid for 'MPI_Win_lock' are\:
 
 . MPI_MODE_NOCHECK - no other process holds, or will attempt to acquire a 
   conflicting lock, while the caller holds the window lock. This is useful 
@@ -70,6 +70,7 @@
 .N MPI_SUCCESS
 .N MPI_ERR_RANK
 .N MPI_ERR_WIN
+.N MPI_ERR_OTHER
 @*/
 int MPI_Win_lock(int lock_type, int rank, int assert, MPI_Win win)
 {
@@ -110,13 +111,22 @@
 	    /* If win_ptr is not value, it will be reset to null */
             if (mpi_errno) goto fn_fail;
 	    
-            if (lock_type != MPI_LOCK_SHARED && lock_type != MPI_LOCK_EXCLUSIVE)
-                mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, 
-						  MPIR_ERR_RECOVERABLE, 
-						  FCNAME, __LINE__, 
-						  MPI_ERR_OTHER, 
-						  "**locktype", 0 );
+	    if (assert != 0 && assert != MPI_MODE_NOCHECK) {
+		MPIU_ERR_SET1(mpi_errno,MPI_ERR_ARG,
+			      "**lockassertval", 
+			      "**lockassertval %d", assert );
+		if (mpi_errno) goto fn_fail;
+	    }
+            if (lock_type != MPI_LOCK_SHARED && 
+		lock_type != MPI_LOCK_EXCLUSIVE) {
+		MPIU_ERR_SET(mpi_errno,MPI_ERR_OTHER, "**locktype" );
+	    }
 
+	    if (win_ptr->lockRank != -1) {
+		MPIU_ERR_SET1(mpi_errno,MPI_ERR_OTHER, 
+			     "**lockwhilelocked", 
+			     "**lockwhillocked %d", win_ptr->lockRank );
+	    }
 	    comm_ptr = win_ptr->comm_ptr;
             MPIR_ERRTEST_SEND_RANK(comm_ptr, rank, mpi_errno);
 
@@ -131,6 +141,8 @@
     mpi_errno = MPIU_RMA_CALL(win_ptr,
 			      Win_lock(lock_type, rank, assert, win_ptr));
     if (mpi_errno != MPI_SUCCESS) goto fn_fail;
+    /* If the lock succeeded, remember which one with locked */
+    win_ptr->lockRank = rank;
 
     /* ... end of body of routine ... */
 

Modified: mpich2/trunk/src/mpi/rma/win_unlock.c
===================================================================
--- mpich2/trunk/src/mpi/rma/win_unlock.c	2011-02-27 20:45:22 UTC (rev 8068)
+++ mpich2/trunk/src/mpi/rma/win_unlock.c	2011-02-28 15:51:54 UTC (rev 8069)
@@ -89,8 +89,19 @@
 
 	    comm_ptr = win_ptr->comm_ptr;
             MPIR_ERRTEST_SEND_RANK(comm_ptr, rank, mpi_errno);
-
             if (mpi_errno) goto fn_fail;
+	    /* Test that the rank we are unlocking is the rank that we locked */
+	    if (win_ptr->lockRank != rank) {
+		if (win_ptr->lockRank < 0) {
+		    MPIU_ERR_SET(mpi_errno,MPI_ERR_RANK,"**winunlockwithoutlock");
+		}
+		else {
+		    MPIU_ERR_SET2(mpi_errno,MPI_ERR_RANK,
+		    "**mismatchedlockrank", 
+ 		    "**mismatchedlockrank %d %d", rank, win_ptr->lockRank );
+		}
+		if (mpi_errno) goto fn_fail;
+	    }
         }
         MPID_END_ERROR_CHECKS;
     }
@@ -100,6 +111,9 @@
     
     mpi_errno = MPIU_RMA_CALL(win_ptr,Win_unlock(rank, win_ptr));
     if (mpi_errno != MPI_SUCCESS) goto fn_fail;
+    /* Clear the lockRank on success with the unlock */
+    /* FIXME: Should this always be cleared, even on failure? */
+    win_ptr->lockRank = -1;
 
     /* ... end of body of routine ... */
 

Modified: mpich2/trunk/src/mpid/ch3/src/ch3u_rma_sync.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/src/ch3u_rma_sync.c	2011-02-27 20:45:22 UTC (rev 8068)
+++ mpich2/trunk/src/mpid/ch3/src/ch3u_rma_sync.c	2011-02-28 15:51:54 UTC (rev 8069)
@@ -26,11 +26,13 @@
 MPIU_INSTR_DURATION_DECL(wincomplete_issue);
 MPIU_INSTR_DURATION_DECL(wincomplete_complete);
 MPIU_INSTR_DURATION_DECL(wincomplete_recvsync);
+MPIU_INSTR_DURATION_DECL(wincomplete_block);
 MPIU_INSTR_DURATION_DECL(winwait_wait);
 MPIU_INSTR_DURATION_DECL(winlock_getlocallock);
 MPIU_INSTR_DURATION_DECL(winunlock_getlock);
 MPIU_INSTR_DURATION_DECL(winunlock_issue);
 MPIU_INSTR_DURATION_DECL(winunlock_complete);
+MPIU_INSTR_DURATION_DECL(winunlock_block);
 MPIU_INSTR_DURATION_DECL(lockqueue_alloc);
 MPIU_INSTR_DURATION_DECL(rmapkt_acc);
 MPIU_INSTR_DURATION_DECL(rmapkt_acc_predef);
@@ -53,11 +55,13 @@
     MPIU_INSTR_DURATION_INIT(wincomplete_recvsync,1,"WIN_COMPLETE:Recv sync messages");
     MPIU_INSTR_DURATION_INIT(wincomplete_issue,2,"WIN_COMPLETE:Issue RMA ops");
     MPIU_INSTR_DURATION_INIT(wincomplete_complete,1,"WIN_COMPLETE:Complete RMA ops");
+    MPIU_INSTR_DURATION_INIT(wincomplete_block,0,"WIN_COMPLETE:Wait for any progress");
     MPIU_INSTR_DURATION_INIT(winwait_wait,1,"WIN_WAIT:Wait for ops from other processes");
     MPIU_INSTR_DURATION_INIT(winlock_getlocallock,0,"WIN_LOCK:Get local lock");
     MPIU_INSTR_DURATION_INIT(winunlock_issue,2,"WIN_UNLOCK:Issue RMA ops");
     MPIU_INSTR_DURATION_INIT(winunlock_complete,1,"WIN_UNLOCK:Complete RMA ops");
     MPIU_INSTR_DURATION_INIT(winunlock_getlock,0,"WIN_UNLOCK:Acquire lock");
+    MPIU_INSTR_DURATION_INIT(winunlock_block,0,"WIN_UNLOCK:Wait for any progress");
     MPIU_INSTR_DURATION_INIT(rmapkt_acc,0,"RMA:PKTHANDLER for Accumulate");
     MPIU_INSTR_DURATION_INIT(rmapkt_acc_predef,0,"RMA:PKTHANDLER for Accumulate: predef dtype");
     MPIU_INSTR_DURATION_INIT(rmapkt_acc_immed,0,"RMA:PKTHANDLER for Accum immed");
@@ -1508,7 +1512,7 @@
 	       there was an incomplete request. */
 	    curr_ptr = win_ptr->rma_ops_list_head;
 	    if (curr_ptr && !MPID_Request_is_complete(curr_ptr->request) ) {
-		MPIU_INSTR_DURATION_START(winfence_block);
+		MPIU_INSTR_DURATION_START(wincomplete_block);
 		mpi_errno = MPID_Progress_wait(&progress_state);
 		/* --BEGIN ERROR HANDLING-- */
 		if (mpi_errno != MPI_SUCCESS) {
@@ -1516,7 +1520,7 @@
 		    MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
 		}
 		/* --END ERROR HANDLING-- */
-		MPIU_INSTR_DURATION_END(winfence_block);
+		MPIU_INSTR_DURATION_END(wincomplete_block);
 	    }
 	} /* While list of rma operation is non-empty */
 	    
@@ -1639,7 +1643,7 @@
     if (dest == MPI_PROC_NULL) goto fn_exit;
         
     comm_ptr = win_ptr->comm_ptr;
-    
+
     if (dest == win_ptr->myrank) {
 	/* The target is this process itself. We must block until the lock
 	 * is acquired. */
@@ -1667,7 +1671,6 @@
 	/* local lock acquired. local puts, gets, accumulates will be done 
 	   directly without queueing. */
     }
-    
     else {
 	/* target is some other process. add the lock request to rma_ops_list */
 	MPIU_INSTR_DURATION_START(rmaqueue_alloc);
@@ -1675,6 +1678,7 @@
 			    mpi_errno, "RMA operation entry");
 	MPIU_INSTR_DURATION_END(rmaqueue_alloc);
             
+	MPIU_Assert( !win_ptr->rma_ops_list_head );
 	win_ptr->rma_ops_list_head = new_ptr;
 	win_ptr->rma_ops_list_tail = new_ptr;
         
@@ -1714,17 +1718,18 @@
 
     if (dest == MPI_PROC_NULL) goto fn_exit;
         
-    comm_ptr = win_ptr->comm_ptr;
-        
     if (dest == win_ptr->myrank) {
 	/* local lock. release the lock on the window, grant the next one
 	 * in the queue, and return. */
+	MPIU_Assert(!win_ptr->rma_ops_list_head);
 	mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
 	if (mpi_errno != MPI_SUCCESS) goto fn_exit;
 	mpi_errno = MPID_Progress_poke();
 	goto fn_exit;
     }
         
+    comm_ptr = win_ptr->comm_ptr;
+        
     rma_op = win_ptr->rma_ops_list_head;
     
     /* win_lock was not called. return error */
@@ -1749,20 +1754,21 @@
     single_op_opt = 0;
 
     MPIDI_Comm_get_vc_set_active(comm_ptr, dest, &vc);
-   
+
     if (rma_op->next->next == NULL) {
 	/* Single put, get, or accumulate between the lock and unlock. If it
 	 * is of small size and predefined datatype at the target, we
 	 * do an optimization where the lock and the RMA operation are
 	 * sent in a single packet. Otherwise, we send a separate lock
 	 * request first. */
-	
+
 	curr_op = rma_op->next;
 	
 	MPID_Datatype_get_size_macro(curr_op->origin_datatype, type_size);
 	
 	MPIDI_CH3I_DATATYPE_IS_PREDEFINED(curr_op->target_datatype, predefined);
 
+	/* msg_sz typically = 65480 */
 	if ( predefined &&
 	     (type_size * curr_op->origin_count <= vc->eager_max_msg_sz) ) {
 	    single_op_opt = 1;
@@ -1869,6 +1875,7 @@
 	win_ptr->lock_granted = 0; 
     
  fn_exit:
+    MPIU_Assert( !win_ptr->rma_ops_list_head );
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_UNLOCK);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
@@ -2076,7 +2083,7 @@
 	       there was an incomplete request. */
 	    curr_ptr = win_ptr->rma_ops_list_head;
 	    if (curr_ptr && !MPID_Request_is_complete(curr_ptr->request) ) {
-		MPIU_INSTR_DURATION_START(winfence_block);
+		MPIU_INSTR_DURATION_START(winunlock_block);
 		mpi_errno = MPID_Progress_wait(&progress_state);
 		/* --BEGIN ERROR HANDLING-- */
 		if (mpi_errno != MPI_SUCCESS) {
@@ -2084,7 +2091,7 @@
 		    MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
 		}
 		/* --END ERROR HANDLING-- */
-		MPIU_INSTR_DURATION_END(winfence_block);
+		MPIU_INSTR_DURATION_END(winunlock_block);
 	    }
 	} /* While list of rma operation is non-empty */
 	    
@@ -2192,6 +2199,7 @@
         iov[0].MPID_IOV_LEN = sizeof(*lock_accum_unlock_pkt);
     }
     else {
+	/* FIXME: Error return */
 	printf( "expected short accumulate...\n" );
 	/* */
     }
@@ -3016,6 +3024,9 @@
 		    }
 		}
 		mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
+		/* Without the following signal_completion call, we 
+		   sometimes hang */
+		MPIDI_CH3_Progress_signal_completion();
 	    }
 	}
 
@@ -3063,6 +3074,7 @@
 	/* queue the lock information */
 	MPIDI_Win_lock_queue *curr_ptr, *prev_ptr, *new_ptr;
 	
+	/* Note: This code is reached by the fechandadd rma tests */
 	/* FIXME: MT: This may need to be done atomically. */
 	
 	/* FIXME: Since we need to add to the tail of the list,



More information about the mpich2-commits mailing list