[mpich2-commits] r8069 - in mpich2/trunk/src: include mpi/errhan mpi/rma mpid/ch3/src
gropp at mcs.anl.gov
gropp at mcs.anl.gov
Mon Feb 28 09:51:54 CST 2011
Author: gropp
Date: 2011-02-28 09:51:54 -0600 (Mon, 28 Feb 2011)
New Revision: 8069
Modified:
mpich2/trunk/src/include/mpiimpl.h
mpich2/trunk/src/mpi/errhan/errnames.txt
mpich2/trunk/src/mpi/rma/win_create.c
mpich2/trunk/src/mpi/rma/win_free.c
mpich2/trunk/src/mpi/rma/win_lock.c
mpich2/trunk/src/mpi/rma/win_unlock.c
mpich2/trunk/src/mpid/ch3/src/ch3u_rma_sync.c
Log:
Detect and report incorrect use of assert and/or rank in Win_lock and Win_unlock. These detect an error in the current test suite that manifested itself as a storage leak. Also corrected some man page information and instrumentation for RMA completion in non-fence modes
Modified: mpich2/trunk/src/include/mpiimpl.h
===================================================================
--- mpich2/trunk/src/include/mpiimpl.h 2011-02-27 20:45:22 UTC (rev 8068)
+++ mpich2/trunk/src/include/mpiimpl.h 2011-02-28 15:51:54 UTC (rev 8069)
@@ -1554,6 +1554,8 @@
MPID_Comm *comm_ptr; /* Pointer to comm of window (dup) */
int myrank; /* Rank of this process in comm (used to
detect operations on self) */
+ int lockRank; /* If within an MPI_Win_lock epoch,
+ the rank that we locked */
#ifdef USE_THREADED_WINDOW_CODE
/* These were causing compilation errors. We need to figure out how to
integrate threads into MPICH2 before including these fields. */
Modified: mpich2/trunk/src/mpi/errhan/errnames.txt
===================================================================
--- mpich2/trunk/src/mpi/errhan/errnames.txt 2011-02-27 20:45:22 UTC (rev 8068)
+++ mpich2/trunk/src/mpi/errhan/errnames.txt 2011-02-28 15:51:54 UTC (rev 8069)
@@ -367,8 +367,23 @@
**rmasize %d:Invalid size argument in RMA call (value is %d)
**rmadisp:Invalid displacement argument in RMA call
**assert:Invalid assert argument
+**lockassertval:Invalid assert argument passed to MPI_Win_lock
+**lockassertval %d: Invalid assert argument (%d) passed to MPI_Win_lock
**winunlockrank:Invalid rank argument
**winunlockrank %d %d:Invalid rank argument %d, should be %d
+**mismatchedlockrank:Rank to unlock in MPI_Win_unlock was not locked with \
+ MPI_Win_lock
+**mismatchedlockrank %d %d:Rank to unlock (%d) in MPI_Win_unlock was not \
+ locked with MPI_Win_lock (rank %d was locked)
+**winunlockwithoutlock: Attempt to unlock in MPI_Win_unlock without \
+ first locking with MPI_Win_lock
+**winfreewhilelocked: MPI_Win_free called while holding an MPI_Win_lock on \
+ a window
+**winfreewhilelocked %d: MPI_Win_free called while holding an MPI_Win_lock on \
+ the process with rank %d
+**lockwhilelocked: Attempt to lock a process while holding a lock
+**lockwhilelocked %d: Attempt to lock a process while holding a lock on \
+ process %d
**nomemreq:failure occurred while allocating memory for a request object
**nomemuereq %d:Failed to allocate memory for an unexpected message. %d \
unexpected messages queued.
@@ -395,6 +410,7 @@
# Duplicates?
#**argnull:Invalid null parameter
#**argnull %s:Invalid null parameter %s
+**commstack:Internal overflow in stack used for MPI_Comm_split
**cond_create:MPIU_Thread_cond_create failed
**cond_create %s:MPIU_Thread_cond_create failed: %s
Modified: mpich2/trunk/src/mpi/rma/win_create.c
===================================================================
--- mpich2/trunk/src/mpi/rma/win_create.c 2011-02-27 20:45:22 UTC (rev 8068)
+++ mpich2/trunk/src/mpi/rma/win_create.c 2011-02-28 15:51:54 UTC (rev 8069)
@@ -47,9 +47,11 @@
.N Errors
.N MPI_SUCCESS
+.N MPI_ERR_ARG
.N MPI_ERR_COMM
.N MPI_ERR_INFO
.N MPI_ERR_OTHER
+.N MPI_ERR_SIZE
@*/
int MPI_Win_create(void *base, MPI_Aint size, int disp_unit, MPI_Info info,
MPI_Comm comm, MPI_Win *win)
@@ -119,6 +121,7 @@
/* Initialize a few fields that have specific defaults */
win_ptr->name[0] = 0;
win_ptr->errhandler = 0;
+ win_ptr->lockRank = -1;
/* return the handle of the window object to the user */
MPIU_OBJ_PUBLISH_HANDLE(*win, win_ptr->handle);
Modified: mpich2/trunk/src/mpi/rma/win_free.c
===================================================================
--- mpich2/trunk/src/mpi/rma/win_free.c 2011-02-27 20:45:22 UTC (rev 8068)
+++ mpich2/trunk/src/mpi/rma/win_free.c 2011-02-28 15:51:54 UTC (rev 8069)
@@ -45,6 +45,7 @@
.N Errors
.N MPI_SUCCESS
.N MPI_ERR_WIN
+.N MPI_ERR_OTHER
@*/
int MPI_Win_free(MPI_Win *win)
{
@@ -83,6 +84,12 @@
/* If win_ptr is not valid, it will be reset to null */
if (mpi_errno) goto fn_fail;
+ /* Check for unterminated lock epoch */
+ if (win_ptr->lockRank != -1) {
+ MPIU_ERR_SET1(mpi_errno,MPI_ERR_OTHER,
+ "**winfreewhilelocked",
+ "**winfreewhilelocked %d", win_ptr->lockRank);
+ }
}
MPID_END_ERROR_CHECKS;
}
Modified: mpich2/trunk/src/mpi/rma/win_lock.c
===================================================================
--- mpich2/trunk/src/mpi/rma/win_lock.c 2011-02-27 20:45:22 UTC (rev 8068)
+++ mpich2/trunk/src/mpi/rma/win_lock.c 2011-02-28 15:51:54 UTC (rev 8069)
@@ -52,9 +52,9 @@
be used portably only in such memory.
The 'assert' argument is used to indicate special conditions for the
- fence that an implementation may use to optimize the 'MPI_Win_fence'
+ fence that an implementation may use to optimize the 'MPI_Win_lock'
operation. The value zero is always correct. Other assertion values
- may be or''ed together. Assertions that are valid for 'MPI_Win_fence' are\:
+ may be or''ed together. Assertions that are valid for 'MPI_Win_lock' are\:
. MPI_MODE_NOCHECK - no other process holds, or will attempt to acquire a
conflicting lock, while the caller holds the window lock. This is useful
@@ -70,6 +70,7 @@
.N MPI_SUCCESS
.N MPI_ERR_RANK
.N MPI_ERR_WIN
+.N MPI_ERR_OTHER
@*/
int MPI_Win_lock(int lock_type, int rank, int assert, MPI_Win win)
{
@@ -110,13 +111,22 @@
/* If win_ptr is not value, it will be reset to null */
if (mpi_errno) goto fn_fail;
- if (lock_type != MPI_LOCK_SHARED && lock_type != MPI_LOCK_EXCLUSIVE)
- mpi_errno = MPIR_Err_create_code( MPI_SUCCESS,
- MPIR_ERR_RECOVERABLE,
- FCNAME, __LINE__,
- MPI_ERR_OTHER,
- "**locktype", 0 );
+ if (assert != 0 && assert != MPI_MODE_NOCHECK) {
+ MPIU_ERR_SET1(mpi_errno,MPI_ERR_ARG,
+ "**lockassertval",
+ "**lockassertval %d", assert );
+ if (mpi_errno) goto fn_fail;
+ }
+ if (lock_type != MPI_LOCK_SHARED &&
+ lock_type != MPI_LOCK_EXCLUSIVE) {
+ MPIU_ERR_SET(mpi_errno,MPI_ERR_OTHER, "**locktype" );
+ }
+ if (win_ptr->lockRank != -1) {
+ MPIU_ERR_SET1(mpi_errno,MPI_ERR_OTHER,
+ "**lockwhilelocked",
+ "**lockwhillocked %d", win_ptr->lockRank );
+ }
comm_ptr = win_ptr->comm_ptr;
MPIR_ERRTEST_SEND_RANK(comm_ptr, rank, mpi_errno);
@@ -131,6 +141,8 @@
mpi_errno = MPIU_RMA_CALL(win_ptr,
Win_lock(lock_type, rank, assert, win_ptr));
if (mpi_errno != MPI_SUCCESS) goto fn_fail;
+ /* If the lock succeeded, remember which one with locked */
+ win_ptr->lockRank = rank;
/* ... end of body of routine ... */
Modified: mpich2/trunk/src/mpi/rma/win_unlock.c
===================================================================
--- mpich2/trunk/src/mpi/rma/win_unlock.c 2011-02-27 20:45:22 UTC (rev 8068)
+++ mpich2/trunk/src/mpi/rma/win_unlock.c 2011-02-28 15:51:54 UTC (rev 8069)
@@ -89,8 +89,19 @@
comm_ptr = win_ptr->comm_ptr;
MPIR_ERRTEST_SEND_RANK(comm_ptr, rank, mpi_errno);
-
if (mpi_errno) goto fn_fail;
+ /* Test that the rank we are unlocking is the rank that we locked */
+ if (win_ptr->lockRank != rank) {
+ if (win_ptr->lockRank < 0) {
+ MPIU_ERR_SET(mpi_errno,MPI_ERR_RANK,"**winunlockwithoutlock");
+ }
+ else {
+ MPIU_ERR_SET2(mpi_errno,MPI_ERR_RANK,
+ "**mismatchedlockrank",
+ "**mismatchedlockrank %d %d", rank, win_ptr->lockRank );
+ }
+ if (mpi_errno) goto fn_fail;
+ }
}
MPID_END_ERROR_CHECKS;
}
@@ -100,6 +111,9 @@
mpi_errno = MPIU_RMA_CALL(win_ptr,Win_unlock(rank, win_ptr));
if (mpi_errno != MPI_SUCCESS) goto fn_fail;
+ /* Clear the lockRank on success with the unlock */
+ /* FIXME: Should this always be cleared, even on failure? */
+ win_ptr->lockRank = -1;
/* ... end of body of routine ... */
Modified: mpich2/trunk/src/mpid/ch3/src/ch3u_rma_sync.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/src/ch3u_rma_sync.c 2011-02-27 20:45:22 UTC (rev 8068)
+++ mpich2/trunk/src/mpid/ch3/src/ch3u_rma_sync.c 2011-02-28 15:51:54 UTC (rev 8069)
@@ -26,11 +26,13 @@
MPIU_INSTR_DURATION_DECL(wincomplete_issue);
MPIU_INSTR_DURATION_DECL(wincomplete_complete);
MPIU_INSTR_DURATION_DECL(wincomplete_recvsync);
+MPIU_INSTR_DURATION_DECL(wincomplete_block);
MPIU_INSTR_DURATION_DECL(winwait_wait);
MPIU_INSTR_DURATION_DECL(winlock_getlocallock);
MPIU_INSTR_DURATION_DECL(winunlock_getlock);
MPIU_INSTR_DURATION_DECL(winunlock_issue);
MPIU_INSTR_DURATION_DECL(winunlock_complete);
+MPIU_INSTR_DURATION_DECL(winunlock_block);
MPIU_INSTR_DURATION_DECL(lockqueue_alloc);
MPIU_INSTR_DURATION_DECL(rmapkt_acc);
MPIU_INSTR_DURATION_DECL(rmapkt_acc_predef);
@@ -53,11 +55,13 @@
MPIU_INSTR_DURATION_INIT(wincomplete_recvsync,1,"WIN_COMPLETE:Recv sync messages");
MPIU_INSTR_DURATION_INIT(wincomplete_issue,2,"WIN_COMPLETE:Issue RMA ops");
MPIU_INSTR_DURATION_INIT(wincomplete_complete,1,"WIN_COMPLETE:Complete RMA ops");
+ MPIU_INSTR_DURATION_INIT(wincomplete_block,0,"WIN_COMPLETE:Wait for any progress");
MPIU_INSTR_DURATION_INIT(winwait_wait,1,"WIN_WAIT:Wait for ops from other processes");
MPIU_INSTR_DURATION_INIT(winlock_getlocallock,0,"WIN_LOCK:Get local lock");
MPIU_INSTR_DURATION_INIT(winunlock_issue,2,"WIN_UNLOCK:Issue RMA ops");
MPIU_INSTR_DURATION_INIT(winunlock_complete,1,"WIN_UNLOCK:Complete RMA ops");
MPIU_INSTR_DURATION_INIT(winunlock_getlock,0,"WIN_UNLOCK:Acquire lock");
+ MPIU_INSTR_DURATION_INIT(winunlock_block,0,"WIN_UNLOCK:Wait for any progress");
MPIU_INSTR_DURATION_INIT(rmapkt_acc,0,"RMA:PKTHANDLER for Accumulate");
MPIU_INSTR_DURATION_INIT(rmapkt_acc_predef,0,"RMA:PKTHANDLER for Accumulate: predef dtype");
MPIU_INSTR_DURATION_INIT(rmapkt_acc_immed,0,"RMA:PKTHANDLER for Accum immed");
@@ -1508,7 +1512,7 @@
there was an incomplete request. */
curr_ptr = win_ptr->rma_ops_list_head;
if (curr_ptr && !MPID_Request_is_complete(curr_ptr->request) ) {
- MPIU_INSTR_DURATION_START(winfence_block);
+ MPIU_INSTR_DURATION_START(wincomplete_block);
mpi_errno = MPID_Progress_wait(&progress_state);
/* --BEGIN ERROR HANDLING-- */
if (mpi_errno != MPI_SUCCESS) {
@@ -1516,7 +1520,7 @@
MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
}
/* --END ERROR HANDLING-- */
- MPIU_INSTR_DURATION_END(winfence_block);
+ MPIU_INSTR_DURATION_END(wincomplete_block);
}
} /* While list of rma operation is non-empty */
@@ -1639,7 +1643,7 @@
if (dest == MPI_PROC_NULL) goto fn_exit;
comm_ptr = win_ptr->comm_ptr;
-
+
if (dest == win_ptr->myrank) {
/* The target is this process itself. We must block until the lock
* is acquired. */
@@ -1667,7 +1671,6 @@
/* local lock acquired. local puts, gets, accumulates will be done
directly without queueing. */
}
-
else {
/* target is some other process. add the lock request to rma_ops_list */
MPIU_INSTR_DURATION_START(rmaqueue_alloc);
@@ -1675,6 +1678,7 @@
mpi_errno, "RMA operation entry");
MPIU_INSTR_DURATION_END(rmaqueue_alloc);
+ MPIU_Assert( !win_ptr->rma_ops_list_head );
win_ptr->rma_ops_list_head = new_ptr;
win_ptr->rma_ops_list_tail = new_ptr;
@@ -1714,17 +1718,18 @@
if (dest == MPI_PROC_NULL) goto fn_exit;
- comm_ptr = win_ptr->comm_ptr;
-
if (dest == win_ptr->myrank) {
/* local lock. release the lock on the window, grant the next one
* in the queue, and return. */
+ MPIU_Assert(!win_ptr->rma_ops_list_head);
mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
if (mpi_errno != MPI_SUCCESS) goto fn_exit;
mpi_errno = MPID_Progress_poke();
goto fn_exit;
}
+ comm_ptr = win_ptr->comm_ptr;
+
rma_op = win_ptr->rma_ops_list_head;
/* win_lock was not called. return error */
@@ -1749,20 +1754,21 @@
single_op_opt = 0;
MPIDI_Comm_get_vc_set_active(comm_ptr, dest, &vc);
-
+
if (rma_op->next->next == NULL) {
/* Single put, get, or accumulate between the lock and unlock. If it
* is of small size and predefined datatype at the target, we
* do an optimization where the lock and the RMA operation are
* sent in a single packet. Otherwise, we send a separate lock
* request first. */
-
+
curr_op = rma_op->next;
MPID_Datatype_get_size_macro(curr_op->origin_datatype, type_size);
MPIDI_CH3I_DATATYPE_IS_PREDEFINED(curr_op->target_datatype, predefined);
+ /* msg_sz typically = 65480 */
if ( predefined &&
(type_size * curr_op->origin_count <= vc->eager_max_msg_sz) ) {
single_op_opt = 1;
@@ -1869,6 +1875,7 @@
win_ptr->lock_granted = 0;
fn_exit:
+ MPIU_Assert( !win_ptr->rma_ops_list_head );
MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_UNLOCK);
return mpi_errno;
/* --BEGIN ERROR HANDLING-- */
@@ -2076,7 +2083,7 @@
there was an incomplete request. */
curr_ptr = win_ptr->rma_ops_list_head;
if (curr_ptr && !MPID_Request_is_complete(curr_ptr->request) ) {
- MPIU_INSTR_DURATION_START(winfence_block);
+ MPIU_INSTR_DURATION_START(winunlock_block);
mpi_errno = MPID_Progress_wait(&progress_state);
/* --BEGIN ERROR HANDLING-- */
if (mpi_errno != MPI_SUCCESS) {
@@ -2084,7 +2091,7 @@
MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
}
/* --END ERROR HANDLING-- */
- MPIU_INSTR_DURATION_END(winfence_block);
+ MPIU_INSTR_DURATION_END(winunlock_block);
}
} /* While list of rma operation is non-empty */
@@ -2192,6 +2199,7 @@
iov[0].MPID_IOV_LEN = sizeof(*lock_accum_unlock_pkt);
}
else {
+ /* FIXME: Error return */
printf( "expected short accumulate...\n" );
/* */
}
@@ -3016,6 +3024,9 @@
}
}
mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
+ /* Without the following signal_completion call, we
+ sometimes hang */
+ MPIDI_CH3_Progress_signal_completion();
}
}
@@ -3063,6 +3074,7 @@
/* queue the lock information */
MPIDI_Win_lock_queue *curr_ptr, *prev_ptr, *new_ptr;
+ /* Note: This code is reached by the fechandadd rma tests */
/* FIXME: MT: This may need to be done atomically. */
/* FIXME: Since we need to add to the tail of the list,
More information about the mpich2-commits
mailing list