[mpich2-commits] r7416 - in mpich2/trunk: . src/include src/mpi/rma src/mpid/ch3/include src/mpid/ch3/src src/util/instrm
gropp at mcs.anl.gov
gropp at mcs.anl.gov
Sat Nov 6 08:08:15 CDT 2010
Author: gropp
Date: 2010-11-06 08:08:15 -0500 (Sat, 06 Nov 2010)
New Revision: 7416
Added:
mpich2/trunk/src/include/mpiinstr.h
mpich2/trunk/src/util/instrm/instr.c
Modified:
mpich2/trunk/configure.in
mpich2/trunk/src/include/mpiimpl.h
mpich2/trunk/src/mpi/rma/accumulate.c
mpich2/trunk/src/mpi/rma/get.c
mpich2/trunk/src/mpi/rma/put.c
mpich2/trunk/src/mpi/rma/win_get_group.c
mpich2/trunk/src/mpi/rma/win_lock.c
mpich2/trunk/src/mpi/rma/win_unlock.c
mpich2/trunk/src/mpid/ch3/include/mpidimpl.h
mpich2/trunk/src/mpid/ch3/include/mpidpkt.h
mpich2/trunk/src/mpid/ch3/include/mpidpre.h
mpich2/trunk/src/mpid/ch3/include/mpidrma.h
mpich2/trunk/src/mpid/ch3/src/ch3u_handle_recv_pkt.c
mpich2/trunk/src/mpid/ch3/src/ch3u_handle_recv_req.c
mpich2/trunk/src/mpid/ch3/src/ch3u_rma_ops.c
mpich2/trunk/src/mpid/ch3/src/ch3u_rma_sync.c
mpich2/trunk/src/util/instrm/Makefile.sm
Log:
Major improvement to RMA performance for long lists of operations; added an immediate-mode accumulate for single ints; the window now stores the MPID_Comm pointer; and added a basic performance instrumentation interface (enabled with --enable-g=instr) that was extensively used to improve the RMA performance. With these fixes, MPICH2 can run the one-sided version of the Graph500 benchmark at a respectable, if not great, rate.
Modified: mpich2/trunk/configure.in
===================================================================
--- mpich2/trunk/configure.in 2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/configure.in 2010-11-06 13:08:15 UTC (rev 7416)
@@ -135,7 +135,6 @@
AC_MSG_ERROR([Version information not found. Configuration aborted.])
fi
AC_SUBST(MPICH2_RELEASE_DATE)
-
# Produce a numeric version assuming the following format:
# Version: [MAJ].[MIN].[REV][EXT][EXT_NUMBER]
# Example: 1.0.7rc1 has
@@ -330,6 +329,7 @@
compiler flags, i.e. MPICH2LIB_CFLAGS, MPICH2LIB_CXXFLAGS,
MPICH2LIB_FFLAGS, and MPICH2LIB_FCFLAGS.
debug - Synonym for dbg
+ instr - Enable instrumentation
log - Enable debug event logging
mem - Memory usage tracing
meminit - Preinitialize memory associated structures and unions to
@@ -740,7 +740,9 @@
MPI_DEFAULT_FOPTS="-$option"
MPI_DEFAULT_FCOPTS="-$option"
else
+ IFS="$save_IFS"
AC_MSG_WARN([Unknown value $option for --enable-fast])
+ IFS=","
fi
;;
none|no)
@@ -751,7 +753,9 @@
enable_append_ndebug=no
;;
*)
+ IFS="$save_IFS"
AC_MSG_WARN([Unknown value $option for --enable-fast])
+ IFS=","
;;
esac
done
@@ -1260,6 +1264,9 @@
handle)
AC_DEFINE(MPICH_DEBUG_HANDLES,1,[Define to enable handle checking])
;;
+ instr)
+ perform_instr=yes
+ ;;
meminit)
perform_meminit=yes
;;
@@ -1284,12 +1291,15 @@
perform_dbglog=yes
enable_append_g=yes
perform_meminit=yes
+ perform_instr=yes
perform_dbgmutex=yes
perform_mutexnesting=yes
perform_handlealloc=yes
;;
*)
- AC_MSG_WARN([Unknown value $enable_g for enable-g])
+ IFS=$save_IFS
+ AC_MSG_WARN([Unknown value $option for enable-g])
+ IFS=","
;;
esac
done
@@ -1311,8 +1321,11 @@
AC_DEFINE(MPICH_DEBUG_MEMINIT,1,[Define to enable preinitialization of memory used by structures and unions])
fi
if test "$perform_handlealloc" = yes ; then
- AC_DEFINE(MPICH_DEBUG_HANDLEALLOC,1,[Define to enable checking of handles still allocated at MPI_Finalize])
+ AC_DEFINE(MPICH_DEBUG_HANDLEALLOC,1,[Define to enable checking of handles still allocated at MPI_Finalize])
fi
+if test "$perform_instr" = yes ; then
+ AC_DEFINE(USE_MPIU_INSTR,1,[Define this to enable internal instrumentation] )
+fi
if test -n "$perform_memtracing" ; then
enable_g_mem=yes
Modified: mpich2/trunk/src/include/mpiimpl.h
===================================================================
--- mpich2/trunk/src/include/mpiimpl.h 2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/include/mpiimpl.h 2010-11-06 13:08:15 UTC (rev 7416)
@@ -1497,7 +1497,9 @@
MPID_Attribute *attributes;
MPID_Group *start_group_ptr; /* group passed in MPI_Win_start */
int start_assert; /* assert passed to MPI_Win_start */
- MPI_Comm comm; /* communicator of window (dup) */
+ MPID_Comm *comm_ptr; /* Pointer to comm of window (dup) */
+ int myrank; /* Rank of this process in comm (used to
+ detect operations on self) */
#ifdef USE_THREADED_WINDOW_CODE
/* These were causing compilation errors. We need to figure out how to
integrate threads into MPICH2 before including these fields. */
@@ -1960,6 +1962,9 @@
#include "mpierror.h"
#include "mpierrs.h"
+/* Definitions for instrumentation (currently used within RMA code) */
+#include "mpiinstr.h"
+
/* FIXME: This routine is only used within mpi/src/err/errutil.c and
smpd. We may not want to export it. */
void MPIR_Err_print_stack(FILE *, int);
Added: mpich2/trunk/src/include/mpiinstr.h
===================================================================
--- mpich2/trunk/src/include/mpiinstr.h (rev 0)
+++ mpich2/trunk/src/include/mpiinstr.h 2010-11-06 13:08:15 UTC (rev 7416)
@@ -0,0 +1,82 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ * (C) 2010 by Argonne National Laboratory.
+ * See COPYRIGHT in top-level directory.
+ */
+#ifndef MPIINSTR_H_INCLUDED
+#define MPIINSTR_H_INCLUDED
+
+#ifdef USE_MPIU_INSTR
+
+#define MPIU_INSTR_TYPE_DURATION 1
+
+typedef struct MPIU_INSTR_Generic_t {
+ int instrType;
+ void *next;
+ int count;
+ const char *desc;
+ int (*toStr)( char *buf, size_t maxlen, void *handlePtr );
+} MPIU_INSTR_Generic_t;
+
+#define MPIU_INSTR_MAX_DATA 8
+typedef struct MPIU_INSTR_Duration_count_t {
+ int instrType;
+ void *next;
+ int count; /* Number of times in duration */
+ const char *desc; /* Character string describing duration */
+ int (*toStr)( char *buf, size_t maxlen, void *handlePtr );
+ MPID_Time_t ttime, /* Time in duration */
+ curstart; /* Time of entry into current duration */
+ int nitems; /* Number of items in data */
+ int data[MPIU_INSTR_MAX_DATA]; /* Used to hold additional data */
+ } MPIU_INSTR_Duration_count;
+
+/* Prototypes for visible routines */
+int MPIU_INSTR_AddHandle( void * );
+int MPIU_INSTR_ToStr_Duration_Count( char *, size_t, void * );
+
+/* Definitions for including instrumentation in files*/
+
+#define MPIU_INSTR_DURATION_DECL(name_) \
+ struct MPIU_INSTR_Duration_count_t MPIU_INSTR_HANDLE_##name_ = { 0 };
+#define MPIU_INSTR_DURATION_EXTERN_DECL(name_) \
+ extern struct MPIU_INSTR_Duration_count_t MPIU_INSTR_HANDLE_##name_;
+/* FIXME: Need a generic way to zero the time */
+#define MPIU_INSTR_DURATION_INIT(name_,nitems_,desc_) \
+ MPIU_INSTR_HANDLE_##name_.count = 0; \
+ MPIU_INSTR_HANDLE_##name_.desc = (const char *)MPIU_Strdup( desc_ ); \
+ memset( &MPIU_INSTR_HANDLE_##name_.ttime,0,sizeof(MPID_Time_t));\
+ MPIU_INSTR_HANDLE_##name_.toStr = MPIU_INSTR_ToStr_Duration_Count;\
+ MPIU_INSTR_HANDLE_##name_.nitems = nitems_;\
+ memset( MPIU_INSTR_HANDLE_##name_.data,0,MPIU_INSTR_MAX_DATA*sizeof(int));\
+ MPIU_INSTR_AddHandle( &MPIU_INSTR_HANDLE_##name_ );
+#define MPIU_INSTR_DURATION_START(name_) \
+ MPID_Wtime( &MPIU_INSTR_HANDLE_##name_.curstart )
+#define MPIU_INSTR_DURATION_END(name_) \
+ do { \
+ MPID_Time_t curend; MPID_Wtime( &curend );\
+ MPID_Wtime_acc( &MPIU_INSTR_HANDLE_##name_.curstart, \
+ &curend, \
+ &MPIU_INSTR_HANDLE_##name_.ttime );\
+ MPIU_INSTR_HANDLE_##name_.count++; } while(0)
+
+#define MPIU_INSTR_DURATION_INCR(name_,idx_,incr_) \
+ MPIU_INSTR_HANDLE_##name_.data[idx_] += incr_;
+#define MPIU_INSTR_DURATION_MAX(name_,idx_,incr_) \
+ MPIU_INSTR_HANDLE_##name_.data[idx_] = \
+ incr_ > MPIU_INSTR_HANDLE_##name_.data[idx_] ? \
+ incr_ : MPIU_INSTR_HANDLE_##name_.data[idx_];
+
+#else
+/* Define null versions of macros (these are empty statements) */
+#define MPIU_INSTR_DURATION_DECL(name_)
+#define MPIU_INSTR_DURATION_EXTERN_DECL(name_)
+#define MPIU_INSTR_DURATION_INIT(name_,nitems_,desc_)
+#define MPIU_INSTR_DURATION_START(name_)
+#define MPIU_INSTR_DURATION_END(name_)
+#define MPIU_INSTR_DURATION_INCR(name_,idx_,incr_)
+#define MPIU_INSTR_DURATION_MAX(name_,idx_,incr_)
+
+#endif /* USE_MPIU_INSTR */
+
+#endif
Modified: mpich2/trunk/src/mpi/rma/accumulate.c
===================================================================
--- mpich2/trunk/src/mpi/rma/accumulate.c 2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpi/rma/accumulate.c 2010-11-06 13:08:15 UTC (rev 7416)
@@ -126,7 +126,7 @@
MPID_Datatype_committed_ptr(datatype_ptr, mpi_errno);
}
- MPID_Comm_get_ptr(win_ptr->comm, comm_ptr);
+ comm_ptr = win_ptr->comm_ptr;
MPIR_ERRTEST_SEND_RANK(comm_ptr, target_rank, mpi_errno);
if (mpi_errno != MPI_SUCCESS) goto fn_fail;
Modified: mpich2/trunk/src/mpi/rma/get.c
===================================================================
--- mpich2/trunk/src/mpi/rma/get.c 2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpi/rma/get.c 2010-11-06 13:08:15 UTC (rev 7416)
@@ -122,7 +122,7 @@
MPID_Datatype_committed_ptr(datatype_ptr, mpi_errno);
}
- MPID_Comm_get_ptr(win_ptr->comm, comm_ptr);
+ comm_ptr = win_ptr->comm_ptr;
MPIR_ERRTEST_SEND_RANK(comm_ptr, target_rank, mpi_errno);
if (mpi_errno != MPI_SUCCESS) goto fn_fail;
Modified: mpich2/trunk/src/mpi/rma/put.c
===================================================================
--- mpich2/trunk/src/mpi/rma/put.c 2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpi/rma/put.c 2010-11-06 13:08:15 UTC (rev 7416)
@@ -122,7 +122,7 @@
MPID_Datatype_committed_ptr(datatype_ptr, mpi_errno);
}
- MPID_Comm_get_ptr(win_ptr->comm, comm_ptr);
+ comm_ptr = win_ptr->comm_ptr;
MPIR_ERRTEST_SEND_RANK(comm_ptr, target_rank, mpi_errno);
if (mpi_errno != MPI_SUCCESS) goto fn_fail;
Modified: mpich2/trunk/src/mpi/rma/win_get_group.c
===================================================================
--- mpich2/trunk/src/mpi/rma/win_get_group.c 2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpi/rma/win_get_group.c 2010-11-06 13:08:15 UTC (rev 7416)
@@ -98,7 +98,7 @@
# endif /* HAVE_ERROR_CHECKING */
/* ... body of routine ... */
- MPID_Comm_get_ptr( win_ptr->comm, win_comm_ptr );
+ win_comm_ptr = win_ptr->comm_ptr;
mpi_errno = MPIR_Comm_group_impl(win_comm_ptr, &group_ptr);
if (mpi_errno != MPI_SUCCESS) goto fn_fail;
Modified: mpich2/trunk/src/mpi/rma/win_lock.c
===================================================================
--- mpich2/trunk/src/mpi/rma/win_lock.c 2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpi/rma/win_lock.c 2010-11-06 13:08:15 UTC (rev 7416)
@@ -117,7 +117,7 @@
MPI_ERR_OTHER,
"**locktype", 0 );
- MPID_Comm_get_ptr( win_ptr->comm, comm_ptr );
+ comm_ptr = win_ptr->comm_ptr;
MPIR_ERRTEST_SEND_RANK(comm_ptr, rank, mpi_errno);
if (mpi_errno) goto fn_fail;
Modified: mpich2/trunk/src/mpi/rma/win_unlock.c
===================================================================
--- mpich2/trunk/src/mpi/rma/win_unlock.c 2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpi/rma/win_unlock.c 2010-11-06 13:08:15 UTC (rev 7416)
@@ -87,7 +87,7 @@
/* If win_ptr is not valid, it will be reset to null */
if (mpi_errno) goto fn_fail;
- MPID_Comm_get_ptr( win_ptr->comm, comm_ptr );
+ comm_ptr = win_ptr->comm_ptr;
MPIR_ERRTEST_SEND_RANK(comm_ptr, rank, mpi_errno);
if (mpi_errno) goto fn_fail;
Modified: mpich2/trunk/src/mpid/ch3/include/mpidimpl.h
===================================================================
--- mpich2/trunk/src/mpid/ch3/include/mpidimpl.h 2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpid/ch3/include/mpidimpl.h 2010-11-06 13:08:15 UTC (rev 7416)
@@ -1094,10 +1094,16 @@
#define MPIDI_RMAFNS_VERSION 1
int MPIDI_CH3_RMAFnsInit( MPIDI_RMAFns * );
+/* FIXME: These are specific to the RMA code and should be in the RMA
+ header file. */
#define MPIDI_RMA_PUT 23
#define MPIDI_RMA_GET 24
#define MPIDI_RMA_ACCUMULATE 25
#define MPIDI_RMA_LOCK 26
+
+/* Special case RMA operations */
+#define MPIDI_RMA_ACC_CONTIG 27
+
#define MPIDI_RMA_DATATYPE_BASIC 50
#define MPIDI_RMA_DATATYPE_DERIVED 51
@@ -1704,6 +1710,8 @@
MPIDI_msg_sz_t *, MPID_Request ** );
int MPIDI_CH3_PktHandler_Accumulate( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *,
MPIDI_msg_sz_t *, MPID_Request ** );
+int MPIDI_CH3_PktHandler_Accumulate_Immed( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *,
+ MPIDI_msg_sz_t *, MPID_Request ** );
int MPIDI_CH3_PktHandler_Get( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *,
MPIDI_msg_sz_t *, MPID_Request ** );
int MPIDI_CH3_PktHandler_GetResp( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *,
Modified: mpich2/trunk/src/mpid/ch3/include/mpidpkt.h
===================================================================
--- mpich2/trunk/src/mpid/ch3/include/mpidpkt.h 2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpid/ch3/include/mpidpkt.h 2010-11-06 13:08:15 UTC (rev 7416)
@@ -10,8 +10,12 @@
/* Enable the use of data within the message packet for small messages */
#define USE_EAGER_SHORT
#define MPIDI_EAGER_SHORT_INTS 4
+/* FIXME: This appears to assume that sizeof(int) == 4 (or at least >= 4) */
#define MPIDI_EAGER_SHORT_SIZE 16
+/* This is the number of ints that can be carried within an RMA packet */
+#define MPIDI_RMA_IMMED_INTS 1
+
/*
* MPIDI_CH3_Pkt_type_t
*
@@ -44,6 +48,8 @@
MPIDI_CH3_PKT_LOCK_GET_UNLOCK, /* optimization for single gets */
MPIDI_CH3_PKT_LOCK_ACCUM_UNLOCK, /* optimization for single accumulates */
/* RMA Packets end here */
+ MPIDI_CH3_PKT_ACCUM_IMMED, /* optimization for short accumulate */
+ /* FIXME: Add PUT, GET_IMMED packet types */
MPIDI_CH3_PKT_FLOW_CNTL_UPDATE, /* FIXME: Unused */
MPIDI_CH3_PKT_CLOSE,
MPIDI_CH3_PKT_END_CH3
@@ -193,6 +199,26 @@
}
MPIDI_CH3_Pkt_accum_t;
+typedef struct MPIDI_CH3_Pkt_accum_immed
+{
+ MPIDI_CH3_Pkt_type_t type;
+ void *addr;
+ int count;
+ /* FIXME: Compress datatype/op into a single word (immediate mode) */
+ MPI_Datatype datatype;
+ MPI_Op op;
+ /* FIXME: do we need these (use a regular accum packet if we do?) */
+ MPI_Win target_win_handle; /* Used in the last RMA operation in each
+ * epoch for decrementing rma op counter in
+ * active target rma and for unlocking window
+ * in passive target rma. Otherwise set to NULL*/
+ MPI_Win source_win_handle; /* Used in the last RMA operation in an
+ * epoch in the case of passive target rma
+ * with shared locks. Otherwise set to NULL*/
+ int data[MPIDI_RMA_IMMED_INTS];
+}
+MPIDI_CH3_Pkt_accum_immed_t;
+
typedef struct MPIDI_CH3_Pkt_lock
{
MPIDI_CH3_Pkt_type_t type;
@@ -276,6 +302,7 @@
MPIDI_CH3_Pkt_get_t get;
MPIDI_CH3_Pkt_get_resp_t get_resp;
MPIDI_CH3_Pkt_accum_t accum;
+ MPIDI_CH3_Pkt_accum_immed_t accum_immed;
MPIDI_CH3_Pkt_lock_t lock;
MPIDI_CH3_Pkt_lock_granted_t lock_granted;
MPIDI_CH3_Pkt_pt_rma_done_t pt_rma_done;
Modified: mpich2/trunk/src/mpid/ch3/include/mpidpre.h
===================================================================
--- mpich2/trunk/src/mpid/ch3/include/mpidpre.h 2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpid/ch3/include/mpidpre.h 2010-11-06 13:08:15 UTC (rev 7416)
@@ -170,7 +170,9 @@
int *disp_units; /* array of displacement units of all windows */\
MPI_Win *all_win_handles; /* array of handles to the window objects\
of all processes */ \
- struct MPIDI_RMA_ops *rma_ops_list; /* list of outstanding RMA requests */ \
+ struct MPIDI_RMA_ops *rma_ops_list_head; /* list of outstanding \
+ RMA requests */ \
+ struct MPIDI_RMA_ops *rma_ops_list_tail; \
volatile int lock_granted; /* flag to indicate whether lock has \
been granted to this process (as source) for \
passive target rma */ \
Modified: mpich2/trunk/src/mpid/ch3/include/mpidrma.h
===================================================================
--- mpich2/trunk/src/mpid/ch3/include/mpidrma.h 2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpid/ch3/include/mpidrma.h 2010-11-06 13:08:15 UTC (rev 7416)
@@ -28,6 +28,11 @@
/* for keeping track of RMA ops, which will be executed at the next sync call */
typedef struct MPIDI_RMA_ops {
struct MPIDI_RMA_ops *next; /* pointer to next element in list */
+ /* FIXME: It would be better to setup the packet that will be sent, at
+ least in most cases (if, as a result of the sync/ops/sync sequence,
+ a different packet type is needed, it can be extracted from the
+ information otherwise stored). */
+ /* FIXME: Use enum for RMA op type? */
int type; /* MPIDI_RMA_PUT, MPID_REQUEST_GET,
MPIDI_RMA_ACCUMULATE, MPIDI_RMA_LOCK */
void *origin_addr;
@@ -39,6 +44,10 @@
MPI_Datatype target_datatype;
MPI_Op op; /* for accumulate */
int lock_type; /* for win_lock */
+ /* Used to complete operations */
+ struct MPID_Request *request;
+ MPIDI_RMA_dtype_info dtype_info;
+ void *dataloop;
} MPIDI_RMA_ops;
typedef struct MPIDI_PT_single_op {
@@ -59,5 +68,4 @@
MPIDI_VC_t * vc;
struct MPIDI_PT_single_op *pt_single_op; /* to store info for lock-put-unlock optimization */
} MPIDI_Win_lock_queue;
-
#endif
Modified: mpich2/trunk/src/mpid/ch3/src/ch3u_handle_recv_pkt.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/src/ch3u_handle_recv_pkt.c 2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpid/ch3/src/ch3u_handle_recv_pkt.c 2010-11-06 13:08:15 UTC (rev 7416)
@@ -585,6 +585,8 @@
MPIDI_CH3_PktHandler_LockAccumUnlock;
pktArray[MPIDI_CH3_PKT_LOCK_GET_UNLOCK] =
MPIDI_CH3_PktHandler_LockGetUnlock;
+ pktArray[MPIDI_CH3_PKT_ACCUM_IMMED] =
+ MPIDI_CH3_PktHandler_Accumulate_Immed;
/* End of default RMA operations */
fn_fail:
Modified: mpich2/trunk/src/mpid/ch3/src/ch3u_handle_recv_req.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/src/ch3u_handle_recv_req.c 2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpid/ch3/src/ch3u_handle_recv_req.c 2010-11-06 13:08:15 UTC (rev 7416)
@@ -630,7 +630,7 @@
if (HANDLE_GET_KIND(rreq->dev.op) == HANDLE_KIND_BUILTIN)
{
/* get the function by indexing into the op table */
- uop = MPIR_Op_table[(rreq->dev.op)%16 - 1];
+ uop = MPIR_Op_table[((rreq->dev.op)&0xf) - 1];
}
else
{
@@ -956,7 +956,7 @@
if (HANDLE_GET_KIND(single_op->op) == HANDLE_KIND_BUILTIN)
{
/* get the function by indexing into the op table */
- uop = MPIR_Op_table[(single_op->op)%16 - 1];
+ uop = MPIR_Op_table[((single_op->op)&0xf) - 1];
}
else
{
Modified: mpich2/trunk/src/mpid/ch3/src/ch3u_rma_ops.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/src/ch3u_rma_ops.c 2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpid/ch3/src/ch3u_rma_ops.c 2010-11-06 13:08:15 UTC (rev 7416)
@@ -7,6 +7,18 @@
#include "mpidi_ch3_impl.h"
#include "mpidrma.h"
+static int enableShortACC=1;
+
+MPIU_THREADSAFE_INIT_DECL(initRMAoptions);
+#ifdef USE_MPIU_INSTR
+MPIU_INSTR_DURATION_DECL(wincreate_allgather);
+MPIU_INSTR_DURATION_DECL(winfree_rs);
+MPIU_INSTR_DURATION_DECL(winfree_complete);
+MPIU_INSTR_DURATION_DECL(rmaqueue_alloc);
+extern void MPIDI_CH3_RMA_InitInstr(void);
+#endif
+extern void MPIDI_CH3_RMA_SetAccImmed( int );
+
#define MPIDI_PASSIVE_TARGET_DONE_TAG 348297
#define MPIDI_PASSIVE_TARGET_RMA_TAG 563924
@@ -18,7 +30,7 @@
int MPIDI_Win_create(void *base, MPI_Aint size, int disp_unit, MPID_Info *info,
MPID_Comm *comm_ptr, MPID_Win **win_ptr )
{
- int mpi_errno=MPI_SUCCESS, i, comm_size, rank;
+ int mpi_errno=MPI_SUCCESS, i, k, comm_size, rank;
MPI_Aint *tmp_buf;
MPID_Comm *win_comm_ptr;
MPIU_CHKPMEM_DECL(4);
@@ -30,6 +42,26 @@
/* FIXME: There should be no unreferenced args */
MPIU_UNREFERENCED_ARG(info);
+ if(initRMAoptions) {
+ int rc;
+ MPIU_THREADSAFE_INIT_BLOCK_BEGIN(initRMAoptions);
+ /* Default is to enable the use of the immediate accumulate feature */
+ if (!MPL_env2bool( "MPICH_RMA_ACC_IMMED", &rc ))
+ rc = 1;
+ MPIDI_CH3_RMA_SetAccImmed(rc);
+#ifdef USE_MPIU_INSTR
+ /* Define all instrumentation handles used in the CH3 RMA code here */
+ MPIU_INSTR_DURATION_INIT(wincreate_allgather,0,"WIN_CREATE:Allgather");
+ MPIU_INSTR_DURATION_INIT(winfree_rs,0,"WIN_FREE:ReduceScatterBlock");
+ MPIU_INSTR_DURATION_INIT(winfree_complete,0,"WIN_FREE:Complete");
+ MPIU_INSTR_DURATION_INIT(rmaqueue_alloc,0,"Allocate RMA Queue element");
+ MPIDI_CH3_RMA_InitInstr();
+
+#endif
+ MPIU_THREADSAFE_INIT_CLEAR(initRMAoptions);
+ MPIU_THREADSAFE_INIT_BLOCK_END(initRMAoptions);
+ }
+
comm_size = comm_ptr->local_size;
rank = comm_ptr->rank;
@@ -46,7 +78,8 @@
(*win_ptr)->start_group_ptr = NULL;
(*win_ptr)->start_assert = 0;
(*win_ptr)->attributes = NULL;
- (*win_ptr)->rma_ops_list = NULL;
+ (*win_ptr)->rma_ops_list_head = NULL;
+ (*win_ptr)->rma_ops_list_tail = NULL;
(*win_ptr)->lock_granted = 0;
(*win_ptr)->current_lock_type = MPID_LOCK_NONE;
(*win_ptr)->shared_lock_ref_cnt = 0;
@@ -56,8 +89,10 @@
mpi_errno = MPIR_Comm_dup_impl(comm_ptr, &win_comm_ptr);
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
- (*win_ptr)->comm = win_comm_ptr->handle;
-
+ (*win_ptr)->comm_ptr = win_comm_ptr;
+ (*win_ptr)->myrank = rank;
+
+ MPIU_INSTR_DURATION_START(wincreate_allgather);
/* allocate memory for the base addresses, disp_units, and
completion counters of all processes */
MPIU_CHKPMEM_MALLOC((*win_ptr)->base_addrs, void **,
@@ -82,20 +117,22 @@
mpi_errno, "tmp_buf");
/* FIXME: This needs to be fixed for heterogeneous systems */
- tmp_buf[3*rank] = MPIU_PtrToAint(base);
+ tmp_buf[3*rank] = MPIU_PtrToAint(base);
tmp_buf[3*rank+1] = (MPI_Aint) disp_unit;
tmp_buf[3*rank+2] = (MPI_Aint) (*win_ptr)->handle;
mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
tmp_buf, 3 * sizeof(MPI_Aint), MPI_BYTE,
comm_ptr);
+ MPIU_INSTR_DURATION_END(wincreate_allgather);
if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
-
+
+ k = 0;
for (i=0; i<comm_size; i++)
{
- (*win_ptr)->base_addrs[i] = MPIU_AintToPtr(tmp_buf[3*i]);
- (*win_ptr)->disp_units[i] = (int) tmp_buf[3*i+1];
- (*win_ptr)->all_win_handles[i] = (MPI_Win) tmp_buf[3*i+2];
+ (*win_ptr)->base_addrs[i] = MPIU_AintToPtr(tmp_buf[k++]);
+ (*win_ptr)->disp_units[i] = (int) tmp_buf[k++];
+ (*win_ptr)->all_win_handles[i] = (MPI_Win) tmp_buf[k++];
}
fn_exit:
@@ -126,18 +163,20 @@
MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FREE);
- MPID_Comm_get_ptr( (*win_ptr)->comm, comm_ptr );
-
+ comm_ptr = (*win_ptr)->comm_ptr;
+ MPIU_INSTR_DURATION_START(winfree_rs);
mpi_errno = MPIR_Reduce_scatter_block_impl((*win_ptr)->pt_rma_puts_accs,
&total_pt_rma_puts_accs, 1,
MPI_INT, MPI_SUM, comm_ptr);
if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+ MPIU_INSTR_DURATION_END(winfree_rs);
if (total_pt_rma_puts_accs != (*win_ptr)->my_pt_rma_puts_accs)
{
MPID_Progress_state progress_state;
/* poke the progress engine until the two are equal */
+ MPIU_INSTR_DURATION_START(winfree_complete);
MPID_Progress_start(&progress_state);
while (total_pt_rma_puts_accs != (*win_ptr)->my_pt_rma_puts_accs)
{
@@ -151,6 +190,7 @@
/* --END ERROR HANDLING-- */
}
MPID_Progress_end(&progress_state);
+ MPIU_INSTR_DURATION_END(winfree_complete);
}
@@ -187,11 +227,10 @@
{
int mpi_errno = MPI_SUCCESS;
int dt_contig, rank, predefined;
- MPIDI_RMA_ops *curr_ptr, *prev_ptr, *new_ptr;
+ MPIDI_RMA_ops *new_ptr;
MPID_Datatype *dtp;
MPI_Aint dt_true_lb;
MPIDI_msg_sz_t data_sz;
- MPID_Comm *win_comm_ptr;
MPIU_CHKPMEM_DECL(1);
MPIDI_STATE_DECL(MPID_STATE_MPIDI_PUT);
@@ -205,13 +244,7 @@
goto fn_exit;
}
- /* FIXME: It makes sense to save the rank (and size) of the
- communicator in the window structure to speed up these operations,
- or to save a pointer to the communicator structure, rather than
- just the handle
- */
- MPID_Comm_get_ptr(win_ptr->comm, win_comm_ptr);
- rank = MPIR_Comm_rank(win_comm_ptr);
+ rank = win_ptr->myrank;
/* If the put is a local operation, do it here */
if (target_rank == rank)
@@ -223,22 +256,18 @@
else
{
/* queue it up */
- curr_ptr = win_ptr->rma_ops_list;
- prev_ptr = curr_ptr;
- while (curr_ptr != NULL)
- {
- prev_ptr = curr_ptr;
- curr_ptr = curr_ptr->next;
- }
-
- /* FIXME: Where does this memory get freed? */
+ /* FIXME: For short operations, should we use a (per-thread) pool? */
+ MPIU_INSTR_DURATION_START(rmaqueue_alloc);
MPIU_CHKPMEM_MALLOC(new_ptr, MPIDI_RMA_ops *, sizeof(MPIDI_RMA_ops),
mpi_errno, "RMA operation entry");
- if (prev_ptr != NULL)
- prev_ptr->next = new_ptr;
- else
- win_ptr->rma_ops_list = new_ptr;
-
+ MPIU_INSTR_DURATION_END(rmaqueue_alloc);
+ if (win_ptr->rma_ops_list_tail)
+ win_ptr->rma_ops_list_tail->next = new_ptr;
+ else
+ win_ptr->rma_ops_list_head = new_ptr;
+ win_ptr->rma_ops_list_tail = new_ptr;
+
+ /* FIXME: For contig and very short operations, use a streamlined op */
new_ptr->next = NULL;
new_ptr->type = MPIDI_RMA_PUT;
new_ptr->origin_addr = origin_addr;
@@ -290,9 +319,8 @@
MPIDI_msg_sz_t data_sz;
int dt_contig, rank, predefined;
MPI_Aint dt_true_lb;
- MPIDI_RMA_ops *curr_ptr, *prev_ptr, *new_ptr;
+ MPIDI_RMA_ops *new_ptr;
MPID_Datatype *dtp;
- MPID_Comm *win_comm_ptr;
MPIU_CHKPMEM_DECL(1);
MPIDI_STATE_DECL(MPID_STATE_MPIDI_GET);
@@ -306,10 +334,7 @@
goto fn_exit;
}
- /* FIXME: It makes sense to save the rank (and size) of the
- communicator in the window structure to speed up these operations */
- MPID_Comm_get_ptr(win_ptr->comm, win_comm_ptr);
- rank = MPIR_Comm_rank(win_comm_ptr);
+ rank = win_ptr->myrank;
/* If the get is a local operation, do it here */
if (target_rank == rank)
@@ -323,25 +348,17 @@
else
{
/* queue it up */
- curr_ptr = win_ptr->rma_ops_list;
- prev_ptr = curr_ptr;
- while (curr_ptr != NULL)
- {
- prev_ptr = curr_ptr;
- curr_ptr = curr_ptr->next;
- }
-
+ MPIU_INSTR_DURATION_START(rmaqueue_alloc);
MPIU_CHKPMEM_MALLOC(new_ptr, MPIDI_RMA_ops *, sizeof(MPIDI_RMA_ops),
mpi_errno, "RMA operation entry");
- if (prev_ptr != NULL)
- {
- prev_ptr->next = new_ptr;
- }
+ MPIU_INSTR_DURATION_END(rmaqueue_alloc);
+ if (win_ptr->rma_ops_list_tail)
+ win_ptr->rma_ops_list_tail->next = new_ptr;
else
- {
- win_ptr->rma_ops_list = new_ptr;
- }
+ win_ptr->rma_ops_list_head = new_ptr;
+ win_ptr->rma_ops_list_tail = new_ptr;
+ /* FIXME: For contig and very short operations, use a streamlined op */
new_ptr->next = NULL;
new_ptr->type = MPIDI_RMA_GET;
new_ptr->origin_addr = origin_addr;
@@ -394,9 +411,8 @@
MPIDI_msg_sz_t data_sz;
int dt_contig, rank, origin_predefined, target_predefined;
MPI_Aint dt_true_lb;
- MPIDI_RMA_ops *curr_ptr, *prev_ptr, *new_ptr;
+ MPIDI_RMA_ops *new_ptr;
MPID_Datatype *dtp;
- MPID_Comm *win_comm_ptr;
MPIU_CHKLMEM_DECL(2);
MPIU_CHKPMEM_DECL(1);
MPIDI_STATE_DECL(MPID_STATE_MPIDI_ACCUMULATE);
@@ -410,18 +426,13 @@
{
goto fn_exit;
}
+
+ rank = win_ptr->myrank;
- /* FIXME: It makes sense to save the rank (and size) of the
- communicator in the window structure to speed up these operations,
- or to save a pointer to the communicator structure, rather than
- just the handle
- */
- MPID_Comm_get_ptr(win_ptr->comm, win_comm_ptr);
- rank = MPIR_Comm_rank(win_comm_ptr);
-
MPIDI_CH3I_DATATYPE_IS_PREDEFINED(origin_datatype, origin_predefined);
MPIDI_CH3I_DATATYPE_IS_PREDEFINED(target_datatype, target_predefined);
+ /* Do != rank first (most likely branch?) */
if (target_rank == rank)
{
MPI_User_function *uop;
@@ -440,7 +451,7 @@
"**opnotpredefined %d", op );
/* get the function by indexing into the op table */
- uop = MPIR_Op_table[(op)%16 - 1];
+ uop = MPIR_Op_table[((op)&0xf) - 1];
if (origin_predefined && target_predefined)
{
@@ -524,25 +535,32 @@
else
{
/* queue it up */
- curr_ptr = win_ptr->rma_ops_list;
- prev_ptr = curr_ptr;
- while (curr_ptr != NULL)
- {
- prev_ptr = curr_ptr;
- curr_ptr = curr_ptr->next;
- }
-
+ MPIU_INSTR_DURATION_START(rmaqueue_alloc);
MPIU_CHKPMEM_MALLOC(new_ptr, MPIDI_RMA_ops *, sizeof(MPIDI_RMA_ops),
mpi_errno, "RMA operation entry");
- if (prev_ptr != NULL)
- {
- prev_ptr->next = new_ptr;
- }
+ MPIU_INSTR_DURATION_END(rmaqueue_alloc);
+ if (win_ptr->rma_ops_list_tail)
+ win_ptr->rma_ops_list_tail->next = new_ptr;
else
- {
- win_ptr->rma_ops_list = new_ptr;
+ win_ptr->rma_ops_list_head = new_ptr;
+ win_ptr->rma_ops_list_tail = new_ptr;
+
+ /* If predefined and contiguous, use a simplified element */
+ if (origin_predefined && target_predefined && enableShortACC) {
+ new_ptr->next = NULL;
+ new_ptr->type = MPIDI_RMA_ACC_CONTIG;
+ /* Only the information needed for the contig/predefined acc */
+ new_ptr->origin_addr = origin_addr;
+ new_ptr->origin_count = origin_count;
+ new_ptr->origin_datatype = origin_datatype;
+ new_ptr->target_rank = target_rank;
+ new_ptr->target_disp = target_disp;
+ new_ptr->target_count = target_count;
+ new_ptr->target_datatype = target_datatype;
+ new_ptr->op = op;
+ goto fn_exit;
}
-
+
new_ptr->next = NULL;
new_ptr->type = MPIDI_RMA_ACCUMULATE;
new_ptr->origin_addr = origin_addr;
Modified: mpich2/trunk/src/mpid/ch3/src/ch3u_rma_sync.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/src/ch3u_rma_sync.c 2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpid/ch3/src/ch3u_rma_sync.c 2010-11-06 13:08:15 UTC (rev 7416)
@@ -7,6 +7,63 @@
#include "mpidimpl.h"
#include "mpidrma.h"
+static int EnableImmedAcc = 1;
+void MPIDI_CH3_RMA_SetAccImmed( int flag )
+{
+ EnableImmedAcc = flag;
+}
+
+#ifdef USE_MPIU_INSTR
+MPIU_INSTR_DURATION_DECL(winfence_clearlock);
+MPIU_INSTR_DURATION_DECL(winfence_rs);
+MPIU_INSTR_DURATION_DECL(winfence_issue);
+MPIU_INSTR_DURATION_DECL(winfence_complete);
+MPIU_INSTR_DURATION_DECL(winfence_wait);
+MPIU_INSTR_DURATION_DECL(winfence_block);
+MPIU_INSTR_DURATION_DECL(winpost_clearlock);
+MPIU_INSTR_DURATION_DECL(winpost_sendsync);
+MPIU_INSTR_DURATION_DECL(winstart_clearlock);
+MPIU_INSTR_DURATION_DECL(wincomplete_issue);
+MPIU_INSTR_DURATION_DECL(wincomplete_complete);
+MPIU_INSTR_DURATION_DECL(wincomplete_recvsync);
+MPIU_INSTR_DURATION_DECL(winwait_wait);
+MPIU_INSTR_DURATION_DECL(winlock_getlocallock);
+MPIU_INSTR_DURATION_DECL(winunlock_getlock);
+MPIU_INSTR_DURATION_DECL(winunlock_issue);
+MPIU_INSTR_DURATION_DECL(winunlock_complete);
+MPIU_INSTR_DURATION_DECL(lockqueue_alloc);
+MPIU_INSTR_DURATION_DECL(rmapkt_acc);
+MPIU_INSTR_DURATION_DECL(rmapkt_acc_predef);
+MPIU_INSTR_DURATION_DECL(rmapkt_acc_immed);
+MPIU_INSTR_DURATION_EXTERN_DECL(rmaqueue_alloc);
+void MPIDI_CH3_RMA_InitInstr(void);
+
+void MPIDI_CH3_RMA_InitInstr(void)
+{
+ MPIU_INSTR_DURATION_INIT(lockqueue_alloc,0,"Allocate Lock Queue element");
+ MPIU_INSTR_DURATION_INIT(winfence_clearlock,1,"WIN_FENCE:Clear prior lock");
+ MPIU_INSTR_DURATION_INIT(winfence_rs,0,"WIN_FENCE:ReduceScatterBlock");
+ MPIU_INSTR_DURATION_INIT(winfence_issue,2,"WIN_FENCE:Issue RMA ops");
+ MPIU_INSTR_DURATION_INIT(winfence_complete,1,"WIN_FENCE:Complete RMA ops");
+ MPIU_INSTR_DURATION_INIT(winfence_wait,1,"WIN_FENCE:Wait for ops from other processes");
+ MPIU_INSTR_DURATION_INIT(winfence_block,0,"WIN_FENCE:Wait for any progress");
+ MPIU_INSTR_DURATION_INIT(winpost_clearlock,1,"WIN_POST:Clear prior lock");
+ MPIU_INSTR_DURATION_INIT(winpost_sendsync,1,"WIN_POST:Senc sync messages");
+ MPIU_INSTR_DURATION_INIT(winstart_clearlock,1,"WIN_START:Clear prior lock");
+ MPIU_INSTR_DURATION_INIT(wincomplete_recvsync,1,"WIN_COMPLETE:Recv sync messages");
+ MPIU_INSTR_DURATION_INIT(wincomplete_issue,2,"WIN_COMPLETE:Issue RMA ops");
+ MPIU_INSTR_DURATION_INIT(wincomplete_complete,1,"WIN_COMPLETE:Complete RMA ops");
+ MPIU_INSTR_DURATION_INIT(winwait_wait,1,"WIN_WAIT:Wait for ops from other processes");
+ MPIU_INSTR_DURATION_INIT(winlock_getlocallock,0,"WIN_LOCK:Get local lock");
+ MPIU_INSTR_DURATION_INIT(winunlock_issue,2,"WIN_UNLOCK:Issue RMA ops");
+ MPIU_INSTR_DURATION_INIT(winunlock_complete,1,"WIN_UNLOCK:Complete RMA ops");
+ MPIU_INSTR_DURATION_INIT(winunlock_getlock,0,"WIN_UNLOCK:Acquire lock");
+ MPIU_INSTR_DURATION_INIT(rmapkt_acc,0,"RMA:PKTHANDLER for Accumulate");
+ MPIU_INSTR_DURATION_INIT(rmapkt_acc_predef,0,"RMA:PKTHANDLER for Accumulate: predef dtype");
+ MPIU_INSTR_DURATION_INIT(rmapkt_acc_immed,0,"RMA:PKTHANDLER for Accum immed");
+}
+#endif
+
/*
* These routines provide a default implementation of the MPI RMA operations
* in terms of the low-level, two-sided channel operations. A channel
@@ -31,6 +88,11 @@
MPI_Win target_win_handle,
MPIDI_RMA_dtype_info * dtype_info,
void ** dataloop, MPID_Request ** request);
+static int MPIDI_CH3I_Send_contig_acc_msg(MPIDI_RMA_ops * rma_op,
+ MPID_Win * win_ptr,
+ MPI_Win source_win_handle,
+ MPI_Win target_win_handle,
+ MPID_Request ** request);
static int MPIDI_CH3I_Do_passive_target_rma(MPID_Win *win_ptr,
int *wait_for_rma_done_pkt);
static int MPIDI_CH3I_Send_lock_put_or_acc(MPID_Win *win_ptr);
@@ -48,16 +110,13 @@
int MPIDI_Win_fence(int assert, MPID_Win *win_ptr)
{
int mpi_errno = MPI_SUCCESS;
- int comm_size, done;
+ int comm_size;
int *rma_target_proc, *nops_to_proc, i, total_op_count, *curr_ops_cnt;
- MPIDI_RMA_ops *curr_ptr, *next_ptr;
+ MPIDI_RMA_ops *curr_ptr;
MPID_Comm *comm_ptr;
- MPID_Request **requests=NULL; /* array of requests */
MPI_Win source_win_handle, target_win_handle;
- MPIDI_RMA_dtype_info *dtype_infos=NULL;
- void **dataloops=NULL; /* to store dataloops for each datatype */
MPID_Progress_state progress_state;
- MPIU_CHKLMEM_DECL(6);
+ MPIU_CHKLMEM_DECL(3);
MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FENCE);
MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FENCE);
@@ -71,6 +130,7 @@
* have completed and the lock is released. */
if (win_ptr->current_lock_type != MPID_LOCK_NONE)
{
+ MPIU_INSTR_DURATION_START(winfence_clearlock);
MPID_Progress_start(&progress_state);
while (win_ptr->current_lock_type != MPID_LOCK_NONE)
{
@@ -82,9 +142,10 @@
MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
}
/* --END ERROR HANDLING-- */
-
+ MPIU_INSTR_DURATION_INCR(winfence_clearlock,0,1);
}
MPID_Progress_end(&progress_state);
+ MPIU_INSTR_DURATION_END(winfence_clearlock);
}
/* Note that the NOPRECEDE and NOSUCCEED must be specified by all processes
@@ -108,10 +169,10 @@
}
else
{
+ MPIDI_RMA_ops **prevNextPtr, *tmpptr;
+ MPIU_INSTR_DURATION_START(winfence_rs);
/* This is the second or later fence. Do all the preceding RMA ops. */
-
- MPID_Comm_get_ptr( win_ptr->comm, comm_ptr );
-
+ comm_ptr = win_ptr->comm_ptr;
/* First inform every process whether it is a target of RMA
ops from this process */
comm_size = comm_ptr->local_size;
@@ -131,7 +192,7 @@
/* set rma_target_proc[i] to 1 if rank i is a target of RMA
ops from this process */
total_op_count = 0;
- curr_ptr = win_ptr->rma_ops_list;
+ curr_ptr = win_ptr->rma_ops_list_head;
while (curr_ptr != NULL)
{
total_op_count++;
@@ -143,22 +204,8 @@
MPIU_CHKLMEM_MALLOC(curr_ops_cnt, int *, comm_size*sizeof(int),
mpi_errno, "curr_ops_cnt");
for (i=0; i<comm_size; i++) curr_ops_cnt[i] = 0;
-
- if (total_op_count != 0)
- {
- MPIU_CHKLMEM_MALLOC(requests, MPID_Request **,
- total_op_count*sizeof(MPID_Request*),
- mpi_errno, "requests");
- MPIU_CHKLMEM_MALLOC(dtype_infos, MPIDI_RMA_dtype_info *,
- total_op_count*sizeof(MPIDI_RMA_dtype_info),
- mpi_errno, "dtype_infos");
- MPIU_CHKLMEM_MALLOC(dataloops, void **,
- total_op_count*sizeof(void*),
- mpi_errno, "dataloops");
- for (i=0; i<total_op_count; i++) dataloops[i] = NULL;
- }
-
- /* do a reduce_scatter_block (with MPI_SUM) on rma_target_proc. As a result,
+ /* do a reduce_scatter_block (with MPI_SUM) on rma_target_proc.
+ As a result,
each process knows how many other processes will be doing
RMA ops on its window */
@@ -167,6 +214,7 @@
mpi_errno = MPIR_Reduce_scatter_block_impl(MPI_IN_PLACE, rma_target_proc, 1,
MPI_INT, MPI_SUM, comm_ptr);
+ MPIU_INSTR_DURATION_END(winfence_rs);
/* result is stored in rma_target_proc[0] */
if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
@@ -175,9 +223,13 @@
procs have the address and could decrement it. */
win_ptr->my_counter = win_ptr->my_counter - comm_size +
rma_target_proc[0];
-
+
+ MPIU_INSTR_DURATION_START(winfence_issue);
+ MPIU_INSTR_DURATION_INCR(winfence_issue,0,total_op_count);
+ MPIU_INSTR_DURATION_MAX(winfence_issue,1,total_op_count);
i = 0;
- curr_ptr = win_ptr->rma_ops_list;
+ curr_ptr = win_ptr->rma_ops_list_head;
+ prevNextPtr = &win_ptr->rma_ops_list_head;
while (curr_ptr != NULL)
{
/* The completion counter at the target is decremented only on
@@ -191,22 +243,28 @@
source_win_handle = MPI_WIN_NULL;
target_win_handle = win_ptr->all_win_handles[curr_ptr->target_rank];
-
+
+ curr_ptr->dataloop = 0;
switch (curr_ptr->type)
{
case (MPIDI_RMA_PUT):
case (MPIDI_RMA_ACCUMULATE):
mpi_errno = MPIDI_CH3I_Send_rma_msg(curr_ptr, win_ptr,
source_win_handle, target_win_handle,
- &dtype_infos[i],
- &dataloops[i], &requests[i]);
+ &curr_ptr->dtype_info,
+ &curr_ptr->dataloop, &curr_ptr->request);
if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
break;
+ case MPIDI_RMA_ACC_CONTIG:
+ mpi_errno = MPIDI_CH3I_Send_contig_acc_msg(curr_ptr, win_ptr,
+ source_win_handle, target_win_handle,
+ &curr_ptr->request );
+ break;
case (MPIDI_RMA_GET):
mpi_errno = MPIDI_CH3I_Recv_rma_msg(curr_ptr, win_ptr,
source_win_handle, target_win_handle,
- &dtype_infos[i],
- &dataloops[i], &requests[i]);
+ &curr_ptr->dtype_info,
+ &curr_ptr->dataloop, &curr_ptr->request);
if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
break;
default:
@@ -214,87 +272,119 @@
}
i++;
curr_ops_cnt[curr_ptr->target_rank]++;
- curr_ptr = curr_ptr->next;
+ /* If the request is null, we can remove it immediately */
+ if (!curr_ptr->request) {
+ if (curr_ptr->dataloop != NULL) {
+ MPIU_Free(curr_ptr->dataloop); /* allocated in send_rma_msg or
+ recv_rma_msg */
+ }
+ tmpptr = curr_ptr->next;
+ *prevNextPtr = tmpptr;
+ MPIU_Free( curr_ptr );
+ curr_ptr = tmpptr;
+ }
+ else {
+ curr_ptr = curr_ptr->next;
+ prevNextPtr = &curr_ptr->next;
+ /* FIXME: We could at least occasionally try to wait
+ on completion of the pending send requests rather than
+ focus on filling the queues. */
+ }
}
-
-
+ MPIU_INSTR_DURATION_END(winfence_issue);
+
+ /* We replaced a loop over an array of requests with a list of the
+ incomplete requests. The reason to do
+ that is for long lists - processing the entire list until
+ all are done introduces a potentially n^2 time. In
+ testing with test/mpi/perf/manyrma.c , the number of iterations
+ within the "while (total_op_count)" loop was O(total_op_count).
+
+ Another alternative is to create a more compressed list (storing
+ only the necessary information), reducing the number of cache lines
+ needed while looping through the requests.
+ */
if (total_op_count)
{
- done = 1;
+ int ntimes = 0;
+ MPIU_INSTR_DURATION_START(winfence_complete);
MPID_Progress_start(&progress_state);
- while (total_op_count)
- {
- for (i=0; i<total_op_count; i++)
- {
- if (requests[i] != NULL)
- {
- if (!MPID_Request_is_complete(requests[i]))
- {
- done = 0;
- break;
- }
- else
- {
- mpi_errno = requests[i]->status.MPI_ERROR;
+ /* Process all operations until they are complete */
+ while (win_ptr->rma_ops_list_head) {
+ int loopcount = 0;
+ prevNextPtr = &win_ptr->rma_ops_list_head;
+ ntimes++;
+ curr_ptr = win_ptr->rma_ops_list_head;
+ do {
+ if (MPID_Request_is_complete(curr_ptr->request)) {
+ /* Once we find a complete request, we complete
+ as many as possible until we find an incomplete
+ or null request */
+ do {
+ mpi_errno = curr_ptr->request->status.MPI_ERROR;
/* --BEGIN ERROR HANDLING-- */
- if (mpi_errno != MPI_SUCCESS)
- {
+ if (mpi_errno != MPI_SUCCESS) {
MPID_Progress_end(&progress_state);
MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winRMAmessage");
}
/* --END ERROR HANDLING-- */
- /* if origin datatype was a derived
- datatype, it will get freed when the
- request gets freed. */
- MPID_Request_release(requests[i]);
- requests[i] = NULL;
+ MPID_Request_release(curr_ptr->request);
+ if (curr_ptr->dataloop != NULL) {
+ MPIU_Free(curr_ptr->dataloop); /* allocated in send_rma_msg or
+ recv_rma_msg */
+ }
+ /* We can remove and free this rma op element */
+ tmpptr = curr_ptr->next;
+ *prevNextPtr = tmpptr;
+ MPIU_Free( curr_ptr );
+ curr_ptr = tmpptr;
}
+ while (curr_ptr &&
+ MPID_Request_is_complete(curr_ptr->request));
+ /* Once a request completes, we wait for another
+ operation to arrive rather than check the
+ rest of the requests. */
+ break;
}
+ else {
+ /* In many cases, if the list of pending requests
+ is long, there's no point in checking the entire
+ list */
+ if (loopcount++ > 4) /* FIXME: threshold as parameter */
+ break; /* wait for an event */
+ prevNextPtr = &curr_ptr->next;
+ curr_ptr = curr_ptr->next;
+ }
+ } while (curr_ptr);
+
+ /* Wait for something to arrive*/
+ /* In some tests, this hung unless the test ensured that
+ there was an incomplete request. */
+ curr_ptr = win_ptr->rma_ops_list_head;
+ if (curr_ptr && !MPID_Request_is_complete(curr_ptr->request) ) {
+ MPIU_INSTR_DURATION_START(winfence_block);
+ mpi_errno = MPID_Progress_wait(&progress_state);
+ /* --BEGIN ERROR HANDLING-- */
+ if (mpi_errno != MPI_SUCCESS) {
+ MPID_Progress_end(&progress_state);
+ MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
+ }
+ /* --END ERROR HANDLING-- */
+ MPIU_INSTR_DURATION_END(winfence_block);
}
-
- if (done)
- {
- break;
- }
-
- mpi_errno = MPID_Progress_wait(&progress_state);
- /* --BEGIN ERROR HANDLING-- */
- if (mpi_errno != MPI_SUCCESS) {
- MPID_Progress_end(&progress_state);
- MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
- }
- /* --END ERROR HANDLING-- */
-
- done = 1;
- }
+ } /* While list of rma operation is non-empty */
MPID_Progress_end(&progress_state);
+ MPIU_INSTR_DURATION_INCR(winfence_complete,0,ntimes);
+ MPIU_INSTR_DURATION_END(winfence_complete);
}
- if (total_op_count != 0)
- {
- for (i=0; i<total_op_count; i++)
- {
- if (dataloops[i] != NULL)
- {
- MPIU_Free(dataloops[i]); /* allocated in send_rma_msg or
- recv_rma_msg */
- }
- }
- }
+ win_ptr->rma_ops_list_head = NULL;
+ win_ptr->rma_ops_list_tail = NULL;
- /* free MPIDI_RMA_ops_list */
- curr_ptr = win_ptr->rma_ops_list;
- while (curr_ptr != NULL)
- {
- next_ptr = curr_ptr->next;
- MPIU_Free(curr_ptr);
- curr_ptr = next_ptr;
- }
- win_ptr->rma_ops_list = NULL;
-
/* wait for all operations from other processes to finish */
if (win_ptr->my_counter)
{
+ MPIU_INSTR_DURATION_START(winfence_wait);
MPID_Progress_start(&progress_state);
while (win_ptr->my_counter)
{
@@ -305,8 +395,10 @@
MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
}
/* --END ERROR HANDLING-- */
+ MPIU_INSTR_DURATION_INCR(winfence_wait,0,1);
}
MPID_Progress_end(&progress_state);
+ MPIU_INSTR_DURATION_END(winfence_wait);
}
if (assert & MPI_MODE_NOSUCCEED)
@@ -470,7 +562,7 @@
fflush(stdout);
*/
- MPID_Comm_get_ptr(win_ptr->comm, comm_ptr);
+ comm_ptr = win_ptr->comm_ptr;
MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
MPIDI_CH3I_DATATYPE_IS_PREDEFINED(rma_op->origin_datatype, predefined);
@@ -641,8 +733,120 @@
/* --END ERROR HANDLING-- */
}
+/*
+ * MPIDI_CH3I_Send_contig_acc_msg - send one accumulate operation whose
+ * origin data is taken directly from rma_op->origin_addr.
+ * Use this for contiguous accumulate operations
+ *
+ * Two paths are taken, chosen by the payload length
+ * (rma_op->origin_count * size of rma_op->origin_datatype):
+ *  - If EnableImmedAcc is set and the payload is no larger than
+ *    MPIDI_RMA_IMMED_INTS*sizeof(int) bytes, the data is copied into an
+ *    MPIDI_CH3_PKT_ACCUM_IMMED packet, which is sent with iStartMsg.
+ *  - Otherwise an MPIDI_CH3_PKT_ACCUMULATE header (dataloop_size 0) and the
+ *    origin buffer are sent as a two-entry iov with iStartMsgv.
+ *
+ * rma_op            - queued operation; supplies the origin buffer, count and
+ *                     datatype, the target rank, displacement, count and
+ *                     datatype, and the reduction op
+ * win_ptr           - window; supplies base_addrs[], disp_units[] and comm_ptr
+ * source_win_handle - copied unchanged into the packet
+ * target_win_handle - copied unchanged into the packet
+ * request           - output; set to NULL on entry and again on failure,
+ *                     otherwise whatever the channel send call stored in it
+ *
+ * Returns mpi_errno (MPI_SUCCESS unless a send call reported an error).
+ */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_Send_contig_acc_msg
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int MPIDI_CH3I_Send_contig_acc_msg(MPIDI_RMA_ops *rma_op,
+ MPID_Win *win_ptr,
+ MPI_Win source_win_handle,
+ MPI_Win target_win_handle,
+ MPID_Request **request)
+{
+ MPIDI_CH3_Pkt_t upkt;
+ MPIDI_CH3_Pkt_accum_t *accum_pkt = &upkt.accum;
+ MPID_IOV iov[MPID_IOV_LIMIT];
+ int mpi_errno=MPI_SUCCESS;
+ int origin_type_size, iovcnt;
+ MPIDI_VC_t * vc;
+ MPID_Comm *comm_ptr;
+ int len;
+ MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SEND_CONTIG_ACC_MSG);
+ MPIDI_STATE_DECL(MPID_STATE_MEMCPY);
+ MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SEND_CONTIG_ACC_MSG);
+ *request = NULL; /* so the fn_fail path can test *request safely;
+   presumably the send call leaves it NULL when nothing
+   remains pending - confirm against the channel API */
+
+ MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
+ /* FIXME: Make this size check efficient and match the packet type.
+    NOTE(review): len is an int while the right-hand side of the test
+    below involves sizeof, so the comparison is carried out in an
+    unsigned type. */
+ len = rma_op->origin_count * origin_type_size;
+ if (EnableImmedAcc && len <= MPIDI_RMA_IMMED_INTS*sizeof(int)) {
+ MPIDI_CH3_Pkt_accum_immed_t * accumi_pkt = &upkt.accum_immed;
+ void *dest = accumi_pkt->data, *src = rma_op->origin_addr;
+
+ MPIDI_Pkt_init(accumi_pkt, MPIDI_CH3_PKT_ACCUM_IMMED);
+ accumi_pkt->addr = (char *) win_ptr->base_addrs[rma_op->target_rank] +
+ win_ptr->disp_units[rma_op->target_rank] * rma_op->target_disp; /* base at target + scaled displacement */
+ accumi_pkt->count = rma_op->target_count;
+ accumi_pkt->datatype = rma_op->target_datatype;
+ accumi_pkt->op = rma_op->op;
+ accumi_pkt->target_win_handle = target_win_handle;
+ accumi_pkt->source_win_handle = source_win_handle;
+
+ switch (len) { /* copy the payload into the packet: lengths 1, 2, 4
+   and 8 use a single typed store.
+   NOTE(review): the typed stores assume src and dest are
+   suitably aligned for the access width - confirm */
+ case 1: *(uint8_t *)dest = *(uint8_t *)src; break;
+ case 2: *(uint16_t *)dest = *(uint16_t *)src; break;
+ case 4: *(uint32_t *)dest = *(uint32_t *)src; break;
+ case 8: *(uint64_t *)dest = *(uint64_t *)src; break;
+ default: /* every other length, including 0 */
+ MPIU_Memcpy( accumi_pkt->data, (void *)rma_op->origin_addr, len );
+ }
+ comm_ptr = win_ptr->comm_ptr;
+ MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
+ MPIU_THREAD_CS_ENTER(CH3COMM,vc);
+ mpi_errno = MPIU_CALL(MPIDI_CH3,iStartMsg(vc, accumi_pkt, sizeof(*accumi_pkt), request)); /* whole packet is sent, whatever len is */
+ MPIU_THREAD_CS_EXIT(CH3COMM,vc);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+ goto fn_exit; /* immediate path done; skip the iov-based send below */
+ }
+
+ MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE); /* general path: header plus origin buffer */
+ accum_pkt->addr = (char *) win_ptr->base_addrs[rma_op->target_rank] +
+ win_ptr->disp_units[rma_op->target_rank] * rma_op->target_disp;
+ accum_pkt->count = rma_op->target_count;
+ accum_pkt->datatype = rma_op->target_datatype;
+ accum_pkt->dataloop_size = 0; /* only the header and the data are placed in the iov below */
+ accum_pkt->op = rma_op->op;
+ accum_pkt->target_win_handle = target_win_handle;
+ accum_pkt->source_win_handle = source_win_handle;
+
+ iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) accum_pkt;
+ iov[0].MPID_IOV_LEN = sizeof(*accum_pkt);
+
+ /* printf("send pkt: type %d, addr %d, count %d, base %d\n", rma_pkt->type,
+ rma_pkt->addr, rma_pkt->count, win_ptr->base_addrs[rma_op->target_rank]);
+ fflush(stdout);
+ */
+
+ comm_ptr = win_ptr->comm_ptr;
+ MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
+
+
+ /* basic datatype on target */
+ /* basic datatype on origin */
+ /* FIXME: This is still very heavyweight for a small message operation,
+ such as a single word update */
+ /* One possibility is to use iStartMsg with a buffer that is just large
+ enough, though note that nemesis has an optimization for this */
+ iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)rma_op->origin_addr;
+ iov[1].MPID_IOV_LEN = rma_op->origin_count * origin_type_size;
+ iovcnt = 2;
+ MPIU_THREAD_CS_ENTER(CH3COMM,vc);
+ mpi_errno = MPIU_CALL(MPIDI_CH3,iStartMsgv(vc, iov, iovcnt, request));
+ MPIU_THREAD_CS_EXIT(CH3COMM,vc);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+
+ fn_exit:
+ MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SEND_CONTIG_ACC_MSG);
+ return mpi_errno;
+ /* --BEGIN ERROR HANDLING-- */
+ fn_fail: /* presumably the target of MPIU_ERR_CHKANDJUMP - confirm */
+ if (*request)
+ {
+ MPIU_Object_set_ref(*request, 0);
+ MPIDI_CH3_Request_destroy(*request);
+ }
+ *request = NULL; /* the caller never sees a destroyed request */
+ goto fn_exit;
+ /* --END ERROR HANDLING-- */
+}
+
+
+
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Recv_rma_msg
#undef FCNAME
@@ -708,7 +912,7 @@
fflush(stdout);
*/
- MPID_Comm_get_ptr(win_ptr->comm, comm_ptr);
+ comm_ptr = win_ptr->comm_ptr;
MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
MPIDI_CH3I_DATATYPE_IS_PREDEFINED(rma_op->target_datatype, predefined);
@@ -790,8 +994,6 @@
}
-
-
#undef FUNCNAME
#define FUNCNAME MPIDI_Win_post
#undef FCNAME
@@ -824,6 +1026,7 @@
{
MPID_Progress_state progress_state;
+ MPIU_INSTR_DURATION_START(winpost_clearlock);
/* poke the progress engine */
MPID_Progress_start(&progress_state);
while (win_ptr->current_lock_type != MPID_LOCK_NONE)
@@ -835,8 +1038,10 @@
MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
}
/* --END ERROR HANDLING-- */
+ MPIU_INSTR_DURATION_INCR(winpost_clearlock,0,1);
}
MPID_Progress_end(&progress_state);
+ MPIU_INSTR_DURATION_END(winpost_clearlock);
}
post_grp_size = post_grp_ptr->size;
@@ -848,12 +1053,14 @@
{
MPI_Request *req;
MPI_Status *status;
+
+ MPIU_INSTR_DURATION_START(winpost_sendsync);
/* NOCHECK not specified. We need to notify the source
processes that Post has been called. */
/* We need to translate the ranks of the processes in
- post_group to ranks in win_ptr->comm, so that we
+ post_group to ranks in win_ptr->comm_ptr, so that we
can do communication */
MPIU_CHKLMEM_MALLOC(ranks_in_post_grp, int *,
@@ -868,7 +1075,7 @@
ranks_in_post_grp[i] = i;
}
- MPID_Comm_get_ptr( win_ptr->comm, win_comm_ptr );
+ win_comm_ptr = win_ptr->comm_ptr;
mpi_errno = MPIR_Comm_group_impl(win_comm_ptr, &win_grp_ptr);
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
@@ -877,15 +1084,20 @@
MPIR_Group_translate_ranks_impl(post_grp_ptr, post_grp_size, ranks_in_post_grp,
win_grp_ptr, ranks_in_win_grp);
- rank = MPIR_Comm_rank(win_comm_ptr);
+ rank = win_ptr->myrank;
MPIU_CHKLMEM_MALLOC(req, MPI_Request *, post_grp_size * sizeof(MPI_Request), mpi_errno, "req");
MPIU_CHKLMEM_MALLOC(status, MPI_Status *, post_grp_size*sizeof(MPI_Status), mpi_errno, "status");
/* Send a 0-byte message to the source processes */
+ MPIU_INSTR_DURATION_INCR(winpost_sendsync,0,post_grp_size);
for (i = 0; i < post_grp_size; i++) {
dst = ranks_in_win_grp[i];
-
+
+ /* FIXME: Short messages like this shouldn't normally need a
+ request - this should consider using the ch3 call to send
+ a short message and return a request only if the message is
+ not delivered. */
if (dst != rank) {
MPID_Request *req_ptr;
mpi_errno = MPID_Isend(&i, 0, MPI_INT, dst, SYNC_POST_TAG, win_comm_ptr,
@@ -912,6 +1124,7 @@
mpi_errno = MPIR_Group_free_impl(win_grp_ptr);
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ MPIU_INSTR_DURATION_END(winpost_sendsync);
}
fn_exit:
@@ -954,6 +1167,7 @@
{
MPID_Progress_state progress_state;
+ MPIU_INSTR_DURATION_START(winstart_clearlock);
/* poke the progress engine */
MPID_Progress_start(&progress_state);
while (win_ptr->current_lock_type != MPID_LOCK_NONE)
@@ -965,8 +1179,10 @@
MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
}
/* --END ERROR HANDLING-- */
+ MPIU_INSTR_DURATION_INCR(winstart_clearlock,0,1);
}
MPID_Progress_end(&progress_state);
+ MPIU_INSTR_DURATION_END(winstart_clearlock);
}
win_ptr->start_group_ptr = group_ptr;
@@ -989,12 +1205,9 @@
int mpi_errno = MPI_SUCCESS;
int comm_size, *nops_to_proc, src, new_total_op_count;
int i, j, dst, done, total_op_count, *curr_ops_cnt;
- MPIDI_RMA_ops *curr_ptr, *next_ptr;
+ MPIDI_RMA_ops *curr_ptr, *tmpptr, **prevNextPtr;
MPID_Comm *comm_ptr;
- MPID_Request **requests; /* array of requests */
MPI_Win source_win_handle, target_win_handle;
- MPIDI_RMA_dtype_info *dtype_infos=NULL;
- void **dataloops=NULL; /* to store dataloops for each datatype */
MPID_Group *win_grp_ptr;
int start_grp_size, *ranks_in_start_grp, *ranks_in_win_grp, rank;
MPIU_CHKLMEM_DECL(9);
@@ -1002,14 +1215,15 @@
MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_COMPLETE);
- MPID_Comm_get_ptr( win_ptr->comm, comm_ptr );
+ comm_ptr = win_ptr->comm_ptr;
comm_size = comm_ptr->local_size;
/* Translate the ranks of the processes in
- start_group to ranks in win_ptr->comm */
+ start_group to ranks in win_ptr->comm_ptr */
start_grp_size = win_ptr->start_group_ptr->size;
-
+
+ MPIU_INSTR_DURATION_START(wincomplete_recvsync);
MPIU_CHKLMEM_MALLOC(ranks_in_start_grp, int *, start_grp_size*sizeof(int),
mpi_errno, "ranks_in_start_grp");
@@ -1024,10 +1238,11 @@
mpi_errno = MPIR_Comm_group_impl(comm_ptr, &win_grp_ptr);
if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
- MPIR_Group_translate_ranks_impl(win_ptr->start_group_ptr, start_grp_size, ranks_in_start_grp,
+ MPIR_Group_translate_ranks_impl(win_ptr->start_group_ptr, start_grp_size,
+ ranks_in_start_grp,
win_grp_ptr, ranks_in_win_grp);
- rank = MPIR_Comm_rank(comm_ptr);
+ rank = win_ptr->myrank;
/* If MPI_MODE_NOCHECK was not specified, we need to check if
Win_post was called on the target processes. Wait for a 0-byte sync
@@ -1036,14 +1251,19 @@
{
MPI_Request *req;
MPI_Status *status;
-
+
MPIU_CHKLMEM_MALLOC(req, MPI_Request *, start_grp_size*sizeof(MPI_Request), mpi_errno, "req");
MPIU_CHKLMEM_MALLOC(status, MPI_Status *, start_grp_size*sizeof(MPI_Status), mpi_errno, "status");
+ MPIU_INSTR_DURATION_INCR(wincomplete_recvsync,0,start_grp_size);
for (i = 0; i < start_grp_size; i++) {
src = ranks_in_win_grp[i];
if (src != rank) {
MPID_Request *req_ptr;
+ /* FIXME: This is a heavyweight way to process these sync
+ messages - this should be handled with a special packet
+ type and callback function.
+ */
mpi_errno = MPID_Irecv(NULL, 0, MPI_INT, src, SYNC_POST_TAG,
comm_ptr, MPID_CONTEXT_INTRA_PT2PT, &req_ptr);
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
@@ -1067,28 +1287,31 @@
}
/* --END ERROR HANDLING-- */
}
-
+ MPIU_INSTR_DURATION_END(wincomplete_recvsync);
+
/* keep track of no. of ops to each proc. Needed for knowing
whether or not to decrement the completion counter. The
completion counter is decremented only on the last
operation. */
-
+
+ MPIU_INSTR_DURATION_START(wincomplete_issue);
+
MPIU_CHKLMEM_MALLOC(nops_to_proc, int *, comm_size*sizeof(int),
mpi_errno, "nops_to_proc");
for (i=0; i<comm_size; i++) nops_to_proc[i] = 0;
total_op_count = 0;
- curr_ptr = win_ptr->rma_ops_list;
+ curr_ptr = win_ptr->rma_ops_list_head;
while (curr_ptr != NULL)
{
nops_to_proc[curr_ptr->target_rank]++;
total_op_count++;
curr_ptr = curr_ptr->next;
}
-
- MPIU_CHKLMEM_MALLOC(requests, MPID_Request **,
- (total_op_count+start_grp_size) * sizeof(MPID_Request*),
- mpi_errno, "requests");
+
+ MPIU_INSTR_DURATION_INCR(wincomplete_issue,0,total_op_count);
+ MPIU_INSTR_DURATION_MAX(wincomplete_issue,1,total_op_count);
+
/* We allocate a few extra requests because if there are no RMA
ops to a target process, we need to send a 0-byte message just
to decrement the completion counter. */
@@ -1096,19 +1319,10 @@
MPIU_CHKLMEM_MALLOC(curr_ops_cnt, int *, comm_size*sizeof(int),
mpi_errno, "curr_ops_cnt");
for (i=0; i<comm_size; i++) curr_ops_cnt[i] = 0;
-
- if (total_op_count != 0)
- {
- MPIU_CHKLMEM_MALLOC(dtype_infos, MPIDI_RMA_dtype_info *,
- total_op_count*sizeof(MPIDI_RMA_dtype_info),
- mpi_errno, "dtype_infos");
- MPIU_CHKLMEM_MALLOC(dataloops, void **, total_op_count*sizeof(void*),
- mpi_errno, "dataloops");
- for (i=0; i<total_op_count; i++) dataloops[i] = NULL;
- }
i = 0;
- curr_ptr = win_ptr->rma_ops_list;
+ prevNextPtr = &win_ptr->rma_ops_list_head;
+ curr_ptr = win_ptr->rma_ops_list_head;
while (curr_ptr != NULL)
{
/* The completion counter at the target is decremented only on
@@ -1122,22 +1336,28 @@
source_win_handle = MPI_WIN_NULL;
target_win_handle = win_ptr->all_win_handles[curr_ptr->target_rank];
-
+
+ curr_ptr->dataloop = 0;
switch (curr_ptr->type)
{
case (MPIDI_RMA_PUT):
case (MPIDI_RMA_ACCUMULATE):
mpi_errno = MPIDI_CH3I_Send_rma_msg(curr_ptr, win_ptr,
source_win_handle, target_win_handle,
- &dtype_infos[i],
- &dataloops[i], &requests[i]);
+ &curr_ptr->dtype_info,
+ &curr_ptr->dataloop, &curr_ptr->request);
if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
break;
+ case MPIDI_RMA_ACC_CONTIG:
+ mpi_errno = MPIDI_CH3I_Send_contig_acc_msg(curr_ptr, win_ptr,
+ source_win_handle, target_win_handle,
+ &curr_ptr->request );
+ break;
case (MPIDI_RMA_GET):
mpi_errno = MPIDI_CH3I_Recv_rma_msg(curr_ptr, win_ptr,
source_win_handle, target_win_handle,
- &dtype_infos[i],
- &dataloops[i], &requests[i]);
+ &curr_ptr->dtype_info,
+ &curr_ptr->dataloop, &curr_ptr->request);
if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
break;
default:
@@ -1145,8 +1365,23 @@
}
i++;
curr_ops_cnt[curr_ptr->target_rank]++;
- curr_ptr = curr_ptr->next;
+ /* If the request is null, we can remove it immediately */
+ if (!curr_ptr->request) {
+ if (curr_ptr->dataloop != NULL) {
+ MPIU_Free(curr_ptr->dataloop); /* allocated in send_rma_msg or
+ recv_rma_msg */
+ }
+ tmpptr = curr_ptr->next;
+ *prevNextPtr = tmpptr;
+ MPIU_Free( curr_ptr );
+ curr_ptr = tmpptr;
+ }
+ else {
+ curr_ptr = curr_ptr->next;
+ prevNextPtr = &curr_ptr->next;
+ }
}
+ MPIU_INSTR_DURATION_END(wincomplete_issue);
/* If the start_group included some processes that did not end up
becoming targets of RMA operations from this process, we need
@@ -1167,6 +1402,7 @@
MPIDI_CH3_Pkt_t upkt;
MPIDI_CH3_Pkt_put_t *put_pkt = &upkt.put;
MPIDI_VC_t * vc;
+ MPID_Request *request;
MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
put_pkt->addr = NULL;
@@ -1180,80 +1416,111 @@
MPIU_THREAD_CS_ENTER(CH3COMM,vc);
mpi_errno = MPIU_CALL(MPIDI_CH3,iStartMsg(vc, put_pkt,
sizeof(*put_pkt),
- &requests[j]));
+ &request));
MPIU_THREAD_CS_EXIT(CH3COMM,vc);
if (mpi_errno != MPI_SUCCESS) {
MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|rmamsg" );
}
+ /* In the unlikely event that a request is returned (the message
+ is not sent yet), add it to the list of pending operations */
+ if (request) {
+ /* It's hard to use the automatic allocator here, as those
+ macros are optimized for a known maximum number of items. */
+ MPIDI_RMA_ops *new_ptr;
+ new_ptr = (MPIDI_RMA_ops *)MPIU_Malloc(sizeof(MPIDI_RMA_ops) );
+ /* --BEGIN ERROR HANDLING-- */
+ if (!new_ptr) {
+ MPIU_CHKMEM_SETERR(mpi_errno,sizeof(MPIDI_RMA_ops),
+ "RMA operation entry");
+ goto fn_fail;
+ }
+ /* --END ERROR HANDLING-- */
+ if (win_ptr->rma_ops_list_tail)
+ win_ptr->rma_ops_list_tail->next = new_ptr;
+ else
+ win_ptr->rma_ops_list_head = new_ptr;
+ win_ptr->rma_ops_list_tail = new_ptr;
+ new_ptr->next = NULL;
+ new_ptr->request = request;
+ new_ptr->dataloop = 0;
+ }
j++;
new_total_op_count++;
}
}
-
+
if (new_total_op_count)
{
MPID_Progress_state progress_state;
done = 1;
+ MPIU_INSTR_DURATION_START(wincomplete_complete);
MPID_Progress_start(&progress_state);
- while (new_total_op_count)
- {
- for (i=0; i<new_total_op_count; i++)
- {
- if (requests[i] != NULL)
- {
- if (!MPID_Request_is_complete(requests[i]))
- {
- done = 0;
- break;
- }
- else
- {
- mpi_errno = requests[i]->status.MPI_ERROR;
+ while (win_ptr->rma_ops_list_head) {
+ prevNextPtr = &win_ptr->rma_ops_list_head;
+ curr_ptr = win_ptr->rma_ops_list_head;
+ do {
+ if (MPID_Request_is_complete(curr_ptr->request)) {
+ /* Once we find a complete request, we complete
+ as many as possible until we find an incomplete
+ or null request */
+ do {
+ mpi_errno = curr_ptr->request->status.MPI_ERROR;
/* --BEGIN ERROR HANDLING-- */
- if (mpi_errno != MPI_SUCCESS)
- {
+ if (mpi_errno != MPI_SUCCESS) {
MPID_Progress_end(&progress_state);
- MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winRMArequest");
+ MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winRMAmessage");
}
/* --END ERROR HANDLING-- */
- MPID_Request_release(requests[i]);
- requests[i] = NULL;
+ MPID_Request_release(curr_ptr->request);
+ if (curr_ptr->dataloop != NULL) {
+ MPIU_Free(curr_ptr->dataloop); /* allocated in send_rma_msg or
+ recv_rma_msg */
+ }
+ /* We can remove and free this rma op element */
+ tmpptr = curr_ptr->next;
+ *prevNextPtr = tmpptr;
+ MPIU_Free( curr_ptr );
+ curr_ptr = tmpptr;
}
+ while (curr_ptr &&
+ MPID_Request_is_complete(curr_ptr->request));
+ /* Once a request completes, we wait for another
+ operation to arrive rather than check the
+ rest of the requests. */
+ break;
}
+ else {
+ prevNextPtr = &curr_ptr->next;
+ curr_ptr = curr_ptr->next;
+ break;
+ }
+ } while (curr_ptr);
+
+ /* Wait for something to arrive*/
+ /* In some tests, this hung unless the test ensured that
+ there was an incomplete request. */
+ curr_ptr = win_ptr->rma_ops_list_head;
+ if (curr_ptr && !MPID_Request_is_complete(curr_ptr->request) ) {
+ MPIU_INSTR_DURATION_START(winfence_block);
+ mpi_errno = MPID_Progress_wait(&progress_state);
+ /* --BEGIN ERROR HANDLING-- */
+ if (mpi_errno != MPI_SUCCESS) {
+ MPID_Progress_end(&progress_state);
+ MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
+ }
+ /* --END ERROR HANDLING-- */
+ MPIU_INSTR_DURATION_END(winfence_block);
}
-
- if (done)
- {
- break;
- }
+ } /* While list of rma operation is non-empty */
- mpi_errno = MPID_Progress_wait(&progress_state);
- done = 1;
- }
+
MPID_Progress_end(&progress_state);
}
-
- if (total_op_count != 0)
- {
- for (i=0; i<total_op_count; i++)
- {
- if (dataloops[i] != NULL)
- {
- MPIU_Free(dataloops[i]);
- }
- }
- }
-
- /* free MPIDI_RMA_ops_list */
- curr_ptr = win_ptr->rma_ops_list;
- while (curr_ptr != NULL)
- {
- next_ptr = curr_ptr->next;
- MPIU_Free(curr_ptr);
- curr_ptr = next_ptr;
- }
- win_ptr->rma_ops_list = NULL;
+
+ MPIU_Assert( !win_ptr->rma_ops_list_head );
+ win_ptr->rma_ops_list_head = NULL;
+ win_ptr->rma_ops_list_tail = NULL;
mpi_errno = MPIR_Group_free_impl(win_grp_ptr);
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
@@ -1291,6 +1558,7 @@
{
MPID_Progress_state progress_state;
+ MPIU_INSTR_DURATION_START(winwait_wait);
MPID_Progress_start(&progress_state);
while (win_ptr->my_counter)
{
@@ -1303,8 +1571,10 @@
return mpi_errno;
}
/* --END ERROR HANDLING-- */
+ MPIU_INSTR_DURATION_INCR(winwait_wait,0,1)
}
MPID_Progress_end(&progress_state);
+ MPIU_INSTR_DURATION_END(winwait_wait);
}
MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_WAIT);
@@ -1362,9 +1632,9 @@
if (dest == MPI_PROC_NULL) goto fn_exit;
- MPID_Comm_get_ptr( win_ptr->comm, comm_ptr );
+ comm_ptr = win_ptr->comm_ptr;
- if (dest == comm_ptr->rank) {
+ if (dest == win_ptr->myrank) {
/* The target is this process itself. We must block until the lock
* is acquired. */
@@ -1373,6 +1643,7 @@
{
MPID_Progress_state progress_state;
+ MPIU_INSTR_DURATION_START(winlock_getlocallock);
MPID_Progress_start(&progress_state);
while (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, lock_type) == 0)
{
@@ -1385,6 +1656,7 @@
/* --END ERROR HANDLING-- */
}
MPID_Progress_end(&progress_state);
+ MPIU_INSTR_DURATION_END(winlock_getlocallock);
}
/* local lock acquired. local puts, gets, accumulates will be done
directly without queueing. */
@@ -1392,11 +1664,13 @@
else {
/* target is some other process. add the lock request to rma_ops_list */
-
+ MPIU_INSTR_DURATION_START(rmaqueue_alloc);
MPIU_CHKPMEM_MALLOC(new_ptr, MPIDI_RMA_ops *, sizeof(MPIDI_RMA_ops),
mpi_errno, "RMA operation entry");
+ MPIU_INSTR_DURATION_END(rmaqueue_alloc);
- win_ptr->rma_ops_list = new_ptr;
+ win_ptr->rma_ops_list_head = new_ptr;
+ win_ptr->rma_ops_list_tail = new_ptr;
new_ptr->next = NULL;
new_ptr->type = MPIDI_RMA_LOCK;
@@ -1434,9 +1708,9 @@
if (dest == MPI_PROC_NULL) goto fn_exit;
- MPID_Comm_get_ptr( win_ptr->comm, comm_ptr );
+ comm_ptr = win_ptr->comm_ptr;
- if (dest == comm_ptr->rank) {
+ if (dest == win_ptr->myrank) {
/* local lock. release the lock on the window, grant the next one
* in the queue, and return. */
mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
@@ -1445,7 +1719,7 @@
goto fn_exit;
}
- rma_op = win_ptr->rma_ops_list;
+ rma_op = win_ptr->rma_ops_list_head;
/* win_lock was not called. return error */
if ( (rma_op == NULL) || (rma_op->type != MPIDI_RMA_LOCK) ) {
@@ -1461,7 +1735,8 @@
if (rma_op->next == NULL) {
/* only win_lock called, no put/get/acc. Do nothing and return. */
MPIU_Free(rma_op);
- win_ptr->rma_ops_list = NULL;
+ win_ptr->rma_ops_list_head = NULL;
+ win_ptr->rma_ops_list_tail = NULL;
goto fn_exit;
}
@@ -1502,7 +1777,7 @@
if (single_op_opt == 0) {
/* Send a lock packet over to the target. wait for the lock_granted
- * reply. then do all the RMA ops. */
+ * reply. Then do all the RMA ops. */
MPIDI_Pkt_init(lock_pkt, MPIDI_CH3_PKT_LOCK);
lock_pkt->target_win_handle = win_ptr->all_win_handles[dest];
@@ -1535,6 +1810,7 @@
{
MPID_Progress_state progress_state;
+ MPIU_INSTR_DURATION_START(winunlock_getlock);
MPID_Progress_start(&progress_state);
while (win_ptr->lock_granted == 0)
{
@@ -1547,6 +1823,7 @@
/* --END ERROR HANDLING-- */
}
MPID_Progress_end(&progress_state);
+ MPIU_INSTR_DURATION_END(winunlock_getlock);
}
/* Now do all the RMA operations */
@@ -1603,18 +1880,15 @@
int *wait_for_rma_done_pkt)
{
int mpi_errno = MPI_SUCCESS, done, i, nops;
- MPIDI_RMA_ops *curr_ptr, *next_ptr, **curr_ptr_ptr, *tmp_ptr;
+ MPIDI_RMA_ops *curr_ptr;
MPID_Comm *comm_ptr;
- MPID_Request **requests=NULL; /* array of requests */
- MPIDI_RMA_dtype_info *dtype_infos=NULL;
- void **dataloops=NULL; /* to store dataloops for each datatype */
+ MPIDI_RMA_ops **prevNextPtr, *tmpptr;
MPI_Win source_win_handle, target_win_handle;
- MPIU_CHKLMEM_DECL(3);
MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_DO_PASSIVE_TARGET_RMA);
MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_DO_PASSIVE_TARGET_RMA);
- if (win_ptr->rma_ops_list->lock_type == MPI_LOCK_EXCLUSIVE) {
+ if (win_ptr->rma_ops_list_head->lock_type == MPI_LOCK_EXCLUSIVE) {
/* exclusive lock. no need to wait for rma done pkt at the end */
*wait_for_rma_done_pkt = 0;
}
@@ -1623,179 +1897,200 @@
to the end of the list and do it last, in which case an rma done
pkt is not needed. If there is no get, rma done pkt is needed */
- /* First check whether the last operation is a get. Skip the first op,
- which is a lock. */
-
- curr_ptr = win_ptr->rma_ops_list->next;
- while (curr_ptr->next != NULL)
- curr_ptr = curr_ptr->next;
-
- if (curr_ptr->type == MPIDI_RMA_GET) {
+ if (win_ptr->rma_ops_list_tail->type == MPIDI_RMA_GET) {
/* last operation is a get. no need to wait for rma done pkt */
*wait_for_rma_done_pkt = 0;
}
else {
/* go through the list and move the first get operation
- (if there is one) to the end */
+ (if there is one) to the end. Note that the first
+ operation must be a lock, so we can skip it */
- curr_ptr = win_ptr->rma_ops_list->next;
- curr_ptr_ptr = &(win_ptr->rma_ops_list->next);
+ curr_ptr = win_ptr->rma_ops_list_head->next;
+ prevNextPtr = &(win_ptr->rma_ops_list_head->next);
*wait_for_rma_done_pkt = 1;
while (curr_ptr != NULL) {
if (curr_ptr->type == MPIDI_RMA_GET) {
+ /* Found a GET, move it to the end */
*wait_for_rma_done_pkt = 0;
- *curr_ptr_ptr = curr_ptr->next;
- tmp_ptr = curr_ptr;
- while (curr_ptr->next != NULL)
- curr_ptr = curr_ptr->next;
- curr_ptr->next = tmp_ptr;
- tmp_ptr->next = NULL;
+ win_ptr->rma_ops_list_tail->next = curr_ptr;
+ *prevNextPtr = curr_ptr->next;
+ curr_ptr->next = NULL;
+ win_ptr->rma_ops_list_tail = curr_ptr;
break;
}
else {
- curr_ptr_ptr = &(curr_ptr->next);
- curr_ptr = curr_ptr->next;
+ prevNextPtr = &(curr_ptr->next);
+ curr_ptr = curr_ptr->next;
}
}
}
}
- MPID_Comm_get_ptr( win_ptr->comm, comm_ptr );
+ comm_ptr = win_ptr->comm_ptr;
/* Ignore the first op in the list because it is a win_lock and do
the rest */
- curr_ptr = win_ptr->rma_ops_list->next;
+ /*
+ This list has a head (lock) (but no tail (unlock)) that is not
+ processed, so we must skip over that head
+ */
+
+ curr_ptr = win_ptr->rma_ops_list_head->next;
nops = 0;
while (curr_ptr != NULL) {
nops++;
curr_ptr = curr_ptr->next;
}
+
+ MPIU_INSTR_DURATION_START(winunlock_issue);
+ i = 0;
- MPIU_CHKLMEM_MALLOC(requests, MPID_Request **, nops*sizeof(MPID_Request*),
- mpi_errno, "requests");
- MPIU_CHKLMEM_MALLOC(dtype_infos, MPIDI_RMA_dtype_info *,
- nops*sizeof(MPIDI_RMA_dtype_info),
- mpi_errno, "dtype_infos");
- MPIU_CHKLMEM_MALLOC(dataloops, void **, nops*sizeof(void*),
- mpi_errno, "dataloops");
+ /* Remove the lock entry */
+ curr_ptr = win_ptr->rma_ops_list_head;
+ tmpptr = curr_ptr->next;
+ win_ptr->rma_ops_list_head = tmpptr;
+ MPIU_Free( curr_ptr );
- for (i=0; i<nops; i++)
- {
- dataloops[i] = NULL;
- }
-
- i = 0;
- curr_ptr = win_ptr->rma_ops_list->next;
+ prevNextPtr = &win_ptr->rma_ops_list_head;
+ curr_ptr = win_ptr->rma_ops_list_head;
target_win_handle = win_ptr->all_win_handles[curr_ptr->target_rank];
while (curr_ptr != NULL)
{
/* To indicate the last RMA operation, we pass the
source_win_handle only on the last operation. Otherwise,
we pass MPI_WIN_NULL. */
- if (i == nops - 1)
+ /* Could also be curr_ptr->next == NULL */
+ if (/*i == nops - 1*/!curr_ptr->next)
source_win_handle = win_ptr->handle;
else
source_win_handle = MPI_WIN_NULL;
+ curr_ptr->dataloop = 0;
switch (curr_ptr->type)
{
case (MPIDI_RMA_PUT): /* same as accumulate */
case (MPIDI_RMA_ACCUMULATE):
win_ptr->pt_rma_puts_accs[curr_ptr->target_rank]++;
mpi_errno = MPIDI_CH3I_Send_rma_msg(curr_ptr, win_ptr,
- source_win_handle, target_win_handle, &dtype_infos[i],
- &dataloops[i], &requests[i]);
+ source_win_handle, target_win_handle,
+ &curr_ptr->dtype_info,
+ &curr_ptr->dataloop, &curr_ptr->request);
if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
break;
+ case MPIDI_RMA_ACC_CONTIG:
+ win_ptr->pt_rma_puts_accs[curr_ptr->target_rank]++;
+ mpi_errno = MPIDI_CH3I_Send_contig_acc_msg(curr_ptr, win_ptr,
+ source_win_handle, target_win_handle,
+ &curr_ptr->request );
+ break;
case (MPIDI_RMA_GET):
mpi_errno = MPIDI_CH3I_Recv_rma_msg(curr_ptr, win_ptr,
- source_win_handle, target_win_handle, &dtype_infos[i],
- &dataloops[i], &requests[i]);
+ source_win_handle, target_win_handle,
+ &curr_ptr->dtype_info,
+ &curr_ptr->dataloop, &curr_ptr->request);
if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
break;
default:
MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winInvalidOp");
}
i++;
- curr_ptr = curr_ptr->next;
+ /* If the request is null, we can remove it immediately */
+ if (!curr_ptr->request) {
+ if (curr_ptr->dataloop != NULL) {
+ MPIU_Free(curr_ptr->dataloop); /* allocated in send_rma_msg or
+ recv_rma_msg */
+ }
+ tmpptr = curr_ptr->next;
+ *prevNextPtr = tmpptr;
+ MPIU_Free( curr_ptr );
+ curr_ptr = tmpptr;
+ }
+ else {
+ curr_ptr = curr_ptr->next;
+ prevNextPtr = &curr_ptr->next;
+ }
}
+ MPIU_INSTR_DURATION_END(winunlock_issue);
if (nops)
{
MPID_Progress_state progress_state;
done = 1;
+ MPIU_INSTR_DURATION_START(winunlock_complete);
MPID_Progress_start(&progress_state);
- while (nops)
- {
- for (i=0; i<nops; i++)
- {
- if (requests[i] != NULL)
- {
- if (!MPID_Request_is_complete(requests[i]))
- {
- done = 0;
- break;
- }
- else
- {
- mpi_errno = requests[i]->status.MPI_ERROR;
+ while (win_ptr->rma_ops_list_head) {
+ prevNextPtr = &win_ptr->rma_ops_list_head;
+ curr_ptr = win_ptr->rma_ops_list_head;
+ do {
+ if (MPID_Request_is_complete(curr_ptr->request)) {
+ /* Once we find a complete request, we complete
+ as many as possible until we find an incomplete
+ or null request */
+ do {
+ mpi_errno = curr_ptr->request->status.MPI_ERROR;
/* --BEGIN ERROR HANDLING-- */
- if (mpi_errno != MPI_SUCCESS)
- {
+ if (mpi_errno != MPI_SUCCESS) {
MPID_Progress_end(&progress_state);
MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winRMAmessage");
}
/* --END ERROR HANDLING-- */
- /* if origin datatype was a derived
- datatype, it will get freed when the
- request gets freed. */
- MPID_Request_release(requests[i]);
- requests[i] = NULL;
+ MPID_Request_release(curr_ptr->request);
+ if (curr_ptr->dataloop != NULL) {
+ MPIU_Free(curr_ptr->dataloop); /* allocated in send_rma_msg or
+ recv_rma_msg */
+ }
+ /* We can remove and free this rma op element */
+ tmpptr = curr_ptr->next;
+ *prevNextPtr = tmpptr;
+ MPIU_Free( curr_ptr );
+ curr_ptr = tmpptr;
}
+ while (curr_ptr &&
+ MPID_Request_is_complete(curr_ptr->request));
+ /* Once a request completes, we wait for another
+ operation to arrive rather than check the
+ rest of the requests. */
+ break;
}
+ else {
+ prevNextPtr = &curr_ptr->next;
+ curr_ptr = curr_ptr->next;
+ break;
+ }
+ } while (curr_ptr);
+
+ /* Wait for something to arrive*/
+ /* In some tests, this hung unless the test ensured that
+ there was an incomplete request. */
+ curr_ptr = win_ptr->rma_ops_list_head;
+ if (curr_ptr && !MPID_Request_is_complete(curr_ptr->request) ) {
+ MPIU_INSTR_DURATION_START(winfence_block);
+ mpi_errno = MPID_Progress_wait(&progress_state);
+ /* --BEGIN ERROR HANDLING-- */
+ if (mpi_errno != MPI_SUCCESS) {
+ MPID_Progress_end(&progress_state);
+ MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
+ }
+ /* --END ERROR HANDLING-- */
+ MPIU_INSTR_DURATION_END(winfence_block);
}
-
- if (done)
- {
- break;
- }
-
- mpi_errno = MPID_Progress_wait(&progress_state);
- /* --BEGIN ERROR HANDLING-- */
- if (mpi_errno != MPI_SUCCESS) {
- MPID_Progress_end(&progress_state);
- MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
- }
- /* --END ERROR HANDLING-- */
- done = 1;
- }
+ } /* While list of rma operation is non-empty */
+
MPID_Progress_end(&progress_state);
}
- for (i=0; i<nops; i++)
- {
- if (dataloops[i] != NULL)
- {
- MPIU_Free(dataloops[i]);
- }
- }
-
- /* free MPIDI_RMA_ops_list */
- curr_ptr = win_ptr->rma_ops_list;
- while (curr_ptr != NULL)
- {
- next_ptr = curr_ptr->next;
- MPIU_Free(curr_ptr);
- curr_ptr = next_ptr;
- }
- win_ptr->rma_ops_list = NULL;
+ MPIU_Assert( !win_ptr->rma_ops_list_head );
+
+ win_ptr->rma_ops_list_head = NULL;
+ win_ptr->rma_ops_list_tail = NULL;
+
fn_exit:
- MPIU_CHKLMEM_FREEALL();
MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_DO_PASSIVE_TARGET_RMA);
return mpi_errno;
/* --BEGIN ERROR HANDLING-- */
@@ -1829,9 +2124,9 @@
MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SEND_LOCK_PUT_OR_ACC);
- lock_type = win_ptr->rma_ops_list->lock_type;
+ lock_type = win_ptr->rma_ops_list_head->lock_type;
- rma_op = win_ptr->rma_ops_list->next;
+ rma_op = win_ptr->rma_ops_list_head->next;
win_ptr->pt_rma_puts_accs[rma_op->target_rank]++;
@@ -1871,8 +2166,30 @@
iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) lock_accum_unlock_pkt;
iov[0].MPID_IOV_LEN = sizeof(*lock_accum_unlock_pkt);
}
+ else if (rma_op->type == MPIDI_RMA_ACC_CONTIG) {
+ MPIDI_Pkt_init(lock_accum_unlock_pkt, MPIDI_CH3_PKT_LOCK_ACCUM_UNLOCK);
+ lock_accum_unlock_pkt->target_win_handle =
+ win_ptr->all_win_handles[rma_op->target_rank];
+ lock_accum_unlock_pkt->source_win_handle = win_ptr->handle;
+ lock_accum_unlock_pkt->lock_type = lock_type;
- MPID_Comm_get_ptr(win_ptr->comm, comm_ptr);
+ lock_accum_unlock_pkt->addr =
+ (char *) win_ptr->base_addrs[rma_op->target_rank] +
+ win_ptr->disp_units[rma_op->target_rank] * rma_op->target_disp;
+
+ lock_accum_unlock_pkt->count = rma_op->target_count;
+ lock_accum_unlock_pkt->datatype = rma_op->target_datatype;
+ lock_accum_unlock_pkt->op = rma_op->op;
+
+ iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) lock_accum_unlock_pkt;
+ iov[0].MPID_IOV_LEN = sizeof(*lock_accum_unlock_pkt);
+ }
+ else {
+ printf( "expected short accumulate...\n" );
+ /* */
+ }
+
+ comm_ptr = win_ptr->comm_ptr;
MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
MPIDI_CH3I_DATATYPE_IS_PREDEFINED(rma_op->origin_datatype, predefined);
@@ -1974,9 +2291,10 @@
}
/* free MPIDI_RMA_ops_list */
- MPIU_Free(win_ptr->rma_ops_list->next);
- MPIU_Free(win_ptr->rma_ops_list);
- win_ptr->rma_ops_list = NULL;
+ MPIU_Free(win_ptr->rma_ops_list_head->next);
+ MPIU_Free(win_ptr->rma_ops_list_head);
+ win_ptr->rma_ops_list_head = NULL;
+ win_ptr->rma_ops_list_tail = NULL;
fn_fail:
MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SEND_LOCK_PUT_OR_ACC);
@@ -2004,9 +2322,9 @@
MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SEND_LOCK_GET);
- lock_type = win_ptr->rma_ops_list->lock_type;
+ lock_type = win_ptr->rma_ops_list_head->lock_type;
- rma_op = win_ptr->rma_ops_list->next;
+ rma_op = win_ptr->rma_ops_list_head->next;
/* create a request, store the origin buf, cnt, datatype in it,
and pass a handle to it in the get packet. When the get
@@ -2048,7 +2366,7 @@
lock_get_unlock_pkt->datatype = rma_op->target_datatype;
lock_get_unlock_pkt->request_handle = rreq->handle;
- MPID_Comm_get_ptr(win_ptr->comm, comm_ptr);
+ comm_ptr = win_ptr->comm_ptr;
MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
MPIU_THREAD_CS_ENTER(CH3COMM,vc);
@@ -2095,10 +2413,12 @@
MPID_Request_release(rreq);
/* free MPIDI_RMA_ops_list */
- MPIU_Free(win_ptr->rma_ops_list->next);
- MPIU_Free(win_ptr->rma_ops_list);
- win_ptr->rma_ops_list = NULL;
+ MPIU_Free(win_ptr->rma_ops_list_head->next);
+ MPIU_Free(win_ptr->rma_ops_list_head);
+ win_ptr->rma_ops_list_head = NULL;
+ win_ptr->rma_ops_list_tail = NULL;
+
fn_fail:
MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SEND_LOCK_GET);
return mpi_errno;
@@ -2239,7 +2559,6 @@
{
mpi_errno = MPIDI_CH3_ReqHandler_PutAccumRespComplete(vc, req, &complete);
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
if (complete)
{
*rreqp = NULL;
@@ -2467,7 +2786,8 @@
MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACCUMULATE);
MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"received accumulate pkt");
-
+
+ MPIU_INSTR_DURATION_START(rmapkt_acc);
data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
data_buf = (char *)pkt + sizeof(MPIDI_CH3_Pkt_t);
@@ -2484,6 +2804,7 @@
MPIDI_CH3I_DATATYPE_IS_PREDEFINED(accum_pkt->datatype, predefined);
if (predefined)
{
+ MPIU_INSTR_DURATION_START(rmapkt_acc_predef);
MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_ACCUM_RESP);
req->dev.datatype = accum_pkt->datatype;
@@ -2530,9 +2851,11 @@
if (complete)
{
*rreqp = NULL;
+ MPIU_INSTR_DURATION_END(rmapkt_acc_predef);
goto fn_exit;
}
}
+ MPIU_INSTR_DURATION_END(rmapkt_acc_predef);
}
}
else
@@ -2590,6 +2913,7 @@
}
fn_exit:
+ MPIU_INSTR_DURATION_END(rmapkt_acc);
MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACCUMULATE);
return mpi_errno;
fn_fail:
@@ -2597,7 +2921,110 @@
}
+/* Special accumulate for short data items entirely within the packet */
#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3_PktHandler_Accumulate_Immed
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+/*
+ * Packet handler for an "accumulate immediate" packet: the data to be
+ * accumulated is carried inside the packet itself (accum_pkt->data) and is
+ * applied directly to accum_pkt->addr.
+ *
+ * vc     - connection the packet arrived on; in this body it is used only
+ *          when sending the passive-target "RMA done" packet
+ * pkt    - the received packet (viewed through pkt->accum_immed)
+ * buflen - on return, set to sizeof(MPIDI_CH3_Pkt_t)
+ * rreqp  - on return, set to NULL
+ *
+ * Returns the value of mpi_errno (MPI_SUCCESS unless a step below sets it).
+ */
+int MPIDI_CH3_PktHandler_Accumulate_Immed( MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt,
+ MPIDI_msg_sz_t *buflen,
+ MPID_Request **rreqp )
+{
+ MPIDI_CH3_Pkt_accum_immed_t * accum_pkt = &pkt->accum_immed;
+ MPID_Win *win_ptr;
+ MPI_Aint extent;
+ int mpi_errno = MPI_SUCCESS;
+ MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACCUMULATE_IMMED);
+
+ MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACCUMULATE_IMMED);
+
+ MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"received accumulate immedidate pkt");
+
+ MPIU_INSTR_DURATION_START(rmapkt_acc_immed);
+
+ /* return the number of bytes processed in this function */
+ /* data_len == 0 (all within packet) */
+ *buflen = sizeof(MPIDI_CH3_Pkt_t);
+ *rreqp = NULL;
+
+ MPID_Datatype_get_extent_macro(accum_pkt->datatype, extent);
+
+ /* size == 0 should never happen */
+ /* Note that in this branch nothing is applied and none of the
+ counter / lock bookkeeping in the else branch is performed. */
+ if (accum_pkt->count == 0 || extent == 0) {
+ ;
+ }
+ else {
+ /* Data is already present */
+ if (accum_pkt->op == MPI_REPLACE) {
+ /* no datatypes required */
+ /* NOTE(review): count * extent is computed with an MPI_Aint operand
+ and stored in an int; presumably safe because the payload fits
+ in a packet -- confirm. */
+ int len = accum_pkt->count * extent;
+ /* FIXME: use immediate copy because this is short */
+ MPIUI_Memcpy( accum_pkt->addr, accum_pkt->data, len );
+ }
+ else {
+ if (HANDLE_GET_KIND(accum_pkt->op) == HANDLE_KIND_BUILTIN) {
+ MPI_User_function *uop;
+ /* get the function by indexing into the op table */
+ /* NOTE(review): assumes the low 4 bits of a builtin op handle
+ are a 1-based index into MPIR_Op_table -- confirm. */
+ uop = MPIR_Op_table[((accum_pkt->op)&0xf) - 1];
+ (*uop)(accum_pkt->data, accum_pkt->addr,
+ &(accum_pkt->count), &(accum_pkt->datatype));
+ }
+ else {
+ /* Only builtin ops are accepted here.
+ NOTE(review): presumably this macro jumps to fn_fail, in which
+ case the bookkeeping below is skipped -- confirm. */
+ MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OP, "**opnotpredefined",
+ "**opnotpredefined %d", accum_pkt->op );
+ }
+ }
+
+ /* There are additional steps to take if this is a passive
+ target RMA or the last operation from the source */
+
+ /* Here is the code executed in PutAccumRespComplete after the
+ accumulation operation */
+ MPID_Win_get_ptr(accum_pkt->target_win_handle, win_ptr);
+
+ /* if passive target RMA, increment counter */
+ if (win_ptr->current_lock_type != MPID_LOCK_NONE)
+ win_ptr->my_pt_rma_puts_accs++;
+
+ if (accum_pkt->source_win_handle != MPI_WIN_NULL) {
+ /* Last RMA operation from source. If active
+ target RMA, decrement window counter. If
+ passive target RMA, release lock on window and
+ grant next lock in the lock queue if there is
+ any. If it's a shared lock or a lock-put-unlock
+ type of optimization, we also need to send an
+ ack to the source. */
+ if (win_ptr->current_lock_type == MPID_LOCK_NONE) {
+ /* FIXME: MT: this has to be done atomically */
+ win_ptr->my_counter -= 1;
+ MPIDI_CH3_Progress_signal_completion();
+ }
+ else {
+ /* The single_op_opt test is commented out and replaced by the
+ constant-false (0 == 1), so only a shared lock triggers the
+ "RMA done" packet here. */
+ if ((win_ptr->current_lock_type == MPI_LOCK_SHARED) ||
+ (/*rreq->dev.single_op_opt*/ 0 == 1)) {
+ mpi_errno = MPIDI_CH3I_Send_pt_rma_done_pkt(vc,
+ accum_pkt->source_win_handle);
+ if (mpi_errno) {
+ MPIU_ERR_POP(mpi_errno);
+ }
+ }
+ /* The result of releasing the lock is not checked here; it is
+ returned to the caller through mpi_errno at fn_exit. */
+ mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
+ }
+ }
+
+ goto fn_exit;
+ }
+
+ fn_exit:
+ MPIU_INSTR_DURATION_END(rmapkt_acc_immed);
+ MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACCUMULATE_IMMED);
+ return mpi_errno;
+ fn_fail:
+ goto fn_exit;
+
+}
+
+#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_PktHandler_Lock
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
@@ -2631,6 +3058,9 @@
/* FIXME: MT: This may need to be done atomically. */
+ /* FIXME: Since we need to add to the tail of the list,
+ we should maintain a tail pointer rather than traversing the
+ list each time to find the tail. */
curr_ptr = (MPIDI_Win_lock_queue *) win_ptr->lock_queue;
prev_ptr = curr_ptr;
while (curr_ptr != NULL)
@@ -2639,7 +3069,9 @@
curr_ptr = curr_ptr->next;
}
+ MPIU_INSTR_DURATION_START(lockqueue_alloc);
new_ptr = (MPIDI_Win_lock_queue *) MPIU_Malloc(sizeof(MPIDI_Win_lock_queue));
+ MPIU_INSTR_DURATION_END(lockqueue_alloc);
if (!new_ptr) {
MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %s",
"MPIDI_Win_lock_queue");
@@ -2713,7 +3145,9 @@
/* queue the information */
MPIDI_Win_lock_queue *curr_ptr, *prev_ptr, *new_ptr;
+ MPIU_INSTR_DURATION_START(lockqueue_alloc);
new_ptr = (MPIDI_Win_lock_queue *) MPIU_Malloc(sizeof(MPIDI_Win_lock_queue));
+ MPIU_INSTR_DURATION_END(lockqueue_alloc);
if (!new_ptr) {
MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %s",
"MPIDI_Win_lock_queue");
@@ -2874,7 +3308,7 @@
MPIDI_Win_lock_queue *curr_ptr, *prev_ptr, *new_ptr;
/* FIXME: MT: This may need to be done atomically. */
-
+
curr_ptr = (MPIDI_Win_lock_queue *) win_ptr->lock_queue;
prev_ptr = curr_ptr;
while (curr_ptr != NULL)
@@ -2883,7 +3317,9 @@
curr_ptr = curr_ptr->next;
}
+ MPIU_INSTR_DURATION_START(lockqueue_alloc);
new_ptr = (MPIDI_Win_lock_queue *) MPIU_Malloc(sizeof(MPIDI_Win_lock_queue));
+ MPIU_INSTR_DURATION_END(lockqueue_alloc);
if (!new_ptr) {
MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %s",
"MPIDI_Win_lock_queue");
@@ -2961,7 +3397,9 @@
/* queue the information */
+ MPIU_INSTR_DURATION_START(lockqueue_alloc);
new_ptr = (MPIDI_Win_lock_queue *) MPIU_Malloc(sizeof(MPIDI_Win_lock_queue));
+ MPIU_INSTR_DURATION_END(lockqueue_alloc);
if (!new_ptr) {
MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %s",
"MPIDI_Win_lock_queue");
@@ -3219,6 +3657,19 @@
/*MPIU_DBG_PRINTF((" win_ptr ...... 0x%08X\n", pkt->accum.win_ptr));*/
return MPI_SUCCESS;
}
+/* Debug dump of an accumulate-immediate packet: prints the addr, count,
+ datatype, op, and target/source window handle fields of pkt->accum_immed
+ through MPIU_DBG_PRINTF, flushes stdout, and returns MPI_SUCCESS.
+ The fp argument is not referenced in this body. */
+int MPIDI_CH3_PktPrint_Accum_Immed( FILE *fp, MPIDI_CH3_Pkt_t *pkt )
+{
+ MPIU_DBG_PRINTF((" type ......... MPIDI_CH3_PKT_ACCUM_IMMED\n"));
+ MPIU_DBG_PRINTF((" addr ......... %p\n", pkt->accum_immed.addr));
+ MPIU_DBG_PRINTF((" count ........ %d\n", pkt->accum_immed.count));
+ MPIU_DBG_PRINTF((" datatype ..... 0x%08X\n", pkt->accum_immed.datatype));
+ MPIU_DBG_PRINTF((" op ........... 0x%08X\n", pkt->accum_immed.op));
+ MPIU_DBG_PRINTF((" target ....... 0x%08X\n", pkt->accum_immed.target_win_handle));
+ MPIU_DBG_PRINTF((" source ....... 0x%08X\n", pkt->accum_immed.source_win_handle));
+ /*MPIU_DBG_PRINTF((" win_ptr ...... 0x%08X\n", pkt->accum.win_ptr));*/
+ fflush(stdout);
+ return MPI_SUCCESS;
+}
int MPIDI_CH3_PktPrint_Lock( FILE *fp, MPIDI_CH3_Pkt_t *pkt )
{
MPIU_DBG_PRINTF((" type ......... MPIDI_CH3_PKT_LOCK\n"));
Modified: mpich2/trunk/src/util/instrm/Makefile.sm
===================================================================
--- mpich2/trunk/src/util/instrm/Makefile.sm 2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/util/instrm/Makefile.sm 2010-11-06 13:08:15 UTC (rev 7416)
@@ -1,2 +1,2 @@
-lib${MPILIBNAME}_a_SOURCES = states.c
+lib${MPILIBNAME}_a_SOURCES = states.c instr.c
INCLUDES = -I../../include -I${top_srcdir}/src/include
Added: mpich2/trunk/src/util/instrm/instr.c
===================================================================
--- mpich2/trunk/src/util/instrm/instr.c (rev 0)
+++ mpich2/trunk/src/util/instrm/instr.c 2010-11-06 13:08:15 UTC (rev 7416)
@@ -0,0 +1,132 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ * (C) 2001 by Argonne National Laboratory.
+ * See COPYRIGHT in top-level directory.
+ */
+
+#include "mpiimpl.h"
+
+#ifdef USE_MPIU_INSTR
+
+/* Forward declarations of the file-local helpers defined below */
+static int MPIU_INSTR_Printf( FILE *fp );
+static int MPIU_INSTR_Finalize( void *p );
+
+/*
+ * Basic but general support for instrumentation hooks in MPICH2
+ *
+ * Most actions are handled by MPIU_INSTR_xxx macros (to permit both lowest
+ * overhead and to allow instrumentation to be selected at compile time).
+ */
+/* Head and tail of the singly linked list of registered instrumentation
+ * handles; MPIU_INSTR_AddHandle appends new handles at the tail. */
+static struct MPIU_INSTR_Generic_t *instrHead = 0, *instrTail = 0;
+
+/*
+ * MPIU_INSTR_AddHandle - append an instrumentation handle to the list of
+ * registered handles.
+ *
+ * handlePtr is used as a pointer to a struct MPIU_INSTR_Generic_t.  When the
+ * list is empty (first handle), MPIU_INSTR_Finalize is also registered as a
+ * finalize callback.
+ *
+ * Returns 0.  (The function is declared to return int, so it must not fall
+ * off the end without a value.)
+ *
+ * NOTE(review): the new tail's next field is not modified here; this assumes
+ * the handle arrives with next already null -- confirm against the
+ * handle declaration macros.
+ */
+int MPIU_INSTR_AddHandle( void *handlePtr )
+{
+    struct MPIU_INSTR_Generic_t *gPtr =
+        (struct MPIU_INSTR_Generic_t *)handlePtr;
+
+    /* Note that Addhandle must be within a thread-safe initialization */
+    if (!instrHead) {
+        /* Make sure that this call back occurs early (before MPID_Finalize) */
+        MPIR_Add_finalize( MPIU_INSTR_Finalize, stdout,
+                           MPIR_FINALIZE_CALLBACK_PRIO + 2 );
+        /* First handle: it becomes the head of the list */
+        instrHead = gPtr;
+    }
+    else {
+        instrTail->next = gPtr;
+    }
+    instrTail = gPtr;
+
+    return 0;
+}
+
+#define MAX_INSTR_BUF 1024
+/*
+ * Write one line to fp for every registered handle whose count is nonzero.
+ * The text comes from the handle's toStr routine when it has one, otherwise
+ * from its desc string (or an empty string if desc is null).  fp is flushed
+ * before returning.  Always returns 0.
+ */
+static int MPIU_INSTR_Printf( FILE *fp )
+{
+    char line[MAX_INSTR_BUF];
+    struct MPIU_INSTR_Generic_t *entry;
+
+    for (entry = instrHead; entry; entry = entry->next) {
+        /* We only output information on events that occurred */
+        if (!entry->count)
+            continue;
+
+        if (entry->toStr) {
+            (*entry->toStr)( line, sizeof(line), entry );
+        }
+        else {
+            /* A null description should not happen; print an empty line */
+            MPIU_Strncpy( line, entry->desc ? entry->desc : "",
+                          sizeof(line) );
+        }
+        fputs( line, fp );
+        fputc( '\n', fp );
+    }
+    fflush( fp );
+    return 0;
+}
+
+/*
+ * Finalize callback (registered by MPIU_INSTR_AddHandle).
+ *
+ * If MPL_env2bool yields a nonzero value for MPICH_INSTR_AT_FINALIZE, the
+ * collected instrumentation is printed to stdout.  In all cases the desc
+ * string of every registered handle is freed and set to null.
+ *
+ * p is not used.  Always returns 0.
+ */
+static int MPIU_INSTR_Finalize( void *p )
+{
+    /* Default to "no output".  rc is initialized so that it is never read
+       with an indeterminate value.
+       NOTE(review): MPL_env2bool's return convention is not visible here;
+       if it can return nonzero without storing into rc (e.g. for an
+       unparsable value), the test below alone would not assign rc. */
+    int rc = 0;
+    struct MPIU_INSTR_Generic_t *gPtr = instrHead;
+    /* FIXME: This should at least issue the writes in process order */
+    /* Allow whether output is generated to be controlled */
+    if (!MPL_env2bool( "MPICH_INSTR_AT_FINALIZE", &rc ))
+        rc = 0;
+
+    if (rc) {
+        MPIU_INSTR_Printf( stdout );
+    }
+
+    /* Free any memory allocated for the descriptions */
+    while (gPtr) {
+        if (gPtr->desc) {
+            MPIU_Free( (char *)gPtr->desc );
+            gPtr->desc = 0;
+        }
+        gPtr = gPtr->next;
+    }
+
+    return 0;
+}
+
+/*
+ * Standard print routines for the instrumentation objects
+ */
+
+/*
+ * Print a duration, which may have extra integer fields.  Those fields
+ * are printed as integers, in order, separated by tabs.
+ *
+ * buf    - output buffer
+ * maxBuf - size of buf in bytes
+ * ptr    - used as a pointer to a struct MPIU_INSTR_Duration_count_t
+ *
+ * The output is truncated to fit in buf.  If maxBuf is 0, buf is not
+ * touched.  Always returns 0.
+ */
+int MPIU_INSTR_ToStr_Duration_Count( char *buf, size_t maxBuf, void *ptr )
+{
+    double ttime;
+    struct MPIU_INSTR_Duration_count_t *dPtr =
+        (struct MPIU_INSTR_Duration_count_t *)ptr;
+    size_t len;
+    char *p;
+    int i;
+
+    MPID_Wtime_todouble( &dPtr->ttime, &ttime );
+
+    /* With a zero-sized buffer snprintf stores nothing (not even the
+       terminating null), so strlen(buf) below would read memory that was
+       never written. */
+    if (maxBuf == 0)
+        return 0;
+
+    snprintf( buf, maxBuf, "%-40s:\t%d\t%e", dPtr->desc, dPtr->count, ttime );
+
+    /* Add each integer value, separated by a tab.  strlen (not the snprintf
+       return value) is used so that the bookkeeping stays correct when the
+       output was truncated.  Stop once only the terminating null fits. */
+    len = strlen(buf);
+    maxBuf -= len;
+    p = buf + len;
+    for (i=0; i<dPtr->nitems && maxBuf > 1; i++) {
+        snprintf( p, maxBuf, "\t%d", dPtr->data[i] );
+        len = strlen(p);
+        maxBuf -= len;
+        p += len;
+    }
+    return 0;
+}
+#else
+/* No routines required if instrumentation is not selected */
+#endif
More information about the mpich2-commits
mailing list