[mpich2-commits] r7416 - in mpich2/trunk: . src/include src/mpi/rma src/mpid/ch3/include src/mpid/ch3/src src/util/instrm

gropp at mcs.anl.gov gropp at mcs.anl.gov
Sat Nov 6 08:08:15 CDT 2010


Author: gropp
Date: 2010-11-06 08:08:15 -0500 (Sat, 06 Nov 2010)
New Revision: 7416

Added:
   mpich2/trunk/src/include/mpiinstr.h
   mpich2/trunk/src/util/instrm/instr.c
Modified:
   mpich2/trunk/configure.in
   mpich2/trunk/src/include/mpiimpl.h
   mpich2/trunk/src/mpi/rma/accumulate.c
   mpich2/trunk/src/mpi/rma/get.c
   mpich2/trunk/src/mpi/rma/put.c
   mpich2/trunk/src/mpi/rma/win_get_group.c
   mpich2/trunk/src/mpi/rma/win_lock.c
   mpich2/trunk/src/mpi/rma/win_unlock.c
   mpich2/trunk/src/mpid/ch3/include/mpidimpl.h
   mpich2/trunk/src/mpid/ch3/include/mpidpkt.h
   mpich2/trunk/src/mpid/ch3/include/mpidpre.h
   mpich2/trunk/src/mpid/ch3/include/mpidrma.h
   mpich2/trunk/src/mpid/ch3/src/ch3u_handle_recv_pkt.c
   mpich2/trunk/src/mpid/ch3/src/ch3u_handle_recv_req.c
   mpich2/trunk/src/mpid/ch3/src/ch3u_rma_ops.c
   mpich2/trunk/src/mpid/ch3/src/ch3u_rma_sync.c
   mpich2/trunk/src/util/instrm/Makefile.sm
Log:
Made a major improvement to RMA performance for long lists of operations, added an immediate-mode accumulate for single ints, stored the MPID_Comm pointer within the window, and added a basic performance instrumentation interface that was extensively used to improve the RMA performance (enabled with --enable-g=instr).  With these fixes, MPICH2 can run the one-sided version of the Graph500 benchmark at a respectable, if not great, rate.

Modified: mpich2/trunk/configure.in
===================================================================
--- mpich2/trunk/configure.in	2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/configure.in	2010-11-06 13:08:15 UTC (rev 7416)
@@ -135,7 +135,6 @@
     AC_MSG_ERROR([Version information not found. Configuration aborted.])
 fi
 AC_SUBST(MPICH2_RELEASE_DATE)
-
 # Produce a numeric version assuming the following format:
 # Version: [MAJ].[MIN].[REV][EXT][EXT_NUMBER]
 # Example: 1.0.7rc1 has
@@ -330,6 +329,7 @@
                    compiler flags, i.e. MPICH2LIB_CFLAGS, MPICH2LIB_CXXFLAGS,
                    MPICH2LIB_FFLAGS, and MPICH2LIB_FCFLAGS.
         debug    - Synonym for dbg
+        instr    - Enable instrumentation
         log      - Enable debug event logging
         mem      - Memory usage tracing
         meminit  - Preinitialize memory associated structures and unions to
@@ -740,7 +740,9 @@
             MPI_DEFAULT_FOPTS="-$option"
             MPI_DEFAULT_FCOPTS="-$option"
         else
+	    IFS="$save_IFS"
             AC_MSG_WARN([Unknown value $option for --enable-fast])
+	    IFS=","
         fi
         ;;
         none|no)
@@ -751,7 +753,9 @@
         enable_append_ndebug=no
         ;;
         *)
+	IFS="$save_IFS"
         AC_MSG_WARN([Unknown value $option for --enable-fast])
+	IFS=","
         ;;
     esac
 done
@@ -1260,6 +1264,9 @@
 	handle)
 	AC_DEFINE(MPICH_DEBUG_HANDLES,1,[Define to enable handle checking])
 	;;
+	instr)
+	perform_instr=yes
+	;;
 	meminit)
 	perform_meminit=yes
 	;;
@@ -1284,12 +1291,15 @@
 	perform_dbglog=yes
 	enable_append_g=yes
 	perform_meminit=yes
+	perform_instr=yes
 	perform_dbgmutex=yes
 	perform_mutexnesting=yes
 	perform_handlealloc=yes
 	;;
 	*)
-	AC_MSG_WARN([Unknown value $enable_g for enable-g])
+	IFS=$save_IFS
+	AC_MSG_WARN([Unknown value $option for enable-g])
+	IFS=","
 	;;
     esac
 done
@@ -1311,8 +1321,11 @@
     AC_DEFINE(MPICH_DEBUG_MEMINIT,1,[Define to enable preinitialization of memory used by structures and unions])
 fi
 if test "$perform_handlealloc" = yes ; then
-   AC_DEFINE(MPICH_DEBUG_HANDLEALLOC,1,[Define to enable checking of handles still allocated at MPI_Finalize])
+    AC_DEFINE(MPICH_DEBUG_HANDLEALLOC,1,[Define to enable checking of handles still allocated at MPI_Finalize])
 fi
+if test "$perform_instr" = yes ; then
+    AC_DEFINE(USE_MPIU_INSTR,1,[Define this to enable internal instrumentation] )
+fi
 
 if test -n "$perform_memtracing" ; then
     enable_g_mem=yes

Modified: mpich2/trunk/src/include/mpiimpl.h
===================================================================
--- mpich2/trunk/src/include/mpiimpl.h	2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/include/mpiimpl.h	2010-11-06 13:08:15 UTC (rev 7416)
@@ -1497,7 +1497,9 @@
     MPID_Attribute *attributes;
     MPID_Group *start_group_ptr; /* group passed in MPI_Win_start */
     int start_assert;            /* assert passed to MPI_Win_start */
-    MPI_Comm    comm;         /* communicator of window (dup) */
+    MPID_Comm *comm_ptr;         /* Pointer to comm of window (dup) */
+    int         myrank;          /* Rank of this process in comm (used to 
+				    detect operations on self) */
 #ifdef USE_THREADED_WINDOW_CODE
     /* These were causing compilation errors.  We need to figure out how to
        integrate threads into MPICH2 before including these fields. */
@@ -1960,6 +1962,9 @@
 #include "mpierror.h"
 #include "mpierrs.h"
 
+/* Definitions for instrumentation (currently used within RMA code) */
+#include "mpiinstr.h"
+
 /* FIXME: This routine is only used within mpi/src/err/errutil.c and 
    smpd.  We may not want to export it.  */
 void MPIR_Err_print_stack(FILE *, int);

Added: mpich2/trunk/src/include/mpiinstr.h
===================================================================
--- mpich2/trunk/src/include/mpiinstr.h	                        (rev 0)
+++ mpich2/trunk/src/include/mpiinstr.h	2010-11-06 13:08:15 UTC (rev 7416)
@@ -0,0 +1,82 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*  
+ *  (C) 2010 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#ifndef MPIINSTR_H_INCLUDED
+#define MPIINSTR_H_INCLUDED
+
+#ifdef USE_MPIU_INSTR
+
+#define MPIU_INSTR_TYPE_DURATION 1
+
+typedef struct MPIU_INSTR_Generic_t {
+    int instrType;
+    void *next;
+    int count;
+    const char *desc;
+    int (*toStr)( char *buf, size_t maxlen, void *handlePtr );
+} MPIU_INSTR_Generic_t;
+
+#define MPIU_INSTR_MAX_DATA 8
+typedef struct MPIU_INSTR_Duration_count_t {
+    int         instrType;
+    void       *next;
+    int         count;          /* Number of times in duration */
+    const char *desc;           /* Character string describing duration */
+    int (*toStr)( char *buf, size_t maxlen, void *handlePtr );
+    MPID_Time_t ttime,          /* Time in duration */
+                curstart;       /* Time of entry into current duration */
+    int nitems;                 /* Number of items in data */
+    int data[MPIU_INSTR_MAX_DATA]; /* Used to hold additional data */
+    } MPIU_INSTR_Duration_count;
+
+/* Prototypes for visible routines */
+int MPIU_INSTR_AddHandle( void * );
+int MPIU_INSTR_ToStr_Duration_Count( char *, size_t, void * );
+
+/* Definitions for including instrumentation in files*/
+
+#define MPIU_INSTR_DURATION_DECL(name_) \
+    struct MPIU_INSTR_Duration_count_t MPIU_INSTR_HANDLE_##name_ = { 0 };
+#define MPIU_INSTR_DURATION_EXTERN_DECL(name_) \
+    extern struct MPIU_INSTR_Duration_count_t MPIU_INSTR_HANDLE_##name_;
+/* FIXME: Need a generic way to zero the time */
+#define MPIU_INSTR_DURATION_INIT(name_,nitems_,desc_)	\
+    MPIU_INSTR_HANDLE_##name_.count = 0; \
+    MPIU_INSTR_HANDLE_##name_.desc = (const char *)MPIU_Strdup( desc_ ); \
+    memset( &MPIU_INSTR_HANDLE_##name_.ttime,0,sizeof(MPID_Time_t));\
+    MPIU_INSTR_HANDLE_##name_.toStr = MPIU_INSTR_ToStr_Duration_Count;\
+    MPIU_INSTR_HANDLE_##name_.nitems = nitems_;\
+    memset( MPIU_INSTR_HANDLE_##name_.data,0,MPIU_INSTR_MAX_DATA*sizeof(int));\
+    MPIU_INSTR_AddHandle( &MPIU_INSTR_HANDLE_##name_ );
+#define MPIU_INSTR_DURATION_START(name_) \
+    MPID_Wtime( &MPIU_INSTR_HANDLE_##name_.curstart )
+#define MPIU_INSTR_DURATION_END(name_) \
+    do { \
+    MPID_Time_t curend; MPID_Wtime( &curend );\
+    MPID_Wtime_acc( &MPIU_INSTR_HANDLE_##name_.curstart, \
+    		    &curend, \
+		    &MPIU_INSTR_HANDLE_##name_.ttime );\
+    MPIU_INSTR_HANDLE_##name_.count++; } while(0)
+
+#define MPIU_INSTR_DURATION_INCR(name_,idx_,incr_) \
+    MPIU_INSTR_HANDLE_##name_.data[idx_] += incr_;
+#define MPIU_INSTR_DURATION_MAX(name_,idx_,incr_) \
+    MPIU_INSTR_HANDLE_##name_.data[idx_] = \
+	incr_ > MPIU_INSTR_HANDLE_##name_.data[idx_] ? \
+	incr_ : MPIU_INSTR_HANDLE_##name_.data[idx_];
+
+#else
+/* Define null versions of macros (these are empty statements) */
+#define MPIU_INSTR_DURATION_DECL(name_)
+#define MPIU_INSTR_DURATION_EXTERN_DECL(name_)
+#define MPIU_INSTR_DURATION_INIT(name_,nitems_,desc_)
+#define MPIU_INSTR_DURATION_START(name_)
+#define MPIU_INSTR_DURATION_END(name_)
+#define MPIU_INSTR_DURATION_INCR(name_,idx_,incr_)
+#define MPIU_INSTR_DURATION_MAX(name_,idx_,incr_)
+
+#endif /* USE_MPIU_INSTR */ 
+
+#endif

Modified: mpich2/trunk/src/mpi/rma/accumulate.c
===================================================================
--- mpich2/trunk/src/mpi/rma/accumulate.c	2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpi/rma/accumulate.c	2010-11-06 13:08:15 UTC (rev 7416)
@@ -126,7 +126,7 @@
                 MPID_Datatype_committed_ptr(datatype_ptr, mpi_errno);
             }
 
-	    MPID_Comm_get_ptr(win_ptr->comm, comm_ptr);
+	    comm_ptr = win_ptr->comm_ptr;
 	    MPIR_ERRTEST_SEND_RANK(comm_ptr, target_rank, mpi_errno);
 
             if (mpi_errno != MPI_SUCCESS) goto fn_fail;

Modified: mpich2/trunk/src/mpi/rma/get.c
===================================================================
--- mpich2/trunk/src/mpi/rma/get.c	2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpi/rma/get.c	2010-11-06 13:08:15 UTC (rev 7416)
@@ -122,7 +122,7 @@
                 MPID_Datatype_committed_ptr(datatype_ptr, mpi_errno);
             }
 
-	    MPID_Comm_get_ptr(win_ptr->comm, comm_ptr);
+	    comm_ptr = win_ptr->comm_ptr;
 	    MPIR_ERRTEST_SEND_RANK(comm_ptr, target_rank, mpi_errno);
 	    
             if (mpi_errno != MPI_SUCCESS) goto fn_fail;

Modified: mpich2/trunk/src/mpi/rma/put.c
===================================================================
--- mpich2/trunk/src/mpi/rma/put.c	2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpi/rma/put.c	2010-11-06 13:08:15 UTC (rev 7416)
@@ -122,7 +122,7 @@
                 MPID_Datatype_committed_ptr(datatype_ptr, mpi_errno);
             }
 
-	    MPID_Comm_get_ptr(win_ptr->comm, comm_ptr);
+	    comm_ptr = win_ptr->comm_ptr;
 	    MPIR_ERRTEST_SEND_RANK(comm_ptr, target_rank, mpi_errno);
 	    
             if (mpi_errno != MPI_SUCCESS) goto fn_fail;

Modified: mpich2/trunk/src/mpi/rma/win_get_group.c
===================================================================
--- mpich2/trunk/src/mpi/rma/win_get_group.c	2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpi/rma/win_get_group.c	2010-11-06 13:08:15 UTC (rev 7416)
@@ -98,7 +98,7 @@
 #   endif /* HAVE_ERROR_CHECKING */
 
     /* ... body of routine ...  */
-    MPID_Comm_get_ptr( win_ptr->comm, win_comm_ptr );
+    win_comm_ptr = win_ptr->comm_ptr;
 
     mpi_errno = MPIR_Comm_group_impl(win_comm_ptr, &group_ptr);
     if (mpi_errno != MPI_SUCCESS) goto fn_fail;

Modified: mpich2/trunk/src/mpi/rma/win_lock.c
===================================================================
--- mpich2/trunk/src/mpi/rma/win_lock.c	2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpi/rma/win_lock.c	2010-11-06 13:08:15 UTC (rev 7416)
@@ -117,7 +117,7 @@
 						  MPI_ERR_OTHER, 
 						  "**locktype", 0 );
 
-            MPID_Comm_get_ptr( win_ptr->comm, comm_ptr );
+	    comm_ptr = win_ptr->comm_ptr;
             MPIR_ERRTEST_SEND_RANK(comm_ptr, rank, mpi_errno);
 
             if (mpi_errno) goto fn_fail;

Modified: mpich2/trunk/src/mpi/rma/win_unlock.c
===================================================================
--- mpich2/trunk/src/mpi/rma/win_unlock.c	2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpi/rma/win_unlock.c	2010-11-06 13:08:15 UTC (rev 7416)
@@ -87,7 +87,7 @@
 	    /* If win_ptr is not valid, it will be reset to null */
             if (mpi_errno) goto fn_fail;
 
-            MPID_Comm_get_ptr( win_ptr->comm, comm_ptr );
+	    comm_ptr = win_ptr->comm_ptr;
             MPIR_ERRTEST_SEND_RANK(comm_ptr, rank, mpi_errno);
 
             if (mpi_errno) goto fn_fail;

Modified: mpich2/trunk/src/mpid/ch3/include/mpidimpl.h
===================================================================
--- mpich2/trunk/src/mpid/ch3/include/mpidimpl.h	2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpid/ch3/include/mpidimpl.h	2010-11-06 13:08:15 UTC (rev 7416)
@@ -1094,10 +1094,16 @@
 #define MPIDI_RMAFNS_VERSION 1
 int MPIDI_CH3_RMAFnsInit( MPIDI_RMAFns * );
 
+/* FIXME: These are specific to the RMA code and should be in the RMA 
+   header file. */
 #define MPIDI_RMA_PUT 23
 #define MPIDI_RMA_GET 24
 #define MPIDI_RMA_ACCUMULATE 25
 #define MPIDI_RMA_LOCK 26
+
+/* Special case RMA operations */
+#define MPIDI_RMA_ACC_CONTIG 27
+
 #define MPIDI_RMA_DATATYPE_BASIC 50
 #define MPIDI_RMA_DATATYPE_DERIVED 51
 
@@ -1704,6 +1710,8 @@
 			      MPIDI_msg_sz_t *, MPID_Request ** );
 int MPIDI_CH3_PktHandler_Accumulate( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, 
 				     MPIDI_msg_sz_t *, MPID_Request ** );
+int MPIDI_CH3_PktHandler_Accumulate_Immed( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, 
+				     MPIDI_msg_sz_t *, MPID_Request ** );
 int MPIDI_CH3_PktHandler_Get( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, 
 			      MPIDI_msg_sz_t *, MPID_Request ** );
 int MPIDI_CH3_PktHandler_GetResp( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, 

Modified: mpich2/trunk/src/mpid/ch3/include/mpidpkt.h
===================================================================
--- mpich2/trunk/src/mpid/ch3/include/mpidpkt.h	2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpid/ch3/include/mpidpkt.h	2010-11-06 13:08:15 UTC (rev 7416)
@@ -10,8 +10,12 @@
 /* Enable the use of data within the message packet for small messages */
 #define USE_EAGER_SHORT
 #define MPIDI_EAGER_SHORT_INTS 4
+/* FIXME: This appears to assume that sizeof(int) == 4 (or at least >= 4) */
 #define MPIDI_EAGER_SHORT_SIZE 16
 
+/* This is the number of ints that can be carried within an RMA packet */
+#define MPIDI_RMA_IMMED_INTS 1
+
 /*
  * MPIDI_CH3_Pkt_type_t
  *
@@ -44,6 +48,8 @@
     MPIDI_CH3_PKT_LOCK_GET_UNLOCK, /* optimization for single gets */
     MPIDI_CH3_PKT_LOCK_ACCUM_UNLOCK, /* optimization for single accumulates */
                                      /* RMA Packets end here */
+    MPIDI_CH3_PKT_ACCUM_IMMED,     /* optimization for short accumulate */
+    /* FIXME: Add PUT, GET_IMMED packet types */
     MPIDI_CH3_PKT_FLOW_CNTL_UPDATE,  /* FIXME: Unused */
     MPIDI_CH3_PKT_CLOSE,
     MPIDI_CH3_PKT_END_CH3
@@ -193,6 +199,26 @@
 }
 MPIDI_CH3_Pkt_accum_t;
 
+typedef struct MPIDI_CH3_Pkt_accum_immed
+{
+    MPIDI_CH3_Pkt_type_t type;
+    void *addr;
+    int count;
+    /* FIXME: Compress datatype/op into a single word (immediate mode) */
+    MPI_Datatype datatype;
+    MPI_Op op;
+    /* FIXME: do we need these (use a regular accum packet if we do?) */
+    MPI_Win target_win_handle; /* Used in the last RMA operation in each
+                               * epoch for decrementing rma op counter in
+                               * active target rma and for unlocking window 
+                               * in passive target rma. Otherwise set to NULL*/
+    MPI_Win source_win_handle; /* Used in the last RMA operation in an
+                               * epoch in the case of passive target rma
+                               * with shared locks. Otherwise set to NULL*/
+    int data[MPIDI_RMA_IMMED_INTS];
+}
+MPIDI_CH3_Pkt_accum_immed_t;
+
 typedef struct MPIDI_CH3_Pkt_lock
 {
     MPIDI_CH3_Pkt_type_t type;
@@ -276,6 +302,7 @@
     MPIDI_CH3_Pkt_get_t get;
     MPIDI_CH3_Pkt_get_resp_t get_resp;
     MPIDI_CH3_Pkt_accum_t accum;
+    MPIDI_CH3_Pkt_accum_immed_t accum_immed;
     MPIDI_CH3_Pkt_lock_t lock;
     MPIDI_CH3_Pkt_lock_granted_t lock_granted;
     MPIDI_CH3_Pkt_pt_rma_done_t pt_rma_done;    

Modified: mpich2/trunk/src/mpid/ch3/include/mpidpre.h
===================================================================
--- mpich2/trunk/src/mpid/ch3/include/mpidpre.h	2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpid/ch3/include/mpidpre.h	2010-11-06 13:08:15 UTC (rev 7416)
@@ -170,7 +170,9 @@
     int *disp_units;      /* array of displacement units of all windows */\
     MPI_Win *all_win_handles;    /* array of handles to the window objects\
                                           of all processes */            \
-    struct MPIDI_RMA_ops *rma_ops_list; /* list of outstanding RMA requests */  \
+    struct MPIDI_RMA_ops *rma_ops_list_head; /* list of outstanding \
+                                                RMA requests */ \
+    struct MPIDI_RMA_ops *rma_ops_list_tail; \
     volatile int lock_granted;  /* flag to indicate whether lock has     \
                                    been granted to this process (as source) for         \
                                    passive target rma */                 \

Modified: mpich2/trunk/src/mpid/ch3/include/mpidrma.h
===================================================================
--- mpich2/trunk/src/mpid/ch3/include/mpidrma.h	2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpid/ch3/include/mpidrma.h	2010-11-06 13:08:15 UTC (rev 7416)
@@ -28,6 +28,11 @@
 /* for keeping track of RMA ops, which will be executed at the next sync call */
 typedef struct MPIDI_RMA_ops {
     struct MPIDI_RMA_ops *next;  /* pointer to next element in list */
+    /* FIXME: It would be better to setup the packet that will be sent, at 
+       least in most cases (if, as a result of the sync/ops/sync sequence,
+       a different packet type is needed, it can be extracted from the 
+       information otherwise stored). */
+    /* FIXME: Use enum for RMA op type? */
     int type;  /* MPIDI_RMA_PUT, MPID_REQUEST_GET,
 		  MPIDI_RMA_ACCUMULATE, MPIDI_RMA_LOCK */
     void *origin_addr;
@@ -39,6 +44,10 @@
     MPI_Datatype target_datatype;
     MPI_Op op;  /* for accumulate */
     int lock_type;  /* for win_lock */
+    /* Used to complete operations */
+    struct MPID_Request *request;
+    MPIDI_RMA_dtype_info dtype_info;
+    void *dataloop;
 } MPIDI_RMA_ops;
 
 typedef struct MPIDI_PT_single_op {
@@ -59,5 +68,4 @@
     MPIDI_VC_t * vc;
     struct MPIDI_PT_single_op *pt_single_op;  /* to store info for lock-put-unlock optimization */
 } MPIDI_Win_lock_queue;
-
 #endif

Modified: mpich2/trunk/src/mpid/ch3/src/ch3u_handle_recv_pkt.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/src/ch3u_handle_recv_pkt.c	2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpid/ch3/src/ch3u_handle_recv_pkt.c	2010-11-06 13:08:15 UTC (rev 7416)
@@ -585,6 +585,8 @@
 	MPIDI_CH3_PktHandler_LockAccumUnlock;
     pktArray[MPIDI_CH3_PKT_LOCK_GET_UNLOCK] = 
 	MPIDI_CH3_PktHandler_LockGetUnlock;
+    pktArray[MPIDI_CH3_PKT_ACCUM_IMMED] = 
+	MPIDI_CH3_PktHandler_Accumulate_Immed;
     /* End of default RMA operations */
 
  fn_fail:

Modified: mpich2/trunk/src/mpid/ch3/src/ch3u_handle_recv_req.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/src/ch3u_handle_recv_req.c	2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpid/ch3/src/ch3u_handle_recv_req.c	2010-11-06 13:08:15 UTC (rev 7416)
@@ -630,7 +630,7 @@
     if (HANDLE_GET_KIND(rreq->dev.op) == HANDLE_KIND_BUILTIN)
     {
         /* get the function by indexing into the op table */
-        uop = MPIR_Op_table[(rreq->dev.op)%16 - 1];
+        uop = MPIR_Op_table[((rreq->dev.op)&0xf) - 1];
     }
     else
     {
@@ -956,7 +956,7 @@
     if (HANDLE_GET_KIND(single_op->op) == HANDLE_KIND_BUILTIN)
     {
         /* get the function by indexing into the op table */
-        uop = MPIR_Op_table[(single_op->op)%16 - 1];
+        uop = MPIR_Op_table[((single_op->op)&0xf) - 1];
     }
     else
     {

Modified: mpich2/trunk/src/mpid/ch3/src/ch3u_rma_ops.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/src/ch3u_rma_ops.c	2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpid/ch3/src/ch3u_rma_ops.c	2010-11-06 13:08:15 UTC (rev 7416)
@@ -7,6 +7,18 @@
 #include "mpidi_ch3_impl.h"
 #include "mpidrma.h"
 
+static int enableShortACC=1;
+
+MPIU_THREADSAFE_INIT_DECL(initRMAoptions);
+#ifdef USE_MPIU_INSTR
+MPIU_INSTR_DURATION_DECL(wincreate_allgather);
+MPIU_INSTR_DURATION_DECL(winfree_rs);
+MPIU_INSTR_DURATION_DECL(winfree_complete);
+MPIU_INSTR_DURATION_DECL(rmaqueue_alloc);
+extern void MPIDI_CH3_RMA_InitInstr(void);
+#endif
+extern void MPIDI_CH3_RMA_SetAccImmed( int );
+
 #define MPIDI_PASSIVE_TARGET_DONE_TAG  348297
 #define MPIDI_PASSIVE_TARGET_RMA_TAG 563924
 
@@ -18,7 +30,7 @@
 int MPIDI_Win_create(void *base, MPI_Aint size, int disp_unit, MPID_Info *info,
 		     MPID_Comm *comm_ptr, MPID_Win **win_ptr )
 {
-    int mpi_errno=MPI_SUCCESS, i, comm_size, rank;
+    int mpi_errno=MPI_SUCCESS, i, k, comm_size, rank;
     MPI_Aint *tmp_buf;
     MPID_Comm *win_comm_ptr;
     MPIU_CHKPMEM_DECL(4);
@@ -30,6 +42,26 @@
     /* FIXME: There should be no unreferenced args */
     MPIU_UNREFERENCED_ARG(info);
 
+    if(initRMAoptions) {
+	int rc;
+	MPIU_THREADSAFE_INIT_BLOCK_BEGIN(initRMAoptions);
+	/* Default is to enable the use of the immediate accumulate feature */
+	if (!MPL_env2bool( "MPICH_RMA_ACC_IMMED", &rc ))
+	    rc = 1;
+	MPIDI_CH3_RMA_SetAccImmed(rc);
+#ifdef USE_MPIU_INSTR
+    /* Define all instrumentation handles used in the CH3 RMA here */
+	MPIU_INSTR_DURATION_INIT(wincreate_allgather,0,"WIN_CREATE:Allgather");
+	MPIU_INSTR_DURATION_INIT(winfree_rs,0,"WIN_FREE:ReduceScatterBlock");
+	MPIU_INSTR_DURATION_INIT(winfree_complete,0,"WIN_FREE:Complete");
+	MPIU_INSTR_DURATION_INIT(rmaqueue_alloc,0,"Allocate RMA Queue element");
+	MPIDI_CH3_RMA_InitInstr();
+
+#endif    
+	MPIU_THREADSAFE_INIT_CLEAR(initRMAoptions);
+	MPIU_THREADSAFE_INIT_BLOCK_END(initRMAoptions);
+    }
+
     comm_size = comm_ptr->local_size;
     rank = comm_ptr->rank;
     
@@ -46,7 +78,8 @@
     (*win_ptr)->start_group_ptr = NULL; 
     (*win_ptr)->start_assert = 0; 
     (*win_ptr)->attributes = NULL;
-    (*win_ptr)->rma_ops_list = NULL;
+    (*win_ptr)->rma_ops_list_head = NULL;
+    (*win_ptr)->rma_ops_list_tail = NULL;
     (*win_ptr)->lock_granted = 0;
     (*win_ptr)->current_lock_type = MPID_LOCK_NONE;
     (*win_ptr)->shared_lock_ref_cnt = 0;
@@ -56,8 +89,10 @@
     
     mpi_errno = MPIR_Comm_dup_impl(comm_ptr, &win_comm_ptr);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-    (*win_ptr)->comm = win_comm_ptr->handle;
-    
+    (*win_ptr)->comm_ptr   = win_comm_ptr;
+    (*win_ptr)->myrank = rank;
+
+    MPIU_INSTR_DURATION_START(wincreate_allgather);
     /* allocate memory for the base addresses, disp_units, and
        completion counters of all processes */ 
     MPIU_CHKPMEM_MALLOC((*win_ptr)->base_addrs, void **,
@@ -82,20 +117,22 @@
 			mpi_errno, "tmp_buf");
     
     /* FIXME: This needs to be fixed for heterogeneous systems */
-    tmp_buf[3*rank] = MPIU_PtrToAint(base);
+    tmp_buf[3*rank]   = MPIU_PtrToAint(base);
     tmp_buf[3*rank+1] = (MPI_Aint) disp_unit;
     tmp_buf[3*rank+2] = (MPI_Aint) (*win_ptr)->handle;
     
     mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                     tmp_buf, 3 * sizeof(MPI_Aint), MPI_BYTE,
                                     comm_ptr);
+    MPIU_INSTR_DURATION_END(wincreate_allgather);
     if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
-    
+
+    k = 0;
     for (i=0; i<comm_size; i++)
     {
-	(*win_ptr)->base_addrs[i] = MPIU_AintToPtr(tmp_buf[3*i]);
-	(*win_ptr)->disp_units[i] = (int) tmp_buf[3*i+1];
-	(*win_ptr)->all_win_handles[i] = (MPI_Win) tmp_buf[3*i+2];
+	(*win_ptr)->base_addrs[i] = MPIU_AintToPtr(tmp_buf[k++]);
+	(*win_ptr)->disp_units[i] = (int) tmp_buf[k++];
+	(*win_ptr)->all_win_handles[i] = (MPI_Win) tmp_buf[k++];
     }
         
  fn_exit:
@@ -126,18 +163,20 @@
         
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FREE);
         
-    MPID_Comm_get_ptr( (*win_ptr)->comm, comm_ptr );
-
+    comm_ptr = (*win_ptr)->comm_ptr;
+    MPIU_INSTR_DURATION_START(winfree_rs);
     mpi_errno = MPIR_Reduce_scatter_block_impl((*win_ptr)->pt_rma_puts_accs, 
                                                &total_pt_rma_puts_accs, 1, 
                                                MPI_INT, MPI_SUM, comm_ptr);
     if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+    MPIU_INSTR_DURATION_END(winfree_rs);
 
     if (total_pt_rma_puts_accs != (*win_ptr)->my_pt_rma_puts_accs)
     {
 	MPID_Progress_state progress_state;
             
 	/* poke the progress engine until the two are equal */
+	MPIU_INSTR_DURATION_START(winfree_complete);
 	MPID_Progress_start(&progress_state);
 	while (total_pt_rma_puts_accs != (*win_ptr)->my_pt_rma_puts_accs)
 	{
@@ -151,6 +190,7 @@
 	    /* --END ERROR HANDLING-- */
 	}
 	MPID_Progress_end(&progress_state);
+	MPIU_INSTR_DURATION_END(winfree_complete);
     }
 
     
@@ -187,11 +227,10 @@
 {
     int mpi_errno = MPI_SUCCESS;
     int dt_contig, rank, predefined;
-    MPIDI_RMA_ops *curr_ptr, *prev_ptr, *new_ptr;
+    MPIDI_RMA_ops *new_ptr;
     MPID_Datatype *dtp;
     MPI_Aint dt_true_lb;
     MPIDI_msg_sz_t data_sz;
-    MPID_Comm *win_comm_ptr;
     MPIU_CHKPMEM_DECL(1);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_PUT);
         
@@ -205,13 +244,7 @@
 	goto fn_exit;
     }
 
-    /* FIXME: It makes sense to save the rank (and size) of the
-       communicator in the window structure to speed up these operations,
-       or to save a pointer to the communicator structure, rather than
-       just the handle 
-    */
-    MPID_Comm_get_ptr(win_ptr->comm, win_comm_ptr);
-    rank = MPIR_Comm_rank(win_comm_ptr);
+    rank = win_ptr->myrank;
     
     /* If the put is a local operation, do it here */
     if (target_rank == rank)
@@ -223,22 +256,18 @@
     else
     {
 	/* queue it up */
-	curr_ptr = win_ptr->rma_ops_list;
-	prev_ptr = curr_ptr;
-	while (curr_ptr != NULL)
-	{
-	    prev_ptr = curr_ptr;
-	    curr_ptr = curr_ptr->next;
-	}
-
-	/* FIXME: Where does this memory get freed? */
+	/* FIXME: For short operations, should we use a (per-thread) pool? */
+	MPIU_INSTR_DURATION_START(rmaqueue_alloc);
 	MPIU_CHKPMEM_MALLOC(new_ptr, MPIDI_RMA_ops *, sizeof(MPIDI_RMA_ops), 
 			    mpi_errno, "RMA operation entry");
-	if (prev_ptr != NULL)
-	    prev_ptr->next = new_ptr;
-	else 
-	    win_ptr->rma_ops_list = new_ptr;
-	
+	MPIU_INSTR_DURATION_END(rmaqueue_alloc);
+	if (win_ptr->rma_ops_list_tail) 
+	    win_ptr->rma_ops_list_tail->next = new_ptr;
+	else
+	    win_ptr->rma_ops_list_head = new_ptr;
+	win_ptr->rma_ops_list_tail = new_ptr;
+
+	/* FIXME: For contig and very short operations, use a streamlined op */
 	new_ptr->next = NULL;  
 	new_ptr->type = MPIDI_RMA_PUT;
 	new_ptr->origin_addr = origin_addr;
@@ -290,9 +319,8 @@
     MPIDI_msg_sz_t data_sz;
     int dt_contig, rank, predefined;
     MPI_Aint dt_true_lb;
-    MPIDI_RMA_ops *curr_ptr, *prev_ptr, *new_ptr;
+    MPIDI_RMA_ops *new_ptr;
     MPID_Datatype *dtp;
-    MPID_Comm *win_comm_ptr;
     MPIU_CHKPMEM_DECL(1);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_GET);
         
@@ -306,10 +334,7 @@
 	goto fn_exit;
     }
 
-    /* FIXME: It makes sense to save the rank (and size) of the
-       communicator in the window structure to speed up these operations */
-    MPID_Comm_get_ptr(win_ptr->comm, win_comm_ptr);
-    rank = MPIR_Comm_rank(win_comm_ptr);
+    rank = win_ptr->myrank;
     
     /* If the get is a local operation, do it here */
     if (target_rank == rank)
@@ -323,25 +348,17 @@
     else
     {
 	/* queue it up */
-	curr_ptr = win_ptr->rma_ops_list;
-	prev_ptr = curr_ptr;
-	while (curr_ptr != NULL)
-	{
-	    prev_ptr = curr_ptr;
-	    curr_ptr = curr_ptr->next;
-	}
-	
+	MPIU_INSTR_DURATION_START(rmaqueue_alloc);
 	MPIU_CHKPMEM_MALLOC(new_ptr, MPIDI_RMA_ops *, sizeof(MPIDI_RMA_ops), 
 			    mpi_errno, "RMA operation entry");
-	if (prev_ptr != NULL)
-	{
-	    prev_ptr->next = new_ptr;
-	}
+	MPIU_INSTR_DURATION_END(rmaqueue_alloc);
+	if (win_ptr->rma_ops_list_tail) 
+	    win_ptr->rma_ops_list_tail->next = new_ptr;
 	else
-	{
-	    win_ptr->rma_ops_list = new_ptr;
-	}
+	    win_ptr->rma_ops_list_head = new_ptr;
+	win_ptr->rma_ops_list_tail = new_ptr;
             
+	/* FIXME: For contig and very short operations, use a streamlined op */
 	new_ptr->next = NULL;  
 	new_ptr->type = MPIDI_RMA_GET;
 	new_ptr->origin_addr = origin_addr;
@@ -394,9 +411,8 @@
     MPIDI_msg_sz_t data_sz;
     int dt_contig, rank, origin_predefined, target_predefined;
     MPI_Aint dt_true_lb;
-    MPIDI_RMA_ops *curr_ptr, *prev_ptr, *new_ptr;
+    MPIDI_RMA_ops *new_ptr;
     MPID_Datatype *dtp;
-    MPID_Comm *win_comm_ptr;
     MPIU_CHKLMEM_DECL(2);
     MPIU_CHKPMEM_DECL(1);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_ACCUMULATE);
@@ -410,18 +426,13 @@
     {
 	goto fn_exit;
     }
+
+    rank = win_ptr->myrank;
     
-    /* FIXME: It makes sense to save the rank (and size) of the
-       communicator in the window structure to speed up these operations,
-       or to save a pointer to the communicator structure, rather than
-       just the handle 
-    */
-    MPID_Comm_get_ptr(win_ptr->comm, win_comm_ptr);
-    rank = MPIR_Comm_rank(win_comm_ptr);
-    
     MPIDI_CH3I_DATATYPE_IS_PREDEFINED(origin_datatype, origin_predefined);
     MPIDI_CH3I_DATATYPE_IS_PREDEFINED(target_datatype, target_predefined);
 
+    /* Do != rank first (most likely branch?) */
     if (target_rank == rank)
     {
 	MPI_User_function *uop;
@@ -440,7 +451,7 @@
 			     "**opnotpredefined %d", op );
 	
 	/* get the function by indexing into the op table */
-	uop = MPIR_Op_table[(op)%16 - 1];
+	uop = MPIR_Op_table[((op)&0xf) - 1];
 	
 	if (origin_predefined && target_predefined)
 	{    
@@ -524,25 +535,32 @@
     else
     {
 	/* queue it up */
-	curr_ptr = win_ptr->rma_ops_list;
-	prev_ptr = curr_ptr;
-	while (curr_ptr != NULL)
-	{
-	    prev_ptr = curr_ptr;
-	    curr_ptr = curr_ptr->next;
-	}
-	
+	MPIU_INSTR_DURATION_START(rmaqueue_alloc);
 	MPIU_CHKPMEM_MALLOC(new_ptr, MPIDI_RMA_ops *, sizeof(MPIDI_RMA_ops), 
 			    mpi_errno, "RMA operation entry");
-	if (prev_ptr != NULL)
-	{
-	    prev_ptr->next = new_ptr;
-	}
+	MPIU_INSTR_DURATION_END(rmaqueue_alloc);
+	if (win_ptr->rma_ops_list_tail) 
+	    win_ptr->rma_ops_list_tail->next = new_ptr;
 	else
-	{
-	    win_ptr->rma_ops_list = new_ptr;
+	    win_ptr->rma_ops_list_head = new_ptr;
+	win_ptr->rma_ops_list_tail = new_ptr;
+
+	/* If predefined and contiguous, use a simplified element */
+	if (origin_predefined && target_predefined && enableShortACC) {
+	    new_ptr->next = NULL;
+	    new_ptr->type = MPIDI_RMA_ACC_CONTIG;
+	    /* Only the information needed for the contig/predefined acc */
+	    new_ptr->origin_addr = origin_addr;
+	    new_ptr->origin_count = origin_count;
+	    new_ptr->origin_datatype = origin_datatype;
+	    new_ptr->target_rank = target_rank;
+	    new_ptr->target_disp = target_disp;
+	    new_ptr->target_count = target_count;
+	    new_ptr->target_datatype = target_datatype;
+	    new_ptr->op = op;
+	    goto fn_exit;
 	}
-        
+
 	new_ptr->next = NULL;  
 	new_ptr->type = MPIDI_RMA_ACCUMULATE;
 	new_ptr->origin_addr = origin_addr;

Modified: mpich2/trunk/src/mpid/ch3/src/ch3u_rma_sync.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/src/ch3u_rma_sync.c	2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/mpid/ch3/src/ch3u_rma_sync.c	2010-11-06 13:08:15 UTC (rev 7416)
@@ -7,6 +7,63 @@
 #include "mpidimpl.h"
 #include "mpidrma.h"
 
+static int EnableImmedAcc = 1;
+void MPIDI_CH3_RMA_SetAccImmed( int flag )
+{
+    EnableImmedAcc = flag;
+}
+
+#ifdef USE_MPIU_INSTR
+MPIU_INSTR_DURATION_DECL(winfence_clearlock);
+MPIU_INSTR_DURATION_DECL(winfence_rs);
+MPIU_INSTR_DURATION_DECL(winfence_issue);
+MPIU_INSTR_DURATION_DECL(winfence_complete);
+MPIU_INSTR_DURATION_DECL(winfence_wait);
+MPIU_INSTR_DURATION_DECL(winfence_block);
+MPIU_INSTR_DURATION_DECL(winpost_clearlock);
+MPIU_INSTR_DURATION_DECL(winpost_sendsync);
+MPIU_INSTR_DURATION_DECL(winstart_clearlock);
+MPIU_INSTR_DURATION_DECL(wincomplete_issue);
+MPIU_INSTR_DURATION_DECL(wincomplete_complete);
+MPIU_INSTR_DURATION_DECL(wincomplete_recvsync);
+MPIU_INSTR_DURATION_DECL(winwait_wait);
+MPIU_INSTR_DURATION_DECL(winlock_getlocallock);
+MPIU_INSTR_DURATION_DECL(winunlock_getlock);
+MPIU_INSTR_DURATION_DECL(winunlock_issue);
+MPIU_INSTR_DURATION_DECL(winunlock_complete);
+MPIU_INSTR_DURATION_DECL(lockqueue_alloc);
+MPIU_INSTR_DURATION_DECL(rmapkt_acc);
+MPIU_INSTR_DURATION_DECL(rmapkt_acc_predef);
+MPIU_INSTR_DURATION_DECL(rmapkt_acc_immed);
+MPIU_INSTR_DURATION_EXTERN_DECL(rmaqueue_alloc);
+void MPIDI_CH3_RMA_InitInstr(void);
+
+void MPIDI_CH3_RMA_InitInstr(void)
+{
+    MPIU_INSTR_DURATION_INIT(lockqueue_alloc,0,"Allocate Lock Queue element");
+    MPIU_INSTR_DURATION_INIT(winfence_clearlock,1,"WIN_FENCE:Clear prior lock");
+    MPIU_INSTR_DURATION_INIT(winfence_rs,0,"WIN_FENCE:ReduceScatterBlock");
+    MPIU_INSTR_DURATION_INIT(winfence_issue,2,"WIN_FENCE:Issue RMA ops");
+    MPIU_INSTR_DURATION_INIT(winfence_complete,1,"WIN_FENCE:Complete RMA ops");
+    MPIU_INSTR_DURATION_INIT(winfence_wait,1,"WIN_FENCE:Wait for ops from other processes");
+    MPIU_INSTR_DURATION_INIT(winfence_block,0,"WIN_FENCE:Wait for any progress");
+    MPIU_INSTR_DURATION_INIT(winpost_clearlock,1,"WIN_POST:Clear prior lock");
+    MPIU_INSTR_DURATION_INIT(winpost_sendsync,1,"WIN_POST:Senc sync messages");
+    MPIU_INSTR_DURATION_INIT(winstart_clearlock,1,"WIN_START:Clear prior lock");
+    MPIU_INSTR_DURATION_INIT(wincomplete_recvsync,1,"WIN_COMPLETE:Recv sync messages");
+    MPIU_INSTR_DURATION_INIT(wincomplete_issue,2,"WIN_COMPLETE:Issue RMA ops");
+    MPIU_INSTR_DURATION_INIT(wincomplete_complete,1,"WIN_COMPLETE:Complete RMA ops");
+    MPIU_INSTR_DURATION_INIT(winwait_wait,1,"WIN_WAIT:Wait for ops from other processes");
+    MPIU_INSTR_DURATION_INIT(winlock_getlocallock,0,"WIN_LOCK:Get local lock");
+    MPIU_INSTR_DURATION_INIT(winunlock_issue,2,"WIN_UNLOCK:Issue RMA ops");
+    MPIU_INSTR_DURATION_INIT(winunlock_complete,1,"WIN_UNLOCK:Complete RMA ops");
+    MPIU_INSTR_DURATION_INIT(winunlock_getlock,0,"WIN_UNLOCK:Acquire lock");
+    MPIU_INSTR_DURATION_INIT(rmapkt_acc,0,"RMA:PKTHANDLER for Accumulate");
+    MPIU_INSTR_DURATION_INIT(rmapkt_acc_predef,0,"RMA:PKTHANDLER for Accumulate: predef dtype");
+    MPIU_INSTR_DURATION_INIT(rmapkt_acc_immed,0,"RMA:PKTHANDLER for Accum immed");
+}
+#endif
+
 /*
  * These routines provide a default implementation of the MPI RMA operations
  * in terms of the low-level, two-sided channel operations.  A channel
@@ -31,6 +88,11 @@
 				   MPI_Win target_win_handle, 
 				   MPIDI_RMA_dtype_info * dtype_info, 
 				   void ** dataloop, MPID_Request ** request); 
+static int MPIDI_CH3I_Send_contig_acc_msg(MPIDI_RMA_ops * rma_op, 
+					  MPID_Win * win_ptr, 
+					  MPI_Win source_win_handle, 
+					  MPI_Win target_win_handle, 
+					  MPID_Request ** request);
 static int MPIDI_CH3I_Do_passive_target_rma(MPID_Win *win_ptr, 
 					    int *wait_for_rma_done_pkt);
 static int MPIDI_CH3I_Send_lock_put_or_acc(MPID_Win *win_ptr);
@@ -48,16 +110,13 @@
 int MPIDI_Win_fence(int assert, MPID_Win *win_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
-    int comm_size, done;
+    int comm_size;
     int *rma_target_proc, *nops_to_proc, i, total_op_count, *curr_ops_cnt;
-    MPIDI_RMA_ops *curr_ptr, *next_ptr;
+    MPIDI_RMA_ops *curr_ptr;
     MPID_Comm *comm_ptr;
-    MPID_Request **requests=NULL; /* array of requests */
     MPI_Win source_win_handle, target_win_handle;
-    MPIDI_RMA_dtype_info *dtype_infos=NULL;
-    void **dataloops=NULL;    /* to store dataloops for each datatype */
     MPID_Progress_state progress_state;
-    MPIU_CHKLMEM_DECL(6);
+    MPIU_CHKLMEM_DECL(3);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FENCE);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FENCE);
@@ -71,6 +130,7 @@
      * have completed and the lock is released. */
     if (win_ptr->current_lock_type != MPID_LOCK_NONE)
     {
+	MPIU_INSTR_DURATION_START(winfence_clearlock);
 	MPID_Progress_start(&progress_state);
 	while (win_ptr->current_lock_type != MPID_LOCK_NONE)
 	{
@@ -82,9 +142,10 @@
 		MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
 	    }
 	    /* --END ERROR HANDLING-- */
-
+	    MPIU_INSTR_DURATION_INCR(winfence_clearlock,0,1);
 	}
 	MPID_Progress_end(&progress_state);
+	MPIU_INSTR_DURATION_END(winfence_clearlock);
     }
 
     /* Note that the NOPRECEDE and NOSUCCEED must be specified by all processes
@@ -108,10 +169,10 @@
     }
     else
     {
+	MPIDI_RMA_ops **prevNextPtr, *tmpptr;
+	MPIU_INSTR_DURATION_START(winfence_rs);
 	/* This is the second or later fence. Do all the preceding RMA ops. */
-	
-	MPID_Comm_get_ptr( win_ptr->comm, comm_ptr );
-	
+	comm_ptr = win_ptr->comm_ptr;
 	/* First inform every process whether it is a target of RMA
 	   ops from this process */
 	comm_size = comm_ptr->local_size;
@@ -131,7 +192,7 @@
 	/* set rma_target_proc[i] to 1 if rank i is a target of RMA
 	   ops from this process */
 	total_op_count = 0;
-	curr_ptr = win_ptr->rma_ops_list;
+	curr_ptr = win_ptr->rma_ops_list_head;
 	while (curr_ptr != NULL)
 	{
 	    total_op_count++;
@@ -143,22 +204,8 @@
 	MPIU_CHKLMEM_MALLOC(curr_ops_cnt, int *, comm_size*sizeof(int),
 			    mpi_errno, "curr_ops_cnt");
 	for (i=0; i<comm_size; i++) curr_ops_cnt[i] = 0;
-	
-	if (total_op_count != 0)
-	{
-	    MPIU_CHKLMEM_MALLOC(requests, MPID_Request **, 
-				total_op_count*sizeof(MPID_Request*),
-				mpi_errno, "requests");
-	    MPIU_CHKLMEM_MALLOC(dtype_infos, MPIDI_RMA_dtype_info *, 
-				total_op_count*sizeof(MPIDI_RMA_dtype_info),
-				mpi_errno, "dtype_infos");
-	    MPIU_CHKLMEM_MALLOC(dataloops, void **, 
-				total_op_count*sizeof(void*),
-				mpi_errno, "dataloops");
-	    for (i=0; i<total_op_count; i++) dataloops[i] = NULL;
-	}
-	
-	/* do a reduce_scatter_block (with MPI_SUM) on rma_target_proc. As a result,
+	/* do a reduce_scatter_block (with MPI_SUM) on rma_target_proc. 
+	   As a result,
 	   each process knows how many other processes will be doing
 	   RMA ops on its window */  
             
@@ -167,6 +214,7 @@
             
 	mpi_errno = MPIR_Reduce_scatter_block_impl(MPI_IN_PLACE, rma_target_proc, 1,
                                                    MPI_INT, MPI_SUM, comm_ptr);
+	MPIU_INSTR_DURATION_END(winfence_rs);
 	/* result is stored in rma_target_proc[0] */
 	if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
 
@@ -175,9 +223,13 @@
 	   procs have the address and could decrement it. */
 	win_ptr->my_counter = win_ptr->my_counter - comm_size + 
 	    rma_target_proc[0];  
-            
+
+	MPIU_INSTR_DURATION_START(winfence_issue);
+	MPIU_INSTR_DURATION_INCR(winfence_issue,0,total_op_count);
+	MPIU_INSTR_DURATION_MAX(winfence_issue,1,total_op_count);
 	i = 0;
-	curr_ptr = win_ptr->rma_ops_list;
+	curr_ptr = win_ptr->rma_ops_list_head;
+	prevNextPtr = &win_ptr->rma_ops_list_head;
 	while (curr_ptr != NULL)
 	{
 	    /* The completion counter at the target is decremented only on 
@@ -191,22 +243,28 @@
 		source_win_handle = MPI_WIN_NULL;
 	    
 	    target_win_handle = win_ptr->all_win_handles[curr_ptr->target_rank];
-	    
+
+	    curr_ptr->dataloop = 0;
 	    switch (curr_ptr->type)
 	    {
 	    case (MPIDI_RMA_PUT):
 	    case (MPIDI_RMA_ACCUMULATE):
 		mpi_errno = MPIDI_CH3I_Send_rma_msg(curr_ptr, win_ptr,
 					source_win_handle, target_win_handle, 
-					&dtype_infos[i],
-					&dataloops[i], &requests[i]);
+					&curr_ptr->dtype_info,
+					&curr_ptr->dataloop, &curr_ptr->request);
 		if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
 		break;
+	    case MPIDI_RMA_ACC_CONTIG:
+		mpi_errno = MPIDI_CH3I_Send_contig_acc_msg(curr_ptr, win_ptr,
+				   source_win_handle, target_win_handle, 
+				   &curr_ptr->request );
+		break;
 	    case (MPIDI_RMA_GET):
 		mpi_errno = MPIDI_CH3I_Recv_rma_msg(curr_ptr, win_ptr,
 					source_win_handle, target_win_handle, 
-					&dtype_infos[i], 
-					&dataloops[i], &requests[i]);
+					&curr_ptr->dtype_info, 
+					&curr_ptr->dataloop, &curr_ptr->request);
 		if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
 		break;
 	    default:
@@ -214,87 +272,119 @@
 	    }
 	    i++;
 	    curr_ops_cnt[curr_ptr->target_rank]++;
-	    curr_ptr = curr_ptr->next;
+	    /* If the request is null, we can remove it immediately */
+	    if (!curr_ptr->request) {
+		if (curr_ptr->dataloop != NULL) {
+		    MPIU_Free(curr_ptr->dataloop); /* allocated in send_rma_msg or 
+						      recv_rma_msg */
+		}
+		tmpptr       = curr_ptr->next;
+		*prevNextPtr = tmpptr;
+		MPIU_Free( curr_ptr );
+		curr_ptr     = tmpptr;
+	    }
+	    else  {
+		curr_ptr    = curr_ptr->next;
+		prevNextPtr = &curr_ptr->next;
+		/* FIXME: We could at least occasionally try to wait
+		   on completion of the pending send requests rather than
+		   focus on filling the queues.  */
+	    }
 	}
-	
-            
+	MPIU_INSTR_DURATION_END(winfence_issue);
+
+	/* We replaced a loop over an array of requests with a list of the
+	   incomplete requests.  The reason to do 
+	   that is for long lists - processing the entire list until
+	   all are done introduces a potentially n^2 time.  In 
+	   testing with test/mpi/perf/manyrma.c , the number of iterations
+	   within the "while (total_op_count)" loop was O(total_op_count).
+	   
+	   Another alternative is to create a more compressed list (storing
+	   only the necessary information, reducing the number of cache lines
+	   needed while looping through the requests).
+	*/
 	if (total_op_count)
 	{ 
-	    done = 1;
+	    int ntimes = 0;
+	    MPIU_INSTR_DURATION_START(winfence_complete);
 	    MPID_Progress_start(&progress_state);
-	    while (total_op_count)
-	    {
-		for (i=0; i<total_op_count; i++)
-		{
-		    if (requests[i] != NULL)
-		    {
-			if (!MPID_Request_is_complete(requests[i]))
-			{
-			    done = 0;
-			    break;
-			}
-			else
-			{
-			    mpi_errno = requests[i]->status.MPI_ERROR;
+	    /* Process all operations until they are complete */
+	    while (win_ptr->rma_ops_list_head) {
+		int loopcount = 0;
+		prevNextPtr = &win_ptr->rma_ops_list_head;
+		ntimes++;
+		curr_ptr = win_ptr->rma_ops_list_head;
+		do {
+		    if (MPID_Request_is_complete(curr_ptr->request)) {
+			/* Once we find a complete request, we complete
+			 as many as possible until we find an incomplete
+			 or null request */
+			do {
+			    mpi_errno = curr_ptr->request->status.MPI_ERROR;
 			    /* --BEGIN ERROR HANDLING-- */
-			    if (mpi_errno != MPI_SUCCESS)
-			    {
+			    if (mpi_errno != MPI_SUCCESS) {
 				MPID_Progress_end(&progress_state);
 				MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winRMAmessage");
 			    }
 			    /* --END ERROR HANDLING-- */
-			    /* if origin datatype was a derived
-			       datatype, it will get freed when the
-			       request gets freed. */ 
-			    MPID_Request_release(requests[i]);
-			    requests[i] = NULL;
+			    MPID_Request_release(curr_ptr->request);
+			    if (curr_ptr->dataloop != NULL) {
+				MPIU_Free(curr_ptr->dataloop); /* allocated in send_rma_msg or 
+								  recv_rma_msg */
+			    }
+			    /* We can remove and free this rma op element */
+			    tmpptr       = curr_ptr->next;
+			    *prevNextPtr = tmpptr;
+			    MPIU_Free( curr_ptr );
+			    curr_ptr     = tmpptr;
 			}
+			while (curr_ptr &&
+			       MPID_Request_is_complete(curr_ptr->request));
+			/* Once a request completes, we wait for another
+			   operation to arrive rather than check the
+			   rest of the requests.  */
+			break;
 		    }
+		    else {
+			/* In many cases, if the list of pending requests
+			   is long, there's no point in checking the entire
+			   list */
+			if (loopcount++ > 4) /* FIXME: threshold as parameter */
+			    break;  /* wait for an event */
+			prevNextPtr = &curr_ptr->next;
+			curr_ptr    = curr_ptr->next;
+		    }
+		} while (curr_ptr);
+         
+		/* Wait for something to arrive*/
+		/* In some tests, this hung unless the test ensured that 
+		   there was an incomplete request. */
+		curr_ptr = win_ptr->rma_ops_list_head;
+		if (curr_ptr && !MPID_Request_is_complete(curr_ptr->request) ) {
+		    MPIU_INSTR_DURATION_START(winfence_block);
+		    mpi_errno = MPID_Progress_wait(&progress_state);
+		    /* --BEGIN ERROR HANDLING-- */
+		    if (mpi_errno != MPI_SUCCESS) {
+			MPID_Progress_end(&progress_state);
+			MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
+		    }
+		    /* --END ERROR HANDLING-- */
+		    MPIU_INSTR_DURATION_END(winfence_block);
 		}
-                    
-		if (done)
-		{
-		    break;
-		}
-                    
-		mpi_errno = MPID_Progress_wait(&progress_state);
-		/* --BEGIN ERROR HANDLING-- */
-		if (mpi_errno != MPI_SUCCESS) {
-		    MPID_Progress_end(&progress_state);
-		    MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
-		}
-		/* --END ERROR HANDLING-- */
-		
-		done = 1;
-	    } 
+	    } /* While list of rma operation is non-empty */
 	    MPID_Progress_end(&progress_state);
+	    MPIU_INSTR_DURATION_INCR(winfence_complete,0,ntimes);
+	    MPIU_INSTR_DURATION_END(winfence_complete);
 	}
             
-	if (total_op_count != 0)
-	{
-	    for (i=0; i<total_op_count; i++)
-	    {
-		if (dataloops[i] != NULL)
-		{
-		    MPIU_Free(dataloops[i]); /* allocated in send_rma_msg or 
-						recv_rma_msg */
-		}
-	    }
-	}
+	win_ptr->rma_ops_list_head = NULL;
+	win_ptr->rma_ops_list_tail = NULL;
 	
-	/* free MPIDI_RMA_ops_list */
-	curr_ptr = win_ptr->rma_ops_list;
-	while (curr_ptr != NULL)
-	{
-	    next_ptr = curr_ptr->next;
-	    MPIU_Free(curr_ptr);
-	    curr_ptr = next_ptr;
-	}
-	win_ptr->rma_ops_list = NULL;
-	
 	/* wait for all operations from other processes to finish */
 	if (win_ptr->my_counter)
 	{
+	    MPIU_INSTR_DURATION_START(winfence_wait);
 	    MPID_Progress_start(&progress_state);
 	    while (win_ptr->my_counter)
 	    {
@@ -305,8 +395,10 @@
 		    MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
 		}
 		/* --END ERROR HANDLING-- */
+		MPIU_INSTR_DURATION_INCR(winfence_wait,0,1);
 	    }
 	    MPID_Progress_end(&progress_state);
+	    MPIU_INSTR_DURATION_END(winfence_wait);
 	} 
 	
 	if (assert & MPI_MODE_NOSUCCEED)
@@ -470,7 +562,7 @@
           fflush(stdout);
     */
 
-    MPID_Comm_get_ptr(win_ptr->comm, comm_ptr);
+    comm_ptr = win_ptr->comm_ptr;
     MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
 
     MPIDI_CH3I_DATATYPE_IS_PREDEFINED(rma_op->origin_datatype, predefined);
@@ -641,8 +733,120 @@
     /* --END ERROR HANDLING-- */
 }
 
+/*
+ * Use this for contiguous accumulate operations
+ */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_Send_contig_acc_msg
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int MPIDI_CH3I_Send_contig_acc_msg(MPIDI_RMA_ops *rma_op, 
+					  MPID_Win *win_ptr,
+					  MPI_Win source_win_handle, 
+					  MPI_Win target_win_handle, 
+					  MPID_Request **request) 
+{
+    MPIDI_CH3_Pkt_t upkt;
+    MPIDI_CH3_Pkt_accum_t *accum_pkt = &upkt.accum;
+    MPID_IOV iov[MPID_IOV_LIMIT];
+    int mpi_errno=MPI_SUCCESS;
+    int origin_type_size, iovcnt; 
+    MPIDI_VC_t * vc;
+    MPID_Comm *comm_ptr;
+    int len;
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SEND_CONTIG_ACC_MSG);
+    MPIDI_STATE_DECL(MPID_STATE_MEMCPY);
 
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SEND_CONTIG_ACC_MSG);
 
+    *request = NULL;
+
+    MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
+    /* FIXME: Make this size check efficient and match the packet type */
+    len = rma_op->origin_count * origin_type_size;
+    if (EnableImmedAcc && len <= MPIDI_RMA_IMMED_INTS*sizeof(int)) {
+	MPIDI_CH3_Pkt_accum_immed_t * accumi_pkt = &upkt.accum_immed;
+	void *dest = accumi_pkt->data, *src = rma_op->origin_addr;
+	
+	MPIDI_Pkt_init(accumi_pkt, MPIDI_CH3_PKT_ACCUM_IMMED);
+	accumi_pkt->addr = (char *) win_ptr->base_addrs[rma_op->target_rank] +
+	    win_ptr->disp_units[rma_op->target_rank] * rma_op->target_disp;
+	accumi_pkt->count = rma_op->target_count;
+	accumi_pkt->datatype = rma_op->target_datatype;
+	accumi_pkt->op = rma_op->op;
+	accumi_pkt->target_win_handle = target_win_handle;
+	accumi_pkt->source_win_handle = source_win_handle;
+	
+	switch (len) {
+	case 1: *(uint8_t *)dest  = *(uint8_t *)src;  break;
+	case 2: *(uint16_t *)dest = *(uint16_t *)src; break;
+	case 4: *(uint32_t *)dest = *(uint32_t *)src; break;
+	case 8: *(uint64_t *)dest = *(uint64_t *)src; break;
+	default:
+	    MPIU_Memcpy( accumi_pkt->data, (void *)rma_op->origin_addr, len );
+	}
+	comm_ptr = win_ptr->comm_ptr;
+	MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
+	MPIU_THREAD_CS_ENTER(CH3COMM,vc);
+	mpi_errno = MPIU_CALL(MPIDI_CH3,iStartMsg(vc, accumi_pkt, sizeof(*accumi_pkt), request));
+	MPIU_THREAD_CS_EXIT(CH3COMM,vc);
+	MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+	goto fn_exit;
+    }
+
+    MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
+    accum_pkt->addr = (char *) win_ptr->base_addrs[rma_op->target_rank] +
+	win_ptr->disp_units[rma_op->target_rank] * rma_op->target_disp;
+    accum_pkt->count = rma_op->target_count;
+    accum_pkt->datatype = rma_op->target_datatype;
+    accum_pkt->dataloop_size = 0;
+    accum_pkt->op = rma_op->op;
+    accum_pkt->target_win_handle = target_win_handle;
+    accum_pkt->source_win_handle = source_win_handle;
+    
+    iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) accum_pkt;
+    iov[0].MPID_IOV_LEN = sizeof(*accum_pkt);
+
+    /*    printf("send pkt: type %d, addr %d, count %d, base %d\n", rma_pkt->type,
+          rma_pkt->addr, rma_pkt->count, win_ptr->base_addrs[rma_op->target_rank]);
+          fflush(stdout);
+    */
+
+    comm_ptr = win_ptr->comm_ptr;
+    MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
+
+
+    /* basic datatype on target */
+    /* basic datatype on origin */
+    /* FIXME: This is still very heavyweight for a small message operation,
+       such as a single word update */
+    /* One possibility is to use iStartMsg with a buffer that is just large 
+       enough, though note that nemesis has an optimization for this */
+    iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)rma_op->origin_addr;
+    iov[1].MPID_IOV_LEN = rma_op->origin_count * origin_type_size;
+    iovcnt = 2;
+    MPIU_THREAD_CS_ENTER(CH3COMM,vc);
+    mpi_errno = MPIU_CALL(MPIDI_CH3,iStartMsgv(vc, iov, iovcnt, request));
+    MPIU_THREAD_CS_EXIT(CH3COMM,vc);
+    MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+
+ fn_exit:
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SEND_CONTIG_ACC_MSG);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+ fn_fail:
+    if (*request)
+    {
+        MPIU_Object_set_ref(*request, 0);
+        MPIDI_CH3_Request_destroy(*request);
+    }
+    *request = NULL;
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+
+
 #undef FUNCNAME
 #define FUNCNAME MPIDI_CH3I_Recv_rma_msg
 #undef FCNAME
@@ -708,7 +912,7 @@
     fflush(stdout);
 */
 	    
-    MPID_Comm_get_ptr(win_ptr->comm, comm_ptr);
+    comm_ptr = win_ptr->comm_ptr;
     MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
 
     MPIDI_CH3I_DATATYPE_IS_PREDEFINED(rma_op->target_datatype, predefined);
@@ -790,8 +994,6 @@
 }
 
 
-
-
 #undef FUNCNAME
 #define FUNCNAME MPIDI_Win_post
 #undef FCNAME
@@ -824,6 +1026,7 @@
     {
 	MPID_Progress_state progress_state;
 	
+	MPIU_INSTR_DURATION_START(winpost_clearlock);
 	/* poke the progress engine */
 	MPID_Progress_start(&progress_state);
 	while (win_ptr->current_lock_type != MPID_LOCK_NONE)
@@ -835,8 +1038,10 @@
 		MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
 	    }
 	    /* --END ERROR HANDLING-- */
+	    MPIU_INSTR_DURATION_INCR(winpost_clearlock,0,1);
 	}
 	MPID_Progress_end(&progress_state);
+	MPIU_INSTR_DURATION_END(winpost_clearlock);
     }
         
     post_grp_size = post_grp_ptr->size;
@@ -848,12 +1053,14 @@
     {
         MPI_Request *req;
         MPI_Status *status;
+
+	MPIU_INSTR_DURATION_START(winpost_sendsync);
  
 	/* NOCHECK not specified. We need to notify the source
 	   processes that Post has been called. */  
 	
 	/* We need to translate the ranks of the processes in
-	   post_group to ranks in win_ptr->comm, so that we
+	   post_group to ranks in win_ptr->comm_ptr, so that we
 	   can do communication */
             
 	MPIU_CHKLMEM_MALLOC(ranks_in_post_grp, int *, 
@@ -868,7 +1075,7 @@
 	    ranks_in_post_grp[i] = i;
 	}
         
-        MPID_Comm_get_ptr( win_ptr->comm, win_comm_ptr );
+	win_comm_ptr = win_ptr->comm_ptr;
 
         mpi_errno = MPIR_Comm_group_impl(win_comm_ptr, &win_grp_ptr);
 	if (mpi_errno) MPIU_ERR_POP(mpi_errno);
@@ -877,15 +1084,20 @@
 	MPIR_Group_translate_ranks_impl(post_grp_ptr, post_grp_size, ranks_in_post_grp,
                                         win_grp_ptr, ranks_in_win_grp);
 	
-        rank = MPIR_Comm_rank(win_comm_ptr);
+        rank = win_ptr->myrank;
 	
 	MPIU_CHKLMEM_MALLOC(req, MPI_Request *, post_grp_size * sizeof(MPI_Request), mpi_errno, "req");
         MPIU_CHKLMEM_MALLOC(status, MPI_Status *, post_grp_size*sizeof(MPI_Status), mpi_errno, "status");
 
 	/* Send a 0-byte message to the source processes */
+	MPIU_INSTR_DURATION_INCR(winpost_sendsync,0,post_grp_size);
 	for (i = 0; i < post_grp_size; i++) {
 	    dst = ranks_in_win_grp[i];
-	    
+
+	    /* FIXME: Short messages like this shouldn't normally need a 
+	       request - this should consider using the ch3 call to send
+	       a short message and return a request only if the message is
+	       not delivered. */
 	    if (dst != rank) {
                 MPID_Request *req_ptr;
 		mpi_errno = MPID_Isend(&i, 0, MPI_INT, dst, SYNC_POST_TAG, win_comm_ptr,
@@ -912,6 +1124,7 @@
 
         mpi_errno = MPIR_Group_free_impl(win_grp_ptr);
 	if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+	MPIU_INSTR_DURATION_END(winpost_sendsync);
     }
 
  fn_exit:
@@ -954,6 +1167,7 @@
     {
 	MPID_Progress_state progress_state;
 	
+	MPIU_INSTR_DURATION_START(winstart_clearlock);
 	/* poke the progress engine */
 	MPID_Progress_start(&progress_state);
 	while (win_ptr->current_lock_type != MPID_LOCK_NONE)
@@ -965,8 +1179,10 @@
 		MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
 	    }
 	    /* --END ERROR HANDLING-- */
+	    MPIU_INSTR_DURATION_INCR(winstart_clearlock,0,1);
 	}
 	MPID_Progress_end(&progress_state);
+	MPIU_INSTR_DURATION_END(winstart_clearlock);
     }
     
     win_ptr->start_group_ptr = group_ptr;
@@ -989,12 +1205,9 @@
     int mpi_errno = MPI_SUCCESS;
     int comm_size, *nops_to_proc, src, new_total_op_count;
     int i, j, dst, done, total_op_count, *curr_ops_cnt;
-    MPIDI_RMA_ops *curr_ptr, *next_ptr;
+    MPIDI_RMA_ops *curr_ptr, *tmpptr, **prevNextPtr;
     MPID_Comm *comm_ptr;
-    MPID_Request **requests; /* array of requests */
     MPI_Win source_win_handle, target_win_handle;
-    MPIDI_RMA_dtype_info *dtype_infos=NULL;
-    void **dataloops=NULL;    /* to store dataloops for each datatype */
     MPID_Group *win_grp_ptr;
     int start_grp_size, *ranks_in_start_grp, *ranks_in_win_grp, rank;
     MPIU_CHKLMEM_DECL(9);
@@ -1002,14 +1215,15 @@
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_COMPLETE);
 
-    MPID_Comm_get_ptr( win_ptr->comm, comm_ptr );
+    comm_ptr = win_ptr->comm_ptr;
     comm_size = comm_ptr->local_size;
         
     /* Translate the ranks of the processes in
-       start_group to ranks in win_ptr->comm */
+       start_group to ranks in win_ptr->comm_ptr */
     
     start_grp_size = win_ptr->start_group_ptr->size;
-        
+
+    MPIU_INSTR_DURATION_START(wincomplete_recvsync);
     MPIU_CHKLMEM_MALLOC(ranks_in_start_grp, int *, start_grp_size*sizeof(int), 
 			mpi_errno, "ranks_in_start_grp");
         
@@ -1024,10 +1238,11 @@
     mpi_errno = MPIR_Comm_group_impl(comm_ptr, &win_grp_ptr);
     if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
 
-    MPIR_Group_translate_ranks_impl(win_ptr->start_group_ptr, start_grp_size, ranks_in_start_grp,
+    MPIR_Group_translate_ranks_impl(win_ptr->start_group_ptr, start_grp_size, 
+				    ranks_in_start_grp,
                                     win_grp_ptr, ranks_in_win_grp);
         
-    rank = MPIR_Comm_rank(comm_ptr);
+    rank = win_ptr->myrank;
 
     /* If MPI_MODE_NOCHECK was not specified, we need to check if
        Win_post was called on the target processes. Wait for a 0-byte sync
@@ -1036,14 +1251,19 @@
     {
         MPI_Request *req;
         MPI_Status *status;
-        
+
         MPIU_CHKLMEM_MALLOC(req, MPI_Request *, start_grp_size*sizeof(MPI_Request), mpi_errno, "req");
         MPIU_CHKLMEM_MALLOC(status, MPI_Status *, start_grp_size*sizeof(MPI_Status), mpi_errno, "status");
 
+	MPIU_INSTR_DURATION_INCR(wincomplete_recvsync,0,start_grp_size);
 	for (i = 0; i < start_grp_size; i++) {
 	    src = ranks_in_win_grp[i];
 	    if (src != rank) {
                 MPID_Request *req_ptr;
+		/* FIXME: This is a heavyweight way to process these sync 
+		   messages - this should be handled with a special packet
+		   type and callback function.
+		*/
                 mpi_errno = MPID_Irecv(NULL, 0, MPI_INT, src, SYNC_POST_TAG,
                                        comm_ptr, MPID_CONTEXT_INTRA_PT2PT, &req_ptr);
 		if (mpi_errno) MPIU_ERR_POP(mpi_errno);
@@ -1067,28 +1287,31 @@
         }
         /* --END ERROR HANDLING-- */
     }
-        
+    MPIU_INSTR_DURATION_END(wincomplete_recvsync);
+
     /* keep track of no. of ops to each proc. Needed for knowing
        whether or not to decrement the completion counter. The
        completion counter is decremented only on the last
        operation. */
-        
+
+    MPIU_INSTR_DURATION_START(wincomplete_issue);
+
     MPIU_CHKLMEM_MALLOC(nops_to_proc, int *, comm_size*sizeof(int), 
 			mpi_errno, "nops_to_proc");
     for (i=0; i<comm_size; i++) nops_to_proc[i] = 0;
 
     total_op_count = 0;
-    curr_ptr = win_ptr->rma_ops_list;
+    curr_ptr = win_ptr->rma_ops_list_head;
     while (curr_ptr != NULL)
     {
 	nops_to_proc[curr_ptr->target_rank]++;
 	total_op_count++;
 	curr_ptr = curr_ptr->next;
     }
-    
-    MPIU_CHKLMEM_MALLOC(requests, MPID_Request **, 
-			(total_op_count+start_grp_size) * sizeof(MPID_Request*),
-			mpi_errno, "requests");
+
+    MPIU_INSTR_DURATION_INCR(wincomplete_issue,0,total_op_count);
+    MPIU_INSTR_DURATION_MAX(wincomplete_issue,1,total_op_count);
+
     /* We allocate a few extra requests because if there are no RMA
        ops to a target process, we need to send a 0-byte message just
        to decrement the completion counter. */
@@ -1096,19 +1319,10 @@
     MPIU_CHKLMEM_MALLOC(curr_ops_cnt, int *, comm_size*sizeof(int),
 			mpi_errno, "curr_ops_cnt");
     for (i=0; i<comm_size; i++) curr_ops_cnt[i] = 0;
-    
-    if (total_op_count != 0)
-    {
-	MPIU_CHKLMEM_MALLOC(dtype_infos, MPIDI_RMA_dtype_info *, 
-			    total_op_count*sizeof(MPIDI_RMA_dtype_info),
-			    mpi_errno, "dtype_infos");
-	MPIU_CHKLMEM_MALLOC(dataloops, void **, total_op_count*sizeof(void*),
-			    mpi_errno, "dataloops");
-	for (i=0; i<total_op_count; i++) dataloops[i] = NULL;
-    }
         
     i = 0;
-    curr_ptr = win_ptr->rma_ops_list;
+    prevNextPtr = &win_ptr->rma_ops_list_head;
+    curr_ptr = win_ptr->rma_ops_list_head;
     while (curr_ptr != NULL)
     {
 	/* The completion counter at the target is decremented only on 
@@ -1122,22 +1336,28 @@
 	    source_win_handle = MPI_WIN_NULL;
 	
 	target_win_handle = win_ptr->all_win_handles[curr_ptr->target_rank];
-	
+
+	curr_ptr->dataloop = 0;
 	switch (curr_ptr->type)
 	{
 	case (MPIDI_RMA_PUT):
 	case (MPIDI_RMA_ACCUMULATE):
 	    mpi_errno = MPIDI_CH3I_Send_rma_msg(curr_ptr, win_ptr,
 				source_win_handle, target_win_handle, 
-				&dtype_infos[i],
-				&dataloops[i], &requests[i]); 
+				&curr_ptr->dtype_info,
+				&curr_ptr->dataloop, &curr_ptr->request); 
 	    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
 	    break;
+	case MPIDI_RMA_ACC_CONTIG:
+	    mpi_errno = MPIDI_CH3I_Send_contig_acc_msg(curr_ptr, win_ptr,
+				       source_win_handle, target_win_handle, 
+				       &curr_ptr->request );
+	    break;
 	case (MPIDI_RMA_GET):
 	    mpi_errno = MPIDI_CH3I_Recv_rma_msg(curr_ptr, win_ptr,
 				source_win_handle, target_win_handle, 
-				&dtype_infos[i], 
-				&dataloops[i], &requests[i]);
+				&curr_ptr->dtype_info, 
+				&curr_ptr->dataloop, &curr_ptr->request);
 	    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
 	    break;
 	default:
@@ -1145,8 +1365,23 @@
 	}
 	i++;
 	curr_ops_cnt[curr_ptr->target_rank]++;
-	curr_ptr = curr_ptr->next;
+	/* If the request is null, we can remove it immediately */
+	if (!curr_ptr->request) {
+	    if (curr_ptr->dataloop != NULL) {
+		MPIU_Free(curr_ptr->dataloop); /* allocated in send_rma_msg or 
+						  recv_rma_msg */
+	    }
+	    tmpptr       = curr_ptr->next;
+	    *prevNextPtr = tmpptr;
+	    MPIU_Free( curr_ptr );
+	    curr_ptr     = tmpptr;
+	}
+	else  {
+	    curr_ptr    = curr_ptr->next;
+	    prevNextPtr = &curr_ptr->next;
+	}
     }
+    MPIU_INSTR_DURATION_END(wincomplete_issue);
         
     /* If the start_group included some processes that did not end up
        becoming targets of  RMA operations from this process, we need
@@ -1167,6 +1402,7 @@
 	    MPIDI_CH3_Pkt_t upkt;
 	    MPIDI_CH3_Pkt_put_t *put_pkt = &upkt.put;
 	    MPIDI_VC_t * vc;
+	    MPID_Request *request;
 	    
 	    MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
 	    put_pkt->addr = NULL;
@@ -1180,80 +1416,111 @@
 	    MPIU_THREAD_CS_ENTER(CH3COMM,vc);
 	    mpi_errno = MPIU_CALL(MPIDI_CH3,iStartMsg(vc, put_pkt,
 						      sizeof(*put_pkt),
-						      &requests[j]));
+						      &request));
 	    MPIU_THREAD_CS_EXIT(CH3COMM,vc);
 	    if (mpi_errno != MPI_SUCCESS) {
 		MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|rmamsg" );
 	    }
+	    /* In the unlikely event that a request is returned (the message
+	       is not sent yet), add it to the list of pending operations */
+	    if (request) {
+		/* It's hard to use the automatic allocator here, as those 
+		   macros are optimized for a known maximum number of items. */
+		MPIDI_RMA_ops *new_ptr;
+		new_ptr = (MPIDI_RMA_ops *)MPIU_Malloc(sizeof(MPIDI_RMA_ops) );
+		/* --BEGIN ERROR HANDLING-- */
+		if (!new_ptr) {
+		    MPIU_CHKMEM_SETERR(mpi_errno,sizeof(MPIDI_RMA_ops),
+					"RMA operation entry");
+		    goto fn_fail;
+		}
+		/* --END ERROR HANDLING-- */
+		if (win_ptr->rma_ops_list_tail) 
+		    win_ptr->rma_ops_list_tail->next = new_ptr;
+		else
+		    win_ptr->rma_ops_list_head = new_ptr;
+		win_ptr->rma_ops_list_tail = new_ptr;
+		new_ptr->next     = NULL;
+		new_ptr->request  = request;
+		new_ptr->dataloop = 0;
+	    }
 	    j++;
 	    new_total_op_count++;
 	}
     }
-        
+
     if (new_total_op_count)
     {
 	MPID_Progress_state progress_state;
 	
 	done = 1;
+	MPIU_INSTR_DURATION_START(wincomplete_complete);
 	MPID_Progress_start(&progress_state);
-	while (new_total_op_count)
-	{
-	    for (i=0; i<new_total_op_count; i++)
-	    {
-		if (requests[i] != NULL)
-		{
-		    if (!MPID_Request_is_complete(requests[i]))
-		    {
-			done = 0;
-			break;
-		    }
-		    else
-		    {
-			mpi_errno = requests[i]->status.MPI_ERROR;
+	while (win_ptr->rma_ops_list_head) {
+	    prevNextPtr = &win_ptr->rma_ops_list_head;
+	    curr_ptr    = win_ptr->rma_ops_list_head;
+	    do {
+		if (MPID_Request_is_complete(curr_ptr->request)) {
+		    /* Once we find a complete request, we complete
+		       as many as possible until we find an incomplete
+		       or null request */
+		    do {
+			mpi_errno = curr_ptr->request->status.MPI_ERROR;
 			/* --BEGIN ERROR HANDLING-- */
-			if (mpi_errno != MPI_SUCCESS)
-			{
+			if (mpi_errno != MPI_SUCCESS) {
 			    MPID_Progress_end(&progress_state);
-			    MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winRMArequest");
+			    MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winRMAmessage");
 			}
 			/* --END ERROR HANDLING-- */
-			MPID_Request_release(requests[i]);
-			requests[i] = NULL;
+			MPID_Request_release(curr_ptr->request);
+			if (curr_ptr->dataloop != NULL) {
+			    MPIU_Free(curr_ptr->dataloop); /* allocated in send_rma_msg or 
+							      recv_rma_msg */
+			}
+			/* We can remove and free this rma op element */
+			tmpptr       = curr_ptr->next;
+			*prevNextPtr = tmpptr;
+			MPIU_Free( curr_ptr );
+			curr_ptr     = tmpptr;
 		    }
+		    while (curr_ptr &&
+			   MPID_Request_is_complete(curr_ptr->request));
+		    /* Once a request completes, we wait for another
+		       operation to arrive rather than check the
+		       rest of the requests.  */
+		    break;
 		}
+		else {
+		    prevNextPtr = &curr_ptr->next;
+		    curr_ptr    = curr_ptr->next;
+		    break;
+		}
+	    } while (curr_ptr);
+
+	    /* Wait for something to arrive*/
+	    /* In some tests, this hung unless the test ensured that 
+	       there was an incomplete request. */
+	    curr_ptr = win_ptr->rma_ops_list_head;
+	    if (curr_ptr && !MPID_Request_is_complete(curr_ptr->request) ) {
+		MPIU_INSTR_DURATION_START(winfence_block);
+		mpi_errno = MPID_Progress_wait(&progress_state);
+		/* --BEGIN ERROR HANDLING-- */
+		if (mpi_errno != MPI_SUCCESS) {
+		    MPID_Progress_end(&progress_state);
+		    MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
+		}
+		/* --END ERROR HANDLING-- */
+		MPIU_INSTR_DURATION_END(winfence_block);
 	    }
-                
-	    if (done)
-	    {
-		break;
-	    }
+	} /* While list of rma operation is non-empty */
 	    
-	    mpi_errno = MPID_Progress_wait(&progress_state);
-	    done = 1;
-	} 
+	    
 	MPID_Progress_end(&progress_state);
     }
-        
-    if (total_op_count != 0)
-    {
-	for (i=0; i<total_op_count; i++)
-	{
-	    if (dataloops[i] != NULL)
-	    {
-		MPIU_Free(dataloops[i]);
-	    }
-	}
-    }
-        
-    /* free MPIDI_RMA_ops_list */
-    curr_ptr = win_ptr->rma_ops_list;
-    while (curr_ptr != NULL)
-    {
-	next_ptr = curr_ptr->next;
-	MPIU_Free(curr_ptr);
-	curr_ptr = next_ptr;
-    }
-    win_ptr->rma_ops_list = NULL;
+
+    MPIU_Assert( !win_ptr->rma_ops_list_head );
+    win_ptr->rma_ops_list_head = NULL;
+    win_ptr->rma_ops_list_tail = NULL;
     
     mpi_errno = MPIR_Group_free_impl(win_grp_ptr);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
@@ -1291,6 +1558,7 @@
     {
 	MPID_Progress_state progress_state;
 	
+	MPIU_INSTR_DURATION_START(winwait_wait);
 	MPID_Progress_start(&progress_state);
 	while (win_ptr->my_counter)
 	{
@@ -1303,8 +1571,10 @@
 		return mpi_errno;
 	    }
 	    /* --END ERROR HANDLING-- */
+	    MPIU_INSTR_DURATION_INCR(winwait_wait,0,1)
 	}
 	MPID_Progress_end(&progress_state);
+	MPIU_INSTR_DURATION_END(winwait_wait);
     } 
 
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_WAIT);
@@ -1362,9 +1632,9 @@
 
     if (dest == MPI_PROC_NULL) goto fn_exit;
         
-    MPID_Comm_get_ptr( win_ptr->comm, comm_ptr );
+    comm_ptr = win_ptr->comm_ptr;
     
-    if (dest == comm_ptr->rank) {
+    if (dest == win_ptr->myrank) {
 	/* The target is this process itself. We must block until the lock
 	 * is acquired. */
             
@@ -1373,6 +1643,7 @@
 	{
 	    MPID_Progress_state progress_state;
 	    
+	    MPIU_INSTR_DURATION_START(winlock_getlocallock);
 	    MPID_Progress_start(&progress_state);
 	    while (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, lock_type) == 0) 
 	    {
@@ -1385,6 +1656,7 @@
 		/* --END ERROR HANDLING-- */
 	    }
 	    MPID_Progress_end(&progress_state);
+	    MPIU_INSTR_DURATION_END(winlock_getlocallock);
 	}
 	/* local lock acquired. local puts, gets, accumulates will be done 
 	   directly without queueing. */
@@ -1392,11 +1664,13 @@
     
     else {
 	/* target is some other process. add the lock request to rma_ops_list */
-            
+	MPIU_INSTR_DURATION_START(rmaqueue_alloc);
 	MPIU_CHKPMEM_MALLOC(new_ptr, MPIDI_RMA_ops *, sizeof(MPIDI_RMA_ops), 
 			    mpi_errno, "RMA operation entry");
+	MPIU_INSTR_DURATION_END(rmaqueue_alloc);
             
-	win_ptr->rma_ops_list = new_ptr;
+	win_ptr->rma_ops_list_head = new_ptr;
+	win_ptr->rma_ops_list_tail = new_ptr;
         
 	new_ptr->next = NULL;  
 	new_ptr->type = MPIDI_RMA_LOCK;
@@ -1434,9 +1708,9 @@
 
     if (dest == MPI_PROC_NULL) goto fn_exit;
         
-    MPID_Comm_get_ptr( win_ptr->comm, comm_ptr );
+    comm_ptr = win_ptr->comm_ptr;
         
-    if (dest == comm_ptr->rank) {
+    if (dest == win_ptr->myrank) {
 	/* local lock. release the lock on the window, grant the next one
 	 * in the queue, and return. */
 	mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
@@ -1445,7 +1719,7 @@
 	goto fn_exit;
     }
         
-    rma_op = win_ptr->rma_ops_list;
+    rma_op = win_ptr->rma_ops_list_head;
     
     /* win_lock was not called. return error */
     if ( (rma_op == NULL) || (rma_op->type != MPIDI_RMA_LOCK) ) { 
@@ -1461,7 +1735,8 @@
     if (rma_op->next == NULL) {
 	/* only win_lock called, no put/get/acc. Do nothing and return. */
 	MPIU_Free(rma_op);
-	win_ptr->rma_ops_list = NULL;
+	win_ptr->rma_ops_list_head = NULL;
+	win_ptr->rma_ops_list_tail = NULL;
 	goto fn_exit;
     }
         
@@ -1502,7 +1777,7 @@
     if (single_op_opt == 0) {
 	
 	/* Send a lock packet over to the target. wait for the lock_granted
-	 * reply. then do all the RMA ops. */ 
+	 * reply. Then do all the RMA ops. */ 
 	
 	MPIDI_Pkt_init(lock_pkt, MPIDI_CH3_PKT_LOCK);
 	lock_pkt->target_win_handle = win_ptr->all_win_handles[dest];
@@ -1535,6 +1810,7 @@
 	{
 	    MPID_Progress_state progress_state;
 	    
+	    MPIU_INSTR_DURATION_START(winunlock_getlock);
 	    MPID_Progress_start(&progress_state);
 	    while (win_ptr->lock_granted == 0)
 	    {
@@ -1547,6 +1823,7 @@
 		/* --END ERROR HANDLING-- */
 	    }
 	    MPID_Progress_end(&progress_state);
+	    MPIU_INSTR_DURATION_END(winunlock_getlock);
 	}
 	
 	/* Now do all the RMA operations */
@@ -1603,18 +1880,15 @@
 					    int *wait_for_rma_done_pkt)
 {
     int mpi_errno = MPI_SUCCESS, done, i, nops;
-    MPIDI_RMA_ops *curr_ptr, *next_ptr, **curr_ptr_ptr, *tmp_ptr;
+    MPIDI_RMA_ops *curr_ptr;
     MPID_Comm *comm_ptr;
-    MPID_Request **requests=NULL; /* array of requests */
-    MPIDI_RMA_dtype_info *dtype_infos=NULL;
-    void **dataloops=NULL;    /* to store dataloops for each datatype */
+    MPIDI_RMA_ops **prevNextPtr, *tmpptr;
     MPI_Win source_win_handle, target_win_handle;
-    MPIU_CHKLMEM_DECL(3);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_DO_PASSIVE_TARGET_RMA);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_DO_PASSIVE_TARGET_RMA);
 
-    if (win_ptr->rma_ops_list->lock_type == MPI_LOCK_EXCLUSIVE) {
+    if (win_ptr->rma_ops_list_head->lock_type == MPI_LOCK_EXCLUSIVE) {
         /* exclusive lock. no need to wait for rma done pkt at the end */
         *wait_for_rma_done_pkt = 0;
     }
@@ -1623,179 +1897,200 @@
            to the end of the list and do it last, in which case an rma done 
            pkt is not needed. If there is no get, rma done pkt is needed */
 
-        /* First check whether the last operation is a get. Skip the first op, 
-           which is a lock. */
-
-        curr_ptr = win_ptr->rma_ops_list->next;
-        while (curr_ptr->next != NULL) 
-            curr_ptr = curr_ptr->next;
-    
-        if (curr_ptr->type == MPIDI_RMA_GET) {
+        if (win_ptr->rma_ops_list_tail->type == MPIDI_RMA_GET) {
             /* last operation is a get. no need to wait for rma done pkt */
             *wait_for_rma_done_pkt = 0;
         }
         else {
             /* go through the list and move the first get operation 
-               (if there is one) to the end */
+               (if there is one) to the end.  Note that the first
+	       operation must be a lock, so we can skip it */
             
-            curr_ptr = win_ptr->rma_ops_list->next;
-            curr_ptr_ptr = &(win_ptr->rma_ops_list->next);
+            curr_ptr = win_ptr->rma_ops_list_head->next;
+            prevNextPtr = &(win_ptr->rma_ops_list_head->next);
             
             *wait_for_rma_done_pkt = 1;
             
             while (curr_ptr != NULL) {
                 if (curr_ptr->type == MPIDI_RMA_GET) {
+		    /* Found a GET, move it to the end */
                     *wait_for_rma_done_pkt = 0;
-                    *curr_ptr_ptr = curr_ptr->next;
-                    tmp_ptr = curr_ptr;
-                    while (curr_ptr->next != NULL)
-                        curr_ptr = curr_ptr->next;
-                    curr_ptr->next = tmp_ptr;
-                    tmp_ptr->next = NULL;
+		    win_ptr->rma_ops_list_tail->next = curr_ptr;
+		    *prevNextPtr = curr_ptr->next;
+		    curr_ptr->next = NULL;
+		    win_ptr->rma_ops_list_tail = curr_ptr;
                     break;
                 }
                 else {
-                    curr_ptr_ptr = &(curr_ptr->next);
-                    curr_ptr = curr_ptr->next;
+                    prevNextPtr = &(curr_ptr->next);
+                    curr_ptr    = curr_ptr->next;
                 }
             }
         }
     }
 
-    MPID_Comm_get_ptr( win_ptr->comm, comm_ptr );
+    comm_ptr = win_ptr->comm_ptr;
 
     /* Ignore the first op in the list because it is a win_lock and do
        the rest */
 
-    curr_ptr = win_ptr->rma_ops_list->next;
+    /* 
+       This list has a head (lock) (but no tail (unlock)) that is not 
+       processed, so we must skip over that head 
+    */
+
+    curr_ptr = win_ptr->rma_ops_list_head->next;
     nops = 0;
     while (curr_ptr != NULL) {
         nops++;
         curr_ptr = curr_ptr->next;
     }
+    
+    MPIU_INSTR_DURATION_START(winunlock_issue);
+    i = 0;
 
-    MPIU_CHKLMEM_MALLOC(requests, MPID_Request **, nops*sizeof(MPID_Request*),
-			mpi_errno, "requests");
-    MPIU_CHKLMEM_MALLOC(dtype_infos, MPIDI_RMA_dtype_info *, 
-			nops*sizeof(MPIDI_RMA_dtype_info),
-			mpi_errno, "dtype_infos");
-    MPIU_CHKLMEM_MALLOC(dataloops, void **, nops*sizeof(void*),
-			mpi_errno, "dataloops");
+    /* Remove the lock entry */
+    curr_ptr = win_ptr->rma_ops_list_head;
+    tmpptr       = curr_ptr->next;
+    win_ptr->rma_ops_list_head = tmpptr;
+    MPIU_Free( curr_ptr );
 
-    for (i=0; i<nops; i++)
-    {
-        dataloops[i] = NULL;
-    }
-    
-    i = 0;
-    curr_ptr = win_ptr->rma_ops_list->next;
+    prevNextPtr = &win_ptr->rma_ops_list_head;
+    curr_ptr    = win_ptr->rma_ops_list_head;
     target_win_handle = win_ptr->all_win_handles[curr_ptr->target_rank];
     while (curr_ptr != NULL)
     {
         /* To indicate the last RMA operation, we pass the
            source_win_handle only on the last operation. Otherwise, 
            we pass MPI_WIN_NULL. */
-        if (i == nops - 1)
+	/* Could also be curr_ptr->next == NULL */
+        if (/*i == nops - 1*/!curr_ptr->next)
             source_win_handle = win_ptr->handle;
         else 
             source_win_handle = MPI_WIN_NULL;
         
+	curr_ptr->dataloop = 0;
         switch (curr_ptr->type)
         {
         case (MPIDI_RMA_PUT):  /* same as accumulate */
         case (MPIDI_RMA_ACCUMULATE):
             win_ptr->pt_rma_puts_accs[curr_ptr->target_rank]++;
             mpi_errno = MPIDI_CH3I_Send_rma_msg(curr_ptr, win_ptr,
-                         source_win_handle, target_win_handle, &dtype_infos[i],
-                                                &dataloops[i], &requests[i]);
+				source_win_handle, target_win_handle, 
+				&curr_ptr->dtype_info,
+                                &curr_ptr->dataloop, &curr_ptr->request);
 	    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
             break;
+	case MPIDI_RMA_ACC_CONTIG:
+            win_ptr->pt_rma_puts_accs[curr_ptr->target_rank]++;
+	    mpi_errno = MPIDI_CH3I_Send_contig_acc_msg(curr_ptr, win_ptr,
+				       source_win_handle, target_win_handle, 
+				       &curr_ptr->request );
+	    break;
         case (MPIDI_RMA_GET):
             mpi_errno = MPIDI_CH3I_Recv_rma_msg(curr_ptr, win_ptr,
-                         source_win_handle, target_win_handle, &dtype_infos[i],
-                                                &dataloops[i], &requests[i]);
+                         source_win_handle, target_win_handle, 
+				&curr_ptr->dtype_info,
+                                &curr_ptr->dataloop, &curr_ptr->request);
 	    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
             break;
         default:
 	    MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winInvalidOp");
         }
         i++;
-        curr_ptr = curr_ptr->next;
+	/* If the request is null, we can remove it immediately */
+	if (!curr_ptr->request) {
+	    if (curr_ptr->dataloop != NULL) {
+		MPIU_Free(curr_ptr->dataloop); /* from send/recv_rma_msg */
+	    }
+	    tmpptr       = curr_ptr->next;
+	    *prevNextPtr = tmpptr;	/* unlink the entry before freeing it */
+	    MPIU_Free( curr_ptr );
+	    curr_ptr     = tmpptr;
+	}
+	else  {
+	    /* Keep entry: prevNextPtr must track ITS link, so set it first */
+	    prevNextPtr = &curr_ptr->next;
+	    curr_ptr    = curr_ptr->next;
+	}
     }
+    MPIU_INSTR_DURATION_END(winunlock_issue);
     
     if (nops)
     {
 	MPID_Progress_state progress_state;
 	
 	done = 1;
+	MPIU_INSTR_DURATION_START(winunlock_complete);
 	MPID_Progress_start(&progress_state);
-	while (nops)
-	{
-	    for (i=0; i<nops; i++)
-	    {
-		if (requests[i] != NULL)
-		{
-		    if (!MPID_Request_is_complete(requests[i]))
-		    {
-			done = 0;
-			break;
-		    }
-		    else
-		    {
-			mpi_errno = requests[i]->status.MPI_ERROR;
+	while (win_ptr->rma_ops_list_head) {
+	    prevNextPtr = &win_ptr->rma_ops_list_head;
+	    curr_ptr    = win_ptr->rma_ops_list_head;
+	    do {
+		if (MPID_Request_is_complete(curr_ptr->request)) {
+		    /* Once we find a complete request, we complete
+		       as many as possible until we find an incomplete
+		       or null request */
+		    do {
+			mpi_errno = curr_ptr->request->status.MPI_ERROR;
 			/* --BEGIN ERROR HANDLING-- */
-			if (mpi_errno != MPI_SUCCESS)
-			{
+			if (mpi_errno != MPI_SUCCESS) {
 			    MPID_Progress_end(&progress_state);
 			    MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winRMAmessage");
 			}
 			/* --END ERROR HANDLING-- */
-			/* if origin datatype was a derived
-			   datatype, it will get freed when the
-			   request gets freed. */ 
-			MPID_Request_release(requests[i]);
-			requests[i] = NULL;
+			MPID_Request_release(curr_ptr->request);
+			if (curr_ptr->dataloop != NULL) {
+			    MPIU_Free(curr_ptr->dataloop); /* allocated in send_rma_msg or 
+							      recv_rma_msg */
+			}
+			/* We can remove and free this rma op element */
+			tmpptr       = curr_ptr->next;
+			*prevNextPtr = tmpptr;
+			MPIU_Free( curr_ptr );
+			curr_ptr     = tmpptr;
 		    }
+		    while (curr_ptr &&
+			   MPID_Request_is_complete(curr_ptr->request));
+		    /* Once a request completes, we wait for another
+		       operation to arrive rather than check the
+		       rest of the requests.  */
+		    break;
 		}
+		else {
+		    prevNextPtr = &curr_ptr->next;
+		    curr_ptr    = curr_ptr->next;
+		    break;
+		}
+	    } while (curr_ptr);
+	    
+	    /* Wait for something to arrive*/
+	    /* In some tests, this hung unless the test ensured that 
+	       there was an incomplete request. */
+	    curr_ptr = win_ptr->rma_ops_list_head;
+	    if (curr_ptr && !MPID_Request_is_complete(curr_ptr->request) ) {
+		MPIU_INSTR_DURATION_START(winfence_block);
+		mpi_errno = MPID_Progress_wait(&progress_state);
+		/* --BEGIN ERROR HANDLING-- */
+		if (mpi_errno != MPI_SUCCESS) {
+		    MPID_Progress_end(&progress_state);
+		    MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
+		}
+		/* --END ERROR HANDLING-- */
+		MPIU_INSTR_DURATION_END(winfence_block);
 	    }
-	
-	    if (done) 
-	    {
-		break;
-	    }
-	
-	    mpi_errno = MPID_Progress_wait(&progress_state);
-	    /* --BEGIN ERROR HANDLING-- */
-	    if (mpi_errno != MPI_SUCCESS) {
-		MPID_Progress_end(&progress_state);
-		MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
-	    }
-	    /* --END ERROR HANDLING-- */
-	    done = 1;
-	}
+	} /* While list of rma operation is non-empty */
+	    
 	MPID_Progress_end(&progress_state);
     } 
     
-    for (i=0; i<nops; i++)
-    {
-        if (dataloops[i] != NULL)
-        {
-            MPIU_Free(dataloops[i]);
-        }
-    }
-    
-    /* free MPIDI_RMA_ops_list */
-    curr_ptr = win_ptr->rma_ops_list;
-    while (curr_ptr != NULL)
-    {
-        next_ptr = curr_ptr->next;
-        MPIU_Free(curr_ptr);
-        curr_ptr = next_ptr;
-    }
-    win_ptr->rma_ops_list = NULL;
 
+    MPIU_Assert( !win_ptr->rma_ops_list_head );
+
+    win_ptr->rma_ops_list_head = NULL;
+    win_ptr->rma_ops_list_tail = NULL;
+
  fn_exit:
-    MPIU_CHKLMEM_FREEALL();
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_DO_PASSIVE_TARGET_RMA);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
@@ -1829,9 +2124,9 @@
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SEND_LOCK_PUT_OR_ACC);
 
-    lock_type = win_ptr->rma_ops_list->lock_type;
+    lock_type = win_ptr->rma_ops_list_head->lock_type;
 
-    rma_op = win_ptr->rma_ops_list->next;
+    rma_op = win_ptr->rma_ops_list_head->next;
 
     win_ptr->pt_rma_puts_accs[rma_op->target_rank]++;
 
@@ -1871,8 +2166,30 @@
         iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) lock_accum_unlock_pkt;
         iov[0].MPID_IOV_LEN = sizeof(*lock_accum_unlock_pkt);
     }
+    else if (rma_op->type == MPIDI_RMA_ACC_CONTIG) {
+        MPIDI_Pkt_init(lock_accum_unlock_pkt, MPIDI_CH3_PKT_LOCK_ACCUM_UNLOCK);
+        lock_accum_unlock_pkt->target_win_handle = 
+            win_ptr->all_win_handles[rma_op->target_rank];
+        lock_accum_unlock_pkt->source_win_handle = win_ptr->handle;
+        lock_accum_unlock_pkt->lock_type = lock_type;
 
-    MPID_Comm_get_ptr(win_ptr->comm, comm_ptr);
+        lock_accum_unlock_pkt->addr = 
+            (char *) win_ptr->base_addrs[rma_op->target_rank] +
+            win_ptr->disp_units[rma_op->target_rank] * rma_op->target_disp;
+        
+        lock_accum_unlock_pkt->count = rma_op->target_count;
+        lock_accum_unlock_pkt->datatype = rma_op->target_datatype;
+        lock_accum_unlock_pkt->op = rma_op->op;
+
+        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) lock_accum_unlock_pkt;
+        iov[0].MPID_IOV_LEN = sizeof(*lock_accum_unlock_pkt);
+    }
+    else {
+	printf( "expected short accumulate...\n" );
+	/* */
+    }
+
+    comm_ptr = win_ptr->comm_ptr;
     MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
 
     MPIDI_CH3I_DATATYPE_IS_PREDEFINED(rma_op->origin_datatype, predefined);
@@ -1974,9 +2291,10 @@
     }
 
     /* free MPIDI_RMA_ops_list */
-    MPIU_Free(win_ptr->rma_ops_list->next);
-    MPIU_Free(win_ptr->rma_ops_list);
-    win_ptr->rma_ops_list = NULL;
+    MPIU_Free(win_ptr->rma_ops_list_head->next);
+    MPIU_Free(win_ptr->rma_ops_list_head);
+    win_ptr->rma_ops_list_head = NULL;
+    win_ptr->rma_ops_list_tail = NULL;
 
  fn_fail:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SEND_LOCK_PUT_OR_ACC);
@@ -2004,9 +2322,9 @@
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SEND_LOCK_GET);
 
-    lock_type = win_ptr->rma_ops_list->lock_type;
+    lock_type = win_ptr->rma_ops_list_head->lock_type;
 
-    rma_op = win_ptr->rma_ops_list->next;
+    rma_op = win_ptr->rma_ops_list_head->next;
 
     /* create a request, store the origin buf, cnt, datatype in it,
        and pass a handle to it in the get packet. When the get
@@ -2048,7 +2366,7 @@
     lock_get_unlock_pkt->datatype = rma_op->target_datatype;
     lock_get_unlock_pkt->request_handle = rreq->handle;
 
-    MPID_Comm_get_ptr(win_ptr->comm, comm_ptr);
+    comm_ptr = win_ptr->comm_ptr;
     MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
 
     MPIU_THREAD_CS_ENTER(CH3COMM,vc);
@@ -2095,10 +2413,12 @@
     MPID_Request_release(rreq);
 
     /* free MPIDI_RMA_ops_list */
-    MPIU_Free(win_ptr->rma_ops_list->next);
-    MPIU_Free(win_ptr->rma_ops_list);
-    win_ptr->rma_ops_list = NULL;
+    MPIU_Free(win_ptr->rma_ops_list_head->next);
+    MPIU_Free(win_ptr->rma_ops_list_head);
+    win_ptr->rma_ops_list_head = NULL;
+    win_ptr->rma_ops_list_tail = NULL;
 
+
  fn_fail:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SEND_LOCK_GET);
     return mpi_errno;
@@ -2239,7 +2559,6 @@
         {
             mpi_errno = MPIDI_CH3_ReqHandler_PutAccumRespComplete(vc, req, &complete);
             if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-            
             if (complete)
             {
                 *rreqp = NULL;
@@ -2467,7 +2786,8 @@
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACCUMULATE);
     
     MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"received accumulate pkt");
-    
+
+    MPIU_INSTR_DURATION_START(rmapkt_acc);
     data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
     data_buf = (char *)pkt + sizeof(MPIDI_CH3_Pkt_t);
     
@@ -2484,6 +2804,7 @@
     MPIDI_CH3I_DATATYPE_IS_PREDEFINED(accum_pkt->datatype, predefined);
     if (predefined)
     {
+	MPIU_INSTR_DURATION_START(rmapkt_acc_predef);
 	MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_ACCUM_RESP);
 	req->dev.datatype = accum_pkt->datatype;
 
@@ -2530,9 +2851,11 @@
                 if (complete)
                 {
                     *rreqp = NULL;
+		    MPIU_INSTR_DURATION_END(rmapkt_acc_predef);
                     goto fn_exit;
                 }
             }
+	    MPIU_INSTR_DURATION_END(rmapkt_acc_predef);
 	}
     }
     else
@@ -2590,6 +2913,7 @@
     }
 
  fn_exit:
+    MPIU_INSTR_DURATION_END(rmapkt_acc);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACCUMULATE);
     return mpi_errno;
  fn_fail:
@@ -2597,7 +2921,110 @@
 
 }
 
+/* Apply an accumulate whose short data lies entirely within the packet */
 #undef FUNCNAME
+#define FUNCNAME MPIDI_CH3_PktHandler_Accumulate_Immed
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPIDI_CH3_PktHandler_Accumulate_Immed( MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt,
+					   MPIDI_msg_sz_t *buflen, 
+					   MPID_Request **rreqp )
+{
+    MPIDI_CH3_Pkt_accum_immed_t * accum_pkt = &pkt->accum_immed;
+    MPID_Win *win_ptr;
+    MPI_Aint extent;
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACCUMULATE_IMMED);
+    /* vc is used only to send the pt_rma_done packet; returns mpi_errno */
+    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACCUMULATE_IMMED);
+
+    MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"received accumulate immedidate pkt");
+
+    MPIU_INSTR_DURATION_START(rmapkt_acc_immed);
+
+    /* return the number of bytes processed in this function */
+    /* data_len == 0 (all within packet) */
+    *buflen = sizeof(MPIDI_CH3_Pkt_t);
+    *rreqp  = NULL;   /* no request is handed back to the caller */
+    /* extent is used below as the number of bytes per element */
+    MPID_Datatype_get_extent_macro(accum_pkt->datatype, extent); 
+    
+    /* size == 0 should never happen */
+    if (accum_pkt->count == 0 || extent == 0) {
+	;   /* nothing to apply; the window bookkeeping below is skipped too */
+    }
+    else {
+	/* Data is already present (in accum_pkt->data) */
+	if (accum_pkt->op == MPI_REPLACE) {
+	    /* plain copy of count*extent bytes; no datatypes required */
+	    int len = accum_pkt->count * extent;
+	    /* FIXME: use immediate copy because this is short */
+	    MPIUI_Memcpy( accum_pkt->addr, accum_pkt->data, len );
+	}
+	else {
+	    if (HANDLE_GET_KIND(accum_pkt->op) == HANDLE_KIND_BUILTIN) {
+		MPI_User_function *uop;
+		/* op table index = low 4 bits of the op handle, minus 1 */
+		uop = MPIR_Op_table[((accum_pkt->op)&0xf) - 1];
+		(*uop)(accum_pkt->data, accum_pkt->addr,
+		       &(accum_pkt->count), &(accum_pkt->datatype));
+	    }
+	    else {   /* any op that is not a builtin is reported as an error */
+		MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OP, "**opnotpredefined",
+				     "**opnotpredefined %d", accum_pkt->op );
+	    }
+	}
+
+	/* There are additional steps to take if this is a passive 
+	   target RMA or the last operation from the source */
+	
+	/* Here is the code executed in PutAccumRespComplete after the
+	   accumulation operation */
+	MPID_Win_get_ptr(accum_pkt->target_win_handle, win_ptr);
+	
+	/* if passive target RMA, increment counter */
+	if (win_ptr->current_lock_type != MPID_LOCK_NONE)
+	    win_ptr->my_pt_rma_puts_accs++;
+	/* A non-null source_win_handle marks the source's last operation */
+	if (accum_pkt->source_win_handle != MPI_WIN_NULL) {
+	    /* Last RMA operation from source. If active
+	       target RMA, decrement window counter. If
+	       passive target RMA, release lock on window and
+	       grant next lock in the lock queue if there is
+	       any. If it's a shared lock or a lock-put-unlock
+	       type of optimization, we also need to send an
+	       ack to the source. */ 
+	    if (win_ptr->current_lock_type == MPID_LOCK_NONE) {
+		/* FIXME: MT: this has to be done atomically */
+		win_ptr->my_counter -= 1;
+		MPIDI_CH3_Progress_signal_completion();
+	    }
+	    else {   /* a lock is held on the window: passive target */
+		if ((win_ptr->current_lock_type == MPI_LOCK_SHARED) ||
+		    (/*rreq->dev.single_op_opt*/ 0 == 1)) {
+		    mpi_errno = MPIDI_CH3I_Send_pt_rma_done_pkt(vc, 
+					accum_pkt->source_win_handle);
+		    if (mpi_errno) {
+			    MPIU_ERR_POP(mpi_errno);
+		    }
+		}
+		mpi_errno = MPIDI_CH3I_Release_lock(win_ptr); /* returned below */
+	    }
+	}
+	/* Normal completion: continue at the common exit code */
+	goto fn_exit;
+    }
+
+ fn_exit:
+    MPIU_INSTR_DURATION_END(rmapkt_acc_immed);
+    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACCUMULATE_IMMED);
+    return mpi_errno;
+ fn_fail:   /* presumably the jump target of the MPIU_ERR_* macros -- confirm */
+    goto fn_exit;
+
+}
+
+#undef FUNCNAME
 #define FUNCNAME MPIDI_CH3_PktHandler_Lock
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
@@ -2631,6 +3058,9 @@
 	
 	/* FIXME: MT: This may need to be done atomically. */
 	
+	/* FIXME: Since we need to add to the tail of the list,
+	   we should maintain a tail pointer rather than traversing the 
+	   list each time to find the tail. */
 	curr_ptr = (MPIDI_Win_lock_queue *) win_ptr->lock_queue;
 	prev_ptr = curr_ptr;
 	while (curr_ptr != NULL)
@@ -2639,7 +3069,9 @@
 	    curr_ptr = curr_ptr->next;
 	}
 	
+	MPIU_INSTR_DURATION_START(lockqueue_alloc);
 	new_ptr = (MPIDI_Win_lock_queue *) MPIU_Malloc(sizeof(MPIDI_Win_lock_queue));
+	MPIU_INSTR_DURATION_END(lockqueue_alloc);
 	if (!new_ptr) {
 	    MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %s",
 				 "MPIDI_Win_lock_queue");
@@ -2713,7 +3145,9 @@
 	/* queue the information */
 	MPIDI_Win_lock_queue *curr_ptr, *prev_ptr, *new_ptr;
 	
+	MPIU_INSTR_DURATION_START(lockqueue_alloc);
 	new_ptr = (MPIDI_Win_lock_queue *) MPIU_Malloc(sizeof(MPIDI_Win_lock_queue));
+	MPIU_INSTR_DURATION_END(lockqueue_alloc);
 	if (!new_ptr) {
 	    MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %s",
 				 "MPIDI_Win_lock_queue");
@@ -2874,7 +3308,7 @@
 	MPIDI_Win_lock_queue *curr_ptr, *prev_ptr, *new_ptr;
 	
 	/* FIXME: MT: This may need to be done atomically. */
-	
+
 	curr_ptr = (MPIDI_Win_lock_queue *) win_ptr->lock_queue;
 	prev_ptr = curr_ptr;
 	while (curr_ptr != NULL)
@@ -2883,7 +3317,9 @@
 	    curr_ptr = curr_ptr->next;
 	}
 	
+	MPIU_INSTR_DURATION_START(lockqueue_alloc);
 	new_ptr = (MPIDI_Win_lock_queue *) MPIU_Malloc(sizeof(MPIDI_Win_lock_queue));
+	MPIU_INSTR_DURATION_END(lockqueue_alloc);
 	if (!new_ptr) {
 	    MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %s",
 				 "MPIDI_Win_lock_queue");
@@ -2961,7 +3397,9 @@
     
     /* queue the information */
     
+    MPIU_INSTR_DURATION_START(lockqueue_alloc);
     new_ptr = (MPIDI_Win_lock_queue *) MPIU_Malloc(sizeof(MPIDI_Win_lock_queue));
+    MPIU_INSTR_DURATION_END(lockqueue_alloc);
     if (!new_ptr) {
 	MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %s",
 			     "MPIDI_Win_lock_queue");
@@ -3219,6 +3657,19 @@
     /*MPIU_DBG_PRINTF((" win_ptr ...... 0x%08X\n", pkt->accum.win_ptr));*/
     return MPI_SUCCESS;
 }
+int MPIDI_CH3_PktPrint_Accum_Immed( FILE *fp, MPIDI_CH3_Pkt_t *pkt )
+{   /* Debug-print the fields of an accumulate-immediate packet */
+    MPIU_DBG_PRINTF((" type ......... MPIDI_CH3_PKT_ACCUM_IMMED\n"));
+    MPIU_DBG_PRINTF((" addr ......... %p\n", pkt->accum_immed.addr));
+    MPIU_DBG_PRINTF((" count ........ %d\n", pkt->accum_immed.count));
+    MPIU_DBG_PRINTF((" datatype ..... 0x%08X\n", pkt->accum_immed.datatype));
+    MPIU_DBG_PRINTF((" op ........... 0x%08X\n", pkt->accum_immed.op));
+    MPIU_DBG_PRINTF((" target ....... 0x%08X\n", pkt->accum_immed.target_win_handle));
+    MPIU_DBG_PRINTF((" source ....... 0x%08X\n", pkt->accum_immed.source_win_handle));
+    /*MPIU_DBG_PRINTF((" win_ptr ...... 0x%08X\n", pkt->accum.win_ptr));*/
+    fflush(stdout);      /* NOTE(review): fp is never used -- confirm */
+    return MPI_SUCCESS;  /* unconditional */
+}
 int MPIDI_CH3_PktPrint_Lock( FILE *fp, MPIDI_CH3_Pkt_t *pkt )
 {
     MPIU_DBG_PRINTF((" type ......... MPIDI_CH3_PKT_LOCK\n"));

Modified: mpich2/trunk/src/util/instrm/Makefile.sm
===================================================================
--- mpich2/trunk/src/util/instrm/Makefile.sm	2010-11-04 20:24:55 UTC (rev 7415)
+++ mpich2/trunk/src/util/instrm/Makefile.sm	2010-11-06 13:08:15 UTC (rev 7416)
@@ -1,2 +1,2 @@
-lib${MPILIBNAME}_a_SOURCES = states.c
+lib${MPILIBNAME}_a_SOURCES = states.c instr.c
 INCLUDES = -I../../include -I${top_srcdir}/src/include

Added: mpich2/trunk/src/util/instrm/instr.c
===================================================================
--- mpich2/trunk/src/util/instrm/instr.c	                        (rev 0)
+++ mpich2/trunk/src/util/instrm/instr.c	2010-11-06 13:08:15 UTC (rev 7416)
@@ -0,0 +1,132 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "mpiimpl.h"
+
+#ifdef USE_MPIU_INSTR
+
+static int MPIU_INSTR_Printf( FILE *fp );
+static int MPIU_INSTR_Finalize( void *p );
+
+/* */
+/*
+ * Basic but general support for instrumentation hooks in MPICH2
+ *
+ * Most actions are handled by MPIU_INSTR_xxx macros (to permit both lowest
+ * overhead and to allow instrumentation to be selected at compile time).
+ */
+static struct MPIU_INSTR_Generic_t *instrHead = 0, *instrTail = 0;
+
+/*
+ * Append an instrumentation handle to the list kept in instrHead/instrTail.
+ *
+ * handlePtr is treated as a pointer to a struct MPIU_INSTR_Generic_t.
+ * The first time a handle is added, MPIU_INSTR_Finalize is registered as a
+ * finalize callback with priority MPIR_FINALIZE_CALLBACK_PRIO + 2.
+ *
+ * Returns 0.  (The routine is declared int, so it must return a value on
+ * every path; falling off the end leaves the result undefined for any
+ * caller that inspects it.)
+ */
+int MPIU_INSTR_AddHandle( void *handlePtr )
+{
+    struct MPIU_INSTR_Generic_t *gPtr = 
+	(struct MPIU_INSTR_Generic_t *)handlePtr;
+
+    /* Note that Addhandle must be within a thread-safe initialization */
+    if (!instrHead) {
+	/* Make sure that this call back occurs early (before MPID_Finalize) */
+	MPIR_Add_finalize( MPIU_INSTR_Finalize, stdout,
+			   MPIR_FINALIZE_CALLBACK_PRIO + 2 );
+    }
+
+    /* Link the new handle after the current tail, or make it the head of
+       an empty list; either way it becomes the new tail */
+    if (instrHead) {
+	instrTail->next = gPtr;
+    }
+    else {
+	instrHead       = gPtr;
+    }
+    instrTail       = gPtr;
+
+    return 0;
+}
+
+#define MAX_INSTR_BUF 1024
+/*
+ * Write the instrumentation data to fp, one line per handle, walking the
+ * list that starts at instrHead.  Handles whose count is zero are skipped.
+ * fp is flushed before returning.  Always returns 0.
+ */
+static int MPIU_INSTR_Printf( FILE *fp )
+{
+    struct MPIU_INSTR_Generic_t *cur;
+    char line[MAX_INSTR_BUF];
+
+    for (cur = instrHead; cur; cur = cur->next) {
+	/* Only events that actually occurred are reported */
+	if (!cur->count)
+	    continue;
+
+	if (cur->toStr) {
+	    /* The handle provides its own formatting routine */
+	    (*cur->toStr)( line, sizeof(line), cur );
+	}
+	else {
+	    /* Otherwise print just the description.  A handle without a
+	       description should not happen; it yields an empty line */
+	    MPIU_Strncpy( line, cur->desc ? cur->desc : "", sizeof(line) );
+	}
+	fputs( line, fp );
+	fputc( '\n', fp );
+    }
+    fflush( fp );
+    return 0;
+}
+
+/*
+ * Finalize callback for the instrumentation hooks.
+ *
+ * When the environment variable MPICH_INSTR_AT_FINALIZE is set to a true
+ * value, the instrumentation data is printed to stdout.  Afterwards the
+ * description string of every handle on the list is freed and cleared.
+ *
+ * p is the value given when the callback was registered; it is not used
+ * here.  Always returns 0.
+ */
+static int MPIU_INSTR_Finalize( void *p )
+{
+    /* Start from "no output" so rc is defined even if MPL_env2bool does not
+       store a value */
+    int rc = 0;
+    struct MPIU_INSTR_Generic_t *gPtr = instrHead;
+    /* FIXME: This should at least issue the writes in process order */
+    /* Allow whether output is generated to be controlled.  Anything other
+       than a successfully parsed value (variable unset, or set to something
+       that is not a boolean) leaves the output disabled */
+    if (MPL_env2bool( "MPICH_INSTR_AT_FINALIZE", &rc ) <= 0) 
+	rc = 0;
+
+    if (rc) {
+	MPIU_INSTR_Printf( stdout );
+    }
+
+    /* Free any memory allocated for the descriptions */
+    while (gPtr) {
+	if (gPtr->desc) {
+	    MPIU_Free( (char *)gPtr->desc );
+	    gPtr->desc = 0;
+	}
+	gPtr = gPtr->next;
+    }
+    
+    return 0;
+}
+
+/*
+ * Standard print routines for the instrumentation objects
+ */
+
+/* 
+ * Format a duration handle into buf (at most maxBuf bytes, including the
+ * terminating null).  The line holds the description, the event count and
+ * the accumulated time converted to a double by MPID_Wtime_todouble.  If
+ * the handle carries extra integer items, they are appended in order, each
+ * preceded by a tab.  Output that does not fit is truncated.
+ *
+ * ptr is treated as a struct MPIU_INSTR_Duration_count_t *.
+ * Always returns 0.
+ */
+int MPIU_INSTR_ToStr_Duration_Count( char *buf, size_t maxBuf, void *ptr )
+{
+    struct MPIU_INSTR_Duration_count_t *dur = 
+	(struct MPIU_INSTR_Duration_count_t *)ptr;
+    double elapsed;
+
+    MPID_Wtime_todouble( &dur->ttime, &elapsed );
+    snprintf( buf, maxBuf, "%-40s:\t%d\t%e", dur->desc, dur->count, elapsed );
+
+    if (dur->nitems) {
+	int  used = strlen(buf);
+	char *tail = buf + used;
+	int  k;
+
+	/* Space still available after the fixed part of the line */
+	maxBuf -= used;
+
+	/* Append every integer item behind a tab, advancing past what was
+	   just written each time */
+	for (k=0; k<dur->nitems; k++) {
+	    snprintf( tail, maxBuf, "\t%d", dur->data[k] );
+	    used    = strlen(tail);
+	    tail   += used;
+	    maxBuf -= used;
+	}
+    }
+    return 0;
+}
+#else
+/* No routines required if instrumentation is not selected */
+#endif



More information about the mpich2-commits mailing list