[mpich2-commits] r8003 - mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd

jayesh at mcs.anl.gov jayesh at mcs.anl.gov
Mon Feb 21 16:57:52 CST 2011


Author: jayesh
Date: 2011-02-21 16:57:52 -0600 (Mon, 21 Feb 2011)
New Revision: 8003

Modified:
   mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_ad_util.cpp
   mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_ep_util.cpp
   mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_finalize.cpp
   mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_impl.h
   mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_init.cpp
   mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_misc_util.cpp
   mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_poll.cpp
   mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_send.cpp
   mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_sm.cpp
   mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_sm.h
Log:
    This patch includes the fixes/changes listed below. Since the code isn't completely functional without this patch, the individual changes are not checked in separately.
1. Fixed several bugs in the conn protocol
2. We only wait on EX events now - we no longer wait on ND events
3. Added support for non-contig data
4. Added support for user-defined datatypes. Data is now packed before sending
5. Added support for data transfers greater than the device limit
6. Support blocking sends
7. Handle any unfinished requests before terminating/closing a VC
8. Nemesis uses iov_count to indicate the remaining IOVs. Fixed code where the sock channel interpretation of iov_count was used.
9. ND_PENDING is a SUCCESS, not a FAILURE - fixed the parts of the code that treated this condition as a failure
10. Added several debug statements
11. Removed unused code/comments etc
 


Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_ad_util.cpp
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_ad_util.cpp	2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_ad_util.cpp	2011-02-21 22:57:52 UTC (rev 8003)
@@ -49,10 +49,16 @@
 
     MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "Successfully created an ND CQ (sz=%d)", cq_sz);
     MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST,
-        "ND CQ : size = %d, msz = %d, mir = %d, mor = %d, mirl = %d, morl = %d",
+        "ND CQ : size = " MPIR_UPINT_FMT_DEC_SPEC ", mcq = " MPIR_UPINT_FMT_DEC_SPEC
+        ", mir = " MPIR_UPINT_FMT_DEC_SPEC ", mor = " MPIR_UPINT_FMT_DEC_SPEC
+        ", mirl = " MPIR_UPINT_FMT_DEC_SPEC ", morl = " MPIR_UPINT_FMT_DEC_SPEC
+        ", mol = " MPIR_UPINT_FMT_DEC_SPEC ", mreg_sz = " MPIR_UPINT_FMT_DEC_SPEC
+        ", lreq_thres = " MPIR_UPINT_FMT_DEC_SPEC,
         cq_sz, hnd->ad_info.MaxCqEntries,
         hnd->ad_info.MaxInboundRequests, hnd->ad_info.MaxOutboundRequests,
-        hnd->ad_info.MaxInboundReadLimit, hnd->ad_info.MaxOutboundReadLimit));
+        hnd->ad_info.MaxInboundReadLimit, hnd->ad_info.MaxOutboundReadLimit,
+        hnd->ad_info.MaxOutboundLength, hnd->ad_info.MaxRegistrationSize,
+        hnd->ad_info.LargeRequestThreshold));
 
     /* Associate the adapter with the Executive */
     MPIU_ExAttachHandle(ex_hnd, MPIU_EX_GENERIC_COMP_PROC_KEY, hnd->p_ad->GetFileHandle());
@@ -126,6 +132,9 @@
     mpi_errno = MPID_Nem_nd_ad_init(*phnd, ex_hnd);
     if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
+    (*phnd)->npending_rds = 0;
+    (*phnd)->zcp_pending = 0;
+
     MPIU_CHKPMEM_COMMIT();
  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_DEV_HND_INIT);

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_ep_util.cpp
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_ep_util.cpp	2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_ep_util.cpp	2011-02-21 22:57:52 UTC (rev 8003)
@@ -11,7 +11,7 @@
 #define FUNCNAME MPID_Nem_nd_conn_hnd_init
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_t dev_hnd, MPID_Nem_nd_conn_type_t conn_type, INDConnector *p_conn, MPID_Nem_nd_conn_hnd_t *pconn_hnd)
+int MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_t dev_hnd, MPID_Nem_nd_conn_type_t conn_type, INDConnector *p_conn, MPIDI_VC_t *vc, MPID_Nem_nd_conn_hnd_t *pconn_hnd)
 {
     int mpi_errno = MPI_SUCCESS;
     HRESULT hr;
@@ -41,16 +41,44 @@
 
     MPID_NEM_ND_CONN_STATE_SET((*pconn_hnd), MPID_NEM_ND_CONN_QUIESCENT);
     (*pconn_hnd)->vc = NULL;
+    if(vc != NULL){
+        /* Make sure that we set the tmp conn info in the vc before we block 
+         * We wait till 3-way handshake before setting vc info
+         * for this conn & conn info for the vc
+         * This info could be used by accept() side to signal that the conn
+         * is no longer valid - orphan
+         */
+        MPID_NEM_ND_VCCH_NETMOD_TMP_CONN_HND_SET(vc, (*pconn_hnd));
+    }
     MPIU_ExInitOverlapped(&((*pconn_hnd)->recv_ov), NULL, NULL);
     MPIU_ExInitOverlapped(&((*pconn_hnd)->send_ov), NULL, NULL);
 
+    (*pconn_hnd)->is_orphan = 0;
+    (*pconn_hnd)->tmp_vc = NULL;
 	(*pconn_hnd)->npending_ops = 0;
+    (*pconn_hnd)->send_in_progress = 0;
     (*pconn_hnd)->zcp_in_progress = 0;
-
+    (*pconn_hnd)->zcp_rreqp = NULL;
+
+    (*pconn_hnd)->npending_rds = 0;
+
+    (*pconn_hnd)->zcp_send_offset = 0;
+
     /* Create an endpoint - listen conns don't need an endpoint */
     if((conn_type == MPID_NEM_ND_CONNECT_CONN) || (conn_type == MPID_NEM_ND_ACCEPT_CONN)){
+        ND_ADAPTER_INFO info;
+        SIZE_T len = sizeof(info);
+
+        hr = dev_hnd->p_ad->Query(1, &info, &len);
+        MPIU_ERR_CHKANDJUMP2(FAILED(hr),
+            mpi_errno, MPI_ERR_OTHER, "**nd_ep_create", "**nd_ep_create %s %d",
+            _com_error(hr).ErrorMessage(), hr);
+
+        /* FIXME: Use MPID_NEM_ND_CONN_RECVQ_SZ, MPID_NEM_ND_CONN_SENDQ_SZ for
+         * number of inboud/outbound requests
+         */
         hr = (*pconn_hnd)->p_conn->CreateEndpoint(dev_hnd->p_cq, dev_hnd->p_cq,
-            MPID_NEM_ND_CONN_RECVQ_SZ, MPID_NEM_ND_CONN_SENDQ_SZ,
+            info.MaxInboundRequests, info.MaxOutboundRequests,
             MPID_NEM_ND_CONN_SGE_MAX, MPID_NEM_ND_CONN_SGE_MAX,
             MPID_NEM_ND_CONN_RDMA_RD_MAX, MPID_NEM_ND_CONN_RDMA_RD_MAX,
             NULL, &((*pconn_hnd)->p_ep));
@@ -88,6 +116,7 @@
      * allowed even if init() fails
      */
     MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_INIT(*p_conn_hnd));
+
     if((*p_conn_hnd)->p_ep){
         /* Release endpoint */
         (*p_conn_hnd)->p_ep->Release();

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_finalize.cpp
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_finalize.cpp	2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_finalize.cpp	2011-02-21 22:57:52 UTC (rev 8003)
@@ -82,6 +82,13 @@
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_VC_TERMINATE);
 
+    /* Poll till no more pending/posted sends */
+    while(!MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_EMPTY(vc)
+        || !MPID_NEM_ND_VCCH_NETMOD_PENDING_SENDQ_EMPTY(vc)){
+        mpi_errno = MPID_Nem_nd_sm_poll(1);
+        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    }
+
     vc_ch->next = NULL;
     vc_ch->prev = NULL;
     MPID_NEM_ND_VCCH_NETMOD_STATE_SET(vc, MPID_NEM_ND_VC_STATE_DISCONNECTED);

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_impl.h
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_impl.h	2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_impl.h	2011-02-21 22:57:52 UTC (rev 8003)
@@ -35,9 +35,13 @@
      * completion queue
      */
     INDCompletionQueue *p_cq;
+    int npending_rds;
+    volatile int zcp_pending;
 } *MPID_Nem_nd_dev_hnd_t;
 
 #define MPID_NEM_ND_DEV_HND_INVALID    NULL
+#define MPID_NEM_ND_DEV_RDMA_RD_MAX 2
+#define MPID_NEM_ND_DEV_IO_LIMIT(_dev_hnd) (_dev_hnd->ad_info.MaxOutboundLength - 1)
 
 /* Checks whether dev handle is initialized */
 #define MPID_NEM_ND_DEV_HND_IS_INIT(hnd)    ((hnd) != NULL)
@@ -58,23 +62,25 @@
 
 #define MPID_NEM_ND_CONN_SENDQ_SZ (MPID_NEM_ND_CONN_FC_BUFS_MAX+MPID_NEM_ND_CONN_RDMA_RD_MAX+MPID_NEM_ND_CONN_FC_MSG_MAX)
 #define MPID_NEM_ND_CONN_RECVQ_SZ (MPID_NEM_ND_CONN_FC_BUFS_MAX+MPID_NEM_ND_CONN_RDMA_RD_MAX)
-#define MPID_NEM_ND_CONN_SGE_MAX 1
+#define MPID_NEM_ND_CONN_SGE_MAX 16
 
 /* We use bcopy for upto 1K of upper layer data - pkt + user data 
  * FIXME: Tune this value after some runtime exp
  */
-#define MPID_NEM_ND_CONN_UDATA_SZ   1024
+#define MPID_NEM_ND_CONN_UDATA_SZ   2048
 typedef struct MPID_Nem_nd_msg_mw_{
     /* The memory window descriptor of next data
      * i.e., upper layer data > MPID_NEM_ND_CONN_UDATA_SZ
      * - if any 
      */
+    /* FIXME: Only use/send the valid mw_datas */
     ND_MW_DESCRIPTOR mw_data;
     /* The memory window descriptor containing 
      * memory window descriptors of subsequent user data
      * eg: Non contig sends
      * - if any
      */
+    /* FIXME: Use this for multi-mws */
     ND_MW_DESCRIPTOR mw_mws;
 
 } MPID_Nem_nd_msg_mw_t;
@@ -131,6 +137,10 @@
     INDEndpoint     *p_ep;
     INDConnector    *p_conn;
     MPIDI_VC_t      *vc;
+    /* Set if this conn loses in H-H */
+    int is_orphan;
+    /* Used by conns to store vc till 3-way handshake - i.e., LACK/CACK */
+    MPIDI_VC_t *tmp_vc;
     /* EX OV for Connect() */
     /* FIXME: Use this for Send() etc after extending Executive */
     MPIU_EXOVERLAPPED send_ov;
@@ -164,24 +174,32 @@
 	 * FIXME: Can we get this info from send_credits ?
 	 */
 	int npending_ops;
+
+    /* FIXME : REMOVE ME ! */
+    int npending_rds;
     /* Is a Flow control pkt pending ? */
     int fc_pkt_pending;
 
-	/* FIXME: Make sure that we only have 1 pending RDMA read */
-    /* FIXME: Move rdma fields to another struct */
-    /* Once we finish invalidating a MW - use these credits as send_credits */
-
     /* RDMA Send side fields */
+    int zcp_send_offset;
+    int send_in_progress;
     int zcp_in_progress;
+    ND_SGE  zcp_send_sge;
+    /* MPID_Request *zcp_sreq; */
+    /* The ND memory window */
     INDMemoryWindow *zcp_send_mw;
+    /* The memory window desc sent in the ND message */
     MPID_Nem_nd_msg_mw_t zcp_msg_send_mw;
     ND_MR_HANDLE zcp_send_mr_hnd;
     MPID_Nem_nd_msg_result_t zcp_send_result;
 
     /* RDMA Recv side fields*/
-    int zcp_credits;
+    int cache_credits;
+    int zcp_recv_sge_count;
+    ND_SGE zcp_recv_sge[MPID_IOV_LIMIT];
+    MPID_Request *zcp_rreqp;
     MPID_Nem_nd_msg_mw_t zcp_msg_recv_mw;
-    ND_SGE zcp_recv_sge;
+    /* int zcp_recv_mw_offset; */
     /* MPID_Nem_nd_msg_result_t zcp_recv_result; */
 } *MPID_Nem_nd_conn_hnd_t;
 
@@ -190,6 +208,8 @@
     /* For EX blocking ops */
     MPIU_EXOVERLAPPED ex_ov;
 	MPID_Nem_nd_conn_hnd_t conn_hnd;
+    /* The number of blocking ops to wait before finalizing the hnd */
+    int npending_ops;
 } *MPID_Nem_nd_block_op_hnd_t;
 #define MPID_NEM_ND_BLOCK_OP_HND_INVALID NULL
 #define MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(hnd) (MPIU_EX_GET_OVERLAPPED_PTR(&(hnd->ex_ov)))
@@ -200,7 +220,10 @@
 /* Checks whether conn handle is valid */
 #define MPID_NEM_ND_CONN_HND_IS_VALID(_hnd)   (((_hnd) != NULL) && \
     ((_hnd)->p_conn != NULL) && ((_hnd)->p_ep != NULL))
-#define MPID_NEM_ND_CONN_STATE_SET(_hnd, _state)  (_hnd->state = _state)
+#define MPID_NEM_ND_CONN_STATE_SET(_hnd, _state)  do{   \
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "conn[%p] %d - %d", (_hnd), (_hnd)->state, _state));    \
+    (_hnd->state = _state); \
+}while(0);
 /* Using an unused IANA protocol family */
 #define MPID_NEM_ND_PROT_FAMILY 234
 
@@ -218,10 +241,11 @@
 };
 
 #define MPID_NEM_ND_CONN_IS_CONNECTING(_conn_hnd) (_conn_hnd && ( (_conn_hnd->state > MPID_NEM_ND_CONN_QUIESCENT) && (_conn_hnd->state < MPID_NEM_ND_CONN_ACTIVE) ))
-
+#define MPID_NEM_ND_CONN_IS_CONNECTED(_conn_hnd) (_conn_hnd && (_conn_hnd->state == MPID_NEM_ND_CONN_ACTIVE))
 /* VC states */
 typedef enum{
     MPID_NEM_ND_VC_STATE_DISCONNECTED=0,
+    MPID_NEM_ND_VC_STATE_CONNECTING,
     MPID_NEM_ND_VC_STATE_CONNECTED
 } MPID_Nem_nd_vc_state_t;
 
@@ -230,6 +254,8 @@
    on the network module */
 typedef struct {
     MPID_Nem_nd_conn_hnd_t conn_hnd;
+    /* Used by connect() to temperorily store the conn handle */
+    MPID_Nem_nd_conn_hnd_t tmp_conn_hnd;
     struct{
         struct MPID_Request *head;
         struct MPID_Request *tail;
@@ -241,9 +267,11 @@
     MPID_Nem_nd_vc_state_t state;
 } MPID_Nem_nd_vc_area;
 
+#define MPID_NEM_ND_IS_BLOCKING_REQ(_reqp) ((_reqp)->dev.OnDataAvail != NULL)
 #define MPID_NEM_ND_VCCH_GET_ACTIVE_RECV_REQ(_vc) (((MPIDI_CH3I_VC *)((_vc)->channel_private))->recv_active)
 #define MPID_NEM_ND_VCCH_SET_ACTIVE_RECV_REQ(_vc, _req) (((MPIDI_CH3I_VC *)((_vc)->channel_private))->recv_active = _req)
 #define MPID_NEM_ND_VCCH_NETMOD_CONN_HND_INIT(_vc) ((((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->conn_hnd) = MPID_NEM_ND_CONN_HND_INVALID)
+#define MPID_NEM_ND_VCCH_NETMOD_TMP_CONN_HND_INIT(_vc) ((((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->tmp_conn_hnd) = MPID_NEM_ND_CONN_HND_INVALID)
 #define MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_INIT(_vc)  do{\
     (((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->posted_sendq).head = NULL;        \
     (((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->posted_sendq).tail = NULL;        \
@@ -254,6 +282,8 @@
 #define MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(_vc) ((((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->posted_sendq).tail)
 #define MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_ENQUEUE(_vc, _reqp) GENERIC_Q_ENQUEUE (&(((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->posted_sendq), _reqp, dev.next)
 #define MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_DEQUEUE(_vc, _reqp) GENERIC_Q_DEQUEUE (&(((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->posted_sendq), _reqp, dev.next)
+#define MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_REM_TAIL(_vc, _reqp) GENERIC_Q_SEARCH_REMOVE (&(((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->posted_sendq), ( (_reqp) && ((*_reqp)->dev.next == NULL) ), _reqp, MPID_Request, dev.next)
+
 #define MPID_NEM_ND_VCCH_NETMOD_PENDING_SENDQ_INIT(_vc)  do{\
     (((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->pending_sendq).head = NULL;        \
     (((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->pending_sendq).tail = NULL;        \
@@ -269,25 +299,38 @@
 #define MPID_NEM_ND_VCCH_NETMOD_FIELD_GET(_vc, _field) (((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->_field)
 #define MPID_NEM_ND_VCCH_NETMOD_CONN_HND_SET(_vc, _conn_hnd) ((((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->conn_hnd) = _conn_hnd)
 #define MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(_vc) (((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->conn_hnd)
+#define MPID_NEM_ND_VCCH_NETMOD_TMP_CONN_HND_SET(_vc, _conn_hnd) ((((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->tmp_conn_hnd) = _conn_hnd)
+#define MPID_NEM_ND_VCCH_NETMOD_TMP_CONN_HND_GET(_vc) (((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->tmp_conn_hnd)
 #define MPID_NEM_ND_VCCH_NETMOD_STATE_SET(_vc, _state) ((((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->state) = _state)
 #define MPID_NEM_ND_VCCH_NETMOD_STATE_GET(_vc) (((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->state)
 
 /* VC Netmod util funcs */
-#define MPID_NEM_ND_VC_IS_CONNECTED(_vc) (\
+#define MPID_NEM_ND_VC_IS_CONNECTED(_vc) (  \
+    (_vc) &&                                \
+    (MPID_NEM_ND_VCCH_NETMOD_STATE_GET(_vc) == MPID_NEM_ND_VC_STATE_CONNECTED) &&   \
     (MPID_NEM_ND_CONN_HND_IS_VALID(MPID_NEM_ND_VCCH_NETMOD_FIELD_GET(_vc, conn_hnd))) &&     \
     (MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(_vc)->state == MPID_NEM_ND_CONN_ACTIVE)     \
 )
 
+#define MPID_NEM_ND_VC_IS_CONNECTING(_vc) (\
+    (_vc) &&    \
+    (MPID_NEM_ND_VCCH_NETMOD_STATE_GET(_vc) == MPID_NEM_ND_VC_STATE_CONNECTING) \
+)
+
 /* CONN is orphan if
  * - conn is not valid
  * - conn is related to a VC that is no longer related to it (eg: lost in head to head)
  */
+/*
 #define MPID_NEM_ND_CONN_IS_ORPHAN(_hnd) (\
     !MPID_NEM_ND_CONN_HND_IS_VALID(_hnd) ||                  \
     ((_hnd->vc) && (MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(_hnd->vc) != _hnd)) \
 )
+*/
+#define MPID_NEM_ND_CONN_IS_ORPHAN(_hnd) (_hnd->is_orphan)
 #define MPID_NEM_ND_CONN_HAS_SCREDITS(_hnd) (_hnd->send_credits > 0)
 #define MPID_NEM_ND_CONN_DECR_SCREDITS(_hnd) (_hnd->send_credits--)
+#define MPID_NEM_ND_CONN_DECR_CACHE_SCREDITS(_hnd) (_hnd->cache_credits--)
 /* #define MPID_NEM_ND_CONN_INCR_SCREDITS(_hnd) (_hnd->send_credits++) */
 
 /* #define MPID_NEM_ND_CONN_DECR_RCREDITS(_hnd) (_hnd->recv_credits--) */
@@ -310,18 +353,19 @@
 int MPID_Nem_nd_dev_hnd_init(MPID_Nem_nd_dev_hnd_t *phnd, MPIU_ExSetHandle_t ex_hnd);
 int MPID_Nem_nd_dev_hnd_finalize(MPID_Nem_nd_dev_hnd_t *phnd);
 
-int MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_t dev_hnd, MPID_Nem_nd_conn_type_t conn_type, INDConnector *p_conn, MPID_Nem_nd_conn_hnd_t *pconn_hnd);
+int MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_t dev_hnd, MPID_Nem_nd_conn_type_t conn_type, INDConnector *p_conn, MPIDI_VC_t *vc, MPID_Nem_nd_conn_hnd_t *pconn_hnd);
 int MPID_Nem_nd_conn_hnd_finalize(MPID_Nem_nd_dev_hnd_t dev_hnd, MPID_Nem_nd_conn_hnd_t *p_conn_hnd);
 
 int MPID_Nem_nd_sm_init(void );
 int MPID_Nem_nd_sm_finalize(void );
-int MPID_Nem_nd_sm_poll(void );
+int MPID_Nem_nd_sm_poll(int in_blocking_poll);
 int MPID_Nem_nd_conn_block_op_init(MPID_Nem_nd_conn_hnd_t conn_hnd);
 int MPID_Nem_nd_conn_msg_bufs_init(MPID_Nem_nd_conn_hnd_t conn_hnd);
 int MPID_Nem_nd_listen_for_conn(int pg_rank, char **bc_val_p, int *val_max_sz_p);
 int MPID_Nem_nd_conn_disc(MPID_Nem_nd_conn_hnd_t conn_hnd);
 int MPID_Nem_nd_conn_est(MPIDI_VC_t *vc);
-int MPID_Nem_nd_post_sendv(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_IOV *iov, int n_iov);
+int MPID_Nem_nd_post_sendv(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_Request *sreqp);
+int MPID_Nem_nd_post_sendbv(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_Request *sreqp);
 
 
 int MPID_Nem_nd_init(MPIDI_PG_t *pg_p, int pg_rank, char **bc_val_p, int *val_max_sz_p);

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_init.cpp
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_init.cpp	2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_init.cpp	2011-02-21 22:57:52 UTC (rev 8003)
@@ -82,6 +82,7 @@
     vc_ch->prev = NULL;
 
     MPID_NEM_ND_VCCH_NETMOD_CONN_HND_INIT(vc);
+    MPID_NEM_ND_VCCH_NETMOD_TMP_CONN_HND_INIT(vc);
     MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_INIT(vc);
     MPID_NEM_ND_VCCH_NETMOD_PENDING_SENDQ_INIT(vc);
  fn_exit:

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_misc_util.cpp
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_misc_util.cpp	2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_misc_util.cpp	2011-02-21 22:57:52 UTC (rev 8003)
@@ -337,10 +337,12 @@
     MPIU_Assert(plocal_won_flag != NULL);
     if(MPIDI_Process.my_pg == remote_pg){
         /* Same process group - compare ranks to determine the winning rank */
+        MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Same process group, comparing ranks");
         *plocal_won_flag = (MPIDI_Process.my_pg_rank < remote_rank) ? 1 : 0;
     }
     else{
         /* Different process groups - compare pg ids to determine the winning rank */
+        MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Diff process group, comparing ids");
         *plocal_won_flag = (strcmp((char *)MPIDI_Process.my_pg->id, remote_pg_id) < 0) ? 1 : 0;
     }
 

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_poll.cpp
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_poll.cpp	2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_poll.cpp	2011-02-21 22:57:52 UTC (rev 8003)
@@ -15,13 +15,13 @@
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_POLL);
 
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_POLL);
+    /* MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_POLL); */
 
-    mpi_errno = MPID_Nem_nd_sm_poll();
+    mpi_errno = MPID_Nem_nd_sm_poll(in_blocking_poll);
     if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_POLL);
+    /* MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_POLL); */
     return mpi_errno;
  fn_fail:
     MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_send.cpp
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_send.cpp	2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_send.cpp	2011-02-21 22:57:52 UTC (rev 8003)
@@ -11,53 +11,70 @@
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPID_Nem_nd_istart_contig_msg(MPIDI_VC_t *vc, void *hdr, MPIDI_msg_sz_t hdr_sz, void *data, MPIDI_msg_sz_t data_sz,
-                                    MPID_Request **sreq_ptr)
+                                    MPID_Request **sreqp_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
-    MPID_Request * sreq = NULL;
+    MPID_Request * sreqp = NULL;
     int is_send_posted = 0;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_ISTART_CONTIG_MSG);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_ISTART_CONTIG_MSG);
     MPIU_Assert((hdr_sz > 0) && (hdr_sz <= sizeof(MPIDI_CH3_Pkt_t)));
 
-    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "istart_contig_msg (hdr_sz=%d,data_sz=%d)", hdr_sz, data_sz));
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "istart_contig_msg (hdr_sz=" MPIDI_MSG_SZ_FMT ",data_sz=" MPIDI_MSG_SZ_FMT ")", hdr_sz, data_sz));
+
+    /* Create a request and queue it */
+    sreqp = MPID_Request_create();
+    MPIU_Assert(sreqp != NULL);
+    MPIU_Object_set_ref(sreqp, 2);
+    sreqp->kind = MPID_REQUEST_SEND;
+
+    sreqp->dev.OnDataAvail = NULL;
+    sreqp->ch.vc = vc;
+    sreqp->dev.iov_offset = 0;
+
+    sreqp->dev.pending_pkt = *(MPIDI_CH3_PktGeneric_t *)hdr;
+    sreqp->dev.iov[0].MPID_IOV_BUF = (char *)&sreqp->dev.pending_pkt;
+    sreqp->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_PktGeneric_t);
+
+    if(data_sz > 0){
+        sreqp->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST )data;
+        sreqp->dev.iov[1].MPID_IOV_LEN = data_sz;
+        sreqp->dev.iov_count = 2;
+    }
+    else{
+        sreqp->dev.iov_count = 1;
+    }
+
+    is_send_posted = 0;
     if(MPID_NEM_ND_VC_IS_CONNECTED(vc)){
         MPID_Nem_nd_conn_hnd_t conn_hnd = MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc);
         /* Try sending data - if no credits queue the remaining data */
         MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "vc connected - trying to send data");
         if(MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd)){
             /* We have send credits */
-            MPID_IOV iov[2];
-            int iov_cnt;
             /* Post a send for data & queue request */
-            iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST )hdr;
-            MPIU_Assert(hdr_sz <= sizeof(MPIDI_CH3_PktGeneric_t ));
-            iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_PktGeneric_t);
-            if(data_sz > 0){
-                iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST )data;
-                iov[1].MPID_IOV_LEN = data_sz;
-                iov_cnt = 2;
+            if(!MPID_NEM_ND_IS_BLOCKING_REQ(sreqp)){
+                mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, sreqp);
+                if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
             }
             else{
-                iov_cnt = 1;
+                mpi_errno = MPID_Nem_nd_post_sendbv(conn_hnd, sreqp);
+                if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
             }
 
-            mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, iov, iov_cnt);
-            if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-
             /* Now queue the request in posted queue */
             is_send_posted = 1;
         }
     }
     else{
         /* VC is not connected */
-        is_send_posted = 0;
-        if(MPID_NEM_ND_CONN_IS_CONNECTING(MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc))){
+        if(MPID_NEM_ND_VC_IS_CONNECTING(vc)){
             /* Already connecting - just queue req in pending queue */
             MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "vc connecting - queueing data");
         }
         else{
+            MPIU_Assert(MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc) == MPID_NEM_ND_VC_STATE_DISCONNECTED);
             /* Start connecting and queue req in pending queue */
             mpi_errno = MPID_Nem_nd_conn_est(vc);
             if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
@@ -65,36 +82,14 @@
         }
     }
 
-    /* Create a request and queue it */
-    sreq = MPID_Request_create();
-    MPIU_Assert(sreq != NULL);
-    MPIU_Object_set_ref(sreq, 2);
-    sreq->kind = MPID_REQUEST_SEND;
-
-    sreq->dev.OnDataAvail = NULL;
-    sreq->ch.vc = vc;
-    sreq->dev.iov_offset = 0;
-
-    sreq->dev.pending_pkt = *(MPIDI_CH3_PktGeneric_t *)hdr;
-    sreq->dev.iov[0].MPID_IOV_BUF = (char *)&sreq->dev.pending_pkt;
-    sreq->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_PktGeneric_t);
-
-    if(data_sz > 0){
-        sreq->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST )data;
-        sreq->dev.iov[1].MPID_IOV_LEN = data_sz;
-        sreq->dev.iov_count = 2;
-    }
-    else{
-        sreq->dev.iov_count = 1;
-    }
     if(is_send_posted){
-        MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_ENQUEUE(vc, sreq);
+        MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_ENQUEUE(vc, sreqp);
     }
     else{
-        MPID_NEM_ND_VCCH_NETMOD_PENDING_SENDQ_ENQUEUE(vc, sreq);
+        MPID_NEM_ND_VCCH_NETMOD_PENDING_SENDQ_ENQUEUE(vc, sreqp);
     }
 
-    *sreq_ptr = sreq;
+    *sreqp_ptr = sreqp;
 
  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_ISTART_CONTIG_MSG);
@@ -108,7 +103,7 @@
 #define FUNCNAME MPID_Nem_nd_send_contig
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_send_contig(MPIDI_VC_t *vc, MPID_Request *sreq, void *hdr, MPIDI_msg_sz_t hdr_sz,
+int MPID_Nem_nd_send_contig(MPIDI_VC_t *vc, MPID_Request *sreqp, void *hdr, MPIDI_msg_sz_t hdr_sz,
                                 void *data, MPIDI_msg_sz_t data_sz)
 {
     int mpi_errno = MPI_SUCCESS;
@@ -118,72 +113,70 @@
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SEND_CONTIG);
     MPIU_Assert((hdr_sz > 0) && (hdr_sz <= sizeof(MPIDI_CH3_Pkt_t)));
 
-    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "isend_contig_msg (hdr_sz=%d,data_sz=%d)", hdr_sz, data_sz));
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "isend_contig_msg (hdr_sz=" MPIDI_MSG_SZ_FMT ",data_sz=" MPIDI_MSG_SZ_FMT ")", hdr_sz, data_sz));
+    /* FIXME: Update the req dev iov fields only for unposted sends
+     */
+    sreqp->ch.vc = vc;
+    sreqp->dev.iov_offset = 0;
+    sreqp->dev.pending_pkt = *(MPIDI_CH3_PktGeneric_t *)hdr;
+    sreqp->dev.iov[0].MPID_IOV_BUF = (char *)&sreqp->dev.pending_pkt;
+    sreqp->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_PktGeneric_t);
+
+    if(data_sz > 0){
+        sreqp->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST )data;
+        sreqp->dev.iov[1].MPID_IOV_LEN = data_sz;
+        sreqp->dev.iov_count = 2;
+    }
+    else{
+        sreqp->dev.iov_count = 1;
+    }
+
+    is_send_posted = 0;
     if(MPID_NEM_ND_VC_IS_CONNECTED(vc)){
         MPID_Nem_nd_conn_hnd_t conn_hnd = MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc);
         /* Try sending data - if no credits queue the remaining data */
         MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "vc connected - trying to send data");
         if(MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd)){
-            /* We have send credits */
-            MPID_IOV iov[2];
-            int iov_cnt;
-            /* Post a send for data & queue request */
-            iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST )hdr;
-            MPIU_Assert(hdr_sz <= sizeof(MPIDI_CH3_PktGeneric_t ));
-            iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_PktGeneric_t);
-            if(data_sz > 0){
-                iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST )data;
-                iov[1].MPID_IOV_LEN = data_sz;
-                iov_cnt = 2;
+
+            if(!MPID_NEM_ND_IS_BLOCKING_REQ(sreqp)){
+                mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, sreqp);
+                if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
             }
             else{
-                iov_cnt = 1;
+                mpi_errno = MPID_Nem_nd_post_sendbv(conn_hnd, sreqp);
+                if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
             }
 
-            mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, iov, iov_cnt);
-            if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-
             /* Now queue the request in posted queue */
             is_send_posted = 1;
         }
     }
     else{
         /* VC is not connected */
-        is_send_posted = 0;
-        if(MPID_NEM_ND_CONN_IS_CONNECTING(MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc))){
+        if(MPID_NEM_ND_VC_IS_CONNECTING(vc)){
             /* Already connecting - just queue req in pending queue */
             MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "vc connecting - queueing data");
         }
         else{
             /* Start connecting and queue req in pending queue */
+            if(MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc) != MPID_NEM_ND_VC_STATE_DISCONNECTED){
+                MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "vc(%p:%d), conn(%p:%d",
+                    vc, MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc), 
+                    MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc),
+                    (MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc))->state));
+            }
+            MPIU_Assert(MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc) == MPID_NEM_ND_VC_STATE_DISCONNECTED);
             mpi_errno = MPID_Nem_nd_conn_est(vc);
             if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
             MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Posted a connect - queueing data");
         }
     }
 
-    /* FIXME: Update the req dev iov fields only for unposted sends
-     */
-    /* Create a request and queue it */
-    sreq->ch.vc = vc;
-    sreq->dev.iov_offset = 0;
-    sreq->dev.pending_pkt = *(MPIDI_CH3_PktGeneric_t *)hdr;
-    sreq->dev.iov[0].MPID_IOV_BUF = (char *)&sreq->dev.pending_pkt;
-    sreq->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_PktGeneric_t);
-
-    if(data_sz > 0){
-        sreq->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST )data;
-        sreq->dev.iov[1].MPID_IOV_LEN = data_sz;
-        sreq->dev.iov_count = 2;
-    }
-    else{
-        sreq->dev.iov_count = 1;
-    }
     if(is_send_posted){
-        MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_ENQUEUE(vc, sreq);
+        MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_ENQUEUE(vc, sreqp);
     }
     else{
-        MPID_NEM_ND_VCCH_NETMOD_PENDING_SENDQ_ENQUEUE(vc, sreq);
+        MPID_NEM_ND_VCCH_NETMOD_PENDING_SENDQ_ENQUEUE(vc, sreqp);
     }
 
  fn_exit:
@@ -198,21 +191,88 @@
 #define FUNCNAME MPID_Nem_nd_send_noncontig
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_send_noncontig(MPIDI_VC_t *vc, MPID_Request *sreq, void *header, MPIDI_msg_sz_t hdr_sz)
+int MPID_Nem_nd_send_noncontig(MPIDI_VC_t *vc, MPID_Request *sreqp, void *header, MPIDI_msg_sz_t hdr_sz)
 {
     int mpi_errno = MPI_SUCCESS;
+    int is_send_posted = 0;
+    MPID_IOV *iov;
+    int iov_cnt = 0, i;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SEND_NONCONTIG);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SEND_NONCONTIG);
-    MPIU_Assert(hdr_sz <= sizeof(MPIDI_CH3_Pkt_t));
-    /* FIXME: We have not implemented send for non contig msgs yet */
-    MPIU_Assert(0);
+    MPIU_Assert((hdr_sz > 0) && (hdr_sz <= sizeof(MPIDI_CH3_Pkt_t)));
+
+    iov = &(sreqp->dev.iov[0]);
+    /* Reserve 1st IOV for header */
+    iov_cnt = MPID_IOV_LIMIT - 1;
+
+    /* On return iov_cnt refers to the number of IOVs loaded */
+    mpi_errno = MPIDI_CH3U_Request_load_send_iov(sreqp, &(iov[1]), &iov_cnt);
+    MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|loadsendiov");
+
+    sreqp->dev.pending_pkt = *(MPIDI_CH3_PktGeneric_t *)header;
+    iov[0].MPID_IOV_BUF = (char *)&sreqp->dev.pending_pkt;
+    iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_PktGeneric_t);
+
+    iov_cnt += 1;
+
+    /* FIXME: Update the req dev iov fields only for unposted sends
+     */
+    sreqp->ch.vc = vc;
+    sreqp->dev.iov_offset = 0;
+    sreqp->dev.iov_count = iov_cnt;
+
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "isend_noncontig_msg (hdr_sz=" MPIDI_MSG_SZ_FMT ")", hdr_sz));
+    for(i=1; i<iov_cnt; i++){
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "isend_noncontig_msg (iov[%d] = %p, size =%u)", i, iov[i].MPID_IOV_BUF, iov[i].MPID_IOV_LEN));
+    }
     if(MPID_NEM_ND_VC_IS_CONNECTED(vc)){
+        MPID_Nem_nd_conn_hnd_t conn_hnd = MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc);
         /* Try sending data - if no credits queue the remaining data */
+        MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "vc connected - trying to send data");
+        if(MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd)){
+            if(!MPID_NEM_ND_IS_BLOCKING_REQ(sreqp)){
+                mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, sreqp);
+                if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            }
+            else{
+                mpi_errno = MPID_Nem_nd_post_sendbv(conn_hnd, sreqp);
+                if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            }
+
+            /* Now queue the request in posted queue */
+            is_send_posted = 1;
+        }
     }
     else{
-        /* Start connecting */
+        /* VC is not connected */
+        is_send_posted = 0;
+        if(MPID_NEM_ND_VC_IS_CONNECTING(vc)){
+            /* Already connecting - just queue req in pending queue */
+            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "vc connecting - queueing data");
+        }
+        else{
+            /* Start connecting and queue req in pending queue */
+            if(MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc) != MPID_NEM_ND_VC_STATE_DISCONNECTED){
+                MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "vc(%p:%d), conn(%p:%d",
+                    vc, MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc), 
+                    MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc),
+                    (MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc))->state));
+            }
+            MPIU_Assert(MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc) == MPID_NEM_ND_VC_STATE_DISCONNECTED);
+            mpi_errno = MPID_Nem_nd_conn_est(vc);
+            if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Posted a connect - queueing data");
+        }
     }
+
+    if(is_send_posted){
+        MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_ENQUEUE(vc, sreqp);
+    }
+    else{
+        MPID_NEM_ND_VCCH_NETMOD_PENDING_SENDQ_ENQUEUE(vc, sreqp);
+    }
+
  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SEND_NONCONTIG);
     return mpi_errno;

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_sm.cpp
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_sm.cpp	2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_sm.cpp	2011-02-21 22:57:52 UTC (rev 8003)
@@ -16,13 +16,15 @@
 static int gen_recv_fail_handler(MPID_Nem_nd_msg_result_t *recv_result);
 static int send_success_handler(MPID_Nem_nd_msg_result_t *send_result);
 static int zcp_mw_send_success_handler(MPID_Nem_nd_msg_result_t *send_result);
-static int zcp_send_success_handler(MPID_Nem_nd_msg_result_t *zcp_result);
+static int cont_send_success_handler(MPID_Nem_nd_msg_result_t *zcp_result);
 static int netmod_msg_send_success_handler(MPID_Nem_nd_msg_result_t *send_result);
+static int zcp_read_success_handler(MPID_Nem_nd_msg_result_t *send_result);
 static int zcp_read_fail_handler(MPID_Nem_nd_msg_result_t *send_result);
 static int wait_cack_success_handler(MPID_Nem_nd_msg_result_t *recv_result);
 static int wait_lack_success_handler(MPID_Nem_nd_msg_result_t *recv_result);
 static int recv_success_handler(MPID_Nem_nd_msg_result_t *send_result);
 static int dummy_msg_handler(MPID_Nem_nd_msg_result_t *result);
+static int quiescent_msg_handler(MPID_Nem_nd_msg_result_t *result);
 static int free_msg_result_handler(MPID_Nem_nd_msg_result_t *result);
 
 /* The EX handler func decls */
@@ -35,13 +37,20 @@
 static int __cdecl gen_ex_fail_handler(MPIU_EXOVERLAPPED *ov);
 static int __cdecl block_op_handler(MPIU_EXOVERLAPPED *ov);
 static int __cdecl manual_event_handler(MPIU_EXOVERLAPPED *ov);
+static int __cdecl dummy_handler(MPIU_EXOVERLAPPED *ov);
 }
 
 static inline int MPID_Nem_nd_handle_posted_sendq_head_req(MPIDI_VC_t *vc, int *req_complete);
+static inline int MPID_Nem_nd_handle_posted_sendq_tail_req(MPIDI_VC_t *vc, int *req_complete);
 static int process_pending_req(MPID_Nem_nd_conn_hnd_t conn_hnd);
 int MPID_Nem_nd_update_fc_info(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_Nem_nd_msg_t *pmsg);
 int MPID_Nem_nd_sm_block(MPID_Nem_nd_block_op_hnd_t op_hnd);
-
+int MPID_Nem_nd_pack_iov(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_IOV *iovp,
+                         int offset_start,
+                         int *offset_endp,
+                         MPID_Nem_nd_msg_t *pmsg,
+                         MPID_Nem_nd_pack_t *pack_typep,
+                         SIZE_T *nbp);
 #undef FUNCNAME
 #define FUNCNAME MPID_Nem_nd_sm_init
 #undef FCNAME
@@ -76,7 +85,7 @@
 #define FUNCNAME MPID_Nem_nd_block_op_init
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_block_op_init(MPID_Nem_nd_block_op_hnd_t *phnd, MPID_Nem_nd_conn_hnd_t conn_hnd)
+int MPID_Nem_nd_block_op_init(MPID_Nem_nd_block_op_hnd_t *phnd, int npending_ops, MPID_Nem_nd_conn_hnd_t conn_hnd, int is_manual_event)
 {
     int mpi_errno = MPI_SUCCESS;
 	HRESULT hr;
@@ -87,36 +96,35 @@
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_BLOCK_OP_INIT);
 
     MPIU_Assert(phnd != NULL);
+    MPIU_Assert(npending_ops > 0);
+
+    /* FIXME: For now we only allow 1 blocking op on the conn */
+    MPIU_Assert(conn_hnd->npending_ops == 0);
 	MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd));
 
     MPIU_CHKPMEM_MALLOC(*phnd, MPID_Nem_nd_block_op_hnd_t, sizeof(struct MPID_Nem_nd_block_op_hnd_), mpi_errno, "Block op hnd");
 
+    (*phnd)->npending_ops = npending_ops;
 	(*phnd)->conn_hnd = conn_hnd;
 
-	if(conn_hnd->npending_ops <= 1){
-		/* Call the block op handlers only when the last pending event is over 
-		 * Note that the event handler gets called AFTER the event
-		 */
-		MPIU_ExInitOverlapped(&((*phnd)->ex_ov), block_op_handler, block_op_handler);
-	}
-	else{
-		/* Handle manual events with the event handler */
-		MPIU_ExInitOverlapped(&((*phnd)->ex_ov), manual_event_handler, manual_event_handler);
-	}
+    if(is_manual_event){
+        MPIU_ExInitOverlapped(&((*phnd)->ex_ov), manual_event_handler, manual_event_handler);
+    }
+    else{
+        MPIU_Assert(0);
+        MPIU_ExInitOverlapped(&((*phnd)->ex_ov), block_op_handler, block_op_handler);
+    }
 
     pov = MPIU_EX_GET_OVERLAPPED_PTR(&((*phnd)->ex_ov));
 
     /* Executive initializes event to NULL - So create events after initializing the 
      * handlers
      */
-    pov->hEvent = CreateEvent(NULL, FALSE, FALSE, NULL);
+    pov->hEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
     MPIU_ERR_CHKANDJUMP((pov->hEvent == NULL), mpi_errno, MPI_ERR_OTHER, "**intern");
 
-	/* Get notification for all events on CQ */
-	hr = MPID_Nem_nd_dev_hnd_g->p_cq->Notify(ND_CQ_NOTIFY_ANY, MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR((*phnd)));
-	MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
-            mpi_errno, MPI_ERR_OTHER, "**nd_write", "**nd_write %s %d",
-            _com_error(hr).ErrorMessage(), hr);
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Get notifications from cq(%p)/conn(%p)/block_op(%p) on ov(%p)",
+        MPID_Nem_nd_dev_hnd_g->p_cq, conn_hnd, (*phnd), pov));
 
     MPIU_CHKPMEM_COMMIT();
  fn_exit:
@@ -135,12 +143,20 @@
 int MPID_Nem_nd_block_op_finalize(MPID_Nem_nd_block_op_hnd_t *phnd)
 {
     int mpi_errno = MPI_SUCCESS;
+    OVERLAPPED *pov;
+
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_BLOCK_OP_FINALIZE);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_BLOCK_OP_FINALIZE);
 
     MPIU_Assert(phnd != NULL);
     if(*phnd){
+        pov = MPIU_EX_GET_OVERLAPPED_PTR(&((*phnd)->ex_ov));
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Trying to finalize conn(%p)/block_op(%p) on ov(%p)",
+           (*phnd)->conn_hnd, (*phnd), pov));
+        if(pov->hEvent){
+            CloseHandle(pov->hEvent);
+        }
         MPIU_Free(*phnd);
     }
  fn_exit:
@@ -168,18 +184,15 @@
 	MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(op_hnd->conn_hnd));
 
 	/* Re-initialize the ex ov */
-	if(op_hnd->conn_hnd->npending_ops <= 1){
-		ret = MPIU_ExReInitOverlapped(&(op_hnd->ex_ov), block_op_handler, block_op_handler);
-		MPIU_ERR_CHKANDJUMP((ret == FALSE), mpi_errno, MPI_ERR_OTHER, "**intern");
-	}
-	else{
-		ret = MPIU_ExReInitOverlapped(&(op_hnd->ex_ov), manual_event_handler, manual_event_handler);
-		MPIU_ERR_CHKANDJUMP((ret == FALSE), mpi_errno, MPI_ERR_OTHER, "**intern");
-	}
+    ret = MPIU_ExReInitOverlapped(&(op_hnd->ex_ov), NULL, NULL);
+    MPIU_ERR_CHKANDJUMP((ret == FALSE), mpi_errno, MPI_ERR_OTHER, "**intern");
 
     pov = MPIU_EX_GET_OVERLAPPED_PTR(&(op_hnd->ex_ov));
     MPIU_Assert(pov->hEvent != NULL);
 
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Re-initializing conn(%p)/block_op(%p) on ov(%p)",
+           op_hnd->conn_hnd, op_hnd, pov));
+
  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_BLOCK_OP_REINIT);
     return mpi_errno;
@@ -188,39 +201,7 @@
     goto fn_exit;
 }
 
-
-/*
 #undef FUNCNAME
-#define FUNCNAME MPID_Nem_nd_conn_block_op_reinit
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_conn_block_op_reinit(MPID_Nem_nd_conn_hnd_t conn_hnd)
-{
-    int mpi_errno = MPI_SUCCESS;
-    OVERLAPPED *pov;
-    BOOL ret;
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_CONN_BLOCK_OP_REINIT);
-
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_CONN_BLOCK_OP_REINIT);
-
-    MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_INIT(conn_hnd));
-
-    pov = MPIU_EX_GET_OVERLAPPED_PTR(&(conn_hnd->block_ov));
-    MPIU_Assert(pov->hEvent != NULL);
-
-    ret = ResetEvent(pov->hEvent);
-    MPIU_ERR_CHKANDJUMP((pov->hEvent == NULL), mpi_errno, MPI_ERR_OTHER, "**intern");
-
- fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_CONN_BLOCK_OP_REINIT);
-    return mpi_errno;
- fn_fail:
-    MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
-    goto fn_exit;
-}
-
-*/
-
 #define FUNCNAME MPID_Nem_nd_conn_msg_bufs_init
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
@@ -244,35 +225,35 @@
     MSGBUF_FREEQ_INIT(conn_hnd);
 
     /* Register the sendq & recvq with adapter - We block while registering memory */
-    mpi_errno = MPID_Nem_nd_block_op_init(&rsbuf_op_hnd, conn_hnd);
+    mpi_errno = MPID_Nem_nd_block_op_init(&rsbuf_op_hnd, 1, conn_hnd, 1);
     if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Registring rs memory conn(%p)/block_op(%p) on ov(%p)",
+           conn_hnd, rsbuf_op_hnd, MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(rsbuf_op_hnd)));
+
     hr = MPID_Nem_nd_dev_hnd_g->p_ad->RegisterMemory(conn_hnd->rsbuf, sizeof(conn_hnd->rsbuf), MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(rsbuf_op_hnd), &(conn_hnd->rsbuf_hmr));
-    if(hr == ND_PENDING){
+    if(SUCCEEDED(hr)){
 		/* Manual event */
 		conn_hnd->npending_ops++;
 		mpi_errno = MPID_Nem_nd_sm_block(rsbuf_op_hnd);
 		if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-		/*
-        hr = MPID_Nem_nd_dev_hnd_g->p_ad->GetOverlappedResult(MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(rsbuf_op_hnd), &nb, TRUE);
-		*/
     }
     MPIU_ERR_CHKANDJUMP2(FAILED(hr),
         mpi_errno, MPI_ERR_OTHER, "**nd_listen", "**nd_listen %s %d",
         _com_error(hr).ErrorMessage(), hr);
 
-    mpi_errno = MPID_Nem_nd_block_op_init(&ssbuf_op_hnd, conn_hnd);
+    mpi_errno = MPID_Nem_nd_block_op_init(&ssbuf_op_hnd, 1, conn_hnd, 1);
     if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Registring ss memory conn(%p)/block_op(%p) on ov(%p)",
+           conn_hnd, ssbuf_op_hnd, MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(ssbuf_op_hnd)));
+
     hr = MPID_Nem_nd_dev_hnd_g->p_ad->RegisterMemory(conn_hnd->ssbuf, sizeof(conn_hnd->ssbuf), MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(ssbuf_op_hnd), &(conn_hnd->ssbuf_hmr));
-    if(hr == ND_PENDING){
+    if(SUCCEEDED(hr)){
 		/* Manual event */
 		conn_hnd->npending_ops++;
 		mpi_errno = MPID_Nem_nd_sm_block(ssbuf_op_hnd);
 		if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-		/*
-        hr = MPID_Nem_nd_dev_hnd_g->p_ad->GetOverlappedResult(MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(ssbuf_op_hnd), &nb, TRUE);
-		*/
     }
     MPIU_ERR_CHKANDJUMP2(FAILED(hr),
         mpi_errno, MPI_ERR_OTHER, "**nd_listen", "**nd_listen %s %d",
@@ -299,12 +280,15 @@
     }
     conn_hnd->p_ep->SubmitRequestBatch();
 
-    /* FIXME: REMOVE ME !! -start */
     MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "conn_hnd (%p)", conn_hnd));
+    MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "========== RECV SBUFS ===========");
     for(i=0; i<MPID_NEM_ND_CONN_RECVQ_SZ;i++){
         MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "conn_hnd->rsbuf[%d].msg = (%p)", i, &(conn_hnd->rsbuf[i].msg)));
     }
-    /* FIXME: REMOVE ME !! -end */
+    MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "========== SEND SBUFS ===========");
+    for(i=0; i<MPID_NEM_ND_CONN_SENDQ_SZ;i++){
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "conn_hnd->ssbuf[%d].msg = (%p)", i, &(conn_hnd->ssbuf[i].msg)));
+    }
 
 fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_CONN_MSG_BUFS_INIT);
@@ -333,18 +317,18 @@
     if(is_blocking){
         MPID_Nem_nd_block_op_hnd_t op_hnd;
 
-        mpi_errno = MPID_Nem_nd_block_op_init(&op_hnd, lconn_hnd);
+        mpi_errno = MPID_Nem_nd_block_op_init(&op_hnd, 1, lconn_hnd, 1);
         if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-        
+
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Blocking on accept lconn(%p)/block_op(%p) on ov(%p)",
+           lconn_hnd, op_hnd, MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(op_hnd)));
+
         hr = new_conn_hnd->p_conn->Accept(new_conn_hnd->p_ep, NULL, 0, MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(op_hnd));
-        if(hr == ND_PENDING){
+        if(SUCCEEDED(hr)){
 			/* Manual event */
 			lconn_hnd->npending_ops++;
 			mpi_errno = MPID_Nem_nd_sm_block(op_hnd);
 			if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-			/*
-            hr = new_conn_hnd->p_conn->GetOverlappedResult(MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(op_hnd), &nb, TRUE);
-			*/
         }
         MPIU_ERR_CHKANDJUMP2(FAILED(hr),
             mpi_errno, MPI_ERR_OTHER, "**nd_accept", "**nd_accept %s %d",
@@ -356,9 +340,12 @@
             mpi_errno, MPI_ERR_OTHER, "**nd_accept", "**nd_accept %s %d",
             _com_error(hr).ErrorMessage(), hr);
 
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Posting next req for conn on lconn(%p)/block_op(%p) on ov(%p)",
+           lconn_hnd, op_hnd, MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(op_hnd)));
+
         /* Post next req for connection */
         hr = MPID_Nem_nd_dev_hnd_g->p_listen->GetConnectionRequest(lconn_hnd->p_conn, MPIU_EX_GET_OVERLAPPED_PTR(&(lconn_hnd->recv_ov)));
-        MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
+        MPIU_ERR_CHKANDJUMP2(FAILED(hr),
             mpi_errno, MPI_ERR_OTHER, "**nd_accept", "**nd_accept %s %d",
             _com_error(hr).ErrorMessage(), hr);
     }
@@ -366,7 +353,7 @@
         MPIU_Assert(0);
         SET_EX_RD_HANDLER(lconn_hnd, listen_success_handler, quiescent_handler);
         hr = MPID_Nem_nd_lconn_hnd->p_conn->Accept(new_conn_hnd->p_ep, NULL, 0, MPIU_EX_GET_OVERLAPPED_PTR(&(MPID_Nem_nd_lconn_hnd->recv_ov)));
-        MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
+        MPIU_ERR_CHKANDJUMP2(FAILED(hr),
             mpi_errno, MPI_ERR_OTHER, "**nd_accept", "**nd_accept %s %d",
             _com_error(hr).ErrorMessage(), hr);
     }
@@ -386,7 +373,7 @@
 int MPID_Nem_nd_listen_for_conn(int pg_rank, char **bc_val_p, int *val_max_sz_p)
 {
     int mpi_errno = MPI_SUCCESS, ret, use_default_interface=0;
-    size_t len;
+    SIZE_T len;
     HRESULT hr;
     char *buf;
     int i;
@@ -397,7 +384,7 @@
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_LISTEN_FOR_CONN);
 
     /* Create listen conn */
-    mpi_errno = MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_g, MPID_NEM_ND_LISTEN_CONN, NULL, &MPID_Nem_nd_lconn_hnd);
+    mpi_errno = MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_g, MPID_NEM_ND_LISTEN_CONN, NULL, NULL, &MPID_Nem_nd_lconn_hnd);
     if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
     /* Listen for connections */
@@ -411,9 +398,12 @@
 
     SET_EX_RD_HANDLER(MPID_Nem_nd_lconn_hnd, listen_success_handler, quiescent_handler);
 
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Posting first req for conn on lconn(%p)on ov(%p)",
+           MPID_Nem_nd_lconn_hnd, MPIU_EX_GET_OVERLAPPED_PTR(&(MPID_Nem_nd_lconn_hnd->recv_ov))));
+
     /* FIXME: How many conn requests should we pre-post ? */
     hr = MPID_Nem_nd_dev_hnd_g->p_listen->GetConnectionRequest(MPID_Nem_nd_lconn_hnd->p_conn, MPIU_EX_GET_OVERLAPPED_PTR(&(MPID_Nem_nd_lconn_hnd->recv_ov)));
-    MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
+    MPIU_ERR_CHKANDJUMP2(FAILED(hr),
         mpi_errno, MPI_ERR_OTHER, "**nd_listen", "**nd_listen %s %d",
         _com_error(hr).ErrorMessage(), hr);
 
@@ -470,6 +460,7 @@
     goto fn_exit;
 }
 
+/* Wait for discing until the other side sends us some data or disconnects */
 #undef FUNCNAME
 #define FUNCNAME MPID_Nem_nd_conn_passive_disc
 #undef FCNAME
@@ -487,11 +478,13 @@
         /* Make the conn an orphan */
         conn_hnd->vc = NULL;
         MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_QUIESCENT);
-        SET_EX_RD_HANDLER(conn_hnd, passive_quiescent_handler, passive_quiescent_handler);
+        SET_EX_RD_HANDLER(conn_hnd, dummy_handler, dummy_handler);
 
-        /* Set the recv sbuf handlers to dummy handlers */
+        /* Set the recv sbuf handlers to quiescent msg handlers - the conn is disconnected
+         * after we receive a CACK/CNAK, i.e., some data, on this conn
+         */
         for(i=0;i<MPID_NEM_ND_CONN_RECVQ_SZ;i++){
-            SET_MSGBUF_HANDLER(&((conn_hnd->rsbuf[i]).msg), dummy_msg_handler, dummy_msg_handler);
+            SET_MSGBUF_HANDLER(&((conn_hnd->rsbuf[i]).msg), quiescent_msg_handler, quiescent_msg_handler);
         }
     }
 
@@ -522,17 +515,26 @@
         int i=0;
         /* Make the conn an orphan */
         conn_hnd->vc = NULL;
+        conn_hnd->tmp_vc = NULL;
+
         MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_QUIESCENT);
         SET_EX_WR_HANDLER(conn_hnd, quiescent_handler, quiescent_handler);
-
+
+        /* FIXME: DEREGISTER ALL RECV BUFS HERE ...*/
         /* Set the recv sbuf handlers to dummy handlers */
         for(i=0;i<MPID_NEM_ND_CONN_RECVQ_SZ;i++){
             SET_MSGBUF_HANDLER(&((conn_hnd->rsbuf[i]).msg), dummy_msg_handler, dummy_msg_handler);
         }
 
+        MPIU_Assert(conn_hnd->npending_ops == 0);
+        MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "Posting disconnect on conn(%p)", conn_hnd);
+    
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Posting disc on conn(%p) on ov(%p)",
+           conn_hnd, MPIU_EX_GET_OVERLAPPED_PTR(&(conn_hnd->send_ov))));
+
         /* Post disconnect on the ND Conn corresponding to VC */
         hr = conn_hnd->p_conn->Disconnect(MPIU_EX_GET_OVERLAPPED_PTR(&(conn_hnd->send_ov)));
-        MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
+        MPIU_ERR_CHKANDJUMP2(FAILED(hr),
             mpi_errno, MPI_ERR_OTHER, "**nd_disc", "**nd_disc %s %d",
             _com_error(hr).ErrorMessage(), hr);
     }
@@ -550,7 +552,7 @@
 #define FUNCNAME MPID_Nem_nd_post_send_msg
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_post_send_msg(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_Nem_nd_msg_t *pmsg, int msg_len, int is_blocking)
+int MPID_Nem_nd_post_send_msg(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_Nem_nd_msg_t *pmsg, SIZE_T msg_len, int is_blocking)
 {
     int mpi_errno = MPI_SUCCESS;
     HRESULT hr;
@@ -567,12 +569,16 @@
     
     was_fc_pkt = (MPID_NEM_ND_IS_FC_PKT(pmsg->hdr.type)) ? TRUE : FALSE;
 
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Post send msg [conn=%p, on/msg = %p, sz=%d]",conn_hnd, pmsg, msg_len));
+
     /* Update FC info */
     mpi_errno = MPID_Nem_nd_update_fc_info(conn_hnd, pmsg);
     if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
     if(is_blocking){
-		mpi_errno = MPID_Nem_nd_block_op_init(&op_hnd, conn_hnd);
+        /* FIXME: Allow blocking sends */
+        MPIU_Assert(0);
+		mpi_errno = MPID_Nem_nd_block_op_init(&op_hnd, 1, conn_hnd, 0);
         if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
         MPIU_CHKPMEM_MALLOC(pmsg_result, MPID_Nem_nd_msg_result_t *, sizeof(MPID_Nem_nd_msg_result_t ), mpi_errno, "block send op result");
@@ -588,13 +594,13 @@
     sge.pAddr = pmsg;
     sge.hMr = conn_hnd->ssbuf_hmr;
 
-    hr = conn_hnd->p_ep->Send(pnd_result, &sge, 1, 0x0);
+    hr = conn_hnd->p_ep->Send(pnd_result, &sge, 1, ND_OP_FLAG_READ_FENCE);
     MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
         mpi_errno, MPI_ERR_OTHER, "**nd_write", "**nd_write %s %d",
         _com_error(hr).ErrorMessage(), hr);
 
 	/* Increment the number of pending ops on conn */
-	conn_hnd->npending_ops++;
+	/* conn_hnd->npending_ops++; */
 
     if(is_blocking){
 		/* Block till all current pending ops complete */
@@ -604,20 +610,19 @@
 		/* No pending ops */
 		MPIU_Assert(conn_hnd->npending_ops == 0);
 
-		mpi_errno = MPID_Nem_nd_block_op_finalize(&op_hnd);
-		if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-
-        /*
-        nresults = MPID_Nem_nd_dev_hnd_g->p_cq->GetResults(&presult, 1);
-        MPIU_ERR_CHKANDJUMP2(FAILED(presult->Status),
-            mpi_errno, MPI_ERR_OTHER, "**nd_write", "**nd_write %s %d",
-            _com_error(presult->Status).ErrorMessage(), presult->Status);
-            */
         MPIU_CHKPMEM_COMMIT();
     }
 
     if(was_fc_pkt){
-        MPID_NEM_ND_CONN_DECR_SCREDITS(conn_hnd);
+        if(conn_hnd->send_in_progress){
+            if(!conn_hnd->zcp_in_progress){
+                MPID_NEM_ND_CONN_DECR_CACHE_SCREDITS(conn_hnd);
+            }
+            /* ZCP packets are not flow controlled */
+        }
+        else{
+            MPID_NEM_ND_CONN_DECR_SCREDITS(conn_hnd);
+        }
     }
 
  fn_exit:
@@ -719,10 +724,8 @@
         /* FIXME: fc info in pkt is updated for every msg sent.
          * Do we have to explicitly update fc info here ?
          */
-        /*
         mpi_errno = MPID_Nem_nd_update_fc_info(conn_hnd, pfc_msg);
         if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-        */
 
         MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Sending CRED PKT...");
         mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, pfc_msg, sizeof(MPID_Nem_nd_msg_hdr_t ), 0);
@@ -748,8 +751,13 @@
 int bind_mw_success_handler(MPID_Nem_nd_msg_result_t *zcp_send_result)
 {
     int mpi_errno = MPI_SUCCESS;
+    int ret_errno;
     MPID_Nem_nd_conn_hnd_t conn_hnd;
+    MPID_Nem_nd_pack_t pack_type = MPID_NEM_ND_INVALID_PACK;
+    MPID_Nem_nd_msg_t *pmsg;
     MPID_Request *zcp_req = NULL;
+    int i;
+    SIZE_T nb, msg_len, rem_len;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_BIND_MW_SUCCESS_HANDLER);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_BIND_MW_SUCCESS_HANDLER);
@@ -759,11 +767,103 @@
 
     zcp_req = MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(conn_hnd->vc);
     MPIU_Assert(zcp_req != NULL);
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "bind succ for IOV[%d] = %p; iov_offset = %d/rem=%d, req=%p",
+        conn_hnd->zcp_send_offset, zcp_req->dev.iov[conn_hnd->zcp_send_offset].MPID_IOV_BUF,
+        zcp_req->dev.iov_offset, zcp_req->dev.iov_count, zcp_req));
+    MPIU_Assert(zcp_req->dev.iov_offset + zcp_req->dev.iov_count <= MPID_IOV_LIMIT);
+    MPIU_Assert(zcp_req->dev.iov_count > 0);
 
-    /* MW created, Registered buf, Bound MW, now post send */
-    mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, zcp_req->dev.iov, zcp_req->dev.iov_count);
-    if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    MPIU_Assert(!MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd));
 
+    /* Post the ND message 
+     * iov[zcp_req->dev.iov_offset, conn_hnd->zcp_send_offset]
+     * First pack any non-ZCP IOVs, then copy zcp send mw & send
+     */
+    MSGBUF_FREEQ_DEQUEUE(conn_hnd, pmsg);
+    MPIU_Assert(pmsg != NULL);
+    
+    SET_MSGBUF_HANDLER(pmsg, zcp_mw_send_success_handler, gen_send_fail_handler);
+    pmsg->hdr.type = MPID_NEM_ND_RD_AVAIL_PKT;
+    pmsg->hdr.credits = 0;
+    msg_len = sizeof(MPID_Nem_nd_msg_hdr_t );
+    rem_len = sizeof(pmsg->buf);
+
+    nb = 0;
+    if(zcp_req->dev.iov_offset < conn_hnd->zcp_send_offset){
+        int off_end;
+        off_end = conn_hnd->zcp_send_offset;
+            
+        /* Piggy-back IOVs */
+        mpi_errno = MPID_Nem_nd_pack_iov(conn_hnd,
+                        zcp_req->dev.iov,
+                        zcp_req->dev.iov_offset,
+                        &off_end,
+                        pmsg,
+                        &(pack_type),
+                        &nb);
+        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        MPIU_Assert(pack_type == MPID_NEM_ND_SR_PACK);
+
+        msg_len += nb;
+        rem_len -= nb;
+    }
+
+    /* Now copy the MSG MW to the packet */
+    MPIU_Assert(rem_len >= sizeof(MPID_Nem_nd_msg_mw_t ));
+    ret_errno = memcpy_s((void *)&(pmsg->buf[nb]), rem_len, &(conn_hnd->zcp_msg_send_mw), sizeof(MPID_Nem_nd_msg_mw_t ));
+    MPIU_ERR_CHKANDJUMP2((ret_errno != 0), mpi_errno, MPI_ERR_OTHER,    
+        "**nd_write", "**nd_write %s %d", strerror(ret_errno), ret_errno);
+
+    msg_len += sizeof(MPID_Nem_nd_msg_mw_t );
+    rem_len -= sizeof(MPID_Nem_nd_msg_mw_t );
+
+    /* Block on progress engine if we exceed the number of RDs allowed on the conn/device */
+    while(MPID_Nem_nd_dev_hnd_g->npending_rds >= 2){
+        mpi_errno = MPID_Nem_nd_sm_poll(0);
+        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    }
+
+    MPID_Nem_nd_dev_hnd_g->npending_rds++; conn_hnd->npending_rds++;
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "dev prds = %d; conn prds = %d",
+        MPID_Nem_nd_dev_hnd_g->npending_rds, conn_hnd->npending_rds));
+
+
+    for(i=zcp_req->dev.iov_offset; i<conn_hnd->zcp_send_offset; i++){
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Sending SR packed IOV[off=%d/tot_iovs=%d]=[%p/%u]",
+        i, zcp_req->dev.iov_count,
+        zcp_req->dev.iov[i].MPID_IOV_BUF,
+        zcp_req->dev.iov[i].MPID_IOV_LEN));
+
+    }
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Sending mem descriptor (buf=%p) : base = %p, length=%I64d, token=%d, mw=%p",
+                zcp_req->dev.iov[conn_hnd->zcp_send_offset].MPID_IOV_BUF,
+                _byteswap_uint64(conn_hnd->zcp_msg_send_mw.mw_data.Base),
+                _byteswap_uint64(conn_hnd->zcp_msg_send_mw.mw_data.Length),
+                conn_hnd->zcp_msg_send_mw.mw_data.Token,
+                conn_hnd->zcp_send_mw));
+
+    mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, pmsg, msg_len, 0);
+	if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+    if(zcp_req->dev.iov[conn_hnd->zcp_send_offset].MPID_IOV_LEN == 0){
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "zcp_req(%p) off[%d -> %d], cnt[%d -> %d]",
+            zcp_req, zcp_req->dev.iov_offset, conn_hnd->zcp_send_offset + 1,
+            zcp_req->dev.iov_count, zcp_req->dev.iov_count - (conn_hnd->zcp_send_offset - zcp_req->dev.iov_offset + 1)));
+        /* Rem IOVs */
+        zcp_req->dev.iov_count -= (conn_hnd->zcp_send_offset - zcp_req->dev.iov_offset + 1);
+
+        zcp_req->dev.iov_offset = conn_hnd->zcp_send_offset + 1;
+    }
+    else{
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "zcp_req(%p) off[%d -> %d], cnt[%d -> %d]",
+            zcp_req, zcp_req->dev.iov_offset, conn_hnd->zcp_send_offset,
+            zcp_req->dev.iov_count, zcp_req->dev.iov_count - (conn_hnd->zcp_send_offset - zcp_req->dev.iov_offset)));
+        /* Rem IOVs */
+        zcp_req->dev.iov_count -= (conn_hnd->zcp_send_offset - zcp_req->dev.iov_offset);
+
+        zcp_req->dev.iov_offset = conn_hnd->zcp_send_offset;
+    }
+
  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_BIND_MW_SUCCESS_HANDLER);
     return mpi_errno;
@@ -773,25 +873,37 @@
 }
 
 #undef FUNCNAME
-#define FUNCNAME reg_zcp_mem_success_handler
+#define FUNCNAME reg_zcp_reg_sreq_bind_handler
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int reg_zcp_mem_success_handler(MPIU_EXOVERLAPPED *send_ov)
+int reg_zcp_reg_sreq_bind_handler(MPIU_EXOVERLAPPED *send_ov)
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Nem_nd_conn_hnd_t conn_hnd;
     MPID_Nem_nd_msg_result_t *pmsg_result;
     MPID_Request *zcp_req = NULL;
     HRESULT hr;
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_REG_ZCP_MEM_SUCCESS_HANDLER);
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_REG_ZCP_REG_SREQ_BIND_HANDLER);
     MPIU_CHKPMEM_DECL(1);
 
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_REG_ZCP_MEM_SUCCESS_HANDLER);
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_REG_ZCP_REG_SREQ_BIND_HANDLER);
 
     conn_hnd = GET_CONNHND_FROM_EX_SEND_OV(send_ov);
     MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd));
 
-    MPIU_CHKPMEM_MALLOC(pmsg_result, MPID_Nem_nd_msg_result_t *, sizeof(MPID_Nem_nd_msg_result_t ), mpi_errno, "cr_mem_win result");
+    zcp_req = MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(conn_hnd->vc);
+    MPIU_Assert(zcp_req != NULL);
+
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "About to bind IOV[%d] = %p; iov_offset = %d/tot=%d, req=%p",
+        conn_hnd->zcp_send_offset, zcp_req->dev.iov[conn_hnd->zcp_send_offset].MPID_IOV_BUF,
+        zcp_req->dev.iov_offset, zcp_req->dev.iov_count, zcp_req));
+    MPIU_Assert(zcp_req->dev.iov_offset + zcp_req->dev.iov_count <= MPID_IOV_LIMIT);
+    MPIU_Assert(zcp_req->dev.iov_count > 0);  
+
+    /* Create Memory Window for sending data */
+    MPIU_CHKPMEM_MALLOC(pmsg_result, MPID_Nem_nd_msg_result_t *, 
+        sizeof(MPID_Nem_nd_msg_result_t ), mpi_errno, "cr_mem_win result");
+
     INIT_MSGRESULT(pmsg_result, free_msg_result_handler, free_msg_result_handler);
 
     hr = MPID_Nem_nd_dev_hnd_g->p_ad->CreateMemoryWindow(&(pmsg_result->result), &(conn_hnd->zcp_send_mw));
@@ -801,96 +913,202 @@
 
     MPIU_CHKPMEM_COMMIT();
 
-    zcp_req = MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(conn_hnd->vc);
-    MPIU_Assert(zcp_req != NULL);
-
+    /* Initialize the MW descriptor to be sent in the ND message */
     conn_hnd->zcp_msg_send_mw.mw_data.Base = 0;
     conn_hnd->zcp_msg_send_mw.mw_data.Length = 0;
     conn_hnd->zcp_msg_send_mw.mw_data.Token = 0;
-    /* MW created, mem registered, now bind the buffer */
-    /* FIXME: Do we need a read fence ? */
+
     INIT_MSGRESULT(&(conn_hnd->zcp_send_result), bind_mw_success_handler, gen_send_fail_handler);
-    hr = conn_hnd->p_ep->Bind(&(conn_hnd->zcp_send_result.result), conn_hnd->zcp_send_mr_hnd,
-            conn_hnd->zcp_send_mw, zcp_req->dev.iov[1].MPID_IOV_BUF,
-            zcp_req->dev.iov[1].MPID_IOV_LEN, ND_OP_FLAG_ALLOW_READ, &(conn_hnd->zcp_msg_send_mw.mw_data));
+
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Binding IOV[%d/tot=%d]=[%p/%u], dev_off=%d, mr=%x, mw=%p, conn=%p",
+        conn_hnd->zcp_send_offset, 
+        zcp_req->dev.iov_count,
+        conn_hnd->zcp_send_sge.pAddr,
+        conn_hnd->zcp_send_sge.Length,
+        zcp_req->dev.iov_offset,
+        conn_hnd->zcp_send_mr_hnd,
+        conn_hnd->zcp_send_mw, conn_hnd));
+
+    hr = conn_hnd->p_ep->Bind(&(conn_hnd->zcp_send_result.result), 
+            conn_hnd->zcp_send_mr_hnd,
+            conn_hnd->zcp_send_mw,
+            conn_hnd->zcp_send_sge.pAddr,
+            conn_hnd->zcp_send_sge.Length, 
+            (ND_OP_FLAG_READ_FENCE | ND_OP_FLAG_ALLOW_READ), 
+            &(conn_hnd->zcp_msg_send_mw.mw_data));
+
     MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
         mpi_errno, MPI_ERR_OTHER, "**nd_write", "**nd_write %s %d",
         _com_error(hr).ErrorMessage(), hr);
-
+
  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_REG_ZCP_MEM_SUCCESS_HANDLER);
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_REG_ZCP_REG_SREQ_BIND_HANDLER);
     return mpi_errno;
  fn_fail:
     MPIU_CHKPMEM_REAP();
     MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
     goto fn_exit;
 }
-/*
+
+/* Register iovp[iov_offset] (at most the device IO limit worth of it) for a zcp send and trim that IOV by the amount registered */
 #undef FUNCNAME
-#define FUNCNAME create_mw_success_handler
+#define FUNCNAME MPID_Nem_nd_zcp_reg_smem
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int create_mw_success_handler(MPID_Nem_nd_msg_result_t *pmsg_result)
+int MPID_Nem_nd_zcp_reg_smem(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_IOV *iovp, int iov_offset)
 {
     int mpi_errno = MPI_SUCCESS;
-    MPID_Nem_nd_conn_hnd_t conn_hnd;
-    MPID_Request *zcp_req = NULL;
     HRESULT hr;
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_CREATE_MW_SUCCESS_HANDLER);
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_ZCP_REG_SMEM);
 
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_CREATE_MW_SUCCESS_HANDLER);
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_ZCP_REG_SMEM);
 
-    conn_hnd = GET_CONNHND_FROM_ZCP_MSGRESULT(pmsg_result);
-    MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd));
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Registering IOV[%d]=%p/%u (conn=%p)",
+        iov_offset, iovp[iov_offset].MPID_IOV_BUF, iovp[iov_offset].MPID_IOV_LEN, conn_hnd));
 
-    / The request at the tail of the posted queue should contain
-     * the buffer
-     /
-    zcp_req = MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(conn_hnd->vc);
-    MPIU_Assert(zcp_req != NULL);
+    /* Remember which IOV of the request this zcp send is working on */
+    conn_hnd->zcp_send_offset = iov_offset;
 
-    SET_EX_WR_HANDLER(conn_hnd, reg_zcp_mem_success_handler, gen_ex_fail_handler);
-    hr = MPID_Nem_nd_dev_hnd_g->p_ad->RegisterMemory(zcp_req->dev.iov[1].MPID_IOV_BUF,
-        zcp_req->dev.iov[1].MPID_IOV_LEN, MPIU_EX_GET_OVERLAPPED_PTR(&(conn_hnd->send_ov)),
-        &(conn_hnd->zcp_mr_hnd));
+    conn_hnd->zcp_send_sge.hMr = NULL;
+    if(iovp[iov_offset].MPID_IOV_LEN > MPID_NEM_ND_DEV_IO_LIMIT(MPID_Nem_nd_dev_hnd_g)){ /* IOV larger than the device IO limit */
+        conn_hnd->zcp_send_sge.pAddr = iovp[iov_offset].MPID_IOV_BUF;
+        conn_hnd->zcp_send_sge.Length = MPID_NEM_ND_DEV_IO_LIMIT(MPID_Nem_nd_dev_hnd_g);
+        /* Only the first IO-limit bytes go out now - advance the caller's IOV past them */
+        iovp[iov_offset].MPID_IOV_BUF += MPID_NEM_ND_DEV_IO_LIMIT(MPID_Nem_nd_dev_hnd_g);
+        iovp[iov_offset].MPID_IOV_LEN -= MPID_NEM_ND_DEV_IO_LIMIT(MPID_Nem_nd_dev_hnd_g);
+    }
+    else{
+        conn_hnd->zcp_send_sge.pAddr = iovp[iov_offset].MPID_IOV_BUF;
+        conn_hnd->zcp_send_sge.Length = iovp[iov_offset].MPID_IOV_LEN;
+        /* The whole IOV is covered by this registration - nothing is left in it */
+        iovp[iov_offset].MPID_IOV_LEN = 0;
+    }
+
+    MPIU_Assert(!conn_hnd->zcp_in_progress);
+    SET_EX_WR_HANDLER(conn_hnd, reg_zcp_reg_sreq_bind_handler, gen_ex_fail_handler); /* presumably invoked when the op posted on send_ov below completes - confirm against the macro */
+
+    memset(&(conn_hnd->zcp_send_mr_hnd), 0x0, sizeof(conn_hnd->zcp_send_mr_hnd));
+
+    /* Register the chunk described by zcp_send_sge; the MR handle is returned in zcp_send_mr_hnd */
+    hr = MPID_Nem_nd_dev_hnd_g->p_ad->RegisterMemory(
+        conn_hnd->zcp_send_sge.pAddr,
+        conn_hnd->zcp_send_sge.Length,
+        MPIU_EX_GET_OVERLAPPED_PTR(&(conn_hnd->send_ov)),
+        &(conn_hnd->zcp_send_mr_hnd));
     MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
         mpi_errno, MPI_ERR_OTHER, "**nd_write", "**nd_write %s %d",
         _com_error(hr).ErrorMessage(), hr);
 
  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_CREATE_MW_SUCCESS_HANDLER);
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_ZCP_REG_SMEM);
     return mpi_errno;
  fn_fail:
     MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
     goto fn_exit;
 }
-*/
+
+/* Picks the packing type for the IOVs in [start, *end). Both start and the end offset returned in *end are valid offsets */
+static inline MPID_Nem_nd_pack_t nd_pack_iov_get_params(MPID_IOV *iovp, int start, int *end){
+    u_long rem_len = MPID_NEM_ND_CONN_UDATA_SZ; /* bytes still available for packing */
+    int i;
+    for(i = start; (i < *end) && (rem_len >= iovp[i].MPID_IOV_LEN); i++){ /* claim whole IOVs for as long as they fit */
+        rem_len -= iovp[i].MPID_IOV_LEN;
+    }
+    if(i == *end){
+        /* All IOVs can be packed */
+        *end = i - 1;
+        return MPID_NEM_ND_SR_PACK;
+    }
+    else if(iovp[i].MPID_IOV_LEN <= MPID_NEM_ND_CONN_UDATA_SZ){ /* iovp[i] does not fit in what is left, but is no larger than MPID_NEM_ND_CONN_UDATA_SZ */
+        *end = i - 1; /* iovp[i] is excluded from this pack */
+        return MPID_NEM_ND_SR_PACK;
+    }
+    else{
+        /* iovp[i] is larger than MPID_NEM_ND_CONN_UDATA_SZ - it is reported as the end offset for ZCP packing */
+        *end = i;
+        return MPID_NEM_ND_ZCP_PACK;
+    }
+}
+
+/* Input:
+ * offset_start => the start req offset for packing
+ * *offset_endp => one past the max req offset for packing, i.e., (*offset_endp - 1) is the max valid offset
+ * Output:
+ * *offset_endp => Used to return the final (valid) req offset packed
+ * *pack_typep => Used to return the packing type
+ * *nbp => Used to return bytes packed (0 for ZCP packing)
+ */
 #undef FUNCNAME
-#define FUNCNAME MPID_Nem_nd_start_zcp
+#define FUNCNAME MPID_Nem_nd_pack_iov
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_start_zcp(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_IOV *iov, int n_iov)
-{
+int MPID_Nem_nd_pack_iov(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_IOV *iovp,
+                         int offset_start,
+                         int *offset_endp,
+                         MPID_Nem_nd_msg_t *pmsg,
+                         MPID_Nem_nd_pack_t *pack_typep,
+                         SIZE_T *nbp){
     int mpi_errno = MPI_SUCCESS;
-    HRESULT hr;
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_START_ZCP);
+    int ret_errno;
+    int off;
+    char *p;
+    SIZE_T rem_len;
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_PACK_IOV);
 
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_START_ZCP);
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_PACK_IOV);
 
-    /* Register buffer */
-    /* FIXME: We only register 1 IOV for now */
-    MPIU_Assert(n_iov == 1);
+    MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd));
+    MPIU_Assert(iovp != NULL);
+    MPIU_Assert(offset_endp != NULL);
+    MPIU_Assert(offset_start >= 0);
+    MPIU_Assert(offset_start <= *offset_endp);
+    MPIU_Assert(pack_typep != NULL);
+    MPIU_Assert(pmsg != NULL);
+    MPIU_Assert(nbp != NULL);
 
-    SET_EX_WR_HANDLER(conn_hnd, reg_zcp_mem_success_handler, gen_ex_fail_handler);
-    hr = MPID_Nem_nd_dev_hnd_g->p_ad->RegisterMemory(iov[0].MPID_IOV_BUF,
-        iov[0].MPID_IOV_LEN, MPIU_EX_GET_OVERLAPPED_PTR(&(conn_hnd->send_ov)),
-        &(conn_hnd->zcp_send_mr_hnd));
-    MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
-        mpi_errno, MPI_ERR_OTHER, "**nd_write", "**nd_write %s %d",
-        _com_error(hr).ErrorMessage(), hr);
+    off = *offset_endp;
+    *pack_typep = nd_pack_iov_get_params(iovp, offset_start, &off);
+    MPIU_Assert(off < *offset_endp);
+    *offset_endp = off;
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "PACK_TYPE=%d - [%d, %d]", *pack_typep, offset_start, *offset_endp));
 
+    p = pmsg->buf;
+    rem_len = MPID_NEM_ND_CONN_UDATA_SZ;
+
+    if(*pack_typep == MPID_NEM_ND_SR_PACK){
+        /* Note that both start and end offsets returned by nd_pack_iov_get_params()
+         * are valid/packable offsets
+         */
+        for(off = offset_start; off <= *offset_endp; off++){
+            MPIU_Assert(rem_len >= iovp[off].MPID_IOV_LEN);
+            ret_errno = memcpy_s((void *)p, rem_len,
+                            iovp[off].MPID_IOV_BUF, iovp[off].MPID_IOV_LEN);
+	    	MPIU_ERR_CHKANDJUMP2((ret_errno != 0), mpi_errno, MPI_ERR_OTHER,
+		    	"**nd_write", "**nd_write %s %d", strerror(ret_errno), ret_errno);
+
+            p += iovp[off].MPID_IOV_LEN;
+            rem_len -= iovp[off].MPID_IOV_LEN;
+        }
+        *nbp = MPID_NEM_ND_CONN_UDATA_SZ - rem_len;
+    }
+    else if(*pack_typep == MPID_NEM_ND_ZCP_PACK){
+        /* We are not going to use this msg right now */
+        MSGBUF_FREEQ_ENQUEUE(conn_hnd, pmsg);
+
+        MPID_Nem_nd_dev_hnd_g->zcp_pending = 1;
+
+        mpi_errno = MPID_Nem_nd_zcp_reg_smem(conn_hnd, iovp, *offset_endp);
+        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        *nbp = 0;
+    }
+    else{
+        /* Unrecognized packing type */
+        MPIU_Assert(0);
+    }
+
  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_START_ZCP);
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_PACK_IOV);
     return mpi_errno;
  fn_fail:
     MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
@@ -898,114 +1116,121 @@
 }
 
 #undef FUNCNAME
+#define FUNCNAME MPID_Nem_nd_post_sendbv
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_Nem_nd_post_sendbv(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_Request *sreqp)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_POST_SENDBV);
+    /* Flags a send as in progress on conn_hnd, parks its send credits and then posts sreqp via MPID_Nem_nd_post_sendv() */
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_POST_SENDBV);
+    /* The conn must not already have a send in progress */
+    MPIU_Assert(!conn_hnd->send_in_progress);
+
+    conn_hnd->send_in_progress = 1;
+    conn_hnd->cache_credits = conn_hnd->send_credits; /* keep the current credit count before zeroing it */
+    conn_hnd->send_credits = 0;
+
+    mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, sreqp);
+    if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    /* NOTE(review): on failure send_in_progress/cache_credits/send_credits are left as set above */
+fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_POST_SENDBV);
+    return mpi_errno;
+ fn_fail:
+    MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+    goto fn_exit;
+}
+
+#undef FUNCNAME
 #define FUNCNAME MPID_Nem_nd_post_sendv
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_post_sendv(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_IOV *iov, int n_iov)
+int MPID_Nem_nd_post_sendv(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_Request *sreqp)
 {
     int mpi_errno = MPI_SUCCESS;
-    errno_t ret_errno;
-    char *p;
     MPID_Nem_nd_msg_t *pmsg;
-    int i, rem_len = 0, msg_len = 0, tot_len = 0;
+    SIZE_T msg_len = 0, nb;
+    MPID_Nem_nd_pack_t pack_type;
+    int offset_end, i;
+
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_POST_SENDV);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_POST_SENDV);
 
-    if(!conn_hnd->zcp_in_progress){
-        int start_zcp=0;
-	    tot_len = 0;
-	    for(i=0; i<n_iov; i++){
-		    tot_len += iov[i].MPID_IOV_LEN;
-		    if(tot_len > MPID_NEM_ND_CONN_UDATA_SZ) {
-                start_zcp = 1;
-                break;
-		    }
-	    }
-        if(!start_zcp){
-	        /* Get a msgbuf - pack the iovs into it and send it */
-		    MPIU_Assert(MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd));
-		    MSGBUF_FREEQ_DEQUEUE(conn_hnd, pmsg);
-		    MPIU_Assert(pmsg != NULL);
-            SET_MSGBUF_HANDLER(pmsg, send_success_handler, gen_send_fail_handler);
+    MPIU_Assert((conn_hnd->send_credits > 0) ? (!conn_hnd->send_in_progress) : 1);
 
-		    pmsg->hdr.type = MPID_NEM_ND_DATA_PKT;
-            pmsg->hdr.credits = 0;
-		    p = pmsg->buf;
-		    rem_len = MPID_NEM_ND_CONN_UDATA_SZ;
-		    msg_len = sizeof(MPID_Nem_nd_msg_hdr_t );
+    MSGBUF_FREEQ_DEQUEUE(conn_hnd, pmsg);
+    MPIU_Assert(pmsg != NULL);
+    SET_MSGBUF_HANDLER(pmsg, send_success_handler, gen_send_fail_handler);
 
-		    for(i=0; i<n_iov; i++){
-			    int iov_len = iov[i].MPID_IOV_LEN;
-                /* rem_len is never less than iov_len */
-                MPIU_Assert(rem_len >= iov_len);
-				/* Copy the whole iov to the msg buffer */
-				ret_errno = memcpy_s((void *)p, rem_len, iov[i].MPID_IOV_BUF, iov_len);
-				MPIU_ERR_CHKANDJUMP2((ret_errno != 0), mpi_errno, MPI_ERR_OTHER,
-					"**nd_write", "**nd_write %s %d", strerror(ret_errno), ret_errno);
-				p += iov_len;
-				rem_len -= iov_len;
-				msg_len += iov_len;
-		    }
-		    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Sending msg packet of size %d (msg type=%d)", msg_len, pmsg->hdr.type));
-		    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Sending udata packet of type  = %d", ((MPIDI_CH3_Pkt_t *)(&(pmsg->buf)))->type));
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Posting send on [conn = %p] for ", conn_hnd));
 
-		    mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, pmsg, msg_len, 0);
-		    if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-        }
-        else{ /* start_zcp */
-            conn_hnd->zcp_in_progress = 1;
-            /* Don't send data till the zcpy is over */
-            conn_hnd->zcp_credits = conn_hnd->send_credits;
-            conn_hnd->send_credits = 0;
+    for(i=0; i<sreqp->dev.iov_count; i++){
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "IOV[%d] = {%p, %u}",
+            sreqp->dev.iov_offset + i,
+            sreqp->dev.iov[sreqp->dev.iov_offset + i].MPID_IOV_BUF,
+            sreqp->dev.iov[sreqp->dev.iov_offset + i].MPID_IOV_LEN));
+    }
 
-            /* FIXME: Only handling 1 IOV now */
-            mpi_errno = MPID_Nem_nd_start_zcp(conn_hnd, &(iov[1]), 1);
-            if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    offset_end = sreqp->dev.iov_offset + sreqp->dev.iov_count;
+    mpi_errno = MPID_Nem_nd_pack_iov(conn_hnd, sreqp->dev.iov,
+                    sreqp->dev.iov_offset, &offset_end, pmsg, &pack_type, &nb);
+    if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                    
+    if(pack_type == MPID_NEM_ND_SR_PACK){
+	    pmsg->hdr.type = MPID_NEM_ND_DATA_PKT;
+        pmsg->hdr.credits = 0;
+        msg_len = sizeof(MPID_Nem_nd_msg_hdr_t ) + nb;
+
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "SR PACKING: Sending msg packet of size %d (msg type=%d)", msg_len, pmsg->hdr.type));
+        if(sreqp->dev.iov_offset == 0){
+	        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Sending udata packet of type  = %d", ((MPIDI_CH3_Pkt_t *)(&(pmsg->buf)))->type));
         }
-	}
-    else{ /* zcopy in progress */
-        MPID_Nem_nd_msg_mw_t msg_mw;
-        /* zcopy init should be complete by now - send hdr and MPID_Nem_nd_msg_mw_t
-         * related to the data.
-         */
-        MPIU_Assert(!MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd));
-        MSGBUF_FREEQ_DEQUEUE(conn_hnd, pmsg);
-        SET_MSGBUF_HANDLER(pmsg, zcp_mw_send_success_handler, gen_send_fail_handler);
-        MPIU_Assert(pmsg != NULL);
-        /* FIXME: Support more than 2 iovs */
-        MPIU_Assert(n_iov == 2);
+        else{
+            MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Contd to send data on req=%p [iov=%d/tot=%d]",
+                sreqp, sreqp->dev.iov_offset, sreqp->dev.iov_count));
+        }
 
-        pmsg->hdr.type = MPID_NEM_ND_RD_AVAIL_PKT;
-        pmsg->hdr.credits = 0;
-        msg_len = sizeof(MPID_Nem_nd_msg_hdr_t );
-        p = pmsg->buf;
-        rem_len = MPID_NEM_ND_CONN_UDATA_SZ;
-        /* Try to copy the first IOV to the msg packet */
-        if(iov[0].MPID_IOV_LEN <= rem_len){
-            ret_errno = memcpy_s((void *)p, rem_len, iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
-            MPIU_ERR_CHKANDJUMP2((ret_errno != 0), mpi_errno, MPI_ERR_OTHER,    
-                "**nd_write", "**nd_write %s %d", strerror(ret_errno), ret_errno);
-            p += iov[0].MPID_IOV_LEN;
-            rem_len -= iov[0].MPID_IOV_LEN;
-            msg_len += iov[0].MPID_IOV_LEN;
+	    mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, pmsg, msg_len, 0);
+	    if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        /* Packing always consumes whole IOVs */
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "SR pack : sreq(%p) off %d -> %d", sreqp, sreqp->dev.iov_offset, sreqp->dev.iov_offset + 1));
+
+        sreqp->dev.iov_count -= (offset_end - sreqp->dev.iov_offset + 1);
+        sreqp->dev.iov_offset = offset_end + 1;
+        if(sreqp->dev.iov_count > 0){
+            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Could not pack all IOVs (rem = %d iovs)- SEND_IN_PROGRESS...", sreqp->dev.iov_count);
+            if(!conn_hnd->send_in_progress){
+                /* Could not pack all IOVs - Block all subsequent sends */
+                conn_hnd->send_in_progress = 1;
+                /* Queue data till the zcpy is over - keep track of the send credits */
+                conn_hnd->cache_credits = conn_hnd->send_credits;
+                conn_hnd->send_credits = 0;
+            }
         }
-        /* We are guaranteed to have enough space for the MW descriptors */
-        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Sending mem descriptor (buf=%p) : base = %p, length=%I64d, token=%d\n",
-            iov[1].MPID_IOV_BUF,
-            _byteswap_uint64(conn_hnd->zcp_msg_send_mw.mw_data.Base), _byteswap_uint64(conn_hnd->zcp_msg_send_mw.mw_data.Length),
-            conn_hnd->zcp_msg_send_mw.mw_data.Token));
+        else{
+            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Finished posting sends for all IOVs...");
+        }
+    }
+    else if(pack_type == MPID_NEM_ND_ZCP_PACK){
+        MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "ZCP PACKING - SEND_IN_PROGRESS...");
+        conn_hnd->zcp_in_progress = 1;
+        if(!conn_hnd->send_in_progress){
+            /* The progress engine ZCP handlers will send the data */
+            conn_hnd->send_in_progress = 1;
+            /* Queue data till the zcpy is over - keep track of the send credits */
+            conn_hnd->cache_credits = conn_hnd->send_credits;
+            conn_hnd->send_credits = 0;
+        }
+    }
+    else{
+        /* Unrecognized pack type */
+        MPIU_Assert(0);
+    }
 
-        ret_errno = memcpy_s((void *)p, sizeof(MPID_Nem_nd_msg_mw_t ), &(conn_hnd->zcp_msg_send_mw), sizeof(MPID_Nem_nd_msg_mw_t ));
-        MPIU_ERR_CHKANDJUMP2((ret_errno != 0), mpi_errno, MPI_ERR_OTHER,    
-            "**nd_write", "**nd_write %s %d", strerror(ret_errno), ret_errno);
-        /* FIXME: Add mw for other iovs */
-        p += sizeof(MPID_Nem_nd_msg_mw_t );
-        msg_len += sizeof(MPID_Nem_nd_msg_mw_t );
-		    
-        mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, pmsg, msg_len, 0);
-		if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-    }
  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_POST_SENDV);
     return mpi_errno;
@@ -1024,7 +1249,7 @@
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_PROCESS_COMPLETIONS);
 
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_PROCESS_COMPLETIONS);
+    /* MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_PROCESS_COMPLETIONS); */
 
     MPIU_Assert(pcq != NULL);
     MPIU_Assert(pstatus != NULL);
@@ -1049,9 +1274,7 @@
         hr = nd_results[0]->Status;
         pmsg_result = GET_MSGRESULT_FROM_NDRESULT(nd_results[0]);
         MPIU_Assert(pmsg_result != NULL);
-        /* FIXME: REMOVE ME !! -start */
         MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Got something on %p", GET_MSGBUF_FROM_MSGRESULT(pmsg_result)));
-        /* FIXME: REMOVE ME !! -end */
         if(hr == ND_SUCCESS){
             handler_fn = pmsg_result->succ_fn;
         }
@@ -1062,7 +1285,7 @@
         return handler_fn(pmsg_result);
     }
  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_PROCESS_COMPLETIONS);
+    /* MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_PROCESS_COMPLETIONS); */
     return mpi_errno;
  fn_fail:
     MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
@@ -1171,6 +1394,42 @@
 }
 
 #undef FUNCNAME
+#define FUNCNAME quiescent_msg_handler
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int quiescent_msg_handler(MPID_Nem_nd_msg_result_t *result)
+{
+    int mpi_errno = MPI_SUCCESS;
+    HRESULT hr; /* NOTE(review): not used in this function */
+    MPID_Nem_nd_msg_t *pmsg;
+    MPID_Nem_nd_conn_hnd_t conn_hnd;
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_QUIESCENT_MSG_HANDLER);
+    /* Msg result handler: hands the msg buffer back to its conn's free queue and then disconnects the conn */
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_QUIESCENT_MSG_HANDLER);
+
+    pmsg = GET_MSGBUF_FROM_MSGRESULT(result);
+    MPIU_Assert(pmsg != NULL);
+
+    conn_hnd = GET_CONNHND_FROM_MSGBUF(pmsg);
+    MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd));
+
+    MSGBUF_FREEQ_ENQUEUE(conn_hnd, pmsg);
+
+    /* A pending op (rd/wr) completed on this conn - go ahead and
+     * disconnect the conn
+     */
+    mpi_errno = MPID_Nem_nd_conn_disc(conn_hnd);
+    if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+ fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_QUIESCENT_MSG_HANDLER);
+    return mpi_errno;
+ fn_fail:
+    MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+    goto fn_exit;
+}
+
+#undef FUNCNAME
 #define FUNCNAME free_msg_result_handler
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
@@ -1191,18 +1450,626 @@
 }
 
 #undef FUNCNAME
+#define FUNCNAME trim_nd_sge
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline void trim_nd_sge(ND_SGE *nd_sge, int *nd_sge_count, int *nd_sge_offset, SIZE_T nb)
+{
+    ND_SGE *nd_sge_p;
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_TRIM_ND_SGE);
+    /* Consumes nb bytes from nd_sge[] starting at *nd_sge_offset: each fully consumed entry bumps the offset and drops the count */
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_TRIM_ND_SGE);
+    MPIU_Assert(nd_sge != NULL);
+    MPIU_Assert(nd_sge_count != NULL);
+    MPIU_Assert(*nd_sge_count > 0);
+    MPIU_Assert(nd_sge_offset != NULL);
+    MPIU_Assert(*nd_sge_offset < MPID_IOV_LIMIT);
+    MPIU_Assert(nb >= 0); /* NOTE(review): always true if SIZE_T is unsigned (it is in the Windows SDK) */
+
+    nd_sge_p = &(nd_sge[*nd_sge_offset]);
+    while(nb){
+        MPIU_Assert(*nd_sge_count > 0); /* there must still be an entry to consume from */
+        if(nb < nd_sge_p->Length){
+            /* We never read partial nd_sges - the trimming below only runs if the assert is compiled out */
+            MPIU_Assert(0);
+            nd_sge_p->pAddr = (char *)(nd_sge_p->pAddr) + nb;
+            nd_sge_p->Length -= nb;
+            nb = 0;
+        }
+        else{
+            *nd_sge_count -= 1; /* whole entry consumed; its pAddr/Length are left untouched */
+            *nd_sge_offset += 1;
+            nb -= nd_sge_p->Length;
+            nd_sge_p++;
+        }
+    }
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_TRIM_ND_SGE);
+}
+
+/* Consumes nb bytes from the MW descriptors starting at nd_mw[*nd_mw_offset]. The function modifies ND_MW - don't use it if you need to use the MW again (eg: invalidate) */
+#undef FUNCNAME
+#define FUNCNAME trim_nd_mw
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline void trim_nd_mw(ND_MW_DESCRIPTOR *nd_mw, int *nd_mw_count, int *nd_mw_offset, SIZE_T nb)
+{
+    ND_MW_DESCRIPTOR *nd_mw_p;
+    uint64_t len = 0;
+    uint64_t base = 0;
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_TRIM_ND_MW);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_TRIM_ND_MW);
+
+    MPIU_Assert(nd_mw != NULL);
+    MPIU_Assert(nd_mw_count != NULL);
+    MPIU_Assert(*nd_mw_count > 0);
+    MPIU_Assert(nd_mw_offset != NULL);
+    MPIU_Assert(*nd_mw_offset < MPID_IOV_LIMIT);
+    MPIU_Assert(nb >= 0); /* NOTE(review): always true if SIZE_T is unsigned (it is in the Windows SDK) */
+
+    nd_mw_p = &(nd_mw[*nd_mw_offset]);
+    while(nb){
+        MPIU_Assert(*nd_mw_count > 0);
+        len = _byteswap_uint64(nd_mw_p->Length); /* Base/Length are read and written through _byteswap_uint64() */
+
+        if(nb < len){
+            base = _byteswap_uint64(nd_mw_p->Base); /* partial consume: move Base forward and shrink Length by nb */
+            nd_mw_p->Base = _byteswap_uint64(base + nb);
+            len -= nb;
+            nd_mw_p->Length = _byteswap_uint64(len);
+            nb = 0;
+        }
+        else{
+            *nd_mw_count -= 1; /* whole descriptor consumed */
+            *nd_mw_offset += 1;
+            nb -= len;
+            nd_mw_p->Length = 0;
+            nd_mw_p++;
+        }
+    }
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_TRIM_ND_MW);
+}
+
+/* Copies up to nb bytes from buf into the iov array, starting at
+ * iov[*offset_p], and trims the IOVs that received data
+ * Side-effect : *n_iov_p, *offset_p and the buf & len of the
+ *  IOVs that received data could be modified by this function.
+ * Returns the number of bytes copied
+ */
+#undef FUNCNAME
+#define FUNCNAME copy_and_trim_iov
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static SIZE_T copy_and_trim_iov(MPID_IOV *iov, int *n_iov_p, int *offset_p, char *buf, SIZE_T nb)
+{
+    MPID_IOV *cur_iov;
+    int cur_n_iov, cur_offset;
+    SIZE_T buflen;
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_COPY_AND_TRIM_IOV);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_COPY_AND_TRIM_IOV);
+
+    MPIU_Assert(iov != NULL);
+    MPIU_Assert(n_iov_p);
+    MPIU_Assert((*n_iov_p) > 0);
+    MPIU_Assert(offset_p != NULL);
+    MPIU_Assert((*offset_p >= 0) && (*offset_p < MPID_IOV_LIMIT));
+    MPIU_Assert(buf != NULL);
+    /* Work on local copies; the caller's count/offset are written back after the loop */
+    cur_n_iov = *n_iov_p;
+    cur_offset = *offset_p;
+    cur_iov = &(iov[cur_offset]);
+
+    buflen = nb; /* bytes offered by the caller; nb counts down as they are copied */
+
+    while(nb > 0){
+        if(nb < cur_iov->MPID_IOV_LEN){ /* the rest of buf fits inside the current IOV - fill it partially */
+            memcpy_s(cur_iov->MPID_IOV_BUF, cur_iov->MPID_IOV_LEN, buf, nb); /* NOTE(review): memcpy_s() result is not checked */
+            buf += nb;
+            cur_iov->MPID_IOV_BUF += nb;
+            cur_iov->MPID_IOV_LEN -= nb;
+            nb = 0;
+        }
+        else{
+            memcpy_s(cur_iov->MPID_IOV_BUF, cur_iov->MPID_IOV_LEN, buf, cur_iov->MPID_IOV_LEN); /* fill the current IOV completely; result not checked here either */
+            buf += cur_iov->MPID_IOV_LEN;
+            nb -= cur_iov->MPID_IOV_LEN;
+            cur_iov->MPID_IOV_LEN = 0;
+            cur_n_iov--;
+            cur_offset++;
+            if(cur_n_iov > 0){
+                cur_iov++;
+            }
+            else{
+                /* No IOVs left - any data still in the buffer is not copied.
+                 * The return value indicates the number of bytes copied
+                 */
+                break;
+            }
+        }
+    }
+
+    *n_iov_p = cur_n_iov;
+    *offset_p = cur_offset;
+
+ fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_COPY_AND_TRIM_IOV);
+    /* Bytes processed/trimmed */
+    return (buflen - nb);
+ fn_fail: /* NOTE(review): nothing in this function jumps here */
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPID_Nem_nd_handle_recv_req
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPID_Nem_nd_handle_recv_req(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_Request *rreqp, int *req_complete)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int (*req_fn)(MPIDI_VC_t *, MPID_Request *, int *);
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_HANDLE_RECV_REQ);
+    /* Runs rreqp's OnDataAvail handler if it has one, else completes the request; *req_complete is set to 1 when the request is complete */
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_HANDLE_RECV_REQ);
+
+    MPIU_Assert(rreqp != NULL);
+    MPIU_Assert(req_complete != NULL);
+
+    *req_complete = 0;
+
+    req_fn = rreqp->dev.OnDataAvail;
+    if(req_fn){
+        mpi_errno = req_fn(conn_hnd->vc, rreqp, req_complete); /* the handler decides whether the request is complete */
+        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (*req_complete){
+            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
+        }
+        else{
+            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... Not complete");
+            rreqp->dev.iov_offset = 0; /* presumably the handler reloaded dev.iov for the next chunk - TODO confirm */
+        }
+    }
+    else{
+        MPIDI_CH3U_Request_complete(rreqp); /* no handler registered - complete the request right away */
+        *req_complete = 1;
+        MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
+    }
+
+ fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_HANDLE_RECV_REQ);
+    return mpi_errno;
+ fn_fail:
+    MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME nd_read_progress_update
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int nd_read_progress_update(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_Request *rreq, char *buf, SIZE_T *pnb, int *req_complete)
+{
+    int mpi_errno = MPI_SUCCESS;
+    SIZE_T buflen, nb;
+    MPID_IOV *iov;
+    int complete;
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_READ_PROGRESS_UPDATE);
+    /* Copies the *pnb bytes at buf into rreq's IOVs and runs the recv handler whenever the IOVs are exhausted */
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_READ_PROGRESS_UPDATE);
+    /* In: *pnb = bytes available at buf (> 0). Out: *pnb = bytes consumed, *req_complete = 1 if rreq completed */
+    MPIU_Assert(rreq != NULL);
+    MPIU_Assert(buf != NULL);
+    MPIU_Assert(req_complete != NULL);
+    MPIU_Assert((pnb != NULL) && (*pnb > 0));
+
+    *req_complete = 0;
+    buflen = *pnb; /* bytes of buf not yet consumed */
+    do{
+        *req_complete = 0;
+
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "trim rreq(%p) off %d/tot=%d", 
+            rreq, rreq->dev.iov_offset, rreq->dev.iov_count));
+
+        if(rreq->dev.iov_count != 0){
+            iov = rreq->dev.iov;
+            nb = copy_and_trim_iov(iov, &(rreq->dev.iov_count), &(rreq->dev.iov_offset), buf, buflen);
+            MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Copied " MPIR_UPINT_FMT_DEC_SPEC " bytes...[rem iovs = %d]", nb, rreq->dev.iov_count));
+            /* nb is a SIZE_T, hence the pointer-sized format specifier above (a plain %d mis-reads it on 64-bit builds) */
+            buf += nb;
+            buflen -= nb;
+        }
+
+        complete = (rreq->dev.iov_count == 0) ? 1 : 0; /* all posted IOVs are filled */
+
+        if(complete){
+            mpi_errno = MPID_Nem_nd_handle_recv_req(conn_hnd, rreq, req_complete);
+            if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        }
+    }while((buflen > 0) && !(*req_complete)); /* keep going while data remains and the request is not complete */
+
+    /* Number of bytes processed/consumed (not updated on the failure path) */
+    *pnb -= buflen;
+ fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_READ_PROGRESS_UPDATE);
+    return mpi_errno;
+ fn_fail:
+    MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPID_Nem_nd_zcp_recv_sge_reg
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int MPID_Nem_nd_zcp_recv_sge_reg(MPID_Nem_nd_conn_hnd_t conn_hnd)
+{
+    int mpi_errno = MPI_SUCCESS;
+    HRESULT hr;
+    int i;
+    MPID_Nem_nd_block_op_hnd_t zcp_op_hnd;
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_ZCP_RECV_SGE_REG);
+    /* Registers every entry of conn_hnd->zcp_recv_sge[] with the adapter, one at a time, blocking after each registration that was accepted */
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_ZCP_RECV_SGE_REG);
+
+    MPIU_Assert(conn_hnd->zcp_recv_sge_count > 0);
+    /* One block op handle is set up for zcp_recv_sge_count ops; its overlapped ptr is passed to each RegisterMemory() below */
+    mpi_errno = MPID_Nem_nd_block_op_init(&zcp_op_hnd,
+                        conn_hnd->zcp_recv_sge_count,
+                        conn_hnd, 1);
+    if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+    for(i=0; i<conn_hnd->zcp_recv_sge_count; i++){
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Registering sge [%d/%d]={%p/%I64d}",
+            i, conn_hnd->zcp_recv_sge_count,
+            conn_hnd->zcp_recv_sge[i].pAddr,
+            conn_hnd->zcp_recv_sge[i].Length));
+        hr = MPID_Nem_nd_dev_hnd_g->p_ad->RegisterMemory(conn_hnd->zcp_recv_sge[i].pAddr,
+                conn_hnd->zcp_recv_sge[i].Length,
+                MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(zcp_op_hnd),
+                &(conn_hnd->zcp_recv_sge[i].hMr));
+        /* Any success code (ND_PENDING included) counts as a pending op that is then waited on */
+        if(SUCCEEDED(hr)){
+	        conn_hnd->npending_ops++;
+	        mpi_errno = MPID_Nem_nd_sm_block(zcp_op_hnd);
+	        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        }
+        /* FIXME: Change the error message - nd_mem_reg */
+        MPIU_ERR_CHKANDJUMP2(FAILED(hr),
+            mpi_errno, MPI_ERR_OTHER, "**nd_read", "**nd_read %s %d",
+            _com_error(hr).ErrorMessage(), hr);
+    }
+
+ fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_ZCP_RECV_SGE_REG);
+    return mpi_errno;
+ fn_fail:
+    MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPID_Nem_nd_zcp_recv
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+/*
+ * Post an RDMA read that pulls the remote memory window saved in
+ * conn_hnd->zcp_msg_recv_mw into the connection's receive SGE list
+ * (the first conn_hnd->zcp_recv_sge_count entries of conn_hnd->zcp_recv_sge).
+ *
+ * A msg buf is taken from the connection's free queue to carry the result of
+ * the read; its handlers are set to zcp_read_success_handler and
+ * zcp_read_fail_handler. While two or more reads are pending on the device
+ * the function polls the state machine before posting.
+ *
+ * The per-device and per-connection pending-read counters are bumped only
+ * after the read has actually been posted. If polling or posting fails, the
+ * msg buf is handed back to the free queue, since no completion will ever
+ * return it.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.
+ */
+static int MPID_Nem_nd_zcp_recv(MPID_Nem_nd_conn_hnd_t conn_hnd)
+{
+    int mpi_errno = MPI_SUCCESS;
+    HRESULT hr;
+    MPID_Nem_nd_msg_t *pzcp_msg = NULL;
+    int i;
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_ZCP_RECV);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_ZCP_RECV);
+
+    /* A msg buf is guaranteed for RDMA read */
+    MSGBUF_FREEQ_DEQUEUE(conn_hnd, pzcp_msg);
+    MPIU_Assert(pzcp_msg != NULL);
+
+    SET_MSGBUF_HANDLER(pzcp_msg, zcp_read_success_handler, zcp_read_fail_handler);
+
+    /* Throttle: wait until fewer than two reads are pending on the device */
+    while(MPID_Nem_nd_dev_hnd_g->npending_rds >= 2){
+        mpi_errno = MPID_Nem_nd_sm_poll(0);
+        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    }
+
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "RDMA READ: Using remote mem descriptor : base = %p, length=%I64d, token=%d",
+        _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Base), 
+        _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Length),
+        conn_hnd->zcp_msg_recv_mw.mw_data.Token));
+
+    {
+        /* Debug only: dump the local SGEs and the total number of bytes */
+        SIZE_T len=0;
+        for(i=0; i<conn_hnd->zcp_recv_sge_count; i++){
+            MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "RDMA READ: Using local sge [%d/%d] : pAddr = %p, length=%I64d, hMr =%x",
+                i, conn_hnd->zcp_recv_sge_count,
+                conn_hnd->zcp_recv_sge[i].pAddr,
+                conn_hnd->zcp_recv_sge[i].Length,
+                conn_hnd->zcp_recv_sge[i].hMr));
+            len += conn_hnd->zcp_recv_sge[i].Length;
+        }
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Performing RDMA READ for " MPIR_UPINT_FMT_DEC_SPEC "bytes", len));
+    }
+
+    hr = conn_hnd->p_ep->Read(GET_PNDRESULT_FROM_MSGBUF(pzcp_msg),
+            &(conn_hnd->zcp_recv_sge[0]),
+            conn_hnd->zcp_recv_sge_count,
+            &(conn_hnd->zcp_msg_recv_mw.mw_data),
+            0, ND_OP_FLAG_READ_FENCE);
+
+    /* ND_PENDING is a success code; anything that FAILED() was not posted */
+    MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
+        mpi_errno, MPI_ERR_OTHER, "**nd_read", "**nd_read %s %d",
+        _com_error(hr).ErrorMessage(), hr);
+
+    /* Count the read only now that it is really in flight - a read that was
+     * never posted produces no completion to decrement the counters again
+     */
+    MPID_Nem_nd_dev_hnd_g->npending_rds++; conn_hnd->npending_rds++;
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "dev prds = %d; conn prds = %d",
+        MPID_Nem_nd_dev_hnd_g->npending_rds, conn_hnd->npending_rds));
+
+ fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_ZCP_RECV);
+    return mpi_errno;
+ fn_fail:
+    MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+    /* Every failure path is reached before a read was posted, so the msg buf
+     * is still ours - give it back instead of leaking it
+     */
+    if(pzcp_msg != NULL){
+        MSGBUF_FREEQ_ENQUEUE(conn_hnd, pzcp_msg);
+    }
+    goto fn_exit;
+}
+
+
+/*
+ * Map the next chunk of the receive IOV onto the connection's receive SGE list.
+ *
+ * The number of bytes to map is the (byte swapped) Length of the remote memory
+ * window saved in conn_hnd->zcp_msg_recv_mw, which must already be set. IOV
+ * entries in [offset_start, *offset_endp) are consumed in order until that
+ * many bytes are covered, the IOV range is exhausted, or MPID_IOV_LIMIT SGEs
+ * have been filled.
+ *
+ * A fully consumed IOV entry gets its length set to 0; a partially consumed
+ * entry has its buffer pointer advanced and its length reduced.
+ *
+ * Output:
+ * conn_hnd->zcp_recv_sge / zcp_recv_sge_count => the SGEs built (pAddr and
+ *                Length only, no memory is registered here)
+ * offset_endp => index of the first IOV entry that was not fully consumed
+ */
+#undef FUNCNAME
+#define FUNCNAME MPID_Nem_nd_zcp_unpack_iov
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int MPID_Nem_nd_zcp_unpack_iov(MPID_Nem_nd_conn_hnd_t conn_hnd,
+                                    MPID_IOV *iovp,
+                                    int offset_start,
+                                    int *offset_endp)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int iov_idx, sge_idx, k;
+    SIZE_T invec_len;
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_ZCP_UNPACK_IOV);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_ZCP_UNPACK_IOV);
+
+    /* Bytes still to be read from the remote window */
+    invec_len = _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Length);
+    MPIU_Assert(invec_len > 0);
+
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "ZCP Unpack IOV[%d, %d), vec_len=" MPIR_UPINT_FMT_DEC_SPEC,
+        offset_start, *offset_endp, invec_len));
+    conn_hnd->zcp_recv_sge_count = 0;
+
+    iov_idx = offset_start;
+    sge_idx = 0;
+    while((iov_idx < *offset_endp) && (invec_len > 0) && (sge_idx < MPID_IOV_LIMIT)){
+        u_long iov_len = iovp[iov_idx].MPID_IOV_LEN;
+
+        /* Every iteration produces exactly one SGE starting at the current
+         * position of the current IOV entry
+         */
+        conn_hnd->zcp_recv_sge[sge_idx].pAddr = iovp[iov_idx].MPID_IOV_BUF;
+        conn_hnd->zcp_recv_sge_count++;
+
+        /* Note that invec_len will be < MPID_NEM_ND_DEV_IO_LIMIT */
+        if(invec_len < iov_len){
+            /* The window ends inside this IOV entry: take what is left of
+             * the window, trim the entry and stop
+             */
+            conn_hnd->zcp_recv_sge[sge_idx].Length = invec_len;
+
+            iovp[iov_idx].MPID_IOV_BUF += invec_len;
+            iovp[iov_idx].MPID_IOV_LEN -= invec_len;
+
+            invec_len = 0;
+            break;
+        }
+
+        /* The whole IOV entry fits: consume it and move on to the next one */
+        conn_hnd->zcp_recv_sge[sge_idx].Length = iov_len;
+        invec_len -= iov_len;
+        iovp[iov_idx].MPID_IOV_LEN = 0;
+
+        iov_idx++;
+        sge_idx++;
+    }
+
+    *offset_endp = iov_idx;
+
+    for(k=0; k<conn_hnd->zcp_recv_sge_count; k++){
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "recv sge[%d/%d]={%p/%I64d}",
+            k, conn_hnd->zcp_recv_sge_count,
+            conn_hnd->zcp_recv_sge[k].pAddr,
+            conn_hnd->zcp_recv_sge[k].Length));
+    }
+
+ fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_ZCP_UNPACK_IOV);
+    return mpi_errno;
+ fn_fail:
+    MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+    goto fn_exit;
+}
+
+
+/*
+ * Unpack received data into the IOV entries [offset_start, *offset_endp) of
+ * iovp, using the scheme selected by pack_type.
+ *
+ * MPID_NEM_ND_SR_PACK:
+ *   buf and nbp must be non-NULL and *nbp > 0 (asserted). The data in buf is
+ *   handed to copy_and_trim_iov(); its return value is stored back in *nbp
+ *   (number of bytes consumed). conn_hnd and msg_mwp are not used.
+ * MPID_NEM_ND_ZCP_PACK:
+ *   If msg_mwp is non-NULL it is saved in conn_hnd->zcp_msg_recv_mw, otherwise
+ *   the descriptor saved earlier is re-used. The IOV is then mapped onto the
+ *   connection's recv SGEs (MPID_Nem_nd_zcp_unpack_iov) and the SGEs are
+ *   registered (MPID_Nem_nd_zcp_recv_sge_reg). buf and nbp are not used.
+ * Any other pack_type trips an assertion.
+ *
+ * Output:
+ * offset_endp => Returns the next offset that is to be unpacked, could be
+ *                invalid (i.e. past the last entry of the IOV)
+ * nbp         => SR pack only, see above
+ *
+ * Returns MPI_SUCCESS or an MPI error code.
+ */
+#undef FUNCNAME
+#define FUNCNAME MPID_Nem_nd_unpack_iov
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int MPID_Nem_nd_unpack_iov(MPID_Nem_nd_conn_hnd_t conn_hnd,
+                                    MPID_IOV *iovp,
+                                    int offset_start,
+                                    int *offset_endp,
+                                    MPID_Nem_nd_pack_t pack_type,
+                                    MPID_Nem_nd_msg_mw_t *msg_mwp,
+                                    char *buf,
+                                    SIZE_T *nbp)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int ret_errno; /* Result of memcpy_s, 0 on success */
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_UNPACK_IOV);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_UNPACK_IOV);
+    
+    MPIU_Assert(iovp != NULL);
+    MPIU_Assert(offset_start >= 0);
+    MPIU_Assert(offset_endp != NULL);
+    MPIU_Assert(*offset_endp >= offset_start);
+    MPIU_Assert((pack_type == MPID_NEM_ND_SR_PACK) ? ((buf != NULL) && (nbp != NULL) && (*nbp > 0)) : 1); /* buf/nbp are only required for SR pack */
+
+    /* Dispatch on the packing scheme */
+    if(pack_type == MPID_NEM_ND_SR_PACK){
+        int iov_count, off;
+        SIZE_T nb;
+
+        iov_count = *offset_endp - offset_start;
+        off = offset_start;
+
+        MPIU_Assert(off < *offset_endp); /* SR pack needs at least one IOV entry */
+        nb = copy_and_trim_iov(iovp, &iov_count, &off, buf, *nbp);
+
+        /* Number of bytes consumed */
+        *nbp = nb;
+        /* Report the offset as left behind by copy_and_trim_iov */
+        *offset_endp = off;
+    }
+    else if(pack_type == MPID_NEM_ND_ZCP_PACK){
+        if(msg_mwp != NULL){
+            /* Save the MW - it is needed again when the read is posted */
+            ret_errno = memcpy_s((void *)&(conn_hnd->zcp_msg_recv_mw), sizeof(MPID_Nem_nd_msg_mw_t ), (void *)msg_mwp, sizeof(MPID_Nem_nd_msg_mw_t));
+		    MPIU_ERR_CHKANDJUMP2((ret_errno != 0), mpi_errno, MPI_ERR_OTHER,
+			    "**nd_read", "**nd_read %s %d", strerror(ret_errno), ret_errno);
+
+            MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Received mem descriptor : base = %p, length=%I64d, token=%d\n",
+                _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Base), _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Length),
+                conn_hnd->zcp_msg_recv_mw.mw_data.Token));
+        }
+        else{ /* No new descriptor supplied - keep using the one already saved in conn_hnd */
+            MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Re-using mem descriptor : base = %p, length=%I64d, token=%d\n",
+                _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Base), _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Length),
+                conn_hnd->zcp_msg_recv_mw.mw_data.Token));
+        }
+
+        /* Unpack IOVs to SGEs - updates *offset_endp */
+        mpi_errno = MPID_Nem_nd_zcp_unpack_iov(conn_hnd, iovp, offset_start, offset_endp);
+        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        /* Register SGEs */
+        mpi_errno = MPID_Nem_nd_zcp_recv_sge_reg(conn_hnd);
+        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    }
+    else{
+        /* Unrecognized packing type */
+        MPIU_Assert(0);
+    }
+fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_UNPACK_IOV);
+    return mpi_errno;
+ fn_fail:
+    MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME zcp_recv_sge_dereg
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+/*
+ * Deregister the memory regions of the connection's receive SGEs, i.e. the
+ * first conn_hnd->zcp_recv_sge_count entries of conn_hnd->zcp_recv_sge.
+ *
+ * The SGEs are handled one at a time: each DeregisterMemory() call is
+ * followed by MPID_Nem_nd_sm_block() before the next SGE is touched. Once an
+ * SGE has been deregistered its Length and pAddr are cleared.
+ *
+ * Returns MPI_SUCCESS, or an MPI error code when the block op cannot be
+ * initialized, a deregistration fails, or blocking on it fails.
+ */
+static inline int zcp_recv_sge_dereg(MPID_Nem_nd_conn_hnd_t conn_hnd)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int sge_idx = 0;
+    HRESULT hr;
+    MPID_Nem_nd_block_op_hnd_t dereg_op_hnd;
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_ZCP_RECV_SGE_DEREG);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_ZCP_RECV_SGE_DEREG);
+
+    /* One block op is shared by all the deregistrations below */
+    mpi_errno = MPID_Nem_nd_block_op_init(&dereg_op_hnd,
+                    conn_hnd->zcp_recv_sge_count, conn_hnd, 1);
+    if(mpi_errno != MPI_SUCCESS){
+        MPIU_ERR_POP(mpi_errno);
+    }
+
+    while(sge_idx < conn_hnd->zcp_recv_sge_count){
+        hr = MPID_Nem_nd_dev_hnd_g->p_ad->DeregisterMemory(
+                conn_hnd->zcp_recv_sge[sge_idx].hMr,
+                MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(dereg_op_hnd));
+
+        /* Bail out on a failed deregistration before touching any counters */
+        MPIU_ERR_CHKANDJUMP2(FAILED(hr),
+            mpi_errno, MPI_ERR_OTHER, "**nd_read", "**nd_read %s %d",
+            _com_error(hr).ErrorMessage(), hr);
+
+        /* The request was accepted - account for it and wait for it */
+        conn_hnd->npending_ops++;
+        mpi_errno = MPID_Nem_nd_sm_block(dereg_op_hnd);
+        if(mpi_errno != MPI_SUCCESS){
+            MPIU_ERR_POP(mpi_errno);
+        }
+
+        /* This SGE no longer describes any buffer */
+        conn_hnd->zcp_recv_sge[sge_idx].Length = 0;
+        conn_hnd->zcp_recv_sge[sge_idx].pAddr = 0;
+
+        sge_idx++;
+    }
+
+fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_ZCP_RECV_SGE_DEREG);
+    return mpi_errno;
+ fn_fail:
+    MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME zcp_dereg_smem
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+/*
+ * Deregister the send-side memory region of a zero copy transfer
+ * (conn_hnd->zcp_send_mr_hnd) and wait, via MPID_Nem_nd_sm_block(), for the
+ * deregistration before returning.
+ *
+ * conn_hnd must be a valid connection handle (asserted).
+ *
+ * Returns MPI_SUCCESS, or an MPI error code when the block op cannot be
+ * initialized, the deregistration fails, or blocking on it fails.
+ */
+static inline int zcp_dereg_smem(MPID_Nem_nd_conn_hnd_t conn_hnd)
+{
+    int mpi_errno = MPI_SUCCESS;
+    HRESULT hr;
+    MPID_Nem_nd_block_op_hnd_t zcp_op_hnd;
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_ZCP_DEREG_SMEM);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_ZCP_DEREG_SMEM);
+
+    MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd));
+
+    /* FIXME: Don't block here - let the dereg mem take place inside a handler */
+    /* Deregistering the local send memory */
+    mpi_errno = MPID_Nem_nd_block_op_init(&zcp_op_hnd, 1, conn_hnd, 1);
+    if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+    hr = MPID_Nem_nd_dev_hnd_g->p_ad->DeregisterMemory(conn_hnd->zcp_send_mr_hnd,
+            MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(zcp_op_hnd));
+    if(SUCCEEDED(hr)){
+        /* Manual event */
+        conn_hnd->npending_ops++;
+        mpi_errno = MPID_Nem_nd_sm_block(zcp_op_hnd);
+        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    }
+    /* FIXME: Change the error message - a dedicated mem dereg message is needed */
+    MPIU_ERR_CHKANDJUMP2(FAILED(hr),
+        mpi_errno, MPI_ERR_OTHER, "**nd_read", "**nd_read %s %d",
+        _com_error(hr).ErrorMessage(), hr);
+
+fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_ZCP_DEREG_SMEM);
+    return mpi_errno;
+ fn_fail:
+    MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+    goto fn_exit;
+}
+
+#undef FUNCNAME
 #define FUNCNAME zcp_read_success_handler
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 static int zcp_read_success_handler(MPID_Nem_nd_msg_result_t *send_result)
 {
     int mpi_errno = MPI_SUCCESS, ret_errno=0;
+    SIZE_T nb=0, invec_len;
     MPID_Nem_nd_conn_hnd_t conn_hnd;
-    MPID_Nem_nd_block_op_hnd_t dereg_op_hnd;
     MPID_Nem_nd_msg_t *pmsg;
-    MPID_Request *zcp_req;
+    MPID_Request *zcp_reqp;
     int (*req_fn)(MPIDI_VC_t *, MPID_Request *, int *);
     int req_complete=0;
+    int sge_count, sge_offset, mw_count, mw_offset, i;
     HRESULT hr;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_ZCP_READ_SUCCESS_HANDLER);
 
@@ -1214,49 +2081,101 @@
     pmsg = GET_MSGBUF_FROM_MSGRESULT(send_result);
     MPIU_Assert(pmsg != NULL);
 
-    zcp_req = MPID_NEM_ND_VCCH_GET_ACTIVE_RECV_REQ(conn_hnd->vc);
-    MPIU_Assert(zcp_req != NULL);
+    nb = GET_NB_FROM_MSGRESULT(send_result);
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Finished RDMA Read [" MPIR_UPINT_FMT_DEC_SPEC "] bytes on conn[%p]", nb, conn_hnd));
 
-    /* Call req handler and send Rd complete pkt */
-    req_fn = zcp_req->dev.OnDataAvail;
-    if(req_fn){
-        mpi_errno = req_fn(conn_hnd->vc, zcp_req, &req_complete);
+    MPID_Nem_nd_dev_hnd_g->npending_rds--; conn_hnd->npending_rds--;
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "dev prds = %d; conn prds = %d",
+        MPID_Nem_nd_dev_hnd_g->npending_rds, conn_hnd->npending_rds));
+
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "After Rd Rcvd mem descriptor : base = %p, length=%I64d, token=%d\n",
+        _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Base), _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Length),
+        conn_hnd->zcp_msg_recv_mw.mw_data.Token));
+
+    zcp_reqp = conn_hnd->zcp_rreqp;
+    MPIU_Assert(zcp_reqp != NULL);
+
+    /* Trim nd_sge list of recv bufs */
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Trimming recv_sge[cnt=%d], nb=" MPIR_UPINT_FMT_DEC_SPEC,
+        conn_hnd->zcp_recv_sge_count, nb));
+
+    sge_count = conn_hnd->zcp_recv_sge_count;
+    sge_offset = 0;
+    trim_nd_sge(conn_hnd->zcp_recv_sge, &sge_count, &sge_offset, nb);
+
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "After trimming recv_sge[cnt=%d/off=%d], nb=" MPIR_UPINT_FMT_DEC_SPEC,
+        conn_hnd->zcp_recv_sge_count, sge_offset, nb));
+
+    MPIU_Assert(sge_count == 0);
+
+    /* Trim the nd mw descriptor list of send bufs */
+    mw_count = 1;
+    mw_offset = 0;
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Trimming recv_mw[len=%I64d], nb=" MPIR_UPINT_FMT_DEC_SPEC,
+        _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Length), nb));
+
+    trim_nd_mw(&(conn_hnd->zcp_msg_recv_mw.mw_data), &mw_count, &mw_offset, nb);
+
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "After trimming recv_mw[len=%I64d], nb=" MPIR_UPINT_FMT_DEC_SPEC,
+        _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Length), nb));
+
+    invec_len = _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Length);
+
+    /* Deregister old bufs */
+    mpi_errno = zcp_recv_sge_dereg(conn_hnd);
+    if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+    req_complete = 0;
+    if(zcp_reqp->dev.iov_count == 0){
+        mpi_errno = MPID_Nem_nd_handle_recv_req(conn_hnd, zcp_reqp, &req_complete);
         if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
-        MPIU_Assert(req_complete);
+        if(req_complete){
+            MPIU_Assert(invec_len == 0);
+            MPID_NEM_ND_VCCH_SET_ACTIVE_RECV_REQ(conn_hnd->vc, NULL);
+        }
     }
-    else{
-        MPIDI_CH3U_Request_complete(zcp_req);
-        MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Req - RDMA Rd - complete...");
-        MPID_NEM_ND_VCCH_SET_ACTIVE_RECV_REQ(conn_hnd->vc, NULL);
-    }
 
-    /* Unregister user memory */
-    mpi_errno = MPID_Nem_nd_block_op_init(&dereg_op_hnd, conn_hnd);
-    if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if(invec_len == 0){
+        /* We are no longer ZCP reading */
+        conn_hnd->zcp_rreqp = NULL;
 
-    hr = MPID_Nem_nd_dev_hnd_g->p_ad->DeregisterMemory(conn_hnd->zcp_recv_sge.hMr,
-            MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(dereg_op_hnd));
-    if(hr == ND_PENDING){
-		/* Manual event */
-		conn_hnd->npending_ops++;
-		mpi_errno = MPID_Nem_nd_sm_block(dereg_op_hnd);
-		if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-		/*
-        hr = MPID_Nem_nd_dev_hnd_g->p_ad->GetOverlappedResult(MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(dereg_op_hnd), &nb, TRUE);
-		*/
+        MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Sending RD ACK ...");
+        /* We have now read all the send bufs */
+        /* Use the msg & send rd complete pkt */
+        pmsg->hdr.type = MPID_NEM_ND_RD_ACK_PKT;
+
+        SET_MSGBUF_HANDLER(pmsg, netmod_msg_send_success_handler, gen_send_fail_handler);
+        mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, pmsg, sizeof(MPID_Nem_nd_msg_hdr_t ), 0);
+        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
     }
-    MPIU_ERR_CHKANDJUMP2(FAILED(hr),
-        mpi_errno, MPI_ERR_OTHER, "**nd_read", "**nd_read %s %d",
-        _com_error(hr).ErrorMessage(), hr);
+    else{
+        int offset_end;
+        
+        MSGBUF_FREEQ_ENQUEUE(conn_hnd, pmsg);
 
-    /* Use the msg & send rd complete pkt */
-    pmsg->hdr.type = MPID_NEM_ND_RD_ACK_PKT;
+        MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Re-unpacking and reading ...");
+        /* re-unpack and read */
+        offset_end = zcp_reqp->dev.iov_offset + zcp_reqp->dev.iov_count;
+        mpi_errno = MPID_Nem_nd_unpack_iov(conn_hnd,
+                        zcp_reqp->dev.iov,
+                        zcp_reqp->dev.iov_offset,
+                        &offset_end,
+                        MPID_NEM_ND_ZCP_PACK,
+                        NULL,
+                        NULL, NULL);
+        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
-    SET_MSGBUF_HANDLER(pmsg, netmod_msg_send_success_handler, gen_send_fail_handler);
-    mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, pmsg, sizeof(MPID_Nem_nd_msg_hdr_t ), 0);
-    if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        /* Next offset to be packed */
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "zcp_reqp(%p) off %d -> %d", zcp_reqp, zcp_reqp->dev.iov_offset, offset_end));
 
+        zcp_reqp->dev.iov_count -= (offset_end - zcp_reqp->dev.iov_offset);
+        zcp_reqp->dev.iov_offset = offset_end;
+
+        mpi_errno = MPID_Nem_nd_zcp_recv(conn_hnd);
+        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    }
+
  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_ZCP_READ_SUCCESS_HANDLER);
     return mpi_errno;
@@ -1274,7 +2193,7 @@
     int mpi_errno = MPI_SUCCESS;
     HRESULT hr;
     MPID_Nem_nd_conn_hnd_t conn_hnd;
-    MPID_Nem_nd_block_op_hnd_t dereg_op_hnd;
+    MPID_Request *sreqp = NULL;
     MPID_Nem_nd_msg_t *pmsg;
 
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_ZCP_MW_INVALIDATE_SUCCESS_HANDLER);
@@ -1284,41 +2203,36 @@
     conn_hnd = GET_CONNHND_FROM_MSGRESULT(recv_result);
     MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd));
 
-    /* Allow sends on the conn */
-    conn_hnd->zcp_in_progress = 0;
-    conn_hnd->send_credits = conn_hnd->zcp_credits;
+    sreqp = MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(conn_hnd->vc);
+    MPIU_Assert(sreqp != NULL);
 
     /* Repost the recv buf */
     pmsg = GET_MSGBUF_FROM_MSGRESULT(recv_result);
     MPIU_Assert(pmsg != NULL);
 
+    /* Repost msg buf */
     SET_MSGBUF_HANDLER(pmsg, recv_success_handler, gen_recv_fail_handler);
     mpi_errno = MPID_Nem_nd_post_recv_msg(conn_hnd, pmsg);
     if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
-    /* Deregister memory */
-    mpi_errno = MPID_Nem_nd_block_op_init(&dereg_op_hnd, conn_hnd);
+    mpi_errno = zcp_dereg_smem(conn_hnd);
     if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
-    hr = MPID_Nem_nd_dev_hnd_g->p_ad->DeregisterMemory(conn_hnd->zcp_send_mr_hnd,
-            MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(dereg_op_hnd));
-    if(hr == ND_PENDING){
-		/* Manual event */
-		conn_hnd->npending_ops++;
-		mpi_errno = MPID_Nem_nd_sm_block(dereg_op_hnd);
-		if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-		/*
-        hr = MPID_Nem_nd_dev_hnd_g->p_ad->GetOverlappedResult(MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(dereg_op_hnd), &nb, TRUE);
-		*/
+    conn_hnd->zcp_in_progress = 0;
+    conn_hnd->zcp_send_offset = 0;
+    MPID_Nem_nd_dev_hnd_g->zcp_pending = 0;
+
+    if(sreqp->dev.iov_count == 0){
+        /* Call the cont success handler */
+        mpi_errno = cont_send_success_handler(&(conn_hnd->zcp_send_result));
+        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
     }
-    MPIU_ERR_CHKANDJUMP2(FAILED(hr),
-        mpi_errno, MPI_ERR_OTHER, "**nd_write", "**nd_write %s %d",
-        _com_error(hr).ErrorMessage(), hr);
+    else{
+        /* Continue sending data on this req */
+        mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, sreqp);
+        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    }
 
-    /* Call the send success handler for zcp transfer */
-    mpi_errno = zcp_send_success_handler(&(conn_hnd->zcp_send_result));
-    if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-
  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_ZCP_MW_INVALIDATE_SUCCESS_HANDLER);
     return mpi_errno;
@@ -1362,36 +2276,56 @@
 }
 
 #undef FUNCNAME
-#define FUNCNAME zcp_send_success_handler
+#define FUNCNAME cont_send_success_handler
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int zcp_send_success_handler(MPID_Nem_nd_msg_result_t *zcp_send_result)
+static int cont_send_success_handler(MPID_Nem_nd_msg_result_t *zcp_send_result)
 {
     int mpi_errno = MPI_SUCCESS;
     int req_complete;
     MPID_Nem_nd_conn_hnd_t conn_hnd;
 
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_ZCP_SEND_SUCCESS_HANDLER);
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_CONT_SEND_SUCCESS_HANDLER);
 
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_ZCP_SEND_SUCCESS_HANDLER);
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_CONT_SEND_SUCCESS_HANDLER);
 
     conn_hnd = GET_CONNHND_FROM_ZCP_SEND_MSGRESULT(zcp_send_result);
     MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd));
+    MPIU_Assert(!MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_EMPTY(conn_hnd->vc));
 
     if(!MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_EMPTY(conn_hnd->vc)){
-        mpi_errno = MPID_Nem_nd_handle_posted_sendq_head_req(conn_hnd->vc, &req_complete);
+        req_complete = 0;
+        mpi_errno = MPID_Nem_nd_handle_posted_sendq_tail_req(conn_hnd->vc, &req_complete);
         if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-    }
 
-    /* If we have queued sends and credits to send data - go ahead with sending */
-    if(MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd)){
-        MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Send credits available. Processing queued req...");
-        mpi_errno = process_pending_req(conn_hnd);
-        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if(req_complete){
+            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "ZCP/Cont_send req complete...");
+            /* Allow sends on the conn */
+            conn_hnd->send_in_progress = 0;
+            conn_hnd->send_credits = conn_hnd->cache_credits;
+
+            /* If we have queued sends and credits to send data - go ahead with sending */
+            if(MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd)){
+                MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Send credits available. Processing queued req...");
+                mpi_errno = process_pending_req(conn_hnd);
+                if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            }
+        }
+        else{
+            MPID_Request *sreqp;
+
+            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "ZCP/Cont_send req NOT complete... sending remaining/reloaded IOVs");
+            sreqp = MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(conn_hnd->vc);
+            MPIU_Assert(sreqp != NULL);
+
+            /* Send reloaded iovs */
+            mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, sreqp);
+            if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        }
     }
     
  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_ZCP_SEND_SUCCESS_HANDLER);
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_CONT_SEND_SUCCESS_HANDLER);
     return mpi_errno;
  fn_fail:
     MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
@@ -1420,8 +2354,14 @@
         MPIU_Assert(req != NULL);
 
         /* FIXME: Can we coalesce multiple pending sends ? */
-        mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, &(req->dev.iov[req->dev.iov_offset]), req->dev.iov_count);
-        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if(!MPID_NEM_ND_IS_BLOCKING_REQ(req)){
+            mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, req);
+            if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        }
+        else{
+            mpi_errno = MPID_Nem_nd_post_sendbv(conn_hnd, req);
+            if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        }
 
         MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_ENQUEUE(conn_hnd->vc, req);
     }
@@ -1459,7 +2399,6 @@
         MPIU_Assert(MPIDI_Request_get_type(req) != MPIDI_REQUEST_TYPE_GET_RESP);
         MPIDI_CH3U_Request_complete(req);
         MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
-        MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_DEQUEUE(vc, &req);
         *req_complete = 1;
     }
     else{
@@ -1469,10 +2408,12 @@
             
         if (*req_complete){
             MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
-            MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_DEQUEUE(vc, &req);
         }
+        else{
+            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... Not complete");
+            req->dev.iov_offset = 0;
+        }
     }
-    req->dev.iov_offset = 0;
 
  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_HANDLE_POSTED_SENDQ_HEAD_REQ);
@@ -1481,7 +2422,58 @@
     goto fn_exit;
 }
 
+/* Handle the request at the tail of vc's sendq */
 #undef FUNCNAME
+#define FUNCNAME MPID_Nem_nd_handle_posted_sendq_tail_req
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+/*
+ * Run the completion logic for the request at the tail of the VC's posted
+ * send queue, which must not be empty (asserted).
+ *
+ * - Request without an OnDataAvail handler: the request is completed,
+ *   removed from the tail of the queue and *req_complete is set to 1.
+ * - Request with an OnDataAvail handler: the handler decides, through
+ *   *req_complete, whether the request is done. A finished request is removed
+ *   from the tail of the queue; an unfinished one stays queued and gets its
+ *   iov_offset reset to 0.
+ *
+ * Returns MPI_SUCCESS or the error code reported by the handler.
+ */
+static inline int MPID_Nem_nd_handle_posted_sendq_tail_req(MPIDI_VC_t *vc, int *req_complete)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPID_Request *tail_req = NULL;
+    int (*on_data_avail)(MPIDI_VC_t *, MPID_Request *, int *);
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_HANDLE_POSTED_SENDQ_TAIL_REQ);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_HANDLE_POSTED_SENDQ_TAIL_REQ);
+
+    MPIU_Assert(req_complete != NULL);
+
+    MPIU_Assert(!MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_EMPTY(vc));
+    tail_req = MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(vc);
+    MPIU_Assert(tail_req != NULL);
+
+    on_data_avail = tail_req->dev.OnDataAvail;
+
+    /* No handler registered: the request is finished as soon as it was sent */
+    if (on_data_avail == NULL){
+        MPIU_Assert(MPIDI_Request_get_type(tail_req) != MPIDI_REQUEST_TYPE_GET_RESP);
+        MPIDI_CH3U_Request_complete(tail_req);
+        MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
+        MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_REM_TAIL(vc, &tail_req);
+        *req_complete = 1;
+        goto fn_exit;
+    }
+
+    /* Let the handler decide whether the request is done */
+    *req_complete = 0;
+    mpi_errno = on_data_avail(vc, tail_req, req_complete);
+    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+    if (!(*req_complete)){
+        /* More data to send on this request - it stays in the queue */
+        MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... Not complete");
+        tail_req->dev.iov_offset = 0;
+        goto fn_exit;
+    }
+
+    MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
+    MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_REM_TAIL(vc, &tail_req);
+
+ fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_HANDLE_POSTED_SENDQ_TAIL_REQ);
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
 #define FUNCNAME send_success_handler
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
@@ -1491,6 +2483,7 @@
     int req_complete = 0;
     MPID_Nem_nd_conn_hnd_t conn_hnd;
     MPID_Nem_nd_msg_t   *pmsg;
+    MPID_Request *sreqp;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_SEND_SUCCESS_HANDLER);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_SEND_SUCCESS_HANDLER);
@@ -1501,24 +2494,69 @@
     pmsg = GET_MSGBUF_FROM_MSGRESULT(send_result);
     MPIU_Assert(pmsg != NULL);
 
+    sreqp = MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(conn_hnd->vc);
+    MPIU_Assert(sreqp != NULL);
+
     MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Send succeeded...");
-	conn_hnd->npending_ops--;
 
-    if(conn_hnd->vc != NULL){
+    /* Reset the handlers & enqueue this send buffer to freeq */
+    SET_MSGBUF_HANDLER(pmsg, send_success_handler, gen_send_fail_handler);
+    MSGBUF_FREEQ_ENQUEUE(conn_hnd, pmsg);
+    MPIU_Assert(!MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_EMPTY((conn_hnd->vc)));
+
+    if(MPID_NEM_ND_VC_IS_CONNECTED(conn_hnd->vc)){
         /* Increment number of available send credits only when a credit packet is recvd */
         /* Complete the request associated with this send if no pending events */
-        mpi_errno = MPID_Nem_nd_handle_posted_sendq_head_req(conn_hnd->vc, &req_complete);
-        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        req_complete = 0;
 
-        /* Enqueue this send buffer to freeq */
-        MSGBUF_FREEQ_ENQUEUE(conn_hnd, pmsg);
+        if(!conn_hnd->send_in_progress){
+            mpi_errno = MPID_Nem_nd_handle_posted_sendq_head_req(conn_hnd->vc, &req_complete);
+            if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
-        /* If we have queued sends and credits to send data - go ahead with sending */
-        if(MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd)){
-            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Send credits available. Processing queued req...");
-            mpi_errno = process_pending_req(conn_hnd);
-            if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_DEQUEUE(conn_hnd->vc, &sreqp);
+            if(req_complete){
+                /* If we have queued sends and credits to send data - go ahead with sending */
+                if(MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd)){
+                    MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Send credits available. Processing queued req...");
+                    mpi_errno = process_pending_req(conn_hnd);
+                    if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                }
+            }
+            else{
+                MPIU_Assert(0);
+                mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, sreqp);
+                if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+                MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_ENQUEUE(conn_hnd->vc, sreqp);
+            }
         }
+        else{
+            if(MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_HEAD(conn_hnd->vc) ==
+                MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(conn_hnd->vc)){
+                /* Only ZCP/Cont_send req in posted Q */
+                /* If ZCP is in progress - the ZCP handlers will handle sends */
+                if(!conn_hnd->zcp_in_progress){
+                    if(sreqp->dev.iov_count == 0){
+                        mpi_errno = cont_send_success_handler(&(conn_hnd->zcp_send_result));
+                        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                    }
+                    else{
+                        /* Repost the remaining/reloaded IOV */
+                        mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, sreqp);
+                        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                    }
+                }
+            }
+            else{
+                req_complete = 0;
+                mpi_errno = MPID_Nem_nd_handle_posted_sendq_head_req(conn_hnd->vc, &req_complete);
+                if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+                MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_DEQUEUE(conn_hnd->vc, &sreqp);
+
+                MPIU_Assert(req_complete);
+            }
+        }
     }
  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_SEND_SUCCESS_HANDLER);
@@ -1548,14 +2586,15 @@
     pmsg = GET_MSGBUF_FROM_MSGRESULT(send_result);
     MPIU_Assert(pmsg != NULL);
 
-    MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Send succeeded...");
-	conn_hnd->npending_ops--;
+    MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Netmod Send succeeded...");
 
-    if(conn_hnd->vc != NULL){
+    /* Reset the handlers & enqueue this send buffer to freeq */
+    SET_MSGBUF_HANDLER(pmsg, send_success_handler, gen_send_fail_handler);
+    MSGBUF_FREEQ_ENQUEUE(conn_hnd, pmsg);
+
+    if(MPID_NEM_ND_VC_IS_CONNECTED(conn_hnd->vc)){
         /* Increment number of available send credits only when a credit packet is recvd */
         /* There is no request associated with this send - Netmod msg */
-        /* Enqueue this send buffer to freeq */
-        MSGBUF_FREEQ_ENQUEUE(conn_hnd, pmsg);
 
         /* If we have queued sends and credits to send data - go ahead with sending */
         if(MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd)){
@@ -1572,7 +2611,6 @@
     goto fn_exit;
 }
 
-
 #undef FUNCNAME
 #define FUNCNAME wait_cack_success_handler
 #undef FCNAME
@@ -1598,11 +2636,14 @@
     if(pmsg->hdr.type == MPID_NEM_ND_CONN_ACK_PKT){
         /* Connection successful */
         MPIDI_VC_t *vc;
-        MPIDI_CH3I_VC *vc_ch;
 
-        vc = conn_hnd->vc;
-        vc_ch = (MPIDI_CH3I_VC *)vc->channel_private;
+        /* Set this conn vc to the stored vc info */
+        vc = conn_hnd->tmp_vc;
 
+        conn_hnd->vc = vc;
+        MPID_NEM_ND_VCCH_NETMOD_CONN_HND_SET(vc, conn_hnd);
+        /* We no longer need tmp conn info in VC */
+        MPID_NEM_ND_VCCH_NETMOD_TMP_CONN_HND_INIT(vc);
         MPID_NEM_ND_VCCH_NETMOD_STATE_SET(vc, MPID_NEM_ND_VC_STATE_CONNECTED);
         MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_ACTIVE);
 
@@ -1617,6 +2658,8 @@
     else{
         /* Connection failed - Lost in head to head on the remote side */
         conn_hnd->vc = NULL;
+        conn_hnd->tmp_vc = NULL;
+
         MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_QUIESCENT);
         mpi_errno = MPID_Nem_nd_conn_disc(conn_hnd);
         if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
@@ -1655,7 +2698,7 @@
     lconn_hnd = GET_CONNHND_FROM_EX_RECV_OV(recv_ov);
     MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_INIT(lconn_hnd));
 
-    mpi_errno = MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_g, MPID_NEM_ND_ACCEPT_CONN, lconn_hnd->p_conn, &new_conn_hnd);
+    mpi_errno = MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_g, MPID_NEM_ND_ACCEPT_CONN, lconn_hnd->p_conn, NULL, &new_conn_hnd);
     if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
     /* Get the pg information sent with the connect request */
@@ -1691,6 +2734,7 @@
         pg_id = (char *)MPIDI_Process.my_pg->id;
         mpi_errno = MPID_Nem_nd_decode_pg_info(pg_id, pg_info->pg_rank, &vc, &pg);
         if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "New conn (%p) for rank = %d", new_conn_hnd, pg_info->pg_rank));
     }
     else{
         /* FIXME: TODO */
@@ -1698,43 +2742,53 @@
     }
 
     vc_ch = (MPIDI_CH3I_VC *)vc->channel_private;
-    if(MPID_NEM_ND_CONN_HND_IS_VALID(MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc))){
-        if(MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc) != MPID_NEM_ND_VC_STATE_CONNECTED){
-            /* VC is connecting - head-to-head scenario */
-            MPID_Nem_nd_conn_hnd_t old_conn_hnd = MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc);
-            int old_conn_won_hh=0;
-            mpi_errno = MPID_Nem_nd_resolve_head_to_head(pg_info->pg_rank, pg, pg_id, &old_conn_won_hh);
-            if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-            MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "H-H: Old conn (%p:%d) & new conn (%p:%d)", old_conn_hnd, old_conn_hnd->state, new_conn_hnd, new_conn_hnd->state));
-            if(old_conn_won_hh){
-                /* Won head to head with new conn */
-                /* Send a NAK and close the new conn */
-                MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Old conn (%p) won head to head", MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc)));
-                terminate_conn = 1;
-            }
-            else{
-                /* Lost head to head with new conn */
-                /* Make old conn orphan - The other side with send
-                 * us a LNAK and we can close the old conn
-                 */
-                MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "New conn (%p) won head to head", new_conn_hnd));
+    if(MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc) == MPID_NEM_ND_VC_STATE_CONNECTED){
+        /* VC is already connected */
+        terminate_conn = 1;
+    }
+    else if(MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc) == MPID_NEM_ND_VC_STATE_CONNECTING){
+        /* VC is connecting - head-to-head scenario */
+        MPID_Nem_nd_conn_hnd_t old_conn_hnd = MPID_NEM_ND_VCCH_NETMOD_TMP_CONN_HND_GET(vc);
+        int old_conn_won_hh = 0;
 
-                MPID_NEM_ND_VCCH_NETMOD_CONN_HND_SET(vc, new_conn_hnd);
-                new_conn_hnd->vc = vc;
-                terminate_conn = 0;
-            }
+        mpi_errno = MPID_Nem_nd_resolve_head_to_head(pg_info->pg_rank, pg, pg_id, &old_conn_won_hh);
+        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        /* The old conn may not be VALID yet - So don't use it */
+        MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_INIT(old_conn_hnd));
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "H-H: Old conn (%p:%d) & new conn (%p:%d)", old_conn_hnd, old_conn_hnd->state, new_conn_hnd, new_conn_hnd->state));
+        
+        if(old_conn_won_hh){
+            /* Won head to head with new conn
+             * Send a NAK and close the new conn 
+             */
+            MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Old conn (%p) won head to head", MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc)));
+            terminate_conn = 1;
         }
         else{
-            /* VC is already connected */
-            terminate_conn = 1;
+            /* Lost head to head with new conn
+             * Make old conn orphan - The other side will send
+             * us a LNAK and we then close the old conn
+             */
+            MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "New conn (%p) won head to head", new_conn_hnd));
+
+            /*
+            MPID_NEM_ND_VCCH_NETMOD_CONN_HND_SET(vc, new_conn_hnd);
+            new_conn_hnd->vc = vc;
+            */
+            old_conn_hnd->is_orphan = 1;
+            /* Save VC info */
+            new_conn_hnd->tmp_vc = vc;
+            terminate_conn = 0;
         }
     }
-    else{
-        /* No conn associated with this vc */
-        /* Associate vc with this connection */
-        MPID_NEM_ND_VCCH_NETMOD_CONN_HND_SET(vc, new_conn_hnd);
-        /* Associate this conn with the vc */
-        new_conn_hnd->vc = vc;
+    else{ /* VC is DISCONNECTED */
+        /* Save vc info with this connection. We are still not
+         * sure if the VC is CONNECTING - Since a CNAK could mean
+         * an orphan conn
+         * Associate this conn with vc when we receive a CACK
+         */
+        new_conn_hnd->tmp_vc = vc;
         terminate_conn = 0;
     }
 
@@ -1743,31 +2797,29 @@
          * Do a blocking send for LNAK & disc
          */
 		MPID_Nem_nd_msg_t *pmsg;
-		int msg_len=0;
+		SIZE_T msg_len=0;
 
-		/* Post a LACK - do a blocking send */
+        MPID_NEM_ND_CONN_STATE_SET(new_conn_hnd, MPID_NEM_ND_CONN_QUIESCENT);
+		/* Post a LNAK */
 		MSGBUF_FREEQ_DEQUEUE(new_conn_hnd, pmsg);
         MPIU_Assert(pmsg != NULL);
+        SET_MSGBUF_HANDLER(pmsg, netmod_msg_send_success_handler, gen_send_fail_handler);
+
 		pmsg->hdr.type = MPID_NEM_ND_CONN_NAK_PKT;
 		msg_len = sizeof(MPID_Nem_nd_msg_hdr_t );
-		mpi_errno = MPID_Nem_nd_post_send_msg(new_conn_hnd, pmsg, msg_len, 1);
+		mpi_errno = MPID_Nem_nd_post_send_msg(new_conn_hnd, pmsg, msg_len, 0);
 		if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-		MSGBUF_FREEQ_ENQUEUE(new_conn_hnd, pmsg);
 
         MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Passive disc on (%p)", new_conn_hnd));
-        /* Wait for a disconnect from the other side and free resources */
+
+        /* Wait for a disconnect/CNAK/CACK from the other side and free resources */
         mpi_errno = MPID_Nem_nd_conn_passive_disc(new_conn_hnd);
         if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-
-        /*
-		mpi_errno = MPID_Nem_nd_conn_disc(new_conn_hnd);
-		if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-        */
     }
     else{
         /* Connection successful - send LACK & wait for CACK */
         MPID_Nem_nd_msg_t *pmsg;
-        int msg_len=0;
+        SIZE_T msg_len=0;
 
 		MPID_NEM_ND_CONN_STATE_SET(new_conn_hnd, MPID_NEM_ND_CONN_WAIT_CACK);
 		/* Grab the head of the recv ssbufs and set its handlers */
@@ -1779,13 +2831,13 @@
 
         MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Post LACK wait for CACK on (%p)", new_conn_hnd));
 
-		/* Post a LACK - do a blocking send */
+		/* Post a LACK - do a non-blocking send */
 		MSGBUF_FREEQ_DEQUEUE(new_conn_hnd, pmsg);
-		pmsg->hdr.type = MPID_NEM_ND_CONN_ACK_PKT;
+        SET_MSGBUF_HANDLER(pmsg, netmod_msg_send_success_handler, gen_send_fail_handler);
+        pmsg->hdr.type = MPID_NEM_ND_CONN_ACK_PKT;
 		msg_len = sizeof(MPID_Nem_nd_msg_hdr_t );
-		mpi_errno = MPID_Nem_nd_post_send_msg(new_conn_hnd, pmsg, msg_len, 1);
+		mpi_errno = MPID_Nem_nd_post_send_msg(new_conn_hnd, pmsg, msg_len, 0);
 		if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-		MSGBUF_FREEQ_ENQUEUE(new_conn_hnd, pmsg);
     }
 
 fn_exit:
@@ -1805,7 +2857,7 @@
     int mpi_errno = MPI_SUCCESS;
     MPID_Nem_nd_msg_t *precv_msg;
     MPID_Nem_nd_conn_hnd_t conn_hnd;
-    MPIDI_CH3I_VC *vc_ch;
+    int terminate_conn = 0;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_WAIT_LACK_SUCCESS_HANDLER);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_WAIT_LACK_SUCCESS_HANDLER);
@@ -1828,10 +2880,19 @@
         if(precv_msg->hdr.type == MPID_NEM_ND_CONN_ACK_PKT){
             MPID_Nem_nd_msg_t   *psend_msg;
             MPIDI_VC_t *vc;
-            int msg_len=0;
+            SIZE_T msg_len=0;
+
             /* VC is now connected - send an ACK - CACK - to listen side */
-            vc = conn_hnd->vc;
-            vc_ch = (MPIDI_CH3I_VC *)vc->channel_private;
+
+            terminate_conn = 0;
+
+            /* FIXME: Use a single macro to set vc->conn/conn->vc/state/checks etc */
+            vc = conn_hnd->tmp_vc;
+            conn_hnd->vc = vc;
+            MPID_NEM_ND_VCCH_NETMOD_CONN_HND_SET(vc, conn_hnd);
+            /* We no longer need the tmp conn info in vc */
+            MPID_NEM_ND_VCCH_NETMOD_TMP_CONN_HND_INIT(vc);
+            MPIU_Assert(MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc) == MPID_NEM_ND_VC_STATE_CONNECTING);
             MPID_NEM_ND_VCCH_NETMOD_STATE_SET(vc, MPID_NEM_ND_VC_STATE_CONNECTED);
             MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_ACTIVE);
 
@@ -1844,12 +2905,9 @@
             psend_msg->hdr.type = MPID_NEM_ND_CONN_ACK_PKT;
             msg_len = sizeof(MPID_Nem_nd_msg_hdr_t );
             MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Received LACK - Sending CACK");
-            /* We block till the ACK is sent */
-            mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, psend_msg, msg_len, 1);
+            mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, psend_msg, msg_len, 0);
             if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
-            MSGBUF_FREEQ_ENQUEUE(conn_hnd, psend_msg);
-
             /* Repost receive on the used msg buf */
             SET_MSGBUF_HANDLER(precv_msg, recv_success_handler, gen_recv_fail_handler);
             mpi_errno = MPID_Nem_nd_post_recv_msg(conn_hnd, precv_msg);
@@ -1860,46 +2918,49 @@
             if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
         }
         else{
-            /* Received LNAK - Close connection - silently */
-            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Received LNAK - Closing connection");
-
-            MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_QUIESCENT);
-            mpi_errno = MPID_Nem_nd_conn_disc(conn_hnd);
-            if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            /* Received LNAK - Close connection */
+            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Received LNAK - Sending CNAK and closing conn");
+            terminate_conn = 1;
         }
     }
     else{
         /* Send NAK - We lost in head to head connection to the listen side */
         MPIU_Assert((precv_msg->hdr.type == MPID_NEM_ND_CONN_ACK_PKT) ||
                     (precv_msg->hdr.type == MPID_NEM_ND_CONN_NAK_PKT));
-        /* vc is already disconnected from conn */
-        MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_QUIESCENT);
-
+        terminate_conn = 1;
         if(precv_msg->hdr.type == MPID_NEM_ND_CONN_ACK_PKT){
-            /* Send a CNAK and disc */
-            MPID_Nem_nd_msg_t   *psend_msg;
-            int msg_len=0;
+            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Received LACK - Lost HH - Sending CNAK & Closing connection");
+        }
+        else{
+            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Received LNAK - Lost HH - Sending CNAK & Closing connection");
+        }
+    }
+    if(terminate_conn){
+        /* We reach here in 2 cases
+         * case 1: Received LNAK, Conn is not orphan yet
+         * case 2: Conn is orphan
+         * In both cases send a CNAK and disconnect
+         */
+        /* Send a CNAK and disc */
+        MPID_Nem_nd_msg_t   *psend_msg;
+        SIZE_T msg_len=0;
 
-            /* Blocking send for CNAK */
-            MPIU_Assert(!MSGBUF_FREEQ_IS_EMPTY(conn_hnd));
-            MSGBUF_FREEQ_DEQUEUE(conn_hnd, psend_msg);
-            MPIU_Assert(psend_msg != NULL);
+        /* VC is already/or-will-be connected from conn 
+         * by the listen side. The VC may also have terminated by now.
+         * - So don't change the state of VC here
+         * the vc state is left *as-is*
+         */
+        MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_QUIESCENT);
 
-            psend_msg->hdr.type = MPID_NEM_ND_CONN_NAK_PKT;
-            msg_len = sizeof(MPID_Nem_nd_msg_hdr_t );
-            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Received LACK - Lost HH - Sending CNAK & Closing connection");
+        MPIU_Assert(!MSGBUF_FREEQ_IS_EMPTY(conn_hnd));
+        MSGBUF_FREEQ_DEQUEUE(conn_hnd, psend_msg);
+        MPIU_Assert(psend_msg != NULL);
+        SET_MSGBUF_HANDLER(psend_msg, quiescent_msg_handler, gen_send_fail_handler);
 
-            /* We block till the ACK is sent */
-            mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, psend_msg, msg_len, 1);
-            if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        psend_msg->hdr.type = MPID_NEM_ND_CONN_NAK_PKT;
+        msg_len = sizeof(MPID_Nem_nd_msg_hdr_t );
 
-            MSGBUF_FREEQ_ENQUEUE(conn_hnd, psend_msg);
-        }
-        else{
-            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Received LNAK - Lost HH - Closing connection");
-        }
-        /* Close connection - silently */
-        mpi_errno = MPID_Nem_nd_conn_disc(conn_hnd);
+        mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, psend_msg, msg_len, 0);
         if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
     }
 
@@ -1921,9 +2982,10 @@
     HRESULT hr;
     MPID_Nem_nd_conn_hnd_t  conn_hnd;
     MPID_Nem_nd_msg_t   *pmsg, *pzcp_msg;
-    MPID_Nem_nd_block_op_hnd_t zcp_op_hnd;
-    MPID_Request *rreq = NULL;
-    int nb, udata_len=0;
+    MPID_Request *rreqp = NULL;
+    int i, offset_end;
+    char *buf;
+    SIZE_T buflen, nb, udata_len=0;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_RECV_SUCCESS_HANDLER);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_RECV_SUCCESS_HANDLER);
@@ -1936,14 +2998,15 @@
     nb = GET_NB_FROM_MSGRESULT(recv_result);
     MPIU_ERR_CHKANDJUMP(nb == 0, mpi_errno, MPI_ERR_OTHER, "**nd_write");
 
-    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Recvd %d bytes (msg type=%d)",nb, pmsg->hdr.type));
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Recvd " MPIR_UPINT_FMT_DEC_SPEC " bytes (msg type=%d) on conn=%p",
+        nb, pmsg->hdr.type, conn_hnd));
 
     MPIU_Assert(nb >= sizeof(MPID_Nem_nd_msg_hdr_t ));
-    if(!conn_hnd->zcp_in_progress){
+    if(!conn_hnd->send_in_progress){
         conn_hnd->send_credits += pmsg->hdr.credits;
     }
     else{
-        conn_hnd->zcp_credits += pmsg->hdr.credits;
+        conn_hnd->cache_credits += pmsg->hdr.credits;
     }
     udata_len = nb - sizeof(MPID_Nem_nd_msg_hdr_t );
     switch(pmsg->hdr.type){
@@ -1959,13 +3022,39 @@
 
                 break;
         case MPID_NEM_ND_DATA_PKT:
-                MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Received DATA PKT (len = %d, credits = %d)",udata_len, pmsg->hdr.credits));
+                MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Received DATA PKT (len =" MPIR_UPINT_FMT_DEC_SPEC ", credits = %d)",udata_len, pmsg->hdr.credits));
+                buf = pmsg->buf;
+                buflen = udata_len;
 
-                /* The msg just contains the type and udata */
-				/* FIXME: We need to keep track of incomplete recv reqs on the conn */
-                mpi_errno = MPID_nem_handle_pkt(conn_hnd->vc, pmsg->buf, udata_len);
-                if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                do{
+                    MPIU_Assert(conn_hnd->zcp_rreqp == NULL);
+                    rreqp = MPID_NEM_ND_VCCH_GET_ACTIVE_RECV_REQ(conn_hnd->vc);
+                    if(rreqp == NULL){
+                        /* The msg just contains the type and udata */
+                        mpi_errno = MPID_nem_handle_pkt(conn_hnd->vc, buf, buflen);
+                        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
+                        /* MPID_nem_handle_pkt() consumes all data */
+                        buflen = 0;
+                    }
+                    else{
+                        /* Continuing to recv on this conn - Just copy data into req IOVs */
+                        int complete = 0;
+                        SIZE_T nb = 0;
+
+                        nb = buflen;
+                        mpi_errno = nd_read_progress_update(conn_hnd, rreqp, buf, &nb, &complete);
+                        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                        if(complete){
+                            MPID_NEM_ND_VCCH_SET_ACTIVE_RECV_REQ(conn_hnd->vc, NULL);
+                        }   
+                        buflen -= nb;
+                        buf += nb;
+                    }
+
+                    MPIU_Assert(buflen == 0);
+                }while(buflen > 0);
+
                 /* When handling a packet the conn might be disconnected */
                 if(conn_hnd->vc != NULL){
                     /* Repost the recv on the scratch buf */
@@ -1978,18 +3067,86 @@
                 }
                 break;
         case MPID_NEM_ND_RD_AVAIL_PKT:
-                MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "Received RD Avail pkt (len=%d)", udata_len);
+                MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "Received RD Avail pkt (len=" MPIR_UPINT_FMT_DEC_SPEC ")", udata_len);
                 udata_len -= sizeof(MPID_Nem_nd_msg_mw_t);
-                mpi_errno = MPID_nem_handle_pkt(conn_hnd->vc, pmsg->buf, udata_len);
+
+                MPIU_Assert(((MPID_NEM_ND_VCCH_GET_ACTIVE_RECV_REQ(conn_hnd->vc) != NULL) && (conn_hnd->zcp_rreqp != NULL))? 
+                    (MPID_NEM_ND_VCCH_GET_ACTIVE_RECV_REQ(conn_hnd->vc) == conn_hnd->zcp_rreqp) : 1);
+                rreqp = (conn_hnd->zcp_rreqp) ? (conn_hnd->zcp_rreqp) : MPID_NEM_ND_VCCH_GET_ACTIVE_RECV_REQ(conn_hnd->vc) ;
+                conn_hnd->zcp_rreqp = rreqp;
+
+                if(rreqp == NULL){
+                    mpi_errno = MPID_nem_handle_pkt(conn_hnd->vc, pmsg->buf, udata_len);
+                    if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    
+                    rreqp = MPID_NEM_ND_VCCH_GET_ACTIVE_RECV_REQ(conn_hnd->vc);
+                    MPIU_Assert(rreqp != NULL);
+
+                    conn_hnd->zcp_rreqp = rreqp;
+                }
+                else{
+                    SIZE_T len = udata_len;
+                    /* Continuing to recv data on a req */
+                    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST,"Cont to recv data on req=%p", rreqp));
+                    while(len > 0){
+                        int req_complete = 0;
+                        SIZE_T nb_unpack;
+
+                        MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "RD AVAIL contains " MPIR_UPINT_FMT_DEC_SPEC " bytes", len);
+
+                        offset_end = rreqp->dev.iov_offset + rreqp->dev.iov_count;
+                        nb_unpack = len;
+                        mpi_errno = MPID_Nem_nd_unpack_iov(conn_hnd,
+                                        rreqp->dev.iov,
+                                        rreqp->dev.iov_offset,
+                                        &offset_end,
+                                        MPID_NEM_ND_SR_PACK,
+                                        NULL,
+                                        pmsg->buf,
+                                        &nb_unpack);
+                        len -= nb_unpack;
+                        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                        
+                        /* Update the req offset */
+                        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "rreqp(%p) off %d -> %d", rreqp, rreqp->dev.iov_offset, offset_end));
+
+                        rreqp->dev.iov_count -= (offset_end - rreqp->dev.iov_offset);
+                        rreqp->dev.iov_offset = offset_end;
+
+                        if(rreqp->dev.iov_count == 0){
+                            mpi_errno = MPID_Nem_nd_handle_recv_req(conn_hnd, rreqp, &req_complete);
+                            if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                            
+                            /* RD AVAIL always contains data to zcpy */
+                            MPIU_Assert(!req_complete);
+                        }
+                    }
+                }
+
+                for(i=0; i<rreqp->dev.iov_count; i++){
+                    int off = rreqp->dev.iov_offset + i;
+                    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, 
+                        "Trying to zcp unpack req (%p) - iov[%d/tot=%d] = {%p/%u}",
+                        rreqp, off, rreqp->dev.iov_count,
+                        rreqp->dev.iov[off].MPID_IOV_BUF,
+                        rreqp->dev.iov[off].MPID_IOV_LEN
+                    ));
+                }
+
+                offset_end = rreqp->dev.iov_offset + rreqp->dev.iov_count;
+                mpi_errno = MPID_Nem_nd_unpack_iov(conn_hnd,
+                                rreqp->dev.iov,
+                                rreqp->dev.iov_offset,
+                                &offset_end,
+                                MPID_NEM_ND_ZCP_PACK,
+                                (MPID_Nem_nd_msg_mw_t *)&(pmsg->buf[udata_len]),
+                                NULL, NULL);
                 if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-                
-                rreq = MPID_NEM_ND_VCCH_GET_ACTIVE_RECV_REQ(conn_hnd->vc);
 
-                MPIU_Assert(rreq != NULL);
+                MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "rreqp(%p) off %d -> %d", rreqp, rreqp->dev.iov_offset, offset_end));
 
-                ret_errno = memcpy_s((void *)&(conn_hnd->zcp_msg_recv_mw), sizeof(MPID_Nem_nd_msg_mw_t ), (void *)&(pmsg->buf[udata_len]), sizeof(MPID_Nem_nd_msg_mw_t));
-				MPIU_ERR_CHKANDJUMP2((ret_errno != 0), mpi_errno, MPI_ERR_OTHER,
-					"**nd_read", "**nd_read %s %d", strerror(ret_errno), ret_errno);
+                rreqp->dev.iov_count -= (offset_end - rreqp->dev.iov_offset);
+                rreqp->dev.iov_offset = offset_end;
 
                 /* Repost recv buffer */
                 if(conn_hnd->vc != NULL){
@@ -1997,57 +3154,31 @@
                     if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
                 }
 
-                /* A msg buf is guaranteed for RDMA read */
-                MSGBUF_FREEQ_DEQUEUE(conn_hnd, pzcp_msg);
-                MPIU_Assert(pzcp_msg != NULL);
-                
-                SET_MSGBUF_HANDLER(pzcp_msg, zcp_read_success_handler, zcp_read_fail_handler);
-
-                /* FIXME: We just support 1 IOV for now */
-                conn_hnd->zcp_recv_sge.Length = rreq->dev.iov[rreq->dev.iov_offset].MPID_IOV_LEN;
-                conn_hnd->zcp_recv_sge.pAddr = rreq->dev.iov[rreq->dev.iov_offset].MPID_IOV_BUF;
-                /* Registering the local IOV */
-                mpi_errno = MPID_Nem_nd_block_op_init(&zcp_op_hnd, conn_hnd);
+                mpi_errno = MPID_Nem_nd_zcp_recv(conn_hnd);
                 if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
-                hr = MPID_Nem_nd_dev_hnd_g->p_ad->RegisterMemory(conn_hnd->zcp_recv_sge.pAddr, conn_hnd->zcp_recv_sge.Length, MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(zcp_op_hnd), &(conn_hnd->zcp_recv_sge.hMr));
-                if(hr == ND_PENDING){
-					/* Manual event */
-					conn_hnd->npending_ops++;
-					mpi_errno = MPID_Nem_nd_sm_block(zcp_op_hnd);
-					if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-					/*
-                    hr = MPID_Nem_nd_dev_hnd_g->p_ad->GetOverlappedResult(MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(zcp_op_hnd), &nb, TRUE);
-					*/
-                }
-                MPIU_ERR_CHKANDJUMP2(FAILED(hr),
-                    mpi_errno, MPI_ERR_OTHER, "**nd_read", "**nd_read %s %d",
-                    _com_error(hr).ErrorMessage(), hr);
-
-                MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Performing RDMA read for %d bytes", conn_hnd->zcp_recv_sge.Length));
-                MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Using remote mem descriptor : base = %p, length=%I64d, token=%d",
-                    _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Base), _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Length),
-                    conn_hnd->zcp_msg_recv_mw.mw_data.Token));
-
-                hr = conn_hnd->p_ep->Read(GET_PNDRESULT_FROM_MSGBUF(pzcp_msg), &(conn_hnd->zcp_recv_sge), 1,
-                    &(conn_hnd->zcp_msg_recv_mw.mw_data), 0, 0x0);
-                MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
-                    mpi_errno, MPI_ERR_OTHER, "**nd_read", "**nd_read %s %d",
-                    _com_error(hr).ErrorMessage(), hr);
-
                 break;
         case MPID_NEM_ND_RD_ACK_PKT:
                 MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Received RD ACK pkt");
+                MPID_Nem_nd_dev_hnd_g->npending_rds--; conn_hnd->npending_rds--;
+                MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "dev prds = %d; conn prds = %d",
+                    MPID_Nem_nd_dev_hnd_g->npending_rds, conn_hnd->npending_rds));
+
                 /* Get the send credits for conn */
                 MPIU_Assert(udata_len == 0);
                 /* Save the credits in the RD ack pkt */
-                /* conn_hnd->zcp_credits = pmsg->hdr.credits; */
                 /* Invalidate/unbind the address */
+
+                MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Trying to invalidate MW [%p]",
+                    conn_hnd->zcp_send_mw));
+
                 SET_MSGBUF_HANDLER(pmsg, zcp_mw_invalidate_success_handler, gen_recv_fail_handler);
-                hr = conn_hnd->p_ep->Invalidate(GET_PNDRESULT_FROM_MSGBUF(pmsg), conn_hnd->zcp_send_mw, 0x0);
+                hr = conn_hnd->p_ep->Invalidate(GET_PNDRESULT_FROM_MSGBUF(pmsg), 
+                        conn_hnd->zcp_send_mw, ND_OP_FLAG_READ_FENCE);
                 MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
                     mpi_errno, MPI_ERR_OTHER, "**nd_read", "**nd_read %s %d",
                     _com_error(hr).ErrorMessage(), hr);
+
                 break;
         default:
                 MPIU_Assert(0);
@@ -2080,33 +3211,33 @@
 
     /* FIXME: We shouldn't block here */
     /* Block and complete the connect() */
-    mpi_errno = MPID_Nem_nd_block_op_init(&op_hnd, conn_hnd);
+    mpi_errno = MPID_Nem_nd_block_op_init(&op_hnd, 1, conn_hnd, 1);
     if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Complete connect on conn(%p)/block_op(%p) on ov(%p)",
+           conn_hnd, op_hnd, MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(op_hnd)));
+
     hr = conn_hnd->p_conn->CompleteConnect(MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(op_hnd));
-    if(hr == ND_PENDING){
-		/* Manual event */
+    if(SUCCEEDED(hr)){
+        MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_WAIT_LACK);
+        /* Receive is already pre-posted. Set the handlers correctly and wait
+        * for LACK from the other process
+        */
+        pmsg = GET_RECV_SBUF_HEAD(conn_hnd);
+        MPIU_Assert(pmsg != NULL);
+        SET_MSGBUF_HANDLER(pmsg, wait_lack_success_handler, gen_recv_fail_handler);
+
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Setting the wait_lack recv handler for msg_buf = %p", pmsg));
+
+        /* Manual event */
 		conn_hnd->npending_ops++;
 		mpi_errno = MPID_Nem_nd_sm_block(op_hnd);
 		if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-		/*
-        hr = MPID_Nem_nd_dev_hnd_g->p_ad->GetOverlappedResult(MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(op_hnd), &nb, TRUE);
-		*/
     }
     MPIU_ERR_CHKANDJUMP2(FAILED(hr),
         mpi_errno, MPI_ERR_OTHER, "**nd_connect", "**nd_connect %s %d",
         _com_error(hr).ErrorMessage(), hr);
 
-    MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_WAIT_LACK);
-    /* Receive is already pre-posted. Set the handlers correctly and wait
-     * for LACK from the other process
-     */
-    pmsg = GET_RECV_SBUF_HEAD(conn_hnd);
-    MPIU_Assert(pmsg != NULL);
-    SET_MSGBUF_HANDLER(pmsg, wait_lack_success_handler, gen_recv_fail_handler);
-
-    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Setting the wait_lack recv handler for msg_buf = %p", pmsg));
-
  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_CONNECTING_SUCCESS_HANDLER);
     return mpi_errno;
@@ -2171,10 +3302,10 @@
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_QUIESCENT_HANDLER);
 
     conn_hnd = GET_CONNHND_FROM_EX_SEND_OV(send_ov);
-    if(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd)){
-        MPID_Nem_nd_conn_hnd_finalize(MPID_Nem_nd_dev_hnd_g, &conn_hnd);
-    }
 
+    MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd));
+    MPID_Nem_nd_conn_hnd_finalize(MPID_Nem_nd_dev_hnd_g, &conn_hnd);
+
  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_QUIESCENT_HANDLER);
     return mpi_errno;
@@ -2202,6 +3333,13 @@
 
     SET_EX_WR_HANDLER(conn_hnd, quiescent_handler, quiescent_handler);
 
+    /* FIXME: DEREGISTER ALL RECV BUFS HERE ...*/
+    MPIU_Assert(conn_hnd->npending_ops == 0);
+    MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "Posting disconnect on conn(%p)\n", conn_hnd);
+
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Posting disc on conn(%p) on ov(%p)",
+           conn_hnd, MPIU_EX_GET_OVERLAPPED_PTR(&(conn_hnd->send_ov))));
+
     hr = conn_hnd->p_conn->Disconnect(MPIU_EX_GET_OVERLAPPED_PTR(&(conn_hnd->send_ov)));
     MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
         mpi_errno, MPI_ERR_OTHER, "**nd_disc", "**nd_disc %s %d",
@@ -2215,6 +3353,7 @@
     goto fn_exit;
 }
 
+
 #undef FUNCNAME
 #define FUNCNAME block_op_handler
 #undef FCNAME
@@ -2229,13 +3368,21 @@
     hnd = CONTAINING_RECORD(ov, MPID_Nem_nd_block_op_hnd_, ex_ov);
 
 	MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(hnd->conn_hnd));
-	/* Handle manual event completion */
-	hnd->conn_hnd->npending_ops--;
 
-    MPID_Nem_nd_block_op_finalize(&hnd);
+    if(hnd->npending_ops == 0){
+        MPID_Nem_nd_block_op_finalize(&hnd);
+    }
+    else{
+        mpi_errno = MPID_Nem_nd_block_op_reinit(hnd);
+        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    }
 
+ fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_BLOCK_OP_HANDLER);
     return mpi_errno;
+ fn_fail:
+    MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+    goto fn_exit;
 }
 
 #undef FUNCNAME
@@ -2252,15 +3399,56 @@
     hnd = CONTAINING_RECORD(ov, MPID_Nem_nd_block_op_hnd_, ex_ov);
 
 	MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(hnd->conn_hnd));
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Manual event handler on conn(%p)/block_op(%p) on ov(%p)",
+        hnd->conn_hnd, hnd, ov));
+
+    MPIU_Assert(hnd->npending_ops > 0);
+    /* FIXME: At least for now, both the block op and the conn have the same number of pending ops */
+    MPIU_Assert(hnd->conn_hnd->npending_ops > 0);
+    /* Note that the conn may be waiting on only one blocking op (conn_hnd->npending_ops) while the
+     * block op itself has two ops pending, e.g., registering memory before a connect
+     */
 	/* Handle manual event completion */
-	hnd->conn_hnd->npending_ops--;
+    hnd->npending_ops--;
+    if(hnd->conn_hnd->npending_ops > 0){
+        hnd->conn_hnd->npending_ops--;
+    }
 
 	MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "[%d] manual events pending", hnd->conn_hnd->npending_ops);
 
+    if(hnd->npending_ops == 0){
+        MPID_Nem_nd_block_op_finalize(&hnd);
+    }
+    else{
+        mpi_errno = MPID_Nem_nd_block_op_reinit(hnd);
+        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    }
+
+ fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_MANUAL_EVENT_HANDLER);
     return mpi_errno;
+ fn_fail:
+    MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+    goto fn_exit;
 }
 
+#undef FUNCNAME
+#define FUNCNAME dummy_handler
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int __cdecl dummy_handler(MPIU_EXOVERLAPPED *ov)
+{
+    int mpi_errno = MPI_SUCCESS;
+    /* No-op handler: ov is not examined and MPI_SUCCESS is always returned */
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_DUMMY_HANDLER);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_DUMMY_HANDLER);
+
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_DUMMY_HANDLER);
+
+    return mpi_errno;
+}
+
 /* The caller is responsible for freeing the pg info buffer allocated by
  * this function
  */
@@ -2314,44 +3502,7 @@
     MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
     goto fn_exit;
 }
-/* FIXME - Remove
-#undef FUNCNAME
-#define FUNCNAME MPID_Nem_nd_resolve_remote_addr
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_resolve_remote_addr(MPID_Nem_nd_conn_hnd_t conn_hnd,
-                                    struct sockaddr *punresolved_sin,
-                                    int unresolved_sin_len,
-                                    struct sockaddr *presolved_sin,
-                                    int resolved_sin_len)
-{
-    int mpi_errno = MPI_SUCCESS, ret, len;
-    SOCKET s;
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_RESOLVE_REMOTE_ADDR);
 
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_RESOLVE_REMOTE_ADDR);
-
-    s = WSASocketW(AF_INET, SOCK_STREAM, 0, NULL, 0, WSA_FLAG_OVERLAPPED);
-    MPIU_ERR_CHKANDJUMP2((s == INVALID_SOCKET), mpi_errno, MPI_ERR_OTHER,
-        "**sock_create", "**sock_create %s %d",
-        MPIU_OSW_Strerror(MPIU_OSW_Get_errno()), MPIU_OSW_Get_errno());
-
-    ret = WSAIoctl(s, SIO_ROUTING_INTERFACE_QUERY,
-        (void *)punresolved_sin, (DWORD )unresolved_sin_len,
-        (void *)presolved_sin, (DWORD )resolved_sin_len, (DWORD *)&len, NULL, NULL);
-    MPIU_ERR_CHKANDJUMP2((ret == SOCKET_ERROR), mpi_errno, MPI_ERR_OTHER,
-        "**ioctl_socket", "**ioctl_socket %s %d",
-        MPIU_OSW_Strerror(MPIU_OSW_Get_errno()), MPIU_OSW_Get_errno());
-
- fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_RESOLVE_REMOTE_ADDR);
-    return mpi_errno;
- fn_fail:
-    MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
-    goto fn_exit;
-}
-*/
-
 /* Start connecting on the nd conn corresponding to vc
  * Prepost recvs before we connect
  */
@@ -2372,21 +3523,28 @@
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_CONN_EST);
 
+    MPIU_Assert(!MPID_NEM_ND_CONN_HND_IS_INIT(MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc)));
+
+    /* This should be done first because at least some ops below can block.
+     * Setting the VC state to CONNECTING prevents duplicate connect()s
+     */
+    MPID_NEM_ND_VCCH_NETMOD_STATE_SET(vc, MPID_NEM_ND_VC_STATE_CONNECTING);
+
     /* Create a conn - The progress engine will keep track of
      * this connection.
      */
-    mpi_errno = MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_g, MPID_NEM_ND_CONNECT_CONN, NULL, &conn_hnd);
+    /* Set tmp conn info in the VC at init time - This might be required by the accept() side
+     * to mark this conn as an orphan
+     */
+    mpi_errno = MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_g, MPID_NEM_ND_CONNECT_CONN, NULL, vc, &conn_hnd);
     if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
     MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_C_CONNECTING);
-    /* Set VC's conn to this conn */
-    MPID_NEM_ND_VCCH_NETMOD_CONN_HND_SET(vc, conn_hnd);
-    /* This connection is related to this vc. If this connection
-     * loses in a head to head battle this conn will still point to
-     * the vc, however the vc will no longer point to this conn
-     * making this conn an ORPHAN CONN
+
+    /* Save the vc info in the conn 
+     * Set conn info in vc & vc info in conn after we receive LACK
      */
-    conn_hnd->vc = vc;
+    conn_hnd->tmp_vc = vc;
 
     /* We don't handle dynamic conns yet - no tmp vcs*/
     MPIU_Assert(vc->pg != NULL);
@@ -2426,7 +3584,10 @@
      * successful ?
      */
     MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Connecting to %s:%d(pg_rank=%d, pg_id_len=%d)", ifname, sin.sin_port, ((MPID_Nem_nd_pg_info_hdr_t *)pg_info)->pg_rank, ((MPID_Nem_nd_pg_info_hdr_t *)pg_info)->pg_id_len));
-    
+
+    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Posting connect on conn(%p) on ov(%p)",
+           conn_hnd, MPIU_EX_GET_OVERLAPPED_PTR(&(conn_hnd->send_ov))));
+
     hr = conn_hnd->p_conn->Connect(conn_hnd->p_ep,
             (const struct sockaddr *)&sin, sizeof(struct sockaddr_in),
             MPID_NEM_ND_PROT_FAMILY, 0, (void *)pg_info, pg_info_len,
@@ -2453,7 +3614,7 @@
 #define FUNCNAME MPID_Nem_nd_sm_poll
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_sm_poll(void )
+int MPID_Nem_nd_sm_poll(int in_blocking_poll)
 {
     int mpi_errno = MPI_SUCCESS;
     BOOL wait_for_event_and_status = FALSE;
@@ -2461,7 +3622,7 @@
     static int num_skip_polls = 0;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_POLL);
 
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_POLL);
+    /* MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_POLL); */
     /* ND progress */
     if(num_skip_polls++ < MPID_NEM_ND_SM_SKIP_POLL){
         goto fn_exit;
@@ -2474,8 +3635,8 @@
         /* Reset event completion status */
         status = FALSE;
         /* On return, if (wait_for_event_and_status == FALSE) then
-         * there are no more events in ND Cq
-         */
+        * there are no more events in ND Cq
+        */
         mpi_errno = MPID_Nem_nd_process_completions(MPID_Nem_nd_dev_hnd_g->p_cq, &status);
         if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
     }while(status == TRUE);
@@ -2488,7 +3649,7 @@
     }while(wait_for_event_and_status == TRUE);
 
  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_POLL);
+    /* MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_POLL); */
     return mpi_errno;
  fn_fail:
     MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
@@ -2518,35 +3679,37 @@
 	/* We need to check conn_hnd status even if block op becomes invalid */
 	conn_hnd = op_hnd->conn_hnd;
 
-	/* Currently only blocking on pending nd ops */
-	while(conn_hnd->npending_ops > 0){
+    MPIU_Assert(conn_hnd->npending_ops == 1);
+    /* MPIU_Assert(op_hnd->npending_ops == 1); */
+	/* Currently only blocking on pending ex ops */
+    /*
+    while(conn_hnd->npending_ops > 0){
 		HRESULT hr;
 		SIZE_T nb=0;
 
-		/* Wait for an event */
         hr = MPID_Nem_nd_dev_hnd_g->p_cq->GetOverlappedResult(MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(op_hnd), &nb, TRUE);
 	    MPIU_ERR_CHKANDJUMP(FAILED(hr), mpi_errno, MPI_ERR_OTHER, "**intern");
 
-		/* Process the completed event */
-        status = FALSE;
-        mpi_errno = MPID_Nem_nd_process_completions(MPID_Nem_nd_dev_hnd_g->p_cq, &status);
-        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        do{
+            status = FALSE;
 
-		if(status == FALSE){
-			/* No event on CQ - We must be blocking on a manual event */
-			status = FALSE;
-			mpi_errno = MPIU_ExProcessCompletions(MPID_Nem_nd_exset_hnd_g, &status);
-			if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            mpi_errno = MPID_Nem_nd_process_completions(MPID_Nem_nd_dev_hnd_g->p_cq, &status);
+            if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
-			MPIU_Assert(status == TRUE);
-		}
-
-		if(conn_hnd->npending_ops > 0){
-			/* Re-initialize block op */
-			mpi_errno = MPID_Nem_nd_block_op_reinit(op_hnd);
-			if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-		}
+		    if(status == FALSE){
+ 			    mpi_errno = MPIU_ExProcessCompletions(MPID_Nem_nd_exset_hnd_g, &status);
+			    if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+		    }
+        }while(status == TRUE);
 	}
+    */
+    while(conn_hnd->npending_ops > 0){
+        status = TRUE;
+        mpi_errno = MPIU_ExProcessCompletions(MPID_Nem_nd_exset_hnd_g, &status);
+        if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        /* Since we only support blocking ops on EX - at least one op should complete */
+        MPIU_Assert(status == TRUE);
+    }
 
  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_BLOCK);

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_sm.h
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_sm.h	2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_sm.h	2011-02-21 22:57:52 UTC (rev 8003)
@@ -31,6 +31,13 @@
 #define MPID_NEM_ND_IS_FC_PKT(pkt_type) ((pkt_type != MPID_NEM_ND_CRED_PKT) && \
                                         (pkt_type != MPID_NEM_ND_RD_AVAIL_PKT) && \
                                         (pkt_type != MPID_NEM_ND_RD_ACK_PKT))
+/* Packing kinds, numbered from 0; MPID_NEM_ND_INVALID_PACK is the last enumerator (value 2) */
+typedef enum{
+    MPID_NEM_ND_SR_PACK=0,
+    MPID_NEM_ND_ZCP_PACK,
+    MPID_NEM_ND_INVALID_PACK
+} MPID_Nem_nd_pack_t;
+
 /* We use a simple cookie to make sure that the connection
  * is an MPICH2 nd connection
  */
@@ -81,6 +88,7 @@
 }while(0)
 #define MSGBUF_FREEQ_IS_EMPTY(_conn_hnd) (_conn_hnd->ssbuf_freeq.nbuf == 0)
 #define MSGBUF_FREEQ_DEQUEUE(_conn_hnd, _pmsg_buf) do{\
+    MPIU_Assert(!MSGBUF_FREEQ_IS_EMPTY(_conn_hnd)); \
     _pmsg_buf = &(_conn_hnd->ssbuf[_conn_hnd->ssbuf_freeq.head].msg);  \
     (_pmsg_buf)->hdr.type = MPID_NEM_ND_INVALID_PKT; \
     (_pmsg_buf)->hdr.credits = 0;   \



More information about the mpich2-commits mailing list