[mpich2-commits] r8003 - mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd
jayesh at mcs.anl.gov
jayesh at mcs.anl.gov
Mon Feb 21 16:57:52 CST 2011
Author: jayesh
Date: 2011-02-21 16:57:52 -0600 (Mon, 21 Feb 2011)
New Revision: 8003
Modified:
mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_ad_util.cpp
mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_ep_util.cpp
mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_finalize.cpp
mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_impl.h
mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_init.cpp
mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_misc_util.cpp
mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_poll.cpp
mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_send.cpp
mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_sm.cpp
mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_sm.h
Log:
This patch includes the fixes/changes listed below. Since the code isn't completely functional without this patch, the different changes are not checked in separately.
1. Fixed several bugs in the conn protocol
2. We only wait on EX events now - we no longer wait on ND events
3. Added support for non-contig data
4. Added support for user-defined datatypes. Data is now packed before sending
5. Added support for data transfers greater than the device limit
6. Support blocking sends
7. Handle any unfinished reqs before terminating/closing a VC
8. Nemesis uses iov_count to indicate the remaining IOVs. Fixed code where the sock channel's interpretation of iov_count was used.
9. ND_PENDING is a SUCCESS, not a FAILURE - fixed parts of the code that treated this condition as a failure
10. Added several debug statements
11. Removed unused code/comments etc
Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_ad_util.cpp
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_ad_util.cpp 2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_ad_util.cpp 2011-02-21 22:57:52 UTC (rev 8003)
@@ -49,10 +49,16 @@
MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "Successfully created an ND CQ (sz=%d)", cq_sz);
MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST,
- "ND CQ : size = %d, msz = %d, mir = %d, mor = %d, mirl = %d, morl = %d",
+ "ND CQ : size = " MPIR_UPINT_FMT_DEC_SPEC ", mcq = " MPIR_UPINT_FMT_DEC_SPEC
+ ", mir = " MPIR_UPINT_FMT_DEC_SPEC ", mor = " MPIR_UPINT_FMT_DEC_SPEC
+ ", mirl = " MPIR_UPINT_FMT_DEC_SPEC ", morl = " MPIR_UPINT_FMT_DEC_SPEC
+ ", mol = " MPIR_UPINT_FMT_DEC_SPEC ", mreg_sz = " MPIR_UPINT_FMT_DEC_SPEC
+ ", lreq_thres = " MPIR_UPINT_FMT_DEC_SPEC,
cq_sz, hnd->ad_info.MaxCqEntries,
hnd->ad_info.MaxInboundRequests, hnd->ad_info.MaxOutboundRequests,
- hnd->ad_info.MaxInboundReadLimit, hnd->ad_info.MaxOutboundReadLimit));
+ hnd->ad_info.MaxInboundReadLimit, hnd->ad_info.MaxOutboundReadLimit,
+ hnd->ad_info.MaxOutboundLength, hnd->ad_info.MaxRegistrationSize,
+ hnd->ad_info.LargeRequestThreshold));
/* Associate the adapter with the Executive */
MPIU_ExAttachHandle(ex_hnd, MPIU_EX_GENERIC_COMP_PROC_KEY, hnd->p_ad->GetFileHandle());
@@ -126,6 +132,9 @@
mpi_errno = MPID_Nem_nd_ad_init(*phnd, ex_hnd);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ (*phnd)->npending_rds = 0;
+ (*phnd)->zcp_pending = 0;
+
MPIU_CHKPMEM_COMMIT();
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_DEV_HND_INIT);
Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_ep_util.cpp
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_ep_util.cpp 2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_ep_util.cpp 2011-02-21 22:57:52 UTC (rev 8003)
@@ -11,7 +11,7 @@
#define FUNCNAME MPID_Nem_nd_conn_hnd_init
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_t dev_hnd, MPID_Nem_nd_conn_type_t conn_type, INDConnector *p_conn, MPID_Nem_nd_conn_hnd_t *pconn_hnd)
+int MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_t dev_hnd, MPID_Nem_nd_conn_type_t conn_type, INDConnector *p_conn, MPIDI_VC_t *vc, MPID_Nem_nd_conn_hnd_t *pconn_hnd)
{
int mpi_errno = MPI_SUCCESS;
HRESULT hr;
@@ -41,16 +41,44 @@
MPID_NEM_ND_CONN_STATE_SET((*pconn_hnd), MPID_NEM_ND_CONN_QUIESCENT);
(*pconn_hnd)->vc = NULL;
+ if(vc != NULL){
+ /* Make sure that we set the tmp conn info in the vc before we block
+ * We wait till 3-way handshake before setting vc info
+ * for this conn & conn info for the vc
+ * This info could be used by accept() side to signal that the conn
+ * is no longer valid - orphan
+ */
+ MPID_NEM_ND_VCCH_NETMOD_TMP_CONN_HND_SET(vc, (*pconn_hnd));
+ }
MPIU_ExInitOverlapped(&((*pconn_hnd)->recv_ov), NULL, NULL);
MPIU_ExInitOverlapped(&((*pconn_hnd)->send_ov), NULL, NULL);
+ (*pconn_hnd)->is_orphan = 0;
+ (*pconn_hnd)->tmp_vc = NULL;
(*pconn_hnd)->npending_ops = 0;
+ (*pconn_hnd)->send_in_progress = 0;
(*pconn_hnd)->zcp_in_progress = 0;
-
+ (*pconn_hnd)->zcp_rreqp = NULL;
+
+ (*pconn_hnd)->npending_rds = 0;
+
+ (*pconn_hnd)->zcp_send_offset = 0;
+
/* Create an endpoint - listen conns don't need an endpoint */
if((conn_type == MPID_NEM_ND_CONNECT_CONN) || (conn_type == MPID_NEM_ND_ACCEPT_CONN)){
+ ND_ADAPTER_INFO info;
+ SIZE_T len = sizeof(info);
+
+ hr = dev_hnd->p_ad->Query(1, &info, &len);
+ MPIU_ERR_CHKANDJUMP2(FAILED(hr),
+ mpi_errno, MPI_ERR_OTHER, "**nd_ep_create", "**nd_ep_create %s %d",
+ _com_error(hr).ErrorMessage(), hr);
+
+ /* FIXME: Use MPID_NEM_ND_CONN_RECVQ_SZ, MPID_NEM_ND_CONN_SENDQ_SZ for
+ * number of inboud/outbound requests
+ */
hr = (*pconn_hnd)->p_conn->CreateEndpoint(dev_hnd->p_cq, dev_hnd->p_cq,
- MPID_NEM_ND_CONN_RECVQ_SZ, MPID_NEM_ND_CONN_SENDQ_SZ,
+ info.MaxInboundRequests, info.MaxOutboundRequests,
MPID_NEM_ND_CONN_SGE_MAX, MPID_NEM_ND_CONN_SGE_MAX,
MPID_NEM_ND_CONN_RDMA_RD_MAX, MPID_NEM_ND_CONN_RDMA_RD_MAX,
NULL, &((*pconn_hnd)->p_ep));
@@ -88,6 +116,7 @@
* allowed even if init() fails
*/
MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_INIT(*p_conn_hnd));
+
if((*p_conn_hnd)->p_ep){
/* Release endpoint */
(*p_conn_hnd)->p_ep->Release();
Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_finalize.cpp
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_finalize.cpp 2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_finalize.cpp 2011-02-21 22:57:52 UTC (rev 8003)
@@ -82,6 +82,13 @@
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_VC_TERMINATE);
+ /* Poll till no more pending/posted sends */
+ while(!MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_EMPTY(vc)
+ || !MPID_NEM_ND_VCCH_NETMOD_PENDING_SENDQ_EMPTY(vc)){
+ mpi_errno = MPID_Nem_nd_sm_poll(1);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
+
vc_ch->next = NULL;
vc_ch->prev = NULL;
MPID_NEM_ND_VCCH_NETMOD_STATE_SET(vc, MPID_NEM_ND_VC_STATE_DISCONNECTED);
Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_impl.h
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_impl.h 2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_impl.h 2011-02-21 22:57:52 UTC (rev 8003)
@@ -35,9 +35,13 @@
* completion queue
*/
INDCompletionQueue *p_cq;
+ int npending_rds;
+ volatile int zcp_pending;
} *MPID_Nem_nd_dev_hnd_t;
#define MPID_NEM_ND_DEV_HND_INVALID NULL
+#define MPID_NEM_ND_DEV_RDMA_RD_MAX 2
+#define MPID_NEM_ND_DEV_IO_LIMIT(_dev_hnd) (_dev_hnd->ad_info.MaxOutboundLength - 1)
/* Checks whether dev handle is initialized */
#define MPID_NEM_ND_DEV_HND_IS_INIT(hnd) ((hnd) != NULL)
@@ -58,23 +62,25 @@
#define MPID_NEM_ND_CONN_SENDQ_SZ (MPID_NEM_ND_CONN_FC_BUFS_MAX+MPID_NEM_ND_CONN_RDMA_RD_MAX+MPID_NEM_ND_CONN_FC_MSG_MAX)
#define MPID_NEM_ND_CONN_RECVQ_SZ (MPID_NEM_ND_CONN_FC_BUFS_MAX+MPID_NEM_ND_CONN_RDMA_RD_MAX)
-#define MPID_NEM_ND_CONN_SGE_MAX 1
+#define MPID_NEM_ND_CONN_SGE_MAX 16
/* We use bcopy for upto 1K of upper layer data - pkt + user data
* FIXME: Tune this value after some runtime exp
*/
-#define MPID_NEM_ND_CONN_UDATA_SZ 1024
+#define MPID_NEM_ND_CONN_UDATA_SZ 2048
typedef struct MPID_Nem_nd_msg_mw_{
/* The memory window descriptor of next data
* i.e., upper layer data > MPID_NEM_ND_CONN_UDATA_SZ
* - if any
*/
+ /* FIXME: Only use/send the valid mw_datas */
ND_MW_DESCRIPTOR mw_data;
/* The memory window descriptor containing
* memory window descriptors of subsequent user data
* eg: Non contig sends
* - if any
*/
+ /* FIXME: Use this for multi-mws */
ND_MW_DESCRIPTOR mw_mws;
} MPID_Nem_nd_msg_mw_t;
@@ -131,6 +137,10 @@
INDEndpoint *p_ep;
INDConnector *p_conn;
MPIDI_VC_t *vc;
+ /* Set if this conn loses in H-H */
+ int is_orphan;
+ /* Used by conns to store vc till 3-way handshake - i.e., LACK/CACK */
+ MPIDI_VC_t *tmp_vc;
/* EX OV for Connect() */
/* FIXME: Use this for Send() etc after extending Executive */
MPIU_EXOVERLAPPED send_ov;
@@ -164,24 +174,32 @@
* FIXME: Can we get this info from send_credits ?
*/
int npending_ops;
+
+ /* FIXME : REMOVE ME ! */
+ int npending_rds;
/* Is a Flow control pkt pending ? */
int fc_pkt_pending;
- /* FIXME: Make sure that we only have 1 pending RDMA read */
- /* FIXME: Move rdma fields to another struct */
- /* Once we finish invalidating a MW - use these credits as send_credits */
-
/* RDMA Send side fields */
+ int zcp_send_offset;
+ int send_in_progress;
int zcp_in_progress;
+ ND_SGE zcp_send_sge;
+ /* MPID_Request *zcp_sreq; */
+ /* The ND memory window */
INDMemoryWindow *zcp_send_mw;
+ /* The memory window desc sent in the ND message */
MPID_Nem_nd_msg_mw_t zcp_msg_send_mw;
ND_MR_HANDLE zcp_send_mr_hnd;
MPID_Nem_nd_msg_result_t zcp_send_result;
/* RDMA Recv side fields*/
- int zcp_credits;
+ int cache_credits;
+ int zcp_recv_sge_count;
+ ND_SGE zcp_recv_sge[MPID_IOV_LIMIT];
+ MPID_Request *zcp_rreqp;
MPID_Nem_nd_msg_mw_t zcp_msg_recv_mw;
- ND_SGE zcp_recv_sge;
+ /* int zcp_recv_mw_offset; */
/* MPID_Nem_nd_msg_result_t zcp_recv_result; */
} *MPID_Nem_nd_conn_hnd_t;
@@ -190,6 +208,8 @@
/* For EX blocking ops */
MPIU_EXOVERLAPPED ex_ov;
MPID_Nem_nd_conn_hnd_t conn_hnd;
+ /* The number of blocking ops to wait before finalizing the hnd */
+ int npending_ops;
} *MPID_Nem_nd_block_op_hnd_t;
#define MPID_NEM_ND_BLOCK_OP_HND_INVALID NULL
#define MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(hnd) (MPIU_EX_GET_OVERLAPPED_PTR(&(hnd->ex_ov)))
@@ -200,7 +220,10 @@
/* Checks whether conn handle is valid */
#define MPID_NEM_ND_CONN_HND_IS_VALID(_hnd) (((_hnd) != NULL) && \
((_hnd)->p_conn != NULL) && ((_hnd)->p_ep != NULL))
-#define MPID_NEM_ND_CONN_STATE_SET(_hnd, _state) (_hnd->state = _state)
+#define MPID_NEM_ND_CONN_STATE_SET(_hnd, _state) do{ \
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "conn[%p] %d - %d", (_hnd), (_hnd)->state, _state)); \
+ (_hnd->state = _state); \
+}while(0);
/* Using an unused IANA protocol family */
#define MPID_NEM_ND_PROT_FAMILY 234
@@ -218,10 +241,11 @@
};
#define MPID_NEM_ND_CONN_IS_CONNECTING(_conn_hnd) (_conn_hnd && ( (_conn_hnd->state > MPID_NEM_ND_CONN_QUIESCENT) && (_conn_hnd->state < MPID_NEM_ND_CONN_ACTIVE) ))
-
+#define MPID_NEM_ND_CONN_IS_CONNECTED(_conn_hnd) (_conn_hnd && (_conn_hnd->state == MPID_NEM_ND_CONN_ACTIVE))
/* VC states */
typedef enum{
MPID_NEM_ND_VC_STATE_DISCONNECTED=0,
+ MPID_NEM_ND_VC_STATE_CONNECTING,
MPID_NEM_ND_VC_STATE_CONNECTED
} MPID_Nem_nd_vc_state_t;
@@ -230,6 +254,8 @@
on the network module */
typedef struct {
MPID_Nem_nd_conn_hnd_t conn_hnd;
+ /* Used by connect() to temperorily store the conn handle */
+ MPID_Nem_nd_conn_hnd_t tmp_conn_hnd;
struct{
struct MPID_Request *head;
struct MPID_Request *tail;
@@ -241,9 +267,11 @@
MPID_Nem_nd_vc_state_t state;
} MPID_Nem_nd_vc_area;
+#define MPID_NEM_ND_IS_BLOCKING_REQ(_reqp) ((_reqp)->dev.OnDataAvail != NULL)
#define MPID_NEM_ND_VCCH_GET_ACTIVE_RECV_REQ(_vc) (((MPIDI_CH3I_VC *)((_vc)->channel_private))->recv_active)
#define MPID_NEM_ND_VCCH_SET_ACTIVE_RECV_REQ(_vc, _req) (((MPIDI_CH3I_VC *)((_vc)->channel_private))->recv_active = _req)
#define MPID_NEM_ND_VCCH_NETMOD_CONN_HND_INIT(_vc) ((((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->conn_hnd) = MPID_NEM_ND_CONN_HND_INVALID)
+#define MPID_NEM_ND_VCCH_NETMOD_TMP_CONN_HND_INIT(_vc) ((((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->tmp_conn_hnd) = MPID_NEM_ND_CONN_HND_INVALID)
#define MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_INIT(_vc) do{\
(((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->posted_sendq).head = NULL; \
(((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->posted_sendq).tail = NULL; \
@@ -254,6 +282,8 @@
#define MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(_vc) ((((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->posted_sendq).tail)
#define MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_ENQUEUE(_vc, _reqp) GENERIC_Q_ENQUEUE (&(((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->posted_sendq), _reqp, dev.next)
#define MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_DEQUEUE(_vc, _reqp) GENERIC_Q_DEQUEUE (&(((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->posted_sendq), _reqp, dev.next)
+#define MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_REM_TAIL(_vc, _reqp) GENERIC_Q_SEARCH_REMOVE (&(((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->posted_sendq), ( (_reqp) && ((*_reqp)->dev.next == NULL) ), _reqp, MPID_Request, dev.next)
+
#define MPID_NEM_ND_VCCH_NETMOD_PENDING_SENDQ_INIT(_vc) do{\
(((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->pending_sendq).head = NULL; \
(((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->pending_sendq).tail = NULL; \
@@ -269,25 +299,38 @@
#define MPID_NEM_ND_VCCH_NETMOD_FIELD_GET(_vc, _field) (((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->_field)
#define MPID_NEM_ND_VCCH_NETMOD_CONN_HND_SET(_vc, _conn_hnd) ((((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->conn_hnd) = _conn_hnd)
#define MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(_vc) (((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->conn_hnd)
+#define MPID_NEM_ND_VCCH_NETMOD_TMP_CONN_HND_SET(_vc, _conn_hnd) ((((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->tmp_conn_hnd) = _conn_hnd)
+#define MPID_NEM_ND_VCCH_NETMOD_TMP_CONN_HND_GET(_vc) (((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->tmp_conn_hnd)
#define MPID_NEM_ND_VCCH_NETMOD_STATE_SET(_vc, _state) ((((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->state) = _state)
#define MPID_NEM_ND_VCCH_NETMOD_STATE_GET(_vc) (((MPID_Nem_nd_vc_area *)((MPIDI_CH3I_VC *)(_vc)->channel_private)->netmod_area.padding)->state)
/* VC Netmod util funcs */
-#define MPID_NEM_ND_VC_IS_CONNECTED(_vc) (\
+#define MPID_NEM_ND_VC_IS_CONNECTED(_vc) ( \
+ (_vc) && \
+ (MPID_NEM_ND_VCCH_NETMOD_STATE_GET(_vc) == MPID_NEM_ND_VC_STATE_CONNECTED) && \
(MPID_NEM_ND_CONN_HND_IS_VALID(MPID_NEM_ND_VCCH_NETMOD_FIELD_GET(_vc, conn_hnd))) && \
(MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(_vc)->state == MPID_NEM_ND_CONN_ACTIVE) \
)
+#define MPID_NEM_ND_VC_IS_CONNECTING(_vc) (\
+ (_vc) && \
+ (MPID_NEM_ND_VCCH_NETMOD_STATE_GET(_vc) == MPID_NEM_ND_VC_STATE_CONNECTING) \
+)
+
/* CONN is orphan if
* - conn is not valid
* - conn is related to a VC that is no longer related to it (eg: lost in head to head)
*/
+/*
#define MPID_NEM_ND_CONN_IS_ORPHAN(_hnd) (\
!MPID_NEM_ND_CONN_HND_IS_VALID(_hnd) || \
((_hnd->vc) && (MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(_hnd->vc) != _hnd)) \
)
+*/
+#define MPID_NEM_ND_CONN_IS_ORPHAN(_hnd) (_hnd->is_orphan)
#define MPID_NEM_ND_CONN_HAS_SCREDITS(_hnd) (_hnd->send_credits > 0)
#define MPID_NEM_ND_CONN_DECR_SCREDITS(_hnd) (_hnd->send_credits--)
+#define MPID_NEM_ND_CONN_DECR_CACHE_SCREDITS(_hnd) (_hnd->cache_credits--)
/* #define MPID_NEM_ND_CONN_INCR_SCREDITS(_hnd) (_hnd->send_credits++) */
/* #define MPID_NEM_ND_CONN_DECR_RCREDITS(_hnd) (_hnd->recv_credits--) */
@@ -310,18 +353,19 @@
int MPID_Nem_nd_dev_hnd_init(MPID_Nem_nd_dev_hnd_t *phnd, MPIU_ExSetHandle_t ex_hnd);
int MPID_Nem_nd_dev_hnd_finalize(MPID_Nem_nd_dev_hnd_t *phnd);
-int MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_t dev_hnd, MPID_Nem_nd_conn_type_t conn_type, INDConnector *p_conn, MPID_Nem_nd_conn_hnd_t *pconn_hnd);
+int MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_t dev_hnd, MPID_Nem_nd_conn_type_t conn_type, INDConnector *p_conn, MPIDI_VC_t *vc, MPID_Nem_nd_conn_hnd_t *pconn_hnd);
int MPID_Nem_nd_conn_hnd_finalize(MPID_Nem_nd_dev_hnd_t dev_hnd, MPID_Nem_nd_conn_hnd_t *p_conn_hnd);
int MPID_Nem_nd_sm_init(void );
int MPID_Nem_nd_sm_finalize(void );
-int MPID_Nem_nd_sm_poll(void );
+int MPID_Nem_nd_sm_poll(int in_blocking_poll);
int MPID_Nem_nd_conn_block_op_init(MPID_Nem_nd_conn_hnd_t conn_hnd);
int MPID_Nem_nd_conn_msg_bufs_init(MPID_Nem_nd_conn_hnd_t conn_hnd);
int MPID_Nem_nd_listen_for_conn(int pg_rank, char **bc_val_p, int *val_max_sz_p);
int MPID_Nem_nd_conn_disc(MPID_Nem_nd_conn_hnd_t conn_hnd);
int MPID_Nem_nd_conn_est(MPIDI_VC_t *vc);
-int MPID_Nem_nd_post_sendv(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_IOV *iov, int n_iov);
+int MPID_Nem_nd_post_sendv(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_Request *sreqp);
+int MPID_Nem_nd_post_sendbv(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_Request *sreqp);
int MPID_Nem_nd_init(MPIDI_PG_t *pg_p, int pg_rank, char **bc_val_p, int *val_max_sz_p);
Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_init.cpp
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_init.cpp 2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_init.cpp 2011-02-21 22:57:52 UTC (rev 8003)
@@ -82,6 +82,7 @@
vc_ch->prev = NULL;
MPID_NEM_ND_VCCH_NETMOD_CONN_HND_INIT(vc);
+ MPID_NEM_ND_VCCH_NETMOD_TMP_CONN_HND_INIT(vc);
MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_INIT(vc);
MPID_NEM_ND_VCCH_NETMOD_PENDING_SENDQ_INIT(vc);
fn_exit:
Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_misc_util.cpp
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_misc_util.cpp 2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_misc_util.cpp 2011-02-21 22:57:52 UTC (rev 8003)
@@ -337,10 +337,12 @@
MPIU_Assert(plocal_won_flag != NULL);
if(MPIDI_Process.my_pg == remote_pg){
/* Same process group - compare ranks to determine the winning rank */
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Same process group, comparing ranks");
*plocal_won_flag = (MPIDI_Process.my_pg_rank < remote_rank) ? 1 : 0;
}
else{
/* Different process groups - compare pg ids to determine the winning rank */
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Diff process group, comparing ids");
*plocal_won_flag = (strcmp((char *)MPIDI_Process.my_pg->id, remote_pg_id) < 0) ? 1 : 0;
}
Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_poll.cpp
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_poll.cpp 2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_poll.cpp 2011-02-21 22:57:52 UTC (rev 8003)
@@ -15,13 +15,13 @@
int mpi_errno = MPI_SUCCESS;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_POLL);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_POLL);
+ /* MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_POLL); */
- mpi_errno = MPID_Nem_nd_sm_poll();
+ mpi_errno = MPID_Nem_nd_sm_poll(in_blocking_poll);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_POLL);
+ /* MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_POLL); */
return mpi_errno;
fn_fail:
MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_send.cpp
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_send.cpp 2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_send.cpp 2011-02-21 22:57:52 UTC (rev 8003)
@@ -11,53 +11,70 @@
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPID_Nem_nd_istart_contig_msg(MPIDI_VC_t *vc, void *hdr, MPIDI_msg_sz_t hdr_sz, void *data, MPIDI_msg_sz_t data_sz,
- MPID_Request **sreq_ptr)
+ MPID_Request **sreqp_ptr)
{
int mpi_errno = MPI_SUCCESS;
- MPID_Request * sreq = NULL;
+ MPID_Request * sreqp = NULL;
int is_send_posted = 0;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_ISTART_CONTIG_MSG);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_ISTART_CONTIG_MSG);
MPIU_Assert((hdr_sz > 0) && (hdr_sz <= sizeof(MPIDI_CH3_Pkt_t)));
- MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "istart_contig_msg (hdr_sz=%d,data_sz=%d)", hdr_sz, data_sz));
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "istart_contig_msg (hdr_sz=" MPIDI_MSG_SZ_FMT ",data_sz=" MPIDI_MSG_SZ_FMT ")", hdr_sz, data_sz));
+
+ /* Create a request and queue it */
+ sreqp = MPID_Request_create();
+ MPIU_Assert(sreqp != NULL);
+ MPIU_Object_set_ref(sreqp, 2);
+ sreqp->kind = MPID_REQUEST_SEND;
+
+ sreqp->dev.OnDataAvail = NULL;
+ sreqp->ch.vc = vc;
+ sreqp->dev.iov_offset = 0;
+
+ sreqp->dev.pending_pkt = *(MPIDI_CH3_PktGeneric_t *)hdr;
+ sreqp->dev.iov[0].MPID_IOV_BUF = (char *)&sreqp->dev.pending_pkt;
+ sreqp->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_PktGeneric_t);
+
+ if(data_sz > 0){
+ sreqp->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST )data;
+ sreqp->dev.iov[1].MPID_IOV_LEN = data_sz;
+ sreqp->dev.iov_count = 2;
+ }
+ else{
+ sreqp->dev.iov_count = 1;
+ }
+
+ is_send_posted = 0;
if(MPID_NEM_ND_VC_IS_CONNECTED(vc)){
MPID_Nem_nd_conn_hnd_t conn_hnd = MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc);
/* Try sending data - if no credits queue the remaining data */
MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "vc connected - trying to send data");
if(MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd)){
/* We have send credits */
- MPID_IOV iov[2];
- int iov_cnt;
/* Post a send for data & queue request */
- iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST )hdr;
- MPIU_Assert(hdr_sz <= sizeof(MPIDI_CH3_PktGeneric_t ));
- iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_PktGeneric_t);
- if(data_sz > 0){
- iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST )data;
- iov[1].MPID_IOV_LEN = data_sz;
- iov_cnt = 2;
+ if(!MPID_NEM_ND_IS_BLOCKING_REQ(sreqp)){
+ mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, sreqp);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
}
else{
- iov_cnt = 1;
+ mpi_errno = MPID_Nem_nd_post_sendbv(conn_hnd, sreqp);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
}
- mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, iov, iov_cnt);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-
/* Now queue the request in posted queue */
is_send_posted = 1;
}
}
else{
/* VC is not connected */
- is_send_posted = 0;
- if(MPID_NEM_ND_CONN_IS_CONNECTING(MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc))){
+ if(MPID_NEM_ND_VC_IS_CONNECTING(vc)){
/* Already connecting - just queue req in pending queue */
MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "vc connecting - queueing data");
}
else{
+ MPIU_Assert(MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc) == MPID_NEM_ND_VC_STATE_DISCONNECTED);
/* Start connecting and queue req in pending queue */
mpi_errno = MPID_Nem_nd_conn_est(vc);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
@@ -65,36 +82,14 @@
}
}
- /* Create a request and queue it */
- sreq = MPID_Request_create();
- MPIU_Assert(sreq != NULL);
- MPIU_Object_set_ref(sreq, 2);
- sreq->kind = MPID_REQUEST_SEND;
-
- sreq->dev.OnDataAvail = NULL;
- sreq->ch.vc = vc;
- sreq->dev.iov_offset = 0;
-
- sreq->dev.pending_pkt = *(MPIDI_CH3_PktGeneric_t *)hdr;
- sreq->dev.iov[0].MPID_IOV_BUF = (char *)&sreq->dev.pending_pkt;
- sreq->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_PktGeneric_t);
-
- if(data_sz > 0){
- sreq->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST )data;
- sreq->dev.iov[1].MPID_IOV_LEN = data_sz;
- sreq->dev.iov_count = 2;
- }
- else{
- sreq->dev.iov_count = 1;
- }
if(is_send_posted){
- MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_ENQUEUE(vc, sreq);
+ MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_ENQUEUE(vc, sreqp);
}
else{
- MPID_NEM_ND_VCCH_NETMOD_PENDING_SENDQ_ENQUEUE(vc, sreq);
+ MPID_NEM_ND_VCCH_NETMOD_PENDING_SENDQ_ENQUEUE(vc, sreqp);
}
- *sreq_ptr = sreq;
+ *sreqp_ptr = sreqp;
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_ISTART_CONTIG_MSG);
@@ -108,7 +103,7 @@
#define FUNCNAME MPID_Nem_nd_send_contig
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_send_contig(MPIDI_VC_t *vc, MPID_Request *sreq, void *hdr, MPIDI_msg_sz_t hdr_sz,
+int MPID_Nem_nd_send_contig(MPIDI_VC_t *vc, MPID_Request *sreqp, void *hdr, MPIDI_msg_sz_t hdr_sz,
void *data, MPIDI_msg_sz_t data_sz)
{
int mpi_errno = MPI_SUCCESS;
@@ -118,72 +113,70 @@
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SEND_CONTIG);
MPIU_Assert((hdr_sz > 0) && (hdr_sz <= sizeof(MPIDI_CH3_Pkt_t)));
- MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "isend_contig_msg (hdr_sz=%d,data_sz=%d)", hdr_sz, data_sz));
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "isend_contig_msg (hdr_sz=" MPIDI_MSG_SZ_FMT ",data_sz=" MPIDI_MSG_SZ_FMT ")", hdr_sz, data_sz));
+ /* FIXME: Update the req dev iov fields only for unposted sends
+ */
+ sreqp->ch.vc = vc;
+ sreqp->dev.iov_offset = 0;
+ sreqp->dev.pending_pkt = *(MPIDI_CH3_PktGeneric_t *)hdr;
+ sreqp->dev.iov[0].MPID_IOV_BUF = (char *)&sreqp->dev.pending_pkt;
+ sreqp->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_PktGeneric_t);
+
+ if(data_sz > 0){
+ sreqp->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST )data;
+ sreqp->dev.iov[1].MPID_IOV_LEN = data_sz;
+ sreqp->dev.iov_count = 2;
+ }
+ else{
+ sreqp->dev.iov_count = 1;
+ }
+
+ is_send_posted = 0;
if(MPID_NEM_ND_VC_IS_CONNECTED(vc)){
MPID_Nem_nd_conn_hnd_t conn_hnd = MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc);
/* Try sending data - if no credits queue the remaining data */
MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "vc connected - trying to send data");
if(MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd)){
- /* We have send credits */
- MPID_IOV iov[2];
- int iov_cnt;
- /* Post a send for data & queue request */
- iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST )hdr;
- MPIU_Assert(hdr_sz <= sizeof(MPIDI_CH3_PktGeneric_t ));
- iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_PktGeneric_t);
- if(data_sz > 0){
- iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST )data;
- iov[1].MPID_IOV_LEN = data_sz;
- iov_cnt = 2;
+
+ if(!MPID_NEM_ND_IS_BLOCKING_REQ(sreqp)){
+ mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, sreqp);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
}
else{
- iov_cnt = 1;
+ mpi_errno = MPID_Nem_nd_post_sendbv(conn_hnd, sreqp);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
}
- mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, iov, iov_cnt);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-
/* Now queue the request in posted queue */
is_send_posted = 1;
}
}
else{
/* VC is not connected */
- is_send_posted = 0;
- if(MPID_NEM_ND_CONN_IS_CONNECTING(MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc))){
+ if(MPID_NEM_ND_VC_IS_CONNECTING(vc)){
/* Already connecting - just queue req in pending queue */
MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "vc connecting - queueing data");
}
else{
/* Start connecting and queue req in pending queue */
+ if(MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc) != MPID_NEM_ND_VC_STATE_DISCONNECTED){
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "vc(%p:%d), conn(%p:%d",
+ vc, MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc),
+ MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc),
+ (MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc))->state));
+ }
+ MPIU_Assert(MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc) == MPID_NEM_ND_VC_STATE_DISCONNECTED);
mpi_errno = MPID_Nem_nd_conn_est(vc);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Posted a connect - queueing data");
}
}
- /* FIXME: Update the req dev iov fields only for unposted sends
- */
- /* Create a request and queue it */
- sreq->ch.vc = vc;
- sreq->dev.iov_offset = 0;
- sreq->dev.pending_pkt = *(MPIDI_CH3_PktGeneric_t *)hdr;
- sreq->dev.iov[0].MPID_IOV_BUF = (char *)&sreq->dev.pending_pkt;
- sreq->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_PktGeneric_t);
-
- if(data_sz > 0){
- sreq->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST )data;
- sreq->dev.iov[1].MPID_IOV_LEN = data_sz;
- sreq->dev.iov_count = 2;
- }
- else{
- sreq->dev.iov_count = 1;
- }
if(is_send_posted){
- MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_ENQUEUE(vc, sreq);
+ MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_ENQUEUE(vc, sreqp);
}
else{
- MPID_NEM_ND_VCCH_NETMOD_PENDING_SENDQ_ENQUEUE(vc, sreq);
+ MPID_NEM_ND_VCCH_NETMOD_PENDING_SENDQ_ENQUEUE(vc, sreqp);
}
fn_exit:
@@ -198,21 +191,88 @@
#define FUNCNAME MPID_Nem_nd_send_noncontig
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_send_noncontig(MPIDI_VC_t *vc, MPID_Request *sreq, void *header, MPIDI_msg_sz_t hdr_sz)
+int MPID_Nem_nd_send_noncontig(MPIDI_VC_t *vc, MPID_Request *sreqp, void *header, MPIDI_msg_sz_t hdr_sz)
{
int mpi_errno = MPI_SUCCESS;
+ int is_send_posted = 0;
+ MPID_IOV *iov;
+ int iov_cnt = 0, i;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SEND_NONCONTIG);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SEND_NONCONTIG);
- MPIU_Assert(hdr_sz <= sizeof(MPIDI_CH3_Pkt_t));
- /* FIXME: We have not implemented send for non contig msgs yet */
- MPIU_Assert(0);
+ MPIU_Assert((hdr_sz > 0) && (hdr_sz <= sizeof(MPIDI_CH3_Pkt_t)));
+
+ iov = &(sreqp->dev.iov[0]);
+ /* Reserve 1st IOV for header */
+ iov_cnt = MPID_IOV_LIMIT - 1;
+
+ /* On return iov_cnt refers to the number of IOVs loaded */
+ mpi_errno = MPIDI_CH3U_Request_load_send_iov(sreqp, &(iov[1]), &iov_cnt);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|loadsendiov");
+
+ sreqp->dev.pending_pkt = *(MPIDI_CH3_PktGeneric_t *)header;
+ iov[0].MPID_IOV_BUF = (char *)&sreqp->dev.pending_pkt;
+ iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_PktGeneric_t);
+
+ iov_cnt += 1;
+
+ /* FIXME: Update the req dev iov fields only for unposted sends
+ */
+ sreqp->ch.vc = vc;
+ sreqp->dev.iov_offset = 0;
+ sreqp->dev.iov_count = iov_cnt;
+
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "isend_noncontig_msg (hdr_sz=" MPIDI_MSG_SZ_FMT ")", hdr_sz));
+ for(i=1; i<iov_cnt; i++){
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "isend_noncontig_msg (iov[%d] = %p, size =%u)", i, iov[i].MPID_IOV_BUF, iov[i].MPID_IOV_LEN));
+ }
if(MPID_NEM_ND_VC_IS_CONNECTED(vc)){
+ MPID_Nem_nd_conn_hnd_t conn_hnd = MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc);
/* Try sending data - if no credits queue the remaining data */
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "vc connected - trying to send data");
+ if(MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd)){
+ if(!MPID_NEM_ND_IS_BLOCKING_REQ(sreqp)){
+ mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, sreqp);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
+ else{
+ mpi_errno = MPID_Nem_nd_post_sendbv(conn_hnd, sreqp);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
+
+ /* Now queue the request in posted queue */
+ is_send_posted = 1;
+ }
}
else{
- /* Start connecting */
+ /* VC is not connected */
+ is_send_posted = 0;
+ if(MPID_NEM_ND_VC_IS_CONNECTING(vc)){
+ /* Already connecting - just queue req in pending queue */
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "vc connecting - queueing data");
+ }
+ else{
+ /* Start connecting and queue req in pending queue */
+ if(MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc) != MPID_NEM_ND_VC_STATE_DISCONNECTED){
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "vc(%p:%d), conn(%p:%d",
+ vc, MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc),
+ MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc),
+ (MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc))->state));
+ }
+ MPIU_Assert(MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc) == MPID_NEM_ND_VC_STATE_DISCONNECTED);
+ mpi_errno = MPID_Nem_nd_conn_est(vc);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Posted a connect - queueing data");
+ }
}
+
+ if(is_send_posted){
+ MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_ENQUEUE(vc, sreqp);
+ }
+ else{
+ MPID_NEM_ND_VCCH_NETMOD_PENDING_SENDQ_ENQUEUE(vc, sreqp);
+ }
+
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SEND_NONCONTIG);
return mpi_errno;
Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_sm.cpp
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_sm.cpp 2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_sm.cpp 2011-02-21 22:57:52 UTC (rev 8003)
@@ -16,13 +16,15 @@
static int gen_recv_fail_handler(MPID_Nem_nd_msg_result_t *recv_result);
static int send_success_handler(MPID_Nem_nd_msg_result_t *send_result);
static int zcp_mw_send_success_handler(MPID_Nem_nd_msg_result_t *send_result);
-static int zcp_send_success_handler(MPID_Nem_nd_msg_result_t *zcp_result);
+static int cont_send_success_handler(MPID_Nem_nd_msg_result_t *zcp_result);
static int netmod_msg_send_success_handler(MPID_Nem_nd_msg_result_t *send_result);
+static int zcp_read_success_handler(MPID_Nem_nd_msg_result_t *send_result);
static int zcp_read_fail_handler(MPID_Nem_nd_msg_result_t *send_result);
static int wait_cack_success_handler(MPID_Nem_nd_msg_result_t *recv_result);
static int wait_lack_success_handler(MPID_Nem_nd_msg_result_t *recv_result);
static int recv_success_handler(MPID_Nem_nd_msg_result_t *send_result);
static int dummy_msg_handler(MPID_Nem_nd_msg_result_t *result);
+static int quiescent_msg_handler(MPID_Nem_nd_msg_result_t *result);
static int free_msg_result_handler(MPID_Nem_nd_msg_result_t *result);
/* The EX handler func decls */
@@ -35,13 +37,20 @@
static int __cdecl gen_ex_fail_handler(MPIU_EXOVERLAPPED *ov);
static int __cdecl block_op_handler(MPIU_EXOVERLAPPED *ov);
static int __cdecl manual_event_handler(MPIU_EXOVERLAPPED *ov);
+static int __cdecl dummy_handler(MPIU_EXOVERLAPPED *ov);
}
static inline int MPID_Nem_nd_handle_posted_sendq_head_req(MPIDI_VC_t *vc, int *req_complete);
+static inline int MPID_Nem_nd_handle_posted_sendq_tail_req(MPIDI_VC_t *vc, int *req_complete);
static int process_pending_req(MPID_Nem_nd_conn_hnd_t conn_hnd);
int MPID_Nem_nd_update_fc_info(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_Nem_nd_msg_t *pmsg);
int MPID_Nem_nd_sm_block(MPID_Nem_nd_block_op_hnd_t op_hnd);
-
+int MPID_Nem_nd_pack_iov(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_IOV *iovp,
+ int offset_start,
+ int *offset_endp,
+ MPID_Nem_nd_msg_t *pmsg,
+ MPID_Nem_nd_pack_t *pack_typep,
+ SIZE_T *nbp);
#undef FUNCNAME
#define FUNCNAME MPID_Nem_nd_sm_init
#undef FCNAME
@@ -76,7 +85,7 @@
#define FUNCNAME MPID_Nem_nd_block_op_init
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_block_op_init(MPID_Nem_nd_block_op_hnd_t *phnd, MPID_Nem_nd_conn_hnd_t conn_hnd)
+int MPID_Nem_nd_block_op_init(MPID_Nem_nd_block_op_hnd_t *phnd, int npending_ops, MPID_Nem_nd_conn_hnd_t conn_hnd, int is_manual_event)
{
int mpi_errno = MPI_SUCCESS;
HRESULT hr;
@@ -87,36 +96,35 @@
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_BLOCK_OP_INIT);
MPIU_Assert(phnd != NULL);
+ MPIU_Assert(npending_ops > 0);
+
+ /* FIXME: For now we only allow 1 blocking op on the conn */
+ MPIU_Assert(conn_hnd->npending_ops == 0);
MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd));
MPIU_CHKPMEM_MALLOC(*phnd, MPID_Nem_nd_block_op_hnd_t, sizeof(struct MPID_Nem_nd_block_op_hnd_), mpi_errno, "Block op hnd");
+ (*phnd)->npending_ops = npending_ops;
(*phnd)->conn_hnd = conn_hnd;
- if(conn_hnd->npending_ops <= 1){
- /* Call the block op handlers only when the last pending event is over
- * Note that the event handler gets called AFTER the event
- */
- MPIU_ExInitOverlapped(&((*phnd)->ex_ov), block_op_handler, block_op_handler);
- }
- else{
- /* Handle manual events with the event handler */
- MPIU_ExInitOverlapped(&((*phnd)->ex_ov), manual_event_handler, manual_event_handler);
- }
+ if(is_manual_event){
+ MPIU_ExInitOverlapped(&((*phnd)->ex_ov), manual_event_handler, manual_event_handler);
+ }
+ else{
+ MPIU_Assert(0);
+ MPIU_ExInitOverlapped(&((*phnd)->ex_ov), block_op_handler, block_op_handler);
+ }
pov = MPIU_EX_GET_OVERLAPPED_PTR(&((*phnd)->ex_ov));
/* Executive initializes event to NULL - So create events after initializing the
* handlers
*/
- pov->hEvent = CreateEvent(NULL, FALSE, FALSE, NULL);
+ pov->hEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
MPIU_ERR_CHKANDJUMP((pov->hEvent == NULL), mpi_errno, MPI_ERR_OTHER, "**intern");
- /* Get notification for all events on CQ */
- hr = MPID_Nem_nd_dev_hnd_g->p_cq->Notify(ND_CQ_NOTIFY_ANY, MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR((*phnd)));
- MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
- mpi_errno, MPI_ERR_OTHER, "**nd_write", "**nd_write %s %d",
- _com_error(hr).ErrorMessage(), hr);
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Get notifications from cq(%p)/conn(%p)/block_op(%p) on ov(%p)",
+ MPID_Nem_nd_dev_hnd_g->p_cq, conn_hnd, (*phnd), pov));
MPIU_CHKPMEM_COMMIT();
fn_exit:
@@ -135,12 +143,20 @@
int MPID_Nem_nd_block_op_finalize(MPID_Nem_nd_block_op_hnd_t *phnd)
{
int mpi_errno = MPI_SUCCESS;
+ OVERLAPPED *pov;
+
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_BLOCK_OP_FINALIZE);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_BLOCK_OP_FINALIZE);
MPIU_Assert(phnd != NULL);
if(*phnd){
+ pov = MPIU_EX_GET_OVERLAPPED_PTR(&((*phnd)->ex_ov));
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Trying to finalize conn(%p)/block_op(%p) on ov(%p)",
+ (*phnd)->conn_hnd, (*phnd), pov));
+ if(pov->hEvent){
+ CloseHandle(pov->hEvent);
+ }
MPIU_Free(*phnd);
}
fn_exit:
@@ -168,18 +184,15 @@
MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(op_hnd->conn_hnd));
/* Re-initialize the ex ov */
- if(op_hnd->conn_hnd->npending_ops <= 1){
- ret = MPIU_ExReInitOverlapped(&(op_hnd->ex_ov), block_op_handler, block_op_handler);
- MPIU_ERR_CHKANDJUMP((ret == FALSE), mpi_errno, MPI_ERR_OTHER, "**intern");
- }
- else{
- ret = MPIU_ExReInitOverlapped(&(op_hnd->ex_ov), manual_event_handler, manual_event_handler);
- MPIU_ERR_CHKANDJUMP((ret == FALSE), mpi_errno, MPI_ERR_OTHER, "**intern");
- }
+ ret = MPIU_ExReInitOverlapped(&(op_hnd->ex_ov), NULL, NULL);
+ MPIU_ERR_CHKANDJUMP((ret == FALSE), mpi_errno, MPI_ERR_OTHER, "**intern");
pov = MPIU_EX_GET_OVERLAPPED_PTR(&(op_hnd->ex_ov));
MPIU_Assert(pov->hEvent != NULL);
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Re-initializing conn(%p)/block_op(%p) on ov(%p)",
+ op_hnd->conn_hnd, op_hnd, pov));
+
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_BLOCK_OP_REINIT);
return mpi_errno;
@@ -188,39 +201,7 @@
goto fn_exit;
}
-
-/*
#undef FUNCNAME
-#define FUNCNAME MPID_Nem_nd_conn_block_op_reinit
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_conn_block_op_reinit(MPID_Nem_nd_conn_hnd_t conn_hnd)
-{
- int mpi_errno = MPI_SUCCESS;
- OVERLAPPED *pov;
- BOOL ret;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_CONN_BLOCK_OP_REINIT);
-
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_CONN_BLOCK_OP_REINIT);
-
- MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_INIT(conn_hnd));
-
- pov = MPIU_EX_GET_OVERLAPPED_PTR(&(conn_hnd->block_ov));
- MPIU_Assert(pov->hEvent != NULL);
-
- ret = ResetEvent(pov->hEvent);
- MPIU_ERR_CHKANDJUMP((pov->hEvent == NULL), mpi_errno, MPI_ERR_OTHER, "**intern");
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_CONN_BLOCK_OP_REINIT);
- return mpi_errno;
- fn_fail:
- MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
- goto fn_exit;
-}
-
-*/
-
#define FUNCNAME MPID_Nem_nd_conn_msg_bufs_init
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
@@ -244,35 +225,35 @@
MSGBUF_FREEQ_INIT(conn_hnd);
/* Register the sendq & recvq with adapter - We block while registering memory */
- mpi_errno = MPID_Nem_nd_block_op_init(&rsbuf_op_hnd, conn_hnd);
+ mpi_errno = MPID_Nem_nd_block_op_init(&rsbuf_op_hnd, 1, conn_hnd, 1);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Registring rs memory conn(%p)/block_op(%p) on ov(%p)",
+ conn_hnd, rsbuf_op_hnd, MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(rsbuf_op_hnd)));
+
hr = MPID_Nem_nd_dev_hnd_g->p_ad->RegisterMemory(conn_hnd->rsbuf, sizeof(conn_hnd->rsbuf), MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(rsbuf_op_hnd), &(conn_hnd->rsbuf_hmr));
- if(hr == ND_PENDING){
+ if(SUCCEEDED(hr)){
/* Manual event */
conn_hnd->npending_ops++;
mpi_errno = MPID_Nem_nd_sm_block(rsbuf_op_hnd);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- /*
- hr = MPID_Nem_nd_dev_hnd_g->p_ad->GetOverlappedResult(MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(rsbuf_op_hnd), &nb, TRUE);
- */
}
MPIU_ERR_CHKANDJUMP2(FAILED(hr),
mpi_errno, MPI_ERR_OTHER, "**nd_listen", "**nd_listen %s %d",
_com_error(hr).ErrorMessage(), hr);
- mpi_errno = MPID_Nem_nd_block_op_init(&ssbuf_op_hnd, conn_hnd);
+ mpi_errno = MPID_Nem_nd_block_op_init(&ssbuf_op_hnd, 1, conn_hnd, 1);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Registring ss memory conn(%p)/block_op(%p) on ov(%p)",
+ conn_hnd, ssbuf_op_hnd, MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(ssbuf_op_hnd)));
+
hr = MPID_Nem_nd_dev_hnd_g->p_ad->RegisterMemory(conn_hnd->ssbuf, sizeof(conn_hnd->ssbuf), MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(ssbuf_op_hnd), &(conn_hnd->ssbuf_hmr));
- if(hr == ND_PENDING){
+ if(SUCCEEDED(hr)){
/* Manual event */
conn_hnd->npending_ops++;
mpi_errno = MPID_Nem_nd_sm_block(ssbuf_op_hnd);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- /*
- hr = MPID_Nem_nd_dev_hnd_g->p_ad->GetOverlappedResult(MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(ssbuf_op_hnd), &nb, TRUE);
- */
}
MPIU_ERR_CHKANDJUMP2(FAILED(hr),
mpi_errno, MPI_ERR_OTHER, "**nd_listen", "**nd_listen %s %d",
@@ -299,12 +280,15 @@
}
conn_hnd->p_ep->SubmitRequestBatch();
- /* FIXME: REMOVE ME !! -start */
MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "conn_hnd (%p)", conn_hnd));
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "========== RECV SBUFS ===========");
for(i=0; i<MPID_NEM_ND_CONN_RECVQ_SZ;i++){
MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "conn_hnd->rsbuf[%d].msg = (%p)", i, &(conn_hnd->rsbuf[i].msg)));
}
- /* FIXME: REMOVE ME !! -end */
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "========== SEND SBUFS ===========");
+ for(i=0; i<MPID_NEM_ND_CONN_SENDQ_SZ;i++){
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "conn_hnd->ssbuf[%d].msg = (%p)", i, &(conn_hnd->ssbuf[i].msg)));
+ }
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_CONN_MSG_BUFS_INIT);
@@ -333,18 +317,18 @@
if(is_blocking){
MPID_Nem_nd_block_op_hnd_t op_hnd;
- mpi_errno = MPID_Nem_nd_block_op_init(&op_hnd, lconn_hnd);
+ mpi_errno = MPID_Nem_nd_block_op_init(&op_hnd, 1, lconn_hnd, 1);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-
+
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Blocking on accept lconn(%p)/block_op(%p) on ov(%p)",
+ lconn_hnd, op_hnd, MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(op_hnd)));
+
hr = new_conn_hnd->p_conn->Accept(new_conn_hnd->p_ep, NULL, 0, MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(op_hnd));
- if(hr == ND_PENDING){
+ if(SUCCEEDED(hr)){
/* Manual event */
lconn_hnd->npending_ops++;
mpi_errno = MPID_Nem_nd_sm_block(op_hnd);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- /*
- hr = new_conn_hnd->p_conn->GetOverlappedResult(MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(op_hnd), &nb, TRUE);
- */
}
MPIU_ERR_CHKANDJUMP2(FAILED(hr),
mpi_errno, MPI_ERR_OTHER, "**nd_accept", "**nd_accept %s %d",
@@ -356,9 +340,12 @@
mpi_errno, MPI_ERR_OTHER, "**nd_accept", "**nd_accept %s %d",
_com_error(hr).ErrorMessage(), hr);
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Posting next req for conn on lconn(%p)/block_op(%p) on ov(%p)",
+ lconn_hnd, op_hnd, MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(op_hnd)));
+
/* Post next req for connection */
hr = MPID_Nem_nd_dev_hnd_g->p_listen->GetConnectionRequest(lconn_hnd->p_conn, MPIU_EX_GET_OVERLAPPED_PTR(&(lconn_hnd->recv_ov)));
- MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
+ MPIU_ERR_CHKANDJUMP2(FAILED(hr),
mpi_errno, MPI_ERR_OTHER, "**nd_accept", "**nd_accept %s %d",
_com_error(hr).ErrorMessage(), hr);
}
@@ -366,7 +353,7 @@
MPIU_Assert(0);
SET_EX_RD_HANDLER(lconn_hnd, listen_success_handler, quiescent_handler);
hr = MPID_Nem_nd_lconn_hnd->p_conn->Accept(new_conn_hnd->p_ep, NULL, 0, MPIU_EX_GET_OVERLAPPED_PTR(&(MPID_Nem_nd_lconn_hnd->recv_ov)));
- MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
+ MPIU_ERR_CHKANDJUMP2(FAILED(hr),
mpi_errno, MPI_ERR_OTHER, "**nd_accept", "**nd_accept %s %d",
_com_error(hr).ErrorMessage(), hr);
}
@@ -386,7 +373,7 @@
int MPID_Nem_nd_listen_for_conn(int pg_rank, char **bc_val_p, int *val_max_sz_p)
{
int mpi_errno = MPI_SUCCESS, ret, use_default_interface=0;
- size_t len;
+ SIZE_T len;
HRESULT hr;
char *buf;
int i;
@@ -397,7 +384,7 @@
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_LISTEN_FOR_CONN);
/* Create listen conn */
- mpi_errno = MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_g, MPID_NEM_ND_LISTEN_CONN, NULL, &MPID_Nem_nd_lconn_hnd);
+ mpi_errno = MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_g, MPID_NEM_ND_LISTEN_CONN, NULL, NULL, &MPID_Nem_nd_lconn_hnd);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
/* Listen for connections */
@@ -411,9 +398,12 @@
SET_EX_RD_HANDLER(MPID_Nem_nd_lconn_hnd, listen_success_handler, quiescent_handler);
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Posting first req for conn on lconn(%p)on ov(%p)",
+ MPID_Nem_nd_lconn_hnd, MPIU_EX_GET_OVERLAPPED_PTR(&(MPID_Nem_nd_lconn_hnd->recv_ov))));
+
/* FIXME: How many conn requests should we pre-post ? */
hr = MPID_Nem_nd_dev_hnd_g->p_listen->GetConnectionRequest(MPID_Nem_nd_lconn_hnd->p_conn, MPIU_EX_GET_OVERLAPPED_PTR(&(MPID_Nem_nd_lconn_hnd->recv_ov)));
- MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
+ MPIU_ERR_CHKANDJUMP2(FAILED(hr),
mpi_errno, MPI_ERR_OTHER, "**nd_listen", "**nd_listen %s %d",
_com_error(hr).ErrorMessage(), hr);
@@ -470,6 +460,7 @@
goto fn_exit;
}
+/* Defer the disconnect until the other side either sends us some data or disconnects */
#undef FUNCNAME
#define FUNCNAME MPID_Nem_nd_conn_passive_disc
#undef FCNAME
@@ -487,11 +478,13 @@
/* Make the conn an orphan */
conn_hnd->vc = NULL;
MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_QUIESCENT);
- SET_EX_RD_HANDLER(conn_hnd, passive_quiescent_handler, passive_quiescent_handler);
+ SET_EX_RD_HANDLER(conn_hnd, dummy_handler, dummy_handler);
- /* Set the recv sbuf handlers to dummy handlers */
+ /* Set the recv sbuf handlers to quiescent msg handlers - the conn is disconnected
+ * after we receive a CACK/CNAK, i.e., some data, on this conn
+ */
for(i=0;i<MPID_NEM_ND_CONN_RECVQ_SZ;i++){
- SET_MSGBUF_HANDLER(&((conn_hnd->rsbuf[i]).msg), dummy_msg_handler, dummy_msg_handler);
+ SET_MSGBUF_HANDLER(&((conn_hnd->rsbuf[i]).msg), quiescent_msg_handler, quiescent_msg_handler);
}
}
@@ -522,17 +515,26 @@
int i=0;
/* Make the conn an orphan */
conn_hnd->vc = NULL;
+ conn_hnd->tmp_vc = NULL;
+
MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_QUIESCENT);
SET_EX_WR_HANDLER(conn_hnd, quiescent_handler, quiescent_handler);
-
+
+ /* FIXME: DEREGISTER ALL RECV BUFS HERE ...*/
/* Set the recv sbuf handlers to dummy handlers */
for(i=0;i<MPID_NEM_ND_CONN_RECVQ_SZ;i++){
SET_MSGBUF_HANDLER(&((conn_hnd->rsbuf[i]).msg), dummy_msg_handler, dummy_msg_handler);
}
+ MPIU_Assert(conn_hnd->npending_ops == 0);
+ MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "Posting disconnect on conn(%p)", conn_hnd);
+
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Posting disc on conn(%p) on ov(%p)",
+ conn_hnd, MPIU_EX_GET_OVERLAPPED_PTR(&(conn_hnd->send_ov))));
+
/* Post disconnect on the ND Conn corresponding to VC */
hr = conn_hnd->p_conn->Disconnect(MPIU_EX_GET_OVERLAPPED_PTR(&(conn_hnd->send_ov)));
- MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
+ MPIU_ERR_CHKANDJUMP2(FAILED(hr),
mpi_errno, MPI_ERR_OTHER, "**nd_disc", "**nd_disc %s %d",
_com_error(hr).ErrorMessage(), hr);
}
@@ -550,7 +552,7 @@
#define FUNCNAME MPID_Nem_nd_post_send_msg
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_post_send_msg(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_Nem_nd_msg_t *pmsg, int msg_len, int is_blocking)
+int MPID_Nem_nd_post_send_msg(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_Nem_nd_msg_t *pmsg, SIZE_T msg_len, int is_blocking)
{
int mpi_errno = MPI_SUCCESS;
HRESULT hr;
@@ -567,12 +569,16 @@
was_fc_pkt = (MPID_NEM_ND_IS_FC_PKT(pmsg->hdr.type)) ? TRUE : FALSE;
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Post send msg [conn=%p, on/msg = %p, sz=%d]",conn_hnd, pmsg, msg_len));
+
/* Update FC info */
mpi_errno = MPID_Nem_nd_update_fc_info(conn_hnd, pmsg);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
if(is_blocking){
- mpi_errno = MPID_Nem_nd_block_op_init(&op_hnd, conn_hnd);
+ /* FIXME: Allow blocking sends */
+ MPIU_Assert(0);
+ mpi_errno = MPID_Nem_nd_block_op_init(&op_hnd, 1, conn_hnd, 0);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
MPIU_CHKPMEM_MALLOC(pmsg_result, MPID_Nem_nd_msg_result_t *, sizeof(MPID_Nem_nd_msg_result_t ), mpi_errno, "block send op result");
@@ -588,13 +594,13 @@
sge.pAddr = pmsg;
sge.hMr = conn_hnd->ssbuf_hmr;
- hr = conn_hnd->p_ep->Send(pnd_result, &sge, 1, 0x0);
+ hr = conn_hnd->p_ep->Send(pnd_result, &sge, 1, ND_OP_FLAG_READ_FENCE);
MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
mpi_errno, MPI_ERR_OTHER, "**nd_write", "**nd_write %s %d",
_com_error(hr).ErrorMessage(), hr);
/* Increment the number of pending ops on conn */
- conn_hnd->npending_ops++;
+ /* conn_hnd->npending_ops++; */
if(is_blocking){
/* Block till all current pending ops complete */
@@ -604,20 +610,19 @@
/* No pending ops */
MPIU_Assert(conn_hnd->npending_ops == 0);
- mpi_errno = MPID_Nem_nd_block_op_finalize(&op_hnd);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-
- /*
- nresults = MPID_Nem_nd_dev_hnd_g->p_cq->GetResults(&presult, 1);
- MPIU_ERR_CHKANDJUMP2(FAILED(presult->Status),
- mpi_errno, MPI_ERR_OTHER, "**nd_write", "**nd_write %s %d",
- _com_error(presult->Status).ErrorMessage(), presult->Status);
- */
MPIU_CHKPMEM_COMMIT();
}
if(was_fc_pkt){
- MPID_NEM_ND_CONN_DECR_SCREDITS(conn_hnd);
+ if(conn_hnd->send_in_progress){
+ if(!conn_hnd->zcp_in_progress){
+ MPID_NEM_ND_CONN_DECR_CACHE_SCREDITS(conn_hnd);
+ }
+ /* ZCP packets are not flow controlled */
+ }
+ else{
+ MPID_NEM_ND_CONN_DECR_SCREDITS(conn_hnd);
+ }
}
fn_exit:
@@ -719,10 +724,8 @@
/* FIXME: fc info in pkt is updated for every msg sent.
* Do we have to explicitly update fc info here ?
*/
- /*
mpi_errno = MPID_Nem_nd_update_fc_info(conn_hnd, pfc_msg);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- */
MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Sending CRED PKT...");
mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, pfc_msg, sizeof(MPID_Nem_nd_msg_hdr_t ), 0);
@@ -748,8 +751,13 @@
int bind_mw_success_handler(MPID_Nem_nd_msg_result_t *zcp_send_result)
{
int mpi_errno = MPI_SUCCESS;
+ int ret_errno;
MPID_Nem_nd_conn_hnd_t conn_hnd;
+ MPID_Nem_nd_pack_t pack_type = MPID_NEM_ND_INVALID_PACK;
+ MPID_Nem_nd_msg_t *pmsg;
MPID_Request *zcp_req = NULL;
+ int i;
+ SIZE_T nb, msg_len, rem_len;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_BIND_MW_SUCCESS_HANDLER);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_BIND_MW_SUCCESS_HANDLER);
@@ -759,11 +767,103 @@
zcp_req = MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(conn_hnd->vc);
MPIU_Assert(zcp_req != NULL);
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "bind succ for IOV[%d] = %p; iov_offset = %d/rem=%d, req=%p",
+ conn_hnd->zcp_send_offset, zcp_req->dev.iov[conn_hnd->zcp_send_offset].MPID_IOV_BUF,
+ zcp_req->dev.iov_offset, zcp_req->dev.iov_count, zcp_req));
+ MPIU_Assert(zcp_req->dev.iov_offset + zcp_req->dev.iov_count <= MPID_IOV_LIMIT);
+ MPIU_Assert(zcp_req->dev.iov_count > 0);
- /* MW created, Registered buf, Bound MW, now post send */
- mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, zcp_req->dev.iov, zcp_req->dev.iov_count);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ MPIU_Assert(!MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd));
+ /* Post the ND message
+ * iov[zcp_req->dev.iov_offset, conn_hnd->zcp_send_offset]
+ * First pack any non-ZCP IOVs, then copy zcp send mw & send
+ */
+ MSGBUF_FREEQ_DEQUEUE(conn_hnd, pmsg);
+ MPIU_Assert(pmsg != NULL);
+
+ SET_MSGBUF_HANDLER(pmsg, zcp_mw_send_success_handler, gen_send_fail_handler);
+ pmsg->hdr.type = MPID_NEM_ND_RD_AVAIL_PKT;
+ pmsg->hdr.credits = 0;
+ msg_len = sizeof(MPID_Nem_nd_msg_hdr_t );
+ rem_len = sizeof(pmsg->buf);
+
+ nb = 0;
+ if(zcp_req->dev.iov_offset < conn_hnd->zcp_send_offset){
+ int off_end;
+ off_end = conn_hnd->zcp_send_offset;
+
+ /* Piggy-back IOVs */
+ mpi_errno = MPID_Nem_nd_pack_iov(conn_hnd,
+ zcp_req->dev.iov,
+ zcp_req->dev.iov_offset,
+ &off_end,
+ pmsg,
+ &(pack_type),
+ &nb);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ MPIU_Assert(pack_type == MPID_NEM_ND_SR_PACK);
+
+ msg_len += nb;
+ rem_len -= nb;
+ }
+
+ /* Now copy the MSG MW to the packet */
+ MPIU_Assert(rem_len >= sizeof(MPID_Nem_nd_msg_mw_t ));
+ ret_errno = memcpy_s((void *)&(pmsg->buf[nb]), rem_len, &(conn_hnd->zcp_msg_send_mw), sizeof(MPID_Nem_nd_msg_mw_t ));
+ MPIU_ERR_CHKANDJUMP2((ret_errno != 0), mpi_errno, MPI_ERR_OTHER,
+ "**nd_write", "**nd_write %s %d", strerror(ret_errno), ret_errno);
+
+ msg_len += sizeof(MPID_Nem_nd_msg_mw_t );
+ rem_len -= sizeof(MPID_Nem_nd_msg_mw_t );
+
+ /* Block on progress engine if we exceed the number of RDs allowed on the conn/device */
+ while(MPID_Nem_nd_dev_hnd_g->npending_rds >= 2){
+ mpi_errno = MPID_Nem_nd_sm_poll(0);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
+
+ MPID_Nem_nd_dev_hnd_g->npending_rds++; conn_hnd->npending_rds++;
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "dev prds = %d; conn prds = %d",
+ MPID_Nem_nd_dev_hnd_g->npending_rds, conn_hnd->npending_rds));
+
+
+ for(i=zcp_req->dev.iov_offset; i<conn_hnd->zcp_send_offset; i++){
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Sending SR packed IOV[off=%d/tot_iovs=%d]=[%p/%u]",
+ i, zcp_req->dev.iov_count,
+ zcp_req->dev.iov[i].MPID_IOV_BUF,
+ zcp_req->dev.iov[i].MPID_IOV_LEN));
+
+ }
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Sending mem descriptor (buf=%p) : base = %p, length=%I64d, token=%d, mw=%p",
+ zcp_req->dev.iov[conn_hnd->zcp_send_offset].MPID_IOV_BUF,
+ _byteswap_uint64(conn_hnd->zcp_msg_send_mw.mw_data.Base),
+ _byteswap_uint64(conn_hnd->zcp_msg_send_mw.mw_data.Length),
+ conn_hnd->zcp_msg_send_mw.mw_data.Token,
+ conn_hnd->zcp_send_mw));
+
+ mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, pmsg, msg_len, 0);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+ if(zcp_req->dev.iov[conn_hnd->zcp_send_offset].MPID_IOV_LEN == 0){
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "zcp_req(%p) off[%d -> %d], cnt[%d -> %d]",
+ zcp_req, zcp_req->dev.iov_offset, conn_hnd->zcp_send_offset + 1,
+ zcp_req->dev.iov_count, zcp_req->dev.iov_count - (conn_hnd->zcp_send_offset - zcp_req->dev.iov_offset + 1)));
+ /* Rem IOVs */
+ zcp_req->dev.iov_count -= (conn_hnd->zcp_send_offset - zcp_req->dev.iov_offset + 1);
+
+ zcp_req->dev.iov_offset = conn_hnd->zcp_send_offset + 1;
+ }
+ else{
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "zcp_req(%p) off[%d -> %d], cnt[%d -> %d]",
+ zcp_req, zcp_req->dev.iov_offset, conn_hnd->zcp_send_offset,
+ zcp_req->dev.iov_count, zcp_req->dev.iov_count - (conn_hnd->zcp_send_offset - zcp_req->dev.iov_offset)));
+ /* Rem IOVs */
+ zcp_req->dev.iov_count -= (conn_hnd->zcp_send_offset - zcp_req->dev.iov_offset);
+
+ zcp_req->dev.iov_offset = conn_hnd->zcp_send_offset;
+ }
+
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_BIND_MW_SUCCESS_HANDLER);
return mpi_errno;
@@ -773,25 +873,37 @@
}
#undef FUNCNAME
-#define FUNCNAME reg_zcp_mem_success_handler
+#define FUNCNAME reg_zcp_reg_sreq_bind_handler
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int reg_zcp_mem_success_handler(MPIU_EXOVERLAPPED *send_ov)
+int reg_zcp_reg_sreq_bind_handler(MPIU_EXOVERLAPPED *send_ov)
{
int mpi_errno = MPI_SUCCESS;
MPID_Nem_nd_conn_hnd_t conn_hnd;
MPID_Nem_nd_msg_result_t *pmsg_result;
MPID_Request *zcp_req = NULL;
HRESULT hr;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_REG_ZCP_MEM_SUCCESS_HANDLER);
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_REG_ZCP_REG_SREQ_BIND_HANDLER);
MPIU_CHKPMEM_DECL(1);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_REG_ZCP_MEM_SUCCESS_HANDLER);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_REG_ZCP_REG_SREQ_BIND_HANDLER);
conn_hnd = GET_CONNHND_FROM_EX_SEND_OV(send_ov);
MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd));
- MPIU_CHKPMEM_MALLOC(pmsg_result, MPID_Nem_nd_msg_result_t *, sizeof(MPID_Nem_nd_msg_result_t ), mpi_errno, "cr_mem_win result");
+ zcp_req = MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(conn_hnd->vc);
+ MPIU_Assert(zcp_req != NULL);
+
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "About to bind IOV[%d] = %p; iov_offset = %d/tot=%d, req=%p",
+ conn_hnd->zcp_send_offset, zcp_req->dev.iov[conn_hnd->zcp_send_offset].MPID_IOV_BUF,
+ zcp_req->dev.iov_offset, zcp_req->dev.iov_count, zcp_req));
+ MPIU_Assert(zcp_req->dev.iov_offset + zcp_req->dev.iov_count <= MPID_IOV_LIMIT);
+ MPIU_Assert(zcp_req->dev.iov_count > 0);
+
+ /* Create Memory Window for sending data */
+ MPIU_CHKPMEM_MALLOC(pmsg_result, MPID_Nem_nd_msg_result_t *,
+ sizeof(MPID_Nem_nd_msg_result_t ), mpi_errno, "cr_mem_win result");
+
INIT_MSGRESULT(pmsg_result, free_msg_result_handler, free_msg_result_handler);
hr = MPID_Nem_nd_dev_hnd_g->p_ad->CreateMemoryWindow(&(pmsg_result->result), &(conn_hnd->zcp_send_mw));
@@ -801,96 +913,202 @@
MPIU_CHKPMEM_COMMIT();
- zcp_req = MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(conn_hnd->vc);
- MPIU_Assert(zcp_req != NULL);
-
+ /* Initialize the MW descriptor to be sent in the ND message */
conn_hnd->zcp_msg_send_mw.mw_data.Base = 0;
conn_hnd->zcp_msg_send_mw.mw_data.Length = 0;
conn_hnd->zcp_msg_send_mw.mw_data.Token = 0;
- /* MW created, mem registered, now bind the buffer */
- /* FIXME: Do we need a read fence ? */
+
INIT_MSGRESULT(&(conn_hnd->zcp_send_result), bind_mw_success_handler, gen_send_fail_handler);
- hr = conn_hnd->p_ep->Bind(&(conn_hnd->zcp_send_result.result), conn_hnd->zcp_send_mr_hnd,
- conn_hnd->zcp_send_mw, zcp_req->dev.iov[1].MPID_IOV_BUF,
- zcp_req->dev.iov[1].MPID_IOV_LEN, ND_OP_FLAG_ALLOW_READ, &(conn_hnd->zcp_msg_send_mw.mw_data));
+
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Binding IOV[%d/tot=%d]=[%p/%u], dev_off=%d, mr=%x, mw=%p, conn=%p",
+ conn_hnd->zcp_send_offset,
+ zcp_req->dev.iov_count,
+ conn_hnd->zcp_send_sge.pAddr,
+ conn_hnd->zcp_send_sge.Length,
+ zcp_req->dev.iov_offset,
+ conn_hnd->zcp_send_mr_hnd,
+ conn_hnd->zcp_send_mw, conn_hnd));
+
+ hr = conn_hnd->p_ep->Bind(&(conn_hnd->zcp_send_result.result),
+ conn_hnd->zcp_send_mr_hnd,
+ conn_hnd->zcp_send_mw,
+ conn_hnd->zcp_send_sge.pAddr,
+ conn_hnd->zcp_send_sge.Length,
+ (ND_OP_FLAG_READ_FENCE | ND_OP_FLAG_ALLOW_READ),
+ &(conn_hnd->zcp_msg_send_mw.mw_data));
+
MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
mpi_errno, MPI_ERR_OTHER, "**nd_write", "**nd_write %s %d",
_com_error(hr).ErrorMessage(), hr);
-
+
fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_REG_ZCP_MEM_SUCCESS_HANDLER);
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_REG_ZCP_REG_SREQ_BIND_HANDLER);
return mpi_errno;
fn_fail:
MPIU_CHKPMEM_REAP();
MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
goto fn_exit;
}
-/*
+
+/* Register mem for sreq */
#undef FUNCNAME
-#define FUNCNAME create_mw_success_handler
+#define FUNCNAME MPID_Nem_nd_zcp_reg_smem
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int create_mw_success_handler(MPID_Nem_nd_msg_result_t *pmsg_result)
+int MPID_Nem_nd_zcp_reg_smem(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_IOV *iovp, int iov_offset)
{
int mpi_errno = MPI_SUCCESS;
- MPID_Nem_nd_conn_hnd_t conn_hnd;
- MPID_Request *zcp_req = NULL;
HRESULT hr;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_CREATE_MW_SUCCESS_HANDLER);
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_ZCP_REG_SMEM);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_CREATE_MW_SUCCESS_HANDLER);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_ZCP_REG_SMEM);
- conn_hnd = GET_CONNHND_FROM_ZCP_MSGRESULT(pmsg_result);
- MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd));
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Registering IOV[%d]=%p/%u (conn=%p)",
+ iov_offset, iovp[iov_offset].MPID_IOV_BUF, iovp[iov_offset].MPID_IOV_LEN, conn_hnd));
- / The request at the tail of the posted queue should contain
- * the buffer
- /
- zcp_req = MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(conn_hnd->vc);
- MPIU_Assert(zcp_req != NULL);
+ /* Keep track of the zcp send offset */
+ conn_hnd->zcp_send_offset = iov_offset;
- SET_EX_WR_HANDLER(conn_hnd, reg_zcp_mem_success_handler, gen_ex_fail_handler);
- hr = MPID_Nem_nd_dev_hnd_g->p_ad->RegisterMemory(zcp_req->dev.iov[1].MPID_IOV_BUF,
- zcp_req->dev.iov[1].MPID_IOV_LEN, MPIU_EX_GET_OVERLAPPED_PTR(&(conn_hnd->send_ov)),
- &(conn_hnd->zcp_mr_hnd));
+ conn_hnd->zcp_send_sge.hMr = NULL;
+ if(iovp[iov_offset].MPID_IOV_LEN > MPID_NEM_ND_DEV_IO_LIMIT(MPID_Nem_nd_dev_hnd_g)){
+ conn_hnd->zcp_send_sge.pAddr = iovp[iov_offset].MPID_IOV_BUF;
+ conn_hnd->zcp_send_sge.Length = MPID_NEM_ND_DEV_IO_LIMIT(MPID_Nem_nd_dev_hnd_g);
+
+ iovp[iov_offset].MPID_IOV_BUF += MPID_NEM_ND_DEV_IO_LIMIT(MPID_Nem_nd_dev_hnd_g);
+ iovp[iov_offset].MPID_IOV_LEN -= MPID_NEM_ND_DEV_IO_LIMIT(MPID_Nem_nd_dev_hnd_g);
+ }
+ else{
+ conn_hnd->zcp_send_sge.pAddr = iovp[iov_offset].MPID_IOV_BUF;
+ conn_hnd->zcp_send_sge.Length = iovp[iov_offset].MPID_IOV_LEN;
+
+ iovp[iov_offset].MPID_IOV_LEN = 0;
+ }
+
+ MPIU_Assert(!conn_hnd->zcp_in_progress);
+ SET_EX_WR_HANDLER(conn_hnd, reg_zcp_reg_sreq_bind_handler, gen_ex_fail_handler);
+
+ memset(&(conn_hnd->zcp_send_mr_hnd), 0x0, sizeof(conn_hnd->zcp_send_mr_hnd));
+
+ /* Register buffer */
+ hr = MPID_Nem_nd_dev_hnd_g->p_ad->RegisterMemory(
+ conn_hnd->zcp_send_sge.pAddr,
+ conn_hnd->zcp_send_sge.Length,
+ MPIU_EX_GET_OVERLAPPED_PTR(&(conn_hnd->send_ov)),
+ &(conn_hnd->zcp_send_mr_hnd));
MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
mpi_errno, MPI_ERR_OTHER, "**nd_write", "**nd_write %s %d",
_com_error(hr).ErrorMessage(), hr);
fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_CREATE_MW_SUCCESS_HANDLER);
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_ZCP_REG_SMEM);
return mpi_errno;
fn_fail:
MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
goto fn_exit;
}
-*/
+
+/* Both start and end offsets returned are valid offsets */
+static inline MPID_Nem_nd_pack_t nd_pack_iov_get_params(MPID_IOV *iovp, int start, int *end){
+ /* Walk the IOV list from offset 'start', consuming the per-connection
+  * scratch-buffer budget (MPID_NEM_ND_CONN_UDATA_SZ) until an IOV no longer
+  * fits or *end is reached.
+  * On entry *end is one PAST the last candidate offset (exclusive); on
+  * return *end is the last offset actually packable (INCLUSIVE), and the
+  * return value selects send/recv packing (SR) vs zero-copy (ZCP) packing.
+  */
+ u_long rem_len = MPID_NEM_ND_CONN_UDATA_SZ;
+ int i;
+ for(i = start; (i < *end) && (rem_len >= iovp[i].MPID_IOV_LEN); i++){
+ rem_len -= iovp[i].MPID_IOV_LEN;
+ }
+ if(i == *end){
+ /* All IOVs can be packed */
+ *end = i - 1;
+ return MPID_NEM_ND_SR_PACK;
+ }
+ else if(iovp[i].MPID_IOV_LEN <= MPID_NEM_ND_CONN_UDATA_SZ){
+ /* IOV i would fit in an empty buffer but not in the space left here, so
+  * stop before it and SR-pack offsets [start, i-1].
+  * NOTE(review): i-1 >= start appears guaranteed, because the very first
+  * loop iteration can only fail when iovp[start].MPID_IOV_LEN exceeds
+  * MPID_NEM_ND_CONN_UDATA_SZ, which takes the ZCP branch below instead
+  * -- confirm with callers.
+  */
+ *end = i - 1;
+ return MPID_NEM_ND_SR_PACK;
+ }
+ else{
+ /* One more IOV can be packed using ZCP packing */
+ *end = i;
+ return MPID_NEM_ND_ZCP_PACK;
+ }
+}
+
+/* Input:
+ * offset_start => the start req offset for packing
+ * offset_endp => the max req offset for packing, (offset_endp-1) is the max valid offset
+ * Output:
+ * offset_endp => Used to return the final req offset packed
+ * pack_typep => Used to return the packing type
+ * nbp => Used to return bytes packed
+ * Return: MPI_SUCCESS or an MPI error code. For SR packing the IOV data
+ * is memcpy'd into pmsg->buf and *nbp is the byte count; for ZCP packing
+ * pmsg is returned to the free queue, the send buffer is registered for
+ * zero-copy via MPID_Nem_nd_zcp_reg_smem(), and *nbp is 0.
+ */
#undef FUNCNAME
-#define FUNCNAME MPID_Nem_nd_start_zcp
+#define FUNCNAME MPID_Nem_nd_pack_iov
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_start_zcp(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_IOV *iov, int n_iov)
-{
+int MPID_Nem_nd_pack_iov(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_IOV *iovp,
+ int offset_start,
+ int *offset_endp,
+ MPID_Nem_nd_msg_t *pmsg,
+ MPID_Nem_nd_pack_t *pack_typep,
+ SIZE_T *nbp){
int mpi_errno = MPI_SUCCESS;
- HRESULT hr;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_START_ZCP);
+ int ret_errno;
+ int off;
+ char *p;
+ SIZE_T rem_len;
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_PACK_IOV);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_START_ZCP);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_PACK_IOV);
- /* Register buffer */
- /* FIXME: We only register 1 IOV for now */
- MPIU_Assert(n_iov == 1);
+ MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd));
+ MPIU_Assert(iovp != NULL);
+ MPIU_Assert(offset_endp != NULL);
+ MPIU_Assert(offset_start >= 0);
+ MPIU_Assert(offset_start <= *offset_endp);
+ MPIU_Assert(pack_typep != NULL);
+ MPIU_Assert(pmsg != NULL);
+ MPIU_Assert(nbp != NULL);
- SET_EX_WR_HANDLER(conn_hnd, reg_zcp_mem_success_handler, gen_ex_fail_handler);
- hr = MPID_Nem_nd_dev_hnd_g->p_ad->RegisterMemory(iov[0].MPID_IOV_BUF,
- iov[0].MPID_IOV_LEN, MPIU_EX_GET_OVERLAPPED_PTR(&(conn_hnd->send_ov)),
- &(conn_hnd->zcp_send_mr_hnd));
- MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
- mpi_errno, MPI_ERR_OTHER, "**nd_write", "**nd_write %s %d",
- _com_error(hr).ErrorMessage(), hr);
+ /* Determine the pack type and the last IOV offset that will fit */
+ off = *offset_endp;
+ *pack_typep = nd_pack_iov_get_params(iovp, offset_start, &off);
+ MPIU_Assert(off < *offset_endp);
+ *offset_endp = off;
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "PACK_TYPE=%d - [%d, %d]", *pack_typep, offset_start, *offset_endp));
+ p = pmsg->buf;
+ rem_len = MPID_NEM_ND_CONN_UDATA_SZ;
+
+ if(*pack_typep == MPID_NEM_ND_SR_PACK){
+ /* Note that both start and end offsets returned by nd_pack_iov_get_params()
+ * are valid/packable offsets
+ */
+ for(off = offset_start; off <= *offset_endp; off++){
+ MPIU_Assert(rem_len >= iovp[off].MPID_IOV_LEN);
+ ret_errno = memcpy_s((void *)p, rem_len,
+ iovp[off].MPID_IOV_BUF, iovp[off].MPID_IOV_LEN);
+ MPIU_ERR_CHKANDJUMP2((ret_errno != 0), mpi_errno, MPI_ERR_OTHER,
+ "**nd_write", "**nd_write %s %d", strerror(ret_errno), ret_errno);
+
+ p += iovp[off].MPID_IOV_LEN;
+ rem_len -= iovp[off].MPID_IOV_LEN;
+ }
+ *nbp = MPID_NEM_ND_CONN_UDATA_SZ - rem_len;
+ }
+ else if(*pack_typep == MPID_NEM_ND_ZCP_PACK){
+ /* We are not going to use this msg right now */
+ MSGBUF_FREEQ_ENQUEUE(conn_hnd, pmsg);
+
+ /* Flag a device-wide pending zero-copy op; the registration
+  * completion handler presumably continues the send - confirm */
+ MPID_Nem_nd_dev_hnd_g->zcp_pending = 1;
+
+ mpi_errno = MPID_Nem_nd_zcp_reg_smem(conn_hnd, iovp, *offset_endp);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+ *nbp = 0;
+ }
+ else{
+ /* Unrecognized packing type */
+ MPIU_Assert(0);
+ }
+
fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_START_ZCP);
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_PACK_IOV);
return mpi_errno;
fn_fail:
MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
@@ -898,114 +1116,121 @@
}
#undef FUNCNAME
+#define FUNCNAME MPID_Nem_nd_post_sendbv
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+/* Post a send for sreqp while blocking further sends on this conn:
+ * marks the conn send_in_progress and moves its send credits into
+ * cache_credits (zeroing send_credits) before delegating to
+ * MPID_Nem_nd_post_sendv().
+ * NOTE(review): "bv" presumably = blocking vector send (commit log
+ * mentions blocking-send support) - confirm against callers. */
+int MPID_Nem_nd_post_sendbv(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_Request *sreqp)
+{
+ int mpi_errno = MPI_SUCCESS;
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_POST_SENDBV);
+
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_POST_SENDBV);
+
+ MPIU_Assert(!conn_hnd->send_in_progress);
+
+ conn_hnd->send_in_progress = 1;
+ conn_hnd->cache_credits = conn_hnd->send_credits;
+ conn_hnd->send_credits = 0;
+
+ mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, sreqp);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_POST_SENDBV);
+ return mpi_errno;
+ fn_fail:
+ MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+ goto fn_exit;
+}
+
+#undef FUNCNAME
#define FUNCNAME MPID_Nem_nd_post_sendv
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_post_sendv(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_IOV *iov, int n_iov)
+/* Pack as many of sreqp's remaining IOVs (from dev.iov_offset, for
+ * dev.iov_count entries) as possible into a free msg buffer via
+ * MPID_Nem_nd_pack_iov() and post the send.
+ * SR pack: sends the packed bytes, then advances iov_offset/iov_count;
+ * if IOVs remain, marks the conn send_in_progress and withholds credits
+ * (moved to cache_credits) so subsequent sends are queued.
+ * ZCP pack: sets zcp_in_progress and likewise blocks further sends;
+ * the progress-engine ZCP handlers complete the transfer. */
+int MPID_Nem_nd_post_sendv(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_Request *sreqp)
{
int mpi_errno = MPI_SUCCESS;
- errno_t ret_errno;
- char *p;
MPID_Nem_nd_msg_t *pmsg;
- int i, rem_len = 0, msg_len = 0, tot_len = 0;
+ SIZE_T msg_len = 0, nb;
+ MPID_Nem_nd_pack_t pack_type;
+ int offset_end, i;
+
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_POST_SENDV);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_POST_SENDV);
- if(!conn_hnd->zcp_in_progress){
- int start_zcp=0;
- tot_len = 0;
- for(i=0; i<n_iov; i++){
- tot_len += iov[i].MPID_IOV_LEN;
- if(tot_len > MPID_NEM_ND_CONN_UDATA_SZ) {
- start_zcp = 1;
- break;
- }
- }
- if(!start_zcp){
- /* Get a msgbuf - pack the iovs into it and send it */
- MPIU_Assert(MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd));
- MSGBUF_FREEQ_DEQUEUE(conn_hnd, pmsg);
- MPIU_Assert(pmsg != NULL);
- SET_MSGBUF_HANDLER(pmsg, send_success_handler, gen_send_fail_handler);
+ /* If credits are available no blocking send may be in flight */
+ MPIU_Assert((conn_hnd->send_credits > 0) ? (!conn_hnd->send_in_progress) : 1);
- pmsg->hdr.type = MPID_NEM_ND_DATA_PKT;
- pmsg->hdr.credits = 0;
- p = pmsg->buf;
- rem_len = MPID_NEM_ND_CONN_UDATA_SZ;
- msg_len = sizeof(MPID_Nem_nd_msg_hdr_t );
+ MSGBUF_FREEQ_DEQUEUE(conn_hnd, pmsg);
+ MPIU_Assert(pmsg != NULL);
+ SET_MSGBUF_HANDLER(pmsg, send_success_handler, gen_send_fail_handler);
- for(i=0; i<n_iov; i++){
- int iov_len = iov[i].MPID_IOV_LEN;
- /* rem_len is never less than iov_len */
- MPIU_Assert(rem_len >= iov_len);
- /* Copy the whole iov to the msg buffer */
- ret_errno = memcpy_s((void *)p, rem_len, iov[i].MPID_IOV_BUF, iov_len);
- MPIU_ERR_CHKANDJUMP2((ret_errno != 0), mpi_errno, MPI_ERR_OTHER,
- "**nd_write", "**nd_write %s %d", strerror(ret_errno), ret_errno);
- p += iov_len;
- rem_len -= iov_len;
- msg_len += iov_len;
- }
- MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Sending msg packet of size %d (msg type=%d)", msg_len, pmsg->hdr.type));
- MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Sending udata packet of type = %d", ((MPIDI_CH3_Pkt_t *)(&(pmsg->buf)))->type));
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Posting send on [conn = %p] for ", conn_hnd));
- mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, pmsg, msg_len, 0);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- }
- else{ /* start_zcp */
- conn_hnd->zcp_in_progress = 1;
- /* Don't send data till the zcpy is over */
- conn_hnd->zcp_credits = conn_hnd->send_credits;
- conn_hnd->send_credits = 0;
+ for(i=0; i<sreqp->dev.iov_count; i++){
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "IOV[%d] = {%p, %u}",
+ sreqp->dev.iov_offset + i,
+ sreqp->dev.iov[sreqp->dev.iov_offset + i].MPID_IOV_BUF,
+ sreqp->dev.iov[sreqp->dev.iov_offset + i].MPID_IOV_LEN));
+ }
- /* FIXME: Only handling 1 IOV now */
- mpi_errno = MPID_Nem_nd_start_zcp(conn_hnd, &(iov[1]), 1);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ offset_end = sreqp->dev.iov_offset + sreqp->dev.iov_count;
+ mpi_errno = MPID_Nem_nd_pack_iov(conn_hnd, sreqp->dev.iov,
+ sreqp->dev.iov_offset, &offset_end, pmsg, &pack_type, &nb);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+ if(pack_type == MPID_NEM_ND_SR_PACK){
+ pmsg->hdr.type = MPID_NEM_ND_DATA_PKT;
+ pmsg->hdr.credits = 0;
+ msg_len = sizeof(MPID_Nem_nd_msg_hdr_t ) + nb;
+
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "SR PACKING: Sending msg packet of size %d (msg type=%d)", msg_len, pmsg->hdr.type));
+ if(sreqp->dev.iov_offset == 0){
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Sending udata packet of type = %d", ((MPIDI_CH3_Pkt_t *)(&(pmsg->buf)))->type));
}
- }
- else{ /* zcopy in progress */
- MPID_Nem_nd_msg_mw_t msg_mw;
- /* zcopy init should be complete by now - send hdr and MPID_Nem_nd_msg_mw_t
- * related to the data.
- */
- MPIU_Assert(!MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd));
- MSGBUF_FREEQ_DEQUEUE(conn_hnd, pmsg);
- SET_MSGBUF_HANDLER(pmsg, zcp_mw_send_success_handler, gen_send_fail_handler);
- MPIU_Assert(pmsg != NULL);
- /* FIXME: Support more than 2 iovs */
- MPIU_Assert(n_iov == 2);
+ else{
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Contd to send data on req=%p [iov=%d/tot=%d]",
+ sreqp, sreqp->dev.iov_offset, sreqp->dev.iov_count));
+ }
- pmsg->hdr.type = MPID_NEM_ND_RD_AVAIL_PKT;
- pmsg->hdr.credits = 0;
- msg_len = sizeof(MPID_Nem_nd_msg_hdr_t );
- p = pmsg->buf;
- rem_len = MPID_NEM_ND_CONN_UDATA_SZ;
- /* Try to copy the first IOV to the msg packet */
- if(iov[0].MPID_IOV_LEN <= rem_len){
- ret_errno = memcpy_s((void *)p, rem_len, iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
- MPIU_ERR_CHKANDJUMP2((ret_errno != 0), mpi_errno, MPI_ERR_OTHER,
- "**nd_write", "**nd_write %s %d", strerror(ret_errno), ret_errno);
- p += iov[0].MPID_IOV_LEN;
- rem_len -= iov[0].MPID_IOV_LEN;
- msg_len += iov[0].MPID_IOV_LEN;
+ mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, pmsg, msg_len, 0);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+ /* Packing always consumes whole IOVs */
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "SR pack : sreq(%p) off %d -> %d", sreqp, sreqp->dev.iov_offset, sreqp->dev.iov_offset + 1));
+
+ sreqp->dev.iov_count -= (offset_end - sreqp->dev.iov_offset + 1);
+ sreqp->dev.iov_offset = offset_end + 1;
+ if(sreqp->dev.iov_count > 0){
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Could not pack all IOVs (rem = %d iovs)- SEND_IN_PROGRESS...", sreqp->dev.iov_count);
+ if(!conn_hnd->send_in_progress){
+ /* Could not pack all IOVs - Block all subsequent sends */
+ conn_hnd->send_in_progress = 1;
+ /* Queue data till the zcpy is over - keep track of the send credits */
+ conn_hnd->cache_credits = conn_hnd->send_credits;
+ conn_hnd->send_credits = 0;
+ }
}
- /* We are guaranteed to have enough space for the MW descriptors */
- MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Sending mem descriptor (buf=%p) : base = %p, length=%I64d, token=%d\n",
- iov[1].MPID_IOV_BUF,
- _byteswap_uint64(conn_hnd->zcp_msg_send_mw.mw_data.Base), _byteswap_uint64(conn_hnd->zcp_msg_send_mw.mw_data.Length),
- conn_hnd->zcp_msg_send_mw.mw_data.Token));
+ else{
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Finished posting sends for all IOVs...");
+ }
+ }
+ else if(pack_type == MPID_NEM_ND_ZCP_PACK){
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "ZCP PACKING - SEND_IN_PROGRESS...");
+ conn_hnd->zcp_in_progress = 1;
+ if(!conn_hnd->send_in_progress){
+ /* The progress engine ZCP handlers will send the data */
+ conn_hnd->send_in_progress = 1;
+ /* Queue data till the zcpy is over - keep track of the send credits */
+ conn_hnd->cache_credits = conn_hnd->send_credits;
+ conn_hnd->send_credits = 0;
+ }
+ }
+ else{
+ /* Unrecognized pack type */
+ MPIU_Assert(0);
+ }
- ret_errno = memcpy_s((void *)p, sizeof(MPID_Nem_nd_msg_mw_t ), &(conn_hnd->zcp_msg_send_mw), sizeof(MPID_Nem_nd_msg_mw_t ));
- MPIU_ERR_CHKANDJUMP2((ret_errno != 0), mpi_errno, MPI_ERR_OTHER,
- "**nd_write", "**nd_write %s %d", strerror(ret_errno), ret_errno);
- /* FIXME: Add mw for other iovs */
- p += sizeof(MPID_Nem_nd_msg_mw_t );
- msg_len += sizeof(MPID_Nem_nd_msg_mw_t );
-
- mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, pmsg, msg_len, 0);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- }
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_POST_SENDV);
return mpi_errno;
@@ -1024,7 +1249,7 @@
int mpi_errno = MPI_SUCCESS;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_PROCESS_COMPLETIONS);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_PROCESS_COMPLETIONS);
+ /* MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_PROCESS_COMPLETIONS); */
MPIU_Assert(pcq != NULL);
MPIU_Assert(pstatus != NULL);
@@ -1049,9 +1274,7 @@
hr = nd_results[0]->Status;
pmsg_result = GET_MSGRESULT_FROM_NDRESULT(nd_results[0]);
MPIU_Assert(pmsg_result != NULL);
- /* FIXME: REMOVE ME !! -start */
MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Got something on %p", GET_MSGBUF_FROM_MSGRESULT(pmsg_result)));
- /* FIXME: REMOVE ME !! -end */
if(hr == ND_SUCCESS){
handler_fn = pmsg_result->succ_fn;
}
@@ -1062,7 +1285,7 @@
return handler_fn(pmsg_result);
}
fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_PROCESS_COMPLETIONS);
+ /* MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_PROCESS_COMPLETIONS); */
return mpi_errno;
fn_fail:
MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
@@ -1171,6 +1394,42 @@
}
#undef FUNCNAME
+#define FUNCNAME quiescent_msg_handler
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+/* Msg-buffer completion handler used while a conn is quiescing: returns
+ * the msg buffer to the free queue and then disconnects the conn via
+ * MPID_Nem_nd_conn_disc(). Returns MPI_SUCCESS or an MPI error code. */
+static int quiescent_msg_handler(MPID_Nem_nd_msg_result_t *result)
+{
+ int mpi_errno = MPI_SUCCESS;
+ HRESULT hr; /* NOTE(review): unused in this handler */
+ MPID_Nem_nd_msg_t *pmsg;
+ MPID_Nem_nd_conn_hnd_t conn_hnd;
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_QUIESCENT_MSG_HANDLER);
+
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_QUIESCENT_MSG_HANDLER);
+
+ pmsg = GET_MSGBUF_FROM_MSGRESULT(result);
+ MPIU_Assert(pmsg != NULL);
+
+ conn_hnd = GET_CONNHND_FROM_MSGBUF(pmsg);
+ MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd));
+
+ MSGBUF_FREEQ_ENQUEUE(conn_hnd, pmsg);
+
+ /* A pending op completed on this conn - rd/wr - go ahead and
+ * disconnect the conn
+ */
+ mpi_errno = MPID_Nem_nd_conn_disc(conn_hnd);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_QUIESCENT_MSG_HANDLER);
+ return mpi_errno;
+ fn_fail:
+ MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+ goto fn_exit;
+}
+
+#undef FUNCNAME
#define FUNCNAME free_msg_result_handler
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
@@ -1191,18 +1450,626 @@
}
#undef FUNCNAME
+#define FUNCNAME trim_nd_sge
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+/* Advance the ND_SGE list past nb consumed bytes: fully-consumed entries
+ * are dropped by bumping *nd_sge_offset and decrementing *nd_sge_count.
+ * Partial consumption of an entry is not expected (asserted below). */
+static inline void trim_nd_sge(ND_SGE *nd_sge, int *nd_sge_count, int *nd_sge_offset, SIZE_T nb)
+{
+ ND_SGE *nd_sge_p;
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_TRIM_ND_SGE);
+
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_TRIM_ND_SGE);
+ MPIU_Assert(nd_sge != NULL);
+ MPIU_Assert(nd_sge_count != NULL);
+ MPIU_Assert(*nd_sge_count > 0);
+ MPIU_Assert(nd_sge_offset != NULL);
+ MPIU_Assert(*nd_sge_offset < MPID_IOV_LIMIT);
+ MPIU_Assert(nb >= 0); /* NOTE(review): SIZE_T is unsigned - always true */
+
+ nd_sge_p = &(nd_sge[*nd_sge_offset]);
+ while(nb){
+ MPIU_Assert(*nd_sge_count > 0);
+ if(nb < nd_sge_p->Length){
+ /* We never read partial nd_sges */
+ MPIU_Assert(0);
+ /* Fallback for non-assert builds: trim the entry in place */
+ nd_sge_p->pAddr = (char *)(nd_sge_p->pAddr) + nb;
+ nd_sge_p->Length -= nb;
+ nb = 0;
+ }
+ else{
+ /* Entry fully consumed - drop it and continue */
+ *nd_sge_count -= 1;
+ *nd_sge_offset += 1;
+ nb -= nd_sge_p->Length;
+ nd_sge_p++;
+ }
+ }
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_TRIM_ND_SGE);
+}
+
+/* The function modifies ND_MW - don't use it if you need to use the MW again (eg: invalidate).
+ * Advances the MW descriptor list past nb consumed bytes, analogous to
+ * trim_nd_sge(). Base/Length fields are stored byte-swapped (the code
+ * converts with _byteswap_uint64 before and after arithmetic);
+ * a partially consumed descriptor is trimmed in place. */
+#undef FUNCNAME
+#define FUNCNAME trim_nd_mw
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline void trim_nd_mw(ND_MW_DESCRIPTOR *nd_mw, int *nd_mw_count, int *nd_mw_offset, SIZE_T nb)
+{
+ ND_MW_DESCRIPTOR *nd_mw_p;
+ uint64_t len = 0;
+ uint64_t base = 0;
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_TRIM_ND_MW);
+
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_TRIM_ND_MW);
+
+ MPIU_Assert(nd_mw != NULL);
+ MPIU_Assert(nd_mw_count != NULL);
+ MPIU_Assert(*nd_mw_count > 0);
+ MPIU_Assert(nd_mw_offset != NULL);
+ MPIU_Assert(*nd_mw_offset < MPID_IOV_LIMIT);
+ MPIU_Assert(nb >= 0); /* NOTE(review): SIZE_T is unsigned - always true */
+
+ nd_mw_p = &(nd_mw[*nd_mw_offset]);
+ while(nb){
+ MPIU_Assert(*nd_mw_count > 0);
+ len = _byteswap_uint64(nd_mw_p->Length);
+
+ if(nb < len){
+ /* Partial consumption: advance Base and shrink Length in place */
+ base = _byteswap_uint64(nd_mw_p->Base);
+ nd_mw_p->Base = _byteswap_uint64(base + nb);
+ len -= nb;
+ nd_mw_p->Length = _byteswap_uint64(len);
+ nb = 0;
+ }
+ else{
+ /* Descriptor fully consumed - drop it (zero length) and continue */
+ *nd_mw_count -= 1;
+ *nd_mw_offset += 1;
+ nb -= len;
+ nd_mw_p->Length = 0;
+ nd_mw_p++;
+ }
+ }
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_TRIM_ND_MW);
+}
+
+/* Copies up to nb bytes from buf into the iov array (starting at
+ * *offset_p, for *n_iov_p entries), trimming the iov list as it goes,
+ * assuming nb bytes are transferred.
+ * Side-effect : *n_iov_p, *offset_p, buf & len of partially filled iov
+ * entries are modified by this function.
+ * Returns the number of bytes copied (may be < nb when the iov list
+ * drains before the buffer is exhausted).
+ */
+#undef FUNCNAME
+#define FUNCNAME copy_and_trim_iov
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static SIZE_T copy_and_trim_iov(MPID_IOV *iov, int *n_iov_p, int *offset_p, char *buf, SIZE_T nb)
+{
+ MPID_IOV *cur_iov;
+ int cur_n_iov, cur_offset;
+ SIZE_T buflen;
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_COPY_AND_TRIM_IOV);
+
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_COPY_AND_TRIM_IOV);
+
+ MPIU_Assert(iov != NULL);
+ MPIU_Assert(n_iov_p);
+ MPIU_Assert((*n_iov_p) > 0);
+ MPIU_Assert(offset_p != NULL);
+ MPIU_Assert((*offset_p >= 0) && (*offset_p < MPID_IOV_LIMIT));
+ MPIU_Assert(buf != NULL);
+
+ cur_n_iov = *n_iov_p;
+ cur_offset = *offset_p;
+ cur_iov = &(iov[cur_offset]);
+
+ /* Remember the original byte count so the copied total can be derived */
+ buflen = nb;
+
+ while(nb > 0){
+ if(nb < cur_iov->MPID_IOV_LEN){
+ /* Buffer ends inside this iov - fill it partially and advance
+  * the iov's buf/len in place */
+ memcpy_s(cur_iov->MPID_IOV_BUF, cur_iov->MPID_IOV_LEN, buf, nb);
+ buf += nb;
+ cur_iov->MPID_IOV_BUF += nb;
+ cur_iov->MPID_IOV_LEN -= nb;
+ nb = 0;
+ }
+ else{
+ /* This iov is fully satisfied - consume it and move on */
+ memcpy_s(cur_iov->MPID_IOV_BUF, cur_iov->MPID_IOV_LEN, buf, cur_iov->MPID_IOV_LEN);
+ buf += cur_iov->MPID_IOV_LEN;
+ nb -= cur_iov->MPID_IOV_LEN;
+ cur_iov->MPID_IOV_LEN = 0;
+ cur_n_iov--;
+ cur_offset++;
+ if(cur_n_iov > 0){
+ cur_iov++;
+ }
+ else{
+ /* More data available in the buffer than can be copied
+ * The return value indicates the number of bytes copied
+ */
+ break;
+ }
+ }
+ }
+
+ *n_iov_p = cur_n_iov;
+ *offset_p = cur_offset;
+
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_COPY_AND_TRIM_IOV);
+ /* Bytes processed/trimmed */
+ return (buflen - nb);
+ fn_fail: /* NOTE(review): label unreachable - nothing jumps here */
+ goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPID_Nem_nd_handle_recv_req
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+/* Finish handling a request whose IOV list has been fully satisfied:
+ * invoke its OnDataAvail handler if present (the handler may reload the
+ * IOV list, in which case iov_offset is reset to 0), otherwise complete
+ * the request directly. *req_complete is set to 1 iff the request
+ * finished. */
+static inline int MPID_Nem_nd_handle_recv_req(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_Request *rreqp, int *req_complete)
+{
+ int mpi_errno = MPI_SUCCESS;
+ int (*req_fn)(MPIDI_VC_t *, MPID_Request *, int *);
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_HANDLE_RECV_REQ);
+
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_HANDLE_RECV_REQ);
+
+ MPIU_Assert(rreqp != NULL);
+ MPIU_Assert(req_complete != NULL);
+
+ *req_complete = 0;
+
+ req_fn = rreqp->dev.OnDataAvail;
+ if(req_fn){
+ mpi_errno = req_fn(conn_hnd->vc, rreqp, req_complete);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ if (*req_complete){
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
+ }
+ else{
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... Not complete");
+ /* Handler reloaded the IOV list - start from its first entry */
+ rreqp->dev.iov_offset = 0;
+ }
+ }
+ else{
+ /* No handler: the request is done once its IOVs are filled */
+ MPIDI_CH3U_Request_complete(rreqp);
+ *req_complete = 1;
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
+ }
+
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_HANDLE_RECV_REQ);
+ return mpi_errno;
+ fn_fail:
+ MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+ goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME nd_read_progress_update
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+/* Feed up to *pnb bytes from buf into rreq's IOV list, calling
+ * MPID_Nem_nd_handle_recv_req() each time the IOV list drains (the
+ * request handler may reload the IOVs, so the loop continues until the
+ * buffer is exhausted or the request completes).
+ * On return *pnb holds the number of bytes actually consumed. */
+static int nd_read_progress_update(MPID_Nem_nd_conn_hnd_t conn_hnd, MPID_Request *rreq, char *buf, SIZE_T *pnb, int *req_complete)
+{
+ int mpi_errno = MPI_SUCCESS;
+ SIZE_T buflen, nb;
+ MPID_IOV *iov;
+ int complete;
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_READ_PROGRESS_UPDATE);
+
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_READ_PROGRESS_UPDATE);
+
+ MPIU_Assert(rreq != NULL);
+ MPIU_Assert(buf != NULL);
+ MPIU_Assert(req_complete != NULL);
+ MPIU_Assert((pnb != NULL) && (*pnb > 0));
+
+ *req_complete = 0;
+ buflen = *pnb;
+ do{
+ *req_complete = 0;
+
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "trim rreq(%p) off %d/tot=%d",
+ rreq, rreq->dev.iov_offset, rreq->dev.iov_count));
+
+ if(rreq->dev.iov_count != 0){
+ iov = rreq->dev.iov;
+ nb = copy_and_trim_iov(iov, &(rreq->dev.iov_count), &(rreq->dev.iov_offset), buf, buflen);
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Copied %d bytes...[rem iovs = %d]", nb, rreq->dev.iov_count));
+
+ buf += nb;
+ buflen -= nb;
+ }
+
+ /* IOV list drained => this round of IOVs is complete; the request
+  * handler decides whether the whole request is done */
+ complete = (rreq->dev.iov_count == 0) ? 1 : 0;
+
+ if(complete){
+ mpi_errno = MPID_Nem_nd_handle_recv_req(conn_hnd, rreq, req_complete);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
+ }while((buflen > 0) && !(*req_complete));
+
+ /* Number of bytes processed/consumed */
+ *pnb -= buflen;
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_READ_PROGRESS_UPDATE);
+ return mpi_errno;
+ fn_fail:
+ MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+ goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPID_Nem_nd_zcp_recv_sge_reg
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+/* Register every entry of conn_hnd->zcp_recv_sge with the ND adapter
+ * so they can be used as local buffers for the zero-copy RDMA read.
+ * Each registration that is accepted is waited on via a blocking op
+ * handle (MPID_Nem_nd_sm_block). */
+static int MPID_Nem_nd_zcp_recv_sge_reg(MPID_Nem_nd_conn_hnd_t conn_hnd)
+{
+ int mpi_errno = MPI_SUCCESS;
+ HRESULT hr;
+ int i;
+ MPID_Nem_nd_block_op_hnd_t zcp_op_hnd;
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_ZCP_RECV_SGE_REG);
+
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_ZCP_RECV_SGE_REG);
+
+ MPIU_Assert(conn_hnd->zcp_recv_sge_count > 0);
+
+ mpi_errno = MPID_Nem_nd_block_op_init(&zcp_op_hnd,
+ conn_hnd->zcp_recv_sge_count,
+ conn_hnd, 1);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+ for(i=0; i<conn_hnd->zcp_recv_sge_count; i++){
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Registering sge [%d/%d]={%p/%I64d}",
+ i, conn_hnd->zcp_recv_sge_count,
+ conn_hnd->zcp_recv_sge[i].pAddr,
+ conn_hnd->zcp_recv_sge[i].Length));
+ hr = MPID_Nem_nd_dev_hnd_g->p_ad->RegisterMemory(conn_hnd->zcp_recv_sge[i].pAddr,
+ conn_hnd->zcp_recv_sge[i].Length,
+ MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(zcp_op_hnd),
+ &(conn_hnd->zcp_recv_sge[i].hMr));
+
+ if(SUCCEEDED(hr)){
+ /* Registration accepted - block until it completes */
+ conn_hnd->npending_ops++;
+ mpi_errno = MPID_Nem_nd_sm_block(zcp_op_hnd);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
+ /* FIXME: Change the error message - nd_mem_reg */
+ MPIU_ERR_CHKANDJUMP2(FAILED(hr),
+ mpi_errno, MPI_ERR_OTHER, "**nd_read", "**nd_read %s %d",
+ _com_error(hr).ErrorMessage(), hr);
+ }
+
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_ZCP_RECV_SGE_REG);
+ return mpi_errno;
+ fn_fail:
+ MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+ goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPID_Nem_nd_zcp_recv
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+/* Issue the zero-copy RDMA read: pull data described by the remote
+ * memory window (conn_hnd->zcp_msg_recv_mw) into the locally registered
+ * recv SGE list. Polls the state machine until fewer than two
+ * device-wide reads are pending, then posts INDEndpoint::Read with a
+ * read fence. NOTE(review): the limit of 2 outstanding reads is a magic
+ * constant - confirm whether it should be a named tunable. */
+static int MPID_Nem_nd_zcp_recv(MPID_Nem_nd_conn_hnd_t conn_hnd)
+{
+ int mpi_errno = MPI_SUCCESS;
+ HRESULT hr;
+ MPID_Nem_nd_msg_t *pzcp_msg;
+ int i;
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_ZCP_RECV);
+
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_ZCP_RECV);
+
+ /* A msg buf is guaranteed for RDMA read */
+ MSGBUF_FREEQ_DEQUEUE(conn_hnd, pzcp_msg);
+ MPIU_Assert(pzcp_msg != NULL);
+
+ SET_MSGBUF_HANDLER(pzcp_msg, zcp_read_success_handler, zcp_read_fail_handler);
+
+ /* Throttle: make progress until a device read slot is available */
+ while(MPID_Nem_nd_dev_hnd_g->npending_rds >= 2){
+ mpi_errno = MPID_Nem_nd_sm_poll(0);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
+
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "RDMA READ: Using remote mem descriptor : base = %p, length=%I64d, token=%d",
+ _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Base),
+ _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Length),
+ conn_hnd->zcp_msg_recv_mw.mw_data.Token));
+
+ {
+ SIZE_T len=0;
+ for(i=0; i<conn_hnd->zcp_recv_sge_count; i++){
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "RDMA READ: Using local sge [%d/%d] : pAddr = %p, length=%I64d, hMr =%x",
+ i, conn_hnd->zcp_recv_sge_count,
+ conn_hnd->zcp_recv_sge[i].pAddr,
+ conn_hnd->zcp_recv_sge[i].Length,
+ conn_hnd->zcp_recv_sge[i].hMr));
+ len += conn_hnd->zcp_recv_sge[i].Length;
+ }
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Performing RDMA READ for " MPIR_UPINT_FMT_DEC_SPEC "bytes", len));
+ }
+
+ hr = conn_hnd->p_ep->Read(GET_PNDRESULT_FROM_MSGBUF(pzcp_msg),
+ &(conn_hnd->zcp_recv_sge[0]),
+ conn_hnd->zcp_recv_sge_count,
+ &(conn_hnd->zcp_msg_recv_mw.mw_data),
+ 0, ND_OP_FLAG_READ_FENCE);
+ /* Account the outstanding read on both the device and the conn */
+ MPID_Nem_nd_dev_hnd_g->npending_rds++; conn_hnd->npending_rds++;
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "dev prds = %d; conn prds = %d",
+ MPID_Nem_nd_dev_hnd_g->npending_rds, conn_hnd->npending_rds));
+
+ MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
+ mpi_errno, MPI_ERR_OTHER, "**nd_read", "**nd_read %s %d",
+ _com_error(hr).ErrorMessage(), hr);
+
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_ZCP_RECV);
+ return mpi_errno;
+ fn_fail:
+ MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+ goto fn_exit;
+}
+
+
+/* Function assumes that conn_hnd->zcp_msg_recv_mw is already set.
+ * Builds conn_hnd->zcp_recv_sge[] from iovp[offset_start .. *offset_endp),
+ * bounded by the remote MW length and MPID_IOV_LIMIT entries. A
+ * partially covered IOV is advanced in place (buf/len adjusted);
+ * *offset_endp returns the next IOV offset still to be filled. */
+#undef FUNCNAME
+#define FUNCNAME MPID_Nem_nd_zcp_unpack_iov
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int MPID_Nem_nd_zcp_unpack_iov(MPID_Nem_nd_conn_hnd_t conn_hnd,
+ MPID_IOV *iovp,
+ int offset_start,
+ int *offset_endp)
+{
+ int mpi_errno = MPI_SUCCESS;
+ int i;
+ SIZE_T invec_len;
+ int iov_offset, sge_offset;
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_ZCP_UNPACK_IOV);
+
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_ZCP_UNPACK_IOV);
+
+ /* Remote MW length is stored byte-swapped */
+ invec_len = _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Length);
+ MPIU_Assert(invec_len > 0);
+
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "ZCP Unpack IOV[%d, %d), vec_len=" MPIR_UPINT_FMT_DEC_SPEC,
+ offset_start, *offset_endp, invec_len));
+ conn_hnd->zcp_recv_sge_count = 0;
+
+ for(iov_offset=offset_start, sge_offset=0;
+ (iov_offset < *offset_endp) && (invec_len > 0) && (sge_offset < MPID_IOV_LIMIT);
+ sge_offset++){
+
+ u_long cur_iov_len;
+
+ cur_iov_len = iovp[iov_offset].MPID_IOV_LEN;
+
+ /* Note that invec_len will be < MPID_NEM_ND_DEV_IO_LIMIT */
+ if(invec_len < cur_iov_len){
+ /* Remote data ends inside this IOV - cover only invec_len bytes
+  * and advance the IOV in place for the remainder */
+ conn_hnd->zcp_recv_sge[sge_offset].pAddr = iovp[iov_offset].MPID_IOV_BUF;
+ conn_hnd->zcp_recv_sge[sge_offset].Length = invec_len;
+ conn_hnd->zcp_recv_sge_count++;
+
+ iovp[iov_offset].MPID_IOV_BUF += invec_len;
+ iovp[iov_offset].MPID_IOV_LEN -= invec_len;
+
+ invec_len = 0;
+ break;
+ }
+ else{
+ /* IOV fully covered by the remote data - map it 1:1 to an SGE */
+ conn_hnd->zcp_recv_sge[sge_offset].pAddr = iovp[iov_offset].MPID_IOV_BUF;
+ conn_hnd->zcp_recv_sge[sge_offset].Length = cur_iov_len;
+ conn_hnd->zcp_recv_sge_count++;
+
+ invec_len -= cur_iov_len;
+ iovp[iov_offset].MPID_IOV_LEN = 0;
+ }
+
+ if(iovp[iov_offset].MPID_IOV_LEN == 0){
+ iov_offset++;
+ }
+ }
+
+ *offset_endp = iov_offset;
+
+ for(i=0; i<conn_hnd->zcp_recv_sge_count; i++){
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "recv sge[%d/%d]={%p/%I64d}",
+ i, conn_hnd->zcp_recv_sge_count,
+ conn_hnd->zcp_recv_sge[i].pAddr,
+ conn_hnd->zcp_recv_sge[i].Length));
+ }
+
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_ZCP_UNPACK_IOV);
+ return mpi_errno;
+ fn_fail:
+ MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+ goto fn_exit;
+}
+
+
+/*
+ * Unpack received data into the request's IOVs according to pack_type.
+ * SR pack: copies *nbp bytes from buf into the IOVs via
+ * copy_and_trim_iov(); *nbp returns the bytes consumed.
+ * ZCP pack: saves/reuses the remote MW (msg_mwp may be NULL to reuse
+ * the previously saved one), builds the recv SGE list and registers it;
+ * buf/nbp are unused in that case.
+ * Output:
+ * offset_endp => Returns the next offset that is to be packed, could be invalid
+ */
+#undef FUNCNAME
+#define FUNCNAME MPID_Nem_nd_unpack_iov
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int MPID_Nem_nd_unpack_iov(MPID_Nem_nd_conn_hnd_t conn_hnd,
+ MPID_IOV *iovp,
+ int offset_start,
+ int *offset_endp,
+ MPID_Nem_nd_pack_t pack_type,
+ MPID_Nem_nd_msg_mw_t *msg_mwp,
+ char *buf,
+ SIZE_T *nbp)
+{
+ int mpi_errno = MPI_SUCCESS;
+ int ret_errno;
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_UNPACK_IOV);
+
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_UNPACK_IOV);
+
+ MPIU_Assert(iovp != NULL);
+ MPIU_Assert(offset_start >= 0);
+ MPIU_Assert(offset_endp != NULL);
+ MPIU_Assert(*offset_endp >= offset_start);
+ MPIU_Assert((pack_type == MPID_NEM_ND_SR_PACK) ? ((buf != NULL) && (nbp != NULL) && (*nbp > 0)) : 1);
+
+ /* Unpack and register */
+ if(pack_type == MPID_NEM_ND_SR_PACK){
+ int iov_count, off;
+ SIZE_T nb;
+
+ iov_count = *offset_endp - offset_start;
+ off = offset_start;
+
+ MPIU_Assert(off < *offset_endp);
+ nb = copy_and_trim_iov(iovp, &iov_count, &off, buf, *nbp);
+
+ /* Number of bytes consumed */
+ *nbp = nb;
+ /* Return the last valid offset that was processed */
+ *offset_endp = off;
+ }
+ else if(pack_type == MPID_NEM_ND_ZCP_PACK){
+ if(msg_mwp != NULL){
+ /* Save the MW */
+ ret_errno = memcpy_s((void *)&(conn_hnd->zcp_msg_recv_mw), sizeof(MPID_Nem_nd_msg_mw_t ), (void *)msg_mwp, sizeof(MPID_Nem_nd_msg_mw_t));
+ MPIU_ERR_CHKANDJUMP2((ret_errno != 0), mpi_errno, MPI_ERR_OTHER,
+ "**nd_read", "**nd_read %s %d", strerror(ret_errno), ret_errno);
+
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Received mem descriptor : base = %p, length=%I64d, token=%d\n",
+ _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Base), _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Length),
+ conn_hnd->zcp_msg_recv_mw.mw_data.Token));
+ }
+ else{
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Re-using mem descriptor : base = %p, length=%I64d, token=%d\n",
+ _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Base), _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Length),
+ conn_hnd->zcp_msg_recv_mw.mw_data.Token));
+ }
+
+ /* Unpack IOVs to SGEs */
+ mpi_errno = MPID_Nem_nd_zcp_unpack_iov(conn_hnd, iovp, offset_start, offset_endp);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+ /* Register SGEs */
+ mpi_errno = MPID_Nem_nd_zcp_recv_sge_reg(conn_hnd);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
+ else{
+ /* Unrecognized packing type */
+ MPIU_Assert(0);
+ }
+fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_UNPACK_IOV);
+ return mpi_errno;
+ fn_fail:
+ MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+ goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME zcp_recv_sge_dereg
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+/* Deregister every recv SGE registered by MPID_Nem_nd_zcp_recv_sge_reg(),
+ * blocking on each deregistration, then zero the SGE entries. */
+static inline int zcp_recv_sge_dereg(MPID_Nem_nd_conn_hnd_t conn_hnd)
+{
+ int mpi_errno = MPI_SUCCESS;
+ HRESULT hr;
+ int i;
+ MPID_Nem_nd_block_op_hnd_t zcp_op_hnd;
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_ZCP_RECV_SGE_DEREG);
+
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_ZCP_RECV_SGE_DEREG);
+
+ mpi_errno = MPID_Nem_nd_block_op_init(&zcp_op_hnd,
+ conn_hnd->zcp_recv_sge_count, conn_hnd, 1);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+ for(i=0; i<conn_hnd->zcp_recv_sge_count; i++){
+ hr = MPID_Nem_nd_dev_hnd_g->p_ad->DeregisterMemory(conn_hnd->zcp_recv_sge[i].hMr,
+ MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(zcp_op_hnd));
+ if(SUCCEEDED(hr)){
+ /* Deregistration accepted - block until it completes */
+ conn_hnd->npending_ops++;
+ mpi_errno = MPID_Nem_nd_sm_block(zcp_op_hnd);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
+ MPIU_ERR_CHKANDJUMP2(FAILED(hr),
+ mpi_errno, MPI_ERR_OTHER, "**nd_read", "**nd_read %s %d",
+ _com_error(hr).ErrorMessage(), hr);
+
+ /* Clear the entry now that its registration is gone */
+ conn_hnd->zcp_recv_sge[i].Length = 0;
+ conn_hnd->zcp_recv_sge[i].pAddr = 0;
+ }
+
+fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_ZCP_RECV_SGE_DEREG);
+ return mpi_errno;
+ fn_fail:
+ MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+ goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME zcp_dereg_smem
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+/* Deregister the zero-copy send memory registration
+ * (conn_hnd->zcp_send_mr_hnd), blocking until the deregistration
+ * completes. */
+static inline int zcp_dereg_smem(MPID_Nem_nd_conn_hnd_t conn_hnd)
+{
+ int mpi_errno = MPI_SUCCESS;
+ HRESULT hr;
+ int i, iov_offset; /* NOTE(review): both unused in this function */
+ MPID_Nem_nd_block_op_hnd_t zcp_op_hnd;
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_ZCP_DEREG_SMEM);
+
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_ZCP_DEREG_SMEM);
+
+ MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd));
+
+ /* FIXME: Don't block here - let each reg mem take place inside a handler */
+ /* Registering the local IOV */
+ mpi_errno = MPID_Nem_nd_block_op_init(&zcp_op_hnd, 1, conn_hnd, 1);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+ hr = MPID_Nem_nd_dev_hnd_g->p_ad->DeregisterMemory(conn_hnd->zcp_send_mr_hnd,
+ MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(zcp_op_hnd));
+ if(SUCCEEDED(hr)){
+ /* Manual event */
+ conn_hnd->npending_ops++;
+ mpi_errno = MPID_Nem_nd_sm_block(zcp_op_hnd);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
+ /* FIXME: Change the error message - nd_mem_reg */
+ MPIU_ERR_CHKANDJUMP2(FAILED(hr),
+ mpi_errno, MPI_ERR_OTHER, "**nd_read", "**nd_read %s %d",
+ _com_error(hr).ErrorMessage(), hr);
+
+fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_ZCP_DEREG_SMEM);
+ return mpi_errno;
+ fn_fail:
+ MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+ goto fn_exit;
+}
+
+#undef FUNCNAME
#define FUNCNAME zcp_read_success_handler
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static int zcp_read_success_handler(MPID_Nem_nd_msg_result_t *send_result)
{
int mpi_errno = MPI_SUCCESS, ret_errno=0;
+ SIZE_T nb=0, invec_len;
MPID_Nem_nd_conn_hnd_t conn_hnd;
- MPID_Nem_nd_block_op_hnd_t dereg_op_hnd;
MPID_Nem_nd_msg_t *pmsg;
- MPID_Request *zcp_req;
+ MPID_Request *zcp_reqp;
int (*req_fn)(MPIDI_VC_t *, MPID_Request *, int *);
int req_complete=0;
+ int sge_count, sge_offset, mw_count, mw_offset, i;
HRESULT hr;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_ZCP_READ_SUCCESS_HANDLER);
@@ -1214,49 +2081,101 @@
pmsg = GET_MSGBUF_FROM_MSGRESULT(send_result);
MPIU_Assert(pmsg != NULL);
- zcp_req = MPID_NEM_ND_VCCH_GET_ACTIVE_RECV_REQ(conn_hnd->vc);
- MPIU_Assert(zcp_req != NULL);
+ nb = GET_NB_FROM_MSGRESULT(send_result);
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Finished RDMA Read [" MPIR_UPINT_FMT_DEC_SPEC "] bytes on conn[%p]", nb, conn_hnd));
- /* Call req handler and send Rd complete pkt */
- req_fn = zcp_req->dev.OnDataAvail;
- if(req_fn){
- mpi_errno = req_fn(conn_hnd->vc, zcp_req, &req_complete);
+ MPID_Nem_nd_dev_hnd_g->npending_rds--; conn_hnd->npending_rds--;
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "dev prds = %d; conn prds = %d",
+ MPID_Nem_nd_dev_hnd_g->npending_rds, conn_hnd->npending_rds));
+
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "After Rd Rcvd mem descriptor : base = %p, length=%I64d, token=%d\n",
+ _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Base), _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Length),
+ conn_hnd->zcp_msg_recv_mw.mw_data.Token));
+
+ zcp_reqp = conn_hnd->zcp_rreqp;
+ MPIU_Assert(zcp_reqp != NULL);
+
+ /* Trim nd_sge list of recv bufs */
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Trimming recv_sge[cnt=%d], nb=" MPIR_UPINT_FMT_DEC_SPEC,
+ conn_hnd->zcp_recv_sge_count, nb));
+
+ sge_count = conn_hnd->zcp_recv_sge_count;
+ sge_offset = 0;
+ trim_nd_sge(conn_hnd->zcp_recv_sge, &sge_count, &sge_offset, nb);
+
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "After trimming recv_sge[cnt=%d/off=%d], nb=" MPIR_UPINT_FMT_DEC_SPEC,
+ conn_hnd->zcp_recv_sge_count, sge_offset, nb));
+
+ MPIU_Assert(sge_count == 0);
+
+ /* Trim the nd mw descriptor list of send bufs */
+ mw_count = 1;
+ mw_offset = 0;
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Trimming recv_mw[len=%I64d], nb=" MPIR_UPINT_FMT_DEC_SPEC,
+ _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Length), nb));
+
+ trim_nd_mw(&(conn_hnd->zcp_msg_recv_mw.mw_data), &mw_count, &mw_offset, nb);
+
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "After trimming recv_mw[len=%I64d], nb=" MPIR_UPINT_FMT_DEC_SPEC,
+ _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Length), nb));
+
+ invec_len = _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Length);
+
+ /* Deregister old bufs */
+ mpi_errno = zcp_recv_sge_dereg(conn_hnd);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+ req_complete = 0;
+ if(zcp_reqp->dev.iov_count == 0){
+ mpi_errno = MPID_Nem_nd_handle_recv_req(conn_hnd, zcp_reqp, &req_complete);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- MPIU_Assert(req_complete);
+ if(req_complete){
+ MPIU_Assert(invec_len == 0);
+ MPID_NEM_ND_VCCH_SET_ACTIVE_RECV_REQ(conn_hnd->vc, NULL);
+ }
}
- else{
- MPIDI_CH3U_Request_complete(zcp_req);
- MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Req - RDMA Rd - complete...");
- MPID_NEM_ND_VCCH_SET_ACTIVE_RECV_REQ(conn_hnd->vc, NULL);
- }
- /* Unregister user memory */
- mpi_errno = MPID_Nem_nd_block_op_init(&dereg_op_hnd, conn_hnd);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ if(invec_len == 0){
+ /* We are no longer ZCP reading */
+ conn_hnd->zcp_rreqp = NULL;
- hr = MPID_Nem_nd_dev_hnd_g->p_ad->DeregisterMemory(conn_hnd->zcp_recv_sge.hMr,
- MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(dereg_op_hnd));
- if(hr == ND_PENDING){
- /* Manual event */
- conn_hnd->npending_ops++;
- mpi_errno = MPID_Nem_nd_sm_block(dereg_op_hnd);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- /*
- hr = MPID_Nem_nd_dev_hnd_g->p_ad->GetOverlappedResult(MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(dereg_op_hnd), &nb, TRUE);
- */
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Sending RD ACK ...");
+ /* We have now read all the send bufs */
+ /* Use the msg & send rd complete pkt */
+ pmsg->hdr.type = MPID_NEM_ND_RD_ACK_PKT;
+
+ SET_MSGBUF_HANDLER(pmsg, netmod_msg_send_success_handler, gen_send_fail_handler);
+ mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, pmsg, sizeof(MPID_Nem_nd_msg_hdr_t ), 0);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
}
- MPIU_ERR_CHKANDJUMP2(FAILED(hr),
- mpi_errno, MPI_ERR_OTHER, "**nd_read", "**nd_read %s %d",
- _com_error(hr).ErrorMessage(), hr);
+ else{
+ int offset_end;
+
+ MSGBUF_FREEQ_ENQUEUE(conn_hnd, pmsg);
- /* Use the msg & send rd complete pkt */
- pmsg->hdr.type = MPID_NEM_ND_RD_ACK_PKT;
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Re-unpacking and reading ...");
+ /* re-unpack and read */
+ offset_end = zcp_reqp->dev.iov_offset + zcp_reqp->dev.iov_count;
+ mpi_errno = MPID_Nem_nd_unpack_iov(conn_hnd,
+ zcp_reqp->dev.iov,
+ zcp_reqp->dev.iov_offset,
+ &offset_end,
+ MPID_NEM_ND_ZCP_PACK,
+ NULL,
+ NULL, NULL);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- SET_MSGBUF_HANDLER(pmsg, netmod_msg_send_success_handler, gen_send_fail_handler);
- mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, pmsg, sizeof(MPID_Nem_nd_msg_hdr_t ), 0);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ /* Next offset to be packed */
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "zcp_reqp(%p) off %d -> %d", zcp_reqp, zcp_reqp->dev.iov_offset, offset_end));
+ zcp_reqp->dev.iov_count -= (offset_end - zcp_reqp->dev.iov_offset);
+ zcp_reqp->dev.iov_offset = offset_end;
+
+ mpi_errno = MPID_Nem_nd_zcp_recv(conn_hnd);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
+
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_ZCP_READ_SUCCESS_HANDLER);
return mpi_errno;
@@ -1274,7 +2193,7 @@
int mpi_errno = MPI_SUCCESS;
HRESULT hr;
MPID_Nem_nd_conn_hnd_t conn_hnd;
- MPID_Nem_nd_block_op_hnd_t dereg_op_hnd;
+ MPID_Request *sreqp = NULL;
MPID_Nem_nd_msg_t *pmsg;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_ZCP_MW_INVALIDATE_SUCCESS_HANDLER);
@@ -1284,41 +2203,36 @@
conn_hnd = GET_CONNHND_FROM_MSGRESULT(recv_result);
MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd));
- /* Allow sends on the conn */
- conn_hnd->zcp_in_progress = 0;
- conn_hnd->send_credits = conn_hnd->zcp_credits;
+ sreqp = MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(conn_hnd->vc);
+ MPIU_Assert(sreqp != NULL);
/* Repost the recv buf */
pmsg = GET_MSGBUF_FROM_MSGRESULT(recv_result);
MPIU_Assert(pmsg != NULL);
+ /* Repost msg buf */
SET_MSGBUF_HANDLER(pmsg, recv_success_handler, gen_recv_fail_handler);
mpi_errno = MPID_Nem_nd_post_recv_msg(conn_hnd, pmsg);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- /* Deregister memory */
- mpi_errno = MPID_Nem_nd_block_op_init(&dereg_op_hnd, conn_hnd);
+ mpi_errno = zcp_dereg_smem(conn_hnd);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- hr = MPID_Nem_nd_dev_hnd_g->p_ad->DeregisterMemory(conn_hnd->zcp_send_mr_hnd,
- MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(dereg_op_hnd));
- if(hr == ND_PENDING){
- /* Manual event */
- conn_hnd->npending_ops++;
- mpi_errno = MPID_Nem_nd_sm_block(dereg_op_hnd);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- /*
- hr = MPID_Nem_nd_dev_hnd_g->p_ad->GetOverlappedResult(MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(dereg_op_hnd), &nb, TRUE);
- */
+ conn_hnd->zcp_in_progress = 0;
+ conn_hnd->zcp_send_offset = 0;
+ MPID_Nem_nd_dev_hnd_g->zcp_pending = 0;
+
+ if(sreqp->dev.iov_count == 0){
+ /* Call the cont success handler */
+ mpi_errno = cont_send_success_handler(&(conn_hnd->zcp_send_result));
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
}
- MPIU_ERR_CHKANDJUMP2(FAILED(hr),
- mpi_errno, MPI_ERR_OTHER, "**nd_write", "**nd_write %s %d",
- _com_error(hr).ErrorMessage(), hr);
+ else{
+ /* Continue sending data on this req */
+ mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, sreqp);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
- /* Call the send success handler for zcp transfer */
- mpi_errno = zcp_send_success_handler(&(conn_hnd->zcp_send_result));
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_ZCP_MW_INVALIDATE_SUCCESS_HANDLER);
return mpi_errno;
@@ -1362,36 +2276,56 @@
}
#undef FUNCNAME
-#define FUNCNAME zcp_send_success_handler
+#define FUNCNAME cont_send_success_handler
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int zcp_send_success_handler(MPID_Nem_nd_msg_result_t *zcp_send_result)
+static int cont_send_success_handler(MPID_Nem_nd_msg_result_t *zcp_send_result)
{
int mpi_errno = MPI_SUCCESS;
int req_complete;
MPID_Nem_nd_conn_hnd_t conn_hnd;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_ZCP_SEND_SUCCESS_HANDLER);
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_CONT_SEND_SUCCESS_HANDLER);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_ZCP_SEND_SUCCESS_HANDLER);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_CONT_SEND_SUCCESS_HANDLER);
conn_hnd = GET_CONNHND_FROM_ZCP_SEND_MSGRESULT(zcp_send_result);
MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd));
+ MPIU_Assert(!MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_EMPTY(conn_hnd->vc));
if(!MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_EMPTY(conn_hnd->vc)){
- mpi_errno = MPID_Nem_nd_handle_posted_sendq_head_req(conn_hnd->vc, &req_complete);
+ req_complete = 0;
+ mpi_errno = MPID_Nem_nd_handle_posted_sendq_tail_req(conn_hnd->vc, &req_complete);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- }
- /* If we have queued sends and credits to send data - go ahead with sending */
- if(MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd)){
- MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Send credits available. Processing queued req...");
- mpi_errno = process_pending_req(conn_hnd);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ if(req_complete){
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "ZCP/Cont_send req complete...");
+ /* Allow sends on the conn */
+ conn_hnd->send_in_progress = 0;
+ conn_hnd->send_credits = conn_hnd->cache_credits;
+
+ /* If we have queued sends and credits to send data - go ahead with sending */
+ if(MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd)){
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Send credits available. Processing queued req...");
+ mpi_errno = process_pending_req(conn_hnd);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
+ }
+ else{
+ MPID_Request *sreqp;
+
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "ZCP/Cont_send req NOT complete... sending remaining/reloaded IOVs");
+ sreqp = MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(conn_hnd->vc);
+ MPIU_Assert(sreqp != NULL);
+
+ /* Send reloaded iovs */
+ mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, sreqp);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
}
fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_ZCP_SEND_SUCCESS_HANDLER);
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_CONT_SEND_SUCCESS_HANDLER);
return mpi_errno;
fn_fail:
MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
@@ -1420,8 +2354,14 @@
MPIU_Assert(req != NULL);
/* FIXME: Can we coalesce multiple pending sends ? */
- mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, &(req->dev.iov[req->dev.iov_offset]), req->dev.iov_count);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ if(!MPID_NEM_ND_IS_BLOCKING_REQ(req)){
+ mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, req);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
+ else{
+ mpi_errno = MPID_Nem_nd_post_sendbv(conn_hnd, req);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_ENQUEUE(conn_hnd->vc, req);
}
@@ -1459,7 +2399,6 @@
MPIU_Assert(MPIDI_Request_get_type(req) != MPIDI_REQUEST_TYPE_GET_RESP);
MPIDI_CH3U_Request_complete(req);
MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
- MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_DEQUEUE(vc, &req);
*req_complete = 1;
}
else{
@@ -1469,10 +2408,12 @@
if (*req_complete){
MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
- MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_DEQUEUE(vc, &req);
}
+ else{
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... Not complete");
+ req->dev.iov_offset = 0;
+ }
}
- req->dev.iov_offset = 0;
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_HANDLE_POSTED_SENDQ_HEAD_REQ);
@@ -1481,7 +2422,58 @@
goto fn_exit;
}
+/* Handle the request at the tail of vc's sendq */
#undef FUNCNAME
+#define FUNCNAME MPID_Nem_nd_handle_posted_sendq_tail_req
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPID_Nem_nd_handle_posted_sendq_tail_req(MPIDI_VC_t *vc, int *req_complete)
+{
+ int (*req_handler)(MPIDI_VC_t *, MPID_Request *, int *);
+ int mpi_errno = MPI_SUCCESS;
+ MPID_Request *req = NULL;
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_HANDLE_POSTED_SENDQ_TAIL_REQ);
+
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_HANDLE_POSTED_SENDQ_TAIL_REQ);
+
+ MPIU_Assert(req_complete != NULL);
+
+ MPIU_Assert(!MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_EMPTY(vc));
+ req = MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(vc);
+ MPIU_Assert(req != NULL);
+
+ req_handler = req->dev.OnDataAvail;
+ if (!req_handler){
+ MPIU_Assert(MPIDI_Request_get_type(req) != MPIDI_REQUEST_TYPE_GET_RESP);
+ MPIDI_CH3U_Request_complete(req);
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
+ MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_REM_TAIL(vc, &req);
+ *req_complete = 1;
+ }
+ else{
+ *req_complete = 0;
+ mpi_errno = req_handler(vc, req, req_complete);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+ if (*req_complete){
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
+ MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_REM_TAIL(vc, &req);
+ }
+ else{
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... Not complete");
+ req->dev.iov_offset = 0;
+ }
+ }
+
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_HANDLE_POSTED_SENDQ_TAIL_REQ);
+ return mpi_errno;
+ fn_fail:
+ goto fn_exit;
+}
+
+
+#undef FUNCNAME
#define FUNCNAME send_success_handler
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
@@ -1491,6 +2483,7 @@
int req_complete = 0;
MPID_Nem_nd_conn_hnd_t conn_hnd;
MPID_Nem_nd_msg_t *pmsg;
+ MPID_Request *sreqp;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_SEND_SUCCESS_HANDLER);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_SEND_SUCCESS_HANDLER);
@@ -1501,24 +2494,69 @@
pmsg = GET_MSGBUF_FROM_MSGRESULT(send_result);
MPIU_Assert(pmsg != NULL);
+ sreqp = MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(conn_hnd->vc);
+ MPIU_Assert(sreqp != NULL);
+
MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Send succeeded...");
- conn_hnd->npending_ops--;
- if(conn_hnd->vc != NULL){
+ /* Reset the handlers & enqueue this send buffer to freeq */
+ SET_MSGBUF_HANDLER(pmsg, send_success_handler, gen_send_fail_handler);
+ MSGBUF_FREEQ_ENQUEUE(conn_hnd, pmsg);
+ MPIU_Assert(!MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_EMPTY((conn_hnd->vc)));
+
+ if(MPID_NEM_ND_VC_IS_CONNECTED(conn_hnd->vc)){
/* Increment number of available send credits only when a credit packet is recvd */
/* Complete the request associated with this send if no pending events */
- mpi_errno = MPID_Nem_nd_handle_posted_sendq_head_req(conn_hnd->vc, &req_complete);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ req_complete = 0;
- /* Enqueue this send buffer to freeq */
- MSGBUF_FREEQ_ENQUEUE(conn_hnd, pmsg);
+ if(!conn_hnd->send_in_progress){
+ mpi_errno = MPID_Nem_nd_handle_posted_sendq_head_req(conn_hnd->vc, &req_complete);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- /* If we have queued sends and credits to send data - go ahead with sending */
- if(MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd)){
- MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Send credits available. Processing queued req...");
- mpi_errno = process_pending_req(conn_hnd);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_DEQUEUE(conn_hnd->vc, &sreqp);
+ if(req_complete){
+ /* If we have queued sends and credits to send data - go ahead with sending */
+ if(MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd)){
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Send credits available. Processing queued req...");
+ mpi_errno = process_pending_req(conn_hnd);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
+ }
+ else{
+ MPIU_Assert(0);
+ mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, sreqp);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+ MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_ENQUEUE(conn_hnd->vc, sreqp);
+ }
}
+ else{
+ if(MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_HEAD(conn_hnd->vc) ==
+ MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_TAIL(conn_hnd->vc)){
+ /* Only ZCP/Cont_send req in posted Q */
+ /* If ZCP is in progress - the ZCP handlers will handle sends */
+ if(!conn_hnd->zcp_in_progress){
+ if(sreqp->dev.iov_count == 0){
+ mpi_errno = cont_send_success_handler(&(conn_hnd->zcp_send_result));
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
+ else{
+ /* Repost the remaining/reloaded IOV */
+ mpi_errno = MPID_Nem_nd_post_sendv(conn_hnd, sreqp);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
+ }
+ }
+ else{
+ req_complete = 0;
+ mpi_errno = MPID_Nem_nd_handle_posted_sendq_head_req(conn_hnd->vc, &req_complete);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+ MPID_NEM_ND_VCCH_NETMOD_POSTED_SENDQ_DEQUEUE(conn_hnd->vc, &sreqp);
+
+ MPIU_Assert(req_complete);
+ }
+ }
}
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_SEND_SUCCESS_HANDLER);
@@ -1548,14 +2586,15 @@
pmsg = GET_MSGBUF_FROM_MSGRESULT(send_result);
MPIU_Assert(pmsg != NULL);
- MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Send succeeded...");
- conn_hnd->npending_ops--;
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Netmod Send succeeded...");
- if(conn_hnd->vc != NULL){
+ /* Reset the handlers & enqueue this send buffer to freeq */
+ SET_MSGBUF_HANDLER(pmsg, send_success_handler, gen_send_fail_handler);
+ MSGBUF_FREEQ_ENQUEUE(conn_hnd, pmsg);
+
+ if(MPID_NEM_ND_VC_IS_CONNECTED(conn_hnd->vc)){
/* Increment number of available send credits only when a credit packet is recvd */
/* There is no request associated with this send - Netmod msg */
- /* Enqueue this send buffer to freeq */
- MSGBUF_FREEQ_ENQUEUE(conn_hnd, pmsg);
/* If we have queued sends and credits to send data - go ahead with sending */
if(MPID_NEM_ND_CONN_HAS_SCREDITS(conn_hnd)){
@@ -1572,7 +2611,6 @@
goto fn_exit;
}
-
#undef FUNCNAME
#define FUNCNAME wait_cack_success_handler
#undef FCNAME
@@ -1598,11 +2636,14 @@
if(pmsg->hdr.type == MPID_NEM_ND_CONN_ACK_PKT){
/* Connection successful */
MPIDI_VC_t *vc;
- MPIDI_CH3I_VC *vc_ch;
- vc = conn_hnd->vc;
- vc_ch = (MPIDI_CH3I_VC *)vc->channel_private;
+ /* Set this conn vc to the stored vc info */
+ vc = conn_hnd->tmp_vc;
+ conn_hnd->vc = vc;
+ MPID_NEM_ND_VCCH_NETMOD_CONN_HND_SET(vc, conn_hnd);
+ /* We no longer need tmp conn info in VC */
+ MPID_NEM_ND_VCCH_NETMOD_TMP_CONN_HND_INIT(vc);
MPID_NEM_ND_VCCH_NETMOD_STATE_SET(vc, MPID_NEM_ND_VC_STATE_CONNECTED);
MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_ACTIVE);
@@ -1617,6 +2658,8 @@
else{
/* Connection failed - Lost in head to head on the remote side */
conn_hnd->vc = NULL;
+ conn_hnd->tmp_vc = NULL;
+
MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_QUIESCENT);
mpi_errno = MPID_Nem_nd_conn_disc(conn_hnd);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
@@ -1655,7 +2698,7 @@
lconn_hnd = GET_CONNHND_FROM_EX_RECV_OV(recv_ov);
MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_INIT(lconn_hnd));
- mpi_errno = MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_g, MPID_NEM_ND_ACCEPT_CONN, lconn_hnd->p_conn, &new_conn_hnd);
+ mpi_errno = MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_g, MPID_NEM_ND_ACCEPT_CONN, lconn_hnd->p_conn, NULL, &new_conn_hnd);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
/* Get the pg information sent with the connect request */
@@ -1691,6 +2734,7 @@
pg_id = (char *)MPIDI_Process.my_pg->id;
mpi_errno = MPID_Nem_nd_decode_pg_info(pg_id, pg_info->pg_rank, &vc, &pg);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "New conn (%p) for rank = %d", new_conn_hnd, pg_info->pg_rank));
}
else{
/* FIXME: TODO */
@@ -1698,43 +2742,53 @@
}
vc_ch = (MPIDI_CH3I_VC *)vc->channel_private;
- if(MPID_NEM_ND_CONN_HND_IS_VALID(MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc))){
- if(MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc) != MPID_NEM_ND_VC_STATE_CONNECTED){
- /* VC is connecting - head-to-head scenario */
- MPID_Nem_nd_conn_hnd_t old_conn_hnd = MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc);
- int old_conn_won_hh=0;
- mpi_errno = MPID_Nem_nd_resolve_head_to_head(pg_info->pg_rank, pg, pg_id, &old_conn_won_hh);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "H-H: Old conn (%p:%d) & new conn (%p:%d)", old_conn_hnd, old_conn_hnd->state, new_conn_hnd, new_conn_hnd->state));
- if(old_conn_won_hh){
- /* Won head to head with new conn */
- /* Send a NAK and close the new conn */
- MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Old conn (%p) won head to head", MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc)));
- terminate_conn = 1;
- }
- else{
- /* Lost head to head with new conn */
- /* Make old conn orphan - The other side with send
- * us a LNAK and we can close the old conn
- */
- MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "New conn (%p) won head to head", new_conn_hnd));
+ if(MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc) == MPID_NEM_ND_VC_STATE_CONNECTED){
+ /* VC is already connected */
+ terminate_conn = 1;
+ }
+ else if(MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc) == MPID_NEM_ND_VC_STATE_CONNECTING){
+ /* VC is connecting - head-to-head scenario */
+ MPID_Nem_nd_conn_hnd_t old_conn_hnd = MPID_NEM_ND_VCCH_NETMOD_TMP_CONN_HND_GET(vc);
+ int old_conn_won_hh = 0;
- MPID_NEM_ND_VCCH_NETMOD_CONN_HND_SET(vc, new_conn_hnd);
- new_conn_hnd->vc = vc;
- terminate_conn = 0;
- }
+ mpi_errno = MPID_Nem_nd_resolve_head_to_head(pg_info->pg_rank, pg, pg_id, &old_conn_won_hh);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+ /* The old conn may not be VALID yet - So don't use it */
+ MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_INIT(old_conn_hnd));
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "H-H: Old conn (%p:%d) & new conn (%p:%d)", old_conn_hnd, old_conn_hnd->state, new_conn_hnd, new_conn_hnd->state));
+
+ if(old_conn_won_hh){
+ /* Won head to head with new conn
+ * Send a NAK and close the new conn
+ */
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Old conn (%p) won head to head", MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc)));
+ terminate_conn = 1;
}
else{
- /* VC is already connected */
- terminate_conn = 1;
+ /* Lost head to head with new conn
+ * Make old conn orphan - The other side will send
+ * us a LNAK and we then close the old conn
+ */
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "New conn (%p) won head to head", new_conn_hnd));
+
+ /*
+ MPID_NEM_ND_VCCH_NETMOD_CONN_HND_SET(vc, new_conn_hnd);
+ new_conn_hnd->vc = vc;
+ */
+ old_conn_hnd->is_orphan = 1;
+ /* Save VC info */
+ new_conn_hnd->tmp_vc = vc;
+ terminate_conn = 0;
}
}
- else{
- /* No conn associated with this vc */
- /* Associate vc with this connection */
- MPID_NEM_ND_VCCH_NETMOD_CONN_HND_SET(vc, new_conn_hnd);
- /* Associate this conn with the vc */
- new_conn_hnd->vc = vc;
+ else{ /* VC is DISCONNECTED */
+ /* Save vc info with this connection. We are still not
+ * sure if the VC is CONNECTING - Since a CNAK could mean
+ * an orphan conn
+ * Associate this conn with vc when we receive a CACK
+ */
+ new_conn_hnd->tmp_vc = vc;
terminate_conn = 0;
}
@@ -1743,31 +2797,29 @@
* Do a blocking send for LNAK & disc
*/
MPID_Nem_nd_msg_t *pmsg;
- int msg_len=0;
+ SIZE_T msg_len=0;
- /* Post a LACK - do a blocking send */
+ MPID_NEM_ND_CONN_STATE_SET(new_conn_hnd, MPID_NEM_ND_CONN_QUIESCENT);
+ /* Post a LNAK */
MSGBUF_FREEQ_DEQUEUE(new_conn_hnd, pmsg);
MPIU_Assert(pmsg != NULL);
+ SET_MSGBUF_HANDLER(pmsg, netmod_msg_send_success_handler, gen_send_fail_handler);
+
pmsg->hdr.type = MPID_NEM_ND_CONN_NAK_PKT;
msg_len = sizeof(MPID_Nem_nd_msg_hdr_t );
- mpi_errno = MPID_Nem_nd_post_send_msg(new_conn_hnd, pmsg, msg_len, 1);
+ mpi_errno = MPID_Nem_nd_post_send_msg(new_conn_hnd, pmsg, msg_len, 0);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- MSGBUF_FREEQ_ENQUEUE(new_conn_hnd, pmsg);
MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Passive disc on (%p)", new_conn_hnd));
- /* Wait for a disconnect from the other side and free resources */
+
+ /* Wait for a disconnect/CNAK/CACK from the other side and free resources */
mpi_errno = MPID_Nem_nd_conn_passive_disc(new_conn_hnd);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-
- /*
- mpi_errno = MPID_Nem_nd_conn_disc(new_conn_hnd);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- */
}
else{
/* Connection successful - send LACK & wait for CACK */
MPID_Nem_nd_msg_t *pmsg;
- int msg_len=0;
+ SIZE_T msg_len=0;
MPID_NEM_ND_CONN_STATE_SET(new_conn_hnd, MPID_NEM_ND_CONN_WAIT_CACK);
/* Grab the head of the recv ssbufs and set its handlers */
@@ -1779,13 +2831,13 @@
MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Post LACK wait for CACK on (%p)", new_conn_hnd));
- /* Post a LACK - do a blocking send */
+ /* Post a LACK - do a non-blocking send */
MSGBUF_FREEQ_DEQUEUE(new_conn_hnd, pmsg);
- pmsg->hdr.type = MPID_NEM_ND_CONN_ACK_PKT;
+ SET_MSGBUF_HANDLER(pmsg, netmod_msg_send_success_handler, gen_send_fail_handler);
+ pmsg->hdr.type = MPID_NEM_ND_CONN_ACK_PKT;
msg_len = sizeof(MPID_Nem_nd_msg_hdr_t );
- mpi_errno = MPID_Nem_nd_post_send_msg(new_conn_hnd, pmsg, msg_len, 1);
+ mpi_errno = MPID_Nem_nd_post_send_msg(new_conn_hnd, pmsg, msg_len, 0);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- MSGBUF_FREEQ_ENQUEUE(new_conn_hnd, pmsg);
}
fn_exit:
@@ -1805,7 +2857,7 @@
int mpi_errno = MPI_SUCCESS;
MPID_Nem_nd_msg_t *precv_msg;
MPID_Nem_nd_conn_hnd_t conn_hnd;
- MPIDI_CH3I_VC *vc_ch;
+ int terminate_conn = 0;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_WAIT_LACK_SUCCESS_HANDLER);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_WAIT_LACK_SUCCESS_HANDLER);
@@ -1828,10 +2880,19 @@
if(precv_msg->hdr.type == MPID_NEM_ND_CONN_ACK_PKT){
MPID_Nem_nd_msg_t *psend_msg;
MPIDI_VC_t *vc;
- int msg_len=0;
+ SIZE_T msg_len=0;
+
/* VC is now connected - send an ACK - CACK - to listen side */
- vc = conn_hnd->vc;
- vc_ch = (MPIDI_CH3I_VC *)vc->channel_private;
+
+ terminate_conn = 0;
+
+ /* FIXME: Use a single macro to set vc->conn/conn->vc/state/checks etc */
+ vc = conn_hnd->tmp_vc;
+ conn_hnd->vc = vc;
+ MPID_NEM_ND_VCCH_NETMOD_CONN_HND_SET(vc, conn_hnd);
+ /* We no longer need the tmp conn info in vc */
+ MPID_NEM_ND_VCCH_NETMOD_TMP_CONN_HND_INIT(vc);
+ MPIU_Assert(MPID_NEM_ND_VCCH_NETMOD_STATE_GET(vc) == MPID_NEM_ND_VC_STATE_CONNECTING);
MPID_NEM_ND_VCCH_NETMOD_STATE_SET(vc, MPID_NEM_ND_VC_STATE_CONNECTED);
MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_ACTIVE);
@@ -1844,12 +2905,9 @@
psend_msg->hdr.type = MPID_NEM_ND_CONN_ACK_PKT;
msg_len = sizeof(MPID_Nem_nd_msg_hdr_t );
MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Received LACK - Sending CACK");
- /* We block till the ACK is sent */
- mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, psend_msg, msg_len, 1);
+ mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, psend_msg, msg_len, 0);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- MSGBUF_FREEQ_ENQUEUE(conn_hnd, psend_msg);
-
/* Repost receive on the used msg buf */
SET_MSGBUF_HANDLER(precv_msg, recv_success_handler, gen_recv_fail_handler);
mpi_errno = MPID_Nem_nd_post_recv_msg(conn_hnd, precv_msg);
@@ -1860,46 +2918,49 @@
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
}
else{
- /* Received LNAK - Close connection - silently */
- MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Received LNAK - Closing connection");
-
- MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_QUIESCENT);
- mpi_errno = MPID_Nem_nd_conn_disc(conn_hnd);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ /* Received LNAK - Close connection */
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Received LNAK - Sending CNAK and closing conn");
+ terminate_conn = 1;
}
}
else{
/* Send NAK - We lost in head to head connection to the listen side */
MPIU_Assert((precv_msg->hdr.type == MPID_NEM_ND_CONN_ACK_PKT) ||
(precv_msg->hdr.type == MPID_NEM_ND_CONN_NAK_PKT));
- /* vc is already disconnected from conn */
- MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_QUIESCENT);
-
+ terminate_conn = 1;
if(precv_msg->hdr.type == MPID_NEM_ND_CONN_ACK_PKT){
- /* Send a CNAK and disc */
- MPID_Nem_nd_msg_t *psend_msg;
- int msg_len=0;
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Received LACK - Lost HH - Sending CNAK & Closing connection");
+ }
+ else{
+ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Received LNAK - Lost HH - Sending CNAK & Closing connection");
+ }
+ }
+ if(terminate_conn){
+ /* We reach here in 2 cases
+ * case 1: Received LNAK, Conn is not orphan yet
+ * case 2: Conn is orphan
+ * In both cases send a CNAK and disconnect
+ */
+ /* Send a CNAK and disc */
+ MPID_Nem_nd_msg_t *psend_msg;
+ SIZE_T msg_len=0;
- /* Blocking send for CNAK */
- MPIU_Assert(!MSGBUF_FREEQ_IS_EMPTY(conn_hnd));
- MSGBUF_FREEQ_DEQUEUE(conn_hnd, psend_msg);
- MPIU_Assert(psend_msg != NULL);
+ /* VC is already/or-will-be connected from conn
+ * by the listen side. The VC may also have terminated by now.
+ * - So don't change the state of VC here
+ * the vc state is left *as-is*
+ */
+ MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_QUIESCENT);
- psend_msg->hdr.type = MPID_NEM_ND_CONN_NAK_PKT;
- msg_len = sizeof(MPID_Nem_nd_msg_hdr_t );
- MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Received LACK - Lost HH - Sending CNAK & Closing connection");
+ MPIU_Assert(!MSGBUF_FREEQ_IS_EMPTY(conn_hnd));
+ MSGBUF_FREEQ_DEQUEUE(conn_hnd, psend_msg);
+ MPIU_Assert(psend_msg != NULL);
+ SET_MSGBUF_HANDLER(psend_msg, quiescent_msg_handler, gen_send_fail_handler);
- /* We block till the ACK is sent */
- mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, psend_msg, msg_len, 1);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ psend_msg->hdr.type = MPID_NEM_ND_CONN_NAK_PKT;
+ msg_len = sizeof(MPID_Nem_nd_msg_hdr_t );
- MSGBUF_FREEQ_ENQUEUE(conn_hnd, psend_msg);
- }
- else{
- MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Received LNAK - Lost HH - Closing connection");
- }
- /* Close connection - silently */
- mpi_errno = MPID_Nem_nd_conn_disc(conn_hnd);
+ mpi_errno = MPID_Nem_nd_post_send_msg(conn_hnd, psend_msg, msg_len, 0);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
}
@@ -1921,9 +2982,10 @@
HRESULT hr;
MPID_Nem_nd_conn_hnd_t conn_hnd;
MPID_Nem_nd_msg_t *pmsg, *pzcp_msg;
- MPID_Nem_nd_block_op_hnd_t zcp_op_hnd;
- MPID_Request *rreq = NULL;
- int nb, udata_len=0;
+ MPID_Request *rreqp = NULL;
+ int i, offset_end;
+ char *buf;
+ SIZE_T buflen, nb, udata_len=0;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_RECV_SUCCESS_HANDLER);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_RECV_SUCCESS_HANDLER);
@@ -1936,14 +2998,15 @@
nb = GET_NB_FROM_MSGRESULT(recv_result);
MPIU_ERR_CHKANDJUMP(nb == 0, mpi_errno, MPI_ERR_OTHER, "**nd_write");
- MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Recvd %d bytes (msg type=%d)",nb, pmsg->hdr.type));
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Recvd " MPIR_UPINT_FMT_DEC_SPEC " bytes (msg type=%d) on conn=%p",
+ nb, pmsg->hdr.type, conn_hnd));
MPIU_Assert(nb >= sizeof(MPID_Nem_nd_msg_hdr_t ));
- if(!conn_hnd->zcp_in_progress){
+ if(!conn_hnd->send_in_progress){
conn_hnd->send_credits += pmsg->hdr.credits;
}
else{
- conn_hnd->zcp_credits += pmsg->hdr.credits;
+ conn_hnd->cache_credits += pmsg->hdr.credits;
}
udata_len = nb - sizeof(MPID_Nem_nd_msg_hdr_t );
switch(pmsg->hdr.type){
@@ -1959,13 +3022,39 @@
break;
case MPID_NEM_ND_DATA_PKT:
- MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Received DATA PKT (len = %d, credits = %d)",udata_len, pmsg->hdr.credits));
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Received DATA PKT (len =" MPIR_UPINT_FMT_DEC_SPEC ", credits = %d)",udata_len, pmsg->hdr.credits));
+ buf = pmsg->buf;
+ buflen = udata_len;
- /* The msg just contains the type and udata */
- /* FIXME: We need to keep track of incomplete recv reqs on the conn */
- mpi_errno = MPID_nem_handle_pkt(conn_hnd->vc, pmsg->buf, udata_len);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ do{
+ MPIU_Assert(conn_hnd->zcp_rreqp == NULL);
+ rreqp = MPID_NEM_ND_VCCH_GET_ACTIVE_RECV_REQ(conn_hnd->vc);
+ if(rreqp == NULL){
+ /* The msg just contains the type and udata */
+ mpi_errno = MPID_nem_handle_pkt(conn_hnd->vc, buf, buflen);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ /* MPID_nem_handle_pkt() consumes all data */
+ buflen = 0;
+ }
+ else{
+ /* Continuing to recv on this conn - Just copy data into req IOVs */
+ int complete = 0;
+ SIZE_T nb = 0;
+
+ nb = buflen;
+ mpi_errno = nd_read_progress_update(conn_hnd, rreqp, buf, &nb, &complete);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ if(complete){
+ MPID_NEM_ND_VCCH_SET_ACTIVE_RECV_REQ(conn_hnd->vc, NULL);
+ }
+ buflen -= nb;
+ buf += nb;
+ }
+
+ MPIU_Assert(buflen == 0);
+ }while(buflen > 0);
+
/* When handling a packet the conn might be disconnected */
if(conn_hnd->vc != NULL){
/* Repost the recv on the scratch buf */
@@ -1978,18 +3067,86 @@
}
break;
case MPID_NEM_ND_RD_AVAIL_PKT:
- MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "Received RD Avail pkt (len=%d)", udata_len);
+ MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "Received RD Avail pkt (len=" MPIR_UPINT_FMT_DEC_SPEC ")", udata_len);
udata_len -= sizeof(MPID_Nem_nd_msg_mw_t);
- mpi_errno = MPID_nem_handle_pkt(conn_hnd->vc, pmsg->buf, udata_len);
+
+ MPIU_Assert(((MPID_NEM_ND_VCCH_GET_ACTIVE_RECV_REQ(conn_hnd->vc) != NULL) && (conn_hnd->zcp_rreqp != NULL))?
+ (MPID_NEM_ND_VCCH_GET_ACTIVE_RECV_REQ(conn_hnd->vc) == conn_hnd->zcp_rreqp) : 1);
+ rreqp = (conn_hnd->zcp_rreqp) ? (conn_hnd->zcp_rreqp) : MPID_NEM_ND_VCCH_GET_ACTIVE_RECV_REQ(conn_hnd->vc) ;
+ conn_hnd->zcp_rreqp = rreqp;
+
+ if(rreqp == NULL){
+ mpi_errno = MPID_nem_handle_pkt(conn_hnd->vc, pmsg->buf, udata_len);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+ rreqp = MPID_NEM_ND_VCCH_GET_ACTIVE_RECV_REQ(conn_hnd->vc);
+ MPIU_Assert(rreqp != NULL);
+
+ conn_hnd->zcp_rreqp = rreqp;
+ }
+ else{
+ SIZE_T len = udata_len;
+ /* Continuing to recv data on a req */
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST,"Cont to recv data on req=%p", rreqp));
+ while(len > 0){
+ int req_complete = 0;
+ SIZE_T nb_unpack;
+
+ MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "RD AVAIL contains " MPIR_UPINT_FMT_DEC_SPEC " bytes", len);
+
+ offset_end = rreqp->dev.iov_offset + rreqp->dev.iov_count;
+ nb_unpack = len;
+ mpi_errno = MPID_Nem_nd_unpack_iov(conn_hnd,
+ rreqp->dev.iov,
+ rreqp->dev.iov_offset,
+ &offset_end,
+ MPID_NEM_ND_SR_PACK,
+ NULL,
+ pmsg->buf,
+ &nb_unpack);
+ len -= nb_unpack;
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+ /* Update the req offset */
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "rreqp(%p) off %d -> %d", rreqp, rreqp->dev.iov_offset, offset_end));
+
+ rreqp->dev.iov_count -= (offset_end - rreqp->dev.iov_offset);
+ rreqp->dev.iov_offset = offset_end;
+
+ if(rreqp->dev.iov_count == 0){
+ mpi_errno = MPID_Nem_nd_handle_recv_req(conn_hnd, rreqp, &req_complete);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+ /* RD AVAIL always contains data to zcpy */
+ MPIU_Assert(!req_complete);
+ }
+ }
+ }
+
+ for(i=0; i<rreqp->dev.iov_count; i++){
+ int off = rreqp->dev.iov_offset + i;
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST,
+ "Trying to zcp unpack req (%p) - iov[%d/tot=%d] = {%p/%u}",
+ rreqp, off, rreqp->dev.iov_count,
+ rreqp->dev.iov[off].MPID_IOV_BUF,
+ rreqp->dev.iov[off].MPID_IOV_LEN
+ ));
+ }
+
+ offset_end = rreqp->dev.iov_offset + rreqp->dev.iov_count;
+ mpi_errno = MPID_Nem_nd_unpack_iov(conn_hnd,
+ rreqp->dev.iov,
+ rreqp->dev.iov_offset,
+ &offset_end,
+ MPID_NEM_ND_ZCP_PACK,
+ (MPID_Nem_nd_msg_mw_t *)&(pmsg->buf[udata_len]),
+ NULL, NULL);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-
- rreq = MPID_NEM_ND_VCCH_GET_ACTIVE_RECV_REQ(conn_hnd->vc);
- MPIU_Assert(rreq != NULL);
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "rreqp(%p) off %d -> %d", rreqp, rreqp->dev.iov_offset, offset_end));
- ret_errno = memcpy_s((void *)&(conn_hnd->zcp_msg_recv_mw), sizeof(MPID_Nem_nd_msg_mw_t ), (void *)&(pmsg->buf[udata_len]), sizeof(MPID_Nem_nd_msg_mw_t));
- MPIU_ERR_CHKANDJUMP2((ret_errno != 0), mpi_errno, MPI_ERR_OTHER,
- "**nd_read", "**nd_read %s %d", strerror(ret_errno), ret_errno);
+ rreqp->dev.iov_count -= (offset_end - rreqp->dev.iov_offset);
+ rreqp->dev.iov_offset = offset_end;
/* Repost recv buffer */
if(conn_hnd->vc != NULL){
@@ -1997,57 +3154,31 @@
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
}
- /* A msg buf is guaranteed for RDMA read */
- MSGBUF_FREEQ_DEQUEUE(conn_hnd, pzcp_msg);
- MPIU_Assert(pzcp_msg != NULL);
-
- SET_MSGBUF_HANDLER(pzcp_msg, zcp_read_success_handler, zcp_read_fail_handler);
-
- /* FIXME: We just support 1 IOV for now */
- conn_hnd->zcp_recv_sge.Length = rreq->dev.iov[rreq->dev.iov_offset].MPID_IOV_LEN;
- conn_hnd->zcp_recv_sge.pAddr = rreq->dev.iov[rreq->dev.iov_offset].MPID_IOV_BUF;
- /* Registering the local IOV */
- mpi_errno = MPID_Nem_nd_block_op_init(&zcp_op_hnd, conn_hnd);
+ mpi_errno = MPID_Nem_nd_zcp_recv(conn_hnd);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- hr = MPID_Nem_nd_dev_hnd_g->p_ad->RegisterMemory(conn_hnd->zcp_recv_sge.pAddr, conn_hnd->zcp_recv_sge.Length, MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(zcp_op_hnd), &(conn_hnd->zcp_recv_sge.hMr));
- if(hr == ND_PENDING){
- /* Manual event */
- conn_hnd->npending_ops++;
- mpi_errno = MPID_Nem_nd_sm_block(zcp_op_hnd);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- /*
- hr = MPID_Nem_nd_dev_hnd_g->p_ad->GetOverlappedResult(MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(zcp_op_hnd), &nb, TRUE);
- */
- }
- MPIU_ERR_CHKANDJUMP2(FAILED(hr),
- mpi_errno, MPI_ERR_OTHER, "**nd_read", "**nd_read %s %d",
- _com_error(hr).ErrorMessage(), hr);
-
- MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Performing RDMA read for %d bytes", conn_hnd->zcp_recv_sge.Length));
- MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Using remote mem descriptor : base = %p, length=%I64d, token=%d",
- _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Base), _byteswap_uint64(conn_hnd->zcp_msg_recv_mw.mw_data.Length),
- conn_hnd->zcp_msg_recv_mw.mw_data.Token));
-
- hr = conn_hnd->p_ep->Read(GET_PNDRESULT_FROM_MSGBUF(pzcp_msg), &(conn_hnd->zcp_recv_sge), 1,
- &(conn_hnd->zcp_msg_recv_mw.mw_data), 0, 0x0);
- MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
- mpi_errno, MPI_ERR_OTHER, "**nd_read", "**nd_read %s %d",
- _com_error(hr).ErrorMessage(), hr);
-
break;
case MPID_NEM_ND_RD_ACK_PKT:
MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Received RD ACK pkt");
+ MPID_Nem_nd_dev_hnd_g->npending_rds--; conn_hnd->npending_rds--;
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "dev prds = %d; conn prds = %d",
+ MPID_Nem_nd_dev_hnd_g->npending_rds, conn_hnd->npending_rds));
+
/* Get the send credits for conn */
MPIU_Assert(udata_len == 0);
/* Save the credits in the RD ack pkt */
- /* conn_hnd->zcp_credits = pmsg->hdr.credits; */
/* Invalidate/unbind the address */
+
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Trying to invalidate MW [%p]",
+ conn_hnd->zcp_send_mw));
+
SET_MSGBUF_HANDLER(pmsg, zcp_mw_invalidate_success_handler, gen_recv_fail_handler);
- hr = conn_hnd->p_ep->Invalidate(GET_PNDRESULT_FROM_MSGBUF(pmsg), conn_hnd->zcp_send_mw, 0x0);
+ hr = conn_hnd->p_ep->Invalidate(GET_PNDRESULT_FROM_MSGBUF(pmsg),
+ conn_hnd->zcp_send_mw, ND_OP_FLAG_READ_FENCE);
MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
mpi_errno, MPI_ERR_OTHER, "**nd_read", "**nd_read %s %d",
_com_error(hr).ErrorMessage(), hr);
+
break;
default:
MPIU_Assert(0);
@@ -2080,33 +3211,33 @@
/* FIXME: We shouldn't block here */
/* Block and complete the connect() */
- mpi_errno = MPID_Nem_nd_block_op_init(&op_hnd, conn_hnd);
+ mpi_errno = MPID_Nem_nd_block_op_init(&op_hnd, 1, conn_hnd, 1);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Complete connect on conn(%p)/block_op(%p) on ov(%p)",
+ conn_hnd, op_hnd, MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(op_hnd)));
+
hr = conn_hnd->p_conn->CompleteConnect(MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(op_hnd));
- if(hr == ND_PENDING){
- /* Manual event */
+ if(SUCCEEDED(hr)){
+ MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_WAIT_LACK);
+ /* Receive is already pre-posted. Set the handlers correctly and wait
+ * for LACK from the other process
+ */
+ pmsg = GET_RECV_SBUF_HEAD(conn_hnd);
+ MPIU_Assert(pmsg != NULL);
+ SET_MSGBUF_HANDLER(pmsg, wait_lack_success_handler, gen_recv_fail_handler);
+
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Setting the wait_lack recv handler for msg_buf = %p", pmsg));
+
+ /* Manual event */
conn_hnd->npending_ops++;
mpi_errno = MPID_Nem_nd_sm_block(op_hnd);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- /*
- hr = MPID_Nem_nd_dev_hnd_g->p_ad->GetOverlappedResult(MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(op_hnd), &nb, TRUE);
- */
}
MPIU_ERR_CHKANDJUMP2(FAILED(hr),
mpi_errno, MPI_ERR_OTHER, "**nd_connect", "**nd_connect %s %d",
_com_error(hr).ErrorMessage(), hr);
- MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_WAIT_LACK);
- /* Receive is already pre-posted. Set the handlers correctly and wait
- * for LACK from the other process
- */
- pmsg = GET_RECV_SBUF_HEAD(conn_hnd);
- MPIU_Assert(pmsg != NULL);
- SET_MSGBUF_HANDLER(pmsg, wait_lack_success_handler, gen_recv_fail_handler);
-
- MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Setting the wait_lack recv handler for msg_buf = %p", pmsg));
-
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_CONNECTING_SUCCESS_HANDLER);
return mpi_errno;
@@ -2171,10 +3302,10 @@
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_QUIESCENT_HANDLER);
conn_hnd = GET_CONNHND_FROM_EX_SEND_OV(send_ov);
- if(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd)){
- MPID_Nem_nd_conn_hnd_finalize(MPID_Nem_nd_dev_hnd_g, &conn_hnd);
- }
+ MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(conn_hnd));
+ MPID_Nem_nd_conn_hnd_finalize(MPID_Nem_nd_dev_hnd_g, &conn_hnd);
+
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_QUIESCENT_HANDLER);
return mpi_errno;
@@ -2202,6 +3333,13 @@
SET_EX_WR_HANDLER(conn_hnd, quiescent_handler, quiescent_handler);
+ /* FIXME: DEREGISTER ALL RECV BUFS HERE ...*/
+ MPIU_Assert(conn_hnd->npending_ops == 0);
+ MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "Posting disconnect on conn(%p)\n", conn_hnd);
+
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Posting disc on conn(%p) on ov(%p)",
+ conn_hnd, MPIU_EX_GET_OVERLAPPED_PTR(&(conn_hnd->send_ov))));
+
hr = conn_hnd->p_conn->Disconnect(MPIU_EX_GET_OVERLAPPED_PTR(&(conn_hnd->send_ov)));
MPIU_ERR_CHKANDJUMP2((hr != ND_PENDING) && FAILED(hr),
mpi_errno, MPI_ERR_OTHER, "**nd_disc", "**nd_disc %s %d",
@@ -2215,6 +3353,7 @@
goto fn_exit;
}
+
#undef FUNCNAME
#define FUNCNAME block_op_handler
#undef FCNAME
@@ -2229,13 +3368,21 @@
hnd = CONTAINING_RECORD(ov, MPID_Nem_nd_block_op_hnd_, ex_ov);
MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(hnd->conn_hnd));
- /* Handle manual event completion */
- hnd->conn_hnd->npending_ops--;
- MPID_Nem_nd_block_op_finalize(&hnd);
+ if(hnd->npending_ops == 0){
+ MPID_Nem_nd_block_op_finalize(&hnd);
+ }
+ else{
+ mpi_errno = MPID_Nem_nd_block_op_reinit(hnd);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
+ fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_BLOCK_OP_HANDLER);
return mpi_errno;
+ fn_fail:
+ MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+ goto fn_exit;
}
#undef FUNCNAME
@@ -2252,15 +3399,56 @@
hnd = CONTAINING_RECORD(ov, MPID_Nem_nd_block_op_hnd_, ex_ov);
MPIU_Assert(MPID_NEM_ND_CONN_HND_IS_VALID(hnd->conn_hnd));
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Manual event handler on conn(%p)/block_op(%p) on ov(%p)",
+ hnd->conn_hnd, hnd, ov));
+
+ MPIU_Assert(hnd->npending_ops > 0);
+ /* FIXME: At least for now both block op and conn have same number of pending ops */
+ MPIU_Assert(hnd->conn_hnd->npending_ops > 0);
+ /* Note that we might want to wait only for one blocking op on a conn, conn_hnd->npending_ops, but have two
+ * blocking ops on the blocking op, e.g., registering memory before a connect, etc.
+ */
/* Handle manual event completion */
- hnd->conn_hnd->npending_ops--;
+ hnd->npending_ops--;
+ if(hnd->conn_hnd->npending_ops > 0){
+ hnd->conn_hnd->npending_ops--;
+ }
MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "[%d] manual events pending", hnd->conn_hnd->npending_ops);
+ if(hnd->npending_ops == 0){
+ MPID_Nem_nd_block_op_finalize(&hnd);
+ }
+ else{
+ mpi_errno = MPID_Nem_nd_block_op_reinit(hnd);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
+
+ fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_MANUAL_EVENT_HANDLER);
return mpi_errno;
+ fn_fail:
+ MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
+ goto fn_exit;
}
+#undef FUNCNAME
+#define FUNCNAME dummy_handler
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int __cdecl dummy_handler(MPIU_EXOVERLAPPED *ov)
+{
+ int mpi_errno = MPI_SUCCESS;
+ MPID_Nem_nd_block_op_hnd_t hnd;
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_DUMMY_HANDLER);
+
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_DUMMY_HANDLER);
+
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_DUMMY_HANDLER);
+
+ return mpi_errno;
+}
+
/* The caller is responsible for freeing the pg info buffer allocated by
* this function
*/
@@ -2314,44 +3502,7 @@
MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
goto fn_exit;
}
-/* FIXME - Remove
-#undef FUNCNAME
-#define FUNCNAME MPID_Nem_nd_resolve_remote_addr
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_resolve_remote_addr(MPID_Nem_nd_conn_hnd_t conn_hnd,
- struct sockaddr *punresolved_sin,
- int unresolved_sin_len,
- struct sockaddr *presolved_sin,
- int resolved_sin_len)
-{
- int mpi_errno = MPI_SUCCESS, ret, len;
- SOCKET s;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_RESOLVE_REMOTE_ADDR);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_RESOLVE_REMOTE_ADDR);
-
- s = WSASocketW(AF_INET, SOCK_STREAM, 0, NULL, 0, WSA_FLAG_OVERLAPPED);
- MPIU_ERR_CHKANDJUMP2((s == INVALID_SOCKET), mpi_errno, MPI_ERR_OTHER,
- "**sock_create", "**sock_create %s %d",
- MPIU_OSW_Strerror(MPIU_OSW_Get_errno()), MPIU_OSW_Get_errno());
-
- ret = WSAIoctl(s, SIO_ROUTING_INTERFACE_QUERY,
- (void *)punresolved_sin, (DWORD )unresolved_sin_len,
- (void *)presolved_sin, (DWORD )resolved_sin_len, (DWORD *)&len, NULL, NULL);
- MPIU_ERR_CHKANDJUMP2((ret == SOCKET_ERROR), mpi_errno, MPI_ERR_OTHER,
- "**ioctl_socket", "**ioctl_socket %s %d",
- MPIU_OSW_Strerror(MPIU_OSW_Get_errno()), MPIU_OSW_Get_errno());
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_RESOLVE_REMOTE_ADDR);
- return mpi_errno;
- fn_fail:
- MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
- goto fn_exit;
-}
-*/
-
/* Start connecting on the nd conn corresponding to vc
* Prepost recvs before we connect
*/
@@ -2372,21 +3523,28 @@
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_CONN_EST);
+ MPIU_Assert(!MPID_NEM_ND_CONN_HND_IS_INIT(MPID_NEM_ND_VCCH_NETMOD_CONN_HND_GET(vc)));
+
+ /* This should be done first because at least some ops below can block
+ * Setting VC state to CONNECTING prevents dup connect()s
+ */
+ MPID_NEM_ND_VCCH_NETMOD_STATE_SET(vc, MPID_NEM_ND_VC_STATE_CONNECTING);
+
/* Create a conn - The progress engine will keep track of
* this connection.
*/
- mpi_errno = MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_g, MPID_NEM_ND_CONNECT_CONN, NULL, &conn_hnd);
+ /* Set tmp conn info in the VC at init time - This might be required by the accept() side
+ * to mark this conn as an orphan
+ */
+ mpi_errno = MPID_Nem_nd_conn_hnd_init(MPID_Nem_nd_dev_hnd_g, MPID_NEM_ND_CONNECT_CONN, NULL, vc, &conn_hnd);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
MPID_NEM_ND_CONN_STATE_SET(conn_hnd, MPID_NEM_ND_CONN_C_CONNECTING);
- /* Set VC's conn to this conn */
- MPID_NEM_ND_VCCH_NETMOD_CONN_HND_SET(vc, conn_hnd);
- /* This connection is related to this vc. If this connection
- * loses in a head to head battle this conn will still point to
- * the vc, however the vc will no longer point to this conn
- * making this conn an ORPHAN CONN
+
+ /* Save the vc info in the conn
+ * Set conn info in vc & vc info in conn after we receive LACK
*/
- conn_hnd->vc = vc;
+ conn_hnd->tmp_vc = vc;
/* We don't handle dynamic conns yet - no tmp vcs*/
MPIU_Assert(vc->pg != NULL);
@@ -2426,7 +3584,10 @@
* successful ?
*/
MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Connecting to %s:%d(pg_rank=%d, pg_id_len=%d)", ifname, sin.sin_port, ((MPID_Nem_nd_pg_info_hdr_t *)pg_info)->pg_rank, ((MPID_Nem_nd_pg_info_hdr_t *)pg_info)->pg_id_len));
-
+
+ MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "Posting connect on conn(%p) on ov(%p)",
+ conn_hnd, MPIU_EX_GET_OVERLAPPED_PTR(&(conn_hnd->send_ov))));
+
hr = conn_hnd->p_conn->Connect(conn_hnd->p_ep,
(const struct sockaddr *)&sin, sizeof(struct sockaddr_in),
MPID_NEM_ND_PROT_FAMILY, 0, (void *)pg_info, pg_info_len,
@@ -2453,7 +3614,7 @@
#define FUNCNAME MPID_Nem_nd_sm_poll
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_Nem_nd_sm_poll(void )
+int MPID_Nem_nd_sm_poll(int in_blocking_poll)
{
int mpi_errno = MPI_SUCCESS;
BOOL wait_for_event_and_status = FALSE;
@@ -2461,7 +3622,7 @@
static int num_skip_polls = 0;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_ND_SM_POLL);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_POLL);
+ /* MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_ND_SM_POLL); */
/* ND progress */
if(num_skip_polls++ < MPID_NEM_ND_SM_SKIP_POLL){
goto fn_exit;
@@ -2474,8 +3635,8 @@
/* Reset event completion status */
status = FALSE;
/* On return, if (wait_for_event_and_status == FALSE) then
- * there are no more events in ND Cq
- */
+ * there are no more events in ND Cq
+ */
mpi_errno = MPID_Nem_nd_process_completions(MPID_Nem_nd_dev_hnd_g->p_cq, &status);
if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
}while(status == TRUE);
@@ -2488,7 +3649,7 @@
}while(wait_for_event_and_status == TRUE);
fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_POLL);
+ /* MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_POLL); */
return mpi_errno;
fn_fail:
MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "failed, mpi_errno = %d", mpi_errno);
@@ -2518,35 +3679,37 @@
/* We need to check conn_hnd status even if block op becomes invalid */
conn_hnd = op_hnd->conn_hnd;
- /* Currently only blocking on pending nd ops */
- while(conn_hnd->npending_ops > 0){
+ MPIU_Assert(conn_hnd->npending_ops == 1);
+ /* MPIU_Assert(op_hnd->npending_ops == 1); */
+ /* Currently only blocking on pending ex ops */
+ /*
+ while(conn_hnd->npending_ops > 0){
HRESULT hr;
SIZE_T nb=0;
- /* Wait for an event */
hr = MPID_Nem_nd_dev_hnd_g->p_cq->GetOverlappedResult(MPID_NEM_ND_BLOCK_OP_GET_OVERLAPPED_PTR(op_hnd), &nb, TRUE);
MPIU_ERR_CHKANDJUMP(FAILED(hr), mpi_errno, MPI_ERR_OTHER, "**intern");
- /* Process the completed event */
- status = FALSE;
- mpi_errno = MPID_Nem_nd_process_completions(MPID_Nem_nd_dev_hnd_g->p_cq, &status);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ do{
+ status = FALSE;
- if(status == FALSE){
- /* No event on CQ - We must be blocking on a manual event */
- status = FALSE;
- mpi_errno = MPIU_ExProcessCompletions(MPID_Nem_nd_exset_hnd_g, &status);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ mpi_errno = MPID_Nem_nd_process_completions(MPID_Nem_nd_dev_hnd_g->p_cq, &status);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- MPIU_Assert(status == TRUE);
- }
-
- if(conn_hnd->npending_ops > 0){
- /* Re-initialize block op */
- mpi_errno = MPID_Nem_nd_block_op_reinit(op_hnd);
- if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
- }
+ if(status == FALSE){
+ mpi_errno = MPIU_ExProcessCompletions(MPID_Nem_nd_exset_hnd_g, &status);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ }
+ }while(status == TRUE);
}
+ */
+ while(conn_hnd->npending_ops > 0){
+ status = TRUE;
+ mpi_errno = MPIU_ExProcessCompletions(MPID_Nem_nd_exset_hnd_g, &status);
+ if(mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+ /* Since we only support blocking ops on EX - at least one op should complete */
+ MPIU_Assert(status == TRUE);
+ }
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_ND_SM_BLOCK);
Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_sm.h
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_sm.h 2011-02-21 21:00:09 UTC (rev 8002)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/nd/nd_sm.h 2011-02-21 22:57:52 UTC (rev 8003)
@@ -31,6 +31,13 @@
#define MPID_NEM_ND_IS_FC_PKT(pkt_type) ((pkt_type != MPID_NEM_ND_CRED_PKT) && \
(pkt_type != MPID_NEM_ND_RD_AVAIL_PKT) && \
(pkt_type != MPID_NEM_ND_RD_ACK_PKT))
+
+typedef enum{
+ MPID_NEM_ND_SR_PACK=0,
+ MPID_NEM_ND_ZCP_PACK,
+ MPID_NEM_ND_INVALID_PACK
+} MPID_Nem_nd_pack_t;
+
/* We use a simple cookie to make sure that the connection
* is an MPICH2 nd connection
*/
@@ -81,6 +88,7 @@
}while(0)
#define MSGBUF_FREEQ_IS_EMPTY(_conn_hnd) (_conn_hnd->ssbuf_freeq.nbuf == 0)
#define MSGBUF_FREEQ_DEQUEUE(_conn_hnd, _pmsg_buf) do{\
+ MPIU_Assert(!MSGBUF_FREEQ_IS_EMPTY(_conn_hnd)); \
_pmsg_buf = &(_conn_hnd->ssbuf[_conn_hnd->ssbuf_freeq.head].msg); \
(_pmsg_buf)->hdr.type = MPID_NEM_ND_INVALID_PKT; \
(_pmsg_buf)->hdr.credits = 0; \
More information about the mpich2-commits
mailing list