[mpich2-commits] r6648 - in mpich2/trunk/src: mpi/comm mpi/errhan mpid/ch3/channels/nemesis/include mpid/ch3/channels/nemesis/nemesis/netmod/tcp mpid/ch3/channels/nemesis/src

buntinas at mcs.anl.gov buntinas at mcs.anl.gov
Fri May 14 15:45:02 CDT 2010


Author: buntinas
Date: 2010-05-14 15:45:02 -0500 (Fri, 14 May 2010)
New Revision: 6648

Modified:
   mpich2/trunk/src/mpi/comm/comm_split.c
   mpich2/trunk/src/mpi/errhan/errnames.txt
   mpich2/trunk/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_impl.h
   mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/socksm.c
   mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/socksm.h
   mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_impl.h
   mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_init.c
   mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_send.c
   mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_utility.c
   mpich2/trunk/src/mpid/ch3/channels/nemesis/src/ch3_init.c
Log:
Added robust error handling to the tcp module.  The state machine should now be left in a consistent state in the event of an error.  As part of this commit, the QUIESCENT state was removed from the state machine; instead, the SC goes to the CLOSED state and is freed.

Modified: mpich2/trunk/src/mpi/comm/comm_split.c
===================================================================
--- mpich2/trunk/src/mpi/comm/comm_split.c	2010-05-14 20:39:19 UTC (rev 6647)
+++ mpich2/trunk/src/mpi/comm/comm_split.c	2010-05-14 20:45:02 UTC (rev 6648)
@@ -172,8 +172,9 @@
     }
     /* Gather information on the local group of processes */
     MPIR_Nest_incr();
-    NMPI_Allgather( MPI_IN_PLACE, 2, MPI_INT, table, 2, MPI_INT, local_comm );
+    mpi_errno = NMPI_Allgather( MPI_IN_PLACE, 2, MPI_INT, table, 2, MPI_INT, local_comm );
     MPIR_Nest_decr();
+    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
     /* Step 2: How many processes have our same color? */
     new_size = 0;
@@ -219,10 +220,11 @@
 	   same color */
 	mypair.color = color;
 	mypair.key   = key;
-	NMPI_Allgather( &mypair, 2, MPI_INT, remotetable, 2, MPI_INT,
-			comm );
+	mpi_errno = NMPI_Allgather( &mypair, 2, MPI_INT, remotetable, 2, MPI_INT,
+                                    comm );
 	MPIR_Nest_decr();
-
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        
 	/* Each process can now match its color with the entries in the table */
 	new_remote_size = 0;
 	last_ptr = &first_remote_entry;
@@ -266,14 +268,16 @@
 				       0, 0, comm, MPI_STATUS_IGNORE );
 	    if (mpi_errno) { MPIU_ERR_POP( mpi_errno ); }
             MPIR_Nest_incr();
-	    NMPI_Bcast( &remote_context_id, 1, MPIR_CONTEXT_ID_T_DATATYPE, 0, local_comm );
+	    mpi_errno = NMPI_Bcast( &remote_context_id, 1, MPIR_CONTEXT_ID_T_DATATYPE, 0, local_comm );
             MPIR_Nest_decr();
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 	}
 	else {
 	    /* Broadcast to the other members of the local group */
             MPIR_Nest_incr();
-	    NMPI_Bcast( &remote_context_id, 1, MPIR_CONTEXT_ID_T_DATATYPE, 0, local_comm );
+	    mpi_errno = NMPI_Bcast( &remote_context_id, 1, MPIR_CONTEXT_ID_T_DATATYPE, 0, local_comm );
             MPIR_Nest_decr();
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 	}
     }
 

Modified: mpich2/trunk/src/mpi/errhan/errnames.txt
===================================================================
--- mpich2/trunk/src/mpi/errhan/errnames.txt	2010-05-14 20:39:19 UTC (rev 6647)
+++ mpich2/trunk/src/mpi/errhan/errnames.txt	2010-05-14 20:45:02 UTC (rev 6648)
@@ -857,6 +857,12 @@
 **sem_destroy %s:sem_destroy() failed %s
 
 **comm_fail:Communication error
+**comm_fail %d:Communication error with rank %d
+**exceeded_connect_tries:Unable to establish connection to process
+**exceeded_connect_tries %d:Unable to establish connection to rank %d
+**vc_in_error_state:Connection is in error state
+**tcp_cleanup_fail:Error while cleaning up failed connection
+**tmpvc_connect_fail:Failure during connection protocol
 
 **blcr_mod:BLCR kernel module not present
 

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_impl.h
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_impl.h	2010-05-14 20:39:19 UTC (rev 6647)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_impl.h	2010-05-14 20:45:02 UTC (rev 6648)
@@ -114,7 +114,6 @@
 struct MPID_nem_lmt_shm_wait_element;
 struct MPIDI_CH3_PktGeneric;
 
-typedef enum{MPID_NEM_VC_STATE_CONNECTED, MPID_NEM_VC_STATE_DISCONNECTED} MPIDI_Nem_vc_state_t;
 typedef struct MPIDI_CH3I_VC
 {
     int pg_rank;
@@ -140,8 +139,6 @@
     struct MPIDI_VC *next;
     struct MPIDI_VC *prev;
 
-    MPIDI_Nem_vc_state_t state;
-
     /* contig function pointers.  Netmods should set these. */
     /* iStartContigMsg -- sends a message consisting of a header (hdr) and contiguous data (data), possibly of 0 size.  If the
        message cannot be sent immediately, the function should create a request and return a pointer in sreq_ptr.  The network

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/socksm.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/socksm.c	2010-05-14 20:39:19 UTC (rev 6647)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/socksm.c	2010-05-14 20:45:02 UTC (rev 6648)
@@ -87,7 +87,7 @@
         (sc)->vc = NULL;                        \
         (sc)->pg_is_set = FALSE;                \
         (sc)->is_tmpvc = FALSE;                 \
-        (sc)->state.cstate = CONN_STATE_TS_CLOSED; \
+        CHANGE_STATE(sc, CONN_STATE_TS_CLOSED); \
     } while (0)
 
 #define INIT_POLLFD_ENTRY(plfd)                               \
@@ -123,6 +123,7 @@
 
 static int find_free_entry(int *index);
 static int cleanup_and_free_sc_plfd(sockconn_t *const sc);
+static int error_closed(struct MPIDI_VC *const vc);
 
 #undef FUNCNAME
 #define FUNCNAME alloc_sc_plfd_tbls
@@ -563,7 +564,6 @@
     /* The other side closed this connection (hopefully as part of a
        head-to-head resolution. */
     if (0 == nread) {
-        CHANGE_STATE(sc, CONN_STATE_TS_D_QUIESCENT);
         *got_sc_eof = 1;
         goto fn_exit;
     }
@@ -637,7 +637,7 @@
         vc_tcp = VC_TCP(vc);
         
         MPIDI_VC_Init(vc, NULL, 0);
-        ((MPIDI_CH3I_VC *)vc->channel_private)->state = MPID_NEM_TCP_VC_STATE_CONNECTED; /* FIXME: is it needed ? */
+        vc_tcp->state = MPID_NEM_TCP_VC_STATE_CONNECTED; /* FIXME: is it needed ? */
         sc->vc = vc;
         MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "about to incr sc_ref_count sc=%p sc->vc=%p sc_ref_count=%d", sc, sc->vc, vc_tcp->sc_ref_count));
         ++vc_tcp->sc_ref_count;
@@ -673,10 +673,6 @@
     goto fn_exit;
 }
 
-#define send_cmd_pkt(fd_, pkt_type_) ( \
-    send_cmd_pkt_func(fd_, pkt_type_) \
-)
-
 /*
   This function is used to send commands that don't have data but just only
   the header.
@@ -685,7 +681,7 @@
 #define FUNCNAME send_cmd_pkt
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int send_cmd_pkt_func(int fd, MPIDI_nem_tcp_socksm_pkt_type_t pkt_type)
+static int send_cmd_pkt(int fd, MPIDI_nem_tcp_socksm_pkt_type_t pkt_type)
 {
     int mpi_errno = MPI_SUCCESS, offset;
     MPIDI_nem_tcp_header_t pkt;
@@ -757,12 +753,14 @@
 }
 
 
-
+/* Initiates connection protocol to remote process.  It's OK to call
+   this function with the VC in an error state; an appropriate error
+   code will be returned. */
 #undef FUNCNAME
 #define FUNCNAME MPID_nem_tcp_connect
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_tcp_connect(struct MPIDI_VC *const vc) 
+int MPID_nem_tcp_connect(struct MPIDI_VC *const vc)
 {
     MPID_nem_tcp_vc_area *const vc_tcp = VC_TCP(vc);
     sockconn_t *sc = NULL;
@@ -778,26 +776,40 @@
     MPIU_Assert(vc != NULL);
 
     /* We have an active connection, start polling more often */
-    MPID_nem_tcp_skip_polls = MAX_SKIP_POLLS_ACTIVE;    
+    MPID_nem_tcp_skip_polls = MAX_SKIP_POLLS_ACTIVE;
         
     MPIDI_CHANGE_VC_STATE(vc, ACTIVE);
 
-    if (((MPIDI_CH3I_VC *)vc->channel_private)->state == MPID_NEM_TCP_VC_STATE_DISCONNECTED) {
+    if (vc_tcp->state == MPID_NEM_TCP_VC_STATE_DISCONNECTED) {
         struct sockaddr_in *sock_addr;
 	struct in_addr addr;
         int rc = 0;
 
+        if (vc_tcp->connect_retry_count > MPIDI_NEM_TCP_MAX_CONNECT_RETRIES) {
+            int mpi_errno2 = MPI_SUCCESS;
+            MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "exceeded retries, closing sc");
+            mpi_errno2 = error_closed(vc);
+            if (mpi_errno2) {
+                MPIU_ERR_SET(mpi_errno2, MPI_ERR_OTHER, "**tcp_cleanup_fail");
+                if (mpi_errno2) MPIU_ERR_ADD(mpi_errno, mpi_errno2);
+            }
+            MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**exceeded_connect_tries", "**exceeded_connect_tries %d", vc->pg_rank);
+            goto fn_fail;
+        }
+        
+        ++vc_tcp->connect_retry_count;
+
         MPIU_Assert(vc_tcp->sc == NULL);
         mpi_errno = find_free_entry(&index);
         if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP (mpi_errno);
 
         sc = &g_sc_tbl[index];
-        plfd = &MPID_nem_tcp_plfd_tbl[index];        
+        plfd = &MPID_nem_tcp_plfd_tbl[index];
 
-        /* FIXME:  
+        /* FIXME:
            We need to set addr and port using bc.
            If a process is dynamically spawned, vc->pg is NULL.
-           In that case, same procedure is done 
+           In that case, same procedure is done
            in MPID_nem_tcp_connect_to_root()
         */
         if (vc->pg != NULL) { /* VC is not a temporary one */
@@ -842,7 +854,7 @@
         if (mpi_errno) MPIU_ERR_POP (mpi_errno);
 
         MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "connecting to 0x%08X:%d", sock_addr->sin_addr.s_addr, sock_addr->sin_port));
-        rc = connect(sc->fd, (SA*)sock_addr, sizeof(*sock_addr)); 
+        rc = connect(sc->fd, (SA*)sock_addr, sizeof(*sock_addr));
         /* connect should not be called with CHECK_EINTR macro */
         if (rc < 0 && errno != EINPROGRESS) {
             MPIDU_FTB_COMMERR(rc == ENETUNREACH ? MPIDU_FTB_EV_UNREACHABLE : MPIDU_FTB_EV_COMMUNICATION, vc);
@@ -856,8 +868,7 @@
             CHANGE_STATE(sc, CONN_STATE_TC_C_CNTING);
         }
         
-/*         sc->handler = sc_state_info[sc->state.cstate].sc_state_handler; */
-        ((MPIDI_CH3I_VC *)vc->channel_private)->state = MPID_NEM_TCP_VC_STATE_CONNECTED;
+        vc_tcp->state = MPID_NEM_TCP_VC_STATE_CONNECTED;
         sc->pg_rank = vc->pg_rank;
 
         if (vc->pg != NULL) { /* normal (non-dynamic) connection */
@@ -883,7 +894,7 @@
         MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "about to incr sc_ref_count sc=%p sc->vc=%p sc_ref_count=%d", sc, sc->vc, vc_tcp->sc_ref_count));
         ++vc_tcp->sc_ref_count;
     }
-    else if (((MPIDI_CH3I_VC *)vc->channel_private)->state == MPID_NEM_TCP_VC_STATE_CONNECTED) {
+    else if (vc_tcp->state == MPID_NEM_TCP_VC_STATE_CONNECTED) {
         sc = vc_tcp->sc;
         MPIU_Assert(sc != NULL);
         /* Do nothing here, the caller just needs to wait for the connection
@@ -892,7 +903,7 @@
            resolution. */
     }
     else {
-        MPIU_Assert(0);
+        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**vc_in_error_state");
     }
 
  fn_exit:
@@ -901,19 +912,16 @@
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_TCP_CONNECT);
     return mpi_errno;
  fn_fail:
-    if (index != -1) {
-        int cleanup_error = MPI_SUCCESS;
-        cleanup_error = cleanup_and_free_sc_plfd(&g_sc_tbl[index]);
-        if (cleanup_error) MPIU_ERR_SET(cleanup_error, MPI_ERR_OTHER, "**fail");
-    }
+    if (index != -1)
+        cleanup_and_free_sc_plfd(&g_sc_tbl[index]); /* ignore error return */
     MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "failure. mpi_errno = %d", mpi_errno));
     goto fn_exit;
 }
 
 /* Called to transition an sc to CLOSED, and free associated
    resources.  This might be done as part of a ch3 close protocol,
-   because the sc is in a quiescent state, or becaues there was an
-   error associated with the connection. */
+   because the sc lost in a head-to-head connection, or because there
+   was an error associated with the connection. */
 #undef FUNCNAME
 #define FUNCNAME cleanup_and_free_sc_plfd
 #undef FCNAME
@@ -949,12 +957,12 @@
             MPIDU_FTB_COMMERR(MPIDU_FTB_EV_COMMUNICATION, sc_vc);
         else
             MPIDU_Ftb_publish(MPIDU_FTB_EV_COMMUNICATION, "");
-        MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**close", "**close %s", strerror(errno));
+        MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**close", "**close %s", strerror(errno));
     }
     
     if (sc_vc && sc_vc_tcp->sc == sc) /* this vc may be connecting/accepting with another sc e.g., this sc lost the tie-breaker */
     {
-        ((MPIDI_CH3I_VC *)sc_vc->channel_private)->state = MPID_NEM_TCP_VC_STATE_DISCONNECTED;
+        sc_vc_tcp->state = MPID_NEM_TCP_VC_STATE_DISCONNECTED;
         ASSIGN_SC_TO_VC(sc_vc_tcp, NULL);
     }
 
@@ -1044,10 +1052,8 @@
         CHANGE_STATE(sc, CONN_STATE_TC_C_CNTD);
     }
     else if (status == MPID_NEM_TCP_SOCK_ERROR_EOF) {
-        MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "state_tc_c_cnting_handler(): changing to "
-              "quiescent"));
-        CHANGE_STATE(sc, CONN_STATE_TS_D_QUIESCENT);
-        /* FIXME: retry 'n' number of retries before signalling an error to VC layer. */
+        MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "state_tc_c_cnting_handler(): closing sc");
+        mpi_errno = cleanup_and_free_sc_plfd(sc); /* QUIESCENT */
     }
     else { /* status == MPID_NEM_TCP_SOCK_NOEVENT */
         /*
@@ -1078,9 +1084,8 @@
     MPIDI_FUNC_ENTER(MPID_STATE_STATE_TC_C_CNTD_HANDLER);
 
     if (found_better_sc(sc, NULL)) {
-        MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "state_tc_c_cntd_handler(): changing to "
-              "quiescent"));
-        CHANGE_STATE(sc, CONN_STATE_TS_D_QUIESCENT);
+        MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "state_tc_c_cntd_handler(): closing sc");
+        mpi_errno = cleanup_and_free_sc_plfd(sc); /* QUIESCENT */
         goto fn_exit;
     }
     
@@ -1088,7 +1093,12 @@
         MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "inside if (IS_WRITEABLE(plfd))");
         if (!sc->is_tmpvc) { /* normal connection */
             mpi_errno = send_id_info(sc);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) {
+                MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "state_tc_c_cntd_handler(): closing sc");
+                mpi_errno = cleanup_and_free_sc_plfd(sc); /* QUIESCENT */
+                mpi_errno = MPI_SUCCESS; /* don't return an error if we fail in the connect protocol, just retry. */
+                goto fn_exit;
+            }
 
             CHANGE_STATE(sc, CONN_STATE_TC_C_RANKSENT);
         }
@@ -1107,6 +1117,7 @@
     return mpi_errno;
  fn_fail:
     MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "failure. mpi_errno = %d", mpi_errno));
+    mpi_errno = cleanup_and_free_sc_plfd(sc); /* QUIESCENT */
     goto fn_exit;
 }
 
@@ -1126,33 +1137,33 @@
 
     if (IS_READABLE(plfd)) {
         mpi_errno = recv_cmd_pkt(sc->fd, &pkt_type);
-        if (mpi_errno != MPI_SUCCESS) {
-            MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "state_c_ranksent_handler() 1: changing to "
-              "quiescent.. "));
-            CHANGE_STATE(sc, CONN_STATE_TS_D_QUIESCENT);
-            if (vc_is_in_shutdown(sc_vc)) {
-                mpi_errno = MPI_SUCCESS;
-            }
-        }
-        else {
-            MPIU_Assert(pkt_type == MPIDI_NEM_TCP_SOCKSM_PKT_ID_ACK ||
-                        pkt_type == MPIDI_NEM_TCP_SOCKSM_PKT_ID_NAK);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        
+        MPIU_Assert(pkt_type == MPIDI_NEM_TCP_SOCKSM_PKT_ID_ACK ||
+                    pkt_type == MPIDI_NEM_TCP_SOCKSM_PKT_ID_NAK);
 
-            if (pkt_type == MPIDI_NEM_TCP_SOCKSM_PKT_ID_ACK) {
-                CHANGE_STATE(sc, CONN_STATE_TS_COMMRDY);
-                ASSIGN_SC_TO_VC(sc_vc_tcp, sc);
+        if (pkt_type == MPIDI_NEM_TCP_SOCKSM_PKT_ID_ACK) {
+            CHANGE_STATE(sc, CONN_STATE_TS_COMMRDY);
+            ASSIGN_SC_TO_VC(sc_vc_tcp, sc);
 
-                MPID_nem_tcp_conn_est (sc_vc);
-                MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "c_ranksent_handler(): connection established (sc=%p, sc->vc=%p, fd=%d)", sc, sc->vc, sc->fd));
-            }
-            else { /* pkt_type must be MPIDI_NEM_TCP_SOCKSM_PKT_ID_NAK */
-                CHANGE_STATE(sc, CONN_STATE_TS_D_QUIESCENT);
-            }
+            MPID_nem_tcp_conn_est (sc_vc);
+            sc_vc_tcp->connect_retry_count = 0; /* successfully connected, reset connection retry count */
+            MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "c_ranksent_handler(): connection established (sc=%p, sc->vc=%p, fd=%d)", sc, sc->vc, sc->fd));
         }
+        else { /* pkt_type must be MPIDI_NEM_TCP_SOCKSM_PKT_ID_NAK */
+            MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "received NAK, closing sc");
+            mpi_errno = cleanup_and_free_sc_plfd(sc); /* QUIESCENT */
+        }
     }
 
+fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_STATE_C_RANKSENT_HANDLER);
     return mpi_errno;
+fn_fail:
+    MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "failure. mpi_errno = %d", mpi_errno));
+    mpi_errno = cleanup_and_free_sc_plfd(sc); /* QUIESCENT */
+    mpi_errno = MPI_SUCCESS; /* don't return an error if we fail in the connect protocol, just retry. */
+    goto fn_exit;
 }
 
 #undef FUNCNAME
@@ -1169,11 +1180,17 @@
 
     MPIDI_FUNC_ENTER(MPID_STATE_STATE_C_TMPVCSENT_HANDLER);
 
-
     if (IS_READABLE(plfd)) {
         mpi_errno = recv_cmd_pkt(sc->fd, &pkt_type);
-        if (mpi_errno != MPI_SUCCESS) {
-            CHANGE_STATE(sc, CONN_STATE_TS_D_QUIESCENT);
+        if (mpi_errno) {
+            int mpi_errno2 = MPI_SUCCESS;
+            MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "error sending cmd pkt, closing sc");
+            mpi_errno2 = cleanup_and_free_sc_plfd(sc); /* QUIESCENT */
+            if (mpi_errno2) {
+                MPIU_ERR_SET(mpi_errno2, MPI_ERR_OTHER, "**tcp_cleanup_fail");
+                if (mpi_errno2) MPIU_ERR_ADD(mpi_errno, mpi_errno2);
+            }
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**tmpvc_connect_fail");
             /* no head-to-head issues to deal with, if we failed to recv the
                packet then there really was a problem */
         }
@@ -1185,13 +1202,15 @@
                 CHANGE_STATE(sc, CONN_STATE_TS_COMMRDY);
                 ASSIGN_SC_TO_VC(sc_vc_tcp, sc);
                 MPID_nem_tcp_conn_est (sc_vc);
+                sc_vc_tcp->connect_retry_count = 0; /* successfully connected, reset connection retry count */
                 MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "c_tmpvcsent_handler(): connection established (fd=%d, sc=%p, sc->vc=%p)", sc->fd, sc, sc_vc));
             }
             else { /* pkt_type must be MPIDI_NEM_TCP_SOCKSM_PKT_ID_NAK */
-                MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "state_c_tmpvcsent_handler() 2: changing to quiescent"));
-                CHANGE_STATE(sc, CONN_STATE_TS_D_QUIESCENT);
+                MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "state_c_tmpvcsent_handler() 2: closing sc"));
+                MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "received NAK, closing sc");
+                mpi_errno = cleanup_and_free_sc_plfd(sc); /* QUIESCENT */
             }
-        }    
+        }
     }
 
     MPIDI_FUNC_EXIT(MPID_STATE_STATE_C_TMPVCSENT_HANDLER);
@@ -1213,9 +1232,8 @@
 
     status = MPID_nem_tcp_check_sock_status(plfd);
     if (status == MPID_NEM_TCP_SOCK_ERROR_EOF) {
-        MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "state_l_cntd_handler() 1: changing to "
-            "quiescent"));
-        CHANGE_STATE(sc, CONN_STATE_TS_D_QUIESCENT);
+        MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "Received EOF, closing sc");
+        mpi_errno = cleanup_and_free_sc_plfd(sc); /* QUIESCENT */
         goto fn_exit;
     }
 
@@ -1224,25 +1242,19 @@
 
     if (IS_READABLE(plfd)) {
         mpi_errno = recv_id_or_tmpvc_info(sc, &got_sc_eof);
-        if (mpi_errno == MPI_SUCCESS) {
-            if (got_sc_eof) {
-                /* recv_id_or_tmpvc already moved the sc to QUIESCENT, just return */
-                goto fn_exit;
-            }
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
-            if (!sc->is_tmpvc) {
-                CHANGE_STATE(sc, CONN_STATE_TA_C_RANKRCVD);
-            }
-            else {
-                CHANGE_STATE(sc, CONN_STATE_TA_C_TMPVCRCVD);
-            }
+        if (got_sc_eof) {
+            MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "got eof, closing sc");
+            mpi_errno = cleanup_and_free_sc_plfd(sc); /* QUIESCENT */
+            goto fn_exit;
         }
+        
+        if (!sc->is_tmpvc) {
+            CHANGE_STATE(sc, CONN_STATE_TA_C_RANKRCVD);
+        }
         else {
-            MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "state_l_cntd_handler() 2: changing to "
-               "quiescent"));
-            CHANGE_STATE(sc, CONN_STATE_TS_D_QUIESCENT);
-
-            MPIU_ERR_POP(mpi_errno);
+            CHANGE_STATE(sc, CONN_STATE_TA_C_TMPVCRCVD);
         }
     }
     else {
@@ -1255,8 +1267,9 @@
     return mpi_errno;
  fn_fail:
     MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "failure. mpi_errno = %d", mpi_errno));
+    mpi_errno = cleanup_and_free_sc_plfd(sc); /* QUIESCENT */
+    mpi_errno = MPI_SUCCESS; /* don't return an error if we fail in the connect protocol, just retry. */
     goto fn_exit;
-
 }
 
 /*
@@ -1311,25 +1324,23 @@
     MPIDI_FUNC_ENTER(MPID_STATE_STATE_L_RANKRCVD_HANDLER);
 
     status = MPID_nem_tcp_check_sock_status(plfd);
-    if (status == MPID_NEM_TCP_SOCK_ERROR_EOF) {
-        MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "state_l_rankrcvd_handler() 1: changing to quiescent"));
-        CHANGE_STATE(sc, CONN_STATE_TS_D_QUIESCENT);
-        goto fn_exit;
-    }
+    if (status == MPID_NEM_TCP_SOCK_ERROR_EOF)
+        goto fn_fail;
+    
     if (found_better_sc(sc, &fnd_sc)) {
         if (fnd_sc->state.cstate == CONN_STATE_TS_COMMRDY)
             snd_nak = TRUE;
         else if (fnd_sc->state.cstate == CONN_STATE_TC_C_RANKSENT)
             snd_nak = do_i_win(sc);
     }
+    
     if (IS_WRITEABLE(plfd)) {
         if (snd_nak) {
-            if (send_cmd_pkt(sc->fd, MPIDI_NEM_TCP_SOCKSM_PKT_ID_NAK) == MPI_SUCCESS) {
-                MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "state_l_rankrcvd_handler() 2: changing to quiescent"));
-                CHANGE_STATE(sc, CONN_STATE_TS_D_QUIESCENT);
-            }
-        }
-        else {
+            mpi_errno = send_cmd_pkt(sc->fd, MPIDI_NEM_TCP_SOCKSM_PKT_ID_NAK);
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "lost head-to-head, closing sc");
+            mpi_errno = cleanup_and_free_sc_plfd(sc); /* QUIESCENT */
+        } else {
             /* The following line is _crucial_ to correct operation.  We need to
              * ensure that all head-to-head resolution has completed before we
              * move to COMMRDY and send any pending messages.  If we don't this
@@ -1339,18 +1350,25 @@
              * accessing PG/VC info that is no longer present. */
             if (sc_vc_tcp->sc_ref_count > 1) goto fn_exit;
 
-            if (send_cmd_pkt(sc->fd, MPIDI_NEM_TCP_SOCKSM_PKT_ID_ACK) == MPI_SUCCESS) {
-                CHANGE_STATE(sc, CONN_STATE_TS_COMMRDY);
-                ASSIGN_SC_TO_VC(sc_vc_tcp, sc);
-		MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "connection established: sc=%p, sc->vc=%p, sc->fd=%d, is_same_pg=%s, pg_rank=%d", sc, sc_vc, sc->fd, (sc->is_same_pg ? "TRUE" : "FALSE"), sc->pg_rank));
-                MPID_nem_tcp_conn_est (sc_vc);
-            }
+            mpi_errno = send_cmd_pkt(sc->fd, MPIDI_NEM_TCP_SOCKSM_PKT_ID_ACK);
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            
+            CHANGE_STATE(sc, CONN_STATE_TS_COMMRDY);
+            ASSIGN_SC_TO_VC(sc_vc_tcp, sc);
+            MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "connection established: sc=%p, sc->vc=%p, sc->fd=%d, is_same_pg=%s, pg_rank=%d", sc, sc_vc, sc->fd, (sc->is_same_pg ? "TRUE" : "FALSE"), sc->pg_rank));
+            MPID_nem_tcp_conn_est (sc_vc);
+            sc_vc_tcp->connect_retry_count = 0; /* successfully connected, reset connection retry count */
         }
     }
 
- fn_exit:
+fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_STATE_L_RANKRCVD_HANDLER);
     return mpi_errno;
+fn_fail:
+    MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "failure. mpi_errno = %d", mpi_errno));
+    mpi_errno = cleanup_and_free_sc_plfd(sc); /* QUIESCENT */
+    mpi_errno = MPI_SUCCESS; /* don't return an error if we fail in the connect protocol, just retry. */
+    goto fn_exit;
 }
 
 #undef FUNCNAME
@@ -1363,36 +1381,42 @@
     MPID_nem_tcp_vc_area *const sc_vc_tcp = VC_TCP(sc_vc);
     int mpi_errno = MPI_SUCCESS;
     MPID_NEM_TCP_SOCK_STATUS_t status;
-    int snd_nak = FALSE;
     MPIDI_STATE_DECL(MPID_STATE_STATE_L_TMPVCRCVD_HANDLER);
 
     MPIDI_FUNC_ENTER(MPID_STATE_STATE_L_TMPVCRCVD_HANDLER);
 
     status = MPID_nem_tcp_check_sock_status(plfd);
     if (status == MPID_NEM_TCP_SOCK_ERROR_EOF) {
-        CHANGE_STATE(sc, CONN_STATE_TS_D_QUIESCENT);
+        MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "Received EOF, closing sc");
+        mpi_errno = cleanup_and_free_sc_plfd(sc); /* QUIESCENT */
         goto fn_exit;
     }
     /* we don't want to perform any h2h resolution for temp vcs */
     if (IS_WRITEABLE(plfd)) {
-        if (snd_nak) {
-            if (send_cmd_pkt(sc->fd, MPIDI_NEM_TCP_SOCKSM_PKT_TMPVC_NAK) == MPI_SUCCESS) {
-                CHANGE_STATE(sc, CONN_STATE_TS_D_QUIESCENT);
-            }
-        }
-        else {
-            if (send_cmd_pkt(sc->fd, MPIDI_NEM_TCP_SOCKSM_PKT_TMPVC_ACK) == MPI_SUCCESS) {
-                CHANGE_STATE(sc, CONN_STATE_TS_COMMRDY);
-                ASSIGN_SC_TO_VC(sc_vc_tcp, sc);
-                MPID_nem_tcp_conn_est(sc_vc);
-                MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "fd=%d: TMPVC_ACK sent, connection established!", sc->fd));
-            }
-        }
+        mpi_errno = send_cmd_pkt(sc->fd, MPIDI_NEM_TCP_SOCKSM_PKT_TMPVC_ACK);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+        CHANGE_STATE(sc, CONN_STATE_TS_COMMRDY);
+        ASSIGN_SC_TO_VC(sc_vc_tcp, sc);
+        MPID_nem_tcp_conn_est(sc_vc);
+        sc_vc_tcp->connect_retry_count = 0; /* successfully connected, reset connection retry count */
+        MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "fd=%d: TMPVC_ACK sent, connection established!", sc->fd));
     }
 
- fn_exit:
+fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_STATE_L_TMPVCRCVD_HANDLER);
     return mpi_errno;
+ fn_fail:
+    {
+        int cleanup_errno = MPI_SUCCESS;
+        MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "failure. mpi_errno = %d", mpi_errno));
+        mpi_errno = cleanup_and_free_sc_plfd(sc); /* QUIESCENT */
+        if (cleanup_errno) {
+            MPIU_ERR_SET(cleanup_errno, MPI_ERR_OTHER, "**tcp_cleanup_fail");
+            MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+        }
+        goto fn_exit;
+    }
 }
 
 #undef FUNCNAME
@@ -1433,19 +1457,18 @@
                        disconnecting, then we end up with a potential race where
                        the other side performs a tcp close() before we do and we
                        blow up here. */
-                    CHANGE_STATE(sc, CONN_STATE_TS_D_QUIESCENT);
+                    MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "other side closed, but we're shutting down, closing sc");
+                    mpi_errno = cleanup_and_free_sc_plfd(sc); /* QUIESCENT */
                     goto fn_exit;
                 }
                 else
                 {
-                    MPIDU_FTB_COMMERR(MPIDU_FTB_EV_COMMUNICATION, sc_vc);
                     MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "ERROR: sock (fd=%d) is closed: bytes_recvd == 0", sc->fd );
                     MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
                 }
             }
             else
             {
-                MPIDU_FTB_COMMERR(MPIDU_FTB_EV_COMMUNICATION, sc_vc);
                 MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**read", "**read %s", strerror(errno));
             }
         }
@@ -1453,7 +1476,7 @@
         MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "New recv " MPIDI_MSG_SZ_FMT " (fd=%d, vc=%p, sc=%p)", bytes_recvd, sc->fd, sc_vc, sc));
 
         mpi_errno = MPID_nem_handle_pkt(sc_vc, recv_buf, bytes_recvd);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno) MPIU_ERR_POP_LABEL(mpi_errno, fn_noncomm_fail);
     }
     else
     {
@@ -1515,7 +1538,7 @@
             int complete = 0;
                 
             mpi_errno = reqFn(sc_vc, rreq, &complete);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno) MPIU_ERR_POP_LABEL(mpi_errno, fn_noncomm_fail);
 
             if (complete)
             {
@@ -1529,11 +1552,23 @@
         }
     }
 
- fn_exit:
+fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_TCP_RECV_HANDLER);
     return mpi_errno;
- fn_fail:
+fn_fail: /* comm related failures jump here */
+    {
+        int cleanup_errno = MPI_SUCCESS;
+        MPIDU_FTB_COMMERR(MPIDU_FTB_EV_COMMUNICATION, sc_vc);
+        cleanup_errno = MPID_nem_tcp_cleanup_on_error(sc_vc); /* QUIESCENT */
+        if (cleanup_errno) {
+            MPIU_ERR_SET(cleanup_errno, MPI_ERR_OTHER, "**tcp_cleanup_fail");
+            MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+        }
+        MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", sc_vc->pg_rank);
+    }
+fn_noncomm_fail: /* NON-comm related failures jump here */
     goto fn_exit;
+    
 }
 
 #undef FUNCNAME
@@ -1568,28 +1603,6 @@
 }
 
 #undef FUNCNAME
-#define FUNCNAME state_d_quiescent_handler
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int state_d_quiescent_handler(struct pollfd *const plfd, sockconn_t *const sc)
-{
-    int mpi_errno = MPI_SUCCESS;
-    MPIDI_STATE_DECL(MPID_STATE_STATE_D_QUIESCENT_HANDLER);
-
-    MPIDI_FUNC_ENTER(MPID_STATE_STATE_D_QUIESCENT_HANDLER);
-
-    mpi_errno = cleanup_and_free_sc_plfd(sc);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
- fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_STATE_D_QUIESCENT_HANDLER);
-    return mpi_errno;
- fn_fail:
-    MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "failure. mpi_errno = %d", mpi_errno));
-    goto fn_exit;
-}
-
-#undef FUNCNAME
 #define FUNCNAME MPID_nem_tcp_sm_init
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
@@ -1607,7 +1620,6 @@
     sc_state_info[CONN_STATE_TA_C_RANKRCVD].sc_state_handler = state_l_rankrcvd_handler;
     sc_state_info[CONN_STATE_TA_C_TMPVCRCVD].sc_state_handler = state_l_tmpvcrcvd_handler;
     sc_state_info[CONN_STATE_TS_COMMRDY].sc_state_handler = state_commrdy_handler;
-    sc_state_info[CONN_STATE_TS_D_QUIESCENT].sc_state_handler = state_d_quiescent_handler;
 
     /* Set the appropriate states */
     sc_state_info[CONN_STATE_TS_CLOSED].sc_state_plfd_events = 0;
@@ -1619,7 +1631,6 @@
     sc_state_info[CONN_STATE_TA_C_RANKRCVD].sc_state_plfd_events = POLLOUT | POLLIN;
     sc_state_info[CONN_STATE_TA_C_TMPVCRCVD].sc_state_plfd_events = POLLOUT | POLLIN;
     sc_state_info[CONN_STATE_TS_COMMRDY].sc_state_plfd_events = POLLIN;
-    sc_state_info[CONN_STATE_TS_D_QUIESCENT].sc_state_plfd_events = POLLOUT | POLLIN;
 
     /* Allocate the PLFD table */
     g_tbl_size = 0;
@@ -1704,20 +1715,28 @@
         {
             /* We could check for POLLHUP here, but HUP/HUP+EOF is not erroneous
              * on many platforms, including modern Linux. */
-            if (it_plfd->revents & POLLERR) {
-                if (it_sc->vc)
+            if (it_plfd->revents & POLLERR || it_plfd->revents & POLLNVAL) {
+                int cleanup_errno = MPI_SUCCESS;
+                MPIU_DBG_MSG(NEM_SOCK_DET, VERBOSE, "error polling fd, closing sc");
+                if (it_sc->vc) {
                     MPIDU_FTB_COMMERR(MPIDU_FTB_EV_COMMUNICATION, it_sc->vc);
-                else
+                    cleanup_errno = MPID_nem_tcp_cleanup_on_error(it_sc->vc);
+                    if (cleanup_errno) {
+                        MPIU_ERR_SET(cleanup_errno, MPI_ERR_OTHER, "**tcp_cleanup_fail");
+                        MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+                    }
+                    MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", it_sc->vc->pg_rank);
+                } else {
                     MPIDU_Ftb_publish(MPIDU_FTB_EV_COMMUNICATION, "");
-                MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**comm_fail");
+                    cleanup_errno = cleanup_and_free_sc_plfd(it_sc);
+                    if (cleanup_errno) {
+                        MPIU_ERR_SET(cleanup_errno, MPI_ERR_OTHER, "**tcp_cleanup_fail");
+                        MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+                    }
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**comm_fail");
+                }
+                goto fn_fail;
             }
-            if (it_sc->state.cstate != CONN_STATE_TS_D_QUIESCENT && (it_plfd->revents & POLLNVAL)) {
-                if (it_sc->vc)
-                    MPIDU_FTB_COMMERR(MPIDU_FTB_EV_COMMUNICATION, it_sc->vc);
-                else
-                    MPIDU_Ftb_publish(MPIDU_FTB_EV_COMMUNICATION, "");
-                MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**comm_fail");
-            }
             
             mpi_errno = it_sc->handler(it_plfd, it_sc);
             if (mpi_errno) MPIU_ERR_POP (mpi_errno);
@@ -1796,7 +1815,7 @@
 
             MPID_nem_tcp_set_sockopts(connfd); /* (N2) */
             mpi_errno = find_free_entry(&index);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP (mpi_errno);        
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP (mpi_errno);
             sc = &g_sc_tbl[index];
             plfd = &MPID_nem_tcp_plfd_tbl[index];
             
@@ -1819,3 +1838,54 @@
     goto fn_exit;
 }
 
+/* This is called whenever a vc has been closed as a result of an
+   error.  The VC is put into an error state and cannot be opened
+   again. */
+#undef FUNCNAME
+#define FUNCNAME error_closed
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int error_closed(struct MPIDI_VC *const vc)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPID_nem_tcp_vc_area *vc_tcp = VC_TCP(vc);
+    MPIDI_STATE_DECL(MPID_STATE_ERROR_CLOSED);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_ERROR_CLOSED);
+
+    vc_tcp->state = MPID_NEM_TCP_VC_STATE_ERROR;
+    /* complete pending send/recv requests with error ??? */
+
+ fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_ERROR_CLOSED);
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
+/* This is called when a communication error has occurred on a VC to
+   close the VC and release associated resources. */
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_tcp_cleanup_on_error
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_nem_tcp_cleanup_on_error(MPIDI_VC_t *const vc)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int mpi_errno2 = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_TCP_CLEANUP_ON_ERROR);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_TCP_CLEANUP_ON_ERROR);
+
+    mpi_errno = MPID_nem_tcp_cleanup(vc);
+    /* not jumping on error, keep going */
+    
+    mpi_errno2 = error_closed(vc);
+    if (mpi_errno2) MPIU_ERR_ADD(mpi_errno, mpi_errno2);
+
+ fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_TCP_CLEANUP_ON_ERROR);
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/socksm.h
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/socksm.h	2010-05-14 20:39:19 UTC (rev 6647)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/socksm.h	2010-05-14 20:45:02 UTC (rev 6648)
@@ -64,8 +64,7 @@
     M_(CONN_STATE_TA_C_CNTD),                   \
     M_(CONN_STATE_TA_C_RANKRCVD),               \
     M_(CONN_STATE_TA_C_TMPVCRCVD),              \
-    M_(CONN_STATE_TS_COMMRDY),                  \
-    M_(CONN_STATE_TS_D_QUIESCENT)
+    M_(CONN_STATE_TS_COMMRDY)
 
 /* REQ - Request, RSP - Response */
 

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_impl.h
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_impl.h	2010-05-14 20:39:19 UTC (rev 6647)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_impl.h	2010-05-14 20:45:02 UTC (rev 6648)
@@ -21,9 +21,13 @@
 extern sockconn_t MPID_nem_tcp_g_lstn_sc;
 extern struct pollfd MPID_nem_tcp_g_lstn_plfd;
 
-#define MPID_NEM_TCP_VC_STATE_DISCONNECTED 0
-#define MPID_NEM_TCP_VC_STATE_CONNECTED 1
+typedef enum{MPID_NEM_TCP_VC_STATE_DISCONNECTED,
+             MPID_NEM_TCP_VC_STATE_CONNECTED,
+             MPID_NEM_TCP_VC_STATE_ERROR
+} MPID_nem_tcp_vc_state_t;
 
+#define MPIDI_NEM_TCP_MAX_CONNECT_RETRIES 100
+
 typedef struct MPIDI_nem_tcp_request_queue
 {
     struct MPID_Request *head;
@@ -33,15 +37,17 @@
 /* The vc provides a generic buffer in which network modules can store
    private fields This removes all dependencies from the VC struction
    on the network module, facilitating dynamic module loading. */
-typedef struct 
+typedef struct
 {
     struct sockaddr_in sock_id;
+    MPID_nem_tcp_vc_state_t state;
     struct MPID_nem_new_tcp_sockconn *sc;
     int send_paused;
     MPIDI_nem_tcp_request_queue_t send_queue;
     MPIDI_nem_tcp_request_queue_t paused_send_queue;
     /* this is a count of how many sc objects refer to this vc */
     int sc_ref_count;
+    int connect_retry_count; /* number of times we've tried to connect */
 } MPID_nem_tcp_vc_area;
 
 /* macro for tcp private in VC */
@@ -87,6 +93,7 @@
 int MPID_nem_tcp_is_sock_connected(int fd);
 int MPID_nem_tcp_disconnect(struct MPIDI_VC *const vc);
 int MPID_nem_tcp_cleanup (struct MPIDI_VC *const vc);
+int MPID_nem_tcp_cleanup_on_error(MPIDI_VC_t *const vc);
 int MPID_nem_tcp_state_listening_handler(struct pollfd *const l_plfd, sockconn_t *const l_sc);
 int MPID_nem_tcp_send_queued(MPIDI_VC_t *vc, MPIDI_nem_tcp_request_queue_t *send_queue);
 

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_init.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_init.c	2010-05-14 20:39:19 UTC (rev 6647)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_init.c	2010-05-14 20:45:02 UTC (rev 6648)
@@ -412,7 +412,7 @@
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_TCP_VC_INIT);
 
-    vc_ch->state = MPID_NEM_TCP_VC_STATE_DISCONNECTED;
+    vc_tcp->state = MPID_NEM_TCP_VC_STATE_DISCONNECTED;
     
     vc->sendNoncontig_fn   = MPID_nem_tcp_SendNoncontig;
     vc_ch->iStartContigMsg = MPID_nem_tcp_iStartContigMsg;
@@ -441,6 +441,8 @@
     vc_tcp->paused_send_queue.head = vc_tcp->paused_send_queue.tail = NULL;
 
     vc_tcp->sc_ref_count = 0;
+    
+    vc_tcp->connect_retry_count = 0;
 
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_TCP_VC_INIT);
     return mpi_errno;

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_send.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_send.c	2010-05-14 20:39:19 UTC (rev 6647)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_send.c	2010-05-14 20:45:02 UTC (rev 6648)
@@ -98,8 +98,12 @@
         
         CHECK_EINTR(offset, writev(vc_tcp->sc->fd, iov, sreq->dev.iov_count));
         if (offset == 0) {
+            int cleanup_errno = MPI_SUCCESS;
             MPIDU_FTB_COMMERR(MPIDU_FTB_EV_COMMUNICATION, vc);
-            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
+            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
+            cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
+            if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+            goto fn_fail;
         }
         if (offset == -1)
         {
@@ -109,8 +113,12 @@
                 MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "EAGAIN");
                 break;
             } else {
+                int cleanup_errno = MPI_SUCCESS;
                 MPIDU_FTB_COMMERR(MPIDU_FTB_EV_COMMUNICATION, vc);
-                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", strerror (errno));
+                MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", strerror (errno));
+                cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
+                if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+                goto fn_fail;
             }
         }
         MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "write " MPIDI_MSG_SZ_FMT, offset);
@@ -167,10 +175,10 @@
     if (SENDQ_EMPTY(*send_queue))
         UNSET_PLFD(vc_tcp);
     
- fn_exit:
+fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_TCP_SEND_QUEUED);
     return mpi_errno;
- fn_fail:
+fn_fail:
     goto fn_exit;
 }
 
@@ -224,7 +232,7 @@
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPID_nem_tcp_iStartContigMsg(MPIDI_VC_t *vc, void *hdr, MPIDI_msg_sz_t hdr_sz, void *data, MPIDI_msg_sz_t data_sz,
-                                    MPID_Request **sreq_ptr)
+                                 MPID_Request **sreq_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Request * sreq = NULL;
@@ -254,16 +262,24 @@
                 
                 CHECK_EINTR(offset, writev(sc->fd, iov, 2));
                 if (offset == 0) {
+                    int cleanup_errno = MPI_SUCCESS;
                     MPIDU_FTB_COMMERR(MPIDU_FTB_EV_COMMUNICATION, vc);
-                    MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
+                    cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
+                    if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+                    goto fn_fail;
                 }
                 if (offset == -1)
                 {
                     if (errno == EAGAIN)
                         offset = 0;
                     else {
+                        int cleanup_errno = MPI_SUCCESS;
                         MPIDU_FTB_COMMERR(MPIDU_FTB_EV_COMMUNICATION, vc);
-                        MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", strerror (errno));
+                        MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", strerror (errno));
+                        cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
+                        if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+                        goto fn_fail;
                     }
                 }
                 MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "write " MPIDI_MSG_SZ_FMT, offset);
@@ -278,6 +294,8 @@
         }
         else
         {
+            /* state may be DISCONNECTED or ERROR.  Calling tcp_connect in an ERROR state will return an
+               appropriate error code. */
             mpi_errno = MPID_nem_tcp_connect(vc);
             if (mpi_errno) MPIU_ERR_POP(mpi_errno);
         }
@@ -340,10 +358,10 @@
     
     *sreq_ptr = sreq;
     
- fn_exit:
+fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_TCP_ISTARTCONTIGMSG);
     return mpi_errno;
- fn_fail:
+fn_fail:
     goto fn_exit;
 }
 
@@ -382,16 +400,24 @@
                 
             CHECK_EINTR(offset, writev(sc->fd, iov, 2));
             if (offset == 0) {
+                int cleanup_errno = MPI_SUCCESS;
                 MPIDU_FTB_COMMERR(MPIDU_FTB_EV_COMMUNICATION, vc);
-                MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
+                MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
+                cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
+                if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+                goto fn_fail;
             }
             if (offset == -1)
             {
                 if (errno == EAGAIN)
                     offset = 0;
                 else {
+                    int cleanup_errno = MPI_SUCCESS;
                     MPIDU_FTB_COMMERR(MPIDU_FTB_EV_COMMUNICATION, vc);
-                    MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", strerror (errno));
+                    MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", strerror (errno));
+                    cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc); /* cleanup failure is added to mpi_errno below */
+                    if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+                    goto fn_fail;
                 }
             }
             MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "write " MPIDI_MSG_SZ_FMT, offset);
@@ -406,6 +432,8 @@
     }
     else
     {
+        /* state may be DISCONNECTED or ERROR.  Calling tcp_connect in an ERROR state will return an
+           appropriate error code. */
         mpi_errno = MPID_nem_tcp_connect(vc);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
     }
@@ -475,7 +503,7 @@
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPID_nem_tcp_iSendContig(MPIDI_VC_t *vc, MPID_Request *sreq, void *hdr, MPIDI_msg_sz_t hdr_sz,
-                                void *data, MPIDI_msg_sz_t data_sz)
+                             void *data, MPIDI_msg_sz_t data_sz)
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_msg_sz_t offset = 0;
@@ -505,16 +533,24 @@
                 
                 CHECK_EINTR(offset, writev(sc->fd, iov, 2));
                 if (offset == 0) {
+                    int cleanup_errno = MPI_SUCCESS;
                     MPIDU_FTB_COMMERR(MPIDU_FTB_EV_COMMUNICATION, vc);
-                    MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
+                    cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
+                    if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+                    goto fn_fail;
                 }
                 if (offset == -1)
                 {
                     if (errno == EAGAIN)
                         offset = 0;
                     else {
+                        int cleanup_errno = MPI_SUCCESS;
                         MPIDU_FTB_COMMERR(MPIDU_FTB_EV_COMMUNICATION, vc);
-                        MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", strerror (errno));
+                        MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", strerror (errno));
+                        cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
+                        if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+                        goto fn_fail;
                     }
                 }
                 MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "write " MPIDI_MSG_SZ_FMT, offset);
@@ -553,6 +589,8 @@
         }
         else
         {
+            /* state may be DISCONNECTED or ERROR.  Calling tcp_connect in an ERROR state will return an
+               appropriate error code. */
             mpi_errno = MPID_nem_tcp_connect(vc);
             if (mpi_errno) MPIU_ERR_POP(mpi_errno);
         }
@@ -581,7 +619,7 @@
         sreq->dev.iov_count = 1;
     }
 
- enqueue_request:
+enqueue_request:
     /* enqueue request */
     MPIU_DBG_MSG (CH3_CHANNEL, VERBOSE, "enqueuing");
     MPIU_Assert(sreq->dev.iov_count >= 1 && sreq->dev.iov[0].MPID_IOV_LEN > 0);
@@ -608,10 +646,10 @@
         }
     }
     
- fn_exit:
+fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_TCP_ISENDCONTIGMSG);
     return mpi_errno;
- fn_fail:
+fn_fail:
     goto fn_exit;
 }
 
@@ -651,16 +689,24 @@
             {
                 CHECK_EINTR(offset, writev(vc_tcp->sc->fd, iov, iov_n));
                 if (offset == 0) {
+                    int cleanup_errno = MPI_SUCCESS;
                     MPIDU_FTB_COMMERR(MPIDU_FTB_EV_COMMUNICATION, vc);
-                    MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
+                    MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**sock_closed");
+                    cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
+                    if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+                    goto fn_fail;
                 }
                 if (offset == -1)
                 {
                     if (errno == EAGAIN)
                         offset = 0;
                     else {
+                        int cleanup_errno = MPI_SUCCESS;
                         MPIDU_FTB_COMMERR(MPIDU_FTB_EV_COMMUNICATION, vc);
-                        MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", strerror (errno));
+                        MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**writev", "**writev %s", strerror (errno));
+                        cleanup_errno = MPID_nem_tcp_cleanup_on_error(vc);
+                        if (cleanup_errno) MPIU_ERR_ADD(mpi_errno, cleanup_errno);
+                        goto fn_fail;
                     }
                 }
                 
@@ -669,6 +715,8 @@
         }
         else
         {
+            /* state may be DISCONNECTED or ERROR.  Calling tcp_connect in an ERROR state will return an
+               appropriate error code. */
             mpi_errno = MPID_nem_tcp_connect(vc);
             if (mpi_errno) MPIU_ERR_POP(mpi_errno);
         }
@@ -749,9 +797,9 @@
         }
     }
     
- fn_exit:
+fn_exit:
     return mpi_errno;
- fn_fail:
+fn_fail:
     MPIU_Object_set_ref(sreq, 0);
     MPIDI_CH3_Request_destroy(sreq);
     goto fn_exit;

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_utility.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_utility.c	2010-05-14 20:39:19 UTC (rev 6647)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/nemesis/netmod/tcp/tcp_utility.c	2010-05-14 20:45:02 UTC (rev 6648)
@@ -209,10 +209,9 @@
 {
     int i;
     MPID_Request *sreq;
-    MPIDI_CH3I_VC *vc_ch = (MPIDI_CH3I_VC *)vc->channel_private;
     MPID_nem_tcp_vc_area *vc_tcp = VC_TCP(vc);
 
-    fprintf(stream, "..   sc=%p fd=%d vc_ch->state=%d\n", vc_tcp->sc, (vc_tcp->sc ? vc_tcp->sc->fd : -1), vc_ch->state);
+    fprintf(stream, "..   sc=%p fd=%d vc_tcp->state=%d\n", vc_tcp->sc, (vc_tcp->sc ? vc_tcp->sc->fd : -1), vc_tcp->state);
 
     /* This function violates any abstraction in the queues, since there's no
        good way to print them without inspecting the internals. */

Modified: mpich2/trunk/src/mpid/ch3/channels/nemesis/src/ch3_init.c
===================================================================
--- mpich2/trunk/src/mpid/ch3/channels/nemesis/src/ch3_init.c	2010-05-14 20:39:19 UTC (rev 6647)
+++ mpich2/trunk/src/mpid/ch3/channels/nemesis/src/ch3_init.c	2010-05-14 20:45:02 UTC (rev 6648)
@@ -213,27 +213,8 @@
 #ifdef USE_DBG_LOGGING
 const char * MPIDI_CH3_VC_GetStateString( struct MPIDI_VC *vc )
 {
-    const char *name = "unknown";
-    static char asdigits[20];
-    MPIDI_CH3I_VC *vcch = (MPIDI_CH3I_VC *)vc->channel_private;
-    int    state = vcch->state;
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_VC_GETSTATESTRING);
-
-    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_VC_GETSTATESTRING);
-
-    switch (state) {
-    case MPIDI_CH3I_VC_STATE_UNCONNECTED: name = "CH3I_VC_STATE_UNCONNECTED"; break;
-    case MPIDI_CH3I_VC_STATE_CONNECTING:  name = "CH3I_VC_STATE_CONNECTING"; break;
-    case MPIDI_CH3I_VC_STATE_CONNECTED:   name = "CH3I_VC_STATE_CONNECTED"; break;
-    case MPIDI_CH3I_VC_STATE_FAILED:      name = "CH3I_VC_STATE_FAILED"; break;
-    default:
-	MPIU_Snprintf( asdigits, sizeof(asdigits), "%d", state );
-	asdigits[20-1] = 0;
-	name = (const char *)asdigits;
-    }
-
-    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_VC_GETSTATESTRING);
-    return name;
+    /* Nemesis doesn't have connection state associated with the VC */
+    return "N/A";
 }
 #endif
 #endif



More information about the mpich2-commits mailing list