[mpich2-commits] r7724 - mpich2/trunk/src/pm/smpd

jayesh at mcs.anl.gov jayesh at mcs.anl.gov
Thu Jan 13 16:43:00 CST 2011


Author: jayesh
Date: 2011-01-13 16:43:00 -0600 (Thu, 13 Jan 2011)
New Revision: 7724

Modified:
   mpich2/trunk/src/pm/smpd/smpd_handle_command.c
   mpich2/trunk/src/pm/smpd/smpd_launch_process.c
Log:
Retry suspending a thread if the attempt to suspend it fails; however, go ahead with terminating the process even if the thread ultimately cannot be suspended. Also, return failure only if both the safe terminate AND the explicit terminate of a process fail. This fixes ticket #1125.

Modified: mpich2/trunk/src/pm/smpd/smpd_handle_command.c
===================================================================
--- mpich2/trunk/src/pm/smpd/smpd_handle_command.c	2011-01-13 22:17:24 UTC (rev 7723)
+++ mpich2/trunk/src/pm/smpd/smpd_handle_command.c	2011-01-13 22:43:00 UTC (rev 7724)
@@ -5617,7 +5617,7 @@
     
     result = smpd_kill_process(pmi_context->process, exit_code);
     if (result != SMPD_SUCCESS){
-	    smpd_err_printf("unable to kill process.\n");
+	    smpd_err_printf("unable to kill process. result = %d\n", result);
         pmi_context->state = SMPD_CLOSING;
         if(pmi_context->process->in){
             smpd_dbg_printf("Closing stdin ...\n");

Modified: mpich2/trunk/src/pm/smpd/smpd_launch_process.c
===================================================================
--- mpich2/trunk/src/pm/smpd/smpd_launch_process.c	2011-01-13 22:17:24 UTC (rev 7723)
+++ mpich2/trunk/src/pm/smpd/smpd_launch_process.c	2011-01-13 22:43:00 UTC (rev 7724)
@@ -2196,23 +2196,52 @@
 #endif
 }
 
+#define SMPD_MAX_SUSPEND_RETRY_COUNT 4
+
 #undef FCNAME
 #define FCNAME "smpd_suspend_process"
 int smpd_suspend_process(smpd_process_t *process)
 {
 #ifdef HAVE_WINDOWS_H
     int result = SMPD_SUCCESS;
+    int retry_cnt = 0;
     smpd_enter_fn(FCNAME);
 
-    if (SuspendThread(process->wait.hThread) == -1)
-    {
-	result = GetLastError();
-	smpd_err_printf("SuspendThread failed with error %d for process %d:%s:'%s'\n",
-	    result, process->rank, process->kvs_name, process->exe);
-    }
+    do{
+        if (SuspendThread(process->wait.hThread) == -1){
+            int exit_code;
 
+            /* Check if the thread is still active */
+            if(!GetExitCodeThread(process->wait.hThread, &exit_code)){
+                smpd_err_printf("Getting exit code for thread failed\n");
+                break;
+            }
+            else{
+                if(exit_code != STILL_ACTIVE){
+                    smpd_err_printf("The thread to be suspended is no longer active, exit_code = %d\n", exit_code);
+                    break;
+                }
+                else{
+                    smpd_err_printf("The thread is active but cannot be suspended\n");
+                }
+            }
+
+	        result = GetLastError();
+	        smpd_err_printf("SuspendThread failed[%d times] with error %d for process %d:%s:'%s'\n",
+	            retry_cnt, result, process->rank, process->kvs_name, process->exe);
+        }
+        else{
+            break;
+        }
+
+        /* Ignore error and proceed if we fail to suspend */
+        result = SMPD_SUCCESS;
+        retry_cnt++;
+    }while(retry_cnt < SMPD_MAX_SUSPEND_RETRY_COUNT);
+
     smpd_exit_fn(FCNAME);
-    return result;
+    /* Ignore error */
+    return SMPD_SUCCESS;
 #else
     smpd_enter_fn(FCNAME);
 
@@ -2295,19 +2324,23 @@
 #define FCNAME "smpd_kill_process"
 int smpd_kill_process(smpd_process_t *process, int exit_code)
 {
+    int result = SMPD_SUCCESS;
 #ifdef HAVE_WINDOWS_H
     smpd_enter_fn(FCNAME);
 
     smpd_process_from_registry(process);
     if (!SafeTerminateProcess(process->wait.hProcess, exit_code)){
+        smpd_err_printf("unable terminate process safely. exit_code = %d\n", exit_code);
 	    if (GetLastError() != ERROR_PROCESS_ABORTED){
-	        TerminateProcess(process->wait.hProcess, exit_code);
+            if(!TerminateProcess(process->wait.hProcess, exit_code)){
+                if (GetLastError() != ERROR_PROCESS_ABORTED){
+                    result = SMPD_FAIL;
+                }
+            }
 	    }
-        smpd_exit_fn(FCNAME);
-        return SMPD_FAIL;
     }
     smpd_exit_fn(FCNAME);
-    return SMPD_SUCCESS;
+    return result;
 #else
     int status;
     smpd_enter_fn(FCNAME);



More information about the mpich2-commits mailing list