[mpich2-commits] r7724 - mpich2/trunk/src/pm/smpd
jayesh at mcs.anl.gov
jayesh at mcs.anl.gov
Thu Jan 13 16:43:00 CST 2011
Author: jayesh
Date: 2011-01-13 16:43:00 -0600 (Thu, 13 Jan 2011)
New Revision: 7724
Modified:
mpich2/trunk/src/pm/smpd/smpd_handle_command.c
mpich2/trunk/src/pm/smpd/smpd_launch_process.c
Log:
Retry suspending a thread if an attempt fails; however, go ahead with terminating the process even if we ultimately fail to suspend the thread. Also, return failure only if both the safe terminate AND the explicit terminate of a process fail. This fixes ticket #1125.
Modified: mpich2/trunk/src/pm/smpd/smpd_handle_command.c
===================================================================
--- mpich2/trunk/src/pm/smpd/smpd_handle_command.c 2011-01-13 22:17:24 UTC (rev 7723)
+++ mpich2/trunk/src/pm/smpd/smpd_handle_command.c 2011-01-13 22:43:00 UTC (rev 7724)
@@ -5617,7 +5617,7 @@
result = smpd_kill_process(pmi_context->process, exit_code);
if (result != SMPD_SUCCESS){
- smpd_err_printf("unable to kill process.\n");
+ smpd_err_printf("unable to kill process. result = %d\n", result);
pmi_context->state = SMPD_CLOSING;
if(pmi_context->process->in){
smpd_dbg_printf("Closing stdin ...\n");
Modified: mpich2/trunk/src/pm/smpd/smpd_launch_process.c
===================================================================
--- mpich2/trunk/src/pm/smpd/smpd_launch_process.c 2011-01-13 22:17:24 UTC (rev 7723)
+++ mpich2/trunk/src/pm/smpd/smpd_launch_process.c 2011-01-13 22:43:00 UTC (rev 7724)
@@ -2196,23 +2196,52 @@
#endif
}
+#define SMPD_MAX_SUSPEND_RETRY_COUNT 4
+
#undef FCNAME
#define FCNAME "smpd_suspend_process"
int smpd_suspend_process(smpd_process_t *process)
{
#ifdef HAVE_WINDOWS_H
int result = SMPD_SUCCESS;
+ int retry_cnt = 0;
smpd_enter_fn(FCNAME);
- if (SuspendThread(process->wait.hThread) == -1)
- {
- result = GetLastError();
- smpd_err_printf("SuspendThread failed with error %d for process %d:%s:'%s'\n",
- result, process->rank, process->kvs_name, process->exe);
- }
+ do{
+ if (SuspendThread(process->wait.hThread) == -1){
+ int exit_code;
+ /* Check if the thread is still active */
+ if(!GetExitCodeThread(process->wait.hThread, &exit_code)){
+ smpd_err_printf("Getting exit code for thread failed\n");
+ break;
+ }
+ else{
+ if(exit_code != STILL_ACTIVE){
+ smpd_err_printf("The thread to be suspended is no longer active, exit_code = %d\n", exit_code);
+ break;
+ }
+ else{
+ smpd_err_printf("The thread is active but cannot be suspended\n");
+ }
+ }
+
+ result = GetLastError();
+ smpd_err_printf("SuspendThread failed[%d times] with error %d for process %d:%s:'%s'\n",
+ retry_cnt, result, process->rank, process->kvs_name, process->exe);
+ }
+ else{
+ break;
+ }
+
+ /* Ignore error and proceed if we fail to suspend */
+ result = SMPD_SUCCESS;
+ retry_cnt++;
+ }while(retry_cnt < SMPD_MAX_SUSPEND_RETRY_COUNT);
+
smpd_exit_fn(FCNAME);
- return result;
+ /* Ignore error */
+ return SMPD_SUCCESS;
#else
smpd_enter_fn(FCNAME);
@@ -2295,19 +2324,23 @@
#define FCNAME "smpd_kill_process"
int smpd_kill_process(smpd_process_t *process, int exit_code)
{
+ int result = SMPD_SUCCESS;
#ifdef HAVE_WINDOWS_H
smpd_enter_fn(FCNAME);
smpd_process_from_registry(process);
if (!SafeTerminateProcess(process->wait.hProcess, exit_code)){
+ smpd_err_printf("unable terminate process safely. exit_code = %d\n", exit_code);
if (GetLastError() != ERROR_PROCESS_ABORTED){
- TerminateProcess(process->wait.hProcess, exit_code);
+ if(!TerminateProcess(process->wait.hProcess, exit_code)){
+ if (GetLastError() != ERROR_PROCESS_ABORTED){
+ result = SMPD_FAIL;
+ }
+ }
}
- smpd_exit_fn(FCNAME);
- return SMPD_FAIL;
}
smpd_exit_fn(FCNAME);
- return SMPD_SUCCESS;
+ return result;
#else
int status;
smpd_enter_fn(FCNAME);
More information about the mpich2-commits mailing list