[mpich2-commits] r4218 - mpich2/trunk/test/mpi/errors/faults

gropp at mcs.anl.gov gropp at mcs.anl.gov
Sat Mar 28 16:34:49 CDT 2009


Author: gropp
Date: 2009-03-28 16:34:49 -0500 (Sat, 28 Mar 2009)
New Revision: 4218

Added:
   mpich2/trunk/test/mpi/errors/faults/README
   mpich2/trunk/test/mpi/errors/faults/collf1.c
   mpich2/trunk/test/mpi/errors/faults/collf2.c
   mpich2/trunk/test/mpi/errors/faults/pt2ptf2.c
Modified:
   mpich2/trunk/test/mpi/errors/faults/Makefile.sm
   mpich2/trunk/test/mpi/errors/faults/pt2ptf1.c
   mpich2/trunk/test/mpi/errors/faults/testlist
Log:
Added additional tests for fault handling

Modified: mpich2/trunk/test/mpi/errors/faults/Makefile.sm
===================================================================
--- mpich2/trunk/test/mpi/errors/faults/Makefile.sm	2009-03-28 21:34:30 UTC (rev 4217)
+++ mpich2/trunk/test/mpi/errors/faults/Makefile.sm	2009-03-28 21:34:49 UTC (rev 4218)
@@ -4,6 +4,9 @@
 smvar_do_sharedlibs = 0
 
 pt2ptf1_SOURCES = pt2ptf1.c
+pt2ptf2_SOURCES = pt2ptf2.c
+collf1_SOURCES  = collf1.c
+collf2_SOURCES  = collf2.c
 
 ../../util/mtest.o: 
 	(cd ../../util && make mtest.o)

Added: mpich2/trunk/test/mpi/errors/faults/README
===================================================================
--- mpich2/trunk/test/mpi/errors/faults/README	                        (rev 0)
+++ mpich2/trunk/test/mpi/errors/faults/README	2009-03-28 21:34:49 UTC (rev 4218)
@@ -0,0 +1,7 @@
+This directory contains tests for the ability of MPI to survive faults.
+The MPI standard permits but does not require that an MPI
+implementation continue through a fault.  If the MPI implementation
+claims to be fault tolerant, these test programs provide some
+basic tests.  To enable these, configure the tests with the configure
+option --enable-checkfaults.
+

Added: mpich2/trunk/test/mpi/errors/faults/collf1.c
===================================================================
--- mpich2/trunk/test/mpi/errors/faults/collf1.c	                        (rev 0)
+++ mpich2/trunk/test/mpi/errors/faults/collf1.c	2009-03-28 21:34:49 UTC (rev 4218)
@@ -0,0 +1,71 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *
+ *  (C) 2009 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpitest.h"
+
+static char MTEST_Descrip[] = "Test survivability from faults with collective communication";
+
+/*
+ * Split MPI_COMM_WORLD into a surviving group (color 0) and a group that
+ * simulates a fault by exiting (color 1), then check that the survivors
+ * can still run a collective on their own communicator.
+ */
+int main( int argc, char *argv[] )
+{
+    int wrank, wsize, rank, size, color;
+    int tmp, expected;
+    int errs = 0, toterrs = 0;
+    MPI_Comm newcomm;
+
+    MPI_Init( &argc, &argv );
+
+    MPI_Comm_size( MPI_COMM_WORLD, &wsize );
+    MPI_Comm_rank( MPI_COMM_WORLD, &wrank );
+
+    /* Color is 0 or 1; 1 will be the processes that "fault" */
+    /* process 0 and wsize/2+1...wsize-1 are in non-faulting group */
+    color = (wrank > 0) && (wrank <= wsize/2);
+    MPI_Comm_split( MPI_COMM_WORLD, color, wrank, &newcomm );
+
+    MPI_Barrier( MPI_COMM_WORLD );
+    if (color) {
+        /* Simulate a fault on some processes */
+        exit(1);
+    }
+
+    /* Can we still use newcomm? */
+    MPI_Comm_size( newcomm, &size );
+    MPI_Comm_rank( newcomm, &rank );
+
+    /* Ranks in newcomm are 0..size-1, so their sum is size*(size-1)/2 */
+    expected = (size*(size-1)) / 2;
+    MPI_Allreduce( &rank, &tmp, 1, MPI_INT, MPI_SUM, newcomm );
+    if (tmp != expected) {
+        printf( "Allreduce gave %d but expected %d\n", tmp, expected );
+        errs++;
+    }
+
+    /* Combine the error counts so that a single process reports the result */
+    MPI_Allreduce( &errs, &toterrs, 1, MPI_INT, MPI_SUM, newcomm );
+
+    MPI_Comm_free( &newcomm );
+    MPI_Finalize();
+
+    /* World rank 0 always has color 0, so it is always a survivor */
+    if (wrank == 0) {
+        if (toterrs > 0) {
+            printf( " Found %d errors\n", toterrs );
+        }
+        else {
+            printf( " No Errors\n" );
+        }
+    }
+
+    return 0;
+}

Added: mpich2/trunk/test/mpi/errors/faults/collf2.c
===================================================================
--- mpich2/trunk/test/mpi/errors/faults/collf2.c	                        (rev 0)
+++ mpich2/trunk/test/mpi/errors/faults/collf2.c	2009-03-28 21:34:49 UTC (rev 4218)
@@ -0,0 +1,103 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *
+ *  (C) 2009 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpitest.h"
+
+static char MTEST_Descrip[] = "Test error reporting from faults with collective communication";
+
+int ReportErr( int errcode, const char name[] );
+
+/*
+ * Split MPI_COMM_WORLD into a surviving group (color 0) and a group that
+ * simulates a fault by exiting (color 1).  MPI_ERRORS_RETURN is set on both
+ * communicators so that any failure seen by the survivors is returned to
+ * the caller and reported through ReportErr.
+ */
+int main( int argc, char *argv[] )
+{
+    int wrank, wsize, rank, size, color;
+    int tmp = 0, expected;
+    int err, errs = 0, toterrs = 0;
+    MPI_Comm newcomm;
+
+    MPI_Init( &argc, &argv );
+
+    MPI_Comm_size( MPI_COMM_WORLD, &wsize );
+    MPI_Comm_rank( MPI_COMM_WORLD, &wrank );
+
+    /* Color is 0 or 1; 1 will be the processes that "fault" */
+    /* process 0 and wsize/2+1...wsize-1 are in non-faulting group */
+    color = (wrank > 0) && (wrank <= wsize/2);
+    MPI_Comm_split( MPI_COMM_WORLD, color, wrank, &newcomm );
+
+    MPI_Comm_size( newcomm, &size );
+    MPI_Comm_rank( newcomm, &rank );
+
+    /* Set errors return on COMM_WORLD and the new comm */
+    MPI_Comm_set_errhandler( MPI_COMM_WORLD, MPI_ERRORS_RETURN );
+    MPI_Comm_set_errhandler( newcomm, MPI_ERRORS_RETURN );
+
+    err = MPI_Barrier( MPI_COMM_WORLD );
+    if (err) errs += ReportErr( err, "Barrier" );
+    if (color) {
+        /* Simulate a fault on some processes */
+        exit(1);
+    }
+
+    /* Can we still use newcomm? */
+    /* Ranks in newcomm are 0..size-1, so their sum is size*(size-1)/2 */
+    expected = (size*(size-1)) / 2;
+    err = MPI_Allreduce( &rank, &tmp, 1, MPI_INT, MPI_SUM, newcomm );
+    if (err) {
+        /* tmp is not meaningful if the reduction itself failed */
+        errs += ReportErr( err, "Allreduce" );
+    }
+    else if (tmp != expected) {
+        printf( "Allreduce gave %d but expected %d\n", tmp, expected );
+        errs++;
+    }
+
+    /* Combine the error counts; fall back to the local count on failure */
+    err = MPI_Allreduce( &errs, &toterrs, 1, MPI_INT, MPI_SUM, newcomm );
+    if (err) {
+        errs += ReportErr( err, "Allreduce" );
+        toterrs = errs;
+    }
+
+    MPI_Comm_free( &newcomm );
+    MPI_Finalize();
+
+    /* World rank 0 always has color 0, so it is always a survivor */
+    if (wrank == 0) {
+        if (toterrs > 0) {
+            printf( " Found %d errors\n", toterrs );
+        }
+        else {
+            printf( " No Errors\n" );
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Print the error code, class and message for a failed MPI call to stderr.
+ * name identifies the call that failed.  Always returns 1 so that callers
+ * can add the result to an error count.
+ */
+int ReportErr( int errcode, const char name[] )
+{
+    int errclass, errlen;
+    char errmsg[MPI_MAX_ERROR_STRING];
+    MPI_Error_class( errcode, &errclass );
+    MPI_Error_string( errcode, errmsg, &errlen );
+    fprintf( stderr, "In %s, error code %d(class %d) = %s\n",
+             name, errcode, errclass, errmsg );
+    return 1;
+}

Modified: mpich2/trunk/test/mpi/errors/faults/pt2ptf1.c
===================================================================
--- mpich2/trunk/test/mpi/errors/faults/pt2ptf1.c	2009-03-28 21:34:30 UTC (rev 4217)
+++ mpich2/trunk/test/mpi/errors/faults/pt2ptf1.c	2009-03-28 21:34:49 UTC (rev 4218)
@@ -6,10 +6,10 @@
  */
 #include "mpi.h"
 #include <stdio.h>
+#include <stdlib.h>
 #include "mpitest.h"
 
-static char MTEST_Descrip[] = "Test err in status return, using truncated \
-messages for MPI_Testall";
+static char MTEST_Descrip[] = "Test survivability from faults with point to point communication";
 
 int main( int argc, char *argv[] )
 {

Added: mpich2/trunk/test/mpi/errors/faults/pt2ptf2.c
===================================================================
--- mpich2/trunk/test/mpi/errors/faults/pt2ptf2.c	                        (rev 0)
+++ mpich2/trunk/test/mpi/errors/faults/pt2ptf2.c	2009-03-28 21:34:49 UTC (rev 4218)
@@ -0,0 +1,119 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *
+ *  (C) 2009 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include "mpitest.h"
+
+static char MTEST_Descrip[] = "Test error reporting from faults with point to point communication";
+
+int ReportErr( int errcode, const char name[] );
+
+/*
+ * Split MPI_COMM_WORLD into a surviving group (color 0) and a group that
+ * simulates a fault by exiting (color 1).  The survivors exchange messages
+ * among themselves, which should succeed, and then send to the exited
+ * processes, which should return an error since MPI_ERRORS_RETURN is set.
+ */
+int main( int argc, char *argv[] )
+{
+    int wrank, wsize, rank, size, color;
+    int j, tmp;
+    int err, toterrs = 0, errs = 0;
+    MPI_Comm newcomm;
+
+    MPI_Init( &argc, &argv );
+
+    MPI_Comm_size( MPI_COMM_WORLD, &wsize );
+    MPI_Comm_rank( MPI_COMM_WORLD, &wrank );
+
+    /* Color is 0 or 1; 1 will be the processes that "fault" */
+    /* process 0 and wsize/2+1...wsize-1 are in non-faulting group */
+    color = (wrank > 0) && (wrank <= wsize/2);
+    MPI_Comm_split( MPI_COMM_WORLD, color, wrank, &newcomm );
+
+    MPI_Comm_size( newcomm, &size );
+    MPI_Comm_rank( newcomm, &rank );
+
+    /* Set errors return on COMM_WORLD and the new comm */
+    MPI_Comm_set_errhandler( MPI_COMM_WORLD, MPI_ERRORS_RETURN );
+    MPI_Comm_set_errhandler( newcomm, MPI_ERRORS_RETURN );
+
+    err = MPI_Barrier( MPI_COMM_WORLD );
+    if (err) errs += ReportErr( err, "Barrier" );
+    if (color) {
+        /* Simulate a fault on some processes */
+        exit(1);
+    }
+    else {
+        /* To improve the chance that the "faulted" processes will have
+           exited, wait for 1 second */
+        sleep( 1 );
+    }
+
+    /* Can we still use newcomm? */
+    for (j=0; j<rank; j++) {
+        err = MPI_Recv( &tmp, 1, MPI_INT, j, 0, newcomm, MPI_STATUS_IGNORE );
+        if (err) errs += ReportErr( err, "Recv" );
+    }
+    for (j=rank+1; j<size; j++) {
+        err = MPI_Send( &rank, 1, MPI_INT, j, 0, newcomm );
+        if (err) errs += ReportErr( err, "Send" );
+    }
+
+    /* Now, try sending in MPI_COMM_WORLD on dead processes */
+    /* There is a race condition here - we don't know for sure that the faulted
+       processes have exited.  However, we can ensure a failure by using
+       synchronous sends - the sender will wait until the receiver
+       receives the message, which will not happen (the process will exit
+       without matching the message, even if it has not yet exited). */
+    for (j=1; j<=wsize/2; j++) {
+        err = MPI_Ssend( &rank, 1, MPI_INT, j, 0, MPI_COMM_WORLD );
+        if (!err) {
+            errs++;
+            fprintf( stderr, "Ssend succeeded to dead process %d\n", j );
+        }
+    }
+
+    /* Combine the error counts; fall back to the local count on failure */
+    err = MPI_Allreduce( &errs, &toterrs, 1, MPI_INT, MPI_SUM, newcomm );
+    if (err) {
+        errs += ReportErr( err, "Allreduce" );
+        toterrs = errs;
+    }
+    MPI_Comm_free( &newcomm );
+
+    MPI_Finalize();
+
+    if (wrank == 0) {
+        if (toterrs > 0) {
+            printf( " Found %d errors\n", toterrs );
+        }
+        else {
+            printf( " No Errors\n" );
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Print the error code, class and message for a failed MPI call to stderr.
+ * name identifies the call that failed.  Always returns 1 so that callers
+ * can add the result to an error count.
+ */
+int ReportErr( int errcode, const char name[] )
+{
+    int errclass, errlen;
+    char errmsg[MPI_MAX_ERROR_STRING];
+    MPI_Error_class( errcode, &errclass );
+    MPI_Error_string( errcode, errmsg, &errlen );
+    fprintf( stderr, "In %s, error code %d(class %d) = %s\n",
+             name, errcode, errclass, errmsg );
+    return 1;
+}

Modified: mpich2/trunk/test/mpi/errors/faults/testlist
===================================================================
--- mpich2/trunk/test/mpi/errors/faults/testlist	2009-03-28 21:34:30 UTC (rev 4217)
+++ mpich2/trunk/test/mpi/errors/faults/testlist	2009-03-28 21:34:49 UTC (rev 4218)
@@ -1 +1,4 @@
-pt2ptf1 4
\ No newline at end of file
+pt2ptf1 4 env=MPIEXEC_ALLOW_FAULT=YES
+collf1 4 env=MPIEXEC_ALLOW_FAULT=YES
+pt2ptf2 4 env=MPIEXEC_ALLOW_FAULT=YES
+collf2 4 env=MPIEXEC_ALLOW_FAULT=YES
\ No newline at end of file



More information about the mpich2-commits mailing list