[mpich2-commits] r4218 - mpich2/trunk/test/mpi/errors/faults
gropp at mcs.anl.gov
gropp at mcs.anl.gov
Sat Mar 28 16:34:49 CDT 2009
Author: gropp
Date: 2009-03-28 16:34:49 -0500 (Sat, 28 Mar 2009)
New Revision: 4218
Added:
mpich2/trunk/test/mpi/errors/faults/README
mpich2/trunk/test/mpi/errors/faults/collf1.c
mpich2/trunk/test/mpi/errors/faults/collf2.c
mpich2/trunk/test/mpi/errors/faults/pt2ptf2.c
Modified:
mpich2/trunk/test/mpi/errors/faults/Makefile.sm
mpich2/trunk/test/mpi/errors/faults/pt2ptf1.c
mpich2/trunk/test/mpi/errors/faults/testlist
Log:
Added additional tests for fault handling
Modified: mpich2/trunk/test/mpi/errors/faults/Makefile.sm
===================================================================
--- mpich2/trunk/test/mpi/errors/faults/Makefile.sm 2009-03-28 21:34:30 UTC (rev 4217)
+++ mpich2/trunk/test/mpi/errors/faults/Makefile.sm 2009-03-28 21:34:49 UTC (rev 4218)
@@ -4,6 +4,9 @@
smvar_do_sharedlibs = 0
pt2ptf1_SOURCES = pt2ptf1.c
+pt2ptf2_SOURCES = pt2ptf2.c
+collf1_SOURCES = collf1.c
+collf2_SOURCES = collf2.c
../../util/mtest.o:
(cd ../../util && make mtest.o)
Added: mpich2/trunk/test/mpi/errors/faults/README
===================================================================
--- mpich2/trunk/test/mpi/errors/faults/README (rev 0)
+++ mpich2/trunk/test/mpi/errors/faults/README 2009-03-28 21:34:49 UTC (rev 4218)
@@ -0,0 +1,7 @@
+This directory contains tests for the ability of MPI to survive faults.
+The MPI standard permits but does not require that an MPI
+implementation continue through a fault. If the MPI implementation
+is (or claims to be) fault tolerant, these test programs provide some
+basic tests. To enable these, configure the tests with the configure
+option --enable-checkfaults .
+
Added: mpich2/trunk/test/mpi/errors/faults/collf1.c
===================================================================
--- mpich2/trunk/test/mpi/errors/faults/collf1.c (rev 0)
+++ mpich2/trunk/test/mpi/errors/faults/collf1.c 2009-03-28 21:34:49 UTC (rev 4218)
@@ -0,0 +1,50 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *
+ * (C) 2009 by Argonne National Laboratory.
+ * See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpitest.h"
+
+static char MTEST_Descrip[] = "Test survivability from faults with collective communication";
+
+/*
+ * Split MPI_COMM_WORLD into a surviving group (color 0) and a group that
+ * calls exit(1) after a barrier to simulate a fault (color 1), then check
+ * that the survivors can still run MPI_Allreduce on their communicator.
+ * World rank 0 prints " No Errors" if the reduction result is correct.
+ * Always returns 0.
+ */
+int main( int argc, char *argv[] )
+{
+    int wrank, wsize, rank, size, color;
+    int tmp, expected;
+    MPI_Comm newcomm;
+
+    MPI_Init( &argc, &argv );
+
+    MPI_Comm_size( MPI_COMM_WORLD, &wsize );
+    MPI_Comm_rank( MPI_COMM_WORLD, &wrank );
+
+    /* Color is 0 or 1; 1 will be the processes that "fault" */
+    color = (wrank > 0) && (wrank <= wsize/2);
+    MPI_Comm_split( MPI_COMM_WORLD, color, wrank, &newcomm );
+
+    MPI_Barrier( MPI_COMM_WORLD );
+    if (color) {
+        /* Simulate a fault on some processes */
+        exit(1);
+    }
+
+    /* Can we still use newcomm? */
+    MPI_Comm_size( newcomm, &size );
+    MPI_Comm_rank( newcomm, &rank );
+
+    /* Ranks in newcomm run from 0 to size-1, so they sum to size*(size-1)/2 */
+    expected = (size*(size-1)) / 2;
+    MPI_Allreduce( &rank, &tmp, 1, MPI_INT, MPI_SUM, newcomm );
+    if (tmp != expected) {
+        printf( "Allreduce gave %d but expected %d\n", tmp, expected );
+    }
+
+    MPI_Comm_free( &newcomm );
+    MPI_Finalize();
+
+    /* Allreduce leaves the same result on every member of newcomm, so world
+       rank 0 (always in the surviving group) can report for all of them.
+       Only claim success when the check above actually passed. */
+    if (wrank == 0 && tmp == expected) {
+        printf( " No Errors\n" );
+    }
+
+    return 0;
+}
Added: mpich2/trunk/test/mpi/errors/faults/collf2.c
===================================================================
--- mpich2/trunk/test/mpi/errors/faults/collf2.c (rev 0)
+++ mpich2/trunk/test/mpi/errors/faults/collf2.c 2009-03-28 21:34:49 UTC (rev 4218)
@@ -0,0 +1,70 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *
+ * (C) 2009 by Argonne National Laboratory.
+ * See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpitest.h"
+
+static char MTEST_Descrip[] = "Test error reporting from faults with collective communication";
+
+int ReportErr( int errcode, const char name[] );
+
+/*
+ * Split MPI_COMM_WORLD into a surviving group (color 0) and a group that
+ * calls exit(1) after a barrier to simulate a fault (color 1).  With
+ * MPI_ERRORS_RETURN installed, the survivors run MPI_Allreduce on their
+ * communicator and report either the returned error code or a wrong result.
+ * World rank 0 prints " No Errors" when it saw no error.  Always returns 0.
+ */
+int main( int argc, char *argv[] )
+{
+    int wrank, wsize, rank, size, color;
+    int tmp, expected;
+    int err, errs = 0;
+    MPI_Comm newcomm;
+
+    MPI_Init( &argc, &argv );
+
+    MPI_Comm_size( MPI_COMM_WORLD, &wsize );
+    MPI_Comm_rank( MPI_COMM_WORLD, &wrank );
+
+    /* Color is 0 or 1; 1 will be the processes that "fault" */
+    /* process 0 and wsize/2+1...wsize-1 are in non-faulting group */
+    color = (wrank > 0) && (wrank <= wsize/2);
+    MPI_Comm_split( MPI_COMM_WORLD, color, wrank, &newcomm );
+
+    MPI_Comm_size( newcomm, &size );
+    MPI_Comm_rank( newcomm, &rank );
+
+    /* Set errors return on COMM_WORLD and the new comm.  The communicator
+       is the first argument and the error handler the second. */
+    MPI_Comm_set_errhandler( MPI_COMM_WORLD, MPI_ERRORS_RETURN );
+    MPI_Comm_set_errhandler( newcomm, MPI_ERRORS_RETURN );
+
+    MPI_Barrier( MPI_COMM_WORLD );
+    if (color) {
+        /* Simulate a fault on some processes */
+        exit(1);
+    }
+
+    /* Can we still use newcomm?  Ranks in newcomm run from 0 to size-1,
+       so they sum to size*(size-1)/2. */
+    expected = (size*(size-1)) / 2;
+    err = MPI_Allreduce( &rank, &tmp, 1, MPI_INT, MPI_SUM, newcomm );
+    if (err) {
+        /* tmp is not meaningful after a failed reduction; report the code */
+        errs += ReportErr( err, "Allreduce" );
+    }
+    else if (tmp != expected) {
+        printf( "Allreduce gave %d but expected %d\n", tmp, expected );
+        errs ++;
+    }
+
+    MPI_Comm_free( &newcomm );
+    MPI_Finalize();
+
+    /* World rank 0 is always in the surviving group; it reports success
+       once, and only if it recorded no error above. */
+    if (wrank == 0 && errs == 0) {
+        printf( " No Errors\n" );
+    }
+
+    return 0;
+}
+
+/*
+ * Write a one-line description of an MPI error to stderr.
+ *   errcode - error code returned by an MPI routine
+ *   name    - label for the call that failed, used in the message
+ * Always returns 1, so callers can add the result to an error counter.
+ */
+int ReportErr( int errcode, const char name[] )
+{
+    char text[MPI_MAX_ERROR_STRING];
+    int  textlen;
+    int  cls;
+
+    /* Look up the message text and the error class for this code */
+    MPI_Error_string( errcode, text, &textlen );
+    MPI_Error_class( errcode, &cls );
+
+    fprintf( stderr, "In %s, error code %d(class %d) = %s\n",
+             name, errcode, cls, text );
+
+    return 1;
+}
Modified: mpich2/trunk/test/mpi/errors/faults/pt2ptf1.c
===================================================================
--- mpich2/trunk/test/mpi/errors/faults/pt2ptf1.c 2009-03-28 21:34:30 UTC (rev 4217)
+++ mpich2/trunk/test/mpi/errors/faults/pt2ptf1.c 2009-03-28 21:34:49 UTC (rev 4218)
@@ -6,10 +6,10 @@
*/
#include "mpi.h"
#include <stdio.h>
+#include <stdlib.h>
#include "mpitest.h"
-static char MTEST_Descrip[] = "Test err in status return, using truncated \
-messages for MPI_Testall";
+static char MTEST_Descrip[] = "Test survivability from faults with point to point communication";
int main( int argc, char *argv[] )
{
Added: mpich2/trunk/test/mpi/errors/faults/pt2ptf2.c
===================================================================
--- mpich2/trunk/test/mpi/errors/faults/pt2ptf2.c (rev 0)
+++ mpich2/trunk/test/mpi/errors/faults/pt2ptf2.c 2009-03-28 21:34:49 UTC (rev 4218)
@@ -0,0 +1,103 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *
+ * (C) 2009 by Argonne National Laboratory.
+ * See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include "mpitest.h"
+
+static char MTEST_Descrip[] = "Test error reporting from faults with point to point communication";
+
+int ReportErr( int errcode, const char name[] );
+
+/*
+ * Split MPI_COMM_WORLD into a surviving group (color 0) and a group that
+ * calls exit(1) after a barrier to simulate a fault (color 1).  With
+ * MPI_ERRORS_RETURN installed, the survivors check that point-to-point
+ * traffic among themselves still works and that a synchronous send to an
+ * exited process returns an error instead of succeeding.
+ * World rank 0 prints " No Errors" or the total error count.
+ * Always returns 0.
+ */
+int main( int argc, char *argv[] )
+{
+    int wrank, wsize, rank, size, color;
+    int j, tmp;
+    int err, toterrs = 0, errs = 0;
+    MPI_Comm newcomm;
+
+    MPI_Init( &argc, &argv );
+
+    MPI_Comm_size( MPI_COMM_WORLD, &wsize );
+    MPI_Comm_rank( MPI_COMM_WORLD, &wrank );
+
+    /* Color is 0 or 1; 1 will be the processes that "fault" */
+    /* process 0 and wsize/2+1...wsize-1 are in non-faulting group */
+    color = (wrank > 0) && (wrank <= wsize/2);
+    MPI_Comm_split( MPI_COMM_WORLD, color, wrank, &newcomm );
+
+    MPI_Comm_size( newcomm, &size );
+    MPI_Comm_rank( newcomm, &rank );
+
+    /* Set errors return on COMM_WORLD and the new comm.  The communicator
+       is the first argument and the error handler the second. */
+    MPI_Comm_set_errhandler( MPI_COMM_WORLD, MPI_ERRORS_RETURN );
+    MPI_Comm_set_errhandler( newcomm, MPI_ERRORS_RETURN );
+
+    err = MPI_Barrier( MPI_COMM_WORLD );
+    if (err) errs += ReportErr( err, "Barrier" );
+    if (color) {
+        /* Simulate a fault on some processes */
+        exit(1);
+    }
+    else {
+        /* To improve the chance that the "faulted" processes will have
+           exited, wait for 1 second */
+        sleep( 1 );
+    }
+
+    /* Can we still use newcomm?  Each survivor first receives from every
+       lower rank and then sends to every higher rank. */
+    for (j=0; j<rank; j++) {
+        err = MPI_Recv( &tmp, 1, MPI_INT, j, 0, newcomm, MPI_STATUS_IGNORE );
+        if (err) errs += ReportErr( err, "Recv" );
+    }
+    for (j=rank+1; j<size; j++) {
+        err = MPI_Send( &rank, 1, MPI_INT, j, 0, newcomm );
+        if (err) errs += ReportErr( err, "Send" );
+    }
+
+    /* Now, try sending in MPI_COMM_WORLD on dead processes */
+    /* There is a race condition here - we don't know for sure that the faulted
+       processes have exited.  However, we can ensure a failure by using
+       synchronous sends - the sender will wait until the receiver matches
+       the message, which will not happen (the process will exit
+       without matching the message, even if it has not yet exited). */
+    for (j=1; j<=wsize/2; j++) {
+        err = MPI_Ssend( &rank, 1, MPI_INT, j, 0, MPI_COMM_WORLD );
+        if (!err) {
+            errs++;
+            fprintf( stderr, "Ssend succeeded to dead process %d\n", j );
+        }
+    }
+
+    err = MPI_Allreduce( &errs, &toterrs, 1, MPI_INT, MPI_SUM, newcomm );
+    if (err) {
+        /* toterrs is not reliable after a failed reduction; fall back to
+           the local count so the failure is still reported below */
+        errs += ReportErr( err, "Allreduce" );
+        toterrs = errs;
+    }
+    MPI_Comm_free( &newcomm );
+
+    MPI_Finalize();
+
+    /* World rank 0 is always in the surviving group and reports for all */
+    if (wrank == 0) {
+        if (toterrs > 0) {
+            printf( " Found %d errors\n", toterrs );
+        }
+        else {
+            printf( " No Errors\n" );
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Write a one-line description of an MPI error to stderr.
+ *   errcode - error code returned by an MPI routine
+ *   name    - label for the call that failed, used in the message
+ * Always returns 1, so callers can add the result to an error counter.
+ */
+int ReportErr( int errcode, const char name[] )
+{
+    char text[MPI_MAX_ERROR_STRING];
+    int  textlen;
+    int  cls;
+
+    /* Look up the message text and the error class for this code */
+    MPI_Error_string( errcode, text, &textlen );
+    MPI_Error_class( errcode, &cls );
+
+    fprintf( stderr, "In %s, error code %d(class %d) = %s\n",
+             name, errcode, cls, text );
+
+    return 1;
+}
Modified: mpich2/trunk/test/mpi/errors/faults/testlist
===================================================================
--- mpich2/trunk/test/mpi/errors/faults/testlist 2009-03-28 21:34:30 UTC (rev 4217)
+++ mpich2/trunk/test/mpi/errors/faults/testlist 2009-03-28 21:34:49 UTC (rev 4218)
@@ -1 +1,4 @@
-pt2ptf1 4
\ No newline at end of file
+pt2ptf1 4 env=MPIEXEC_ALLOW_FAULT=YES
+collf1 4 env=MPIEXEC_ALLOW_FAULT=YES
+pt2ptf2 4 env=MPIEXEC_ALLOW_FAULT=YES
+collf2 4 env=MPIEXEC_ALLOW_FAULT=YES
\ No newline at end of file
More information about the mpich2-commits
mailing list