[Darshan-commits] [Git][darshan/darshan][master] 2 commits: don't call MPI_File_get_byte_offset for OpenMPI
Shane Snyder
xgitlab at cels.anl.gov
Fri Apr 23 13:20:46 CDT 2021
Shane Snyder pushed to branch master at darshan / darshan
Commits:
6c5c6484 by Shane Snyder at 2021-04-23T13:20:43-05:00
don't call MPI_File_get_byte_offset for OpenMPI
A bug observed in some OpenMPI versions could lead to crashes
triggered by Darshan's own internal MPI-IO wrappers.
- - - - -
95678e94 by Shane Snyder at 2021-04-23T13:20:43-05:00
Merge branch 'snyder/dev-issue317-openmpi-check' into 'master'
Disable MPI-IO module offset tracking for OpenMPI builds
See merge request darshan/darshan!106
- - - - -
4 changed files:
- darshan-runtime/configure
- darshan-runtime/configure.in
- darshan-runtime/darshan-runtime-config.h.in
- darshan-runtime/lib/darshan-mpiio.c
Changes:
=====================================
darshan-runtime/configure
=====================================
@@ -5206,6 +5206,40 @@ fi
done
+# check for OpenMPI
+# NOTE: this check is needed due to a bug observed in some OpenMPI
+# versions triggered by Darshan's MPI-IO module usage of the
+# MPI_File_get_byte_offset() routine. For applications that create
+# zero-length datatypes, then read/write to them, it can trigger a
+# seg fault due to a divide by zero condition in OpenMPI. See the
+# following Darshan issue for more details:
+# https://xgitlab.cels.anl.gov/darshan/darshan/-/issues/317
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+
+int
+main ()
+{
+
+ #include <mpi.h>
+ #ifdef OPEN_MPI
+ #error OPENMPI FOUND
+ #endif
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+else
+
+$as_echo "#define HAVE_OPEN_MPI 1" >>confdefs.h
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
DARSHAN_VERSION="3.3.0-pre1"
=====================================
darshan-runtime/configure.in
=====================================
@@ -557,6 +557,24 @@ AC_CHECK_FUNCS([preadv],[],[])
AC_CHECK_FUNCS([pwritev2],[],[])
AC_CHECK_FUNCS([preadv2],[],[])
+# check for OpenMPI
+# NOTE: this check is needed due to a bug observed in some OpenMPI
+# versions triggered by Darshan's MPI-IO module usage of the
+# MPI_File_get_byte_offset() routine. For applications that create
+# zero-length datatypes, then read/write to them, it can trigger a
+# seg fault due to a divide by zero condition in OpenMPI. See the
+# following Darshan issue for more details:
+# https://xgitlab.cels.anl.gov/darshan/darshan/-/issues/317
+AC_COMPILE_IFELSE([
+ AC_LANG_PROGRAM(,[
+ #include <mpi.h>
+ #ifdef OPEN_MPI
+ #error OPENMPI FOUND
+ #endif
+ ])],
+ [],
+ [AC_DEFINE(HAVE_OPEN_MPI, 1, Define if OpenMPI is being used)])
+
DARSHAN_VERSION="AC_PACKAGE_VERSION"
AC_SUBST(darshan_lib_path)
AC_SUBST(darshan_share_path)
=====================================
darshan-runtime/darshan-runtime-config.h.in
=====================================
@@ -39,6 +39,9 @@
/* Define if off64_t type is defined */
#undef HAVE_OFF64_T
+/* Define if OpenMPI is being used */
+#undef HAVE_OPEN_MPI
+
/* Define to 1 if you have the `preadv' function. */
#undef HAVE_PREADV
=====================================
darshan-runtime/lib/darshan-mpiio.c
=====================================
@@ -236,10 +236,20 @@ static int my_rank = -1;
if(newpath != __path) free(newpath); \
} while(0)
+/* XXX: this check is needed to work around an OpenMPI bug that is triggered by
+ * Darshan's MPI-IO read/write wrappers usage of 'MPI_File_get_byte_offset()'
+ * for some workloads. For more details, see comments in 'darshan-runtime/configure.in'.
+ */
+#ifndef HAVE_OPEN_MPI
+static int get_byte_offset = 1;
+#else
+static int get_byte_offset = 0;
+#endif
+
#define MPIIO_RECORD_READ(__ret, __fh, __count, __datatype, __offset, __counter, __tm1, __tm2) do { \
struct mpiio_file_record_ref *rec_ref; \
int size = 0; \
- MPI_Offset displacement=0;\
+ MPI_Offset displacement=-1;\
int64_t size_ll; \
struct darshan_common_val_counter *cvc; \
double __elapsed = __tm2-__tm1; \
@@ -248,7 +258,7 @@ static int my_rank = -1;
if(!rec_ref) break; \
PMPI_Type_size(__datatype, &size); \
size = size * __count; \
- MPI_File_get_byte_offset(__fh, __offset, &displacement);\
+ if(get_byte_offset) MPI_File_get_byte_offset(__fh, __offset, &displacement);\
/* DXT to record detailed read tracing information */ \
dxt_mpiio_read(rec_ref->file_rec->base_rec.id, displacement, size, __tm1, __tm2); \
DARSHAN_BUCKET_INC(&(rec_ref->file_rec->counters[MPIIO_SIZE_READ_AGG_0_100]), size); \
@@ -278,7 +288,7 @@ static int my_rank = -1;
#define MPIIO_RECORD_WRITE(__ret, __fh, __count, __datatype, __offset, __counter, __tm1, __tm2) do { \
struct mpiio_file_record_ref *rec_ref; \
int size = 0; \
- MPI_Offset displacement; \
+ MPI_Offset displacement=-1; \
int64_t size_ll; \
struct darshan_common_val_counter *cvc; \
double __elapsed = __tm2-__tm1; \
@@ -288,7 +298,7 @@ static int my_rank = -1;
PMPI_Type_size(__datatype, &size); \
size = size * __count; \
/* DXT to record detailed write tracing information */ \
- MPI_File_get_byte_offset(__fh, __offset, &displacement); \
+ if(get_byte_offset) MPI_File_get_byte_offset(__fh, __offset, &displacement); \
dxt_mpiio_write(rec_ref->file_rec->base_rec.id, displacement, size, __tm1, __tm2); \
DARSHAN_BUCKET_INC(&(rec_ref->file_rec->counters[MPIIO_SIZE_WRITE_AGG_0_100]), size); \
size_ll = size; \
View it on GitLab: https://xgitlab.cels.anl.gov/darshan/darshan/-/compare/5b596ed12c746a059fb198eb0779f64f4ca78f4c...95678e943cfdafab39b8ba549dfa5775bf938e5c
--
View it on GitLab: https://xgitlab.cels.anl.gov/darshan/darshan/-/compare/5b596ed12c746a059fb198eb0779f64f4ca78f4c...95678e943cfdafab39b8ba549dfa5775bf938e5c
You're receiving this email because of your account on xgitlab.cels.anl.gov.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mcs.anl.gov/pipermail/darshan-commits/attachments/20210423/8cd1665f/attachment-0001.html>
More information about the Darshan-commits mailing list