[Darshan-commits] [Git][darshan/darshan][master] 2 commits: don't call MPI_File_get_byte_offset for OpenMPI

Shane Snyder xgitlab at cels.anl.gov
Fri Apr 23 13:20:46 CDT 2021



Shane Snyder pushed to branch master at darshan / darshan


Commits:
6c5c6484 by Shane Snyder at 2021-04-23T13:20:43-05:00
don't call MPI_File_get_byte_offset for OpenMPI

A bug observed in some OpenMPI versions could lead to crashes
triggered by Darshan's own internal MPI-IO wrappers.

- - - - -
95678e94 by Shane Snyder at 2021-04-23T13:20:43-05:00
Merge branch 'snyder/dev-issue317-openmpi-check' into 'master'

Disable MPI-IO module offset tracking for OpenMPI builds

See merge request darshan/darshan!106
- - - - -


4 changed files:

- darshan-runtime/configure
- darshan-runtime/configure.in
- darshan-runtime/darshan-runtime-config.h.in
- darshan-runtime/lib/darshan-mpiio.c


Changes:

=====================================
darshan-runtime/configure
=====================================
@@ -5206,6 +5206,40 @@ fi
 done
 
 
+# check for OpenMPI
+# NOTE: this check is needed due to a bug observed in some OpenMPI
+# versions that is triggered by the Darshan MPI-IO module's use of
+# the MPI_File_get_byte_offset() routine. When an application creates
+# zero-length datatypes and then reads/writes using them, the call
+# can segfault due to a divide-by-zero condition in OpenMPI. See the
+# following Darshan issue for more details:
+#   https://xgitlab.cels.anl.gov/darshan/darshan/-/issues/317
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+
+int
+main ()
+{
+
+        #include <mpi.h>
+        #ifdef OPEN_MPI
+        #error OPENMPI FOUND
+        #endif
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+else
+
+$as_echo "#define HAVE_OPEN_MPI 1" >>confdefs.h
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
 DARSHAN_VERSION="3.3.0-pre1"
 
 


=====================================
darshan-runtime/configure.in
=====================================
@@ -557,6 +557,24 @@ AC_CHECK_FUNCS([preadv],[],[])
 AC_CHECK_FUNCS([pwritev2],[],[])
 AC_CHECK_FUNCS([preadv2],[],[])
 
+# check for OpenMPI
+# NOTE: this check is needed due to a bug observed in some OpenMPI
+# versions that is triggered by the Darshan MPI-IO module's use of
+# the MPI_File_get_byte_offset() routine. When an application creates
+# zero-length datatypes and then reads/writes using them, the call
+# can segfault due to a divide-by-zero condition in OpenMPI. See the
+# following Darshan issue for more details:
+#   https://xgitlab.cels.anl.gov/darshan/darshan/-/issues/317
+AC_COMPILE_IFELSE([
+    AC_LANG_PROGRAM(,[
+        #include <mpi.h>
+        #ifdef OPEN_MPI
+        #error OPENMPI FOUND
+        #endif
+    ])],
+    [],
+    [AC_DEFINE(HAVE_OPEN_MPI, 1, Define if OpenMPI is being used)])
+
 DARSHAN_VERSION="AC_PACKAGE_VERSION"
 AC_SUBST(darshan_lib_path)
 AC_SUBST(darshan_share_path)


=====================================
darshan-runtime/darshan-runtime-config.h.in
=====================================
@@ -39,6 +39,9 @@
 /* Define if off64_t type is defined */
 #undef HAVE_OFF64_T
 
+/* Define if OpenMPI is being used */
+#undef HAVE_OPEN_MPI
+
 /* Define to 1 if you have the `preadv' function. */
 #undef HAVE_PREADV
 


=====================================
darshan-runtime/lib/darshan-mpiio.c
=====================================
@@ -236,10 +236,20 @@ static int my_rank = -1;
     if(newpath != __path) free(newpath); \
 } while(0)
 
+/* XXX: this check is needed to work around an OpenMPI bug that is triggered by
+ * Darshan's MPI-IO read/write wrappers' use of 'MPI_File_get_byte_offset()'
+ * for some workloads. For more details, see comments in 'darshan-runtime/configure.in'.
+ */
+#ifndef HAVE_OPEN_MPI
+static int get_byte_offset = 1;
+#else
+static int get_byte_offset = 0;
+#endif
+
 #define MPIIO_RECORD_READ(__ret, __fh, __count, __datatype, __offset, __counter, __tm1, __tm2) do { \
     struct mpiio_file_record_ref *rec_ref; \
     int size = 0; \
-    MPI_Offset displacement=0;\
+    MPI_Offset displacement=-1;\
     int64_t size_ll; \
     struct darshan_common_val_counter *cvc; \
     double __elapsed = __tm2-__tm1; \
@@ -248,7 +258,7 @@ static int my_rank = -1;
     if(!rec_ref) break; \
     PMPI_Type_size(__datatype, &size);  \
     size = size * __count; \
-    MPI_File_get_byte_offset(__fh, __offset, &displacement);\
+    if(get_byte_offset) MPI_File_get_byte_offset(__fh, __offset, &displacement);\
     /* DXT to record detailed read tracing information */ \
     dxt_mpiio_read(rec_ref->file_rec->base_rec.id, displacement, size, __tm1, __tm2); \
     DARSHAN_BUCKET_INC(&(rec_ref->file_rec->counters[MPIIO_SIZE_READ_AGG_0_100]), size); \
@@ -278,7 +288,7 @@ static int my_rank = -1;
 #define MPIIO_RECORD_WRITE(__ret, __fh, __count, __datatype, __offset, __counter, __tm1, __tm2) do { \
     struct mpiio_file_record_ref *rec_ref; \
     int size = 0; \
-    MPI_Offset displacement; \
+    MPI_Offset displacement=-1; \
     int64_t size_ll; \
     struct darshan_common_val_counter *cvc; \
     double __elapsed = __tm2-__tm1; \
@@ -288,7 +298,7 @@ static int my_rank = -1;
     PMPI_Type_size(__datatype, &size);  \
     size = size * __count; \
     /* DXT to record detailed write tracing information */ \
-    MPI_File_get_byte_offset(__fh, __offset, &displacement); \
+    if(get_byte_offset) MPI_File_get_byte_offset(__fh, __offset, &displacement); \
     dxt_mpiio_write(rec_ref->file_rec->base_rec.id, displacement, size, __tm1, __tm2); \
     DARSHAN_BUCKET_INC(&(rec_ref->file_rec->counters[MPIIO_SIZE_WRITE_AGG_0_100]), size); \
     size_ll = size; \



View it on GitLab: https://xgitlab.cels.anl.gov/darshan/darshan/-/compare/5b596ed12c746a059fb198eb0779f64f4ca78f4c...95678e943cfdafab39b8ba549dfa5775bf938e5c

-- 
View it on GitLab: https://xgitlab.cels.anl.gov/darshan/darshan/-/compare/5b596ed12c746a059fb198eb0779f64f4ca78f4c...95678e943cfdafab39b8ba549dfa5775bf938e5c
You're receiving this email because of your account on xgitlab.cels.anl.gov.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mcs.anl.gov/pipermail/darshan-commits/attachments/20210423/8cd1665f/attachment-0001.html>


More information about the Darshan-commits mailing list