[Darshan-commits] [Git][darshan/darshan][dev-modular] fix possible race cond in module locking

Shane Snyder xgitlab at cels.anl.gov
Tue Dec 8 14:26:11 CST 2015


Shane Snyder pushed to branch dev-modular at darshan / darshan


Commits:
c371cdea by Shane Snyder at 2015-12-08T14:25:51Z
fix possible race cond in module locking

- - - - -


6 changed files:

- darshan-runtime/lib/darshan-bgq.c
- darshan-runtime/lib/darshan-hdf5.c
- darshan-runtime/lib/darshan-mpiio.c
- darshan-runtime/lib/darshan-null.c
- darshan-runtime/lib/darshan-pnetcdf.c
- darshan-runtime/lib/darshan-posix.c


Changes:

=====================================
darshan-runtime/lib/darshan-bgq.c
=====================================
--- a/darshan-runtime/lib/darshan-bgq.c
+++ b/darshan-runtime/lib/darshan-bgq.c
@@ -212,7 +212,6 @@ static void bgq_get_output_data(
     void **buffer,
     int *size)
 {
-
     /* Just set the output buffer to point at the array of the "BGQ" module's
      * I/O records, and set the output size according to the number of records
      * currently being tracked.
@@ -221,6 +220,8 @@ static void bgq_get_output_data(
     int result;
     uint64_t *ion_ids;
 
+    BGQ_LOCK();
+
     if (my_rank == 0)
     {
         DARSHAN_MPI_CALL(PMPI_Comm_size)(mod_comm, &nprocs);
@@ -268,18 +269,21 @@ static void bgq_get_output_data(
         *size   = 0;
     }
 
+    BGQ_UNLOCK();
     return;
 }
 
 /* Shutdown the "BGQ" module by freeing up all data structures. */
 static void bgq_shutdown()
 {
+    BGQ_LOCK();
     if (bgq_runtime)
     {
         free(bgq_runtime);
         bgq_runtime = NULL;
     }
 
+    BGQ_UNLOCK();
     return;
 }
 


=====================================
darshan-runtime/lib/darshan-hdf5.c
=====================================
--- a/darshan-runtime/lib/darshan-hdf5.c
+++ b/darshan-runtime/lib/darshan-hdf5.c
@@ -484,6 +484,8 @@ static void hdf5_get_output_data(
 
     assert(hdf5_runtime);
 
+    HDF5_LOCK();
+
     /* if there are globally shared files, do a shared file reduction */
     /* NOTE: the shared file reduction is also skipped if the 
      * DARSHAN_DISABLE_SHARED_REDUCTION environment variable is set.
@@ -516,7 +518,10 @@ static void hdf5_get_output_data(
         {
             red_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_hdf5_file));
             if(!red_recv_buf)
+            {
+                HDF5_UNLOCK();
                 return;
+            }
         }
 
         /* construct a datatype for a HDF5 file record.  This is serving no purpose
@@ -553,6 +558,7 @@ static void hdf5_get_output_data(
     *hdf5_buf = (void *)(hdf5_runtime->file_record_array);
     *hdf5_buf_sz = hdf5_runtime->file_array_ndx * sizeof(struct darshan_hdf5_file);
 
+    HDF5_UNLOCK();
     return;
 }
 
@@ -562,6 +568,7 @@ static void hdf5_shutdown()
 
     assert(hdf5_runtime);
 
+    HDF5_LOCK();
     HASH_ITER(hlink, hdf5_runtime->hid_hash, ref, tmp)
     {
         HASH_DELETE(hlink, hdf5_runtime->hid_hash, ref);
@@ -575,6 +582,7 @@ static void hdf5_shutdown()
     free(hdf5_runtime);
     hdf5_runtime = NULL;
 
+    HDF5_UNLOCK();
     return;
 }
 


=====================================
darshan-runtime/lib/darshan-mpiio.c
=====================================
--- a/darshan-runtime/lib/darshan-mpiio.c
+++ b/darshan-runtime/lib/darshan-mpiio.c
@@ -1320,6 +1320,8 @@ static void mpiio_get_output_data(
 
     assert(mpiio_runtime);
 
+    MPIIO_LOCK();
+
     /* go through and set the 4 most common access sizes for MPI-IO */
     for(i = 0; i < mpiio_runtime->file_array_ndx; i++)
     {
@@ -1388,7 +1390,10 @@ static void mpiio_get_output_data(
         {
             red_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_mpiio_file));
             if(!red_recv_buf)
+            {
+                MPIIO_UNLOCK();
                 return;
+            }
         }
 
         /* construct a datatype for a MPIIO file record.  This is serving no purpose
@@ -1429,6 +1434,7 @@ static void mpiio_get_output_data(
     *mpiio_buf = (void *)(mpiio_runtime->file_record_array);
     *mpiio_buf_sz = mpiio_runtime->file_array_ndx * sizeof(struct darshan_mpiio_file);
 
+    MPIIO_UNLOCK();
     return;
 }
 
@@ -1438,6 +1444,7 @@ static void mpiio_shutdown()
 
     assert(mpiio_runtime);
 
+    MPIIO_LOCK();
     HASH_ITER(hlink, mpiio_runtime->fh_hash, ref, tmp)
     {
         HASH_DELETE(hlink, mpiio_runtime->fh_hash, ref);
@@ -1451,6 +1458,7 @@ static void mpiio_shutdown()
     free(mpiio_runtime);
     mpiio_runtime = NULL;
 
+    MPIIO_UNLOCK();
     return;
 }
 


=====================================
darshan-runtime/lib/darshan-null.c
=====================================
--- a/darshan-runtime/lib/darshan-null.c
+++ b/darshan-runtime/lib/darshan-null.c
@@ -349,6 +349,8 @@ static void null_get_output_data(
 {
     assert(null_runtime);
 
+    NULL_LOCK();
+
     /* NOTE: this function can be used to run collective operations prior to
      * shutting down the module, as implied by the MPI communicator passed in
      * as the first agrument. Typically, module developers will want to run a
@@ -366,6 +368,7 @@ static void null_get_output_data(
     *null_buf = (void *)(null_runtime->record_array);
     *null_buf_sz = null_runtime->rec_array_ndx * sizeof(struct darshan_null_record);
 
+    NULL_UNLOCK();
     return;
 }
 
@@ -374,6 +377,7 @@ static void null_shutdown()
 {
     assert(null_runtime);
 
+    NULL_LOCK();
     HASH_CLEAR(hlink, null_runtime->record_hash); /* these hash entries are freed all at once below */
 
     free(null_runtime->runtime_record_array);
@@ -381,6 +385,7 @@ static void null_shutdown()
     free(null_runtime);
     null_runtime = NULL;
 
+    NULL_UNLOCK();
     return;
 }
 


=====================================
darshan-runtime/lib/darshan-pnetcdf.c
=====================================
--- a/darshan-runtime/lib/darshan-pnetcdf.c
+++ b/darshan-runtime/lib/darshan-pnetcdf.c
@@ -496,6 +496,8 @@ static void pnetcdf_get_output_data(
 
     assert(pnetcdf_runtime);
 
+    PNETCDF_LOCK();
+
     /* if there are globally shared files, do a shared file reduction */
     /* NOTE: the shared file reduction is also skipped if the 
      * DARSHAN_DISABLE_SHARED_REDUCTION environment variable is set.
@@ -528,7 +530,10 @@ static void pnetcdf_get_output_data(
         {
             red_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_pnetcdf_file));
             if(!red_recv_buf)
+            {
+                PNETCDF_UNLOCK();
                 return;
+            }
         }
 
         /* construct a datatype for a PNETCDF file record.  This is serving no purpose
@@ -565,6 +570,7 @@ static void pnetcdf_get_output_data(
     *pnetcdf_buf = (void *)(pnetcdf_runtime->file_record_array);
     *pnetcdf_buf_sz = pnetcdf_runtime->file_array_ndx * sizeof(struct darshan_pnetcdf_file);
 
+    PNETCDF_UNLOCK();
     return;
 }
 
@@ -574,6 +580,7 @@ static void pnetcdf_shutdown()
 
     assert(pnetcdf_runtime);
 
+    PNETCDF_LOCK();
     HASH_ITER(hlink, pnetcdf_runtime->ncid_hash, ref, tmp)
     {
         HASH_DELETE(hlink, pnetcdf_runtime->ncid_hash, ref);
@@ -587,6 +594,7 @@ static void pnetcdf_shutdown()
     free(pnetcdf_runtime);
     pnetcdf_runtime = NULL;
 
+    PNETCDF_UNLOCK();
     return;
 }
 


=====================================
darshan-runtime/lib/darshan-posix.c
=====================================
--- a/darshan-runtime/lib/darshan-posix.c
+++ b/darshan-runtime/lib/darshan-posix.c
@@ -2062,6 +2062,8 @@ static void posix_get_output_data(
 
     assert(posix_runtime);
 
+    POSIX_LOCK();
+
     /* go through file access data for each record and set the 4 most common
      * stride/access size counters.
      */
@@ -2136,7 +2138,10 @@ static void posix_get_output_data(
         {
             red_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_posix_file));
             if(!red_recv_buf)
+            {
+                POSIX_UNLOCK();
                 return;
+            }
         }
 
         /* construct a datatype for a POSIX file record.  This is serving no purpose
@@ -2177,6 +2182,7 @@ static void posix_get_output_data(
     *posix_buf = (void *)(posix_runtime->file_record_array);
     *posix_buf_sz = posix_runtime->file_array_ndx * sizeof(struct darshan_posix_file);
 
+    POSIX_UNLOCK();
     return;
 }
 
@@ -2186,6 +2192,7 @@ static void posix_shutdown()
 
     assert(posix_runtime);
 
+    POSIX_LOCK();
     HASH_ITER(hlink, posix_runtime->fd_hash, ref, tmp)
     {
         HASH_DELETE(hlink, posix_runtime->fd_hash, ref);
@@ -2198,7 +2205,8 @@ static void posix_shutdown()
     free(posix_runtime->file_record_array);
     free(posix_runtime);
     posix_runtime = NULL;
-
+    
+    POSIX_UNLOCK();
     return;
 }
 



View it on GitLab: https://xgitlab.cels.anl.gov/darshan/darshan/commit/c371cdea3257003237f12c63ff688c35934a3464
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mcs.anl.gov/pipermail/darshan-commits/attachments/20151208/12f6861d/attachment-0001.html>


More information about the Darshan-commits mailing list