[Darshan-commits] [Git][darshan/darshan][dev-modular] fix possible race cond in module locking
Shane Snyder
xgitlab at cels.anl.gov
Tue Dec 8 14:26:11 CST 2015
Shane Snyder pushed to branch dev-modular at darshan / darshan
Commits:
c371cdea by Shane Snyder at 2015-12-08T14:25:51Z
fix possible race cond in module locking
- - - - -
6 changed files:
- darshan-runtime/lib/darshan-bgq.c
- darshan-runtime/lib/darshan-hdf5.c
- darshan-runtime/lib/darshan-mpiio.c
- darshan-runtime/lib/darshan-null.c
- darshan-runtime/lib/darshan-pnetcdf.c
- darshan-runtime/lib/darshan-posix.c
Changes:
=====================================
darshan-runtime/lib/darshan-bgq.c
=====================================
--- a/darshan-runtime/lib/darshan-bgq.c
+++ b/darshan-runtime/lib/darshan-bgq.c
@@ -212,7 +212,6 @@ static void bgq_get_output_data(
void **buffer,
int *size)
{
-
/* Just set the output buffer to point at the array of the "BGQ" module's
* I/O records, and set the output size according to the number of records
* currently being tracked.
@@ -221,6 +220,8 @@ static void bgq_get_output_data(
int result;
uint64_t *ion_ids;
+ BGQ_LOCK();
+
if (my_rank == 0)
{
DARSHAN_MPI_CALL(PMPI_Comm_size)(mod_comm, &nprocs);
@@ -268,18 +269,21 @@ static void bgq_get_output_data(
*size = 0;
}
+ BGQ_UNLOCK();
return;
}
/* Shutdown the "BGQ" module by freeing up all data structures. */
static void bgq_shutdown()
{
+ BGQ_LOCK();
if (bgq_runtime)
{
free(bgq_runtime);
bgq_runtime = NULL;
}
+ BGQ_UNLOCK();
return;
}
=====================================
darshan-runtime/lib/darshan-hdf5.c
=====================================
--- a/darshan-runtime/lib/darshan-hdf5.c
+++ b/darshan-runtime/lib/darshan-hdf5.c
@@ -484,6 +484,8 @@ static void hdf5_get_output_data(
assert(hdf5_runtime);
+ HDF5_LOCK();
+
/* if there are globally shared files, do a shared file reduction */
/* NOTE: the shared file reduction is also skipped if the
* DARSHAN_DISABLE_SHARED_REDUCTION environment variable is set.
@@ -516,7 +518,10 @@ static void hdf5_get_output_data(
{
red_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_hdf5_file));
if(!red_recv_buf)
+ {
+ HDF5_UNLOCK();
return;
+ }
}
/* construct a datatype for a HDF5 file record. This is serving no purpose
@@ -553,6 +558,7 @@ static void hdf5_get_output_data(
*hdf5_buf = (void *)(hdf5_runtime->file_record_array);
*hdf5_buf_sz = hdf5_runtime->file_array_ndx * sizeof(struct darshan_hdf5_file);
+ HDF5_UNLOCK();
return;
}
@@ -562,6 +568,7 @@ static void hdf5_shutdown()
assert(hdf5_runtime);
+ HDF5_LOCK();
HASH_ITER(hlink, hdf5_runtime->hid_hash, ref, tmp)
{
HASH_DELETE(hlink, hdf5_runtime->hid_hash, ref);
@@ -575,6 +582,7 @@ static void hdf5_shutdown()
free(hdf5_runtime);
hdf5_runtime = NULL;
+ HDF5_UNLOCK();
return;
}
=====================================
darshan-runtime/lib/darshan-mpiio.c
=====================================
--- a/darshan-runtime/lib/darshan-mpiio.c
+++ b/darshan-runtime/lib/darshan-mpiio.c
@@ -1320,6 +1320,8 @@ static void mpiio_get_output_data(
assert(mpiio_runtime);
+ MPIIO_LOCK();
+
/* go through and set the 4 most common access sizes for MPI-IO */
for(i = 0; i < mpiio_runtime->file_array_ndx; i++)
{
@@ -1388,7 +1390,10 @@ static void mpiio_get_output_data(
{
red_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_mpiio_file));
if(!red_recv_buf)
+ {
+ MPIIO_UNLOCK();
return;
+ }
}
/* construct a datatype for a MPIIO file record. This is serving no purpose
@@ -1429,6 +1434,7 @@ static void mpiio_get_output_data(
*mpiio_buf = (void *)(mpiio_runtime->file_record_array);
*mpiio_buf_sz = mpiio_runtime->file_array_ndx * sizeof(struct darshan_mpiio_file);
+ MPIIO_UNLOCK();
return;
}
@@ -1438,6 +1444,7 @@ static void mpiio_shutdown()
assert(mpiio_runtime);
+ MPIIO_LOCK();
HASH_ITER(hlink, mpiio_runtime->fh_hash, ref, tmp)
{
HASH_DELETE(hlink, mpiio_runtime->fh_hash, ref);
@@ -1451,6 +1458,7 @@ static void mpiio_shutdown()
free(mpiio_runtime);
mpiio_runtime = NULL;
+ MPIIO_UNLOCK();
return;
}
=====================================
darshan-runtime/lib/darshan-null.c
=====================================
--- a/darshan-runtime/lib/darshan-null.c
+++ b/darshan-runtime/lib/darshan-null.c
@@ -349,6 +349,8 @@ static void null_get_output_data(
{
assert(null_runtime);
+ NULL_LOCK();
+
/* NOTE: this function can be used to run collective operations prior to
* shutting down the module, as implied by the MPI communicator passed in
* as the first argument. Typically, module developers will want to run a
@@ -366,6 +368,7 @@ static void null_get_output_data(
*null_buf = (void *)(null_runtime->record_array);
*null_buf_sz = null_runtime->rec_array_ndx * sizeof(struct darshan_null_record);
+ NULL_UNLOCK();
return;
}
@@ -374,6 +377,7 @@ static void null_shutdown()
{
assert(null_runtime);
+ NULL_LOCK();
HASH_CLEAR(hlink, null_runtime->record_hash); /* these hash entries are freed all at once below */
free(null_runtime->runtime_record_array);
@@ -381,6 +385,7 @@ static void null_shutdown()
free(null_runtime);
null_runtime = NULL;
+ NULL_UNLOCK();
return;
}
=====================================
darshan-runtime/lib/darshan-pnetcdf.c
=====================================
--- a/darshan-runtime/lib/darshan-pnetcdf.c
+++ b/darshan-runtime/lib/darshan-pnetcdf.c
@@ -496,6 +496,8 @@ static void pnetcdf_get_output_data(
assert(pnetcdf_runtime);
+ PNETCDF_LOCK();
+
/* if there are globally shared files, do a shared file reduction */
/* NOTE: the shared file reduction is also skipped if the
* DARSHAN_DISABLE_SHARED_REDUCTION environment variable is set.
@@ -528,7 +530,10 @@ static void pnetcdf_get_output_data(
{
red_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_pnetcdf_file));
if(!red_recv_buf)
+ {
+ PNETCDF_UNLOCK();
return;
+ }
}
/* construct a datatype for a PNETCDF file record. This is serving no purpose
@@ -565,6 +570,7 @@ static void pnetcdf_get_output_data(
*pnetcdf_buf = (void *)(pnetcdf_runtime->file_record_array);
*pnetcdf_buf_sz = pnetcdf_runtime->file_array_ndx * sizeof(struct darshan_pnetcdf_file);
+ PNETCDF_UNLOCK();
return;
}
@@ -574,6 +580,7 @@ static void pnetcdf_shutdown()
assert(pnetcdf_runtime);
+ PNETCDF_LOCK();
HASH_ITER(hlink, pnetcdf_runtime->ncid_hash, ref, tmp)
{
HASH_DELETE(hlink, pnetcdf_runtime->ncid_hash, ref);
@@ -587,6 +594,7 @@ static void pnetcdf_shutdown()
free(pnetcdf_runtime);
pnetcdf_runtime = NULL;
+ PNETCDF_UNLOCK();
return;
}
=====================================
darshan-runtime/lib/darshan-posix.c
=====================================
--- a/darshan-runtime/lib/darshan-posix.c
+++ b/darshan-runtime/lib/darshan-posix.c
@@ -2062,6 +2062,8 @@ static void posix_get_output_data(
assert(posix_runtime);
+ POSIX_LOCK();
+
/* go through file access data for each record and set the 4 most common
* stride/access size counters.
*/
@@ -2136,7 +2138,10 @@ static void posix_get_output_data(
{
red_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_posix_file));
if(!red_recv_buf)
+ {
+ POSIX_UNLOCK(); /* release the module lock before the early return */
return;
+ }
}
/* construct a datatype for a POSIX file record. This is serving no purpose
@@ -2177,6 +2182,7 @@ static void posix_get_output_data(
*posix_buf = (void *)(posix_runtime->file_record_array);
*posix_buf_sz = posix_runtime->file_array_ndx * sizeof(struct darshan_posix_file);
+ POSIX_UNLOCK();
return;
}
@@ -2186,6 +2192,7 @@ static void posix_shutdown()
assert(posix_runtime);
+ POSIX_LOCK();
HASH_ITER(hlink, posix_runtime->fd_hash, ref, tmp)
{
HASH_DELETE(hlink, posix_runtime->fd_hash, ref);
@@ -2198,7 +2205,8 @@ static void posix_shutdown()
free(posix_runtime->file_record_array);
free(posix_runtime);
posix_runtime = NULL;
-
+
+ POSIX_UNLOCK();
return;
}
View it on GitLab: https://xgitlab.cels.anl.gov/darshan/darshan/commit/c371cdea3257003237f12c63ff688c35934a3464
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mcs.anl.gov/pipermail/darshan-commits/attachments/20151208/12f6861d/attachment-0001.html>
More information about the Darshan-commits
mailing list