[Darshan-commits] [Git][darshan/darshan][dev-no-mpi] 2 commits: bug fix: don't use mpi_in_place on nonzero ranks
Shane Snyder
xgitlab at cels.anl.gov
Tue Dec 10 22:41:02 CST 2019
Shane Snyder pushed to branch dev-no-mpi at darshan / darshan
Commits:
d987494e by Shane Snyder at 2019-12-11T04:40:47Z
bug fix: don't use mpi_in_place on nonzero ranks
- - - - -
aff75f19 by Shane Snyder at 2019-12-11T04:40:47Z
bug fix: set input buffer for mpi reduxes
- - - - -
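Both commits address the same MPI requirement: MPI_IN_PLACE is a valid sendbuf
argument to MPI_Reduce only on the root rank, while non-root ranks must supply
a real send buffer (their recvbuf argument is significant only at the root).
A minimal sketch of the pattern the commits apply, using a hypothetical helper
name rather than actual Darshan code:

    #include <mpi.h>

    /* Root reduces in place; non-root ranks pass a real sendbuf and
     * their recvbuf is ignored. max_to_root() is a hypothetical name. */
    static void max_to_root(double *val, int my_rank, MPI_Comm comm)
    {
        if (my_rank == 0)
            PMPI_Reduce(MPI_IN_PLACE, val, 1, MPI_DOUBLE, MPI_MAX, 0, comm);
        else
            PMPI_Reduce(val, val, 1, MPI_DOUBLE, MPI_MAX, 0, comm);
    }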
1 changed file:
- darshan-runtime/lib/darshan-core.c
Changes:
=====================================
darshan-runtime/lib/darshan-core.c
=====================================
@@ -383,10 +383,18 @@ void darshan_core_initialize(int argc, char **argv)
#ifdef HAVE_MPI
if(using_mpi)
{
- PMPI_Reduce(MPI_IN_PLACE, &init_time, 1,
- MPI_DOUBLE, MPI_MAX, 0, darshan_core->mpi_comm);
+ if(my_rank == 0)
+ {
+ PMPI_Reduce(MPI_IN_PLACE, &init_time, 1,
+ MPI_DOUBLE, MPI_MAX, 0, darshan_core->mpi_comm);
+ }
+ else
+ {
+ PMPI_Reduce(&init_time, &init_time, 1,
+ MPI_DOUBLE, MPI_MAX, 0, darshan_core->mpi_comm);
+ return; /* return early so every rank doesn't print */
+ }
}
- if(my_rank > 0) return;
#endif
darshan_core_fprintf(stderr, "#darshan:<op>\t<nprocs>\t<time>\n");
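Note that on non-root ranks the replacement calls alias sendbuf and recvbuf
(e.g. &init_time passed as both). This works because MPI_Reduce's recvbuf is
significant only at the root, though a strict reading of MPI's no-aliasing
rule favors a distinct recvbuf; an equivalent, arguably clearer spelling for
the non-root call would be (a sketch, not the committed code):

    /* recvbuf is ignored off the root, so NULL makes that explicit */
    PMPI_Reduce(&init_time, NULL, 1,
        MPI_DOUBLE, MPI_MAX, 0, darshan_core->mpi_comm);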
@@ -473,10 +481,22 @@ void darshan_core_shutdown()
MPI_SUM, final_core->mpi_comm);
/* reduce to report first start and last end time across all ranks at rank 0 */
- PMPI_Reduce(MPI_IN_PLACE, &final_core->log_job_p->start_time,
- 1, MPI_INT64_T, MPI_MIN, 0, final_core->mpi_comm);
- PMPI_Reduce(MPI_IN_PLACE, &final_core->log_job_p->end_time,
- 1, MPI_INT64_T, MPI_MAX, 0, final_core->mpi_comm);
+ if(my_rank == 0)
+ {
+ PMPI_Reduce(MPI_IN_PLACE, &final_core->log_job_p->start_time,
+ 1, MPI_INT64_T, MPI_MIN, 0, final_core->mpi_comm);
+ PMPI_Reduce(MPI_IN_PLACE, &final_core->log_job_p->end_time,
+ 1, MPI_INT64_T, MPI_MAX, 0, final_core->mpi_comm);
+ }
+ else
+ {
+ PMPI_Reduce(&final_core->log_job_p->start_time,
+ &final_core->log_job_p->start_time,
+ 1, MPI_INT64_T, MPI_MIN, 0, final_core->mpi_comm);
+ PMPI_Reduce(&final_core->log_job_p->end_time,
+ &final_core->log_job_p->end_time,
+ 1, MPI_INT64_T, MPI_MAX, 0, final_core->mpi_comm);
+ }
/* get a list of records which are shared across all processes */
darshan_get_shared_records(final_core, &shared_recs, &shared_rec_cnt);
@@ -547,42 +567,40 @@ void darshan_core_shutdown()
if(internal_timing_flag)
mod1[i] = darshan_core_wtime();
-#ifdef HAVE_MPI
- struct darshan_core_name_record_ref *ref = NULL;
- int mod_shared_rec_cnt = 0;
- int j;
-
- if(using_mpi)
+ /* if module is registered locally, perform module shutdown operations */
+ if(this_mod)
{
- /* set the shared record list for this module */
- for(j = 0; j < shared_rec_cnt; j++)
+ mod_buf = final_core->mod_array[i]->rec_buf_start;
+ mod_buf_sz = final_core->mod_array[i]->rec_buf_p - mod_buf;
+
+#ifdef HAVE_MPI
+ if(using_mpi)
{
- HASH_FIND(hlink, final_core->name_hash, &shared_recs[j],
- sizeof(darshan_record_id), ref);
- assert(ref);
- if(DARSHAN_MOD_FLAG_ISSET(ref->global_mod_flags, i))
+ struct darshan_core_name_record_ref *ref = NULL;
+ int mod_shared_rec_cnt = 0;
+ int j;
+
+ /* set the shared record list for this module */
+ for(j = 0; j < shared_rec_cnt; j++)
{
- mod_shared_recs[mod_shared_rec_cnt++] = shared_recs[j];
+ HASH_FIND(hlink, final_core->name_hash, &shared_recs[j],
+ sizeof(darshan_record_id), ref);
+ assert(ref);
+ if(DARSHAN_MOD_FLAG_ISSET(ref->global_mod_flags, i))
+ {
+ mod_shared_recs[mod_shared_rec_cnt++] = shared_recs[j];
+ }
}
- }
- /* allow the module an opportunity to reduce shared files */
- if(this_mod->mod_funcs.mod_redux_func && (mod_shared_recs > 0) &&
- (!getenv("DARSHAN_DISABLE_SHARED_REDUCTION")))
- this_mod->mod_funcs.mod_redux_func(mod_buf, final_core->mpi_comm,
- mod_shared_recs, mod_shared_rec_cnt);
- }
+ /* allow the module an opportunity to reduce shared files */
+ if(this_mod->mod_funcs.mod_redux_func && (mod_shared_recs > 0) &&
+ (!getenv("DARSHAN_DISABLE_SHARED_REDUCTION")))
+ this_mod->mod_funcs.mod_redux_func(mod_buf, final_core->mpi_comm,
+ mod_shared_recs, mod_shared_rec_cnt);
+ }
#endif
- /* if module is registered locally, get the corresponding output buffer
- *
- * NOTE: this function can be used to run collective operations across
- * modules, if there are records shared globally.
- */
- if(this_mod)
- {
- mod_buf = final_core->mod_array[i]->rec_buf_start;
- mod_buf_sz = final_core->mod_array[i]->rec_buf_p - mod_buf;
+ /* get the final output buffer */
this_mod->mod_funcs.mod_shutdown_func(&mod_buf, &mod_buf_sz);
}
@@ -643,22 +661,39 @@ void darshan_core_shutdown()
#ifdef HAVE_MPI
if(using_mpi)
{
- PMPI_Reduce(MPI_IN_PLACE, &open_tm, 1,
- MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
- PMPI_Reduce(MPI_IN_PLACE, &header_tm, 1,
- MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
- PMPI_Reduce(MPI_IN_PLACE, &job_tm, 1,
- MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
- PMPI_Reduce(MPI_IN_PLACE, &rec_tm, 1,
- MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
- PMPI_Reduce(MPI_IN_PLACE, &all_tm, 1,
- MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
- PMPI_Reduce(MPI_IN_PLACE, mod_tm, DARSHAN_MAX_MODS,
- MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
-
- /* let rank 0 report the timing info */
- if(my_rank > 0)
+ if(my_rank == 0)
+ {
+ PMPI_Reduce(MPI_IN_PLACE, &open_tm, 1,
+ MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
+ PMPI_Reduce(MPI_IN_PLACE, &header_tm, 1,
+ MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
+ PMPI_Reduce(MPI_IN_PLACE, &job_tm, 1,
+ MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
+ PMPI_Reduce(MPI_IN_PLACE, &rec_tm, 1,
+ MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
+ PMPI_Reduce(MPI_IN_PLACE, &all_tm, 1,
+ MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
+ PMPI_Reduce(MPI_IN_PLACE, mod_tm, DARSHAN_MAX_MODS,
+ MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
+ }
+ else
+ {
+ PMPI_Reduce(&open_tm, &open_tm, 1,
+ MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
+ PMPI_Reduce(&header_tm, &header_tm, 1,
+ MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
+ PMPI_Reduce(&job_tm, &job_tm, 1,
+ MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
+ PMPI_Reduce(&rec_tm, &rec_tm, 1,
+ MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
+ PMPI_Reduce(&all_tm, &all_tm, 1,
+ MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
+ PMPI_Reduce(mod_tm, mod_tm, DARSHAN_MAX_MODS,
+ MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
+
+ /* let rank 0 report the timing info */
goto exit;
+ }
}
#endif
@@ -1678,7 +1713,7 @@ static int darshan_log_write_header(darshan_core_log_fh log_fh,
#ifdef HAVE_MPI
MPI_Status status;
- if (using_mpi)
+ if(using_mpi)
{
/* write out log header, after running 2 reductions on header variables:
* 1) reduce 'partial_flag' variable to determine which modules ran out
@@ -1686,16 +1721,25 @@ static int darshan_log_write_header(darshan_core_log_fh log_fh,
* 2) reduce 'mod_ver' array to determine which log format version each
* module used for this output log
*/
- PMPI_Reduce(
- MPI_IN_PLACE, &(core->log_hdr_p->partial_flag),
- 1, MPI_UINT32_T, MPI_BOR, 0, core->mpi_comm);
- PMPI_Reduce(
- MPI_IN_PLACE, &(core->log_hdr_p->mod_ver),
- DARSHAN_MAX_MODS, MPI_UINT32_T, MPI_MAX, 0, core->mpi_comm);
-
- /* only rank 0 writes the header */
- if(my_rank > 0)
- return(0);
+ if(my_rank == 0)
+ {
+ PMPI_Reduce(
+ MPI_IN_PLACE, &(core->log_hdr_p->partial_flag),
+ 1, MPI_UINT32_T, MPI_BOR, 0, core->mpi_comm);
+ PMPI_Reduce(
+ MPI_IN_PLACE, &(core->log_hdr_p->mod_ver),
+ DARSHAN_MAX_MODS, MPI_UINT32_T, MPI_MAX, 0, core->mpi_comm);
+ }
+ else
+ {
+ PMPI_Reduce(
+ &(core->log_hdr_p->partial_flag), &(core->log_hdr_p->partial_flag),
+ 1, MPI_UINT32_T, MPI_BOR, 0, core->mpi_comm);
+ PMPI_Reduce(
+ &(core->log_hdr_p->mod_ver), &(core->log_hdr_p->mod_ver),
+ DARSHAN_MAX_MODS, MPI_UINT32_T, MPI_MAX, 0, core->mpi_comm);
+ return(0); /* only rank 0 writes the header */
+ }
/* write the header using MPI */
ret = PMPI_File_write_at(log_fh.mpi_fh, 0, core->log_hdr_p,
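The header write follows the same discipline: every rank must enter the two
collective reductions, and only then may non-root ranks return, leaving rank 0
alone to call PMPI_File_write_at. A self-contained sketch of that control
flow, again with a hypothetical helper rather than the actual Darshan routine:

    #include <mpi.h>
    #include <stdint.h>

    /* All ranks join the collective reduction; rank 0 then writes the
     * reduced flag to the log. Hypothetical sketch, not Darshan code. */
    static int write_flag_at_root(MPI_File fh, uint32_t *flag, int my_rank,
                                  MPI_Comm comm)
    {
        if (my_rank == 0)
            PMPI_Reduce(MPI_IN_PLACE, flag, 1, MPI_UINT32_T, MPI_BOR, 0, comm);
        else {
            PMPI_Reduce(flag, NULL, 1, MPI_UINT32_T, MPI_BOR, 0, comm);
            return 0; /* non-root ranks skip the write */
        }
        return PMPI_File_write_at(fh, 0, flag, 1, MPI_UINT32_T,
                                  MPI_STATUS_IGNORE);
    }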
View it on GitLab: https://xgitlab.cels.anl.gov/darshan/darshan/compare/c0e202d699e382a9516eac42dca3e0592137cf45...aff75f19b5cb8d3d1e3cf01e7eabac8a84583105