[Darshan-commits] [Darshan] branch, dev-modular, updated. darshan-2.3.1-118-gaa46bf3

Service Account git at mcs.anl.gov
Wed Jul 22 15:03:05 CDT 2015


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "".

The branch, dev-modular has been updated
       via  aa46bf39a7c294be76267303f98f8d34a5da3975 (commit)
       via  4e6d0c89355ee38b733e83d594b77590f379ecde (commit)
       via  f4df41a355a7327706ed3a9fc8d65a0ab4496c0b (commit)
      from  a708b0ce36c2bc7fd25b44d1756624129bb870e0 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit aa46bf39a7c294be76267303f98f8d34a5da3975
Author: Shane Snyder <ssnyder at mcs.anl.gov>
Date:   Wed Jul 22 15:02:40 2015 -0500

    add time/size variance reductions to mpiio/posix

commit 4e6d0c89355ee38b733e83d594b77590f379ecde
Author: Shane Snyder <ssnyder at mcs.anl.gov>
Date:   Wed Jul 22 10:32:40 2015 -0500

    move shared file redux code to get_output func

commit f4df41a355a7327706ed3a9fc8d65a0ab4496c0b
Author: Shane Snyder <ssnyder at mcs.anl.gov>
Date:   Tue Jul 21 16:42:41 2015 -0500

    add slowest/fastest rank counters to mpiio

-----------------------------------------------------------------------

Summary of changes:
 darshan-mpiio-log-format.h           |    5 +-
 darshan-posix-log-format.h           |    9 +-
 darshan-runtime/darshan-common.h     |   25 +++
 darshan-runtime/darshan.h            |   44 ++---
 darshan-runtime/lib/darshan-common.c |   21 ++
 darshan-runtime/lib/darshan-core.c   |   70 ++-----
 darshan-runtime/lib/darshan-mpiio.c  |  360 ++++++++++++++++++++++---------
 darshan-runtime/lib/darshan-null.c   |   26 ++-
 darshan-runtime/lib/darshan-posix.c  |  393 +++++++++++++++++++++-------------
 9 files changed, 611 insertions(+), 342 deletions(-)


Diff of changes:
diff --git a/darshan-mpiio-log-format.h b/darshan-mpiio-log-format.h
index 2dd254d..5aa28e2 100644
--- a/darshan-mpiio-log-format.h
+++ b/darshan-mpiio-log-format.h
@@ -9,7 +9,6 @@
 
 #include "darshan-log-format.h"
 
-/* TODO: slowest/fastest rank (f)counters */
 /* TODO: maybe use a counter to track cases in which a derived datatype is used? */
 
 #define MPIIO_COUNTERS \
@@ -116,6 +115,10 @@
     /* total i/o and meta time for fastest/slowest ranks */\
     X(MPIIO_F_FASTEST_RANK_TIME) \
     X(MPIIO_F_SLOWEST_RANK_TIME) \
+    /* variance of total i/o time and bytes moved across all ranks */\
+    /* NOTE: for shared records only */\
+    X(MPIIO_F_VARIANCE_RANK_TIME) \
+    X(MPIIO_F_VARIANCE_RANK_BYTES) \
     /* end of counters*/\
     X(MPIIO_F_NUM_INDICES)
 
diff --git a/darshan-posix-log-format.h b/darshan-posix-log-format.h
index c7c5503..237e478 100644
--- a/darshan-posix-log-format.h
+++ b/darshan-posix-log-format.h
@@ -8,11 +8,6 @@
 
 #include "darshan-log-format.h"
 
-/* TODO we need to be able to run more reduction operations to get
- * time and byte variances for shared files. currently, darshan-core
- * just runs a single reduction, which is used to reduce all other
- * shared record fields. (VARIANCE_RANK_TIME, VARIANCE_RANK_BYTES) */
-
 #define POSIX_COUNTERS \
     /* count of posix opens */\
     X(POSIX_OPENS) \
@@ -144,6 +139,10 @@
     /* total i/o and meta time consumed for fastest/slowest ranks */\
     X(POSIX_F_FASTEST_RANK_TIME) \
     X(POSIX_F_SLOWEST_RANK_TIME) \
+    /* variance of total i/o time and bytes moved across all ranks */\
+    /* NOTE: for shared records only */\
+    X(POSIX_F_VARIANCE_RANK_TIME) \
+    X(POSIX_F_VARIANCE_RANK_BYTES) \
     /* end of counters */\
     X(POSIX_F_NUM_INDICES)
 
diff --git a/darshan-runtime/darshan-common.h b/darshan-runtime/darshan-common.h
index c359b6e..4e42188 100644
--- a/darshan-runtime/darshan-common.h
+++ b/darshan-runtime/darshan-common.h
@@ -118,6 +118,14 @@ enum darshan_io_type
     DARSHAN_IO_WRITE = 2,
 };
 
+/* struct used for calculating variances */
+struct darshan_variance_dt
+{
+    double n;
+    double T;
+    double S;
+};
+
 /***********************************************
 * darshan-common functions for darshan modules *
 ***********************************************/
@@ -164,4 +172,21 @@ void darshan_walk_common_vals(
     int64_t* val_p,
     int64_t* cnt_p);
 
+/* darshan_variance_reduce()
+ *
+ * MPI reduction operation to calculate variances on counters in
+ * data records which are shared across all processes. This could
+ * be used, for instance, to find the variance in I/O time or total
+ * bytes moved for a given data record. This function needs to be
+ * passed to MPI_Op_create to obtain a corresponding MPI operation
+ * which can be used to complete the reduction.  For more details,
+ * consult the documentation for MPI_Op_create. Example use cases
+ * can be found in the POSIX and MPIIO modules.
+ */
+void darshan_variance_reduce(
+    void *invec,
+    void *inoutvec,
+    int *len,
+    MPI_Datatype *dt);
+
 #endif /* __DARSHAN_COMMON_H */
diff --git a/darshan-runtime/darshan.h b/darshan-runtime/darshan.h
index 4a15340..d6aacca 100644
--- a/darshan-runtime/darshan.h
+++ b/darshan-runtime/darshan.h
@@ -56,38 +56,28 @@
 /* module developers provide the following functions to darshan-core */
 struct darshan_module_funcs
 {
-    /* perform any necessary pre-shutdown steps */
+    /* perform any necessary pre-shutdown steps
+     *
+     * NOTE: this typically includes disabling wrapper functions so
+     * darshan-core can shutdown in a consistent state.
+     */
     void (*begin_shutdown)(void);
-    /* retrieve module data to write to log file */
+    /* retrieve module data to write to log file
+     *
+     * NOTE: module developers can use this function to run collective
+     * MPI operations at shutdown time. Typically this functionality
+     * has been used to reduce records shared globally (given in the
+     * 'shared_recs' array) into a single data record.
+     */
     void (*get_output_data)(
-        void** buf, /* output parameter to save module buffer address */
-        int* size /* output parameter to save module buffer size */
+        MPI_Comm mod_comm,  /* MPI communicator to run collectives with */
+        darshan_record_id *shared_recs, /* list of shared data record ids */
+        int shared_rec_count, /* count of shared data records */
+        void** mod_buf, /* output parameter to save module buffer address */
+        int* mod_buf_sz /* output parameter to save module buffer size */
     );
     /* shutdown module data structures */
     void (*shutdown)(void);
-    /* (OPTIONAL) perform any necessary steps prior to performing a reduction
-     * of shared Darshan I/O records. To bypass shared file reduction mechanism,
-     * set this pointer to NULL.
-     */
-    void (*setup_reduction)(
-        darshan_record_id *shared_recs, /* input list of shared records */
-        int *shared_rec_count, /* in/out shared record count */
-        void **send_buf, /* send buffer for shared file reduction */
-        void **recv_buf, /* recv buffer for shared file reduction (root only) */
-        int *rec_size /* size of records being stored for this module */
-    );
-    /* (OPTIONAL) perform the actual shared file reduction operation. This 
-     * operation follows the prototype of MPI_Op_create, which allows the
-     * specification of user-defined combination functions which may be used
-     * directly by MPI. To bypass shared file reduction mechanism, set this
-     * pointer to NULL. 
-     */
-    void (*record_reduction_op)(
-        void* infile_v,
-        void* inoutfile_v,
-        int *len,
-        MPI_Datatype *datatype
-    );
 };
 
 /* paths that darshan will not trace */
diff --git a/darshan-runtime/lib/darshan-common.c b/darshan-runtime/lib/darshan-common.c
index 292e986..6f0468d 100644
--- a/darshan-runtime/lib/darshan-common.c
+++ b/darshan-runtime/lib/darshan-common.c
@@ -173,6 +173,27 @@ static int darshan_common_val_compare(const void* a_p, const void* b_p)
     return(0);
 }
 
+void darshan_variance_reduce(void *invec, void *inoutvec, int *len,
+    MPI_Datatype *dt)
+{
+    int i;
+    struct darshan_variance_dt *X = invec;
+    struct darshan_variance_dt *Y = inoutvec;
+    struct darshan_variance_dt  Z;
+
+    for (i=0; i<*len; i++,X++,Y++)
+    {
+        Z.n = X->n + Y->n;
+        Z.T = X->T + Y->T;
+        Z.S = X->S + Y->S + (X->n/(Y->n*Z.n)) *
+           ((Y->n/X->n)*X->T - Y->T) * ((Y->n/X->n)*X->T - Y->T);
+
+        *Y = Z;
+    }
+
+    return;
+}
+
 /*
  * Local variables:
  *  c-indent-level: 4
diff --git a/darshan-runtime/lib/darshan-core.c b/darshan-runtime/lib/darshan-core.c
index 42e81f6..97ca06c 100644
--- a/darshan-runtime/lib/darshan-core.c
+++ b/darshan-runtime/lib/darshan-core.c
@@ -452,8 +452,9 @@ void darshan_core_shutdown()
     for(i = 0; i < DARSHAN_MAX_MODS; i++)
     {
         struct darshan_core_module* this_mod = final_core->mod_array[i];
-        darshan_record_id mod_shared_recs[DARSHAN_CORE_MAX_RECORDS];
         struct darshan_core_record_ref *ref = NULL;
+        darshan_record_id mod_shared_recs[DARSHAN_CORE_MAX_RECORDS];
+        int mod_shared_rec_cnt = 0;
         void* mod_buf = NULL;
         int mod_buf_sz = 0;
         int j;
@@ -469,63 +470,30 @@ void darshan_core_shutdown()
         }
  
         if(internal_timing_flag)
-            mod1[i] = DARSHAN_MPI_CALL(PMPI_Wtime)();   
-        /* if all processes used this module, prepare to do a shared file reduction */
-        if(global_mod_use_count[i] == nprocs)
-        {
-            int shared_rec_count = 0;
-            int rec_sz = 0;
-            void *red_send_buf = NULL, *red_recv_buf = NULL;
-            MPI_Datatype red_type;
-            MPI_Op red_op;
-
-            /* set the shared file list for this module */
-            memset(mod_shared_recs, 0, DARSHAN_CORE_MAX_RECORDS * sizeof(darshan_record_id));
-            for(j = 0; j < DARSHAN_CORE_MAX_RECORDS && shared_recs[j] != 0; j++)
-            {
-                HASH_FIND(hlink, final_core->rec_hash, &shared_recs[j],
-                    sizeof(darshan_record_id), ref);
-                assert(ref);
-                if(DARSHAN_CORE_MOD_ISSET(ref->global_mod_flags, i))
-                {
-                    mod_shared_recs[shared_rec_count++] = shared_recs[j];
-                }
-            }
+            mod1[i] = DARSHAN_MPI_CALL(PMPI_Wtime)();
 
-            /* if there are globally shared files, do a shared file reduction */
-            if(shared_rec_count && this_mod->mod_funcs.setup_reduction &&
-               this_mod->mod_funcs.record_reduction_op)
+        /* set the shared file list for this module */
+        memset(mod_shared_recs, 0, DARSHAN_CORE_MAX_RECORDS * sizeof(darshan_record_id));
+        for(j = 0; j < DARSHAN_CORE_MAX_RECORDS && shared_recs[j] != 0; j++)
+        {
+            HASH_FIND(hlink, final_core->rec_hash, &shared_recs[j],
+                sizeof(darshan_record_id), ref);
+            assert(ref);
+            if(DARSHAN_CORE_MOD_ISSET(ref->global_mod_flags, i))
             {
-                this_mod->mod_funcs.setup_reduction(mod_shared_recs, &shared_rec_count,
-                    &red_send_buf, &red_recv_buf, &rec_sz);
-
-                if(shared_rec_count)
-                {
-                    /* construct a datatype for a file record.  This is serving no purpose
-                     * except to make sure we can do a reduction on proper boundaries
-                     */
-                    DARSHAN_MPI_CALL(PMPI_Type_contiguous)(rec_sz, MPI_BYTE, &red_type);
-                    DARSHAN_MPI_CALL(PMPI_Type_commit)(&red_type);
-
-                    /* register a reduction operator for this module */
-                    DARSHAN_MPI_CALL(PMPI_Op_create)(this_mod->mod_funcs.record_reduction_op,
-                        1, &red_op);
-
-                    /* reduce shared file records for this module */
-                    DARSHAN_MPI_CALL(PMPI_Reduce)(red_send_buf, red_recv_buf,
-                        shared_rec_count, red_type, red_op, 0, MPI_COMM_WORLD);
-
-                    DARSHAN_MPI_CALL(PMPI_Type_free)(&red_type);
-                    DARSHAN_MPI_CALL(PMPI_Op_free)(&red_op);
-                }
+                mod_shared_recs[mod_shared_rec_cnt++] = shared_recs[j];
             }
         }
 
-        /* if module is registered locally, get the corresponding output buffer */
+        /* if module is registered locally, get the corresponding output buffer
+         * 
+         * NOTE: this function can be used to run collective operations across
+         * modules, if there are file records shared globally.
+         */
         if(this_mod)
         {
-            /* get output buffer from module */
-            this_mod->mod_funcs.get_output_data(&mod_buf, &mod_buf_sz);
+            this_mod->mod_funcs.get_output_data(MPI_COMM_WORLD, mod_shared_recs,
+                mod_shared_rec_cnt, &mod_buf, &mod_buf_sz);
         }
 
         final_core->log_header.mod_map[i].off = tmp_off;
diff --git a/darshan-runtime/lib/darshan-mpiio.c b/darshan-runtime/lib/darshan-mpiio.c
index e0f4219..29e529b 100644
--- a/darshan-runtime/lib/darshan-mpiio.c
+++ b/darshan-runtime/lib/darshan-mpiio.c
@@ -107,8 +107,6 @@ struct mpiio_runtime
     int file_array_ndx;
     struct mpiio_file_runtime* file_hash;
     struct mpiio_file_runtime_ref* fh_hash;
-    void *red_buf;
-    int shared_rec_count;
 };
 
 static struct mpiio_runtime *mpiio_runtime = NULL;
@@ -122,13 +120,15 @@ static struct mpiio_file_runtime* mpiio_file_by_name_setfh(const char* name, MPI
 static struct mpiio_file_runtime* mpiio_file_by_fh(MPI_File fh);
 static void mpiio_file_close_fh(MPI_File fh);
 static int mpiio_record_compare(const void* a, const void* b);
-
-static void mpiio_begin_shutdown(void);
-static void mpiio_setup_reduction(darshan_record_id *shared_recs, int *shared_rec_count,
-    void **send_buf, void **recv_buf, int *rec_size);
 static void mpiio_record_reduction_op(void* infile_v, void* inoutfile_v,
     int *len, MPI_Datatype *datatype);
-static void mpiio_get_output_data(void **buffer, int *size);
+static void mpiio_shared_record_variance(MPI_Comm mod_comm,
+    struct darshan_mpiio_file *inrec_array, struct darshan_mpiio_file *outrec_array,
+    int shared_rec_count);
+
+static void mpiio_begin_shutdown(void);
+static void mpiio_get_output_data(MPI_Comm mod_comm, darshan_record_id *shared_recs,
+    int shared_rec_count, void **mpiio_buf, int *mpiio_buf_sz);
 static void mpiio_shutdown(void);
 
 #define MPIIO_LOCK() pthread_mutex_lock(&mpiio_runtime_mutex)
@@ -829,8 +829,6 @@ static void mpiio_runtime_initialize()
     struct darshan_module_funcs mpiio_mod_fns =
     {
         .begin_shutdown = &mpiio_begin_shutdown,
-        .setup_reduction = &mpiio_setup_reduction,
-        .record_reduction_op = &mpiio_record_reduction_op,
         .get_output_data = &mpiio_get_output_data,
         .shutdown = &mpiio_shutdown
     };
@@ -1022,84 +1020,6 @@ static int mpiio_record_compare(const void* a_p, const void* b_p)
     return 0;
 }
 
-/**************************************************************************
- * Functions exported by MPI-IO module for coordinating with darshan-core *
- **************************************************************************/
-
-static void mpiio_begin_shutdown()
-{
-    int i;
-    struct mpiio_file_runtime* tmp;
-
-    assert(mpiio_runtime);
-
-    MPIIO_LOCK();
-    instrumentation_disabled = 1;
-
-    /* go through and set the 4 most common access sizes for MPI-IO */
-    for(i = 0; i < mpiio_runtime->file_array_ndx; i++)
-    {
-        tmp = &(mpiio_runtime->file_runtime_array[i]);
-
-        darshan_walk_common_vals(tmp->access_root,
-            &(tmp->file_record->counters[MPIIO_ACCESS1_ACCESS]),
-            &(tmp->file_record->counters[MPIIO_ACCESS1_COUNT]));
-    }
-    MPIIO_UNLOCK();
-
-    return;
-}
-
-static void mpiio_setup_reduction(
-    darshan_record_id *shared_recs,
-    int *shared_rec_count,
-    void **send_buf,
-    void **recv_buf,
-    int *rec_size)
-{
-    struct mpiio_file_runtime *file;
-    int i;
-
-    assert(mpiio_runtime);
-
-    /* necessary initialization of shared records (e.g., change rank to -1) */
-    for(i = 0; i < *shared_rec_count; i++)
-    {
-        HASH_FIND(hlink, mpiio_runtime->file_hash, &shared_recs[i],
-            sizeof(darshan_record_id), file);
-        assert(file);
-
-        file->file_record->rank = -1;
-    }
-
-    /* sort the array of files descending by rank so that we get all of the 
-     * shared files (marked by rank -1) in a contiguous portion at end 
-     * of the array
-     */
-    qsort(mpiio_runtime->file_record_array, mpiio_runtime->file_array_ndx,
-        sizeof(struct darshan_mpiio_file), mpiio_record_compare);
-
-    /* make *send_buf point to the shared files at the end of sorted array */
-    *send_buf =
-        &(mpiio_runtime->file_record_array[mpiio_runtime->file_array_ndx-(*shared_rec_count)]);
-
-    /* allocate memory for the reduction output on rank 0 */
-    if(my_rank == 0)
-    {
-        *recv_buf = malloc(*shared_rec_count * sizeof(struct darshan_mpiio_file));
-        if(!(*recv_buf))
-            return;
-
-        /* TODO: cleaner way to do this? */
-        mpiio_runtime->red_buf = *recv_buf;
-    }
-
-    *rec_size = sizeof(struct darshan_mpiio_file);
-    mpiio_runtime->shared_rec_count = *shared_rec_count;
-
-    return;
-}
-
 static void mpiio_record_reduction_op(
     void* infile_v,
     void* inoutfile_v,
@@ -1228,6 +1148,48 @@ static void mpiio_record_reduction_op(
                 inoutfile->counters[MPIIO_MAX_WRITE_TIME_SIZE];
         }
 
+        /* min (zeroes are ok here; some procs don't do I/O) */
+        if(infile->fcounters[MPIIO_F_FASTEST_RANK_TIME] <
+            inoutfile->fcounters[MPIIO_F_FASTEST_RANK_TIME])
+        {
+            tmp_file.counters[MPIIO_FASTEST_RANK] =
+                infile->counters[MPIIO_FASTEST_RANK];
+            tmp_file.counters[MPIIO_FASTEST_RANK_BYTES] =
+                infile->counters[MPIIO_FASTEST_RANK_BYTES];
+            tmp_file.fcounters[MPIIO_F_FASTEST_RANK_TIME] =
+                infile->fcounters[MPIIO_F_FASTEST_RANK_TIME];
+        }
+        else
+        {
+            tmp_file.counters[MPIIO_FASTEST_RANK] =
+                inoutfile->counters[MPIIO_FASTEST_RANK];
+            tmp_file.counters[MPIIO_FASTEST_RANK_BYTES] =
+                inoutfile->counters[MPIIO_FASTEST_RANK_BYTES];
+            tmp_file.fcounters[MPIIO_F_FASTEST_RANK_TIME] =
+                inoutfile->fcounters[MPIIO_F_FASTEST_RANK_TIME];
+        }
+
+        /* max */
+        if(infile->fcounters[MPIIO_F_SLOWEST_RANK_TIME] >
+           inoutfile->fcounters[MPIIO_F_SLOWEST_RANK_TIME])
+        {
+            tmp_file.counters[MPIIO_SLOWEST_RANK] =
+                infile->counters[MPIIO_SLOWEST_RANK];
+            tmp_file.counters[MPIIO_SLOWEST_RANK_BYTES] =
+                infile->counters[MPIIO_SLOWEST_RANK_BYTES];
+            tmp_file.fcounters[MPIIO_F_SLOWEST_RANK_TIME] =
+                infile->fcounters[MPIIO_F_SLOWEST_RANK_TIME];
+        }
+        else
+        {
+            tmp_file.counters[MPIIO_SLOWEST_RANK] =
+                inoutfile->counters[MPIIO_SLOWEST_RANK];
+            tmp_file.counters[MPIIO_SLOWEST_RANK_BYTES] =
+                inoutfile->counters[MPIIO_SLOWEST_RANK_BYTES];
+            tmp_file.fcounters[MPIIO_F_SLOWEST_RANK_TIME] =
+                inoutfile->fcounters[MPIIO_F_SLOWEST_RANK_TIME];
+        }
+
         /* update pointers */
         *inoutfile = tmp_file;
         inoutfile++;
@@ -1237,27 +1199,227 @@ static void mpiio_record_reduction_op(
     return;
 }
 
+static void mpiio_shared_record_variance(MPI_Comm mod_comm,
+    struct darshan_mpiio_file *inrec_array, struct darshan_mpiio_file *outrec_array,
+    int shared_rec_count)
+{
+    MPI_Datatype var_dt;
+    MPI_Op var_op;
+    int i;
+    struct darshan_variance_dt *var_send_buf = NULL;
+    struct darshan_variance_dt *var_recv_buf = NULL;
+
+    DARSHAN_MPI_CALL(PMPI_Type_contiguous)(sizeof(struct darshan_variance_dt),
+        MPI_BYTE, &var_dt);
+    DARSHAN_MPI_CALL(PMPI_Type_commit)(&var_dt);
+
+    DARSHAN_MPI_CALL(PMPI_Op_create)(darshan_variance_reduce, 1, &var_op);
+
+    var_send_buf = malloc(shared_rec_count * sizeof(struct darshan_variance_dt));
+    if(!var_send_buf)
+        return;
+
+    if(my_rank == 0)
+    {
+        var_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_variance_dt));
+
+        if(!var_recv_buf)
+            return;
+    }
+
+    /* get total i/o time variances for shared records */
+
+    for(i=0; i<shared_rec_count; i++)
+    {
+        var_send_buf[i].n = 1;
+        var_send_buf[i].S = 0;
+        var_send_buf[i].T = inrec_array[i].fcounters[MPIIO_F_READ_TIME] +
+                            inrec_array[i].fcounters[MPIIO_F_WRITE_TIME] +
+                            inrec_array[i].fcounters[MPIIO_F_META_TIME];
+    }
+
+    DARSHAN_MPI_CALL(PMPI_Reduce)(var_send_buf, var_recv_buf, shared_rec_count,
+        var_dt, var_op, 0, mod_comm);
+
+    if(my_rank == 0)
+    {
+        for(i=0; i<shared_rec_count; i++)
+        {
+            outrec_array[i].fcounters[MPIIO_F_VARIANCE_RANK_TIME] =
+                (var_recv_buf[i].S / var_recv_buf[i].n);
+        }
+    }
+
+    /* get total bytes moved variances for shared records */
+
+    for(i=0; i<shared_rec_count; i++)
+    {
+        var_send_buf[i].n = 1;
+        var_send_buf[i].S = 0;
+        var_send_buf[i].T = (double)
+                            inrec_array[i].counters[MPIIO_BYTES_READ] +
+                            inrec_array[i].counters[MPIIO_BYTES_WRITTEN];
+    }
+
+    DARSHAN_MPI_CALL(PMPI_Reduce)(var_send_buf, var_recv_buf, shared_rec_count,
+        var_dt, var_op, 0, mod_comm);
+
+    if(my_rank == 0)
+    {
+        for(i=0; i<shared_rec_count; i++)
+        {
+            outrec_array[i].fcounters[MPIIO_F_VARIANCE_RANK_BYTES] =
+                (var_recv_buf[i].S / var_recv_buf[i].n);
+        }
+    }
+
+    DARSHAN_MPI_CALL(PMPI_Type_free)(&var_dt);
+    DARSHAN_MPI_CALL(PMPI_Op_free)(&var_op);
+    free(var_send_buf);
+    free(var_recv_buf);
+
+    return;
+}
+
+/**************************************************************************
+ * Functions exported by MPI-IO module for coordinating with darshan-core *
+ **************************************************************************/
+
+static void mpiio_begin_shutdown()
+{
+    assert(mpiio_runtime);
+
+    MPIIO_LOCK();
+    /* disable further instrumentation while Darshan shuts down */
+    instrumentation_disabled = 1;
+    MPIIO_UNLOCK();
+
+    return;
+}
+
 static void mpiio_get_output_data(
-    void **buffer,
-    int *size)
+    MPI_Comm mod_comm,
+    darshan_record_id *shared_recs,
+    int shared_rec_count,
+    void **mpiio_buf,
+    int *mpiio_buf_sz)
 {
+    struct mpiio_file_runtime *file;
+    struct mpiio_file_runtime* tmp;
+    int i;
+    double mpiio_time;
+    void *red_send_buf = NULL;
+    void *red_recv_buf = NULL;
+    MPI_Datatype red_type;
+    MPI_Op red_op;
+
     assert(mpiio_runtime);
 
-    /* clean up reduction state */
-    if(my_rank == 0)
+    /* go through and set the 4 most common access sizes for MPI-IO */
+    for(i = 0; i < mpiio_runtime->file_array_ndx; i++)
     {
-        int tmp_ndx = mpiio_runtime->file_array_ndx - mpiio_runtime->shared_rec_count;
-        memcpy(&(mpiio_runtime->file_record_array[tmp_ndx]), mpiio_runtime->red_buf,
-            mpiio_runtime->shared_rec_count * sizeof(struct darshan_mpiio_file));
-        free(mpiio_runtime->red_buf);
+        tmp = &(mpiio_runtime->file_runtime_array[i]);
+
+        /* common access sizes */
+        darshan_walk_common_vals(tmp->access_root,
+            &(tmp->file_record->counters[MPIIO_ACCESS1_ACCESS]),
+            &(tmp->file_record->counters[MPIIO_ACCESS1_COUNT]));
     }
-    else
+
+    /* if there are globally shared files, do a shared file reduction */   
+    if(shared_rec_count)
     {
-        mpiio_runtime->file_array_ndx -= mpiio_runtime->shared_rec_count;
+        /* necessary initialization of shared records */
+        for(i = 0; i < shared_rec_count; i++)
+        {
+            HASH_FIND(hlink, mpiio_runtime->file_hash, &shared_recs[i],
+                sizeof(darshan_record_id), file);
+            assert(file);
+
+            mpiio_time =
+                file->file_record->fcounters[MPIIO_F_READ_TIME] +
+                file->file_record->fcounters[MPIIO_F_WRITE_TIME] +
+                file->file_record->fcounters[MPIIO_F_META_TIME];
+
+            /* initialize fastest/slowest info prior to the reduction */
+            file->file_record->counters[MPIIO_FASTEST_RANK] =
+                file->file_record->rank;
+            file->file_record->counters[MPIIO_FASTEST_RANK_BYTES] =
+                file->file_record->counters[MPIIO_BYTES_READ] +
+                file->file_record->counters[MPIIO_BYTES_WRITTEN];
+            file->file_record->fcounters[MPIIO_F_FASTEST_RANK_TIME] =
+                mpiio_time;
+
+            /* until reduction occurs, we assume that this rank is both
+             * the fastest and slowest. It is up to the reduction operator
+             * to find the true min and max.
+             */
+            file->file_record->counters[MPIIO_SLOWEST_RANK] =
+                file->file_record->counters[MPIIO_FASTEST_RANK];
+            file->file_record->counters[MPIIO_SLOWEST_RANK_BYTES] =
+                file->file_record->counters[MPIIO_FASTEST_RANK_BYTES];
+            file->file_record->fcounters[MPIIO_F_SLOWEST_RANK_TIME] =
+                file->file_record->fcounters[MPIIO_F_FASTEST_RANK_TIME];
+
+            file->file_record->rank = -1;
+        }
+
+        /* sort the array of files descending by rank so that we get all of the 
+         * shared files (marked by rank -1) in a contiguous portion at end 
+         * of the array
+         */
+        qsort(mpiio_runtime->file_record_array, mpiio_runtime->file_array_ndx,
+            sizeof(struct darshan_mpiio_file), mpiio_record_compare);
+
+        /* make *send_buf point to the shared files at the end of sorted array */
+        red_send_buf =
+            &(mpiio_runtime->file_record_array[mpiio_runtime->file_array_ndx-(shared_rec_count)]);
+
+        /* allocate memory for the reduction output on rank 0 */
+        if(my_rank == 0)
+        {
+            red_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_mpiio_file));
+            if(!red_recv_buf)
+                return;
+        }
+
+        /* construct a datatype for a MPIIO file record.  This is serving no purpose
+         * except to make sure we can do a reduction on proper boundaries
+         */
+        DARSHAN_MPI_CALL(PMPI_Type_contiguous)(sizeof(struct darshan_mpiio_file),
+            MPI_BYTE, &red_type);
+        DARSHAN_MPI_CALL(PMPI_Type_commit)(&red_type);
+
+        /* register a MPIIO file record reduction operator */
+        DARSHAN_MPI_CALL(PMPI_Op_create)(mpiio_record_reduction_op, 1, &red_op);
+
+        /* reduce shared MPIIO file records */
+        DARSHAN_MPI_CALL(PMPI_Reduce)(red_send_buf, red_recv_buf,
+            shared_rec_count, red_type, red_op, 0, mod_comm);
+
+        /* get the time and byte variances for shared files */
+        mpiio_shared_record_variance(mod_comm, red_send_buf, red_recv_buf,
+            shared_rec_count);
+
+        /* clean up reduction state */
+        if(my_rank == 0)
+        {
+            int tmp_ndx = mpiio_runtime->file_array_ndx - shared_rec_count;
+            memcpy(&(mpiio_runtime->file_record_array[tmp_ndx]), red_recv_buf,
+                shared_rec_count * sizeof(struct darshan_mpiio_file));
+            free(red_recv_buf);
+        }
+        else
+        {
+            mpiio_runtime->file_array_ndx -= shared_rec_count;
+        }
+
+        DARSHAN_MPI_CALL(PMPI_Type_free)(&red_type);
+        DARSHAN_MPI_CALL(PMPI_Op_free)(&red_op);
     }
 
-    *buffer = (void *)(mpiio_runtime->file_record_array);
-    *size = mpiio_runtime->file_array_ndx * sizeof(struct darshan_mpiio_file);
+    *mpiio_buf = (void *)(mpiio_runtime->file_record_array);
+    *mpiio_buf_sz = mpiio_runtime->file_array_ndx * sizeof(struct darshan_mpiio_file);
 
     return;
 }
diff --git a/darshan-runtime/lib/darshan-null.c b/darshan-runtime/lib/darshan-null.c
index cfa8e6b..ed9eb7d 100644
--- a/darshan-runtime/lib/darshan-null.c
+++ b/darshan-runtime/lib/darshan-null.c
@@ -119,7 +119,8 @@ static struct null_record_runtime* null_record_by_name(const char *name);
 
 /* forward declaration for module functions needed to interface with darshan-core */
 static void null_begin_shutdown(void);
-static void null_get_output_data(void **buffer, int *size);
+static void null_get_output_data(MPI_Comm mod_comm, darshan_record_id *shared_recs,
+    int shared_rec_count, void **null_buf, int *null_buf_sz);
 static void null_shutdown(void);
 
 /* macros for obtaining/releasing the "NULL" module lock */
@@ -203,8 +204,6 @@ static void null_runtime_initialize()
     struct darshan_module_funcs null_mod_fns =
     {
         .begin_shutdown = &null_begin_shutdown,
-        .setup_reduction = NULL,        /* "NULL" module does not do reductions */
-        .record_reduction_op = NULL,    /* "NULL" module does not do reductions */
         .get_output_data = &null_get_output_data,
         .shutdown = &null_shutdown
     };
@@ -334,17 +333,30 @@ static void null_begin_shutdown()
 
 /* Pass output data for the "NULL" module back to darshan-core to log to file. */
 static void null_get_output_data(
-    void **buffer,
-    int *size)
+    MPI_Comm mod_comm,
+    darshan_record_id *shared_recs,
+    int shared_rec_count,
+    void **null_buf,
+    int *null_buf_sz)
 {
     assert(null_runtime);
 
+    /* NOTE: this function can be used to run collective operations prior to
+     * shutting down the module, as implied by the MPI communicator passed in
+     * as the first argument. Typically, module developers will want to run a
+     * reduction on shared data records (passed in via the 'shared_recs' array),
+     * but other collective routines can be run here as well. For a detailed
+     * example illustrating how to run shared file reductions, consider the
+     * POSIX or MPIIO instrumentation modules, as they both implement this
+     * functionality.
+     */
+
     /* Just set the output buffer to point at the array of the "NULL" module's
      * I/O records, and set the output size according to the number of records
      * currently being tracked.
      */
-    *buffer = (void *)(null_runtime->record_array);
-    *size = null_runtime->rec_array_ndx * sizeof(struct darshan_null_record);
+    *null_buf = (void *)(null_runtime->record_array);
+    *null_buf_sz = null_runtime->rec_array_ndx * sizeof(struct darshan_null_record);
 
     return;
 }
diff --git a/darshan-runtime/lib/darshan-posix.c b/darshan-runtime/lib/darshan-posix.c
index ad1775e..b0bf350 100644
--- a/darshan-runtime/lib/darshan-posix.c
+++ b/darshan-runtime/lib/darshan-posix.c
@@ -176,8 +176,6 @@ struct posix_runtime
     int file_array_ndx;
     struct posix_file_runtime* file_hash;
     struct posix_file_runtime_ref* fd_hash;
-    void *red_buf;
-    int shared_rec_count;
 };
 
 static struct posix_runtime *posix_runtime = NULL;
@@ -191,16 +189,18 @@ static struct posix_file_runtime* posix_file_by_name(const char *name);
 static struct posix_file_runtime* posix_file_by_name_setfd(const char* name, int fd);
 static struct posix_file_runtime* posix_file_by_fd(int fd);
 static void posix_file_close_fd(int fd);
-static int posix_record_compare(const void* a, const void* b);
 static void posix_aio_tracker_add(int fd, void *aiocbp);
 static struct posix_aio_tracker* posix_aio_tracker_del(int fd, void *aiocbp);
-
-static void posix_begin_shutdown(void);
-static void posix_setup_reduction(darshan_record_id *shared_recs, int *shared_rec_count,
-    void **send_buf, void **recv_buf, int *rec_size);
+static int posix_record_compare(const void* a, const void* b);
 static void posix_record_reduction_op(void* infile_v, void* inoutfile_v,
     int *len, MPI_Datatype *datatype);
-static void posix_get_output_data(void **buffer, int *size);
+static void posix_shared_record_variance(MPI_Comm mod_comm,
+    struct darshan_posix_file *inrec_array, struct darshan_posix_file *outrec_array,
+    int shared_rec_count);
+
+static void posix_begin_shutdown(void);
+static void posix_get_output_data(MPI_Comm mod_comm, darshan_record_id *shared_recs,
+    int shared_rec_count, void **posix_buf, int *posix_buf_sz);
 static void posix_shutdown(void);
 
 #define POSIX_LOCK() pthread_mutex_lock(&posix_runtime_mutex)
@@ -1454,8 +1454,6 @@ static void posix_runtime_initialize()
     struct darshan_module_funcs posix_mod_fns =
     {
         .begin_shutdown = &posix_begin_shutdown,
-        .setup_reduction = &posix_setup_reduction,
-        .record_reduction_op = &posix_record_reduction_op,
         .get_output_data = &posix_get_output_data,
         .shutdown = &posix_shutdown
     };
@@ -1651,27 +1649,6 @@ static int posix_record_compare(const void* a_p, const void* b_p)
     return 0;
 }
 
-/* adds a tracker for the given aio operation */
-static void posix_aio_tracker_add(int fd, void *aiocbp)
-{
-    struct posix_aio_tracker* tracker;
-    struct posix_file_runtime* file;
-
-    file = posix_file_by_fd(fd);
-    if (file)
-    {
-        tracker = malloc(sizeof(*tracker));
-        if (tracker)
-        {
-            tracker->tm1 = darshan_core_wtime();
-            tracker->aiocbp = aiocbp;
-            LL_PREPEND(file->aio_list, tracker);
-        }
-    }
-
-    return;
-}
-
 /* finds the tracker structure for a given aio operation, removes it from
  * the linked list for the darshan_file structure, and returns a pointer.  
  *
@@ -1699,123 +1676,29 @@ static struct posix_aio_tracker* posix_aio_tracker_del(int fd, void *aiocbp)
     return(tracker);
 }
 
-/************************************************************************
- * Functions exported by this module for coordinating with darshan-core *
- ************************************************************************/
-
-static void posix_begin_shutdown()
-{
-    int i;
-    struct posix_file_runtime* tmp;
-
-    assert(posix_runtime);
-
-    POSIX_LOCK();
-    instrumentation_disabled = 1;
-
-    /* go through file access data for each record and set the 4 most common
-     * stride/access size counters.
-     */
-    for(i = 0; i < posix_runtime->file_array_ndx; i++)
-    {
-        tmp = &(posix_runtime->file_runtime_array[i]);
-
-        /* common accesses */
-        darshan_walk_common_vals(tmp->access_root,
-            &(tmp->file_record->counters[POSIX_ACCESS1_ACCESS]),
-            &(tmp->file_record->counters[POSIX_ACCESS1_COUNT]));
-        /* common strides */
-        darshan_walk_common_vals(tmp->stride_root,
-            &(tmp->file_record->counters[POSIX_STRIDE1_STRIDE]),
-            &(tmp->file_record->counters[POSIX_STRIDE1_COUNT]));
-    }
-
-    /* disable further instrumentation while Darshan shuts down */
-    POSIX_UNLOCK();
-
-    return;
-}
-
-static void posix_setup_reduction(
-    darshan_record_id *shared_recs,
-    int *shared_rec_count,
-    void **send_buf,
-    void **recv_buf,
-    int *rec_size)
+/* adds a tracker for the given aio operation */
+static void posix_aio_tracker_add(int fd, void *aiocbp)
 {
-    struct posix_file_runtime *file;
-    int i;
-    double posix_time;
-
-    assert(posix_runtime);
-
-    /* necessary initialization of shared records (e.g., change rank to -1) */
-    for(i = 0; i < *shared_rec_count; i++)
-    {
-        HASH_FIND(hlink, posix_runtime->file_hash, &shared_recs[i],
-            sizeof(darshan_record_id), file);
-        assert(file);
-
-        posix_time =
-            file->file_record->fcounters[POSIX_F_READ_TIME] +
-            file->file_record->fcounters[POSIX_F_WRITE_TIME] +
-            file->file_record->fcounters[POSIX_F_META_TIME];
-
-        /* initialize fastest/slowest info prior to the reduction */
-        file->file_record->counters[POSIX_FASTEST_RANK] =
-            file->file_record->rank;
-        file->file_record->counters[POSIX_FASTEST_RANK_BYTES] = 
-            file->file_record->counters[POSIX_BYTES_READ] +
-            file->file_record->counters[POSIX_BYTES_WRITTEN];
-        file->file_record->fcounters[POSIX_F_FASTEST_RANK_TIME] = 
-            posix_time;
-
-        /* until reduction occurs, we assume that this rank is both
-         * the fastest and slowest. It is up to the reduction operator
-         * to find the true min and max.
-         */
-        file->file_record->counters[POSIX_SLOWEST_RANK] =
-            file->file_record->counters[POSIX_FASTEST_RANK];
-        file->file_record->counters[POSIX_SLOWEST_RANK_BYTES] =
-            file->file_record->counters[POSIX_FASTEST_RANK_BYTES];
-        file->file_record->fcounters[POSIX_F_SLOWEST_RANK_TIME] =
-            file->file_record->fcounters[POSIX_F_FASTEST_RANK_TIME];
-
-        file->file_record->rank = -1;
-    }
-
-    /* sort the array of files descending by rank so that we get all of the 
-     * shared files (marked by rank -1) in a contiguous portion at end 
-     * of the array
-     */
-    qsort(posix_runtime->file_record_array, posix_runtime->file_array_ndx,
-        sizeof(struct darshan_posix_file), posix_record_compare);
-
-    /* make *send_buf point to the shared files at the end of sorted array */
-    *send_buf =
-        &(posix_runtime->file_record_array[posix_runtime->file_array_ndx-(*shared_rec_count)]);
+    struct posix_aio_tracker* tracker;
+    struct posix_file_runtime* file;
 
-    /* allocate memory for the reduction output on rank 0 */
-    if(my_rank == 0)
+    file = posix_file_by_fd(fd);
+    if (file)
     {
-        *recv_buf = malloc(*shared_rec_count * sizeof(struct darshan_posix_file));
-        if(!(*recv_buf))
-            return;
-
-        posix_runtime->red_buf = *recv_buf;
+        tracker = malloc(sizeof(*tracker));
+        if (tracker)
+        {
+            tracker->tm1 = darshan_core_wtime();
+            tracker->aiocbp = aiocbp;
+            LL_PREPEND(file->aio_list, tracker);
+        }
     }
 
-    *rec_size = sizeof(struct darshan_posix_file);
-    posix_runtime->shared_rec_count = *shared_rec_count;
-
     return;
 }
 
-static void posix_record_reduction_op(
-    void* infile_v,
-    void* inoutfile_v,
-    int *len,
-    MPI_Datatype *datatype)
+static void posix_record_reduction_op(void* infile_v, void* inoutfile_v,
+    int *len, MPI_Datatype *datatype)
 {
     struct darshan_posix_file tmp_file;
     struct darshan_posix_file *infile = infile_v;
@@ -2044,27 +1927,233 @@ static void posix_record_reduction_op(
     return;
 }
 
+static void posix_shared_record_variance(MPI_Comm mod_comm,
+    struct darshan_posix_file *inrec_array, struct darshan_posix_file *outrec_array,
+    int shared_rec_count)
+{
+    MPI_Datatype var_dt;
+    MPI_Op var_op;
+    int i;
+    struct darshan_variance_dt *var_send_buf = NULL;
+    struct darshan_variance_dt *var_recv_buf = NULL;
+
+    DARSHAN_MPI_CALL(PMPI_Type_contiguous)(sizeof(struct darshan_variance_dt),
+        MPI_BYTE, &var_dt);
+    DARSHAN_MPI_CALL(PMPI_Type_commit)(&var_dt);
+
+    DARSHAN_MPI_CALL(PMPI_Op_create)(darshan_variance_reduce, 1, &var_op);
+
+    var_send_buf = malloc(shared_rec_count * sizeof(struct darshan_variance_dt));
+    if(!var_send_buf)
+        return;
+
+    if(my_rank == 0)
+    {
+        var_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_variance_dt));
+
+        if(!var_recv_buf)
+            return;
+    }
+
+    /* get total i/o time variances for shared records */
+
+    for(i=0; i<shared_rec_count; i++)
+    {
+        var_send_buf[i].n = 1;
+        var_send_buf[i].S = 0;
+        var_send_buf[i].T = inrec_array[i].fcounters[POSIX_F_READ_TIME] +
+                            inrec_array[i].fcounters[POSIX_F_WRITE_TIME] +
+                            inrec_array[i].fcounters[POSIX_F_META_TIME];
+    }
+
+    DARSHAN_MPI_CALL(PMPI_Reduce)(var_send_buf, var_recv_buf, shared_rec_count,
+        var_dt, var_op, 0, mod_comm);
+
+    if(my_rank == 0)
+    {
+        for(i=0; i<shared_rec_count; i++)
+        {
+            outrec_array[i].fcounters[POSIX_F_VARIANCE_RANK_TIME] =
+                (var_recv_buf[i].S / var_recv_buf[i].n);
+        }
+    }
+
+    /* get total bytes moved variances for shared records */
+
+    for(i=0; i<shared_rec_count; i++)
+    {
+        var_send_buf[i].n = 1;
+        var_send_buf[i].S = 0;
+        var_send_buf[i].T = (double)
+                            inrec_array[i].counters[POSIX_BYTES_READ] +
+                            inrec_array[i].counters[POSIX_BYTES_WRITTEN];
+    }
+
+    DARSHAN_MPI_CALL(PMPI_Reduce)(var_send_buf, var_recv_buf, shared_rec_count,
+        var_dt, var_op, 0, mod_comm);
+
+    if(my_rank == 0)
+    {
+        for(i=0; i<shared_rec_count; i++)
+        {
+            outrec_array[i].fcounters[POSIX_F_VARIANCE_RANK_BYTES] =
+                (var_recv_buf[i].S / var_recv_buf[i].n);
+        }
+    }
+
+    DARSHAN_MPI_CALL(PMPI_Type_free)(&var_dt);
+    DARSHAN_MPI_CALL(PMPI_Op_free)(&var_op);
+    free(var_send_buf);
+    free(var_recv_buf);
+
+    return;
+}
+
+/************************************************************************
+ * Functions exported by this module for coordinating with darshan-core *
+ ************************************************************************/
+
+static void posix_begin_shutdown()
+{
+    assert(posix_runtime);
+
+    POSIX_LOCK();
+    /* disable further instrumentation while Darshan shuts down */
+    instrumentation_disabled = 1;
+    POSIX_UNLOCK();
+
+    return;
+}
+
 static void posix_get_output_data(
-    void **buffer,
-    int *size)
+    MPI_Comm mod_comm,
+    darshan_record_id *shared_recs,
+    int shared_rec_count,
+    void **posix_buf,
+    int *posix_buf_sz)
 {
+    struct posix_file_runtime *file;
+    struct posix_file_runtime *tmp;
+    int i;
+    double posix_time;
+    struct darshan_posix_file *red_send_buf = NULL;
+    struct darshan_posix_file *red_recv_buf = NULL;
+    MPI_Datatype red_type;
+    MPI_Op red_op;
+
     assert(posix_runtime);
 
-    /* clean up reduction state */
-    if(my_rank == 0)
+    /* go through file access data for each record and set the 4 most common
+     * stride/access size counters.
+     */
+    for(i = 0; i < posix_runtime->file_array_ndx; i++)
     {
-        int tmp_ndx = posix_runtime->file_array_ndx - posix_runtime->shared_rec_count;
-        memcpy(&(posix_runtime->file_record_array[tmp_ndx]), posix_runtime->red_buf,
-            posix_runtime->shared_rec_count * sizeof(struct darshan_posix_file));
-        free(posix_runtime->red_buf);
+        tmp = &(posix_runtime->file_runtime_array[i]);
+
+        /* common accesses */
+        darshan_walk_common_vals(tmp->access_root,
+            &(tmp->file_record->counters[POSIX_ACCESS1_ACCESS]),
+            &(tmp->file_record->counters[POSIX_ACCESS1_COUNT]));
+        /* common strides */
+        darshan_walk_common_vals(tmp->stride_root,
+            &(tmp->file_record->counters[POSIX_STRIDE1_STRIDE]),
+            &(tmp->file_record->counters[POSIX_STRIDE1_COUNT]));
     }
-    else
+
+    /* if there are globally shared files, do a shared file reduction */
+    if(shared_rec_count)
     {
-        posix_runtime->file_array_ndx -= posix_runtime->shared_rec_count;
+        /* necessary initialization of shared records */
+        for(i = 0; i < shared_rec_count; i++)
+        {
+            HASH_FIND(hlink, posix_runtime->file_hash, &shared_recs[i],
+                sizeof(darshan_record_id), file);
+            assert(file);
+
+            posix_time =
+                file->file_record->fcounters[POSIX_F_READ_TIME] +
+                file->file_record->fcounters[POSIX_F_WRITE_TIME] +
+                file->file_record->fcounters[POSIX_F_META_TIME];
+
+            /* initialize fastest/slowest info prior to the reduction */
+            file->file_record->counters[POSIX_FASTEST_RANK] =
+                file->file_record->rank;
+            file->file_record->counters[POSIX_FASTEST_RANK_BYTES] =
+                file->file_record->counters[POSIX_BYTES_READ] +
+                file->file_record->counters[POSIX_BYTES_WRITTEN];
+            file->file_record->fcounters[POSIX_F_FASTEST_RANK_TIME] =
+                posix_time;
+
+            /* until reduction occurs, we assume that this rank is both
+             * the fastest and slowest. It is up to the reduction operator
+             * to find the true min and max.
+             */
+            file->file_record->counters[POSIX_SLOWEST_RANK] =
+                file->file_record->counters[POSIX_FASTEST_RANK];
+            file->file_record->counters[POSIX_SLOWEST_RANK_BYTES] =
+                file->file_record->counters[POSIX_FASTEST_RANK_BYTES];
+            file->file_record->fcounters[POSIX_F_SLOWEST_RANK_TIME] =
+                file->file_record->fcounters[POSIX_F_FASTEST_RANK_TIME];
+
+            file->file_record->rank = -1;
+        }
+
+        /* sort the array of files descending by rank so that we get all of the 
+         * shared files (marked by rank -1) in a contiguous portion at end 
+         * of the array
+         */
+        qsort(posix_runtime->file_record_array, posix_runtime->file_array_ndx,
+            sizeof(struct darshan_posix_file), posix_record_compare);
+
+        /* make *send_buf point to the shared files at the end of sorted array */
+        red_send_buf =
+            &(posix_runtime->file_record_array[posix_runtime->file_array_ndx-(shared_rec_count)]);
+        
+        /* allocate memory for the reduction output on rank 0 */
+        if(my_rank == 0)
+        {
+            red_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_posix_file));
+            if(!red_recv_buf)
+                return;
+        }
+
+        /* construct a datatype for a POSIX file record.  This is serving no purpose
+         * except to make sure we can do a reduction on proper boundaries
+         */
+        DARSHAN_MPI_CALL(PMPI_Type_contiguous)(sizeof(struct darshan_posix_file),
+            MPI_BYTE, &red_type);
+        DARSHAN_MPI_CALL(PMPI_Type_commit)(&red_type);
+
+        /* register a POSIX file record reduction operator */
+        DARSHAN_MPI_CALL(PMPI_Op_create)(posix_record_reduction_op, 1, &red_op);
+
+        /* reduce shared POSIX file records */
+        DARSHAN_MPI_CALL(PMPI_Reduce)(red_send_buf, red_recv_buf,
+            shared_rec_count, red_type, red_op, 0, mod_comm);
+
+        /* get the time and byte variances for shared files */
+        posix_shared_record_variance(mod_comm, red_send_buf, red_recv_buf,
+            shared_rec_count);
+
+        /* clean up reduction state */
+        if(my_rank == 0)
+        {
+            int tmp_ndx = posix_runtime->file_array_ndx - shared_rec_count;
+            memcpy(&(posix_runtime->file_record_array[tmp_ndx]), red_recv_buf,
+                shared_rec_count * sizeof(struct darshan_posix_file));
+            free(red_recv_buf);
+        }
+        else
+        {
+            posix_runtime->file_array_ndx -= shared_rec_count;
+        }
+
+        DARSHAN_MPI_CALL(PMPI_Type_free)(&red_type);
+        DARSHAN_MPI_CALL(PMPI_Op_free)(&red_op);
     }
 
-    *buffer = (void *)(posix_runtime->file_record_array);
-    *size = posix_runtime->file_array_ndx * sizeof(struct darshan_posix_file);
+    *posix_buf = (void *)(posix_runtime->file_record_array);
+    *posix_buf_sz = posix_runtime->file_array_ndx * sizeof(struct darshan_posix_file);
 
     return;
 }


hooks/post-receive
--



More information about the Darshan-commits mailing list