[mpich2-commits] r7785 - in mpich2/trunk/src: include mpi/coll mpi/comm

goodell at mcs.anl.gov goodell at mcs.anl.gov
Thu Jan 20 16:09:01 CST 2011


Author: goodell
Date: 2011-01-20 16:09:01 -0600 (Thu, 20 Jan 2011)
New Revision: 7785

Modified:
   mpich2/trunk/src/include/mpiimpl.h
   mpich2/trunk/src/mpi/coll/ibcast.c
   mpich2/trunk/src/mpi/comm/commutil.c
Log:
initial implementation of MPIX_Ibcast

Only supports a binomial broadcast at this time.  Override functions
will be honored though.

Reviewed by balaji.

Modified: mpich2/trunk/src/include/mpiimpl.h
===================================================================
--- mpich2/trunk/src/include/mpiimpl.h	2011-01-20 22:08:58 UTC (rev 7784)
+++ mpich2/trunk/src/include/mpiimpl.h	2011-01-20 22:09:01 UTC (rev 7785)
@@ -3520,7 +3520,10 @@
 int MPIR_Iexscan_impl(void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPI_Request *request);
 /* end impl functions for NBC */
 
+int MPIR_Ibcast_intra(void *buffer, int count, MPI_Datatype datatype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
+int MPIR_Ibcast_inter(void *buffer, int count, MPI_Datatype datatype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
 
+
 /* random initializers */
 int MPIR_Group_init(void);
 int MPIR_Comm_init(MPID_Comm *);

Modified: mpich2/trunk/src/mpi/coll/ibcast.c
===================================================================
--- mpich2/trunk/src/mpi/coll/ibcast.c	2011-01-20 22:08:58 UTC (rev 7784)
+++ mpich2/trunk/src/mpi/coll/ibcast.c	2011-01-20 22:09:01 UTC (rev 7785)
@@ -24,7 +24,241 @@
 
 /* any non-MPI functions go here, especially non-static ones */
 
+/* Adds operations to the given schedule that correspond to the specified
+ * binomial broadcast.  It does _not_ start the schedule.  This permits callers
+ * to build up a larger hierarchical broadcast from multiple invocations of this
+ * function. */
 #undef FUNCNAME
+#define FUNCNAME MPIR_Ibcast_binomial
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+/* Appends the operations of a binomial-tree broadcast of (buffer, count,
+ * datatype), rooted at rank `root` of comm_ptr, to the schedule `s`.  The
+ * schedule is only extended here; it is not started.
+ *
+ * When the datatype is not contiguous or the communicator is heterogeneous,
+ * the data travels in packed form through a temporary buffer of nbytes bytes:
+ * the root packs into it before sending, every other rank unpacks from it
+ * after forwarding.  Every rank that allocated the temporary buffer also
+ * schedules a callback that frees it after the operations that use it.
+ *
+ * Returns mpi_errno: MPI_SUCCESS unless one of the scheduling calls reported
+ * an error. */
+static int MPIR_Ibcast_binomial(void *buffer, int count, MPI_Datatype datatype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int mask;
+    int comm_size, rank;
+    int type_size, is_contig, is_homogeneous;
+    int nbytes;
+    int relative_rank;
+    int src, dst;
+    void *tmp_buf = NULL;
+    MPIU_CHKPMEM_DECL(1);
+
+    comm_size = comm_ptr->local_size;
+    rank = comm_ptr->rank;
+
+    if (comm_size == 1) {
+        /* nothing to add, this is a useless broadcast */
+        goto fn_exit;
+    }
+
+    MPID_Datatype_is_contig(datatype, &is_contig);
+
+    is_homogeneous = 1;
+#ifdef MPID_HAS_HETERO
+    if (comm_ptr->is_hetero)
+        is_homogeneous = 0;
+#endif
+
+    /* MPI_Type_size() might not give the accurate size of the packed
+     * datatype for heterogeneous systems (because of padding, encoding,
+     * etc). On the other hand, MPI_Pack_size() can become very
+     * expensive, depending on the implementation, especially for
+     * heterogeneous systems. We want to use MPI_Type_size() wherever
+     * possible, and MPI_Pack_size() in other places.
+     */
+    if (is_homogeneous)
+        MPID_Datatype_get_size_macro(datatype, type_size);
+    else
+        MPIR_Pack_size_impl(1, datatype, &type_size);
+
+    nbytes = type_size * count;
+
+    if (!is_contig || !is_homogeneous)
+    {
+        /* packed staging buffer; allocated on every rank, root included */
+        MPIU_CHKPMEM_MALLOC(tmp_buf, void *, nbytes, mpi_errno, "tmp_buf");
+
+        /* TODO: Pipeline the packing and communication */
+        if (rank == root) {
+            mpi_errno = MPID_Sched_copy(buffer, count, datatype, tmp_buf, nbytes, MPI_PACKED, s);
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            /* the pack must be complete before any send reads tmp_buf */
+            mpi_errno = MPID_Sched_barrier(s);
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        }
+    }
+
+    /* rank renumbered so that the root is 0 */
+    relative_rank = (rank >= root) ? rank - root : rank - root + comm_size;
+
+    /* Use short message algorithm, namely, binomial tree */
+
+    /* Algorithm:
+       This uses a fairly basic recursive subdivision algorithm.
+       The root sends to the process comm_size/2 away; the receiver becomes
+       a root for a subtree and applies the same process.
+
+       So that the new root can easily identify the size of its
+       subtree, the (subtree) roots are all powers of two (relative
+       to the root).  If m = the first power of 2 such that 2^m >= the
+       size of the communicator, then the subtree at root at 2^(m-k)
+       has size 2^k (with special handling for subtrees that aren't
+       a power of two in size).
+
+       Do subdivision.  There are two phases:
+       1. Wait for arrival of data.  Because of the power of two nature
+       of the subtree roots, the source of this message is always the
+       process whose relative rank has the least significant 1 bit CLEARED.
+       That is, process 4 (100) receives from process 0, process 7 (111)
+       from process 6 (110), etc.
+       2. Forward to my subtree
+
+       Note that the process that is the tree root is handled automatically
+       by this code, since it has no bits set.  */
+
+    mask = 0x1;
+    while (mask < comm_size) {
+        if (relative_rank & mask) {
+            src = rank - mask;
+            if (src < 0) src += comm_size;
+            if (!is_contig || !is_homogeneous)
+                mpi_errno = MPID_Sched_recv(tmp_buf, nbytes, MPI_BYTE, src, comm_ptr, s);
+            else
+                mpi_errno = MPID_Sched_recv(buffer, count, datatype, src, comm_ptr, s);
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+            /* the data must have arrived before it is forwarded below */
+            mpi_errno = MPID_Sched_barrier(s);
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+            break;
+        }
+        mask <<= 1;
+    }
+
+    /* This process is responsible for all processes that have bits
+       set from the LSB up to (but not including) mask.  Because of
+       the "not including", we start by shifting mask back down one.
+
+       We can easily change to a different algorithm at any power of two
+       by changing the test (mask > 0) to (mask > block_size)
+
+       One such version would use non-blocking operations for the last 2-4
+       steps (this also bounds the number of MPI_Requests that would
+       be needed).  */
+
+    mask >>= 1;
+    while (mask > 0) {
+        if (relative_rank + mask < comm_size) {
+            dst = rank + mask;
+            if (dst >= comm_size) dst -= comm_size;
+            if (!is_contig || !is_homogeneous)
+                mpi_errno = MPID_Sched_send(tmp_buf, nbytes, MPI_BYTE, dst, comm_ptr, s);
+            else
+                mpi_errno = MPID_Sched_send(buffer, count, datatype, dst, comm_ptr, s);
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+            /* NOTE: This is departure from MPIR_Bcast_binomial.  A true analog
+             * would put an MPID_Sched_barrier here after every send. */
+        }
+        mask >>= 1;
+    }
+
+    if (!is_contig || !is_homogeneous) {
+        /* order everything below after the sends (and the receive) that
+         * still use tmp_buf */
+        mpi_errno = MPID_Sched_barrier(s);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+        if (rank != root) {
+            /* unpack the received data into the user buffer */
+            mpi_errno = MPID_Sched_copy(tmp_buf, nbytes, MPI_PACKED, buffer, count, datatype, s);
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+            mpi_errno = MPID_Sched_barrier(s);
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        }
+
+        /* The root allocated tmp_buf as well, so the free callback is
+         * scheduled on every rank; scheduling it only on non-root ranks
+         * leaked the root's buffer. */
+        mpi_errno = MPID_Sched_cb(&MPIR_Sched_cb_free_buf, tmp_buf, s);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
+
+    MPIU_CHKPMEM_COMMIT();
+fn_exit:
+    return mpi_errno;
+fn_fail:
+    MPIU_CHKPMEM_REAP();
+    goto fn_exit;
+}
+
+/* Default nonblocking broadcast for intracommunicators.  Adds the broadcast
+ * operations to schedule `s` by delegating to MPIR_Ibcast_binomial, the only
+ * algorithm available so far, and returns the delegate's error code.
+ * comm_ptr must be an intracommunicator (asserted). */
+#undef FUNCNAME
+#define FUNCNAME MPIR_Ibcast_intra
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int MPIR_Ibcast_intra(void *buffer, int count, MPI_Datatype datatype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIU_Assert(comm_ptr->comm_kind == MPID_INTRACOMM);
+
+    /* simplistic implementation for now */
+    mpi_errno = MPIR_Ibcast_binomial(buffer, count, datatype, root, comm_ptr, s);
+    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+fn_exit:
+    return mpi_errno;
+fn_fail:
+    goto fn_exit;
+}
+
+/* Provides a generic "flat" broadcast for intercommunicators that doesn't know
+ * anything about hierarchy.  The operations are added to schedule `s`:
+ *   - root == MPI_PROC_NULL: nothing is scheduled;
+ *   - root == MPI_ROOT: one send to rank 0 of the remote group;
+ *   - otherwise (receiving group): rank 0 receives from `root`, then the data
+ *     is broadcast from rank 0 over the group's local intracommunicator,
+ *     which is set up first if it does not exist yet.
+ * Returns mpi_errno: MPI_SUCCESS unless one of the calls reported an error. */
+#undef FUNCNAME
+#define FUNCNAME MPIR_Ibcast_inter
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int MPIR_Ibcast_inter(void *buffer, int count, MPI_Datatype datatype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIU_Assert(comm_ptr->comm_kind == MPID_INTERCOMM);
+
+    /* Intercommunicator broadcast.
+     * Root sends to rank 0 in remote group. Remote group does local
+     * intracommunicator broadcast. */
+    if (root == MPI_PROC_NULL)
+    {
+        /* local processes other than root do nothing */
+        mpi_errno = MPI_SUCCESS;
+    }
+    else if (root == MPI_ROOT)
+    {
+        /* root sends to rank 0 on remote group and returns */
+        mpi_errno = MPID_Sched_send(buffer, count, datatype, 0, comm_ptr, s);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
+    else
+    {
+        /* remote group. Only rank 0 receives from root: the root sends a
+         * single message, so a receive posted by any other rank would never
+         * be matched. */
+        if (comm_ptr->rank == 0) {
+            mpi_errno = MPID_Sched_recv(buffer, count, datatype, root, comm_ptr, s);
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+            /* the data must have arrived before rank 0 forwards it */
+            mpi_errno = MPID_Sched_barrier(s);
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        }
+
+        if (comm_ptr->local_comm == NULL) {
+            mpi_errno = MPIR_Setup_intercomm_localcomm(comm_ptr);
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        }
+
+        /* Broadcast within the local group with rank 0 as root: that is the
+         * rank holding the data.  `root` names a rank in the other group and
+         * is not meaningful in local_comm. */
+        mpi_errno = MPIR_Ibcast_intra(buffer, count, datatype, 0, comm_ptr->local_comm, s);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
+
+fn_exit:
+    return mpi_errno;
+fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
 #define FUNCNAME MPIR_Ibcast_impl
 #undef FCNAME
 #define FCNAME MPIU_QUOTE(FUNCNAME)

Modified: mpich2/trunk/src/mpi/comm/commutil.c
===================================================================
--- mpich2/trunk/src/mpi/comm/commutil.c	2011-01-20 22:08:58 UTC (rev 7784)
+++ mpich2/trunk/src/mpi/comm/commutil.c	2011-01-20 22:09:01 UTC (rev 7785)
@@ -241,6 +241,7 @@
         ops->ref_count = 1; /* force existence until finalize time */
 
         /* intracomm default defaults... */
+        ops->Ibcast = &MPIR_Ibcast_intra;
         /* TODO add other fns here as they are added */
 
         /* override defaults, such as for SMP */
@@ -267,7 +268,7 @@
         ops->ref_count = 1; /* force existence until finalize time */
 
         /* intracomm defaults */
-        ops->Ibcast = NULL;
+        ops->Ibcast = &MPIR_Ibcast_inter;
 
         ic_default_collops = ops;
     }



More information about the mpich2-commits mailing list