[mpich2-commits] r7785 - in mpich2/trunk/src: include mpi/coll mpi/comm
goodell at mcs.anl.gov
goodell at mcs.anl.gov
Thu Jan 20 16:09:01 CST 2011
Author: goodell
Date: 2011-01-20 16:09:01 -0600 (Thu, 20 Jan 2011)
New Revision: 7785
Modified:
mpich2/trunk/src/include/mpiimpl.h
mpich2/trunk/src/mpi/coll/ibcast.c
mpich2/trunk/src/mpi/comm/commutil.c
Log:
initial implementation of MPIX_Ibcast
Only supports a binomial broadcast at this time. Override functions
will be honored though.
Reviewed by balaji at .
Modified: mpich2/trunk/src/include/mpiimpl.h
===================================================================
--- mpich2/trunk/src/include/mpiimpl.h 2011-01-20 22:08:58 UTC (rev 7784)
+++ mpich2/trunk/src/include/mpiimpl.h 2011-01-20 22:09:01 UTC (rev 7785)
@@ -3520,7 +3520,10 @@
int MPIR_Iexscan_impl(void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPI_Request *request);
/* end impl functions for NBC */
+int MPIR_Ibcast_intra(void *buffer, int count, MPI_Datatype datatype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s); /* intracommunicator Ibcast: appends the broadcast operations to schedule s */
+int MPIR_Ibcast_inter(void *buffer, int count, MPI_Datatype datatype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s); /* intercommunicator Ibcast: appends the broadcast operations to schedule s */
+
/* random initializers */
int MPIR_Group_init(void);
int MPIR_Comm_init(MPID_Comm *);
Modified: mpich2/trunk/src/mpi/coll/ibcast.c
===================================================================
--- mpich2/trunk/src/mpi/coll/ibcast.c 2011-01-20 22:08:58 UTC (rev 7784)
+++ mpich2/trunk/src/mpi/coll/ibcast.c 2011-01-20 22:09:01 UTC (rev 7785)
@@ -24,7 +24,241 @@
/* any non-MPI functions go here, especially non-static ones */
+/* Adds operations to the given schedule (s) that broadcast (buffer, count,
+ * datatype) from rank "root" over comm_ptr along a binomial tree. It does _not_
+ * start the schedule, which permits callers to build up a larger hierarchical
+ * broadcast from multiple invocations. Returns MPI_SUCCESS or an MPI error code. */
#undef FUNCNAME
+#define FUNCNAME MPIR_Ibcast_binomial
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static int MPIR_Ibcast_binomial(void *buffer, int count, MPI_Datatype datatype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s)
+{
+ int mpi_errno = MPI_SUCCESS;
+ int mask; /* single-bit mask walked across the bits of relative_rank */
+ int comm_size, rank;
+ int type_size, is_contig, is_homogeneous;
+ int nbytes; /* type_size * count: byte length of the packed message */
+ int relative_rank; /* this process's rank rotated so that root maps to 0 */
+ int src, dst;
+ void *tmp_buf = NULL; /* pack buffer; only allocated for non-contiguous data or heterogeneous comms */
+ MPIU_CHKPMEM_DECL(1);
+
+ comm_size = comm_ptr->local_size;
+ rank = comm_ptr->rank;
+
+ if (comm_size == 1) {
+ /* nothing to add, this is a useless broadcast (nothing has been allocated yet, so a plain exit is fine) */
+ goto fn_exit;
+ }
+
+ MPID_Datatype_is_contig(datatype, &is_contig);
+
+ is_homogeneous = 1;
+#ifdef MPID_HAS_HETERO
+ if (comm_ptr->is_hetero)
+ is_homogeneous = 0;
+#endif
+
+ /* MPI_Type_size() might not give the accurate size of the packed
+ * datatype for heterogeneous systems (because of padding, encoding,
+ * etc). On the other hand, MPI_Pack_size() can become very
+ * expensive, depending on the implementation, especially for
+ * heterogeneous systems. We want to use MPI_Type_size() wherever
+ * possible, and MPI_Pack_size() in other places.
+ */
+ if (is_homogeneous)
+ MPID_Datatype_get_size_macro(datatype, type_size);
+ else
+ MPIR_Pack_size_impl(1, datatype, &type_size);
+
+ nbytes = type_size * count; /* NOTE(review): int arithmetic; may overflow for very large messages -- confirm callers bound count */
+
+ if (!is_contig || !is_homogeneous)
+ {
+ MPIU_CHKPMEM_MALLOC(tmp_buf, void *, nbytes, mpi_errno, "tmp_buf"); /* every rank stages the packed data in tmp_buf on this path */
+
+ /* TODO: Pipeline the packing and communication */
+ if (rank == root) { /* only the root has data to pack before the tree starts */
+ mpi_errno = MPID_Sched_copy(buffer, count, datatype, tmp_buf, nbytes, MPI_PACKED, s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ mpi_errno = MPID_Sched_barrier(s); /* presumably orders the pack before the sends scheduled below */
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ }
+ }
+
+ relative_rank = (rank >= root) ? rank - root : rank - root + comm_size;
+
+ /* Use short message algorithm, namely, binomial tree */
+
+ /* Algorithm:
+ This uses a fairly basic recursive subdivision algorithm.
+ The root sends to the process comm_size/2 away; the receiver becomes
+ a root for a subtree and applies the same process.
+
+ So that the new root can easily identify the size of its
+ subtree, the (subtree) roots are all powers of two (relative
+ to the root). If m = the first power of 2 such that 2^m >= the
+ size of the communicator, then the subtree rooted at 2^(m-k)
+ has size 2^k (with special handling for subtrees that aren't
+ a power of two in size).
+
+ Do subdivision. There are two phases:
+ 1. Wait for arrival of data. Because of the power of two nature
+ of the subtree roots, the source of this message is always the
+ process whose relative rank has the least significant 1 bit CLEARED.
+ That is, process 4 (100) receives from process 0, process 7 (111)
+ from process 6 (110), etc.
+ 2. Forward to my subtree
+
+ Note that the process that is the tree root is handled automatically
+ by this code, since it has no bits set. */
+
+ mask = 0x1;
+ while (mask < comm_size) { /* phase 1: stop at the lowest set bit of relative_rank and receive from the parent */
+ if (relative_rank & mask) {
+ src = rank - mask; /* parent's real rank, wrapped into [0, comm_size) on the next line */
+ if (src < 0) src += comm_size;
+ if (!is_contig || !is_homogeneous)
+ mpi_errno = MPID_Sched_recv(tmp_buf, nbytes, MPI_BYTE, src, comm_ptr, s);
+ else
+ mpi_errno = MPID_Sched_recv(buffer, count, datatype, src, comm_ptr, s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+ mpi_errno = MPID_Sched_barrier(s); /* presumably keeps the forwarding sends below from starting before the data has arrived */
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+ break;
+ }
+ mask <<= 1;
+ }
+
+ /* This process is responsible for all processes that have bits
+ set from the LSB up to (but not including) mask. Because of
+ the "not including", we start by shifting mask back down one.
+
+ We can easily change to a different algorithm at any power of two
+ by changing the test (mask > 1) to (mask > block_size)
+
+ One such version would use non-blocking operations for the last 2-4
+ steps (this also bounds the number of MPI_Requests that would
+ be needed). */
+
+ mask >>= 1;
+ while (mask > 0) { /* phase 2: forward to children, walking mask from the highest bit down */
+ if (relative_rank + mask < comm_size) { /* skip children that would fall outside the communicator */
+ dst = rank + mask; /* child's real rank, wrapped into [0, comm_size) on the next line */
+ if (dst >= comm_size) dst -= comm_size;
+ if (!is_contig || !is_homogeneous)
+ mpi_errno = MPID_Sched_send(tmp_buf, nbytes, MPI_BYTE, dst, comm_ptr, s);
+ else
+ mpi_errno = MPID_Sched_send(buffer, count, datatype, dst, comm_ptr, s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+ /* NOTE: This is a departure from MPIR_Bcast_binomial. A true analog
+ * would put an MPID_Sched_barrier here after every send. */
+ }
+ mask >>= 1;
+ }
+
+ if (!is_contig || !is_homogeneous) {
+ if (rank != root) { /* NOTE(review): unpack and the free callback are scheduled on non-root ranks only; the root's tmp_buf gets no free scheduled in this function -- looks like a leak, confirm */
+ mpi_errno = MPID_Sched_barrier(s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+ mpi_errno = MPID_Sched_copy(tmp_buf, nbytes, MPI_PACKED, buffer, count, datatype, s); /* unpack the received bytes into the user buffer */
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+ mpi_errno = MPID_Sched_barrier(s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+ mpi_errno = MPID_Sched_cb(&MPIR_Sched_cb_free_buf, tmp_buf, s); /* presumably frees tmp_buf when the schedule reaches this entry -- verify against MPIR_Sched_cb_free_buf */
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ }
+ }
+
+ MPIU_CHKPMEM_COMMIT(); /* success: presumably keeps tmp_buf allocated past return, since the schedule still refers to it */
+fn_exit:
+ return mpi_errno;
+fn_fail:
+ MPIU_CHKPMEM_REAP(); /* failure: presumably releases tmp_buf if it was allocated */
+ goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIR_Ibcast_intra
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int MPIR_Ibcast_intra(void *buffer, int count, MPI_Datatype datatype, int root, MPID_Comm *comm_ptr, MPI_Request *request) /* NOTE(review): stray header -- it has no body or ';' and its last parameter differs from the definition below; looks like leftover text (with the four preprocessor lines above it) that would not compile as shown -- confirm and remove */
+#undef FUNCNAME
+#define FUNCNAME MPIR_Ibcast_intra
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int MPIR_Ibcast_intra(void *buffer, int count, MPI_Datatype datatype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s) /* Intracommunicator Ibcast: appends the broadcast from "root" to schedule s; returns MPI_SUCCESS or an MPI error code */
+{
+ int mpi_errno = MPI_SUCCESS;
+
+ MPIU_Assert(comm_ptr->comm_kind == MPID_INTRACOMM);
+
+ /* simplistic implementation for now: always the binomial algorithm, with no selection on message or communicator size */
+ mpi_errno = MPIR_Ibcast_binomial(buffer, count, datatype, root, comm_ptr, s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+fn_exit:
+ return mpi_errno;
+fn_fail:
+ goto fn_exit;
+}
+
+/* Provides a generic "flat" broadcast for intercommunicators that doesn't know
+ * anything about hierarchy. It adds the operations to schedule s; only one algorithm is implemented
+ * here (root sends to the remote group, which then runs an intracommunicator broadcast). Returns MPI_SUCCESS or an MPI error code. */
+#undef FUNCNAME
+#define FUNCNAME MPIR_Ibcast_inter
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int MPIR_Ibcast_inter(void *buffer, int count, MPI_Datatype datatype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s)
+{
+ int mpi_errno = MPI_SUCCESS;
+
+ MPIU_Assert(comm_ptr->comm_kind == MPID_INTERCOMM);
+
+ /* Intercommunicator broadcast.
+ * Root sends to rank 0 in remote group. Remote group does local
+ * intracommunicator broadcast. */
+ if (root == MPI_PROC_NULL)
+ {
+ /* local processes other than root do nothing: no operation is added to the schedule */
+ mpi_errno = MPI_SUCCESS;
+ }
+ else if (root == MPI_ROOT)
+ {
+ /* root sends to rank 0 on remote group and returns */
+ mpi_errno = MPID_Sched_send(buffer, count, datatype, 0, comm_ptr, s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ }
+ else
+ {
+ /* remote group. rank 0 on remote group receives from root. NOTE(review): the recv below is scheduled on every rank of this group, while the root schedules a single send to rank 0 -- a (comm_ptr->rank == 0) guard looks missing; confirm */
+ mpi_errno = MPID_Sched_recv(buffer, count, datatype, root, comm_ptr, s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+ if (comm_ptr->local_comm == NULL) { /* create the local intracommunicator on first use */
+ mpi_errno = MPIR_Setup_intercomm_localcomm(comm_ptr);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ }
+
+ mpi_errno = MPIR_Ibcast_intra(buffer, count, datatype, root, comm_ptr->local_comm, s); /* NOTE(review): "root" is a rank in the other group; since the data arrives at local rank 0, the local root should presumably be 0. Also no MPID_Sched_barrier separates the recv above from the sends added here -- confirm both */
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ }
+
+fn_exit:
+ return mpi_errno;
+fn_fail:
+ goto fn_exit;
+}
+
+#undef FUNCNAME
#define FUNCNAME MPIR_Ibcast_impl
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
Modified: mpich2/trunk/src/mpi/comm/commutil.c
===================================================================
--- mpich2/trunk/src/mpi/comm/commutil.c 2011-01-20 22:08:58 UTC (rev 7784)
+++ mpich2/trunk/src/mpi/comm/commutil.c 2011-01-20 22:09:01 UTC (rev 7785)
@@ -241,6 +241,7 @@
ops->ref_count = 1; /* force existence until finalize time */
/* intracomm default defaults... */
+ ops->Ibcast = &MPIR_Ibcast_intra;
/* TODO add other fns here as they are added */
/* override defaults, such as for SMP */
@@ -267,7 +268,7 @@
ops->ref_count = 1; /* force existence until finalize time */
/* intracomm defaults */
- ops->Ibcast = NULL;
+ ops->Ibcast = &MPIR_Ibcast_inter;
ic_default_collops = ops;
}
More information about the mpich2-commits
mailing list