[mpich2-commits] r7792 - in mpich2/trunk: . src/include src/mpi/coll src/mpi/comm
goodell at mcs.anl.gov
goodell at mcs.anl.gov
Thu Jan 20 16:09:20 CST 2011
Author: goodell
Date: 2011-01-20 16:09:20 -0600 (Thu, 20 Jan 2011)
New Revision: 7792
Modified:
mpich2/trunk/CHANGES
mpich2/trunk/src/include/mpiimpl.h
mpich2/trunk/src/mpi/coll/ialltoallv.c
mpich2/trunk/src/mpi/comm/commutil.c
Log:
default implementation of MPIX_Ialltoallv
This is a straightforward port of the algorithms in MPI_Alltoallv.
No reviewer.
Modified: mpich2/trunk/CHANGES
===================================================================
--- mpich2/trunk/CHANGES 2011-01-20 22:09:17 UTC (rev 7791)
+++ mpich2/trunk/CHANGES 2011-01-20 22:09:20 UTC (rev 7792)
@@ -6,7 +6,7 @@
"MPIX_" functions (e.g. "MPIX_Ibcast"). All functions are provided and are
available for device-level overrides, but only a subset of the functions have
device-independent implementations at this time. The current list of working
- functions is: MPIX_Ibcast, MPIX_Ibarrier, MPIX_Ireduce.
+ functions is: MPIX_Ibcast, MPIX_Ibarrier, MPIX_Ireduce, MPIX_Ialltoallv.
===============================================================================
Modified: mpich2/trunk/src/include/mpiimpl.h
===================================================================
--- mpich2/trunk/src/include/mpiimpl.h 2011-01-20 22:09:17 UTC (rev 7791)
+++ mpich2/trunk/src/include/mpiimpl.h 2011-01-20 22:09:20 UTC (rev 7792)
@@ -3528,6 +3528,8 @@
int MPIR_Ireduce_intra(void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
int MPIR_Ireduce_inter(void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
int MPIR_Ireduce_binomial(void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
+int MPIR_Ialltoallv_intra(void *sendbuf, int *sendcounts, int *sdispls, MPI_Datatype sendtype, void *recvbuf, int *recvcounts, int *rdispls, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
+int MPIR_Ialltoallv_inter(void *sendbuf, int *sendcounts, int *sdispls, MPI_Datatype sendtype, void *recvbuf, int *recvcounts, int *rdispls, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
/* random initializers */
Modified: mpich2/trunk/src/mpi/coll/ialltoallv.c
===================================================================
--- mpich2/trunk/src/mpi/coll/ialltoallv.c 2011-01-20 22:09:17 UTC (rev 7791)
+++ mpich2/trunk/src/mpi/coll/ialltoallv.c 2011-01-20 22:09:20 UTC (rev 7792)
@@ -25,6 +25,218 @@
/* any non-MPI functions go here, especially non-static ones */
#undef FUNCNAME
+#define FUNCNAME MPIR_Ialltoallv_intra
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int MPIR_Ialltoallv_intra(void *sendbuf, int *sendcounts, int *sdispls, MPI_Datatype sendtype, void *recvbuf, int *recvcounts, int *rdispls, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s)
+{
+ int mpi_errno = MPI_SUCCESS;
+ int comm_size;
+ int i, j;
+ int ii, ss, bblock;
+ MPI_Aint send_extent, recv_extent, sendtype_size, recvtype_size;
+ int dst, rank;
+ MPIU_CHKPMEM_DECL(1);
+
+ MPIU_Assert(comm_ptr->comm_kind == MPID_INTRACOMM);
+
+ comm_size = comm_ptr->local_size;
+ rank = comm_ptr->rank;
+
+ /* Get extent and size of recvtype, don't look at sendtype for MPI_IN_PLACE */
+ MPID_Datatype_get_extent_macro(recvtype, recv_extent);
+ MPID_Datatype_get_size_macro(recvtype, recvtype_size);
+
+ if (sendbuf == MPI_IN_PLACE) {
+ int max_count;
+ void *tmp_buf = NULL;
+
+ /* The regular MPI_Alltoallv handles MPI_IN_PLACE using pairwise
+ * sendrecv_replace calls. We don't have a sendrecv_replace, so just
+ * malloc the maximum of the counts array entries and then perform the
+ * pairwise exchanges manually with schedule barriers instead.
+ *
+ * Because of this approach all processes must agree on the global
+ * schedule of "sendrecv_replace" operations to avoid deadlock.
+ *
+ * This keeps with the spirit of the MPI-2.2 standard, which is to
+ * conserve memory when using MPI_IN_PLACE for these routines.
+ * Something like MADRE would probably generate a more optimal
+ * algorithm. */
+ max_count = 0;
+ for (i = 0; i < comm_size; ++i) {
+ max_count = MPIU_MAX(max_count, sendcounts[i]);
+ max_count = MPIU_MAX(max_count, recvcounts[i]);
+ }
+
+ MPIU_CHKPMEM_MALLOC(tmp_buf, void *, max_count*recv_extent, mpi_errno, "Ialltoallv tmp_buf");
+
+ for (i = 0; i < comm_size; ++i) {
+ /* start inner loop at i to avoid re-exchanging data */
+ for (j = i; j < comm_size; ++j) {
+ if (rank == i && rank == j) {
+ /* no need to "sendrecv_replace" for ourselves */
+ }
+ else if (rank == i || rank == j) {
+ if (rank == i)
+ dst = j;
+ else
+ dst = i;
+
+ mpi_errno = MPID_Sched_send(((char *)recvbuf + rdispls[dst]*recv_extent),
+ recvcounts[dst], recvtype, dst, comm_ptr, s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ mpi_errno = MPID_Sched_recv(tmp_buf, recvcounts[dst], recvtype, dst, comm_ptr, s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ mpi_errno = MPID_Sched_barrier(s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+ mpi_errno = MPID_Sched_copy(tmp_buf, recvcounts[dst], recvtype,
+ ((char *)recvbuf + rdispls[dst]*recv_extent),
+ recvcounts[dst], recvtype, s);
+ mpi_errno = MPID_Sched_barrier(s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ }
+ }
+ }
+
+ /* cleanup tmp_buf */
+ mpi_errno = MPID_Sched_barrier(s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ mpi_errno = MPID_Sched_cb(&MPIR_Sched_cb_free_buf, tmp_buf, s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ }
+ else {
+ bblock = MPIR_PARAM_ALLTOALL_THROTTLE;
+ if (bblock == 0)
+ bblock = comm_size;
+
+ /* get size/extent for sendtype */
+ MPID_Datatype_get_extent_macro(sendtype, send_extent);
+ MPID_Datatype_get_size_macro(sendtype, sendtype_size);
+
+ /* post only bblock isends/irecvs at a time as suggested by Tony Ladd */
+ for (ii=0; ii<comm_size; ii+=bblock) {
+ ss = comm_size-ii < bblock ? comm_size-ii : bblock;
+
+ /* do the communication -- post ss sends and receives: */
+ for (i=0; i < ss; i++) {
+ dst = (rank+i+ii) % comm_size;
+ if (recvcounts[dst] && recvtype_size) {
+ MPID_Ensure_Aint_fits_in_pointer(MPI_VOID_PTR_CAST_TO_MPI_AINT recvbuf +
+ rdispls[dst]*recv_extent);
+ mpi_errno = MPID_Sched_recv((char *)recvbuf+rdispls[dst]*recv_extent,
+ recvcounts[dst], recvtype, dst, comm_ptr, s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ }
+ }
+
+ for (i=0; i < ss; i++) {
+ dst = (rank-i-ii+comm_size) % comm_size;
+ if (sendcounts[dst] && sendtype_size) {
+ MPID_Ensure_Aint_fits_in_pointer(MPI_VOID_PTR_CAST_TO_MPI_AINT sendbuf +
+ sdispls[dst]*send_extent);
+ mpi_errno = MPID_Sched_send((char *)sendbuf+sdispls[dst]*send_extent,
+ sendcounts[dst], sendtype, dst, comm_ptr, s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ }
+ }
+
+ /* force our block of sends/recvs to complete before starting the next block */
+ mpi_errno = MPID_Sched_barrier(s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ }
+ }
+
+ MPIU_CHKPMEM_COMMIT();
+fn_exit:
+ return mpi_errno;
+fn_fail:
+ MPIU_CHKPMEM_REAP();
+ goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIR_Ialltoallv_inter
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int MPIR_Ialltoallv_inter(void *sendbuf, int *sendcounts, int *sdispls, MPI_Datatype sendtype, void *recvbuf, int *recvcounts, int *rdispls, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s)
+{
+/* Intercommunicator alltoallv. We use a pairwise exchange algorithm
+ similar to the one used in intracommunicator alltoallv. Since the
+ local and remote groups can be of different
+ sizes, we first compute the max of local_group_size,
+ remote_group_size. At step i, 0 <= i < max_size, each process
+ receives from src = (rank - i + max_size) % max_size if src <
+ remote_size, and sends to dst = (rank + i) % max_size if dst <
+ remote_size.
+*/
+ int mpi_errno = MPI_SUCCESS;
+ int local_size, remote_size, max_size, i;
+ MPI_Aint send_extent, recv_extent, sendtype_size, recvtype_size;
+ int src, dst, rank, sendcount, recvcount;
+ char *sendaddr, *recvaddr;
+
+ MPIU_Assert(comm_ptr->comm_kind == MPID_INTERCOMM);
+
+ local_size = comm_ptr->local_size;
+ remote_size = comm_ptr->remote_size;
+ rank = comm_ptr->rank;
+
+ /* Get extent of send and recv types */
+ MPID_Datatype_get_extent_macro(sendtype, send_extent);
+ MPID_Datatype_get_extent_macro(recvtype, recv_extent);
+ MPID_Datatype_get_size_macro(sendtype, sendtype_size);
+ MPID_Datatype_get_size_macro(recvtype, recvtype_size);
+
+ /* Use pairwise exchange algorithm. */
+ max_size = MPIR_MAX(local_size, remote_size);
+ for (i=0; i<max_size; i++) {
+ src = (rank - i + max_size) % max_size;
+ dst = (rank + i) % max_size;
+ if (src >= remote_size) {
+ src = MPI_PROC_NULL;
+ recvaddr = NULL;
+ recvcount = 0;
+ }
+ else {
+ MPID_Ensure_Aint_fits_in_pointer(MPI_VOID_PTR_CAST_TO_MPI_AINT recvbuf +
+ rdispls[src]*recv_extent);
+ recvaddr = (char *)recvbuf + rdispls[src]*recv_extent;
+ recvcount = recvcounts[src];
+ }
+ if (dst >= remote_size) {
+ dst = MPI_PROC_NULL;
+ sendaddr = NULL;
+ sendcount = 0;
+ }
+ else {
+ MPID_Ensure_Aint_fits_in_pointer(MPI_VOID_PTR_CAST_TO_MPI_AINT sendbuf +
+ sdispls[dst]*send_extent);
+ sendaddr = (char *)sendbuf + sdispls[dst]*send_extent;
+ sendcount = sendcounts[dst];
+ }
+
+ if (sendcount * sendtype_size == 0)
+ dst = MPI_PROC_NULL;
+ if (recvcount * recvtype_size == 0)
+ src = MPI_PROC_NULL;
+
+ mpi_errno = MPID_Sched_send(sendaddr, sendcount, sendtype, dst, comm_ptr, s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ mpi_errno = MPID_Sched_recv(recvaddr, recvcount, recvtype, src, comm_ptr, s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ mpi_errno = MPID_Sched_barrier(s);
+ if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+ }
+
+fn_exit:
+ return mpi_errno;
+fn_fail:
+ goto fn_exit;
+}
+
+#undef FUNCNAME
#define FUNCNAME MPIR_Ialltoallv_impl
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
Modified: mpich2/trunk/src/mpi/comm/commutil.c
===================================================================
--- mpich2/trunk/src/mpi/comm/commutil.c 2011-01-20 22:09:17 UTC (rev 7791)
+++ mpich2/trunk/src/mpi/comm/commutil.c 2011-01-20 22:09:20 UTC (rev 7792)
@@ -244,6 +244,7 @@
ops->Ibcast = &MPIR_Ibcast_intra;
ops->Ibarrier = &MPIR_Ibarrier_intra;
ops->Ireduce = &MPIR_Ireduce_intra;
+ ops->Ialltoallv = &MPIR_Ialltoallv_intra;
/* TODO add other fns here as they are added */
/* override defaults, such as for SMP */
@@ -274,6 +275,7 @@
ops->Ibcast = &MPIR_Ibcast_inter;
ops->Ibarrier = &MPIR_Ibarrier_inter;
ops->Ireduce = &MPIR_Ireduce_inter;
+ ops->Ialltoallv = &MPIR_Ialltoallv_inter;
ic_default_collops = ops;
}
More information about the mpich2-commits
mailing list