Commit c8435ec4 authored by Lisandro Dalcin's avatar Lisandro Dalcin Committed by Sangmin Seo
Browse files

Better implementation of MPI_Allreduce for intercommunicator.



The patch provides a better implementation where each group does a
intra-reduce concurrently, exchanges the reduce result, and broadcasts
in the group. The current implementation had a problem of serializing
intra-reduces of each group.

Fixes ticket #2074.
Signed-off-by: Sangmin Seo's avatarSangmin Seo <sseo@anl.gov>
parent bce35b9f
...@@ -625,60 +625,55 @@ int MPIR_Allreduce_inter ( ...@@ -625,60 +625,55 @@ int MPIR_Allreduce_inter (
int *errflag ) int *errflag )
{ {
/* Intercommunicator Allreduce. /* Intercommunicator Allreduce.
We first do an intercommunicator reduce to rank 0 on left group, We first do intracommunicator reduces to rank 0 on left and right
then an intercommunicator reduce to rank 0 on right group, followed groups, then an exchange between left and right rank 0, and finally
by local intracommunicator broadcasts in each group. intracommunicator broadcasts from rank 0 on left and right group.
We don't do local reduces first and then intercommunicator
broadcasts because it would require allocation of a temporary buffer.
*/ */
int rank, mpi_errno, root; int mpi_errno;
int mpi_errno_ret = MPI_SUCCESS; int mpi_errno_ret = MPI_SUCCESS;
MPI_Aint true_extent, true_lb, extent;
void *tmp_buf=NULL;
MPID_Comm *newcomm_ptr = NULL; MPID_Comm *newcomm_ptr = NULL;
MPI_Comm comm;
rank = comm_ptr->rank; MPIU_CHKLMEM_DECL(1);
/* first do a reduce from right group to rank 0 in left group,
then from left group to rank 0 in right group*/
if (comm_ptr->is_low_group) {
/* reduce from right group to rank 0*/
root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
mpi_errno = MPIR_Reduce_inter(sendbuf, recvbuf, count, datatype, op,
root, comm_ptr, errflag);
if (mpi_errno) {
/* for communication errors, just record the error but continue */
*errflag = TRUE;
MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
}
/* reduce to rank 0 of right group */ MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );
root = 0;
mpi_errno = MPIR_Reduce_inter(sendbuf, recvbuf, count, datatype, op, if (comm_ptr->rank == 0) {
root, comm_ptr, errflag); MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);
if (mpi_errno) { MPID_Datatype_get_extent_macro(datatype, extent);
/* for communication errors, just record the error but continue */ /* I think this is the worse case, so we can avoid an assert()
*errflag = TRUE; * inside the for loop */
MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); /* Should MPIU_CHKLMEM_MALLOC do this? */
MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); MPID_Ensure_Aint_fits_in_pointer(count * MPIR_MAX(extent, true_extent));
} MPIU_CHKLMEM_MALLOC(tmp_buf, void *, count*(MPIR_MAX(extent,true_extent)), mpi_errno, "temporary buffer");
/* adjust for potential negative lower bound in datatype */
tmp_buf = (void *)((char*)tmp_buf - true_lb);
}
comm = comm_ptr->handle;
/* Get the local intracommunicator */
if (!comm_ptr->local_comm)
MPIR_Setup_intercomm_localcomm( comm_ptr );
newcomm_ptr = comm_ptr->local_comm;
/* Do a local reduce on this intracommunicator */
mpi_errno = MPIR_Reduce_intra(sendbuf, tmp_buf, count, datatype,
op, 0, newcomm_ptr, errflag);
if (mpi_errno) {
/* for communication errors, just record the error but continue */
*errflag = TRUE;
MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
} }
else {
/* reduce to rank 0 of left group */
root = 0;
mpi_errno = MPIR_Reduce_inter(sendbuf, recvbuf, count, datatype, op,
root, comm_ptr, errflag);
if (mpi_errno) {
/* for communication errors, just record the error but continue */
*errflag = TRUE;
MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
}
/* reduce from right group to rank 0 */ /* Do a exchange between local and remote rank 0 on this intercommunicator */
root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL; if (comm_ptr->rank == 0) {
mpi_errno = MPIR_Reduce_inter(sendbuf, recvbuf, count, datatype, op, mpi_errno = MPIC_Sendrecv(tmp_buf, count, datatype, 0, MPIR_REDUCE_TAG,
root, comm_ptr, errflag); recvbuf, count, datatype, 0, MPIR_REDUCE_TAG,
comm, MPI_STATUS_IGNORE, errflag);
if (mpi_errno) { if (mpi_errno) {
/* for communication errors, just record the error but continue */ /* for communication errors, just record the error but continue */
*errflag = TRUE; *errflag = TRUE;
...@@ -687,13 +682,9 @@ int MPIR_Allreduce_inter ( ...@@ -687,13 +682,9 @@ int MPIR_Allreduce_inter (
} }
} }
/* Get the local intracommunicator */ /* Do a local broadcast on this intracommunicator */
if (!comm_ptr->local_comm) mpi_errno = MPIR_Bcast_impl(recvbuf, count, datatype,
MPIR_Setup_intercomm_localcomm( comm_ptr ); 0, newcomm_ptr, errflag);
newcomm_ptr = comm_ptr->local_comm;
mpi_errno = MPIR_Bcast_impl(recvbuf, count, datatype, 0, newcomm_ptr, errflag);
if (mpi_errno) { if (mpi_errno) {
/* for communication errors, just record the error but continue */ /* for communication errors, just record the error but continue */
*errflag = TRUE; *errflag = TRUE;
...@@ -702,6 +693,8 @@ int MPIR_Allreduce_inter ( ...@@ -702,6 +693,8 @@ int MPIR_Allreduce_inter (
} }
fn_exit: fn_exit:
MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
MPIU_CHKLMEM_FREEALL();
if (mpi_errno_ret) if (mpi_errno_ret)
mpi_errno = mpi_errno_ret; mpi_errno = mpi_errno_ret;
else if (*errflag) else if (*errflag)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment