Commit 13182820 authored by Wesley Bland
Browse files

Fix bug in shrink when serializing failed procs



The function to convert the group of failed procs to a bitarray was
incorrectly quitting early if one of the globally known failed processes
was not in the communicator being dealt with.
Signed-off-by: Ken Raffenetti <raffenet@mcs.anl.gov>
parent cff52d18
......@@ -50,7 +50,7 @@ int MPIR_Comm_shrink(MPID_Comm *comm_ptr, MPID_Comm **newcomm_ptr)
int mpi_errno = MPI_SUCCESS;
MPID_Group *global_failed, *comm_grp, *new_group_ptr;
int attempts = 0;
mpir_errflag_t errflag = MPIR_ERR_NONE, tmp_errflag = MPIR_ERR_NONE;
mpir_errflag_t errflag = MPIR_ERR_NONE;
MPID_MPI_STATE_DECL(MPID_STATE_MPIR_COMM_SHRINK);
MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_COMM_SHRINK);
......@@ -59,7 +59,9 @@ int MPIR_Comm_shrink(MPID_Comm *comm_ptr, MPID_Comm **newcomm_ptr)
MPIR_Comm_group_impl(comm_ptr, &comm_grp);
do {
mpi_errno = MPID_Comm_get_all_failed_procs(comm_ptr, &global_failed, MPIR_SHRINK_TAG);
errflag = MPIR_ERR_NONE;
MPID_Comm_get_all_failed_procs(comm_ptr, &global_failed, MPIR_SHRINK_TAG);
/* Ignore the mpi_errno value here as it will definitely communicate
* with failed procs */
......@@ -68,10 +70,15 @@ int MPIR_Comm_shrink(MPID_Comm *comm_ptr, MPID_Comm **newcomm_ptr)
if (MPID_Group_empty != global_failed) MPIR_Group_release(global_failed);
mpi_errno = MPIR_Comm_create_group(comm_ptr, new_group_ptr, MPIR_SHRINK_TAG, newcomm_ptr);
errflag = mpi_errno || *newcomm_ptr == NULL;
if (*newcomm_ptr == NULL) {
errflag = MPIR_ERR_PROC_FAILED;
} else if (mpi_errno) {
errflag = MPIR_ERR_GET_CLASS(mpi_errno);
MPIR_Comm_release(*newcomm_ptr, 0);
}
mpi_errno = MPIR_Allreduce_group(MPI_IN_PLACE, &errflag, 1, MPI_INT, MPI_MAX, comm_ptr,
new_group_ptr, MPIR_SHRINK_TAG, &tmp_errflag);
new_group_ptr, MPIR_SHRINK_TAG, &errflag);
MPIR_Group_release(new_group_ptr);
if (errflag) MPIU_Object_set_ref(new_group_ptr, 0);
......
......@@ -33,7 +33,8 @@ static int *group_to_bitarray(MPID_Group *group, MPID_Comm *orig_comm) {
MPIR_Group_translate_ranks_impl(group, group->size, group_ranks,
orig_comm->local_group, comm_ranks);
for (i = 0; i < group->size && comm_ranks[i] != MPI_UNDEFINED; i++) {
for (i = 0; i < group->size ; i++) {
if (comm_ranks[i] == MPI_UNDEFINED) continue;
index = comm_ranks[i] / 32;
mask = 0x80000000 >> comm_ranks[i] % 32;
bitarray[index] |= mask;
......@@ -128,7 +129,7 @@ int MPID_Comm_get_all_failed_procs(MPID_Comm *comm_ptr, MPID_Group **failed_grou
/* Send the list to each rank to be processed locally */
mpi_errno = MPIC_Send(bitarray, bitarray_size, MPI_UINT32_T, i,
tag, comm_ptr->handle, &errflag);
if (mpi_errno) errflag = 1;
if (mpi_errno) errflag = MPIR_ERR_PROC_FAILED;
}
/* Convert the bitarray into a group */
......@@ -137,12 +138,10 @@ int MPID_Comm_get_all_failed_procs(MPID_Comm *comm_ptr, MPID_Group **failed_grou
/* Send my bitarray to rank 0 */
mpi_errno = MPIC_Send(bitarray, bitarray_size, MPI_UINT32_T, 0,
tag, comm_ptr->handle, &errflag);
if (mpi_errno) errflag = 1;
/* Get the resulting bitarray back from rank 0 */
mpi_errno = MPIC_Recv(remote_bitarray, bitarray_size, MPI_UINT32_T, 0,
tag, comm_ptr->handle, MPI_STATUS_IGNORE, &errflag);
if (mpi_errno) errflag = 1;
/* Convert the bitarray into a group */
*failed_group = bitarray_to_group(comm_ptr, remote_bitarray);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment