Commit 13182820 authored by Wesley Bland
Browse files

Fix bug in shrink when serializing failed procs



The function to convert the group of failed procs to a bitarray was
incorrectly quitting early if one of the globally known failed processes
was not in the communicator being dealt with.
Signed-off-by: Ken Raffenetti <raffenet@mcs.anl.gov>
parent cff52d18
...@@ -50,7 +50,7 @@ int MPIR_Comm_shrink(MPID_Comm *comm_ptr, MPID_Comm **newcomm_ptr) ...@@ -50,7 +50,7 @@ int MPIR_Comm_shrink(MPID_Comm *comm_ptr, MPID_Comm **newcomm_ptr)
int mpi_errno = MPI_SUCCESS; int mpi_errno = MPI_SUCCESS;
MPID_Group *global_failed, *comm_grp, *new_group_ptr; MPID_Group *global_failed, *comm_grp, *new_group_ptr;
int attempts = 0; int attempts = 0;
mpir_errflag_t errflag = MPIR_ERR_NONE, tmp_errflag = MPIR_ERR_NONE; mpir_errflag_t errflag = MPIR_ERR_NONE;
MPID_MPI_STATE_DECL(MPID_STATE_MPIR_COMM_SHRINK); MPID_MPI_STATE_DECL(MPID_STATE_MPIR_COMM_SHRINK);
MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_COMM_SHRINK); MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_COMM_SHRINK);
...@@ -59,7 +59,9 @@ int MPIR_Comm_shrink(MPID_Comm *comm_ptr, MPID_Comm **newcomm_ptr) ...@@ -59,7 +59,9 @@ int MPIR_Comm_shrink(MPID_Comm *comm_ptr, MPID_Comm **newcomm_ptr)
MPIR_Comm_group_impl(comm_ptr, &comm_grp); MPIR_Comm_group_impl(comm_ptr, &comm_grp);
do { do {
mpi_errno = MPID_Comm_get_all_failed_procs(comm_ptr, &global_failed, MPIR_SHRINK_TAG); errflag = MPIR_ERR_NONE;
MPID_Comm_get_all_failed_procs(comm_ptr, &global_failed, MPIR_SHRINK_TAG);
/* Ignore the mpi_errno value here as it will definitely communicate /* Ignore the mpi_errno value here as it will definitely communicate
* with failed procs */ * with failed procs */
...@@ -68,10 +70,15 @@ int MPIR_Comm_shrink(MPID_Comm *comm_ptr, MPID_Comm **newcomm_ptr) ...@@ -68,10 +70,15 @@ int MPIR_Comm_shrink(MPID_Comm *comm_ptr, MPID_Comm **newcomm_ptr)
if (MPID_Group_empty != global_failed) MPIR_Group_release(global_failed); if (MPID_Group_empty != global_failed) MPIR_Group_release(global_failed);
mpi_errno = MPIR_Comm_create_group(comm_ptr, new_group_ptr, MPIR_SHRINK_TAG, newcomm_ptr); mpi_errno = MPIR_Comm_create_group(comm_ptr, new_group_ptr, MPIR_SHRINK_TAG, newcomm_ptr);
errflag = mpi_errno || *newcomm_ptr == NULL; if (*newcomm_ptr == NULL) {
errflag = MPIR_ERR_PROC_FAILED;
} else if (mpi_errno) {
errflag = MPIR_ERR_GET_CLASS(mpi_errno);
MPIR_Comm_release(*newcomm_ptr, 0);
}
mpi_errno = MPIR_Allreduce_group(MPI_IN_PLACE, &errflag, 1, MPI_INT, MPI_MAX, comm_ptr, mpi_errno = MPIR_Allreduce_group(MPI_IN_PLACE, &errflag, 1, MPI_INT, MPI_MAX, comm_ptr,
new_group_ptr, MPIR_SHRINK_TAG, &tmp_errflag); new_group_ptr, MPIR_SHRINK_TAG, &errflag);
MPIR_Group_release(new_group_ptr); MPIR_Group_release(new_group_ptr);
if (errflag) MPIU_Object_set_ref(new_group_ptr, 0); if (errflag) MPIU_Object_set_ref(new_group_ptr, 0);
......
...@@ -33,7 +33,8 @@ static int *group_to_bitarray(MPID_Group *group, MPID_Comm *orig_comm) { ...@@ -33,7 +33,8 @@ static int *group_to_bitarray(MPID_Group *group, MPID_Comm *orig_comm) {
MPIR_Group_translate_ranks_impl(group, group->size, group_ranks, MPIR_Group_translate_ranks_impl(group, group->size, group_ranks,
orig_comm->local_group, comm_ranks); orig_comm->local_group, comm_ranks);
for (i = 0; i < group->size && comm_ranks[i] != MPI_UNDEFINED; i++) { for (i = 0; i < group->size ; i++) {
if (comm_ranks[i] == MPI_UNDEFINED) continue;
index = comm_ranks[i] / 32; index = comm_ranks[i] / 32;
mask = 0x80000000 >> comm_ranks[i] % 32; mask = 0x80000000 >> comm_ranks[i] % 32;
bitarray[index] |= mask; bitarray[index] |= mask;
...@@ -128,7 +129,7 @@ int MPID_Comm_get_all_failed_procs(MPID_Comm *comm_ptr, MPID_Group **failed_grou ...@@ -128,7 +129,7 @@ int MPID_Comm_get_all_failed_procs(MPID_Comm *comm_ptr, MPID_Group **failed_grou
/* Send the list to each rank to be processed locally */ /* Send the list to each rank to be processed locally */
mpi_errno = MPIC_Send(bitarray, bitarray_size, MPI_UINT32_T, i, mpi_errno = MPIC_Send(bitarray, bitarray_size, MPI_UINT32_T, i,
tag, comm_ptr->handle, &errflag); tag, comm_ptr->handle, &errflag);
if (mpi_errno) errflag = 1; if (mpi_errno) errflag = MPIR_ERR_PROC_FAILED;
} }
/* Convert the bitarray into a group */ /* Convert the bitarray into a group */
...@@ -137,12 +138,10 @@ int MPID_Comm_get_all_failed_procs(MPID_Comm *comm_ptr, MPID_Group **failed_grou ...@@ -137,12 +138,10 @@ int MPID_Comm_get_all_failed_procs(MPID_Comm *comm_ptr, MPID_Group **failed_grou
/* Send my bitarray to rank 0 */ /* Send my bitarray to rank 0 */
mpi_errno = MPIC_Send(bitarray, bitarray_size, MPI_UINT32_T, 0, mpi_errno = MPIC_Send(bitarray, bitarray_size, MPI_UINT32_T, 0,
tag, comm_ptr->handle, &errflag); tag, comm_ptr->handle, &errflag);
if (mpi_errno) errflag = 1;
/* Get the resulting bitarray back from rank 0 */ /* Get the resulting bitarray back from rank 0 */
mpi_errno = MPIC_Recv(remote_bitarray, bitarray_size, MPI_UINT32_T, 0, mpi_errno = MPIC_Recv(remote_bitarray, bitarray_size, MPI_UINT32_T, 0,
tag, comm_ptr->handle, MPI_STATUS_IGNORE, &errflag); tag, comm_ptr->handle, MPI_STATUS_IGNORE, &errflag);
if (mpi_errno) errflag = 1;
/* Convert the bitarray into a group */ /* Convert the bitarray into a group */
*failed_group = bitarray_to_group(comm_ptr, remote_bitarray); *failed_group = bitarray_to_group(comm_ptr, remote_bitarray);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment