Commit 93e816cc authored by Huiwei Lu
Browse files

FT: Fixes ref counts in shrink and agree



When a process fails, the fault tolerance (FT) scheme takes a different
path to deal with MPI object reference counts than the existing one. Some
reference counts were not properly set in the FT path, so when configured
with --enable-g=all, some ft tests report leaked context ids and dirty
COMM, GROUP and REQUEST objects on exit.

This patch fixes ft/shrink and ft/agree when configured with
"--enable-g=all". Stack-allocated request, communicator and group objects
will be freed by FT.
Signed-off-by: Wesley Bland <wbland@anl.gov>
parent a3dd5f40
......@@ -499,6 +499,10 @@ int MPIC_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
MPIDI_FUNC_EXIT(MPID_STATE_MPIC_SENDRECV);
return mpi_errno;
fn_fail:
if (send_req_ptr)
MPID_Request_release(send_req_ptr);
if (recv_req_ptr)
MPID_Request_release(recv_req_ptr);
goto fn_exit;
}
......
......@@ -81,7 +81,16 @@ int MPIR_Comm_shrink(MPID_Comm *comm_ptr, MPID_Comm **newcomm_ptr)
new_group_ptr, MPIR_SHRINK_TAG, &errflag);
MPIR_Group_release(new_group_ptr);
if (errflag) MPIU_Object_set_ref(new_group_ptr, 0);
if (errflag) {
if (*newcomm_ptr != NULL && MPIU_Object_get_ref(*newcomm_ptr) > 0) {
MPIU_Object_set_ref(*newcomm_ptr, 1);
MPIR_Comm_release(*newcomm_ptr, 0);
}
if (MPIU_Object_get_ref(new_group_ptr) > 0) {
MPIU_Object_set_ref(new_group_ptr, 1);
MPIR_Group_release(new_group_ptr);
}
}
} while (errflag && ++attempts < 5);
if (errflag && attempts >= 5) goto fn_fail;
......
......@@ -107,6 +107,8 @@ int MPID_Comm_get_all_failed_procs(MPID_Comm *comm_ptr, MPID_Group **failed_grou
bitarray = group_to_bitarray(local_fail, comm_ptr);
bitarray_size = (comm_ptr->local_size / 8) + (comm_ptr->local_size % 8 ? 1 : 0);
remote_bitarray = MPIU_Malloc(sizeof(uint32_t) * bitarray_size);
if (local_fail != MPID_Group_empty)
MPIR_Group_release(local_fail);
/* For now, this will be implemented as a star with rank 0 serving as
* the source */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment