Commit cf13c785 authored by Huiwei Lu
Browse files

Adds a CVAR to enable/disable fault tolerance



MPIR_CVAR_ENABLE_FT is added to enable/disable fault-tolerance-related
code. For performance reasons, FT is disabled by default.

Changes the FT-related LMT RTS code to use this CVAR.
Signed-off-by: Wesley Bland <wbland@anl.gov>
parent af391387
......@@ -9,17 +9,20 @@
/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===
categories:
- name : FT
description : cvars that control behavior of fault tolerance
cvars:
- name : MPIR_CVAR_NEM_LMT_RTS_QUEUE_SIZE
category : CH3
type : int
default : 1024
- name : MPIR_CVAR_ENABLE_FT
category : FT
type : boolean
default : false
class : device
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
description : >-
The initial size of the NEM_LMT_RTS_QUEUE used to track RTS
messages before the LMT setup.
Enable fault tolerance functions
=== END_MPI_T_CVAR_INFO_BLOCK ===
*/
......@@ -123,8 +126,10 @@ int MPID_nem_lmt_RndvSend(MPID_Request **sreq_p, const void * buf, int count,
MPIU_THREAD_CS_ENTER(LMT,);
mpi_errno = vc->ch.lmt_initiate_lmt(vc, &upkt.p, sreq);
if (MPI_SUCCESS == mpi_errno)
MPID_nem_lmt_rtsq_enqueue(&vc->ch.lmt_rts_queue, sreq);
if (MPIR_CVAR_ENABLE_FT) {
if (MPI_SUCCESS == mpi_errno)
MPID_nem_lmt_rtsq_enqueue(&vc->ch.lmt_rts_queue, sreq);
}
MPIU_THREAD_CS_EXIT(LMT,);
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
......@@ -319,8 +324,10 @@ static int pkt_CTS_handler(MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt, MPIDI_msg_sz_t
MPID_Request_get_ptr(cts_pkt->sender_req_id, sreq);
MPIU_THREAD_CS_ENTER(LMT,);
/* Remove the request from the VC RTS queue. */
MPID_nem_lmt_rtsq_search_remove(&vc->ch.lmt_rts_queue, cts_pkt->sender_req_id, &rts_sreq);
if (MPIR_CVAR_ENABLE_FT) {
/* Remove the request from the VC RTS queue. */
MPID_nem_lmt_rtsq_search_remove(&vc->ch.lmt_rts_queue, cts_pkt->sender_req_id, &rts_sreq);
}
MPIU_THREAD_CS_EXIT(LMT,);
sreq->ch.lmt_req_id = cts_pkt->receiver_req_id;
......
......@@ -813,13 +813,15 @@ int MPID_nem_lmt_shm_vc_terminated(MPIDI_VC_t *vc)
}
/* If there is anything in the RTS queue, it needs to be cleared out. */
MPIU_THREAD_CS_ENTER(LMT,);
while (!MPID_nem_lmt_rtsq_empty(vc_ch->lmt_rts_queue)) {
MPID_nem_lmt_rtsq_dequeue(&vc_ch->lmt_rts_queue, &req);
req->status.MPI_ERROR = req_errno;
MPIDI_CH3U_Request_complete(req);
if (MPIR_CVAR_ENABLE_FT) {
MPIU_THREAD_CS_ENTER(LMT,);
while (!MPID_nem_lmt_rtsq_empty(vc_ch->lmt_rts_queue)) {
MPID_nem_lmt_rtsq_dequeue(&vc_ch->lmt_rts_queue, &req);
req->status.MPI_ERROR = req_errno;
MPIDI_CH3U_Request_complete(req);
}
MPIU_THREAD_CS_EXIT(LMT,);
}
MPIU_THREAD_CS_EXIT(LMT,);
/* We empty the vc queue, but don't remove the vc from the global
list. That will eventually happen when lmt_shm_progress()
......
die 4 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false resultTest=TestStatusNoErrors
abort 2 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false xfail=ticket1537
sendalive 4 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false resultTest=TestStatusNoErrors
isendalive 3 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false
multi_isendalive 4 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false
senddead 2 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
recvdead 2 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
isenddead 2 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
irecvdead 2 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
barrier 4 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
gather 4 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
reduce 4 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
bcast 4 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
scatter 4 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
anysource 3 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
revoke_nofail 2 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
shrink 8 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
agree 4 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
die 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false resultTest=TestStatusNoErrors
abort 2 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false xfail=ticket1537
sendalive 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false resultTest=TestStatusNoErrors
isendalive 3 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false
multi_isendalive 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false
senddead 2 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
recvdead 2 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
isenddead 2 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
irecvdead 2 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
barrier 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
gather 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
reduce 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
bcast 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
scatter 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
anysource 3 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
revoke_nofail 2 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
shrink 8 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
agree 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment