Commit 34e57aa8, authored by Valentin Petrov, committed by Charles J Archer
Browse files

OFI: MPIR_Barrier_impl should not be called from MPID_nem_ofi_finalize.



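MPIR_Barrier_impl uses nemesis shared memory, which is already cleaned up at this stage.
However, without any synchronization a hang in the close protocol is possible,
since RTS/CTS/data messages may still be in flight. This change fixes the issue by tracking in-flight RTS/CTS exchanges in a counter and polling in MPID_nem_ofi_finalize until that counter drops to zero.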
Signed-off-by: Charles J Archer <charles.j.archer@intel.com>
parent 4131f0fb
...@@ -57,6 +57,7 @@ typedef struct { ...@@ -57,6 +57,7 @@ typedef struct {
MPID_Request *persistent_req; /* Unexpected request queue */ MPID_Request *persistent_req; /* Unexpected request queue */
MPID_Request *conn_req; /* Connection request */ MPID_Request *conn_req; /* Connection request */
MPIDI_Comm_ops_t comm_ops; MPIDI_Comm_ops_t comm_ops;
int rts_cts_in_flight;
int api_set; int api_set;
} MPID_nem_ofi_global_t; } MPID_nem_ofi_global_t;
......
...@@ -235,6 +235,7 @@ int MPID_nem_ofi_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_ ...@@ -235,6 +235,7 @@ int MPID_nem_ofi_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_
/* Wait for all the ranks to publish */ /* Wait for all the ranks to publish */
/* their business card */ /* their business card */
/* --------------------------------- */ /* --------------------------------- */
gl_data.rts_cts_in_flight = 0;
PMI_Barrier(); PMI_Barrier();
/* --------------------------------- */ /* --------------------------------- */
...@@ -307,12 +308,9 @@ int MPID_nem_ofi_finalize(void) ...@@ -307,12 +308,9 @@ int MPID_nem_ofi_finalize(void)
mpir_errflag_t ret = MPIR_ERR_NONE; mpir_errflag_t ret = MPIR_ERR_NONE;
BEGIN_FUNC(FCNAME); BEGIN_FUNC(FCNAME);
/* --------------------------------------------- */ while(gl_data.rts_cts_in_flight) {
/* Syncronization */ MPID_nem_ofi_poll(0);
/* Barrier across all ranks in this world */ }
/* --------------------------------------------- */
MPIR_Barrier_impl(MPIR_Process.comm_world, &ret);
/* --------------------------------------------- */ /* --------------------------------------------- */
/* Finalize connection management routines */ /* Finalize connection management routines */
/* Cancels any persistent/global requests and */ /* Cancels any persistent/global requests and */
......
...@@ -61,6 +61,7 @@ ...@@ -61,6 +61,7 @@
/* ------------------------------------------------------------------------ */ /* ------------------------------------------------------------------------ */
#define START_COMM() \ #define START_COMM() \
({ \ ({ \
gl_data.rts_cts_in_flight++; \
GET_PGID_AND_SET_MATCH(); \ GET_PGID_AND_SET_MATCH(); \
VC_READY_CHECK(vc); \ VC_READY_CHECK(vc); \
c = 1; \ c = 1; \
...@@ -143,8 +144,9 @@ static int MPID_nem_ofi_data_callback(cq_tagged_entry_t * wc, MPID_Request * sre ...@@ -143,8 +144,9 @@ static int MPID_nem_ofi_data_callback(cq_tagged_entry_t * wc, MPID_Request * sre
vc = REQ_OFI(sreq)->vc; vc = REQ_OFI(sreq)->vc;
MPI_RC(reqFn(vc, sreq, &complete)); MPI_RC(reqFn(vc, sreq, &complete));
} }
} gl_data.rts_cts_in_flight--;
else { break;
case MPID_MSG_RTS:
MPIDI_CH3U_Request_complete(sreq); MPIDI_CH3U_Request_complete(sreq);
} }
END_FUNC_RC(FCNAME); END_FUNC_RC(FCNAME);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment