Commit 34e57aa8 authored by Valentin Petrov, committed by Charles J Archer
Browse files

OFI: MPIR_Barrier_impl should not be called from MPID_nem_ofi_finalize.



MPIR_Barrier_impl uses nemesis shared memory, which is already cleaned up at this stage.
However, without any synchronization a hang in the close protocol is possible,
since RTS/CTS/data messages may still be in flight. This change fixes the issue:
finalize now polls until the count of in-flight RTS/CTS exchanges drops to zero.
Signed-off-by: Charles J Archer <charles.j.archer@intel.com>
parent 4131f0fb
......@@ -57,6 +57,7 @@ typedef struct {
MPID_Request *persistent_req; /* Unexpected request queue */
MPID_Request *conn_req; /* Connection request */
MPIDI_Comm_ops_t comm_ops;
int rts_cts_in_flight;
int api_set;
} MPID_nem_ofi_global_t;
......
......@@ -235,6 +235,7 @@ int MPID_nem_ofi_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_
/* Wait for all the ranks to publish */
/* their business card */
/* --------------------------------- */
gl_data.rts_cts_in_flight = 0;
PMI_Barrier();
/* --------------------------------- */
......@@ -307,12 +308,9 @@ int MPID_nem_ofi_finalize(void)
mpir_errflag_t ret = MPIR_ERR_NONE;
BEGIN_FUNC(FCNAME);
/* --------------------------------------------- */
/* Syncronization */
/* Barrier across all ranks in this world */
/* --------------------------------------------- */
MPIR_Barrier_impl(MPIR_Process.comm_world, &ret);
while(gl_data.rts_cts_in_flight) {
MPID_nem_ofi_poll(0);
}
/* --------------------------------------------- */
/* Finalize connection management routines */
/* Cancels any persistent/global requests and */
......
......@@ -60,17 +60,18 @@
/* v v */
/* ------------------------------------------------------------------------ */
#define START_COMM() \
({ \
GET_PGID_AND_SET_MATCH(); \
VC_READY_CHECK(vc); \
c = 1; \
MPID_cc_incr(sreq->cc_ptr, &c); \
MPID_cc_incr(sreq->cc_ptr, &c); \
REQ_OFI(sreq)->event_callback = MPID_nem_ofi_data_callback; \
REQ_OFI(sreq)->pack_buffer = pack_buffer; \
REQ_OFI(sreq)->pack_buffer_size = pkt_len; \
REQ_OFI(sreq)->vc = vc; \
REQ_OFI(sreq)->tag = match_bits; \
({ \
gl_data.rts_cts_in_flight++; \
GET_PGID_AND_SET_MATCH(); \
VC_READY_CHECK(vc); \
c = 1; \
MPID_cc_incr(sreq->cc_ptr, &c); \
MPID_cc_incr(sreq->cc_ptr, &c); \
REQ_OFI(sreq)->event_callback = MPID_nem_ofi_data_callback; \
REQ_OFI(sreq)->pack_buffer = pack_buffer; \
REQ_OFI(sreq)->pack_buffer_size = pkt_len; \
REQ_OFI(sreq)->vc = vc; \
REQ_OFI(sreq)->tag = match_bits; \
\
MPID_nem_ofi_create_req(&cts_req, 1); \
cts_req->dev.OnDataAvail = NULL; \
......@@ -143,8 +144,9 @@ static int MPID_nem_ofi_data_callback(cq_tagged_entry_t * wc, MPID_Request * sre
vc = REQ_OFI(sreq)->vc;
MPI_RC(reqFn(vc, sreq, &complete));
}
}
else {
gl_data.rts_cts_in_flight--;
break;
case MPID_MSG_RTS:
MPIDI_CH3U_Request_complete(sreq);
}
END_FUNC_RC(FCNAME);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment