Commit 5324a41f authored by Min Si, committed by Pavan Balaji
Browse files

Expose AM flush ordering and issue per OP flush if unordered.



This patch includes three changes:
(1) Added netmod API get_ordering to allow netmod to expose the network
ordering. A netmod may issue some packets via multiple connections in
parallel if those packets (such as RMA) do not require ordering, and
thus the packets may be unordered. This patch sets the network ordering
in every existing netmod (tcp|mxm|ofi|portals|llc) to true, since all
packets are sent in order over a single connection.
(2) Nemesis exposes the window packet orderings such as AM flush
ordering at init time. It supports ordered packets only when netmod
supports ordered network.
(3) If AM flush is ordered (flush must be finished after all previous
operations), then CH3 RMA only requests FLUSH ACK on the last operation.
Otherwise, CH3 must request per-OP FLUSH ACK to ensure all operations
are remotely completed.
Signed-off-by: Xin Zhao <xinzhao3@illinois.edu>
Signed-off-by: Pavan Balaji <balaji@anl.gov>
parent c83b6b2d
......@@ -27,6 +27,7 @@ typedef int (* MPID_nem_net_module_anysource_improbe_t)(int tag, MPID_Comm *comm
MPID_Request **message, MPI_Status *status);
typedef void (* MPID_nem_net_module_vc_dbg_print_sendq_t)(FILE *stream, MPIDI_VC_t *vc);
typedef int (* MPID_nem_net_module_get_ordering_t)(int *ordering);
typedef struct MPID_nem_netmod_funcs
{
......@@ -45,6 +46,7 @@ typedef struct MPID_nem_netmod_funcs
MPID_nem_net_module_vc_terminate_t vc_terminate;
MPID_nem_net_module_anysource_iprobe_t anysource_iprobe;
MPID_nem_net_module_anysource_improbe_t anysource_improbe;
MPID_nem_net_module_get_ordering_t get_ordering;
} MPID_nem_netmod_funcs_t;
extern MPID_nem_net_module_vc_dbg_print_sendq_t MPID_nem_net_module_vc_dbg_print_sendq;
......
......@@ -81,6 +81,7 @@ int MPID_nem_llc_anysource_iprobe(int tag, MPID_Comm * comm, int context_offset,
MPI_Status * status);
int MPID_nem_llc_anysource_improbe(int tag, MPID_Comm * comm, int context_offset, int *flag,
MPID_Request ** message, MPI_Status * status);
int MPID_nem_llc_get_ordering(int *ordering);
int MPID_nem_llc_iSendContig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr, MPIDI_msg_sz_t hdr_sz,
void *data, MPIDI_msg_sz_t data_sz);
......
......@@ -36,6 +36,7 @@ MPID_nem_netmod_funcs_t MPIDI_nem_llc_funcs = {
.vc_terminate = MPID_nem_llc_vc_terminate,
.anysource_iprobe = MPID_nem_llc_anysource_iprobe,
.anysource_improbe = MPID_nem_llc_anysource_improbe,
.get_ordering = MPID_nem_llc_get_ordering,
};
int MPID_nem_llc_my_llc_rank;
......@@ -221,3 +222,13 @@ int MPID_nem_llc_anysource_improbe(int tag, MPID_Comm * comm, int context_offset
return MPID_nem_llc_improbe(NULL, MPI_ANY_SOURCE, tag, comm, context_offset, flag, message,
status);
}
#undef FUNCNAME
#define FUNCNAME MPID_nem_llc_get_ordering
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/* Report whether this netmod delivers packets in order.
 * LLC sends all packets for a peer over a single connection, so the
 * network ordering is always reported as ordered (1). */
int MPID_nem_llc_get_ordering(int *ordering)
{
    *ordering = 1;      /* single connection per peer => ordered */
    return MPI_SUCCESS;
}
......@@ -26,6 +26,7 @@ int MPID_nem_mxm_connect_to_root(const char *business_card, MPIDI_VC_t * new_vc)
int MPID_nem_mxm_vc_init(MPIDI_VC_t * vc);
int MPID_nem_mxm_vc_destroy(MPIDI_VC_t * vc);
int MPID_nem_mxm_vc_terminate(MPIDI_VC_t * vc);
int MPID_nem_mxm_get_ordering(int *ordering);
/* alternate interface */
int MPID_nem_mxm_iSendContig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr, MPIDI_msg_sz_t hdr_sz,
......
......@@ -71,7 +71,8 @@ MPID_nem_netmod_funcs_t MPIDI_nem_mxm_funcs = {
MPID_nem_mxm_vc_destroy,
MPID_nem_mxm_vc_terminate,
MPID_nem_mxm_anysource_iprobe,
MPID_nem_mxm_anysource_improbe
MPID_nem_mxm_anysource_improbe,
MPID_nem_mxm_get_ordering
};
static MPIDI_Comm_ops_t comm_ops = {
......@@ -388,6 +389,16 @@ int MPID_nem_mxm_vc_terminate(MPIDI_VC_t * vc)
goto fn_exit;
}
#undef FUNCNAME
#define FUNCNAME MPID_nem_mxm_get_ordering
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/* Expose the MXM network ordering to the upper layer.
 * MXM issues all packets over one connection per peer, hence the
 * network is reported as ordered. */
int MPID_nem_mxm_get_ordering(int *ordering)
{
    *ordering = 1;      /* always ordered for this netmod */
    return MPI_SUCCESS;
}
static int _mxm_conf(void)
{
int mpi_errno = MPI_SUCCESS;
......
......@@ -55,4 +55,5 @@ MPID_nem_netmod_funcs_t MPIDI_nem_ofi_funcs = {
MPID_nem_ofi_vc_terminate,
MPID_nem_ofi_anysource_iprobe,
MPID_nem_ofi_anysource_improbe,
MPID_nem_ofi_get_ordering,
};
......@@ -289,6 +289,7 @@ int MPID_nem_ofi_connect_to_root(const char *business_card, MPIDI_VC_t * new_vc)
int MPID_nem_ofi_vc_destroy(MPIDI_VC_t * vc);
int MPID_nem_ofi_cm_init(MPIDI_PG_t * pg_p, int pg_rank);
int MPID_nem_ofi_cm_finalize();
int MPID_nem_ofi_get_ordering(int *ordering);
extern MPID_nem_ofi_global_t gl_data;
extern MPIDI_Comm_ops_t _g_comm_ops;
......
......@@ -326,6 +326,14 @@ int MPID_nem_ofi_finalize(void)
END_FUNC_RC(FCNAME);
}
#undef FCNAME
#define FCNAME DECL_FUNC(MPID_nem_ofi_get_ordering)
/* Expose the OFI network ordering to the upper layer.
 * Packets are sent over a single connection per peer, so the network
 * is reported as ordered. */
int MPID_nem_ofi_get_ordering(int *ordering)
{
    *ordering = 1;      /* always ordered for this netmod */
    return MPI_SUCCESS;
}
static inline int compile_time_checking()
{
OFI_COMPILE_TIME_ASSERT(sizeof(MPID_nem_ofi_vc_t) <= MPID_NEM_VC_NETMOD_AREA_LEN);
......
......@@ -203,6 +203,7 @@ int MPID_nem_ptl_anysource_improbe(int tag, MPID_Comm * comm, int context_offset
void MPID_nem_ptl_anysource_posted(MPID_Request *rreq);
int MPID_nem_ptl_anysource_matched(MPID_Request *rreq);
int MPID_nem_ptl_init_id(MPIDI_VC_t *vc);
int MPID_nem_ptl_get_ordering(int *ordering);
int MPID_nem_ptl_lmt_initiate_lmt(MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *rts_pkt, MPID_Request *req);
int MPID_nem_ptl_lmt_start_recv(MPIDI_VC_t *vc, MPID_Request *rreq, MPID_IOV s_cookie);
......
......@@ -60,7 +60,8 @@ MPID_nem_netmod_funcs_t MPIDI_nem_portals4_funcs = {
vc_destroy,
vc_terminate,
MPID_nem_ptl_anysource_iprobe,
MPID_nem_ptl_anysource_improbe
MPID_nem_ptl_anysource_improbe,
MPID_nem_ptl_get_ordering
};
static MPIDI_Comm_ops_t comm_ops = {
......@@ -647,6 +648,16 @@ int MPID_nem_ptl_init_id(MPIDI_VC_t *vc)
goto fn_exit;
}
#undef FUNCNAME
#define FUNCNAME MPID_nem_ptl_get_ordering
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/* Report the Portals4 network ordering to the upper layer.
 * All packets go out over one connection per peer, so the network
 * is reported as ordered. */
int MPID_nem_ptl_get_ordering(int *ordering)
{
    *ordering = 1;      /* always ordered for this netmod */
    return MPI_SUCCESS;
}
#define CASE_STR(x) case x: return #x
......
......@@ -108,6 +108,7 @@ void MPID_nem_tcp_vc_dbg_print_sendq(FILE *stream, MPIDI_VC_t *vc);
int MPID_nem_tcp_socksm_finalize(void);
int MPID_nem_tcp_socksm_init(void);
int MPID_nem_tcp_vc_terminated(MPIDI_VC_t *vc);
int MPID_nem_tcp_get_ordering(int *ordering);
int MPID_nem_tcp_pkt_unpause_handler(MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt, MPIDI_msg_sz_t *buflen, MPID_Request **rreqp);
......
......@@ -104,7 +104,9 @@ MPID_nem_netmod_funcs_t MPIDI_nem_tcp_funcs = {
MPID_nem_tcp_vc_init,
MPID_nem_tcp_vc_destroy,
MPID_nem_tcp_vc_terminate,
NULL /* anysource iprobe */
NULL, /* anysource iprobe */
NULL, /* anysource_improbe */
MPID_nem_tcp_get_ordering
};
/* in case there are no packet types defined (e.g., they're ifdef'ed out) make sure the array is not zero length */
......@@ -726,3 +728,12 @@ int MPID_nem_tcp_vc_terminated(MPIDI_VC_t *vc)
goto fn_exit;
}
#undef FUNCNAME
#define FUNCNAME MPID_nem_tcp_get_ordering
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/* Report the TCP network ordering to the upper layer.
 * A TCP stream delivers bytes in order and each peer uses a single
 * connection, so the network is reported as ordered. */
int MPID_nem_tcp_get_ordering(int *ordering)
{
    *ordering = 1;      /* single in-order stream per peer */
    return MPI_SUCCESS;
}
......@@ -73,6 +73,41 @@ int MPIDI_CH3_Win_hooks_init(MPIDI_CH3U_Win_hooks_t * win_hooks)
}
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_Win_pkt_orderings_init
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/* Initialize the window packet-ordering flags for the Nemesis channel.
 *
 * Queries the netmod (when it exposes get_ordering) for its network
 * ordering and derives the AM flush ordering from it.  On any error the
 * flag is left at its conservative default (unordered). */
int MPIDI_CH3_Win_pkt_orderings_init(MPIDI_CH3U_Win_pkt_ordering_t * win_pkt_orderings)
{
    int mpi_errno = MPI_SUCCESS;
    int nm_ordered = 0;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_WIN_PKT_ORDERINGS_INIT);
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3_WIN_PKT_ORDERINGS_INIT);

    /* Conservative default: assume unordered until the netmod says otherwise. */
    win_pkt_orderings->am_flush_ordered = 0;

    if (MPID_nem_netmod_func != NULL && MPID_nem_netmod_func->get_ordering != NULL) {
        mpi_errno = MPID_nem_netmod_func->get_ordering(&nm_ordered);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
    }

    if (nm_ordered > 0) {
        /* Guarantee ordered AM flush only on an ordered network, i.e. only
         * when both intra-node and inter-node connections are ordered.
         * Otherwise ordering would have to be tracked per connection, which
         * costs an O(P) structure or per-OP function calls. */
        win_pkt_orderings->am_flush_ordered = 1;
    }

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_WIN_PKT_ORDERINGS_INIT);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_Win_init
#undef FCNAME
......
......@@ -43,3 +43,25 @@ int MPIDI_CH3_Win_hooks_init(MPIDI_CH3U_Win_hooks_t *win_hooks)
return mpi_errno;
}
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_Win_pkt_orderings_init
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/* Initialize the window packet-ordering flags for this channel.
 * This channel always delivers packets in order, so ordered AM flush is
 * unconditionally guaranteed. */
int MPIDI_CH3_Win_pkt_orderings_init(MPIDI_CH3U_Win_pkt_ordering_t * win_pkt_orderings)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_WIN_PKT_ORDERINGS_INIT);
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3_WIN_PKT_ORDERINGS_INIT);

    /* AM flush ordering is always available on this channel. */
    win_pkt_orderings->am_flush_ordered = 1;

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_WIN_PKT_ORDERINGS_INIT);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
......@@ -1149,6 +1149,18 @@ typedef struct {
extern MPIDI_CH3U_Win_hooks_t MPIDI_CH3U_Win_hooks;
typedef struct MPIDI_CH3U_Win_pkt_ordering {
    /* Ordered AM flush.
     * Indicates whether an AM flush is guaranteed to be finished after all
     * previous RMA operations.  It is initialized by Nemesis and used by CH3.
     * Note that a single global flag is used for all targets, including both
     * intra-node and inter-node processes. */
    int am_flush_ordered;
} MPIDI_CH3U_Win_pkt_ordering_t;
extern MPIDI_CH3U_Win_pkt_ordering_t MPIDI_CH3U_Win_pkt_orderings;
/* CH3 and Channel window functions initializers */
int MPIDI_Win_fns_init(MPIDI_CH3U_Win_fns_t *win_fns);
int MPIDI_CH3_Win_fns_init(MPIDI_CH3U_Win_fns_t *win_fns);
......@@ -1156,6 +1168,8 @@ int MPIDI_CH3_Win_fns_init(MPIDI_CH3U_Win_fns_t *win_fns);
/* Channel window hooks initializer */
int MPIDI_CH3_Win_hooks_init(MPIDI_CH3U_Win_hooks_t *win_hooks);
int MPIDI_CH3_Win_pkt_orderings_init(MPIDI_CH3U_Win_pkt_ordering_t * win_pkt_orderings);
/* Default window creation functions provided by CH3 */
int MPIDI_CH3U_Win_create(void *, MPI_Aint, int, MPID_Info *, MPID_Comm *,
MPID_Win **);
......
......@@ -298,14 +298,6 @@ typedef struct MPIDI_Win_basic_info {
MPI_Win win_handle;
} MPIDI_Win_basic_info_t;
typedef struct MPIDI_RMA_Pkt_orderings {
int flush_remote; /* ordered FLUSH, for remote completion */
/* FIXME: in future we should also add local completin
ordering: WAW, WAR, RAW, RAR. */
} MPIDI_RMA_Pkt_orderings_t;
extern MPIDI_RMA_Pkt_orderings_t *MPIDI_RMA_Pkt_orderings;
#define MPIDI_DEV_WIN_DECL \
volatile int at_completion_counter; /* completion counter for operations \
targeting this window */ \
......
......@@ -337,20 +337,32 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * targ
first_op = 0;
}
/* piggyback FLUSH on every OP if ordered flush is not guaranteed. */
if (!MPIDI_CH3U_Win_pkt_orderings.am_flush_ordered)
flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH;
if (curr_op->next == NULL) {
/* piggyback on last OP. */
if (target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH;
target->sync.outstanding_acks++;
if (target->win_complete_flag)
flags |= MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER;
}
else if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
flags |= MPIDI_CH3_PKT_FLAG_RMA_UNLOCK;
/* if piggyback UNLOCK then unset FLUSH (set for every
* operation on out-of-order network). */
flags &= ~MPIDI_CH3_PKT_FLAG_RMA_FLUSH;
}
}
/* only increase ack counter when FLUSH or UNLOCK flag is set,
* but without LOCK piggyback. */
if (((flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
|| (flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK)))
target->sync.outstanding_acks++;
mpi_errno = issue_rma_op(curr_op, win_ptr, target, flags);
if (mpi_errno != MPI_SUCCESS)
MPIU_ERR_POP(mpi_errno);
......
......@@ -41,6 +41,7 @@ MPIDI_Process_t MPIDI_Process = { NULL };
MPIDI_CH3U_SRBuf_element_t * MPIDI_CH3U_SRBuf_pool = NULL;
MPIDI_CH3U_Win_fns_t MPIDI_CH3U_Win_fns = { NULL };
MPIDI_CH3U_Win_hooks_t MPIDI_CH3U_Win_hooks = { NULL };
MPIDI_CH3U_Win_pkt_ordering_t MPIDI_CH3U_Win_pkt_orderings = { 0 };
#undef FUNCNAME
......@@ -193,6 +194,9 @@ int MPID_Init(int *argc, char ***argv, int requested, int *provided,
mpi_errno = MPIDI_CH3U_Recvq_init();
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
/* Ask channel to expose Window packet ordering. */
MPIDI_CH3_Win_pkt_orderings_init(&MPIDI_CH3U_Win_pkt_orderings);
/*
* Initialize the MPI_COMM_WORLD object
*/
......
......@@ -95,7 +95,6 @@ MPIDI_RMA_Op_t *global_rma_op_pool = NULL, *global_rma_op_pool_tail =
NULL, *global_rma_op_pool_start = NULL;
MPIDI_RMA_Target_t *global_rma_target_pool = NULL, *global_rma_target_pool_tail =
NULL, *global_rma_target_pool_start = NULL;
MPIDI_RMA_Pkt_orderings_t *MPIDI_RMA_Pkt_orderings = NULL;
#undef FUNCNAME
#define FUNCNAME MPIDI_RMA_init
......@@ -128,12 +127,6 @@ int MPIDI_RMA_init(void)
&(global_rma_target_pool_start[i]));
}
MPIU_CHKPMEM_MALLOC(MPIDI_RMA_Pkt_orderings, struct MPIDI_RMA_Pkt_orderings *,
sizeof(struct MPIDI_RMA_Pkt_orderings), mpi_errno, "RMA packet orderings");
/* FIXME: here we should let channel to set ordering flags. For now we just set them
* in CH3 layer. */
MPIDI_RMA_Pkt_orderings->flush_remote = 1;
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_RMA_INIT);
return mpi_errno;
......@@ -156,7 +149,6 @@ void MPIDI_RMA_finalize(void)
MPIU_Free(global_rma_op_pool_start);
MPIU_Free(global_rma_target_pool_start);
MPIU_Free(MPIDI_RMA_Pkt_orderings);
MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_RMA_FINALIZE);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment