Commit 38b20e57 authored by Xin Zhao
Browse files

Rewrite all synchronization routines.



We use new algorithms for RMA synchronization
functions and RMA epochs. The old implementation
uses a lazy-issuing algorithm, which queues up
all operations and issues them at the end. This
forbids opportunities to do hardware RMA operations
and can use up all memory resources when we
queue up a large number of operations.

Here we use a new algorithm, which will initialize
the synchronization at the beginning, and issue operations
as soon as the synchronization is finished.
Signed-off-by: Pavan Balaji <balaji@anl.gov>
parent 257faca2
......@@ -16,6 +16,8 @@
extern int MPID_nem_lmt_shm_pending;
extern MPID_nem_cell_ptr_t MPID_nem_prefetched_cell;
extern int num_active_issued_win;
extern int num_passive_win;
static inline int MPID_nem_mpich_send_header (void* buf, int size, MPIDI_VC_t *vc, int *again);
static inline int MPID_nem_mpich_sendv (MPID_IOV **iov, int *n_iov, MPIDI_VC_t *vc, int *again);
......@@ -39,7 +41,8 @@ static inline void MPID_nem_mpich_send_seg (MPID_Segment *segment, MPIDI_msg_sz_
(!MPID_nem_local_lmt_pending && \
!MPIDI_CH3I_shm_active_send && \
!MPIDI_CH3I_Sendq_head(MPIDI_CH3I_shm_sendq) && \
!MPIDU_Sched_are_pending())
!MPIDU_Sched_are_pending() && \
!num_active_issued_win && !num_passive_win)
#undef FUNCNAME
#define FUNCNAME MPID_nem_mpich_send_header
......
......@@ -472,11 +472,13 @@ int MPIDI_CH3I_Progress (MPID_Progress_state *progress_state, int is_blocking)
#endif /* HAVE_LIBHCOLL */
/* make progress on RMA */
if (num_active_issued_win > 0 || num_passive_win > 0) {
mpi_errno = MPIDI_CH3I_RMA_Make_progress_global(&made_progress);
if (mpi_errno)
MPIU_ERR_POP(mpi_errno);
if (made_progress)
MPIDI_CH3_Progress_signal_completion();
}
/* in the case of progress_wait, bail out if anything completed (CC-1) */
if (is_blocking) {
......
......@@ -10,6 +10,8 @@
#include "mpidimpl.h"
#include "ch3usock.h"
extern int num_active_issued_win;
extern int num_passive_win;
/* This is all socket connection definitions */
......
......@@ -96,9 +96,11 @@ static int MPIDI_CH3i_Progress_test(void)
#endif /* HAVE_LIBHCOLL */
/* make progress on RMA */
if (num_active_issued_win > 0 || num_passive_win > 0) {
mpi_errno = MPIDI_CH3I_RMA_Make_progress_global(&made_progress);
if (mpi_errno)
MPIU_ERR_POP(mpi_errno);
}
mpi_errno = MPIDU_Sock_wait(MPIDI_CH3I_sock_set, 0, &event);
......@@ -209,6 +211,7 @@ static int MPIDI_CH3i_Progress_wait(MPID_Progress_state * progress_state)
#endif /* HAVE_LIBHCOLL */
/* make progress on RMA */
if (num_active_issued_win > 0 || num_passive_win > 0) {
mpi_errno = MPIDI_CH3I_RMA_Make_progress_global(&made_progress);
if (mpi_errno)
MPIU_ERR_POP(mpi_errno);
......@@ -216,6 +219,7 @@ static int MPIDI_CH3i_Progress_wait(MPID_Progress_state * progress_state)
MPIDI_CH3_Progress_signal_completion();
break;
}
}
# ifdef MPICH_IS_THREADED
......
......@@ -18,6 +18,8 @@ int MPIDI_CH3I_RMA_Make_progress_win(MPID_Win * win_ptr, int *made_progress);
extern struct MPIDI_RMA_Op *global_rma_op_pool, *global_rma_op_pool_tail, *global_rma_op_pool_start;
extern struct MPIDI_RMA_Target *global_rma_target_pool, *global_rma_target_pool_tail, *global_rma_target_pool_start;
extern int num_active_issued_win;
extern int num_passive_win;
/* MPIDI_CH3I_Win_op_alloc(): get a new op element from op pool and
* initialize it. If we cannot get one, return NULL. */
......
......@@ -353,14 +353,6 @@ struct MPIDI_Win_target_state {
struct MPIDI_RMA_Op *at_rma_ops_list_tail; \
enum MPIDI_Win_epoch_states epoch_state; \
int epoch_count; \
int fence_issued; /* Indicates if fence has been called, and if an \
active target fence epoch is possible. This \
is maintained separately from the epoch state;\
this state must be updated collectively (in \
fence) to ensure that the fence state across \
all processes remains consistent. */ \
MPID_Group *start_group_ptr; /* group passed in MPI_Win_start */ \
int start_assert; /* assert passed to MPI_Win_start */ \
int shm_allocated; /* flag: TRUE iff this window has a shared memory \
region associated with it */ \
struct MPIDI_RMA_Op *op_pool_start; /* start pointer used for freeing */\
......@@ -383,6 +375,16 @@ struct MPIDI_Win_target_state {
int active_req_cnt; /* keep track of number of active requests in \
current epoch, i.e., number of issued but \
incomplete RMA operations. */ \
MPI_Request fence_sync_req; \
MPI_Request *start_req; \
int *start_ranks_in_win_grp; \
int start_grp_size; \
int lock_all_assert; \
int lock_epoch_count; /* number of lock access epoch on this process */ \
int outstanding_locks; /* when issuing multiple lock requests in \
MPI_WIN_LOCK_ALL, this counter keeps track \
of number of locks not being granted yet. */ \
int outstanding_unlocks; \
#ifdef MPIDI_CH3_WIN_DECL
#define MPID_DEV_WIN_DECL \
......
......@@ -124,6 +124,9 @@ static inline int check_window_state(MPID_Win *win_ptr, int *made_progress, int
MPID_Request_release(fence_req_ptr);
win_ptr->fence_sync_req = MPI_REQUEST_NULL;
num_active_issued_win--;
MPIU_Assert(num_active_issued_win >= 0);
(*made_progress) = 1;
}
else {
......@@ -137,6 +140,9 @@ static inline int check_window_state(MPID_Win *win_ptr, int *made_progress, int
we do not create PSCW requests on window. */
win_ptr->states.access_state = MPIDI_RMA_PSCW_GRANTED;
num_active_issued_win--;
MPIU_Assert(num_active_issued_win >= 0);
(*made_progress) = 1;
}
else {
......@@ -157,6 +163,9 @@ static inline int check_window_state(MPID_Win *win_ptr, int *made_progress, int
MPIU_Assert(i == win_ptr->start_grp_size);
win_ptr->states.access_state = MPIDI_RMA_PSCW_GRANTED;
num_active_issued_win--;
MPIU_Assert(num_active_issued_win >= 0);
(*made_progress) = 1;
MPIU_Free(win_ptr->start_req);
......
This diff is collapsed.
......@@ -30,6 +30,8 @@ cvars:
MPIU_THREADSAFE_INIT_DECL(initRMAoptions);
MPIDI_RMA_Win_list_t *MPIDI_RMA_Win_list = NULL, *MPIDI_RMA_Win_list_tail = NULL;
int num_active_issued_win = 0;
int num_passive_win = 0;
static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
MPID_Comm * comm_ptr, MPID_Win ** win_ptr);
......@@ -304,7 +306,6 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
MPIU_Object_set_ref(*win_ptr, 1);
(*win_ptr)->fence_issued = 0;
/* (*win_ptr)->errhandler is set by upper level; */
/* (*win_ptr)->base is set by caller; */
(*win_ptr)->size = size;
......@@ -312,8 +313,6 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
(*win_ptr)->create_flavor = create_flavor;
(*win_ptr)->model = model;
(*win_ptr)->attributes = NULL;
(*win_ptr)->start_group_ptr = NULL;
(*win_ptr)->start_assert = 0;
(*win_ptr)->comm_ptr = win_comm_ptr;
(*win_ptr)->at_completion_counter = 0;
......@@ -334,6 +333,14 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
(*win_ptr)->non_empty_slots = 0;
(*win_ptr)->posted_ops_cnt = 0;
(*win_ptr)->active_req_cnt = 0;
(*win_ptr)->fence_sync_req = MPI_REQUEST_NULL;
(*win_ptr)->start_req = NULL;
(*win_ptr)->start_ranks_in_win_grp = NULL;
(*win_ptr)->start_grp_size = 0;
(*win_ptr)->lock_all_assert = 0;
(*win_ptr)->lock_epoch_count = 0;
(*win_ptr)->outstanding_locks = 0;
(*win_ptr)->outstanding_unlocks = 0;
/* Initialize the passive target lock state */
MPIU_CHKPMEM_MALLOC((*win_ptr)->targets, struct MPIDI_Win_target_state *,
......
......@@ -147,6 +147,20 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FREE);
/* it is possible that there is a IBARRIER in MPI_WIN_FENCE with
MODE_NOPRECEDE not being completed, we let the progress engine
to delete its request when it is completed. */
if ((*win_ptr)->fence_sync_req != MPI_REQUEST_NULL) {
MPID_Request *req_ptr;
MPID_Request_get_ptr((*win_ptr)->fence_sync_req, req_ptr);
MPID_Request_release(req_ptr);
(*win_ptr)->fence_sync_req = MPI_REQUEST_NULL;
(*win_ptr)->states.access_state = MPIDI_RMA_NONE;
}
if ((*win_ptr)->states.access_state == MPIDI_RMA_FENCE_GRANTED)
(*win_ptr)->states.access_state = MPIDI_RMA_NONE;
MPIU_ERR_CHKANDJUMP((*win_ptr)->states.access_state != MPIDI_RMA_NONE ||
(*win_ptr)->states.exposure_state != MPIDI_RMA_NONE,
mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment