Commit c76aa786 authored by Xin Zhao's avatar Xin Zhao Committed by Pavan Balaji
Browse files

Bug-fix: always waiting for remote completion in Win_unlock.



The original implementation includes an optimization which
allows Win_unlock for exclusive lock to return without
waiting for remote completion. This relys on the
assumption that window memory on target process will not
be accessed by a third party until that target process
finishes all RMA operations and grants the lock to other
processes. However, this assumption is not correct if user
uses assert MPI_MODE_NOCHECK. Consider the following code:

          P0                              P1           P2
    MPI_Win_lock(P1, NULL, exclusive);
    MPI_Put(X);
    MPI_Win_unlock(P1, exclusive);
    MPI_Send (P2);                                MPI_Recv(P0);
                                                  MPI_Win_lock(P1, MODE_NOCHECK, exclusive);
                                                  MPI_Get(X);
                                                  MPI_Win_unlock(P1, exclusive);

Both P0 and P2 issue exclusive lock to P1, and P2 uses assert
MPI_MODE_NOCHECK because the lock should be granted to P2 after
synchronization between P2 and P0. However, in the original
implementation, GET operation on P2 might not get the updated
value since Win_unlock on P0 return without waiting for remote
completion.

In this patch we delete this optimization. In Win_free, since every
Win_unlock guarantees the remote completion, target process no
longer needs to do additional counting works to detect target-side
completion, but only needs to do a global barrier.
Signed-off-by: Pavan Balaji's avatarPavan Balaji <balaji@anl.gov>
parent 9164deed
......@@ -63,6 +63,7 @@ fn_fail:
int MPIDI_CH3_SHM_Win_free(MPID_Win **win_ptr)
{
int mpi_errno = MPI_SUCCESS;
int errflag = FALSE;
MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_SHM_WIN_FREE);
MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3_SHM_WIN_FREE);
......@@ -72,8 +73,9 @@ int MPIDI_CH3_SHM_Win_free(MPID_Win **win_ptr)
goto fn_exit;
}
mpi_errno = MPIDI_CH3I_Wait_for_pt_ops_finish(*win_ptr);
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
mpi_errno = MPIR_Barrier_impl((*win_ptr)->comm_ptr, &errflag);
if (mpi_errno)
MPIU_ERR_POP(mpi_errno);
/* Free shared memory region */
if ((*win_ptr)->shm_allocated) {
......
......@@ -338,11 +338,6 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
comm_size*sizeof(MPI_Win),
mpi_errno, "(*win_ptr)->all_win_handles");
MPIU_CHKPMEM_MALLOC((*win_ptr)->pt_rma_puts_accs, int *,
comm_size*sizeof(int),
mpi_errno, "(*win_ptr)->pt_rma_puts_accs");
for (i=0; i<comm_size; i++) (*win_ptr)->pt_rma_puts_accs[i] = 0;
/* get the sizes of the windows and window objectsof
all processes. allocate temp. buffer for communication */
MPIU_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 3*comm_size*sizeof(MPI_Aint), mpi_errno, "tmp_buf");
......
......@@ -277,12 +277,6 @@ struct MPIDI_Win_target_state {
volatile int shared_lock_ref_cnt; \
struct MPIDI_Win_lock_queue volatile *lock_queue; /* list of unsatisfied locks */ \
\
int *pt_rma_puts_accs; /* array containing the no. of passive target\
puts/accums issued from this process to other \
processes. */ \
volatile int my_pt_rma_puts_accs; /* no. of passive target puts/accums \
that this process has \
completed as target */ \
MPI_Aint *sizes; /* array of sizes of all windows */ \
struct MPIDI_Win_info_args info_args; \
struct MPIDI_Win_target_state *targets; /* Target state and ops \
......
......@@ -1064,57 +1064,6 @@ static inline int MPIDI_CH3I_Shm_fop_op(const void *origin_addr, void *result_ad
}
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Wait_for_pt_ops_finish
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Wait_for_pt_ops_finish(MPID_Win * win_ptr)
{
int mpi_errno = MPI_SUCCESS, total_pt_rma_puts_accs;
MPID_Comm *comm_ptr;
int errflag = FALSE;
MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_WAIT_FOR_PT_OPS_FINISH);
MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_WAIT_FOR_PT_OPS_FINISH);
comm_ptr = win_ptr->comm_ptr;
MPIR_T_PVAR_TIMER_START(RMA, rma_winfree_rs);
mpi_errno = MPIR_Reduce_scatter_block_impl(win_ptr->pt_rma_puts_accs,
&total_pt_rma_puts_accs, 1,
MPI_INT, MPI_SUM, comm_ptr, &errflag);
if (mpi_errno) {
MPIU_ERR_POP(mpi_errno);
}
MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
MPIR_T_PVAR_TIMER_END(RMA, rma_winfree_rs);
if (total_pt_rma_puts_accs != win_ptr->my_pt_rma_puts_accs) {
MPID_Progress_state progress_state;
/* poke the progress engine until the two are equal */
MPIR_T_PVAR_TIMER_START(RMA, rma_winfree_complete);
MPID_Progress_start(&progress_state);
while (total_pt_rma_puts_accs != win_ptr->my_pt_rma_puts_accs) {
mpi_errno = MPID_Progress_wait(&progress_state);
/* --BEGIN ERROR HANDLING-- */
if (mpi_errno != MPI_SUCCESS) {
MPID_Progress_end(&progress_state);
MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**winnoprogress");
}
/* --END ERROR HANDLING-- */
}
MPID_Progress_end(&progress_state);
MPIR_T_PVAR_TIMER_END(RMA, rma_winfree_complete);
}
fn_exit:
MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_WAIT_FOR_PT_OPS_FINISH);
return mpi_errno;
fn_fail:
goto fn_exit;
}
#undef FUNCNAME
#undef FCNAME
......
......@@ -1037,9 +1037,6 @@ int MPIDI_CH3I_Release_lock(MPID_Win *win_ptr)
MPIDI_CH3_Finish_rma_op_target(). That call cannot be used
here, because it would enter this function recursively. */
/* increment counter */
win_ptr->my_pt_rma_puts_accs++;
mpi_errno =
MPIDI_CH3I_Send_pt_rma_done_pkt(lock_queue->vc, win_ptr,
lock_queue->source_win_handle);
......
......@@ -40,6 +40,7 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
int mpi_errno = MPI_SUCCESS;
int in_use;
MPID_Comm *comm_ptr;
int errflag = FALSE;
MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FREE);
MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FREE);
......@@ -47,9 +48,13 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
MPIU_ERR_CHKANDJUMP((*win_ptr)->epoch_state != MPIDI_EPOCH_NONE,
mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
mpi_errno = MPIDI_CH3I_Wait_for_pt_ops_finish(*win_ptr);
if (mpi_errno)
MPIU_ERR_POP(mpi_errno);
if (!(*win_ptr)->shm_allocated) {
/* when SHM is allocated, we already did a global barrier in
MPIDI_CH3_SHM_Win_free, so we do not need to do it again here. */
mpi_errno = MPIR_Barrier_impl((*win_ptr)->comm_ptr, &errflag);
if (mpi_errno)
MPIU_ERR_POP(mpi_errno);
}
comm_ptr = (*win_ptr)->comm_ptr;
mpi_errno = MPIR_Comm_free_impl(comm_ptr);
......@@ -61,7 +66,6 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
MPIU_Free((*win_ptr)->sizes);
MPIU_Free((*win_ptr)->disp_units);
MPIU_Free((*win_ptr)->all_win_handles);
MPIU_Free((*win_ptr)->pt_rma_puts_accs);
/* Free the attached buffer for windows created with MPI_Win_allocate() */
if ((*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE ||
......
......@@ -932,16 +932,12 @@ static int create_datatype(const MPIDI_RMA_dtype_info * dtype_info,
const void *o_addr, int o_count,
MPI_Datatype o_datatype, MPID_Datatype ** combined_dtp);
/* Issue an RMA operation -- Before calling this macro, you must define the
* MPIDI_CH3I_TRACK_RMA_WRITE helper macro. This macro defines any extra action
* that should be taken when a write (put/acc) operation is encountered. */
#define MPIDI_CH3I_ISSUE_RMA_OP(op_ptr_, win_ptr_, flags_, source_win_handle_, target_win_handle_,err_) \
do { \
switch ((op_ptr_)->type) \
{ \
case (MPIDI_RMA_PUT): \
case (MPIDI_RMA_ACCUMULATE): \
MPIDI_CH3I_TRACK_RMA_WRITE(op_ptr_, win_ptr_); \
(err_) = send_rma_msg((op_ptr_), (win_ptr_), (flags_), (source_win_handle_), \
(target_win_handle_), &(op_ptr_)->dtype_info, \
&(op_ptr_)->dataloop, &(op_ptr_)->request); \
......@@ -962,7 +958,6 @@ static int create_datatype(const MPIDI_RMA_dtype_info * dtype_info,
(target_win_handle_), &(op_ptr_)->dtype_info, \
&(op_ptr_)->dataloop, &(op_ptr_)->request); \
} else { \
MPIDI_CH3I_TRACK_RMA_WRITE(op_ptr_, win_ptr_); \
(err_) = send_rma_msg((op_ptr_), (win_ptr_), (flags_), (source_win_handle_), \
(target_win_handle_), &(op_ptr_)->dtype_info, \
&(op_ptr_)->dataloop, &(op_ptr_)->request); \
......@@ -970,7 +965,6 @@ static int create_datatype(const MPIDI_RMA_dtype_info * dtype_info,
if (err_) { MPIU_ERR_POP(err_); } \
break; \
case MPIDI_RMA_ACC_CONTIG: \
MPIDI_CH3I_TRACK_RMA_WRITE(op_ptr_, win_ptr_); \
(err_) = send_contig_acc_msg((op_ptr_), (win_ptr_), (flags_), \
(source_win_handle_), (target_win_handle_), \
&(op_ptr_)->request); \
......@@ -985,7 +979,6 @@ static int create_datatype(const MPIDI_RMA_dtype_info * dtype_info,
break; \
case (MPIDI_RMA_COMPARE_AND_SWAP): \
case (MPIDI_RMA_FETCH_AND_OP): \
MPIDI_CH3I_TRACK_RMA_WRITE(op_ptr_, win_ptr_); \
(err_) = send_immed_rmw_msg((op_ptr_), (win_ptr_), (flags_), \
(source_win_handle_), (target_win_handle_), \
&(op_ptr_)->request); \
......@@ -1022,31 +1015,6 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
win_ptr->epoch_state != MPIDI_EPOCH_FENCE,
mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
/* In case this process was previously the target of passive target rma
* operations, we need to take care of the following...
* Since we allow MPI_Win_unlock to return without a done ack from
* the target in the case of multiple rma ops and exclusive lock,
* we need to check whether there is a lock on the window, and if
* there is a lock, poke the progress engine until the operartions
* have completed and the lock is released. */
if (win_ptr->current_lock_type != MPID_LOCK_NONE) {
MPIR_T_PVAR_TIMER_START(RMA, rma_winfence_clearlock);
MPID_Progress_start(&progress_state);
while (win_ptr->current_lock_type != MPID_LOCK_NONE) {
/* poke the progress engine */
mpi_errno = MPID_Progress_wait(&progress_state);
/* --BEGIN ERROR HANDLING-- */
if (mpi_errno != MPI_SUCCESS) {
MPID_Progress_end(&progress_state);
MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**winnoprogress");
}
/* --END ERROR HANDLING-- */
MPIR_T_PVAR_COUNTER_INC(RMA, rma_winfence_clearlock_aux, 1);
}
MPID_Progress_end(&progress_state);
MPIR_T_PVAR_TIMER_END(RMA, rma_winfence_clearlock);
}
/* Note that the NOPRECEDE and NOSUCCEED must be specified by all processes
* in the window's group if any specify it */
if (assert & MPI_MODE_NOPRECEDE) {
......@@ -1169,10 +1137,8 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
source_win_handle = win_ptr->handle;
target_win_handle = win_ptr->all_win_handles[curr_ptr->target_rank];
#define MPIDI_CH3I_TRACK_RMA_WRITE(op_ptr_, win_ptr_) /* Not used by active mode */
MPIDI_CH3I_ISSUE_RMA_OP(curr_ptr, win_ptr, flags,
source_win_handle, target_win_handle, mpi_errno);
#undef MPIDI_CH3I_TRACK_RMA_WRITE
i++;
curr_ops_cnt[curr_ptr->target_rank]++;
......@@ -2114,33 +2080,6 @@ int MPIDI_Win_post(MPID_Group * post_grp_ptr, int assert, MPID_Win * win_ptr)
* synchronization, we cannot do this because fence_issued must be
* updated collectively */
/* In case this process was previously the target of passive target rma
* operations, we need to take care of the following...
* Since we allow MPI_Win_unlock to return without a done ack from
* the target in the case of multiple rma ops and exclusive lock,
* we need to check whether there is a lock on the window, and if
* there is a lock, poke the progress engine until the operations
* have completed and the lock is therefore released. */
if (win_ptr->current_lock_type != MPID_LOCK_NONE) {
MPID_Progress_state progress_state;
MPIR_T_PVAR_TIMER_START(RMA, rma_winpost_clearlock);
/* poke the progress engine */
MPID_Progress_start(&progress_state);
while (win_ptr->current_lock_type != MPID_LOCK_NONE) {
mpi_errno = MPID_Progress_wait(&progress_state);
/* --BEGIN ERROR HANDLING-- */
if (mpi_errno != MPI_SUCCESS) {
MPID_Progress_end(&progress_state);
MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**winnoprogress");
}
/* --END ERROR HANDLING-- */
MPIR_T_PVAR_COUNTER_INC(RMA, rma_winpost_clearlock_aux, 1);
}
MPID_Progress_end(&progress_state);
MPIR_T_PVAR_TIMER_END(RMA, rma_winpost_clearlock);
}
post_grp_size = post_grp_ptr->size;
/* Ensure ordering of load/store operations. */
......@@ -2395,33 +2334,6 @@ int MPIDI_Win_start(MPID_Group * group_ptr, int assert, MPID_Win * win_ptr)
* synchronization, we cannot do this because fence_issued must be
* updated collectively */
/* In case this process was previously the target of passive target rma
* operations, we need to take care of the following...
* Since we allow MPI_Win_unlock to return without a done ack from
* the target in the case of multiple rma ops and exclusive lock,
* we need to check whether there is a lock on the window, and if
* there is a lock, poke the progress engine until the operations
* have completed and the lock is therefore released. */
if (win_ptr->current_lock_type != MPID_LOCK_NONE) {
MPID_Progress_state progress_state;
MPIR_T_PVAR_TIMER_START(RMA, rma_winstart_clearlock);
/* poke the progress engine */
MPID_Progress_start(&progress_state);
while (win_ptr->current_lock_type != MPID_LOCK_NONE) {
mpi_errno = MPID_Progress_wait(&progress_state);
/* --BEGIN ERROR HANDLING-- */
if (mpi_errno != MPI_SUCCESS) {
MPID_Progress_end(&progress_state);
MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**winnoprogress");
}
/* --END ERROR HANDLING-- */
MPIR_T_PVAR_COUNTER_INC(RMA, rma_winstart_clearlock_aux, 1);
}
MPID_Progress_end(&progress_state);
MPIR_T_PVAR_TIMER_END(RMA, rma_winstart_clearlock);
}
win_ptr->start_group_ptr = group_ptr;
MPIR_Group_add_ref(group_ptr);
win_ptr->start_assert = assert;
......@@ -2570,10 +2482,8 @@ int MPIDI_Win_complete(MPID_Win * win_ptr)
source_win_handle = win_ptr->handle;
target_win_handle = win_ptr->all_win_handles[curr_ptr->target_rank];
#define MPIDI_CH3I_TRACK_RMA_WRITE(op_ptr_, win_ptr_) /* Not used by active mode */
MPIDI_CH3I_ISSUE_RMA_OP(curr_ptr, win_ptr, flags,
source_win_handle, target_win_handle, mpi_errno);
#undef MPIDI_CH3I_TRACK_RMA_WRITE
i++;
curr_ops_cnt[curr_ptr->target_rank]++;
......@@ -3512,19 +3422,7 @@ static int do_passive_target_rma(MPID_Win * win_ptr, int target_rank,
(win_ptr->targets[target_rank].remote_lock_state == MPIDI_CH3_WIN_LOCK_CALLED &&
win_ptr->targets[target_rank].remote_lock_assert & MPI_MODE_NOCHECK));
if (win_ptr->targets[target_rank].remote_lock_mode == MPI_LOCK_EXCLUSIVE &&
win_ptr->targets[target_rank].remote_lock_state != MPIDI_CH3_WIN_LOCK_CALLED &&
win_ptr->targets[target_rank].remote_lock_state != MPIDI_CH3_WIN_LOCK_FLUSH) {
/* Exclusive lock already held -- no need to wait for rma done pkt at
* the end. This is because the target won't grant another process
* access to the window until all of our operations complete at that
* target. Thus, there is no third-party communication issue.
* However, flush still needs to wait for rma done, otherwise result
* may be unknown if user reads the updated location from a shared window of
* another target process after this flush. */
*wait_for_rma_done_pkt = 0;
}
else if (MPIDI_CH3I_RMA_Ops_isempty(&win_ptr->targets[target_rank].rma_ops_list)) {
if (MPIDI_CH3I_RMA_Ops_isempty(&win_ptr->targets[target_rank].rma_ops_list)) {
/* The ops list is empty -- NOTE: we assume this is because the epoch
* was flushed. Any issued ops are already remote complete; done
* packet is not needed for safe third party communication. */
......@@ -3635,15 +3533,8 @@ static int do_passive_target_rma(MPID_Win * win_ptr, int target_rank,
source_win_handle = win_ptr->handle;
}
/* Track passive target write operations. This is used during Win_free
* to ensure that all writes to a given target have completed at that
* process before the window is freed. */
#define MPIDI_CH3I_TRACK_RMA_WRITE(op_, win_ptr_) \
do { (win_ptr_)->pt_rma_puts_accs[(op_)->target_rank]++; } while (0)
MPIDI_CH3I_ISSUE_RMA_OP(curr_ptr, win_ptr, flags, source_win_handle,
target_win_handle, mpi_errno);
#undef MPIDI_CH3I_TRACK_RMA_WRITE
/* If the request is null, we can remove it immediately */
if (!curr_ptr->request) {
......@@ -3969,8 +3860,6 @@ static int send_lock_put_or_acc(MPID_Win * win_ptr, int target_rank)
rma_op = MPIDI_CH3I_RMA_Ops_head(&win_ptr->targets[target_rank].rma_ops_list);
win_ptr->pt_rma_puts_accs[rma_op->target_rank]++;
if (rma_op->type == MPIDI_RMA_PUT) {
MPIDI_Pkt_init(lock_put_unlock_pkt, MPIDI_CH3_PKT_LOCK_PUT_UNLOCK);
lock_put_unlock_pkt->flags = MPIDI_CH3_PKT_FLAG_RMA_LOCK |
......@@ -6001,12 +5890,6 @@ int MPIDI_CH3_Finish_rma_op_target(MPIDI_VC_t * vc, MPID_Win * win_ptr, int is_r
/* This function should be called by the target process after each RMA
* operation is completed, to update synchronization state. */
/* If this is a passive target RMA update operation, increment counter. This is
* needed in Win_free to ensure that all ops are completed before a window
* is freed. */
if (win_ptr->current_lock_type != MPID_LOCK_NONE && is_rma_update)
win_ptr->my_pt_rma_puts_accs++;
/* Last RMA operation from source. If active target RMA, decrement window
* counter. */
if (flags & MPIDI_CH3_PKT_FLAG_RMA_AT_COMPLETE) {
......
......@@ -71,11 +71,6 @@ int MPIDI_CH3U_Win_create_gather( void *base, MPI_Aint size, int disp_unit,
comm_size*sizeof(MPI_Win),
mpi_errno, "(*win_ptr)->all_win_handles");
MPIU_CHKPMEM_MALLOC((*win_ptr)->pt_rma_puts_accs, int *,
comm_size*sizeof(int),
mpi_errno, "(*win_ptr)->pt_rma_puts_accs");
for (i=0; i<comm_size; i++) (*win_ptr)->pt_rma_puts_accs[i] = 0;
/* get the addresses of the windows, window objects, and completion
counters of all processes. allocate temp. buffer for communication */
MPIU_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4*comm_size*sizeof(MPI_Aint),
......
......@@ -307,8 +307,6 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
(*win_ptr)->current_lock_type = MPID_LOCK_NONE;
(*win_ptr)->shared_lock_ref_cnt = 0;
(*win_ptr)->lock_queue = NULL;
(*win_ptr)->pt_rma_puts_accs = NULL;
(*win_ptr)->my_pt_rma_puts_accs = 0;
(*win_ptr)->epoch_state = MPIDI_EPOCH_NONE;
(*win_ptr)->epoch_count = 0;
(*win_ptr)->at_rma_ops_list = NULL;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment