Commit a9d968cc authored by Xin Zhao

Perf-tuning: issue FLUSH, FLUSH ACK, UNLOCK ACK messages only when needed.



When the operation pending list and request lists are all empty, the origin
needs to send a FLUSH message only if it has issued PUT/ACC operations since
the last synchronization call; otherwise the origin does not need to issue a
FLUSH at all and does not need to wait for the FLUSH ACK message.

Similarly, the origin waits for the ACK of the UNLOCK message only if it has
issued PUT/ACC operations since the last synchronization call. The UNLOCK
message itself always needs to be sent, however, because the origin still has
to unlock the target process. This patch avoids issuing unnecessary
FLUSH / FLUSH ACK / UNLOCK ACK messages.
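
The origin-side decision can be summarized with the following minimal,
self-contained sketch. The stand-in types, flag values, and printf calls are
made up for illustration only; the actual implementation lives in
issue_ops_target() / send_unlock_msg() and uses the put_acc_issued flag added
below:

    #include <stdio.h>

    /* Simplified stand-ins for the real MPICH types and flags (hypothetical). */
    enum pkt_flags { FLAG_NONE = 0, FLAG_UNLOCK_NO_ACK = 2048 };

    typedef struct {
        int has_pending_ops;  /* any operations still queued for this target? */
        int put_acc_issued;   /* PUT/ACC issued since the last synchronization call? */
    } target_t;

    /* FLUSH: skip the message (and the wait for FLUSH ACK) when the target is
     * idle and no PUT/ACC went out in this epoch. */
    static void do_flush(const target_t *t)
    {
        if (!t->has_pending_ops && !t->put_acc_issued)
            printf("flush:  skip FLUSH, nothing to ack\n");
        else
            printf("flush:  send FLUSH, wait for FLUSH ACK\n");
    }

    /* UNLOCK: the message is always sent (the target must release the lock),
     * but an ACK is only requested when a PUT/ACC was issued. */
    static void do_unlock(const target_t *t)
    {
        enum pkt_flags flags = FLAG_NONE;
        if (!t->has_pending_ops && !t->put_acc_issued)
            flags = FLAG_UNLOCK_NO_ACK;
        printf("unlock: send UNLOCK with flags=%d, %s\n", (int) flags,
               flags == FLAG_UNLOCK_NO_ACK ? "no ACK expected" : "wait for ACK");
    }

    int main(void)
    {
        target_t idle  = { .has_pending_ops = 0, .put_acc_issued = 0 };
        target_t dirty = { .has_pending_ops = 0, .put_acc_issued = 1 };

        do_flush(&idle);    /* no FLUSH at all */
        do_flush(&dirty);   /* FLUSH + wait for ACK */
        do_unlock(&idle);   /* UNLOCK with UNLOCK_NO_ACK */
        do_unlock(&dirty);  /* UNLOCK + wait for ACK */
        return 0;
    }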
Signed-off-by: Pavan Balaji <balaji@anl.gov>
parent c26861ba
......@@ -118,6 +118,7 @@ static inline MPIDI_RMA_Target_t *MPIDI_CH3I_Win_target_alloc(MPID_Win * win_ptr
e->outstanding_lock = 0;
e->disable_flush_local = 0;
e->win_complete_flag = 0;
e->put_acc_issued = 0;
e->sync.sync_flag = MPIDI_RMA_SYNC_NONE;
e->sync.outstanding_acks = 0;
......
......@@ -88,6 +88,8 @@ typedef struct MPIDI_RMA_Target {
int outstanding_lock;
int disable_flush_local;
int win_complete_flag;
int put_acc_issued;   /* indicates whether a PUT/ACC has been issued in this
                         epoch since the previous synchronization call. */
/* The target structure is free to be cleaned up when all of the
* following conditions hold true:
......
......@@ -118,7 +118,8 @@ typedef enum {
MPIDI_CH3_PKT_FLAG_RMA_EXCLUSIVE = 128,
MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK = 256,
MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_ACK = 512,
MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED = 1024
MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED = 1024,
MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_NO_ACK = 2048
} MPIDI_CH3_Pkt_flags_t;
typedef struct MPIDI_CH3_Pkt_send {
......@@ -551,6 +552,7 @@ typedef struct MPIDI_CH3_Pkt_unlock {
int target_rank;   /* Used in unlock/flush response to look up the
                    * target state at the origin. */
int origin_rank;
MPIDI_CH3_Pkt_flags_t flags;
} MPIDI_CH3_Pkt_unlock_t;
typedef struct MPIDI_CH3_Pkt_flush {
......
......@@ -351,7 +351,6 @@ extern MPIDI_RMA_Pkt_orderings_t *MPIDI_RMA_Pkt_orderings;
int outstanding_locks; /* when issuing multiple lock requests in \
MPI_WIN_LOCK_ALL, this counter keeps track \
of number of locks not being granted yet. */ \
int outstanding_unlocks; \
#ifdef MPIDI_CH3_WIN_DECL
#define MPID_DEV_WIN_DECL \
......
......@@ -60,7 +60,8 @@ static inline int send_lock_msg(int dest, int lock_type, MPID_Win * win_ptr)
#define FUNCNAME send_unlock_msg
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static inline int send_unlock_msg(int dest, MPID_Win * win_ptr)
static inline int send_unlock_msg(int dest, MPID_Win * win_ptr,
MPIDI_CH3_Pkt_flags_t flags)
{
int mpi_errno = MPI_SUCCESS;
MPIDI_CH3_Pkt_t upkt;
......@@ -78,6 +79,7 @@ static inline int send_unlock_msg(int dest, MPID_Win * win_ptr)
MPIDI_Pkt_init(unlock_pkt, MPIDI_CH3_PKT_UNLOCK);
unlock_pkt->target_win_handle = win_ptr->all_win_handles[dest];
unlock_pkt->source_win_handle = win_ptr->handle;
unlock_pkt->flags = flags;
MPIU_THREAD_CS_ENTER(CH3COMM, vc);
mpi_errno = MPIDI_CH3_iStartMsg(vc, unlock_pkt, sizeof(*unlock_pkt), &req);
......@@ -324,14 +326,10 @@ static inline int MPIDI_CH3I_RMA_Handle_flush_ack(MPID_Win * win_ptr, int target
mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, target_rank, &t);
if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
if (t == NULL) {
win_ptr->outstanding_unlocks--;
MPIU_Assert(win_ptr->outstanding_unlocks >= 0);
}
else {
t->sync.outstanding_acks--;
MPIU_Assert(t->sync.outstanding_acks >= 0);
}
t->sync.outstanding_acks--;
MPIU_Assert(t->sync.outstanding_acks >= 0);
t->put_acc_issued = 0; /* reset PUT_ACC_FLAG after FLUSH is completed */
fn_exit:
return mpi_errno;
......
......@@ -233,10 +233,21 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t *targe
target->sync.outstanding_acks--;
MPIU_Assert(target->sync.outstanding_acks == 0);
}
else if (target->read_op_list == NULL &&
target->write_op_list == NULL &&
target->dt_op_list == NULL &&
target->put_acc_issued == 0) {
/* Both the pending list and all waiting lists for
   this target are empty; we do not need to send a
   FLUSH message in that case. */
target->sync.outstanding_acks--;
MPIU_Assert(target->sync.outstanding_acks >= 0);
}
else {
mpi_errno = send_flush_msg(target->target_rank, win_ptr);
if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
}
(*made_progress) = 1;
goto finish_issue;
}
......@@ -247,10 +258,22 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t *targe
target->sync.outstanding_acks--;
MPIU_Assert(target->sync.outstanding_acks == 0);
}
else if (target->read_op_list == NULL &&
target->write_op_list == NULL &&
target->dt_op_list == NULL &&
target->put_acc_issued == 0) {
/* send message to unlock target, but don't need ACK */
mpi_errno = send_unlock_msg(target->target_rank, win_ptr, MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_NO_ACK);
if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
target->sync.outstanding_acks--;
MPIU_Assert(target->sync.outstanding_acks >= 0);
}
else {
mpi_errno = send_unlock_msg(target->target_rank, win_ptr);
mpi_errno = send_unlock_msg(target->target_rank, win_ptr, MPIDI_CH3_PKT_FLAG_NONE);
if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
}
(*made_progress) = 1;
goto finish_issue;
}
......@@ -327,6 +350,8 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t *targe
curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE) {
MPIDI_CH3I_RMA_Ops_append(&(target->write_op_list),
&(target->write_op_list_tail), curr_op);
target->put_acc_issued = 1; /* set PUT_ACC_FLAG when sending
PUT/ACC operation. */
}
else {
MPIDI_CH3I_RMA_Ops_append(&(target->read_op_list),
......
......@@ -1519,9 +1519,11 @@ int MPIDI_CH3_PktHandler_Unlock(MPIDI_VC_t * vc ATTRIBUTE((unused)),
mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rma_msg");
mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, MPIDI_CH3_PKT_FLAG_NONE,
unlock_pkt->source_win_handle);
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
if (!(unlock_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_NO_ACK)) {
mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, MPIDI_CH3_PKT_FLAG_NONE,
unlock_pkt->source_win_handle);
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
}
MPIDI_CH3_Progress_signal_completion();
......
......@@ -1488,8 +1488,6 @@ int MPIDI_Win_unlock_all(MPID_Win * win_ptr)
OPA_read_write_barrier();
}
MPIU_Assert(win_ptr->outstanding_unlocks == 0);
/* Unlock MYSELF and processes on SHM. */
if (!(win_ptr->lock_all_assert & MPI_MODE_NOCHECK)) {
mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
......@@ -1503,8 +1501,7 @@ int MPIDI_Win_unlock_all(MPID_Win * win_ptr)
if (i == rank) continue;
MPIDI_Comm_get_vc(win_ptr->comm_ptr, i, &target_vc);
if (orig_vc->node_id == target_vc->node_id) {
win_ptr->outstanding_unlocks++;
mpi_errno = send_unlock_msg(i, win_ptr);
mpi_errno = send_unlock_msg(i, win_ptr, MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_NO_ACK);
if (mpi_errno != MPI_SUCCESS)
MPIU_ERR_POP(mpi_errno);
}
......@@ -1561,8 +1558,7 @@ int MPIDI_Win_unlock_all(MPID_Win * win_ptr)
continue;
}
win_ptr->outstanding_unlocks++;
mpi_errno = send_unlock_msg(i, win_ptr);
mpi_errno = send_unlock_msg(i, win_ptr, MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_NO_ACK);
if (mpi_errno != MPI_SUCCESS)
MPIU_ERR_POP(mpi_errno);
}
......@@ -1579,12 +1575,12 @@ int MPIDI_Win_unlock_all(MPID_Win * win_ptr)
mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
&remote_completed);
if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
if (!remote_completed || win_ptr->outstanding_unlocks) {
if (!remote_completed) {
mpi_errno = wait_progress_engine();
if (mpi_errno != MPI_SUCCESS)
MPIU_ERR_POP(mpi_errno);
}
} while (!remote_completed || win_ptr->outstanding_unlocks);
} while (!remote_completed);
/* Cleanup all targets on this window. */
mpi_errno = MPIDI_CH3I_RMA_Cleanup_targets_win(win_ptr);
......
......@@ -336,7 +336,6 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
(*win_ptr)->lock_all_assert = 0;
(*win_ptr)->lock_epoch_count = 0;
(*win_ptr)->outstanding_locks = 0;
(*win_ptr)->outstanding_unlocks = 0;
/* Initialize the info flags */
(*win_ptr)->info_args.no_locks = 0;
......
......@@ -169,6 +169,24 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
(*win_ptr)->states.exposure_state != MPIDI_RMA_NONE,
mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
/* 1. Here we must wait until all passive locks are released on this target,
   because for some UNLOCK messages we do not send an ACK back to the origin;
   we must wait until the lock is released so that we can free the window.
   2. We also need to wait until the AT completion counter reaches zero,
   because this counter is incremented every time we encounter a GET-like
   operation; it is possible that when the target enters Win_free the passive
   epoch is not finished yet and there are still GETs in flight on this target.
   3. We also need to wait until the lock queue becomes empty. It is possible
   that some lock requests are still waiting in the queue when the target
   enters Win_free. */
while ((*win_ptr)->current_lock_type != MPID_LOCK_NONE ||
(*win_ptr)->at_completion_counter != 0 ||
(*win_ptr)->lock_queue != NULL) {
mpi_errno = wait_progress_engine();
if (mpi_errno != MPI_SUCCESS)
MPIU_ERR_POP(mpi_errno);
}
if (!(*win_ptr)->shm_allocated) {
/* when SHM is allocated, we already did a global barrier in
MPIDI_CH3_SHM_Win_free, so we do not need to do it again here. */
......