Commit 89b04336 authored by Darius Buntinas's avatar Darius Buntinas
Browse files

[svn-r7702] Make use of the atomic increment of the completion_count to bump...

[svn-r7702] Make use of the atomic increment of the completion_count to bump the progress engine when a checkpoint is initiated or a failed process is detected.  Also jump out of blocking_recv if the completion_count is bumped.
parent 9e65c23d
......@@ -23,7 +23,7 @@ static inline void MPID_nem_mpich2_enqueue_fastbox (int local_rank);
static inline int MPID_nem_mpich2_sendv_header (MPID_IOV **iov, int *n_iov, MPIDI_VC_t *vc, int *again);
static inline int MPID_nem_recv_seqno_matches (MPID_nem_queue_ptr_t qhead);
static inline int MPID_nem_mpich2_test_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox, int in_blocking_progress);
static inline int MPID_nem_mpich2_blocking_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox);
static inline int MPID_nem_mpich2_blocking_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox, int completions);
static inline int MPID_nem_mpich2_test_recv_wait (MPID_nem_cell_ptr_t *cell, int *in_fbox, int timeout);
static inline int MPID_nem_mpich2_release_cell (MPID_nem_cell_ptr_t cell, MPIDI_VC_t *vc);
static inline void MPID_nem_mpich2_send_seg_header (MPID_Segment *segment, MPIDI_msg_sz_t *segment_first,
......@@ -863,10 +863,9 @@ MPID_nem_mpich2_test_recv_wait (MPID_nem_cell_ptr_t *cell, int *in_fbox, int tim
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static inline int
MPID_nem_mpich2_blocking_recv(MPID_nem_cell_ptr_t *cell, int *in_fbox)
MPID_nem_mpich2_blocking_recv(MPID_nem_cell_ptr_t *cell, int *in_fbox, int completions)
{
int mpi_errno = MPI_SUCCESS;
unsigned completions = OPA_load_int(&MPIDI_CH3I_progress_completion_count);
#ifndef ENABLE_NO_YIELD
int pollcount = 0;
#endif
......@@ -905,8 +904,8 @@ MPID_nem_mpich2_blocking_recv(MPID_nem_cell_ptr_t *cell, int *in_fbox)
mpi_errno = MPID_nem_network_poll(TRUE /* blocking */);
if (mpi_errno) MPIU_ERR_POP (mpi_errno);
if (completions != OPA_load_int(&MPIDI_CH3I_progress_completion_count) || MPID_nem_local_lmt_pending
|| MPIDI_CH3I_active_send[CH3_NORMAL_QUEUE] || MPIDI_CH3I_SendQ_head(CH3_NORMAL_QUEUE))
if (MPID_nem_local_lmt_pending || MPIDI_CH3I_active_send[CH3_NORMAL_QUEUE]
|| MPIDI_CH3I_SendQ_head(CH3_NORMAL_QUEUE))
{
*cell = NULL;
*in_fbox = 0;
......@@ -921,6 +920,12 @@ MPID_nem_mpich2_blocking_recv(MPID_nem_cell_ptr_t *cell, int *in_fbox)
}
++pollcount;
#endif
if (completions != OPA_load_int(&MPIDI_CH3I_progress_completion_count)) {
*cell = NULL;
*in_fbox = 0;
goto exit_l;
}
}
MPID_nem_queue_dequeue (MPID_nem_mem_region.my_recvQ, cell);
......
......@@ -68,9 +68,12 @@ static int ckpt_cb(void *arg)
int rc, ret;
const struct cr_restart_info* ri;
if (MPIDI_Process.my_pg_rank == 0)
if (MPIDI_Process.my_pg_rank == 0) {
MPIDI_nem_ckpt_start_checkpoint = TRUE;
/* poke the progress engine in case we're waiting in a blocking recv */
MPIDI_CH3_Progress_signal_completion();
}
ret = sem_wait(&ckpt_sem);
CHECK_ERR(ret, "sem_wait");
......
......@@ -71,6 +71,8 @@ static qn_ent_t *qn_head = NULL;
/* Signal handler (presumably installed for SIGUSR1 -- the registration is not
 * visible here; confirm).  It records that a signal arrived by incrementing
 * sigusr1_count and then signals a completion to the progress engine.
 * MPIDI_CH3I_Progress() compares sigusr1_count against my_sigusr1_count and,
 * when the former is larger, calls MPIDI_CH3U_Check_for_failed_procs().
 *
 * sig: the delivered signal number; not used by the handler.
 *
 * NOTE(review): both statements execute in signal context.  Confirm that
 * sigusr1_count is declared with a type that is safe to modify from a handler
 * and that MPIDI_CH3_Progress_signal_completion() is async-signal-safe
 * (the commit message describes it as an atomic increment). */
static void sigusr1_handler(int sig)
{
/* count the signal first so the progress engine sees it once it wakes up */
++sigusr1_count;
/* poke the progress engine in case we're waiting in a blocking recv */
MPIDI_CH3_Progress_signal_completion();
}
/* MPIDI_CH3I_Shm_send_progress() this function makes progress sending
......@@ -248,6 +250,12 @@ int MPIDI_CH3I_Progress (MPID_Progress_state *progress_state, int is_blocking)
MPIU_Assert(progress_state != NULL);
}
if (sigusr1_count > my_sigusr1_count) {
my_sigusr1_count = sigusr1_count;
mpi_errno = MPIDI_CH3U_Check_for_failed_procs();
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
}
#ifdef ENABLE_CHECKPOINTING
if (MPIR_PARAM_ENABLE_CKPOINT) {
if (MPIDI_nem_ckpt_start_checkpoint) {
......@@ -333,7 +341,7 @@ int MPIDI_CH3I_Progress (MPID_Progress_state *progress_state, int is_blocking)
#endif
)
{
mpi_errno = MPID_nem_mpich2_blocking_recv(&cell, &in_fbox);
mpi_errno = MPID_nem_mpich2_blocking_recv(&cell, &in_fbox, progress_state->ch.completion_count);
}
else
{
......@@ -436,12 +444,6 @@ int MPIDI_CH3I_Progress (MPID_Progress_state *progress_state, int is_blocking)
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
}
if (sigusr1_count > my_sigusr1_count) {
my_sigusr1_count = sigusr1_count;
mpi_errno = MPIDI_CH3U_Check_for_failed_procs();
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
}
/* in the case of progress_wait, bail out if anything completed (CC-1) */
if (is_blocking) {
int completion_count = OPA_load_int(&MPIDI_CH3I_progress_completion_count);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment