Commit 3f6f6aba authored by David Goodell

[svn-r1051] Several logical changes that are unfortunately too much work to disentangle, test, and commit separately, so here they are all together:

1) Add the first cut of LL/SC support for BG/P.
2) Fix a couple of minor issues in the atomic primitives, such as a "volatile" in
   the wrong place.
3) Remove all the nemesis atomic inline assembly and memory barriers and replace
   them with the new atomic primitives.  This is a baby-step towards switching
   nemesis over to the new atomic abstractions.
4) Fix a bug in the fastbox handling code that was aggravated by dynamic process
   changes.  Also, re-enable a recently disabled fastbox optimization.
parent 3cfd3315
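For context on change (1): BG/P's PowerPC cores expose load-linked/store-conditional
through the lwarx/stwcx. instruction pair. A minimal sketch of an LL/SC-based
decrement-and-test in that style (illustrative only; the function name and asm
constraints are assumptions, not the committed BG/P code):

static inline int decr_and_test_llsc(volatile int *ptr)
{
    int val;
    __asm__ __volatile__ ("1: lwarx %0,0,%1\n\t"  /* load-linked */
                          "addi %0,%0,-1\n\t"     /* decrement */
                          "stwcx. %0,0,%1\n\t"    /* store-conditional */
                          "bne- 1b"               /* retry if the reservation was lost */
                          : "=&r" (val)
                          : "r" (ptr)
                          : "cr0", "memory");
    return (0 == val);
}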
@@ -75,17 +75,6 @@ static inline int MPIDU_Ref_release_and_test(volatile int *ptr)
} while (oldv != MPIDU_Atomic_cas_int(ptr, oldv, newv));
retval = (0 == newv);
goto fn_exit;
/* sketch of an LL/SC impl
#elif defined(ATOMIC_DECR_AND_TEST_IS_EMULATED) && defined(ATOMIC_LL_SC_SUPPORTED)
int val;
do {
val = LL(ptr);
--val;
} while (!SC(ptr, val));
retval = (0 == val);
goto fn_exit;
*/
#else
retval = MPIDU_Atomic_decr_and_test(ptr);
goto fn_exit;
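For reference, the CAS-based emulation this hunk belongs to has the shape below
(a sketch reconstructed from the surviving lines, not a verbatim quote of the file):

int oldv, newv;
do {
    oldv = *ptr;        /* read the current count */
    newv = oldv - 1;    /* compute the decremented value */
} while (oldv != MPIDU_Atomic_cas_int(ptr, oldv, newv));  /* retry if another process raced us */
retval = (0 == newv);   /* true iff we released the last reference */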
......
@@ -10,285 +10,7 @@
#include <mpichconf.h>
#include <mpidi_ch3i_nemesis_conf.h>
static inline void *MPID_NEM_SWAP (volatile void *ptr, void *val)
{
#ifdef HAVE_GCC_AND_PENTIUM_ASM
__asm__ __volatile__ ("xchgl %0,%1"
:"=r" (val), "=m" (*(void **)ptr)
: "0" (val), "m" (*(void **)ptr));
return val;
#elif defined(HAVE_GCC_AND_X86_64_ASM)
__asm__ __volatile__ ("xchgq %0,%1"
:"=r" (val), "=m" (*(void **)ptr)
: "0" (val), "m" (*(void **)ptr));
return val;
#elif defined(HAVE_GCC_AND_IA64_ASM)
__asm__ __volatile__ ("xchg8 %0=[%2],%3"
: "=r" (val), "=m" (*(void **)val)
: "r" (ptr), "0" (val));
return val;
#else
#error No swap function defined for this architecture
#endif
}
static inline void *MPID_NEM_CAS (volatile void *ptr, void *oldv, void *newv)
{
#ifdef HAVE_GCC_AND_PENTIUM_ASM
void *prev;
__asm__ __volatile__ ("lock ; cmpxchgl %2,%3"
: "=a" (prev), "=m" (*(void **)ptr)
: "q" (newv), "m" (*(void **)ptr), "0" (oldv));
return prev;
#elif defined(HAVE_GCC_AND_X86_64_ASM)
void *prev;
__asm__ __volatile__ ("lock ; cmpxchgq %2,%3"
: "=a" (prev), "=m" (*(void **)ptr)
: "q" (newv), "m" (*(void **)ptr), "0" (oldv));
return prev;
#elif defined(HAVE_GCC_AND_IA64_ASM)
void *prev;
__asm__ __volatile__ ("mov ar.ccv=%2;;"
"cmpxchg8.rel %0=[%3],%4,ar.ccv"
: "=r"(prev), "=m"(*(void **)ptr)
: "r"(oldv), "r"(ptr), "r"(newv));
return prev;
#else
#error No compare-and-swap function defined for this architecture
#endif
}
static inline int MPID_NEM_CAS_INT (volatile int *ptr, int oldv, int newv)
{
#ifdef HAVE_GCC_AND_PENTIUM_ASM
int prev;
__asm__ __volatile__ ("lock ; cmpxchg %2,%3"
: "=a" (prev), "=m" (*ptr)
: "q" (newv), "m" (*ptr), "0" (oldv));
return prev;
#elif defined(HAVE_GCC_AND_X86_64_ASM)
int prev;
__asm__ __volatile__ ("lock ; cmpxchg %2,%3"
: "=a" (prev), "=m" (*ptr)
: "q" (newv), "m" (*ptr), "0" (oldv));
return prev;
#elif defined(HAVE_GCC_AND_IA64_ASM)
int prev;
switch (sizeof(int)) /* this switch statement should be optimized out */
{
case 8:
__asm__ __volatile__ ("mov ar.ccv=%2;;"
"cmpxchg8.rel %0=[%3],%4,ar.ccv"
: "=r"(prev), "=m"(*ptr)
: "r"(oldv), "r"(ptr), "r"(newv));
break;
case 4:
__asm__ __volatile__ ("zxt4 %2=%2;;" /* don't want oldv sign-extended to 64 bits */
"mov ar.ccv=%2;;"
"cmpxchg4.rel %0=[%3],%4,ar.ccv"
: "=r"(prev), "=m"(*ptr)
: "r"(oldv), "r"(ptr), "r"(newv));
break;
default:
MPIU_Assertp (0);
}
return prev;
#else
#error No compare-and-swap function defined for this architecture
#endif
}
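The zxt4 above matters because cmpxchg4 zero-extends the 32-bit value it loads
from memory and compares it against the full 64-bit ar.ccv register, while a plain
register move sign-extends a negative int. A worked illustration in plain C (not
part of the source):

int oldv = -1;
long sign_extended = (long)oldv;                /* 0xffffffffffffffff */
long zero_extended = (long)(unsigned int)oldv;  /* 0x00000000ffffffff */
/* sign_extended != zero_extended, so ar.ccv must hold the zero-extended
   form or the 4-byte compare can never succeed for negative values */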
static inline int MPID_NEM_FETCH_AND_ADD (volatile int *ptr, int val)
{
#if defined(HAVE_GCC_AND_PENTIUM_ASM) || defined(HAVE_GCC_AND_X86_64_ASM)
__asm__ __volatile__ ("lock ; xadd %0,%1"
: "=r" (val), "=m" (*ptr)
: "0" (val), "m" (*ptr));
return val;
#elif defined(HAVE_GCC_AND_IA64_ASM)
int new;
int prev;
int old;
do
{
old = *ptr;
new = old + val;
__asm__ __volatile__ ("mov ar.ccv=%1;;"
"cmpxchg4.rel %0=[%3],%4,ar.ccv"
: "=r"(prev), "=m"(*ptr)
: "rO"(old), "r"(ptr), "r"(new));
}
while (prev != old);
return new;
#else
/* default implementation */
int new;
int prev;
int old;
MPIU_Assert (sizeof(int) == sizeof(void *));
do
{
old = *ptr;
new = old + val;
prev = MPID_NEM_CAS_INT(ptr, old, new);
}
while (prev != old);
return new;
#endif
}
static inline void MPID_NEM_ATOMIC_ADD (int *ptr, int val)
{
#if defined(HAVE_GCC_AND_PENTIUM_ASM) || defined(HAVE_GCC_AND_X86_64_ASM)
__asm__ __volatile__ ("lock ; add %1,%0"
:"=m" (*ptr)
:"ir" (val), "m" (*ptr));
return;
#elif defined(HAVE_GCC_AND_IA64_ASM)
MPIU_Assertp (0); /* FIXME: atomic add not yet implemented for IA64 */
#else
#error No atomic-add function defined for this architecture
#endif
}
static inline int MPID_NEM_FETCH_AND_INC (volatile int *ptr)
{
#ifdef HAVE_GCC_AND_IA64_ASM
int val;
__asm__ __volatile__ ("fetchadd4.rel %0=[%2],%3"
: "=r"(val), "=m" (*ptr)
: "r"(ptr), "i" (1));
return val;
#else
/* default implementation */
return MPID_NEM_FETCH_AND_ADD (ptr, 1);
#endif
}
static inline int MPID_NEM_FETCH_AND_DEC (volatile int *ptr)
{
#ifdef HAVE_GCC_AND_IA64_ASM
int val;
__asm__ __volatile__ ("fetchadd4.rel %0=[%2],%3"
: "=r"(val), "=m"(*ptr)
: "r"(ptr), "i" (-1));
return val;
#else
/* default implementation */
return MPID_NEM_FETCH_AND_ADD (ptr, -1);
#endif
}
static inline void MPID_NEM_ATOMIC_INC (volatile int *ptr)
{
#if defined(HAVE_GCC_AND_PENTIUM_ASM) || defined(HAVE_GCC_AND_X86_64_ASM)
switch(sizeof(*ptr))
{
case 4:
__asm__ __volatile__ ("lock ; incl %0"
:"=m" (*ptr)
:"m" (*ptr));
break;
case 8:
__asm__ __volatile__ ("lock ; incq %0"
:"=m" (*ptr)
:"m" (*ptr));
break;
default:
/* int is not 64 or 32 bits */
MPIU_Assert(0);
}
return;
#elif defined(HAVE_GCC_AND_IA64_ASM)
int val;
__asm__ __volatile__ ("fetchadd4.rel %0=[%2],%3"
: "=r"(val), "=m"(*ptr)
: "r"(ptr), "i"(1));
return;
#else
#error No atomic-increment function defined for this architecture
#endif
}
static inline void MPID_NEM_ATOMIC_DEC (volatile int *ptr)
{
#if defined(HAVE_GCC_AND_PENTIUM_ASM) || defined(HAVE_GCC_AND_X86_64_ASM)
switch(sizeof(*ptr))
{
case 4:
__asm__ __volatile__ ("lock ; decl %0"
:"=m" (*ptr)
:"m" (*ptr));
break;
case 8:
__asm__ __volatile__ ("lock ; decq %0"
:"=m" (*ptr)
:"m" (*ptr));
break;
default:
/* int is not 64 or 32 bits */
MPIU_Assert(0);
}
return;
#elif defined(HAVE_GCC_AND_IA64_ASM)
int val;
__asm__ __volatile__ ("fetchadd4.rel %0=[%2],%3"
: "=r"(val), "=m"(*ptr)
: "r"(ptr), "i"(-1));
return;
#else
#error No atomic-decrement function defined for this architecture
#endif
}
#ifdef HAVE_GCC_AND_PENTIUM_ASM
#ifdef HAVE_GCC_ASM_AND_X86_SFENCE
#define MPID_NEM_WRITE_BARRIER() __asm__ __volatile__ ( "sfence" ::: "memory" )
#else /* HAVE_GCC_ASM_AND_X86_SFENCE */
#define MPID_NEM_WRITE_BARRIER()
#endif /* HAVE_GCC_ASM_AND_X86_SFENCE */
#ifdef HAVE_GCC_ASM_AND_X86_LFENCE
/*
#define MPID_NEM_READ_BARRIER() __asm__ __volatile__ ( ".byte 0x0f, 0xae, 0xe8" ::: "memory" ) */
#define MPID_NEM_READ_BARRIER() __asm__ __volatile__ ( "lfence" ::: "memory" )
#else /* HAVE_GCC_ASM_AND_X86_LFENCE */
#define MPID_NEM_READ_BARRIER()
#endif /* HAVE_GCC_ASM_AND_X86_LFENCE */
#ifdef HAVE_GCC_ASM_AND_X86_MFENCE
/*
#define MPID_NEM_READ_WRITE_BARRIER() __asm__ __volatile__ ( ".byte 0x0f, 0xae, 0xf0" ::: "memory" )
*/
#define MPID_NEM_READ_WRITE_BARRIER() __asm__ __volatile__ ( "mfence" ::: "memory" )
#else /* HAVE_GCC_ASM_AND_X86_MFENCE */
#define MPID_NEM_READ_WRITE_BARRIER()
#endif /* HAVE_GCC_ASM_AND_X86_MFENCE */
#elif defined(HAVE_MASM_AND_X86)
#define MPID_NEM_WRITE_BARRIER()
#define MPID_NEM_READ_BARRIER() __asm { __asm _emit 0x0f __asm _emit 0xae __asm _emit 0xe8 }
#define MPID_NEM_READ_WRITE_BARRIER()
#elif defined(HAVE_GCC_AND_IA64_ASM)
#define MPID_NEM_WRITE_BARRIER() __asm__ __volatile__ ("mf" ::: "memory" )
#define MPID_NEM_READ_BARRIER() __asm__ __volatile__ ("mf" ::: "memory" )
#define MPID_NEM_READ_WRITE_BARRIER() __asm__ __volatile__ ("mf" ::: "memory" )
#else
#define MPID_NEM_WRITE_BARRIER()
#define MPID_NEM_READ_BARRIER()
#define MPID_NEM_READ_WRITE_BARRIER()
#endif /* HAVE_GCC_AND_PENTIUM_ASM */
#include <mpidu_atomic_primitives.h>
#include <mpidu_mem_barriers.h>
#endif /* MPID_NEM_ATOMICS_H */
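All of the per-architecture assembly removed above now lives behind the two
headers included here, and nemesis code calls the generic names instead. The
substitutions later in this diff establish the mapping:

/* old nemesis name              -> new shared primitive               */
/* MPID_NEM_CAS_INT(p, o, n)     -> MPIDU_Atomic_cas_int(p, o, n)      */
/* MPID_NEM_FETCH_AND_INC(p)     -> MPIDU_Atomic_fetch_and_incr(p)     */
/* MPID_NEM_SWAP(p, v)           -> MPIDU_Atomic_swap_char_ptr(p, v)   */
/* MPID_NEM_CAS(p, o, n)         -> MPIDU_Atomic_cas_char_ptr(p, o, n) */
/* MPID_NEM_WRITE_BARRIER()      -> MPIDU_Shm_write_barrier()          */
/* MPID_NEM_READ_BARRIER()       -> MPIDU_Shm_read_barrier()           */
/* MPID_NEM_READ_WRITE_BARRIER() -> MPIDU_Shm_read_write_barrier()     */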
@@ -86,6 +86,10 @@ typedef struct MPID_nem_seg_info
char *addr;
} MPID_nem_seg_info_t, *MPID_nem_seg_info_ptr_t;
/* NOTE: MPID_NEM_IS_LOCAL should only be used when the process is known to be
in your comm_world (such as at init time). This will generally not work for
dynamic processes. Check vc_ch->is_local instead. If that is true, then
it's safe to use MPID_NEM_LOCAL_RANK. */
#define MPID_NEM_NON_LOCAL -1
#define MPID_NEM_IS_LOCAL(grank) (MPID_nem_mem_region.local_ranks[grank] != MPID_NEM_NON_LOCAL)
#define MPID_NEM_LOCAL_RANK(grank) (MPID_nem_mem_region.local_ranks[grank])
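A sketch of the usage pattern the note prescribes (hypothetical call sites, not
code from this commit):

/* at init time, when every peer is known to be in comm_world: */
if (MPID_NEM_IS_LOCAL(grank))
    lrank = MPID_NEM_LOCAL_RANK(grank);

/* once dynamic processes are possible, consult the VC instead: */
if (vc_ch->is_local)
    lrank = MPID_NEM_LOCAL_RANK(vc->pg_rank);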
......
@@ -60,31 +60,39 @@ extern unsigned short *MPID_nem_recv_seqno;
*(_cell) = NULL; \
} while (0)
#else
#define poll_fboxes(_cell, do_found) do { \
MPID_nem_fboxq_elem_t *orig_fboxq_elem; \
\
if (MPID_nem_fboxq_head != NULL) \
{ \
orig_fboxq_elem = MPID_nem_curr_fboxq_elem; \
do \
{ \
MPID_nem_fbox_mpich2_t *fbox; \
\
fbox = MPID_nem_curr_fboxq_elem->fbox; \
if (fbox->flag.value == 1 && fbox->cell.pkt.mpich2.seqno == MPID_nem_recv_seqno[MPID_nem_curr_fboxq_elem->grank]) \
{ \
++MPID_nem_recv_seqno[MPID_nem_curr_fboxq_elem->grank]; \
*(_cell) = &fbox->cell; \
do_found; \
} \
MPID_nem_curr_fboxq_elem = MPID_nem_curr_fboxq_elem->next; \
if (MPID_nem_curr_fboxq_elem == NULL) \
MPID_nem_curr_fboxq_elem = MPID_nem_fboxq_head; \
} \
while (MPID_nem_curr_fboxq_elem != orig_fboxq_elem); \
} \
*(_cell) = NULL; \
} while (0)
static inline int poll_fboxes(MPID_nem_cell_ptr_t *cell)
{
MPID_nem_fboxq_elem_t *orig_fboxq_elem;
int found = FALSE;
if (MPID_nem_fboxq_head != NULL)
{
orig_fboxq_elem = MPID_nem_curr_fboxq_elem;
do
{
MPID_nem_fbox_mpich2_t *fbox;
fbox = MPID_nem_curr_fboxq_elem->fbox;
MPIU_Assert(fbox != NULL);
if (fbox->flag.value == 1 &&
fbox->cell.pkt.mpich2.seqno == MPID_nem_recv_seqno[MPID_nem_curr_fboxq_elem->grank])
{
++MPID_nem_recv_seqno[MPID_nem_curr_fboxq_elem->grank];
*cell = &fbox->cell;
found = TRUE;
goto fn_exit;
}
MPID_nem_curr_fboxq_elem = MPID_nem_curr_fboxq_elem->next;
if (MPID_nem_curr_fboxq_elem == NULL)
MPID_nem_curr_fboxq_elem = MPID_nem_fboxq_head;
}
while (MPID_nem_curr_fboxq_elem != orig_fboxq_elem);
}
*cell = NULL;
fn_exit:
return found;
}
#endif
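With the macro replaced by an inline function, callers test the return value
instead of passing a do_found statement; every updated call site later in this
diff follows the same pattern:

if (poll_fboxes(cell)) goto fbox_l;  /* a cell was found in some fastbox */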
#define poll_all_fboxes(_cell, do_found) do { \
......
@@ -106,7 +106,7 @@ MPID_nem_mpich2_send_header (void* buf, int size, MPIDI_VC_t *vc, int *again)
payload_32[9] = buf_32[9];
}
MPID_NEM_WRITE_BARRIER();
MPIDU_Shm_write_barrier();
pbox->flag.value = 1;
MPIU_DBG_MSG (CH3_CHANNEL, VERBOSE, "--> Sent fbox ");
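The substitution here is the producer half of a flag-protected handoff: every
payload store must be globally visible before a receiver can observe
flag.value == 1. In outline (names taken from the surrounding diff; the receiver
side is the poll_fboxes check shown earlier):

/* sender */
/* ... fill pbox->cell.pkt.mpich2.payload ... */
MPIDU_Shm_write_barrier();  /* payload must be visible before the flag */
pbox->flag.value = 1;       /* receiver polls this flag */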
@@ -411,7 +411,7 @@ MPID_nem_mpich2_sendv_header (struct iovec **iov, int *n_iov, MPIDI_VC_t *vc, in
payload_32[9] = buf_32[9];
}
MPID_NEM_MEMCPY ((char *)pbox->cell.pkt.mpich2.payload +sizeof(MPIDI_CH3_Pkt_t), (*iov)[1].iov_base, (*iov)[1].iov_len);
MPID_NEM_WRITE_BARRIER();
MPIDU_Shm_write_barrier();
pbox->flag.value = 1;
*n_iov = 0;
@@ -602,7 +602,7 @@ MPID_nem_mpich2_send_seg_header (MPID_Segment *segment, MPIDI_msg_sz_t *segment_
MPID_Segment_pack(segment, *segment_first, &last, (char *)pbox->cell.pkt.mpich2.payload + sizeof(MPIDI_CH3_Pkt_t));
MPIU_Assert(last == segment_size);
MPID_NEM_WRITE_BARRIER();
MPIDU_Shm_write_barrier();
pbox->flag.value = 1;
*segment_first = last;
@@ -795,7 +795,10 @@ MPID_nem_mpich2_dequeue_fastbox (int local_rank)
int mpi_errno = MPI_SUCCESS;
MPID_nem_fboxq_elem_t *el;
MPIU_Assert(local_rank < MPID_nem_mem_region.num_local);
el = &MPID_nem_fboxq_elem_list[local_rank];
MPIU_Assert(el->fbox != NULL);
MPIU_ERR_CHKANDJUMP (!el->usage, mpi_errno, MPI_ERR_OTHER, "**intern");
@@ -843,8 +846,11 @@ int MPID_nem_mpich2_enqueue_fastbox (int local_rank)
int mpi_errno = MPI_SUCCESS;
MPID_nem_fboxq_elem_t *el;
MPIU_Assert(local_rank < MPID_nem_mem_region.num_local);
el = &MPID_nem_fboxq_elem_list[local_rank];
MPIU_Assert(el->fbox != NULL);
if (el->usage)
{
++el->usage;
@@ -920,7 +926,7 @@ MPID_nem_mpich2_test_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox)
#endif
#ifdef USE_FASTBOX
poll_fboxes (cell, goto fbox_l);
if (poll_fboxes(cell)) goto fbox_l;
#endif/* USE_FASTBOX */
/* FIXME the ext_procs bit is an optimization for the all-local-procs case.
@@ -991,7 +997,7 @@ MPID_nem_mpich2_test_recv_wait (MPID_nem_cell_ptr_t *cell, int *in_fbox, int tim
int mpi_errno = MPI_SUCCESS;
#ifdef USE_FASTBOX
poll_fboxes (cell, goto fbox_l);
if (poll_fboxes(cell)) goto fbox_l;
#endif/* USE_FASTBOX */
/* FIXME the ext_procs bit is an optimization for the all-local-procs case.
@@ -1083,7 +1089,7 @@ MPID_nem_mpich2_blocking_recv(MPID_nem_cell_ptr_t *cell, int *in_fbox)
#ifdef USE_FASTBOX
poll_fboxes (cell, goto fbox_l);
if (poll_fboxes(cell)) goto fbox_l;
#endif /*USE_FASTBOX */
/* FIXME the ext_procs bit is an optimization for the all-local-procs case.
@@ -1102,7 +1108,7 @@ MPID_nem_mpich2_blocking_recv(MPID_nem_cell_ptr_t *cell, int *in_fbox)
#ifdef USE_FASTBOX
poll_all_fboxes (cell, goto fbox_l);
poll_fboxes (cell, goto fbox_l);
if (poll_fboxes(cell)) goto fbox_l;
#endif /*USE_FASTBOX */
/* FIXME the ext_procs bit is an optimization for the all-local-procs case.
......
@@ -35,7 +35,7 @@ int MPID_nem_network_poll (MPID_nem_poll_dir_t in_or_out);
static inline MPID_nem_cell_rel_ptr_t MPID_NEM_SWAP_REL (volatile MPID_nem_cell_rel_ptr_t *ptr, MPID_nem_cell_rel_ptr_t val)
{
MPID_nem_cell_rel_ptr_t ret;
ret.p = MPID_NEM_SWAP (&(ptr->p), val.p);
ret.p = MPIDU_Atomic_swap_char_ptr(&(ptr->p), val.p);
return ret;
}
@@ -43,7 +43,7 @@ static inline MPID_nem_cell_rel_ptr_t MPID_NEM_SWAP_REL (volatile MPID_nem_cell_
static inline MPID_nem_cell_rel_ptr_t MPID_NEM_CAS_REL_NULL (volatile MPID_nem_cell_rel_ptr_t *ptr, MPID_nem_cell_rel_ptr_t oldv)
{
MPID_nem_cell_rel_ptr_t ret;
ret.p = MPID_NEM_CAS (&(ptr->p), oldv.p, MPID_NEM_REL_NULL);
ret.p = MPIDU_Atomic_cas_char_ptr(&(ptr->p), oldv.p, MPID_NEM_REL_NULL);
return ret;
}
......
@@ -26,7 +26,7 @@ int MPID_nem_barrier_init (MPID_nem_barrier_t *barrier_region)
MPID_nem_mem_region.barrier->wait = 0;
sense = 0;
barrier_init = 1;
MPID_NEM_WRITE_BARRIER();
MPIDU_Shm_write_barrier();
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_BARRIER_INIT);
return MPI_SUCCESS;
@@ -46,11 +46,11 @@ int MPID_nem_barrier (int num_processes, int rank)
MPIU_ERR_CHKANDJUMP1 (!barrier_init, mpi_errno, MPI_ERR_INTERN, "**intern", "**intern %s", "barrier not initialized");
if (MPID_NEM_FETCH_AND_INC (&MPID_nem_mem_region.barrier->val) == MPID_nem_mem_region.num_local - 1)
if (MPIDU_Atomic_fetch_and_incr(&MPID_nem_mem_region.barrier->val) == MPID_nem_mem_region.num_local - 1)
{
MPID_nem_mem_region.barrier->val = 0;
MPID_nem_mem_region.barrier->wait = 1 - sense;
MPID_NEM_WRITE_BARRIER();
MPIDU_Shm_write_barrier();
}
else
{
......
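The two hunks above belong to a classic sense-reversing barrier. A sketch of the
whole algorithm, reconstructed around the visible lines (the spin loop and the
final sense flip are assumptions; this diff does not show them):

static int sense = 0;  /* private to each process */

if (MPIDU_Atomic_fetch_and_incr(&barrier->val) == num_local - 1)
{
    /* last arriver: reset the counter and release everyone */
    barrier->val  = 0;
    barrier->wait = 1 - sense;
    MPIDU_Shm_write_barrier();
}
else
{
    while (barrier->wait == sense)
        ;  /* spin until the last arriver flips the sense */
}
sense = 1 - sense;  /* flip local sense for the next barrier episode */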
@@ -316,7 +316,7 @@ static int get_next_req(MPIDI_VC_t *vc)
MPIDI_FUNC_ENTER(MPID_STATE_GET_NEXT_REQ);
prev_owner_rank = MPID_NEM_CAS_INT(&copy_buf->owner_info.val.rank, NO_OWNER, MPIDI_Process.my_pg_rank);
prev_owner_rank = MPIDU_Atomic_cas_int(&copy_buf->owner_info.val.rank, NO_OWNER, MPIDI_Process.my_pg_rank);
if (prev_owner_rank == MPIDI_Process.my_pg_rank)
{
@@ -328,11 +328,11 @@ static int get_next_req(MPIDI_VC_t *vc)
{
int i;
/* successfully grabbed idle copy buf */
MPID_NEM_WRITE_BARRIER();
MPIDU_Shm_write_barrier();
for (i = 0; i < NUM_BUFS; ++i)
copy_buf->len[i].val = 0;
MPID_NEM_WRITE_BARRIER();
MPIDU_Shm_write_barrier();
LMT_SHM_Q_DEQUEUE(&vc_ch->lmt_queue, &vc_ch->lmt_active_lmt);
copy_buf->owner_info.val.remote_req_id = vc_ch->lmt_active_lmt->req->ch.lmt_req_id;
@@ -343,7 +343,7 @@ static int get_next_req(MPIDI_VC_t *vc)
/* remote side chooses next transfer */
int i = 0;
MPID_NEM_READ_BARRIER();
MPIDU_Shm_read_barrier();
while (copy_buf->owner_info.val.remote_req_id == MPI_REQUEST_NULL)
{
if (i == NUM_BUSY_POLLS)
@@ -354,7 +354,7 @@ static int get_next_req(MPIDI_VC_t *vc)
++i;
}
MPID_NEM_READ_BARRIER();
MPIDU_Shm_read_barrier();
LMT_SHM_Q_SEARCH_REMOVE(&vc_ch->lmt_queue, copy_buf->owner_info.val.remote_req_id, &vc_ch->lmt_active_lmt);
if (vc_ch->lmt_active_lmt == NULL)
@@ -447,7 +447,7 @@ static int lmt_shm_send_progress(MPIDI_VC_t *vc, MPID_Request *req, int *done)
++i;
}
MPID_NEM_READ_WRITE_BARRIER();
MPIDU_Shm_read_write_barrier();
/* we have a free buffer, fill it */
@@ -457,7 +457,7 @@ static int lmt_shm_send_progress(MPIDI_VC_t *vc, MPID_Request *req, int *done)
copy_limit = MPID_NEM_COPY_BUF_LEN;
last = (data_sz - first <= copy_limit) ? data_sz : first + copy_limit;
MPID_Segment_pack(req->dev.segment_ptr, first, &last, (void *)copy_buf->buf[buf_num]); /* cast away volatile */
MPID_NEM_WRITE_BARRIER();
MPIDU_Shm_write_barrier();
copy_buf->len[buf_num].val = last - first;
first = last;
@@ -543,7 +543,7 @@ static int lmt_shm_recv_progress(MPIDI_VC_t *vc, MPID_Request *req, int *done)
++i;
}
MPID_NEM_READ_BARRIER();
MPIDU_Shm_read_barrier();
/* unpack data including any leftover from the previous buffer */
src_buf = ((char *)copy_buf->buf[buf_num]) - surfeit; /* cast away volatile */
@@ -556,7 +556,7 @@ static int lmt_shm_recv_progress(MPIDI_VC_t *vc, MPID_Request *req, int *done)
/* we had leftover data from the previous buffer, we can
now mark that buffer as empty */
MPID_NEM_READ_WRITE_BARRIER();
MPIDU_Shm_read_write_barrier();
copy_buf->len[(buf_num-1)].val = 0;
/* Make sure we copied at least the leftover data from last time */
MPIU_Assert(last - first > surfeit);
@@ -575,7 +575,7 @@ static int lmt_shm_recv_progress(MPIDI_VC_t *vc, MPID_Request *req, int *done)
/* if we're wrapping back to buf 0, then we can copy it directly */
memcpy(((char *)copy_buf->buf[0]) - surfeit, surfeit_ptr, surfeit);
MPID_NEM_READ_WRITE_BARRIER();
MPIDU_Shm_read_write_barrier();
copy_buf->len[buf_num].val = 0;
}
else
@@ -590,7 +590,7 @@ static int lmt_shm_recv_progress(MPIDI_VC_t *vc, MPID_Request *req, int *done)
/* all data was unpacked, we can mark this buffer as empty */
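Taken together, the barrier substitutions in this file protect a double-buffered
copy protocol in which len[buf_num].val doubles as the ready flag. In outline
(reconstructed from the hunks above; the loop scaffolding is assumed):

/* sender, per buffer */
while (copy_buf->len[buf_num].val != 0)
    ;                             /* wait for the receiver to drain it */
MPIDU_Shm_read_write_barrier();
/* ... pack data into copy_buf->buf[buf_num] ... */
MPIDU_Shm_write_barrier();        /* data must be visible before the length */
copy_buf->len[buf_num].val = last - first;

/* receiver, per buffer */
while (copy_buf->len[buf_num].val == 0)
    ;                             /* wait for the sender to fill it */
MPIDU_Shm_read_barrier();         /* length must be read before the data */
/* ... unpack from copy_buf->buf[buf_num] ... */
MPIDU_Shm_read_write_barrier();
copy_buf->len[buf_num].val = 0;   /* mark the buffer empty for reuse */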