Commit e0b7dc2b authored by James Dinan's avatar James Dinan
Browse files

[svn-r10214] Implementation of MPIX_NO_OP and MPI_REPLACE.

Added builtin op implementations for RMA-only ops.  MPI_REPLACE was not defined
in the op_table prior to this commit.  Also includes an alternate
implementation of the linked list test that uses fetch-and-op with these
operations to enable the use of shared locks.

Reviewer: goodell
parent 72320371
......@@ -3811,6 +3811,8 @@ void MPIR_LXOR ( void *, void *, int *, MPI_Datatype * ) ;
void MPIR_BXOR ( void *, void *, int *, MPI_Datatype * ) ;
void MPIR_MAXLOC ( void *, void *, int *, MPI_Datatype * ) ;
void MPIR_MINLOC ( void *, void *, int *, MPI_Datatype * ) ;
void MPIR_REPLACE ( void *, void *, int *, MPI_Datatype * ) ;
void MPIR_NO_OP ( void *, void *, int *, MPI_Datatype * ) ;
int MPIR_MAXF_check_dtype ( MPI_Datatype ) ;
int MPIR_MINF_check_dtype ( MPI_Datatype ) ;
......@@ -3824,8 +3826,10 @@ int MPIR_LXOR_check_dtype ( MPI_Datatype ) ;
int MPIR_BXOR_check_dtype ( MPI_Datatype ) ;
int MPIR_MAXLOC_check_dtype ( MPI_Datatype ) ;
int MPIR_MINLOC_check_dtype ( MPI_Datatype ) ;
int MPIR_REPLACE_check_dtype ( MPI_Datatype ) ;
int MPIR_NO_OP_check_dtype ( MPI_Datatype ) ;
#define MPIR_PREDEF_OP_COUNT 12
#define MPIR_PREDEF_OP_COUNT 14
extern MPI_User_function *MPIR_Op_table[];
typedef int (MPIR_Op_check_dtype_fn) ( MPI_Datatype );
......
......@@ -64,6 +64,8 @@ lib_lib@MPILIBNAME@_la_SOURCES += \
src/mpi/coll/opprod.c \
src/mpi/coll/opminloc.c \
src/mpi/coll/opmaxloc.c \
src/mpi/coll/opno_op.c \
src/mpi/coll/opreplace.c \
src/mpi/coll/nbcutil.c \
src/mpi/coll/rmatypeutil.c
......
......@@ -29,7 +29,8 @@ MPI_User_function *MPIR_Op_table[] = { MPIR_MAXF, MPIR_MINF, MPIR_SUM,
MPIR_PROD, MPIR_LAND,
MPIR_BAND, MPIR_LOR, MPIR_BOR,
MPIR_LXOR, MPIR_BXOR,
MPIR_MINLOC, MPIR_MAXLOC, };
MPIR_MINLOC, MPIR_MAXLOC,
MPIR_REPLACE, MPIR_NO_OP };
MPIR_Op_check_dtype_fn *MPIR_Op_check_dtype_table[] = {
MPIR_MAXF_check_dtype, MPIR_MINF_check_dtype,
......@@ -37,7 +38,8 @@ MPIR_Op_check_dtype_fn *MPIR_Op_check_dtype_table[] = {
MPIR_PROD_check_dtype, MPIR_LAND_check_dtype,
MPIR_BAND_check_dtype, MPIR_LOR_check_dtype, MPIR_BOR_check_dtype,
MPIR_LXOR_check_dtype, MPIR_BXOR_check_dtype,
MPIR_MINLOC_check_dtype, MPIR_MAXLOC_check_dtype, };
MPIR_MINLOC_check_dtype, MPIR_MAXLOC_check_dtype,
MPIR_REPLACE_check_dtype, MPIR_NO_OP_check_dtype };
/* This is the default implementation of allreduce. The algorithm is:
......
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
*
* (C) 2012 by Argonne National Laboratory.
* See COPYRIGHT in top-level directory.
*/
#include "mpiimpl.h"
#undef FUNCNAME
#define FUNCNAME MPIR_NO_OP
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
/* MPI_NO_OP user function: performs no reduction at all and leaves
 * inoutvec completely untouched.  Intended for RMA operations (e.g.
 * fetch-and-op) that need to read a target value without modifying it.
 * All arguments are ignored.
 */
void MPIR_NO_OP( void *invec, void *inoutvec, int *Len, MPI_Datatype *type )
{
    /* Intentionally empty: a no-op must not touch the target buffer. */
}
#undef FUNCNAME
#define FUNCNAME MPIR_NO_OP_check_dtype
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
/* Datatype validation for MPI_NO_OP.  Since the operation touches no
 * data, every datatype is acceptable; always reports success.
 */
int MPIR_NO_OP_check_dtype( MPI_Datatype type )
{
    return MPI_SUCCESS;
}
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
*
* (C) 2012 by Argonne National Laboratory.
* See COPYRIGHT in top-level directory.
*/
#include "mpiimpl.h"
#undef FUNCNAME
#define FUNCNAME MPIR_REPLACE
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
/* MPI_REPLACE user function: overwrites inoutvec with the contents of
 * invec, element by element.
 *
 *   invec    - source buffer holding *Len elements of *type
 *   inoutvec - destination buffer, overwritten on success
 *   Len      - pointer to the element count
 *   type     - pointer to the MPI datatype of the elements
 *
 * NOTE(review): the op signature returns void, so any error raised by
 * MPIR_Localcopy is recorded via MPIU_ERR_POP but cannot be returned to
 * the caller here — confirm the surrounding RMA path surfaces it. */
void MPIR_REPLACE( void *invec, void *inoutvec, int *Len, MPI_Datatype *type )
{
int mpi_errno = MPI_SUCCESS;
/* Datatype-aware copy of invec over inoutvec. */
mpi_errno = MPIR_Localcopy(invec, *Len, *type, inoutvec, *Len, *type);
if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
fn_exit:
return;
/* --BEGIN ERROR HANDLING-- */
fn_fail:
goto fn_exit;
/* --END ERROR HANDLING-- */
}
#undef FUNCNAME
#define FUNCNAME MPIR_REPLACE_check_dtype
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
/* Datatype validation for MPI_REPLACE.  Replacement is a plain copy, so
 * no datatype restriction applies; always reports success.
 */
int MPIR_REPLACE_check_dtype( MPI_Datatype type )
{
    return MPI_SUCCESS;
}
......@@ -99,7 +99,10 @@ int MPIX_Fetch_and_op(const void *origin_addr, void *result_addr,
MPID_Win_valid_ptr( win_ptr, mpi_errno );
if (mpi_errno) goto fn_fail;
MPIR_ERRTEST_ARGNULL(origin_addr, "origin_addr", mpi_errno);
if (op != MPIX_NO_OP) {
MPIR_ERRTEST_ARGNULL(origin_addr, "origin_addr", mpi_errno);
}
MPIR_ERRTEST_ARGNULL(result_addr, "result_addr", mpi_errno);
if (mpi_errno) goto fn_fail;
......
......@@ -918,7 +918,9 @@ static int MPIDI_CH3I_Send_immed_rmw_msg(MPIDI_RMA_ops *rma_op,
fop_pkt->request_handle = resp_req->handle;
fop_pkt->op = rma_op->op;
MPIU_Memcpy( (void *) &fop_pkt->origin_data, rma_op->origin_addr, len );
if (rma_op->op != MPIX_NO_OP) {
MPIU_Memcpy( (void *) &fop_pkt->origin_data, rma_op->origin_addr, len );
}
comm_ptr = win_ptr->comm_ptr;
MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
......
......@@ -83,6 +83,7 @@ noinst_PROGRAMS = \
get_acc_local \
compare_and_swap \
linked_list \
linked_list_fop \
fetch_and_op
strided_acc_indexed_LDADD = $(LDADD) -lm
......
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
*
* (C) 2003 by Argonne National Laboratory.
* See COPYRIGHT in top-level directory.
*/
/* MPI-3 distributed linked list construction example
* --------------------------------------------------
*
* Construct a distributed shared linked list using proposed MPI-3 dynamic
* windows. Initially process 0 creates the head of the list, attaches it to
* the window, and broadcasts the pointer to all processes. All processes then
* concurrently append N new elements to the list. When a process attempts to
* attach its element to the tail of list it may discover that its tail pointer
* is stale and it must chase ahead to the new tail before the element can be
* attached.
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <mpi.h>
#include <assert.h>
/* MPI-3 is not yet standardized -- allow MPI-3 routines to be switched off.
*/
#if !defined(USE_STRICT_MPI) && defined(MPICH2)
# define TEST_MPI3_ROUTINES 1
#endif
#define NUM_ELEMS 32
#define NPROBE 100
#define ELEM_PER_ROW 16
/* Linked list pointer */
typedef struct {
int rank;
MPI_Aint disp;
} llist_ptr_t;
/* Linked list element */
typedef struct {
int value;
llist_ptr_t next;
} llist_elem_t;
static const llist_ptr_t nil = { -1, (MPI_Aint) MPI_BOTTOM };
static const int verbose = 0;
/* List of locally allocated list elements. */
static llist_elem_t **my_elems = NULL;
static int my_elems_size = 0;
static int my_elems_count = 0;
/* Allocate a new shared linked list element */
/* Allocate a new shared linked-list element, attach it to the dynamic
 * window, and record it in the local bookkeeping array so it can be
 * freed at shutdown.
 *
 *   value - payload stored in the element
 *   win   - dynamic window the element is attached to
 *
 * Returns the element's displacement (absolute address on this process)
 * for use as an RMA target offset. */
MPI_Aint alloc_elem(int value, MPI_Win win) {
    MPI_Aint disp;
    llist_elem_t *elem_ptr;

    /* Allocate the new element and register it with the window */
    MPI_Alloc_mem(sizeof(llist_elem_t), MPI_INFO_NULL, &elem_ptr);
    elem_ptr->value = value;
    elem_ptr->next  = nil;
    MPIX_Win_attach(win, elem_ptr, sizeof(llist_elem_t));

    /* Add the element to the list of local elements so we can free it later. */
    if (my_elems_size == my_elems_count) {
        /* Grow in chunks of 100.  Use a temporary pointer so the old
         * array is not leaked (and then dereferenced as NULL) if realloc
         * fails -- the original overwrote my_elems directly. */
        llist_elem_t **new_elems =
            realloc(my_elems, (my_elems_size + 100) * sizeof(void*));
        assert(new_elems != NULL);
        my_elems = new_elems;
        my_elems_size += 100;
    }
    my_elems[my_elems_count] = elem_ptr;
    my_elems_count++;

    MPI_Get_address(elem_ptr, &disp);
    return disp;
}
/* Driver: build a distributed linked list with compare-and-swap and
 * fetch-and-op under shared locks, then have rank 0 walk the list and
 * verify that every rank inserted exactly NUM_ELEMS elements.
 *
 * Fix: the non-MPI-3 fallback branch referenced an undeclared variable
 * 'rank'; it must use 'procid'.  Also removed the unused local 'junk'. */
int main(int argc, char **argv) {
    int procid, nproc, i;
    MPI_Win llist_win;
    llist_ptr_t head_ptr, tail_ptr;

    MPI_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &procid);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

#ifdef TEST_MPI3_ROUTINES

    MPIX_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &llist_win);

    /* Process 0 creates the head node */
    if (procid == 0)
        head_ptr.disp = alloc_elem(-1, llist_win);

    /* Broadcast the head pointer to everyone */
    head_ptr.rank = 0;
    MPI_Bcast(&head_ptr.disp, 1, MPI_AINT, 0, MPI_COMM_WORLD);
    tail_ptr = head_ptr;

    /* All processes concurrently append NUM_ELEMS elements to the list */
    for (i = 0; i < NUM_ELEMS; i++) {
        llist_ptr_t new_elem_ptr;
        int success;

        /* Create a new list element and register it with the window */
        new_elem_ptr.rank = procid;
        new_elem_ptr.disp = alloc_elem(procid, llist_win);

        /* Append the new node to the list.  This might take multiple attempts if
           others have already appended and our tail pointer is stale. */
        do {
            llist_ptr_t next_tail_ptr = nil;

            MPI_Win_lock(MPI_LOCK_SHARED, tail_ptr.rank, MPI_MODE_NOCHECK, llist_win);

            /* Publish our rank in the tail's next.rank slot; succeeds only
               if that slot still holds the nil sentinel. */
            MPIX_Compare_and_swap((void*) &new_elem_ptr.rank, (void*) &nil.rank,
                                  (void*) &next_tail_ptr.rank, MPI_INT, tail_ptr.rank,
                                  (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next.rank), llist_win);

            MPI_Win_unlock(tail_ptr.rank, llist_win);
            success = (next_tail_ptr.rank == nil.rank);

            if (success) {
                int i, flag;
                MPI_Aint result;

                /* We won the CAS race: publish the displacement half of the
                   next pointer with an atomic replace. */
                MPI_Win_lock(MPI_LOCK_SHARED, tail_ptr.rank, MPI_MODE_NOCHECK, llist_win);

                MPIX_Fetch_and_op(&new_elem_ptr.disp, &result, MPI_AINT, tail_ptr.rank,
                                  (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next.disp),
                                  MPI_REPLACE, llist_win);

                /* Note: accumulate is faster, since we don't need the result.  Replacing with
                   Fetch_and_op to create a more complete test case. */
                /*
                MPI_Accumulate(&new_elem_ptr.disp, 1, MPI_AINT, tail_ptr.rank,
                               (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next.disp), 1,
                               MPI_AINT, MPI_REPLACE, llist_win);
                */

                MPI_Win_unlock(tail_ptr.rank, llist_win);
                tail_ptr = new_elem_ptr;

                /* For implementations that use pt-to-pt messaging, force progress for other threads'
                   RMA operations. */
                for (i = 0; i < NPROBE; i++)
                    MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE);

            } else {
                /* Tail pointer is stale, fetch the displacement.  May take multiple tries
                   if it is being updated. */
                do {
                    MPI_Win_lock(MPI_LOCK_SHARED, tail_ptr.rank, MPI_MODE_NOCHECK, llist_win);

                    /* Atomic read of next.disp via fetch-and-op with NO_OP. */
                    MPIX_Fetch_and_op(NULL, &next_tail_ptr.disp, MPI_AINT, tail_ptr.rank,
                                      (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next.disp),
                                      MPIX_NO_OP, llist_win);

                    MPI_Win_unlock(tail_ptr.rank, llist_win);
                } while (next_tail_ptr.disp == nil.disp);
                tail_ptr = next_tail_ptr;
            }
        } while (!success);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /* Traverse the list and verify that all processes inserted exactly the correct
       number of elements. */
    if (procid == 0) {
        int have_root = 0;
        int errors = 0;
        int *counts, count = 0;

        counts = (int*) malloc(sizeof(int) * nproc);
        assert(counts != NULL);

        for (i = 0; i < nproc; i++)
            counts[i] = 0;

        tail_ptr = head_ptr;

        /* Walk the list and tally up the number of elements inserted by each rank */
        while (tail_ptr.disp != nil.disp) {
            llist_elem_t elem;

            MPI_Win_lock(MPI_LOCK_SHARED, tail_ptr.rank, MPI_MODE_NOCHECK, llist_win);

            MPI_Get(&elem, sizeof(llist_elem_t), MPI_BYTE,
                    tail_ptr.rank, tail_ptr.disp, sizeof(llist_elem_t), MPI_BYTE, llist_win);

            MPI_Win_unlock(tail_ptr.rank, llist_win);

            tail_ptr = elem.next;

            /* This is not the root */
            if (have_root) {
                assert(elem.value >= 0 && elem.value < nproc);
                counts[elem.value]++;
                count++;

                if (verbose) {
                    int last_elem = tail_ptr.disp == nil.disp;
                    printf("%2d%s", elem.value, last_elem ? "" : " -> ");
                    if (count % ELEM_PER_ROW == 0 && !last_elem)
                        printf("\n");
                }
            }
            /* This is the root */
            else {
                assert(elem.value == -1);
                have_root = 1;
            }
        }

        if (verbose)
            printf("\n\n");

        /* Verify the counts we collected */
        for (i = 0; i < nproc; i++) {
            int expected = NUM_ELEMS;
            if (counts[i] != expected) {
                printf("Error: Rank %d inserted %d elements, expected %d\n", i, counts[i], expected);
                errors++;
            }
        }

        printf("%s\n", errors == 0 ? " No Errors" : "FAIL");
        free(counts);
    }

    MPI_Win_free(&llist_win);

    /* Free all the elements in the list */
    for ( ; my_elems_count > 0; my_elems_count--)
        MPI_Free_mem(my_elems[my_elems_count-1]);

#else /* ! TEST_MPI3_ROUTINES */

    /* Fix: original referenced undeclared 'rank' here. */
    if (procid == 0)
        printf(" No Errors\n");

#endif

    MPI_Finalize();
    return 0;
}
......@@ -67,5 +67,6 @@ win_shared_noncontig_put 4 strict=false
win_dynamic_acc 4 strict=false
get_acc_local 1 strict=false
linked_list 4 strict=false
linked_list_fop 4 strict=false
compare_and_swap 4 strict=false
fetch_and_op 4 strict=false
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment