Commit 5c4f21d0 authored by Paul Coffman, committed by Rob Latham
Browse files

PAMI_Rput_typed / PAMI_Rget_typed utilization for derived types



Optimization to use the PAMI_Rput_typed / PAMI_Rget_typed call in the case where PAMID
MPI_Put / MPI_Get is called with a derived (non-contiguous) datatype.  Instead of breaking
the MPI datatype up into contiguous chunks on the MPICH side and repeatedly calling
PAMI_Rput / PAMI_Rget for each chunk with the associated overhead, create a PAMI datatype
to represent the MPI derived type and make just 1 call to
PAMI_Rput_typed / PAMI_Rget_typed.

We handle non-contiguous buffers by avoiding packing and passing the origin
buffers directly to PAMI.

Guarded by the PAMID_TYPED_ONESIDED environment variable.
Signed-off-by: Rob Latham <robl@mcs.anl.gov>
parent 18516baf
......@@ -30,7 +30,7 @@ int MPID_Type_commit(MPI_Datatype *datatype_p)
MPID_Datatype_get_ptr(*datatype_p, datatype_ptr);
if (datatype_ptr->is_committed == 0) {
datatype_ptr->is_committed = 1;
datatype_ptr->is_committed = 1;
#ifdef MPID_NEEDS_DLOOP_ALL_BYTES
/* If MPID implementation needs use to reduce everything to
......@@ -62,8 +62,11 @@ int MPID_Type_commit(MPI_Datatype *datatype_p)
MPIDI_Dataloop_dot_printf(datatype_ptr->dataloop, 0, 1);
#endif
}
#ifdef MPID_Dev_datatype_commit_hook
MPID_Dev_datatype_commit_hook(datatype_p);
#endif /* MPID_Dev_datatype_commit_hook */
}
return mpi_errno;
}
......@@ -82,7 +82,6 @@ int MPID_Type_dup(MPI_Datatype oldtype,
new_dtp->hetero_dloop = NULL;
new_dtp->hetero_dloop_size = old_dtp->hetero_dloop_size;
new_dtp->hetero_dloop_depth = old_dtp->hetero_dloop_depth;
*newtype = new_dtp->handle;
if (old_dtp->is_committed) {
......@@ -98,7 +97,11 @@ int MPID_Type_dup(MPI_Datatype oldtype,
old_dtp->hetero_dloop_size,
&new_dtp->hetero_dloop);
}
}
#ifdef MPID_Dev_datatype_commit_hook
MPID_Dev_datatype_dup_hook(new_dtp);
#endif /* MPID_Dev_datatype_commit_hook */
}
}
MPIU_DBG_MSG_D(DATATYPE,VERBOSE, "dup type %x created.", *newtype);
......
......@@ -148,6 +148,7 @@ typedef struct
unsigned mpir_nbc; /**< Enable MPIR_* non-blocking collectives implementations. */
int numTasks; /* total number of tasks on a job */
unsigned typed_onesided; /**< Enable typed PAMI calls for derived types within MPID_Put and MPID_Get. */
#ifdef DYNAMIC_TASKING
struct MPIDI_PG_t * my_pg; /**< Process group I belong to */
int my_pg_rank; /**< Rank in process group */
......
......@@ -70,5 +70,13 @@
#define MPID_MAX_SMP_BCAST_MSG_SIZE (16384)
#define MPID_MAX_SMP_REDUCE_MSG_SIZE (16384)
#define MPID_MAX_SMP_ALLREDUCE_MSG_SIZE (16384)
#ifdef MPID_DEV_DATATYPE_DECL
#error 'Conflicting definitions of MPID_DEV_DATATYPE_DECL'
#else
#define MPID_DEV_DATATYPE_DECL void *device_datatype;
#endif
#define MPID_Dev_datatype_commit_hook(ptr) MPIDI_PAMI_datatype_commit_hook(ptr)
#define MPID_Dev_datatype_destroy_hook(ptr) MPIDI_PAMI_datatype_destroy_hook(ptr)
#define MPID_Dev_datatype_dup_hook(ptr) MPIDI_PAMI_datatype_dup_hook(ptr)
#endif
......@@ -59,7 +59,8 @@ mpi_core_sources += \
src/mpid/pamid/src/mpid_imrecv.c \
src/mpid/pamid/src/mpid_improbe.c \
src/mpid/pamid/src/mpid_aint.c \
src/mpid/pamid/src/mpidi_nbc_sched.c
src/mpid/pamid/src/mpidi_nbc_sched.c \
src/mpid/pamid/src/mpidi_pami_datatype.c
if QUEUE_BINARY_SEARCH_SUPPORT
mpi_core_sources += \
......
......@@ -135,6 +135,7 @@ MPIDI_Process_t MPIDI_Process = {
.mpir_nbc = 1,
.numTasks = 0,
.typed_onesided = 0,
};
......@@ -1016,6 +1017,7 @@ MPIDI_PAMI_init(int* rank, int* size, int* threading)
" optimized.num_requests: %u\n"
" mpir_nbc : %u\n"
" numTasks : %u\n",
" typed_onesided : %u\n",
MPIDI_Process.verbose,
MPIDI_Process.statistics,
MPIDI_Process.avail_contexts,
......@@ -1052,7 +1054,8 @@ MPIDI_PAMI_init(int* rank, int* size, int* threading)
MPIDI_Process.optimized.memory,
MPIDI_Process.optimized.num_requests,
MPIDI_Process.mpir_nbc,
MPIDI_Process.numTasks);
MPIDI_Process.numTasks,
MPIDI_Process.typed_onesided);
switch (*threading)
{
case MPI_THREAD_MULTIPLE:
......
......@@ -938,6 +938,11 @@ MPIDI_Env_setup(int rank, int requested)
ENV_Unsigned(names, &MPIDI_Process.mpir_nbc, 1, &found_deprecated_env_var, rank);
}
/* Enable typed PAMI calls for derived types within MPID_Put and MPID_Get. */
{
char* names[] = {"PAMID_TYPED_ONESIDED", NULL};
ENV_Unsigned(names, &MPIDI_Process.typed_onesided, 1, &found_deprecated_env_var, rank);
}
/* Check for deprecated collectives environment variables. These variables are
* used in src/mpid/pamid/src/comm/mpid_selectcolls.c */
{
......
/* begin_generated_IBM_copyright_prolog */
/* */
/* This is an automatically generated copyright prolog. */
/* After initializing, DO NOT MODIFY OR MOVE */
/* --------------------------------------------------------------- */
/* Licensed Materials - Property of IBM */
/* Blue Gene/Q 5765-PER 5765-PRP */
/* */
/* (C) Copyright IBM Corp. 2011, 2012 All Rights Reserved */
/* US Government Users Restricted Rights - */
/* Use, duplication, or disclosure restricted */
/* by GSA ADP Schedule Contract with IBM Corp. */
/* */
/* --------------------------------------------------------------- */
/* */
/* end_generated_IBM_copyright_prolog */
/* (C)Copyright IBM Corp. 2007, 2011 */
/**
* \file src/mpidi_pami_datatype.c
* \brief pami_type_t datatype hooks
*/
#include <pami.h>
#include <mpidimpl.h>
/**
 * \brief Create the PAMI datatype representation of an MPI Datatype during commit.
 *
 * Significant performance improvements can be realized for one-sided communication
 * utilizing the PAMI_Rput_typed and PAMI_Rget_typed interface, which requires a
 * PAMI representation of the MPI Datatype.
 *
 * \param[in] ptr  Handle of the MPI datatype being committed.
 */
void MPIDI_PAMI_datatype_commit_hook (MPI_Datatype *ptr)
{
  /* Only build the PAMI datatype when the PAMID typed one-sided optimization
   * (PAMID_TYPED_ONESIDED) is enabled; otherwise this hook is a no-op.
   */
  if (MPIDI_Process.typed_onesided == 1) {
    MPID_Datatype *datatype_ptr;
    MPID_Datatype_get_ptr(*ptr, datatype_ptr);
    pami_result_t pami_dtop_result;
    datatype_ptr->device_datatype = (pami_type_t *) MPIU_Malloc(sizeof(pami_type_t));
    MPIU_Assert(datatype_ptr->device_datatype != NULL);
    pami_dtop_result = PAMI_Type_create ((pami_type_t *)datatype_ptr->device_datatype);
    MPIU_Assert(pami_dtop_result == PAMI_SUCCESS);

    /* Flatten the (possibly non-contiguous) MPI datatype into parallel arrays
     * of byte offsets and chunk sizes describing the contiguous pieces.
     */
    MPI_Aint *dt_offset_array = (MPI_Aint *) MPIU_Malloc(datatype_ptr->max_contig_blocks * sizeof(MPI_Aint));
    MPI_Aint *dt_size_array   = (MPI_Aint *) MPIU_Malloc(datatype_ptr->max_contig_blocks * sizeof(MPI_Aint));
    MPIU_Assert((dt_offset_array != NULL) && (dt_size_array != NULL));
    MPI_Aint dt_array_len = datatype_ptr->max_contig_blocks;
    int rc = MPIR_Type_flatten(*ptr, dt_offset_array, dt_size_array, &dt_array_len);
    /* rc was previously assigned and ignored; the arrays are only valid on success. */
    MPIU_Assert(rc == MPI_SUCCESS);

    /* Build the PAMI datatype one contiguous chunk at a time via
     * PAMI_Type_add_simple.  The offset handed to PAMI is the delta from the
     * previous chunk's starting offset (absolute for the first chunk) --
     * this mirrors how the PAMI type cursor is advanced here; confirm against
     * the PAMI_Type_add_simple cursor semantics if modifying.
     */
    int i;
    for (i = 0; i < dt_array_len; i++) {
      size_t num_bytes_this_entry = dt_size_array[i];
      size_t cursor_offset;
      if (i == 0)
        cursor_offset = (size_t) dt_offset_array[i];
      else
        cursor_offset = (size_t) dt_offset_array[i] - (size_t) dt_offset_array[i-1];
      pami_dtop_result = PAMI_Type_add_simple (*(pami_type_t*)(datatype_ptr->device_datatype), num_bytes_this_entry, cursor_offset, 1, 0);
      MPIU_Assert(pami_dtop_result == PAMI_SUCCESS);
    }

    /* Complete the PAMI datatype and release the scratch arrays. */
    pami_dtop_result = PAMI_Type_complete (*(pami_type_t*)(datatype_ptr->device_datatype),1);
    MPIU_Assert(pami_dtop_result == PAMI_SUCCESS);
    MPIU_Free(dt_offset_array);
    MPIU_Free(dt_size_array);
  }
  return;
}
/**
 * \brief Destroy the PAMI datatype representation of an MPI Datatype.
 *
 * \param[in] ptr  Pointer to the MPID_Datatype whose device representation
 *                 should be torn down.
 */
void MPIDI_PAMI_datatype_destroy_hook (MPID_Datatype *ptr)
{
  /* If a PAMI datatype was created for this committed datatype, destroy it
   * now that the last reference to the MPID_Datatype is being released.
   * NOTE(review): device_datatype is only ever assigned in the commit/dup
   * hooks; presumably the MPID_Datatype is zero-initialized at creation --
   * confirm, otherwise this truth test reads an indeterminate pointer for
   * datatypes committed before typed_onesided was enabled.
   */
  if ((MPIDI_Process.typed_onesided == 1) && (ptr->is_committed)) {
    if (ptr->device_datatype) {
      pami_result_t pami_dtop_result;
      pami_dtop_result = PAMI_Type_destroy ((pami_type_t *)ptr->device_datatype);
      MPIU_Assert(pami_dtop_result == PAMI_SUCCESS);
      MPIU_Free(ptr->device_datatype);
      /* Clear the pointer so a repeated invocation cannot double-destroy/free. */
      ptr->device_datatype = NULL;
    }
  }
}
/**
 * \brief Create the PAMI datatype representation of an MPI Datatype during dup.
 *
 * Significant performance improvements can be realized for one-sided communication
 * utilizing the PAMI_Rput_typed and PAMI_Rget_typed interface, which requires a
 * PAMI representation of the MPI Datatype.
 *
 * \param[in] ptr  Handle of the duplicated MPI datatype.
 */
void MPIDI_PAMI_datatype_dup_hook (MPI_Datatype *ptr)
{
  /* Only build the PAMI datatype when the PAMID typed one-sided optimization
   * (PAMID_TYPED_ONESIDED) is enabled; otherwise this hook is a no-op.
   */
  if (MPIDI_Process.typed_onesided == 1) {
    MPID_Datatype *datatype_ptr;
    MPID_Datatype_get_ptr(*ptr, datatype_ptr);
    pami_result_t pami_dtop_result;
    datatype_ptr->device_datatype = (pami_type_t *) MPIU_Malloc(sizeof(pami_type_t));
    MPIU_Assert(datatype_ptr->device_datatype != NULL);
    pami_dtop_result = PAMI_Type_create ((pami_type_t *)datatype_ptr->device_datatype);
    MPIU_Assert(pami_dtop_result == PAMI_SUCCESS);

    /* Flatten the (possibly non-contiguous) MPI datatype into parallel arrays
     * of byte offsets and chunk sizes describing the contiguous pieces.
     */
    MPI_Aint *dt_offset_array = (MPI_Aint *) MPIU_Malloc(datatype_ptr->max_contig_blocks * sizeof(MPI_Aint));
    MPI_Aint *dt_size_array   = (MPI_Aint *) MPIU_Malloc(datatype_ptr->max_contig_blocks * sizeof(MPI_Aint));
    MPIU_Assert((dt_offset_array != NULL) && (dt_size_array != NULL));
    MPI_Aint dt_array_len = datatype_ptr->max_contig_blocks;
    int rc = MPIR_Type_flatten(*ptr, dt_offset_array, dt_size_array, &dt_array_len);
    /* rc was previously assigned and ignored; the arrays are only valid on success. */
    MPIU_Assert(rc == MPI_SUCCESS);

    /* Build the PAMI datatype one contiguous chunk at a time via
     * PAMI_Type_add_simple.  The offset handed to PAMI is the delta from the
     * previous chunk's starting offset (absolute for the first chunk) --
     * this mirrors how the PAMI type cursor is advanced here; confirm against
     * the PAMI_Type_add_simple cursor semantics if modifying.
     */
    int i;
    for (i = 0; i < dt_array_len; i++) {
      size_t num_bytes_this_entry = dt_size_array[i];
      size_t cursor_offset;
      if (i == 0)
        cursor_offset = (size_t) dt_offset_array[i];
      else
        cursor_offset = (size_t) dt_offset_array[i] - (size_t) dt_offset_array[i-1];
      pami_dtop_result = PAMI_Type_add_simple (*(pami_type_t*)(datatype_ptr->device_datatype), num_bytes_this_entry, cursor_offset, 1, 0);
      MPIU_Assert(pami_dtop_result == PAMI_SUCCESS);
    }

    /* Complete the PAMI datatype and release the scratch arrays. */
    pami_dtop_result = PAMI_Type_complete (*(pami_type_t*)(datatype_ptr->device_datatype),1);
    MPIU_Assert(pami_dtop_result == PAMI_SUCCESS);
    MPIU_Free(dt_offset_array);
    MPIU_Free(dt_size_array);
  }
  return;
}
......@@ -22,7 +22,6 @@
#include "mpidi_onesided.h"
#include "mpidi_util.h"
static inline int
MPIDI_Get_use_pami_rget(pami_context_t context, MPIDI_Win_request * req)
__attribute__((__always_inline__));
......@@ -60,6 +59,39 @@ MPIDI_Get(pami_context_t context,
static inline int
MPIDI_Get_use_pami_rget(pami_context_t context, MPIDI_Win_request * req)
{
int use_typed_rdma = 0;
if (!req->target.dt.contig || !req->origin.dt.contig) {
use_typed_rdma = 0;
if (MPIDI_Process.typed_onesided == 1)
use_typed_rdma = 1;
}
if (use_typed_rdma) {
pami_result_t rc;
pami_rget_typed_t params;
/* params need to zero out to avoid passing garbage to PAMI */
params=zero_rget_typed_parms;
params.rma.dest=req->dest;
params.rma.hints.buffer_registered = PAMI_HINT_ENABLE;
params.rma.hints.use_rdma = PAMI_HINT_ENABLE;
params.rma.bytes = req->target.dt.size;
params.rma.cookie = req;
params.rma.done_fn = MPIDI_Win_DoneCB;
params.rdma.local.mr=&req->origin.memregion;
params.rdma.remote.mr=&req->win->mpid.info[req->target.rank].memregion;
params.rdma.remote.offset= req->offset;
params.rdma.local.offset = req->state.local_offset;
params.type.local = *(pami_type_t *)(req->origin.dt.pointer->device_datatype);
params.type.remote = *(pami_type_t *)(req->target.dt.pointer->device_datatype);
rc = PAMI_Rget_typed(context, &params);
MPID_assert(rc == PAMI_SUCCESS);
}
else {
pami_result_t rc;
pami_rget_simple_t params;
......@@ -110,6 +142,7 @@ MPIDI_Get_use_pami_rget(pami_context_t context, MPIDI_Win_request * req)
++req->state.index;
}
}
}
return PAMI_SUCCESS;
}
......@@ -203,6 +236,7 @@ MPID_Get(void *origin_addr,
MPI_Datatype target_datatype,
MPID_Win *win)
{
int mpi_errno = MPI_SUCCESS;
int shm_locked=0;
void *target_addr;
......@@ -293,10 +327,17 @@ MPID_Get(void *origin_addr,
req->target.rank = target_rank;
if (req->origin.dt.contig)
/* Only pack the origin data if the origin is non-contiguous and we are using the simple PAMI_Rget.
* If we are using the typed PAMI_Rget_typed use the origin address as is, if we are using the simple
* PAMI_Rget with contiguous data use the origin address with the lower-bound adjustment.
*/
if (req->origin.dt.contig || (!req->origin.dt.contig && (MPIDI_Process.typed_onesided == 1)))
{
req->buffer_free = 0;
req->buffer = origin_addr + req->origin.dt.true_lb;
if ((req->origin.dt.contig && req->target.dt.contig && (MPIDI_Process.typed_onesided == 1)) || (!(MPIDI_Process.typed_onesided == 1))) // use simple rput
req->buffer = (void *) ((uintptr_t) origin_addr + req->origin.dt.true_lb);
else
req->buffer = (void *) ((uintptr_t) origin_addr);
}
else
{
......@@ -356,8 +397,14 @@ MPID_Get(void *origin_addr,
MPIDI_Win_datatype_map(&req->target.dt);
win->mpid.sync.total += req->target.dt.num_contig;
if ((!req->target.dt.contig || !req->origin.dt.contig) && (MPIDI_Process.typed_onesided == 1))
/* If the datatype is non-contiguous and the PAMID typed_onesided optimization
* is enabled then we will be using the typed interface and will only make 1 call.
*/
win->mpid.sync.total = 1;
else
win->mpid.sync.total += req->target.dt.num_contig;
/* The pamid one-sided design requires context post in order to handle the
* case where the number of pending rma operation exceeds the
......
......@@ -61,6 +61,39 @@ MPIDI_Put(pami_context_t context,
static inline int
MPIDI_Put_use_pami_rput(pami_context_t context, MPIDI_Win_request * req)
{
int use_typed_rdma = 0;
if (!req->target.dt.contig || !req->origin.dt.contig) {
use_typed_rdma = 0;
if (MPIDI_Process.typed_onesided == 1)
use_typed_rdma = 1;
}
if (use_typed_rdma) {
pami_result_t rc;
pami_rput_typed_t params;
/* params need to zero out to avoid passing garbage to PAMI */
params=zero_rput_typed_parms;
params.rma.dest=req->dest;
params.rma.hints.buffer_registered = PAMI_HINT_ENABLE;
params.rma.hints.use_rdma = PAMI_HINT_ENABLE;
params.rma.bytes = req->target.dt.size;
params.rma.cookie = req;
params.rma.done_fn = NULL;
params.rdma.local.mr=&req->origin.memregion;
params.rdma.remote.mr=&req->win->mpid.info[req->target.rank].memregion;
params.rdma.remote.offset= req->offset;
params.rdma.local.offset = req->state.local_offset;
params.put.rdone_fn= MPIDI_Win_DoneCB;
params.type.local = *(pami_type_t *)(req->origin.dt.pointer->device_datatype);
params.type.remote = *(pami_type_t *)(req->target.dt.pointer->device_datatype);
rc = PAMI_Rput_typed(context, &params);
MPID_assert(rc == PAMI_SUCCESS);
}
else {
pami_result_t rc;
pami_rput_simple_t params;
/* params need to zero out to avoid passing garbage to PAMI */
......@@ -113,6 +146,7 @@ MPIDI_Put_use_pami_rput(pami_context_t context, MPIDI_Win_request * req)
++req->state.index;
}
}
}
return PAMI_SUCCESS;
}
......@@ -296,10 +330,17 @@ MPID_Put(const void *origin_addr,
req->target.rank = target_rank;
if (req->origin.dt.contig)
/* Only pack the origin data if the origin is non-contiguous and we are using the simple PAMI_Rput.
* If we are using the typed PAMI_Rput_typed use the origin address as-is, if we are using the simple
* PAMI_Rput with contiguous data use the origin address with the lower-bound adjustment.
*/
if (req->origin.dt.contig || (!req->origin.dt.contig && (MPIDI_Process.typed_onesided == 1)))
{
req->buffer_free = 0;
req->buffer = (void *) ((uintptr_t) origin_addr + req->origin.dt.true_lb);
if ((req->origin.dt.contig && req->target.dt.contig && (MPIDI_Process.typed_onesided == 1)) || (!(MPIDI_Process.typed_onesided == 1)))
req->buffer = (void *) ((uintptr_t) origin_addr + req->origin.dt.true_lb);
else
req->buffer = (void *) ((uintptr_t) origin_addr);
}
else
{
......@@ -357,7 +398,13 @@ MPID_Put(const void *origin_addr,
MPIDI_Win_datatype_map(&req->target.dt);
win->mpid.sync.total += req->target.dt.num_contig;
if ((!req->target.dt.contig || !req->origin.dt.contig) && (MPIDI_Process.typed_onesided == 1))
/* If the datatype is non-contiguous and the PAMID typed_onesided optimization
* is enabled then we will be using the typed interface and will only make 1 call.
*/
win->mpid.sync.total = 1;
else
win->mpid.sync.total += req->target.dt.num_contig;
/* The pamid one-sided design requires context post in order to handle the
* case where the number of pending rma operation exceeds the
......
......@@ -29,6 +29,8 @@ pami_rget_simple_t zero_rget_parms;
pami_get_simple_t zero_get_parms;
pami_rput_simple_t zero_rput_parms;
pami_put_simple_t zero_put_parms;
pami_rput_typed_t zero_rput_typed_parms;
pami_rget_typed_t zero_rget_typed_parms;
pami_send_t zero_send_parms;
pami_send_immediate_t zero_send_immediate_parms;
pami_recv_t zero_recv_parms;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment