Commit 1c5c5945 authored by William Gropp, committed by Rob Latham
Browse files

Address many of the perf problems in #1788

This adds a step that optimizes the dataloop representation, primarily
merging CONTIG blocks with the parent dataloop, such as a VECTOR loop.
It also performs a change of VECTOR of CONTIG with extent > size to
VECTOR of VECTOR; this reduces the stack operations needed to perform
the move.

This is a temporary fix for the dataloop performance.  See the DAME
wiki page (http://wiki.mpich.org/mpich/index.php/DAME) for current work
on a replacement, higher performance datatype system.

A partial, but not complete, fix for ticket #1788
Signed-off-by: Rob Latham <robl@mcs.anl.gov>
parent 514214bc
......@@ -21,7 +21,8 @@ mpi_core_sources += \
src/mpid/common/datatype/dataloop/segment_count.c \
src/mpid/common/datatype/dataloop/segment_flatten.c \
src/mpid/common/datatype/dataloop/segment_packunpack.c \
src/mpid/common/datatype/dataloop/subarray_support.c
src/mpid/common/datatype/dataloop/subarray_support.c \
src/mpid/common/datatype/dataloop/dataloop_optimize.c
# several headers are included by the rest of MPICH
AM_CPPFLAGS += -I$(top_srcdir)/src/mpid/common/datatype
......
......@@ -94,5 +94,16 @@ DLOOP_Count PREPEND_PREFIX(Type_blockindexed_count_contig)(DLOOP_Count count,
const void *disp_array,
int dispinbytes,
DLOOP_Offset old_extent);
/* Optimize the dataloop representation rooted at dlpOld_p.  The type commit
   path calls this (as MPID_Dataloop_optimize) with level 0 when
   MPIR_CVAR_DATALOOP_OPTIMIZE is true.
   NOTE(review): the meaning of the int return value and of "level"
   (presumably the nesting depth of the loop being processed) is not visible
   from this header -- confirm against dataloop_optimize.c. */
int PREPEND_PREFIX(Dataloop_optimize)( DLOOP_Dataloop *dlpOld_p, int level );
/* Estimate the complexity of a single dataloop, returning two values through
   the MPI_Aint pointers.
   NOTE(review): presumably the outputs mirror those of
   Dataloop_est_struct_complexity below (element estimate, then descriptor
   estimate) -- confirm against the definition. */
int PREPEND_PREFIX(Dataloop_est_complexity)(DLOOP_Dataloop *,
MPI_Aint *, MPI_Aint *);
/* Estimate the complexity of a struct datatype.  Dataloop_create_struct
   passes, in order: the struct count, the block length array, the old type
   array, and two output pointers it names nElms and nDesc; it then compares
   those two outputs to decide whether to build a flattened representation.
   NOTE(review): nElms/nDesc are presumably "number of elements moved" and
   "number of descriptor entries" -- confirm against the definition. */
int PREPEND_PREFIX(Dataloop_est_struct_complexity)( int,
const int [],
const DLOOP_Type [],
MPI_Aint *,
MPI_Aint * );
/* Developer aid for printing a dataloop.  The type commit path calls this
   when optimization is disabled so that the final dataloop can still be
   output; per the comment at that call site it does nothing unless that
   printing has been enabled. */
void PREPEND_PREFIX(Dataloop_debug_print)( DLOOP_Dataloop *dp );
#endif
......@@ -11,6 +11,57 @@
#error "You must explicitly include a header that sets the PREPEND_PREFIX and includes dataloop_parts.h"
#endif
/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===
categories :
- name : DATATYPE
description : Datatype optimization parameters
cvars:
- name : MPIR_CVAR_DATALOOP_OPTIMIZE
category : DATATYPE
type : boolean
default : true
class : none
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_LOCAL
description : >-
By default, the internal representation of an MPI datatype that
is used by MPICH to move data is very similar to the original
description of the datatype. If this flag is true, additional
optimizations are used to improve the performance of datatypes.
- name : MPIR_CVAR_DATALOOP_FLATTEN
category : DATATYPE
type : boolean
class : none
default : true
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_LOCAL
description : >-
If true, attempt to "flatten" the internal representation of
MPI struct datatypes (created with MPI_Type_create_struct).
- name : MPIR_CVAR_DATALOOP_FLATTEN_MULT
category : DATATYPE
type : int
class : none
default : 2
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_LOCAL
description : >-
Flattening an MPI struct datatype does not always improve
performance. This parameter is a threshold that is used in
comparing the size of the description with the amount of data
moved. Larger values make it more likely that a struct datatype
will be flattened. The default value is adequate for flattening
simple structs, and will usually avoid flattening structs
containing vectors or block-indexed data.
=== END_MPI_T_CVAR_INFO_BLOCK ===
*/
static int DLOOP_Dataloop_create_struct_memory_error(void);
static int DLOOP_Dataloop_create_unique_type_struct(DLOOP_Count count,
const int *blklens,
......@@ -238,19 +289,37 @@ int PREPEND_PREFIX(Dataloop_create_struct)(DLOOP_Count count,
* if caller asked for homogeneous or all bytes representation,
* flatten the type and store it as an indexed type so that
* there are no branches in the dataloop tree.
*
* Note that this is not always an optimization - for example,
* replacing two long block_indexed with one longer indexed (with
* the additional blockcount array) is likely to be slower, because
* of the additional memory motion required.
*/
if ((flag == DLOOP_DATALOOP_HOMOGENEOUS) ||
(flag == DLOOP_DATALOOP_ALL_BYTES))
{
return DLOOP_Dataloop_create_flattened_struct(count,
blklens,
disps,
oldtypes,
dlp_p,
dlsz_p,
dldepth_p,
flag);
}
if (MPIR_CVAR_DATALOOP_FLATTEN && (
(flag == DLOOP_DATALOOP_HOMOGENEOUS) ||
(flag == DLOOP_DATALOOP_ALL_BYTES) ))
{
MPI_Aint nElms = 0, nDesc = 0;
PREPEND_PREFIX(Dataloop_est_struct_complexity)( count,
blklens,
oldtypes,
&nElms,
&nDesc );
/* Only convert to flattened if the flattened description
is likely to be more efficient. The magic number of 24 was
determined empirically. */
if ( nDesc * 24 * MPIR_CVAR_DATALOOP_FLATTEN_MULT > nElms) {
return DLOOP_Dataloop_create_flattened_struct(count,
blklens,
disps,
oldtypes,
dlp_p,
dlsz_p,
dldepth_p,
flag);
}
}
/* scan through types and gather derived type info */
for (i=0; i < count; i++)
......
This diff is collapsed.
......@@ -15,6 +15,28 @@
/* NOTE: bufp values are unused, ripe for removal */
/* #define MPICH_DEBUG_SEGMENT_MOVE */
/* TODO: Consider integrating this with the general debug support. */
/* Note: This does not use the CVAR support for the environment variable
because (a) this is temporary code and (b) it is expert developer
only */
#ifdef MPICH_DEBUG_SEGMENT_MOVE
/* Tri-state switch for the segment-move trace output:
 *   -1  MPICH_DATALOOP_PRINT has not been examined yet
 *    0  tracing is off
 *    1  tracing is on */
static int printSegment = -1;

/* Read MPICH_DATALOOP_PRINT from the environment and latch the result in
 * printSegment.  Only the exact strings "yes" and "YES" turn tracing on;
 * an unset variable or any other value turns it off. */
static void setPrint( void )
{
    const char *value = getenv( "MPICH_DATALOOP_PRINT" );
    int enabled = 0;

    if (value != NULL) {
        enabled = (strcmp(value, "yes") == 0) || (strcmp(value, "YES") == 0);
    }
    printSegment = enabled;
}

/* Execute the statement _a only when tracing is enabled.  The environment
 * is consulted lazily, on the first use of the macro. */
#define DBG_SEGMENT(_a) \
    do { \
        if (printSegment < 0) { setPrint(); } \
        if (printSegment) { _a; } \
    } while (0)
#else
/* Tracing compiled out: the argument is discarded without being evaluated. */
#define DBG_SEGMENT(_a)
#endif
int PREPEND_PREFIX(Segment_contig_m2m)(DLOOP_Offset *blocks_p,
DLOOP_Type el_type,
DLOOP_Offset rel_off,
......@@ -52,6 +74,7 @@ void PREPEND_PREFIX(Segment_pack)(DLOOP_Segment *segp,
{
struct PREPEND_PREFIX(m2m_params) params; /* defined in dataloop_parts.h */
DBG_SEGMENT(printf( "Segment_pack...\n" ));
/* experimenting with discarding buf value in the segment, keeping in
* per-use structure instead. would require moving the parameters around a
* bit.
......@@ -77,6 +100,7 @@ void PREPEND_PREFIX(Segment_unpack)(DLOOP_Segment *segp,
{
struct PREPEND_PREFIX(m2m_params) params;
DBG_SEGMENT(printf( "Segment_unpack...\n" ));
/* experimenting with discarding buf value in the segment, keeping in
* per-use structure instead. would require moving the parameters around a
* bit.
......@@ -110,6 +134,8 @@ int PREPEND_PREFIX(Segment_contig_m2m)(DLOOP_Offset *blocks_p,
DLOOP_Handle_get_size_macro(el_type, el_size);
size = *blocks_p * el_size;
DBG_SEGMENT(printf( "element type = %lx\n", (long)el_type ));
DBG_SEGMENT(printf( "contig m2m: elsize = %d, size = %d\n", (int)el_size, (int)size ));
#ifdef MPID_SU_VERBOSE
dbg_printf("\t[contig unpack: do=" DLOOP_OFFSET_FMT_DEC_SPEC ", dp=%x, bp=%x, sz=" DLOOP_OFFSET_FMT_DEC_SPEC ", blksz=" DLOOP_OFFSET_FMT_DEC_SPEC "]\n",
rel_off,
......@@ -165,6 +191,7 @@ int PREPEND_PREFIX(Segment_vector_m2m)(DLOOP_Offset *blocks_p,
DLOOP_Ensure_Offset_fits_in_pointer((DLOOP_VOID_PTR_CAST_TO_OFFSET (paramp->userbuf)) + rel_off);
cbufp = (char*) paramp->userbuf + rel_off;
DLOOP_Handle_get_size_macro(el_type, el_size);
DBG_SEGMENT(printf( "vector m2m: elsize = %d, count = %d, stride = %d, blocksize = %d\n", (int)el_size, (int)count, (int)stride, (int)blksz ));
whole_count = (DLOOP_Count)((blksz > 0) ? (*blocks_p / (DLOOP_Offset) blksz) : 0);
blocks_left = (DLOOP_Count)((blksz > 0) ? (*blocks_p % (DLOOP_Offset) blksz) : 0);
......@@ -195,6 +222,9 @@ int PREPEND_PREFIX(Segment_vector_m2m)(DLOOP_Offset *blocks_p,
else {
for (i=0; i < whole_count; i++) {
DLOOP_Memcpy(cbufp, paramp->streambuf, ((DLOOP_Offset) blksz) * el_size);
DBG_SEGMENT(printf("vec: memcpy %p %p %d\n", cbufp,
paramp->streambuf,
(int)(blksz * el_size) ));
/* Ensure that pointer increment fits in a pointer */
/* streambuf is a pointer (not a displacement) since it is being used for a memory copy */
DLOOP_Ensure_Offset_fits_in_pointer((DLOOP_VOID_PTR_CAST_TO_OFFSET (paramp->streambuf)) +
......@@ -206,6 +236,9 @@ int PREPEND_PREFIX(Segment_vector_m2m)(DLOOP_Offset *blocks_p,
}
if (blocks_left) {
DLOOP_Memcpy(cbufp, paramp->streambuf, ((DLOOP_Offset) blocks_left) * el_size);
DBG_SEGMENT(printf("vec(left): memcpy %p %p %d\n", cbufp,
paramp->streambuf,
(int)(blocks_left * el_size) ));
/* Ensure that pointer increment fits in a pointer */
/* streambuf is a pointer (not a displacement) since
* it is being used for a memory copy */
......@@ -244,6 +277,9 @@ int PREPEND_PREFIX(Segment_vector_m2m)(DLOOP_Offset *blocks_p,
/* Ensure that pointer increment fits in a pointer */
/* streambuf is a pointer (not a displacement) since
* it is being used for a memory copy */
DBG_SEGMENT(printf("vec: memcpy %p %p %d\n",
paramp->streambuf, cbufp,
(int)(blksz * el_size) ));
DLOOP_Ensure_Offset_fits_in_pointer((DLOOP_VOID_PTR_CAST_TO_OFFSET (paramp->streambuf)) +
(DLOOP_Offset) blksz * el_size);
paramp->streambuf += (DLOOP_Offset) blksz * el_size;
......@@ -251,6 +287,9 @@ int PREPEND_PREFIX(Segment_vector_m2m)(DLOOP_Offset *blocks_p,
}
if (blocks_left) {
DLOOP_Memcpy(paramp->streambuf, cbufp, (DLOOP_Offset) blocks_left * el_size);
DBG_SEGMENT(printf("vec(left): memcpy %p %p %d\n",
paramp->streambuf, cbufp,
(int)(blocks_left * el_size) ));
/* Ensure that pointer increment fits in a pointer */
/* streambuf is a pointer (not a displacement) since
* it is being used for a memory copy */
......@@ -282,6 +321,23 @@ int PREPEND_PREFIX(Segment_blkidx_m2m)(DLOOP_Offset *blocks_p,
struct PREPEND_PREFIX(m2m_params) *paramp = v_paramp;
DLOOP_Handle_get_size_macro(el_type, el_size);
DBG_SEGMENT(printf( "blkidx m2m: elsize = %d, count = %d, blocklen = %d\n", (int)el_size, (int)count, (int)blocklen ));
/* If the blocklen * el_size is relatively small, then for
performance reasons, it's important to hoist most of these
tests out of the loop. Ignoring some of the issues of handling
the available buffer size (blocks_left), this should translate
directly into code that looks like this for blocksize == 1
for (i=0; i<count; i++) {
dest[i] = userbuf[offsetarray[i]];
}
where "dest" and "userbuf" are pointers to objects of the correct
size. If blocksize is > 1, then various unrollings are important
until blocksize is large enough to make the overhead of memcpy
negligible. Datatypes such as this are used in LAMMPS, for example.
*/
while (blocks_left) {
char *src, *dest;
......@@ -326,6 +382,7 @@ int PREPEND_PREFIX(Segment_blkidx_m2m)(DLOOP_Offset *blocks_p,
}
else {
DLOOP_Memcpy(dest, src, (DLOOP_Offset) blocklen * el_size);
DBG_SEGMENT(printf( "blkidx m3m:memcpy(%p,%p,%d)\n",dest,src,(int)(blocklen*el_size)));
}
/* Ensure that pointer increment fits in a pointer */
......@@ -358,6 +415,7 @@ int PREPEND_PREFIX(Segment_index_m2m)(DLOOP_Offset *blocks_p,
char *cbufp;
struct PREPEND_PREFIX(m2m_params) *paramp = v_paramp;
DBG_SEGMENT(printf( "index m2m: elsize = %d, count = %d\n", (int)el_size, (int)count ));
DLOOP_Handle_get_size_macro(el_type, el_size);
while (blocks_left) {
......
......@@ -19,6 +19,7 @@ Output Parameters:
Return Value:
0 on success, -1 on failure.
@*/
int MPID_Type_commit(MPI_Datatype *datatype_p)
{
int mpi_errno=MPI_SUCCESS;
......@@ -57,9 +58,21 @@ int MPID_Type_commit(MPI_Datatype *datatype_p)
MPIU_DBG_PRINTF(("# contig blocks = %d\n",
(int) datatype_ptr->max_contig_blocks));
if (MPIR_CVAR_DATALOOP_OPTIMIZE) {
MPID_Dataloop_optimize(datatype_ptr->dataloop, 0 );
}
else {
/* This allows the developer to output the final dataloops
in the case where the dataloops are not optimized.
It does nothing if that printing is not enabled.
*/
MPID_Dataloop_debug_print( datatype_ptr->dataloop );
}
#if 0
MPIDI_Dataloop_dot_printf(datatype_ptr->dataloop, 0, 1);
#endif
}
return mpi_errno;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment