Commit 38ef5818 authored by Rob Latham's avatar Rob Latham
Browse files

a partial round of datatype optimizations



Some datatype performance tests in the MPICH test suite fail:
(perf/twovec,  perf/nestvec, perf/nestvec2, perf/indexperf,
perf/transp-datatype).

This changeset introduces a few optimizations that operate on the
dataloop representation to make it more performant.  perf/indexperf
should still fail under these changes.

Original-author: Bill Gropp <wgropp@illinois.edu>

See #1788, for which this resolves some but not all performance issues.
Signed-off-by: Rob Latham <robl@mcs.anl.gov>
parent 4e1b470d
......@@ -21,7 +21,8 @@ lib_lib@MPILIBNAME@_la_SOURCES += \
src/mpid/common/datatype/dataloop/segment_count.c \
src/mpid/common/datatype/dataloop/segment_flatten.c \
src/mpid/common/datatype/dataloop/segment_packunpack.c \
src/mpid/common/datatype/dataloop/subarray_support.c
src/mpid/common/datatype/dataloop/subarray_support.c \
src/mpid/common/datatype/dataloop/dataloop_optimize.c
# several headers are included by the rest of MPICH
AM_CPPFLAGS += -I$(top_srcdir)/src/mpid/common/datatype
......
......@@ -54,7 +54,7 @@
/*@
Dataloop_free - deallocate the resources used to store a dataloop
Input Parameters:
Input/output Parameters:
. dataloop - pointer to dataloop structure
@*/
void PREPEND_PREFIX(Dataloop_free)(DLOOP_Dataloop **dataloop)
......@@ -483,7 +483,7 @@ void PREPEND_PREFIX(Dataloop_alloc_and_copy)(int kind,
/*@
Dataloop_struct_alloc - allocate the resources used to store a dataloop and
copy in old dataloop as appropriate. this version
copy in old dataloop as appropriate. This version
is specifically for use when a struct dataloop is
being created; the space to hold old dataloops in
this case must be described back to the
......
......@@ -94,5 +94,14 @@ DLOOP_Count PREPEND_PREFIX(Type_blockindexed_count_contig)(DLOOP_Count count,
const void *disp_array,
int dispinbytes,
DLOOP_Offset old_extent);
int PREPEND_PREFIX(Dataloop_optimize)( DLOOP_Dataloop *dlpOld_p );
int PREPEND_PREFIX(Dataloop_est_complexity)(DLOOP_Dataloop *,
MPI_Aint *, MPI_Aint *);
int PREPEND_PREFIX(Dataloop_est_struct_complexity)( int,
const int [],
const DLOOP_Type [],
MPI_Aint *,
MPI_Aint * );
#endif
......@@ -11,14 +11,17 @@
Dataloop_contiguous - create the dataloop representation for a
contiguous datatype
Arguments:
Input Parameters:
+ int icount,
. MPI_Datatype oldtype,
. DLOOP_Dataloop **dlp_p,
. int *dlsz_p,
. int *dldepth_p,
. DLOOP_Type oldtype
- int flag
Output Parameters:
+ DLOOP_Dataloop **dlp_p,
. DLOOP_Size *dlsz_p,
- int *dldepth_p,
.N Errors
.N Returns 0 on success, -1 on failure.
@*/
......
......@@ -11,6 +11,57 @@
#error "You must explicitly include a header that sets the PREPEND_PREFIX and includes dataloop_parts.h"
#endif
/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===
categories :
- name : DATATYPE
description : Datatype optimization parameters
cvars:
- name : MPIR_CVAR_DATALOOP_OPTIMIZE
category : DATATYPE
type : boolean
default : true
class : none
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
description : >-
By default, the internal representation of an MPI datatype that
is used by MPICH to move data is very similar to the original
description of the datatype. If this flag is true, additional
optimizations are used to improve the performance of datatypes.
- name : MPIR_CVAR_DATALOOP_FLATTEN
category : DATATYPE
type : boolean
class : none
default : true
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
description : >-
If true, attempt to "flatten" the internal representation of
MPI struct datatypes (created with MPI_Type_create_struct).
- name : MPIR_CVAR_DATALOOP_FLATTEN_MULT
category : DATATYPE
type : int
class : none
default : 2
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
description : >-
Flattening an MPI struct datatype does not always improve
performance. This parameter is a threshold that is used in
comparing the size of the description with the amount of data
moved. Larger values make it more likely that a struct datatype
will be flattened. The default value is adequate for flattening
simple structs, and will usually avoid flattening structs
containing vectors or block-indexed data.
=== END_MPI_T_CVAR_INFO_BLOCK ===
*/
static int DLOOP_Dataloop_create_struct_memory_error(void);
static int DLOOP_Dataloop_create_unique_type_struct(DLOOP_Count count,
const int *blklens,
......@@ -238,19 +289,37 @@ int PREPEND_PREFIX(Dataloop_create_struct)(DLOOP_Count count,
* if caller asked for homogeneous or all bytes representation,
* flatten the type and store it as an indexed type so that
* there are no branches in the dataloop tree.
*
* Note that this is not always an optimization - for example,
* replacing two long block_indexed with one longer indexed (with
* the additional blockcount array) is likely to be slower, because
* of the additional memory motion required.
*/
if ((flag == DLOOP_DATALOOP_HOMOGENEOUS) ||
(flag == DLOOP_DATALOOP_ALL_BYTES))
{
return DLOOP_Dataloop_create_flattened_struct(count,
blklens,
disps,
oldtypes,
dlp_p,
dlsz_p,
dldepth_p,
flag);
}
if (MPIR_CVAR_DATALOOP_FLATTEN && (
(flag == DLOOP_DATALOOP_HOMOGENEOUS) ||
(flag == DLOOP_DATALOOP_ALL_BYTES) ))
{
MPI_Aint nElms = 0, nDesc = 0;
PREPEND_PREFIX(Dataloop_est_struct_complexity)( count,
blklens,
oldtypes,
&nElms,
&nDesc );
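/* Only convert to flattened if the flattened description
is likely to be more efficient. We estimate this by comparing
the number of descriptors (nDesc), weighted by a fixed factor of
24 and by MPIR_CVAR_DATALOOP_FLATTEN_MULT, against the amount of
data moved (nElms). */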
if ( nDesc * 24 * MPIR_CVAR_DATALOOP_FLATTEN_MULT > nElms) {
return DLOOP_Dataloop_create_flattened_struct(count,
blklens,
disps,
oldtypes,
dlp_p,
dlsz_p,
dldepth_p,
flag);
}
}
/* scan through types and gather derived type info */
for (i=0; i < count; i++)
......
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* (C) 2001 by Argonne National Laboratory.
* See COPYRIGHT in top-level directory.
*/
#include "dataloop.h"
/* Debug printing support is compiled in unconditionally by this define;
   whether anything is actually printed is decided at run time by the
   flags below. */
#define MPICH_DEBUG_DATALOOP
#ifdef MPICH_DEBUG_DATALOOP
/* Current nesting depth; Dataloop_optimize bumps it around each recursive
   call and passes it to the print helpers as the indentation level. */
static int level = 0;
/* When nonzero, Dataloop_optimize prints every dataloop it visits. */
static int printDataloop = 0;
/* When nonzero, Dataloop_optimize prints each dataloop it has rewritten. */
static int printIfOptimized = 0;
/* Print format:
(spaces for level).(el_size,el_extent,el_type)(count)....
*/
/* Forward declarations for the print helpers defined below.  All take the
   indentation level first. */
static void dl_print_dataloop( int, DLOOP_Dataloop * );
static void dl_print_contig( int, DLOOP_Dataloop * );
static void dl_print_vector( int, DLOOP_Dataloop * );
static void dl_print_blockindexed( int, DLOOP_Dataloop * );
static void dl_print_struct( int, DLOOP_Dataloop * );
static void dl_print( int, const char * );
/* Write the indentation for nesting level l to stdout: two spaces per
   level. */
static void dl_print_tab( int l )
{
    int remaining = 2*l;

    while (remaining != 0) {
        printf( "%c", ' ' );
        remaining--;
    }
}
/* Print the fields common to every dataloop kind, in the form
   (el_size,el_extent,el_type)(count), with el_type shown in hex.
   No newline is written. */
static void dl_print_base( DLOOP_Dataloop *dp )
{
    long size   = (long)dp->el_size;
    long extent = (long)dp->el_extent;
    long type   = (long)dp->el_type;
    long count  = (long)dp->loop_params.count;

    printf( "(%ld,%ld,%lx)(%ld)", size, extent, type, count );
}
/* Print string s to stdout, preceded by the indentation for level l
   (two spaces per level). */
static void dl_print( int l, const char *s )
{
    int pad;

    /* Same indentation rule as dl_print_tab, written out inline */
    for (pad = 2*l; pad != 0; pad--) {
        printf( "%c", ' ' );
    }
    printf( "%s", s );
}
/* Print a one-line description of a contig dataloop at indentation
   level l: the label "CONTIG " followed by the common fields. */
static void dl_print_contig( int l, DLOOP_Dataloop *dp )
{
    /* dl_print emits the indentation and then the label */
    dl_print( l, "CONTIG " );
    dl_print_base( dp );
    printf( "\n" );
}
/* Print a one-line description of a vector dataloop at indentation
   level l: the label "VECTOR ", the common fields, then the stride and
   blocksize.

   The stride and blocksize are widened to long before printing (the same
   convention dl_print_base uses for counts and sizes) rather than being
   narrowed to int, so large values are not truncated in the output. */
static void dl_print_vector( int l, DLOOP_Dataloop *dp )
{
    long stride    = (long)dp->loop_params.v_t.stride;
    long blocksize = (long)dp->loop_params.v_t.blocksize;

    dl_print_tab(l);
    printf( "VECTOR " );
    dl_print_base( dp );
    printf( ":Stride %ld Blocksize %ld\n", stride, blocksize );
}
/* Print a one-line description of a blockindexed dataloop at indentation
   level l: the label, the common fields, the blocksize, and up to the
   first 8 offsets in hex.  "..." is appended when more offsets exist than
   were shown.

   Counts are kept in DLOOP_Count and widened to long for printing instead
   of being narrowed to int, so large blocksizes/counts are handled
   correctly. */
static void dl_print_blockindexed( int l, DLOOP_Dataloop *dp )
{
    DLOOP_Count   blocksize = dp->loop_params.bi_t.blocksize;
    DLOOP_Count   total     = dp->loop_params.bi_t.count;
    DLOOP_Offset *offarray  = dp->loop_params.bi_t.offset_array;
    DLOOP_Count   shown     = (total > 8) ? 8 : total;
    DLOOP_Count   i;

    dl_print_tab(l);
    printf( "BLOCKINDEXED " );
    dl_print_base( dp );
    printf( ":Blocksize %ld:", (long)blocksize );
    for (i=0; i<shown; i++) {
        printf( "%lx,", (long)offarray[i] );
    }
    if (total > shown) printf( "..." );
    printf( "\n" );
}
/* Print a one-line description of an indexed dataloop at indentation
   level l: the label, the common fields, the smallest and largest block
   size over all entries, and up to the first 8 (offset,blocksize) pairs.
   "..." is appended when more entries exist than were shown.

   Block sizes and the entry count are kept in DLOOP_Count and widened to
   long for printing instead of being narrowed to int, so the min/max
   report is not truncated for large blocks. */
static void dl_print_indexed( int l, DLOOP_Dataloop *dp )
{
    DLOOP_Count  *blocksizearray = dp->loop_params.i_t.blocksize_array;
    DLOOP_Offset *offarray       = dp->loop_params.i_t.offset_array;
    DLOOP_Count   total          = dp->loop_params.i_t.count;
    DLOOP_Count   shown, i;
    DLOOP_Count   minblock, maxblock;

    dl_print_tab(l);
    printf( "INDEXED " );
    dl_print_base( dp );

    /* Range of block sizes over every entry, not just the ones printed */
    minblock = maxblock = (total > 0) ? blocksizearray[0] : 0;
    for (i=1; i<total; i++) {
        if (blocksizearray[i] > maxblock) maxblock = blocksizearray[i];
        if (blocksizearray[i] < minblock) minblock = blocksizearray[i];
    }
    printf( "blocks in [%ld,%ld]", (long)minblock, (long)maxblock );

    shown = (total > 8) ? 8 : total;
    for (i=0; i<shown; i++) {
        printf( "(%lx,%ld)", (long)offarray[i], (long)blocksizearray[i] );
    }
    if (total > shown) printf( "..." );
    printf( "\n" );
}
/* Print a struct dataloop at indentation level l: a header line with the
   label and common fields, then, for up to the first 8 entries, the
   (offset,blocksize) pair followed by the entry's own dataloop printed one
   level deeper.  "..." is printed when more entries exist than were shown.

   The entry count is read through the struct view of loop_params (s_t),
   matching the arrays used here, rather than through the indexed view,
   and is kept in DLOOP_Count instead of being narrowed to int. */
static void dl_print_struct( int l, DLOOP_Dataloop *dp )
{
    DLOOP_Count     *blocksizearray = dp->loop_params.s_t.blocksize_array;
    DLOOP_Offset    *offarray       = dp->loop_params.s_t.offset_array;
    DLOOP_Dataloop **looparray      = dp->loop_params.s_t.dataloop_array;
    DLOOP_Count      total          = dp->loop_params.s_t.count;
    DLOOP_Count      shown          = (total > 8) ? 8 : total;
    DLOOP_Count      i;

    dl_print_tab(l);
    printf( "STRUCT " );
    dl_print_base( dp );
    printf( "\n" );
    for (i=0; i<shown; i++) {
        dl_print_tab(l+1);
        printf( "(%lx,%ld)", (long)offarray[i], (long)blocksizearray[i] );
        dl_print_dataloop( l+1, looparray[i] );
        printf( "\n" );
    }
    if (total > shown) printf( "...\n" );
}
/* Print dataloop dp at indentation level l.  The indentation and the
   common fields are written first, then the printer for the dataloop's
   kind is invoked; an unrecognized kind produces an "Unknown dataloop
   type" line instead.
   NOTE(review): each kind-specific printer also writes the indentation and
   common fields itself, so they appear twice per line - confirm whether
   that is intended. */
static void dl_print_dataloop( int l, DLOOP_Dataloop *dp )
{
    int kind;

    dl_print_tab( l );
    dl_print_base( dp );

    kind = dp->kind & DLOOP_KIND_MASK;
    if (kind == DLOOP_KIND_CONTIG) {
        dl_print_contig( l, dp );
    }
    else if (kind == DLOOP_KIND_VECTOR) {
        dl_print_vector( l, dp );
    }
    else if (kind == DLOOP_KIND_BLOCKINDEXED) {
        dl_print_blockindexed( l, dp );
    }
    else if (kind == DLOOP_KIND_INDEXED) {
        dl_print_indexed( l, dp );
    }
    else if (kind == DLOOP_KIND_STRUCT) {
        dl_print_struct( l, dp );
    }
    else {
        dl_print( l, "Unknown dataloop type " );
        printf( "\n" );
    }
}
#endif
/*
 * Report whether a dataloop is a final contig whose elements are densely
 * packed.  Returns 1 only when all of the following hold, 0 otherwise:
 *   - the kind is DLOOP_KIND_CONTIG,
 *   - el_size equals el_extent,
 *   - the DLOOP_FINAL_MASK bit is set in kind.
 * Dataloop_optimize uses this to decide when a contig child can be folded
 * into its parent.
 */
static int dl_contig_isFinal( DLOOP_Dataloop *dp )
{
    int isContig = ((dp->kind & DLOOP_KIND_MASK) == DLOOP_KIND_CONTIG);
    int isDense  = (dp->el_size == dp->el_extent);
    int isFinal  = ((dp->kind & DLOOP_FINAL_MASK) != 0);

    return (isContig && isDense && isFinal) ? 1 : 0;
}
/*
 * Optimize a dataloop
 *
 * The dataloop is rewritten in place; no new dataloop is allocated.
 *
 * Planned transformations:
 * 1. Convert all predefined types to UINTS with the best alignment (may be BYTE
 *    in worst case)
 * 2. Convert blocks of contiguous into a single block of basic unit (e.g.,
 *    a vector type with a block count of 27 applied to a contiguous type of
 *    6 ints will be turned into a block count of (27*6) UINTs)
 * 3. Convert struct (with different dataloops (from different MPI datatypes)
 *    into indexed when all types are contig
 * 4. Convert dataloops with counts of 1 into simpler types (e.g., a vector
 *    with 1 element is really a contig type)
 *
 * What this routine currently does:
 * - contig, vector and blockindexed loops whose child is a final, densely
 *   packed contig absorb the child (the child count is multiplied into the
 *   parent's count or blocksize and the parent becomes final);
 * - a final vector whose stride equals the byte size of one block is
 *   turned into a final contig;
 * - indexed and struct loops only have their children optimized.
 *
 * Value of these optimizations
 * A 2012 paper compared performance of Open MPI, MPICH2, and user-written code
 * for some datatypes, and found MPICH2 often performed poorer than other
 * options.  An investigation showed that some of the issues are due to
 * a failure to perform optimizations of these types (especially #1 and 2).
 * It may also be necessary to enhance the dataloop execution engine, but
 * that will be a separate step.
 *
 * Input/output Parameters:
 * . dlpOld_p - dataloop to optimize; modified in place
 *
 * Returns 0 in all cases.
 *
 * When MPICH_DEBUG_DATALOOP is defined and the environment variable
 * MPICH_DATALOOP_PRINT is set, the dataloops visited and their replacements
 * are printed to stdout.
 */
int PREPEND_PREFIX(Dataloop_optimize)(DLOOP_Dataloop *dlpOld_p )
{
    DLOOP_Count i;
#ifdef MPICH_DEBUG_DATALOOP
    /* Temp for debugging */
    static int firstCall = 1;
    /* This is threadsafe in the sense that we don't care */
    if (firstCall) {
        if (getenv("MPICH_DATALOOP_PRINT")) {
            printDataloop = 1;
            printIfOptimized = 1;
        }
        firstCall = 0;
    }
#endif
    switch (dlpOld_p->kind & DLOOP_KIND_MASK) {
    case DLOOP_KIND_CONTIG:
#ifdef MPICH_DEBUG_DATALOOP
        if (printDataloop)
            dl_print_contig( level, dlpOld_p );
#endif
        /* replace contig of (non-basic) contig with contig (basic) */
        if (!(dlpOld_p->kind & DLOOP_FINAL_MASK)) {
            DLOOP_Dataloop *dlpChild_p = dlpOld_p->loop_params.c_t.dataloop;
            level++;
            PREPEND_PREFIX(Dataloop_optimize)( dlpChild_p );
            level--;
            if (dl_contig_isFinal( dlpChild_p ) ) {
                /* Fold the child's count into ours and take over its
                   element description; the child is no longer referenced */
                dlpOld_p->loop_params.c_t.count *= dlpChild_p->loop_params.c_t.count;
                dlpOld_p->el_size                = dlpChild_p->el_size;
                dlpOld_p->el_extent              = dlpChild_p->el_extent;
                dlpOld_p->kind                  |= DLOOP_FINAL_MASK;
                dlpOld_p->loop_params.c_t.dataloop = 0;
#ifdef MPICH_DEBUG_DATALOOP
                if (printIfOptimized || printDataloop) {
                    printf( "replacement contig is:\n" );
                    dl_print_contig( level, dlpOld_p );
                }
#endif
            }
        }
        break;

    case DLOOP_KIND_VECTOR:
        /* if sub-dloop is (non-basic) contig, merge with blockcount */
#ifdef MPICH_DEBUG_DATALOOP
        if (printDataloop)
            dl_print_vector( level, dlpOld_p );
#endif
        if (!(dlpOld_p->kind & DLOOP_FINAL_MASK)) {
            DLOOP_Dataloop *dlpChild_p = dlpOld_p->loop_params.v_t.dataloop;
            level++;
            PREPEND_PREFIX(Dataloop_optimize)( dlpChild_p );
            level--;
            if (dl_contig_isFinal( dlpChild_p ) ) {
                /* We can replace the contig type by enlarging the blocksize */
                /* Mark this loop final and drop the reference to the child */
                dlpOld_p->loop_params.v_t.blocksize *= dlpChild_p->loop_params.count;
                dlpOld_p->el_size                    = dlpChild_p->el_size;
                dlpOld_p->el_extent                  = dlpChild_p->el_extent;
                dlpOld_p->kind                      |= DLOOP_FINAL_MASK;
                dlpOld_p->loop_params.v_t.dataloop   = 0;
#ifdef MPICH_DEBUG_DATALOOP
                if (printIfOptimized || printDataloop) {
                    printf( "replacement Vector is:\n" );
                    dl_print_vector( level, dlpOld_p );
                }
#endif
            }
        }
        /* replace vector that is contiguous (stride equals the size of one
           block) with contig */
        if ((dlpOld_p->kind & DLOOP_FINAL_MASK)) {
            /* Keep these in DLOOP_Count: narrowing to int would truncate
               large values and could overflow the product below */
            DLOOP_Count blocksize = dlpOld_p->loop_params.v_t.blocksize;
            DLOOP_Count count     = dlpOld_p->loop_params.v_t.count;
            if (dlpOld_p->el_size * blocksize ==
                dlpOld_p->loop_params.v_t.stride ) {
                dlpOld_p->kind = DLOOP_KIND_CONTIG | DLOOP_FINAL_MASK;
                dlpOld_p->loop_params.c_t.dataloop = 0;
                dlpOld_p->loop_params.c_t.count    = count * blocksize;
#ifdef MPICH_DEBUG_DATALOOP
                if (printIfOptimized || printDataloop) {
                    printf( "replacement Contig is:\n" );
                    dl_print_contig( level, dlpOld_p );
                }
#endif
            }
        }
        /* TODO: replace vector of a single element with contig */
        break;

    case DLOOP_KIND_BLOCKINDEXED:
        /* if subdloop is (non-basic) contig, merge with blockcount */
#ifdef MPICH_DEBUG_DATALOOP
        if (printDataloop)
            dl_print_blockindexed( level, dlpOld_p );
#endif
        if (!(dlpOld_p->kind & DLOOP_FINAL_MASK)) {
            DLOOP_Dataloop *dlpChild_p = dlpOld_p->loop_params.bi_t.dataloop;
            level++;
            PREPEND_PREFIX(Dataloop_optimize)( dlpChild_p );
            level--;
            if (dl_contig_isFinal( dlpChild_p ) ) {
                /* We can replace the contig type by enlarging the blocksize */
                /* Mark this loop final and drop the reference to the child */
                dlpOld_p->loop_params.bi_t.blocksize *= dlpChild_p->loop_params.count;
                dlpOld_p->el_size                     = dlpChild_p->el_size;
                dlpOld_p->el_extent                   = dlpChild_p->el_extent;
                dlpOld_p->kind                       |= DLOOP_FINAL_MASK;
                dlpOld_p->loop_params.bi_t.dataloop   = 0;
#ifdef MPICH_DEBUG_DATALOOP
                if (printIfOptimized || printDataloop) {
                    printf( "replacement BlockIndexed is:\n" );
                    dl_print_blockindexed( level, dlpOld_p );
                }
#endif
            }
        }
        /* TODO: replace blockindexed of a single element with contig */
        break;

    case DLOOP_KIND_INDEXED:
        /* if sub-dloop is (non-basic) contig, merge with blockcount */
#ifdef MPICH_DEBUG_DATALOOP
        if (printDataloop)
            dl_print_indexed( level, dlpOld_p );
#endif
        if (!(dlpOld_p->kind & DLOOP_FINAL_MASK)) {
            DLOOP_Dataloop *dlpChild_p = dlpOld_p->loop_params.i_t.dataloop;
            level++;
            PREPEND_PREFIX(Dataloop_optimize)( dlpChild_p );
            level--;
            if (dl_contig_isFinal( dlpChild_p ) ) {
                /* Could include the child type in the blocksize counts */
            }
        }
        /* TODO: replace indexed of constant block count with blockindexed */
        /* TODO: replace indexed of a single element with contig */
        /* If all block counts are multiples of the smallest, and if most
           blocks are smallest, then the other blocks could be split into
           separate blocks with appropriate offsets, replacing indexed with
           blockindexed */
        break;

    case DLOOP_KIND_STRUCT:
        /* TODO: if sub-dloops are all contig, replace with indexed.
           For now only the children are optimized. */
#ifdef MPICH_DEBUG_DATALOOP
        if (printDataloop)
            dl_print_struct( level, dlpOld_p );
#endif
        if (!(dlpOld_p->kind & DLOOP_FINAL_MASK)) {
            for (i=0; i<dlpOld_p->loop_params.s_t.count; i++) {
                level ++;
                PREPEND_PREFIX(Dataloop_optimize)(
                    dlpOld_p->loop_params.s_t.dataloop_array[i] );
                level --;
            }
        }
        break;

    default:
#ifdef MPICH_DEBUG_DATALOOP
        if (printDataloop)
            dl_print( level, "Unknown type!" );
#endif
        break;
    }
    return 0;
}
/*
* Make an estimate at the complexity of a datatype. This can be used
* to determine whether flattening the datatype to an indexed type is
* likely to be efficient.
*/
int PREPEND_PREFIX(Dataloop_est_complexity)(DLOOP_Dataloop *dlp_p,
MPI_Aint *nElms,
MPI_Aint *nDesc )
{
int i;
MPI_Aint myElms = 0;
MPI_Aint myDesc = 0;
MPI_Aint childElms = 0, childDesc = 0;
DLOOP_Dataloop *dlpChild_p;
switch (dlp_p->kind & DLOOP_KIND_MASK) {
case DLOOP_KIND_CONTIG:
/* Data moved is count*size of the child type */
if (!(dlp_p->kind & DLOOP_FINAL_MASK)) {
dlpChild_p = dlp_p->loop_params.c_t.dataloop;
PREPEND_PREFIX(Dataloop_est_complexity)( dlpChild_p, &childElms,
&childDesc );
}
else {
childElms = dlp_p->el_size;
childDesc = 0;
}
myElms += dlp_p->loop_params.c_t.count * childElms;
myDesc += childDesc + 1;
break;
case DLOOP_KIND_VECTOR:
/* Data moved is count*size of the child type */
if (!(dlp_p->kind & DLOOP_FINAL_MASK)) {
dlpChild_p = dlp_p->loop_params.v_t.dataloop;
PREPEND_PREFIX(Dataloop_est_complexity)( dlpChild_p, &childElms,
&childDesc );
}
else {
childElms = dlp_p->el_size;
childDesc = 0;
}
myElms += dlp_p->loop_params.v_t.count *
dlp_p->loop_params.v_t.blocksize * childElms;
myDesc += childDesc + 2;
break;
case DLOOP_KIND_BLOCKINDEXED:
if (!(dlp_p->kind & DLOOP_FINAL_MASK)) {
dlpChild_p = dlp_p->loop_params.bi_t.dataloop;
PREPEND_PREFIX(Dataloop_est_complexity)( dlpChild_p, &childElms,
&childDesc );
}
else {