Commit 142b9440 authored by Rob Latham's avatar Rob Latham
Browse files

Revert "Address many of the perf problems in #1788"

This reverts commit 1c5c5945

.

Reopens #1788  (datatype performance tests failing)

but better to have poor performance than incorrect performance

Closes #2115 (RMA fails with derived type containing struct of struct)
Closes #2126 (Data Integrity issue in MPI_Gather ...)

Conflicts:
	src/mpid/common/datatype/dataloop/dataloop_optimize.c

but only because a subsequent commit removed bits of this optimization.
This commit fully removes this optimization, but we leave behind test
cases to help us make sure we get it right next time.  We also leave
behind some additional debugging support routines.
Signed-off-by: default avatarJunchao Zhang <jczhang@mcs.anl.gov>
parent 7669873d
......@@ -21,8 +21,7 @@ mpi_core_sources += \
src/mpid/common/datatype/dataloop/segment_count.c \
src/mpid/common/datatype/dataloop/segment_flatten.c \
src/mpid/common/datatype/dataloop/segment_packunpack.c \
src/mpid/common/datatype/dataloop/subarray_support.c \
src/mpid/common/datatype/dataloop/dataloop_optimize.c
src/mpid/common/datatype/dataloop/subarray_support.c
# several headers are included by the rest of MPICH
AM_CPPFLAGS += -I$(top_srcdir)/src/mpid/common/datatype
......
......@@ -88,22 +88,11 @@ DLOOP_Count PREPEND_PREFIX(Type_indexed_count_contig)(DLOOP_Count count,
const void *displacement_array,
int dispinbytes,
DLOOP_Offset old_extent);
DLOOP_Count PREPEND_PREFIX(Type_blockindexed_count_contig)(DLOOP_Count count,
DLOOP_Count blklen,
const void *disp_array,
int dispinbytes,
DLOOP_Offset old_extent);
int PREPEND_PREFIX(Dataloop_optimize)( DLOOP_Dataloop *dlpOld_p, int level );
int PREPEND_PREFIX(Dataloop_est_complexity)(DLOOP_Dataloop *,
MPI_Aint *, MPI_Aint *);
int PREPEND_PREFIX(Dataloop_est_struct_complexity)( int,
const int [],
const DLOOP_Type [],
MPI_Aint *,
MPI_Aint * );
void PREPEND_PREFIX(Dataloop_debug_print)( DLOOP_Dataloop *dp );
#endif
......@@ -11,57 +11,6 @@
#error "You must explicitly include a header that sets the PREPEND_PREFIX and includes dataloop_parts.h"
#endif
/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===
categories :
- name : DATATYPE
description : Datatype optimization parameters
cvars:
- name : MPIR_CVAR_DATALOOP_OPTIMIZE
category : DATATYPE
type : boolean
default : true
class : none
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_LOCAL
description : >-
By default, the internal representation of an MPI datatype that
is used by MPICH to move data is very similar to the original
description of the datatype. If this flag is true, additional
optimizations are used to improve the performance of datatypes.
- name : MPIR_CVAR_DATALOOP_FLATTEN
category : DATATYPE
type : boolean
class : none
default : true
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_LOCAL
description : >-
If true, attempt to "flatten" the internal representation of
MPI struct datatypes (created with MPI_Type_create_struct).
- name : MPIR_CVAR_DATALOOP_FLATTEN_MULT
category : DATATYPE
type : int
class : none
default : 2
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_LOCAL
description : >-
Flattening an MPI struct datatype does not always improve
performance. This parameter is a threshold that is used in
comparing the size of the description with the amount of data
moved. Larger values make it more likely that a struct datatype
will be flattened. The default value is adequate for flattening
simple structs, and will usually avoid flattening structs
containing vectors or block-indexed data.
=== END_MPI_T_CVAR_INFO_BLOCK ===
*/
static int DLOOP_Dataloop_create_struct_memory_error(void);
static int DLOOP_Dataloop_create_unique_type_struct(DLOOP_Count count,
const int *blklens,
......@@ -289,37 +238,19 @@ int PREPEND_PREFIX(Dataloop_create_struct)(DLOOP_Count count,
* if caller asked for homogeneous or all bytes representation,
* flatten the type and store it as an indexed type so that
* there are no branches in the dataloop tree.
*
* Note that this is not always an optimization - for example,
* replacing two long block_indexed with one longer indexed (with
* the additional blockcount array) is likely to be slower, because
* of the additional memory motion required.
*/
if (MPIR_CVAR_DATALOOP_FLATTEN && (
(flag == DLOOP_DATALOOP_HOMOGENEOUS) ||
(flag == DLOOP_DATALOOP_ALL_BYTES) ))
{
MPI_Aint nElms = 0, nDesc = 0;
PREPEND_PREFIX(Dataloop_est_struct_complexity)( count,
blklens,
oldtypes,
&nElms,
&nDesc );
/* Only convert to flattened if the flattened description
is likely to be more efficient. The magic number of 24 was
determined emperically. */
if ( nDesc * 24 * MPIR_CVAR_DATALOOP_FLATTEN_MULT > nElms) {
return DLOOP_Dataloop_create_flattened_struct(count,
blklens,
disps,
oldtypes,
dlp_p,
dlsz_p,
dldepth_p,
flag);
}
}
if ((flag == DLOOP_DATALOOP_HOMOGENEOUS) ||
(flag == DLOOP_DATALOOP_ALL_BYTES))
{
return DLOOP_Dataloop_create_flattened_struct(count,
blklens,
disps,
oldtypes,
dlp_p,
dlsz_p,
dldepth_p,
flag);
}
/* scan through types and gather derived type info */
for (i=0; i < count; i++)
......
This diff is collapsed.
......@@ -13,8 +13,6 @@
#include "dataloop.h"
#include "veccpy.h"
/* NOTE: bufp values are unused, ripe for removal */
/* #define MPICH_DEBUG_SEGMENT_MOVE */
/* TODO: Consider integrating this with the general debug support. */
/* Note: This does not use the CVAR support for the environment variable
......@@ -37,6 +35,8 @@ static void setPrint( void ) {
#define DBG_SEGMENT(_a)
#endif
/* NOTE: bufp values are unused, ripe for removal */
int PREPEND_PREFIX(Segment_contig_m2m)(DLOOP_Offset *blocks_p,
DLOOP_Type el_type,
DLOOP_Offset rel_off,
......@@ -323,22 +323,6 @@ int PREPEND_PREFIX(Segment_blkidx_m2m)(DLOOP_Offset *blocks_p,
DLOOP_Handle_get_size_macro(el_type, el_size);
DBG_SEGMENT(printf( "blkidx m2m: elsize = %d, count = %d, blocklen = %d\n", (int)el_size, (int)count, (int)blocklen ));
/* If the blocklen * el_size is relatively small, then for
performance reasons, its important to hoist most of these
tests out of the loop. Ignoring some of the issues of handling
the available buffer size (blocks_left), this should translate
directly into code that looks like this for blocksize == 1
for (i=0; i<count; i++) {
dest[i] = userbuf[offsetarray[i]];
}
where "dest" and "userbuf" are pointers to objects of the correct
size. If blocksize is > 1, then various unrollings are important
until blocksize is large enough to make the overhead of memcpy
negligible. Datatypes such as this are used in LAMMPS, for example.
*/
while (blocks_left) {
char *src, *dest;
......
......@@ -58,17 +58,6 @@ int MPID_Type_commit(MPI_Datatype *datatype_p)
MPIU_DBG_PRINTF(("# contig blocks = %d\n",
(int) datatype_ptr->max_contig_blocks));
if (MPIR_CVAR_DATALOOP_OPTIMIZE) {
MPID_Dataloop_optimize(datatype_ptr->dataloop, 0 );
}
else {
/* This allows the developer to output the final dataloops
in the case where the dataloops are not optimized.
It does nothing if that printing is not enabled.
*/
MPID_Dataloop_debug_print( datatype_ptr->dataloop );
}
#if 0
MPIDI_Dataloop_dot_printf(datatype_ptr->dataloop, 0, 1);
#endif
......
......@@ -59,4 +59,4 @@ cxx-types 1 mpiversion=3.0
@largetest@large_type 1 mpiversion=3.0
@largetest@large_type_sendrec 2 arg=31 mpiversion=3.0
@largetest@large_type_sendrec 2 arg=32 mpiversion=3.0 timeLimit=360
get-struct 2 xfail=ticket2115
get-struct 2
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment