Commit 14732967 authored by Paul Coffman's avatar Paul Coffman Committed by Rob Latham
Browse files

ROMIO Collective IO One-sided aggregation algorithm base code

Optimized collective IO algorithm for GPFS to replace the existing
two-phase algorithm with one utilizing one-sided MPI_Put and MPI_Get.
Significant performance and memory optimization possible for certain
workloads.  Guarded by GPFSMPIO_AGGMETHOD environment variable, see
ad_gpfs_tuning.c for details.
Signed-off-by: Rob Latham <>
parent 0e4dcc43
......@@ -37,6 +37,7 @@ long bglocklessmpio_f_type;
int gpfsmpio_bg_nagg_pset;
int gpfsmpio_pthreadio;
int gpfsmpio_p2pcontig;
int gpfsmpio_aggmethod;
int gpfsmpio_balancecontig;
int gpfsmpio_devnullio;
int gpfsmpio_bridgeringagg;
......@@ -105,6 +106,22 @@ double gpfsmpio_prof_cr [GPFSMPIO_CIO_LAST+1];
* 3.) There are no gaps between the offsets.
* 4.) No single rank has a data size which spans multiple file domains.
* - GPFSMPIO_AGGMETHOD - Replaces the two-phase collective IO aggregation with a one-
* sided algorithm, significantly reducing communication and memory overhead. Fully
* supports all datasets and datatypes; the only caveat is that any holes in the data
* when writing to a pre-existing file are ignored -- there is no read-modify-write
* support to maintain the correctness of regions of pre-existing data so every byte
* must be explicitly written to maintain correctness. Users must beware of middleware
* libraries like PNETCDF which may count on read-modify-write functionality for certain
* features (like fill values). Possible values:
* - 0 - Normal two-phase collective IO is used.
* - 1 - A separate one-sided MPI_Put or MPI_Get is used for each contiguous chunk of data
* for a compute to write to or read from the collective buffer on the aggregator.
* - 2 - An MPI derived datatype is created using all the contiguous chunks and just one
* call to MPI_Put or MPI_Get is done with the derived datatype. On Blue Gene/Q
* optimal performance for this is achieved when paired with PAMID_TYPED_ONESIDED=1.
* - Default is 0
* - GPFSMPIO_BALANCECONTIG - Relevant only to BGQ. File domain blocks are assigned
* to aggregators in a breadth-first fashion relative to the ions - additionally,
* file domains on the aggregators sharing the same bridgeset and ion have contiguous
......@@ -165,6 +182,10 @@ void ad_gpfs_get_env_vars() {
x = getenv( "GPFSMPIO_P2PCONTIG" );
if (x) gpfsmpio_p2pcontig = atoi(x);
gpfsmpio_aggmethod = 0;
x = getenv( "GPFSMPIO_AGGMETHOD" );
if (x) gpfsmpio_aggmethod = atoi(x);
gpfsmpio_balancecontig = 0;
if (x) gpfsmpio_balancecontig = atoi(x);
......@@ -66,6 +66,7 @@ extern int gpfsmpio_tuneblocking;
extern long bglocklessmpio_f_type;
extern int gpfsmpio_pthreadio;
extern int gpfsmpio_p2pcontig;
extern int gpfsmpio_aggmethod;
extern int gpfsmpio_balancecontig;
extern int gpfsmpio_devnullio;
extern int gpfsmpio_bridgeringagg;
......@@ -71,5 +71,6 @@ romio_other_sources += \
adio/common/hint_fns.c \
adio/common/ad_threaded_io.c \
adio/common/p2p_aggregation.c \
adio/common/onesided_aggregation.c \
......@@ -10,7 +10,7 @@
#include "adio_cb_config_list.h"
#include "mpio.h"
extern int gpfsmpio_aggmethod;
static int is_aggregator(int rank, ADIO_File fd);
static int uses_generic_read(ADIO_File fd);
static int uses_generic_write(ADIO_File fd);
......@@ -236,6 +236,7 @@ typedef struct ADIOI_FileD {
MPI_Datatype *file_realm_types; /* file realm datatypes */
int my_cb_nodes_index; /* my index into cb_config_list. -1 if N/A */
char *io_buf; /* two-phase buffer allocated out of i/o path */
MPI_Win io_buf_window; /* Window over the io_buf to support one-sided aggregation */
/* External32 */
int is_external32; /* bool: 0 means native view */
......@@ -686,6 +686,28 @@ void ADIOI_P2PContigReadAggregation(ADIO_File fd,
ADIO_Offset *fd_start,
ADIO_Offset *fd_end);
/*
 * ADIOI_OneSidedWriteAggregation - write-side entry point of the one-sided
 * collective I/O aggregation.
 *
 * Only the prototype is visible here; the parameter notes below are read off
 * the parameter names and types and should be confirmed against the
 * definition.
 *
 *   fd                  - ADIO file handle.
 *   offset_list,
 *   len_list            - parallel arrays; presumably the file offset and
 *                         length of each contiguous piece of this rank's
 *                         access (TODO confirm).
 *   contig_access_count - presumably the number of entries in offset_list
 *                         and len_list (TODO confirm).
 *   buf                 - caller's data buffer; const-qualified, so it is not
 *                         modified through this pointer.
 *   datatype            - MPI datatype associated with buf.
 *   error_code          - output parameter for the result status.
 *   st_offsets,
 *   end_offsets         - presumably start/end file offsets of the
 *                         participating ranks (TODO confirm).
 *   fd_start, fd_end    - presumably start/end offsets of the file domains
 *                         (TODO confirm).
 */
void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
ADIO_Offset *offset_list,
ADIO_Offset *len_list,
int contig_access_count,
const void *buf,
MPI_Datatype datatype,
int *error_code,
ADIO_Offset *st_offsets,
ADIO_Offset *end_offsets,
ADIO_Offset *fd_start,
ADIO_Offset* fd_end);
/*
 * ADIOI_OneSidedReadAggregation - read-side entry point of the one-sided
 * collective I/O aggregation.
 *
 * Only the prototype is visible here; the parameter notes below are read off
 * the parameter names and types and should be confirmed against the
 * definition.
 *
 *   fd                  - ADIO file handle.
 *   offset_list,
 *   len_list            - parallel arrays; presumably the file offset and
 *                         length of each contiguous piece of this rank's
 *                         access (TODO confirm).
 *   contig_access_count - presumably the number of entries in offset_list
 *                         and len_list (TODO confirm).
 *   buf                 - caller's data buffer.
 *                         NOTE(review): declared const although a read would
 *                         be expected to fill it -- confirm against the
 *                         definition before relying on the qualifier.
 *   datatype            - MPI datatype associated with buf.
 *   error_code          - output parameter for the result status.
 *   st_offsets,
 *   end_offsets         - presumably start/end file offsets of the
 *                         participating ranks (TODO confirm).
 *   fd_start, fd_end    - presumably start/end offsets of the file domains
 *                         (TODO confirm).
 */
void ADIOI_OneSidedReadAggregation(ADIO_File fd,
ADIO_Offset *offset_list,
ADIO_Offset *len_list,
int contig_access_count,
const void *buf,
MPI_Datatype datatype,
int *error_code,
ADIO_Offset *st_offsets,
ADIO_Offset *end_offsets,
ADIO_Offset *fd_start,
ADIO_Offset* fd_end);
ADIO_Offset ADIOI_GEN_SeekIndividual(ADIO_File fd, ADIO_Offset offset,
int whence, int *error_code);
void ADIOI_GEN_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment