Commit 5f4007b6 authored by William Gropp's avatar William Gropp
Browse files

[svn-r9963] Added code to complete RMA operations when large numbers of...

[svn-r9963] Added code to complete RMA operations when large numbers of pending requests are created; allows codes such as Graph500 benchmark to complete
parent d9e43ea3
......@@ -15,13 +15,29 @@ MPIU_INSTR_DURATION_DECL(wincreate_allgather);
MPIU_INSTR_DURATION_DECL(winfree_rs);
MPIU_INSTR_DURATION_DECL(winfree_complete);
MPIU_INSTR_DURATION_DECL(rmaqueue_alloc);
MPIU_INSTR_DURATION_DECL(rmaqueue_set);
extern void MPIDI_CH3_RMA_InitInstr(void);
#endif
extern void MPIDI_CH3_RMA_SetAccImmed( int );
#define MPIDI_PASSIVE_TARGET_DONE_TAG 348297
#define MPIDI_PASSIVE_TARGET_RMA_TAG 563924
/*
* TODO:
* Explore use of alternate allocation mechanisms for the RMA queue elements
* (Because profiling has shown that queue element allocation/deallocation
* can take a significant amount of time in the RMA operations).
* 1: Current approach (uses perm memory malloc/free)
* 2: Preallocate and maintain list (use perm memory malloc, but
* free onto window; use first; free on window free)
* 3: Preallocate and maintain list (use separate memory, but free to
* thread/process; free in Finalize handler. Option to use for
* single-threaded to avoid thread overheads)
* Possible interface
* int MPIDI_RMAListAlloc(MPIDI_RMA_ops **a,MPID_Win *win)
* int MPIDI_RMAListFree(MPIDI_RMA_ops *a, MPID_Win *win)
* return value is error code (e.g., allocation failure).
*/
#undef FUNCNAME
#define FUNCNAME MPIDI_Win_create
......@@ -46,16 +62,13 @@ int MPIDI_Win_create(void *base, MPI_Aint size, int disp_unit, MPID_Info *info,
if(initRMAoptions) {
int rc;
MPIU_THREADSAFE_INIT_BLOCK_BEGIN(initRMAoptions);
/* Default is to enable the use of the immediate accumulate feature */
if (!MPL_env2bool( "MPICH_RMA_ACC_IMMED", &rc ))
rc = 1;
MPIDI_CH3_RMA_SetAccImmed(rc);
#ifdef USE_MPIU_INSTR
/* Define all instrumentation handle used in the CH3 RMA here*/
/* Define all instrumentation handles used in the CH3 RMA here*/
MPIU_INSTR_DURATION_INIT(wincreate_allgather,0,"WIN_CREATE:Allgather");
MPIU_INSTR_DURATION_INIT(winfree_rs,0,"WIN_FREE:ReduceScatterBlock");
MPIU_INSTR_DURATION_INIT(winfree_complete,0,"WIN_FREE:Complete");
MPIU_INSTR_DURATION_INIT(rmaqueue_alloc,0,"Allocate RMA Queue element");
MPIU_INSTR_DURATION_INIT(rmaqueue_set,0,"Set fields in RMA Queue element");
MPIDI_CH3_RMA_InitInstr();
#endif
......@@ -273,6 +286,7 @@ int MPIDI_Put(void *origin_addr, int origin_count, MPI_Datatype
win_ptr->rma_ops_list_head = new_ptr;
win_ptr->rma_ops_list_tail = new_ptr;
MPIU_INSTR_DURATION_START(rmaqueue_set);
/* FIXME: For contig and very short operations, use a streamlined op */
new_ptr->next = NULL;
new_ptr->type = MPIDI_RMA_PUT;
......@@ -283,7 +297,8 @@ int MPIDI_Put(void *origin_addr, int origin_count, MPI_Datatype
new_ptr->target_disp = target_disp;
new_ptr->target_count = target_count;
new_ptr->target_datatype = target_datatype;
MPIU_INSTR_DURATION_END(rmaqueue_set);
/* if source or target datatypes are derived, increment their
reference counts */
MPIDI_CH3I_DATATYPE_IS_PREDEFINED(origin_datatype, predefined);
......@@ -364,6 +379,7 @@ int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
win_ptr->rma_ops_list_head = new_ptr;
win_ptr->rma_ops_list_tail = new_ptr;
MPIU_INSTR_DURATION_START(rmaqueue_set);
/* FIXME: For contig and very short operations, use a streamlined op */
new_ptr->next = NULL;
new_ptr->type = MPIDI_RMA_GET;
......@@ -374,6 +390,7 @@ int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
new_ptr->target_disp = target_disp;
new_ptr->target_count = target_count;
new_ptr->target_datatype = target_datatype;
MPIU_INSTR_DURATION_END(rmaqueue_set);
/* if source or target datatypes are derived, increment their
reference counts */
......@@ -553,6 +570,7 @@ int MPIDI_Accumulate(void *origin_addr, int origin_count, MPI_Datatype
/* If predefined and contiguous, use a simplified element */
if (origin_predefined && target_predefined && enableShortACC) {
MPIU_INSTR_DURATION_START(rmaqueue_set);
new_ptr->next = NULL;
new_ptr->type = MPIDI_RMA_ACC_CONTIG;
/* Only the information needed for the contig/predefined acc */
......@@ -564,9 +582,11 @@ int MPIDI_Accumulate(void *origin_addr, int origin_count, MPI_Datatype
new_ptr->target_count = target_count;
new_ptr->target_datatype = target_datatype;
new_ptr->op = op;
MPIU_INSTR_DURATION_END(rmaqueue_set);
goto fn_exit;
}
MPIU_INSTR_DURATION_START(rmaqueue_set);
new_ptr->next = NULL;
new_ptr->type = MPIDI_RMA_ACCUMULATE;
new_ptr->origin_addr = origin_addr;
......@@ -577,6 +597,7 @@ int MPIDI_Accumulate(void *origin_addr, int origin_count, MPI_Datatype
new_ptr->target_count = target_count;
new_ptr->target_datatype = target_datatype;
new_ptr->op = op;
MPIU_INSTR_DURATION_END(rmaqueue_set);
/* if source or target datatypes are derived, increment their
reference counts */
......
This diff is collapsed.
......@@ -20,6 +20,8 @@ categories:
description : parameters that control communicator construction and operation
- name : pt2pt
description : parameters that control point-to-point communication behavior
- name : rma
description : parameters that control RMA communicatin behavior
- name : intranode
description : intranode communication parameters
- name : developer
......@@ -187,6 +189,34 @@ parameters:
default : true
description : >-
Use qsort(3) in the implementation of MPI_Comm_split instead of bubble sort.
##############################################################
# RMA parameters
- category : rma
name : RMA_ACC_IMMED
type : boolean
default : true
description : >-
Use the immediate accumulate optimization
- category : rma
name : RMA_NREQUEST_THRESHOLD
type : int
default : 4000
description : >-
Threshold at which the RMA implementation attempts to complete requests
while completing RMA operations and while using the lazy synchonization
approach. Change this value if programs fail because they run out of
requests or other internal resources
- category : rma
name : RMA_NREQUEST_NEW_THRESHOLD
type : int
default : 128
description : >-
Threshold for the number of new requests since the last attempt to
complete pending requests. Higher values can increase performance,
but may run the risk of exceeding the available number of requests
or other internal resources.
##############################################################
# intranode communication parameters
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment