Commit 81171c84 authored by David Goodell
Browse files

[svn-r7356] convert collective thresholds to the parameter interface (tt#580)

This should make it much easier to tune the collective algorithm
selection on a given platform.  It also provides some knobs to expose
to apps/tools via the upcoming MPIT interface.

Reviewed by buntinas@.
parent efd643a8
......@@ -3140,32 +3140,7 @@ int MPID_VCR_Get_lpid(MPID_VCR vcr, int * lpid_ptr);
file (mpiimpl.h). */
#include "mpidpost.h"
/* ------------------------------------------------------------------------- */
/* FIXME: Also for mpicoll.h, in src/mpi/coll? */
/* ------------------------------------------------------------------------- */
/* thresholds to switch between long and short vector algorithms for
collective operations */
/* FIXME: Should there be a way to (a) update/compute these at configure time
and (b) provide runtime control? Should these be MPIR_xxx_DEFAULT
instead? */
#define MPIR_BCAST_SHORT_MSG 12288
#define MPIR_BCAST_LONG_MSG 524288
#define MPIR_BCAST_MIN_PROCS 8
#define MPIR_REDSCAT_COMMUTATIVE_LONG_MSG 524288
#define MPIR_REDSCAT_NONCOMMUTATIVE_SHORT_MSG 512
#define MPIR_ALLGATHER_SHORT_MSG 81920
#define MPIR_ALLGATHER_LONG_MSG 524288
#define MPIR_REDUCE_SHORT_MSG 2048
#define MPIR_ALLREDUCE_SHORT_MSG 2048
#define MPIR_GATHER_VSMALL_MSG 1024
#define MPIR_SCATTER_SHORT_MSG 2048 /* for intercommunicator scatter */
#define MPIR_GATHER_SHORT_MSG 2048 /* for intercommunicator scatter */
#define MPIR_GATHERV_MIN_PROCS 32
/* For pipelined collectives */
#define MPIR_ALLGATHERV_PIPELINE_MSGSIZE 32768
/* TODO convert all cut-over constants above to parameters */
/* tunable parameter values */
#include "mpich_param_vals.h"
/* Tags for point to point operations which implement collective and other
......
......@@ -119,7 +119,7 @@ int MPIR_Allgather_intra (
MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );
tot_bytes = (MPI_Aint)recvcount * comm_size * type_size;
if ((tot_bytes < MPIR_ALLGATHER_LONG_MSG) && !(comm_size & (comm_size - 1))) {
if ((tot_bytes < MPIR_PARAM_ALLGATHER_LONG_MSG_SIZE) && !(comm_size & (comm_size - 1))) {
/* Short or medium size message and power-of-two no. of processes. Use
* recursive doubling algorithm */
......@@ -420,7 +420,7 @@ int MPIR_Allgather_intra (
#endif /* MPID_HAS_HETERO */
}
else if (tot_bytes < MPIR_ALLGATHER_SHORT_MSG) {
else if (tot_bytes < MPIR_PARAM_ALLGATHER_SHORT_MSG_SIZE) {
/* Short message and non-power-of-two no. of processes. Use
* Bruck algorithm (see description above). */
......
......@@ -110,7 +110,7 @@ int MPIR_Allgatherv_intra (
MPID_Datatype_get_extent_macro( recvtype, recvtype_extent );
MPID_Datatype_get_size_macro(recvtype, recvtype_size);
if ((total_count*recvtype_size < MPIR_ALLGATHER_LONG_MSG) &&
if ((total_count*recvtype_size < MPIR_PARAM_ALLGATHER_LONG_MSG_SIZE) &&
!(comm_size & (comm_size - 1))) {
/* Short or medium size message and power-of-two no. of processes. Use
* recursive doubling algorithm */
......@@ -477,7 +477,7 @@ int MPIR_Allgatherv_intra (
}
else if (total_count*recvtype_size < MPIR_ALLGATHER_SHORT_MSG) {
else if (total_count*recvtype_size < MPIR_PARAM_ALLGATHER_SHORT_MSG_SIZE) {
/* Short message and non-power-of-two no. of processes. Use
* Bruck algorithm (see description above). */
......@@ -601,8 +601,8 @@ int MPIR_Allgatherv_intra (
for (i = 1; i < comm_size; i++)
if (min > recvcounts[i])
min = recvcounts[i];
if (min * recvtype_extent < MPIR_ALLGATHERV_PIPELINE_MSGSIZE)
min = MPIR_ALLGATHERV_PIPELINE_MSGSIZE / recvtype_extent;
if (min * recvtype_extent < MPIR_PARAM_ALLGATHERV_PIPELINE_MSG_SIZE)
min = MPIR_PARAM_ALLGATHERV_PIPELINE_MSG_SIZE / recvtype_extent;
/* Handle the case where the datatype extent is larger than
* the pipeline size. */
if (!min)
......
......@@ -342,7 +342,7 @@ int MPIR_Allreduce_intra (
using recursive doubling in that case.) */
if (newrank != -1) {
if ((count*type_size <= MPIR_ALLREDUCE_SHORT_MSG) ||
if ((count*type_size <= MPIR_PARAM_ALLREDUCE_SHORT_MSG_SIZE) ||
(HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN) ||
(count < pof2)) { /* use recursive doubling */
mask = 0x1;
......
......@@ -814,7 +814,7 @@ static int MPIR_SMP_Bcast(
nbytes = type_size * count;
if ((nbytes < MPIR_BCAST_SHORT_MSG) || (comm_ptr->local_size < MPIR_BCAST_MIN_PROCS))
if ((nbytes < MPIR_PARAM_BCAST_SHORT_MSG_SIZE) || (comm_ptr->local_size < MPIR_PARAM_BCAST_MIN_PROCS))
{
/* send to intranode-rank 0 on the root's node */
if (comm_ptr->node_comm != NULL &&
......@@ -847,12 +847,12 @@ static int MPIR_SMP_Bcast(
buffer, count, datatype, 0, comm_ptr->node_comm);
}
}
else /* (nbytes > MPIR_BCAST_SHORT_MSG) && (comm_ptr->size >= MPIR_BCAST_MIN_PROCS) */
else /* (nbytes >= MPIR_PARAM_BCAST_SHORT_MSG_SIZE) && (comm_ptr->local_size >= MPIR_PARAM_BCAST_MIN_PROCS) */
{
/* supposedly...
smp+doubling good for pof2
reg+ring better for non-pof2 */
if (nbytes < MPIR_BCAST_LONG_MSG && MPIU_is_pof2(comm_ptr->local_size, NULL))
if (nbytes < MPIR_PARAM_BCAST_LONG_MSG_SIZE && MPIU_is_pof2(comm_ptr->local_size, NULL))
{
/* medium-sized msg and pof2 np */
......@@ -1015,19 +1015,19 @@ int MPIR_Bcast_intra (
nbytes = type_size * count;
if ((nbytes < MPIR_BCAST_SHORT_MSG) || (comm_size < MPIR_BCAST_MIN_PROCS))
if ((nbytes < MPIR_PARAM_BCAST_SHORT_MSG_SIZE) || (comm_size < MPIR_PARAM_BCAST_MIN_PROCS))
{
mpi_errno = MPIR_Bcast_binomial(buffer, count, datatype, root, comm_ptr);
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
}
else /* (nbytes >= MPIR_BCAST_SHORT_MSG) && (comm_size >= MPIR_BCAST_MIN_PROCS) */
else /* (nbytes >= MPIR_PARAM_BCAST_SHORT_MSG_SIZE) && (comm_size >= MPIR_PARAM_BCAST_MIN_PROCS) */
{
if ((nbytes < MPIR_BCAST_LONG_MSG) && (MPIU_is_pof2(comm_size, NULL)))
if ((nbytes < MPIR_PARAM_BCAST_LONG_MSG_SIZE) && (MPIU_is_pof2(comm_size, NULL)))
{
mpi_errno = MPIR_Bcast_scatter_doubling_allgather(buffer, count, datatype, root, comm_ptr);
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
}
else /* (nbytes >= MPIR_BCAST_LONG_MSG) || !(comm_size_is_pof2) */
else /* (nbytes >= MPIR_PARAM_BCAST_LONG_MSG_SIZE) || !(comm_size_is_pof2) */
{
/* We want the ring algorithm whether or not we have a
topologically aware communicator. Doing inter/intra-node
......
......@@ -136,12 +136,12 @@ int MPIR_Gather_intra (
/* If the message is smaller than the threshold, we will copy
* our message in there too */
if (nbytes < MPIR_GATHER_VSMALL_MSG) tmp_buf_size++;
if (nbytes < MPIR_PARAM_GATHER_VSMALL_MSG_SIZE) tmp_buf_size++;
tmp_buf_size *= nbytes;
/* For zero-ranked root, we don't need any temporary buffer */
if ((rank == root) && (!root || (nbytes >= MPIR_GATHER_VSMALL_MSG)))
if ((rank == root) && (!root || (nbytes >= MPIR_PARAM_GATHER_VSMALL_MSG_SIZE)))
tmp_buf_size = 0;
if (tmp_buf_size) {
......@@ -157,7 +157,7 @@ int MPIR_Gather_intra (
if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
}
}
else if (tmp_buf_size && (nbytes < MPIR_GATHER_VSMALL_MSG))
else if (tmp_buf_size && (nbytes < MPIR_PARAM_GATHER_VSMALL_MSG_SIZE))
{
/* copy from sendbuf into tmp_buf */
mpi_errno = MPIR_Localcopy(sendbuf, sendcnt, sendtype,
......@@ -196,7 +196,7 @@ int MPIR_Gather_intra (
&status);
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
}
else if (nbytes < MPIR_GATHER_VSMALL_MSG) {
else if (nbytes < MPIR_PARAM_GATHER_VSMALL_MSG_SIZE) {
mpi_errno = MPIC_Recv(tmp_buf, recvblks * nbytes, MPI_BYTE,
src, MPIR_GATHER_TAG, comm, &status);
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
......@@ -235,7 +235,7 @@ int MPIR_Gather_intra (
if (relative_src + mask > comm_size)
recvblks -= (relative_src + mask - comm_size);
if (nbytes < MPIR_GATHER_VSMALL_MSG)
if (nbytes < MPIR_PARAM_GATHER_VSMALL_MSG_SIZE)
offset = mask * nbytes;
else
offset = (mask - 1) * nbytes;
......@@ -260,7 +260,7 @@ int MPIR_Gather_intra (
MPIR_GATHER_TAG, comm);
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
}
else if (nbytes < MPIR_GATHER_VSMALL_MSG) {
else if (nbytes < MPIR_PARAM_GATHER_VSMALL_MSG_SIZE) {
mpi_errno = MPIC_Send(tmp_buf, curr_cnt, MPI_BYTE, dst,
MPIR_GATHER_TAG, comm);
if (mpi_errno) MPIU_ERR_POP(mpi_errno);
......@@ -293,7 +293,7 @@ int MPIR_Gather_intra (
mask <<= 1;
}
if ((rank == root) && root && (nbytes < MPIR_GATHER_VSMALL_MSG) && copy_blks)
if ((rank == root) && root && (nbytes < MPIR_PARAM_GATHER_VSMALL_MSG_SIZE) && copy_blks)
{
/* reorder and copy from tmp_buf into recvbuf */
mpi_errno = MPIR_Localcopy(tmp_buf,
......@@ -470,7 +470,7 @@ int MPIR_Gather_inter (
nbytes = sendtype_size * sendcnt * local_size;
}
if (nbytes < MPIR_GATHER_SHORT_MSG)
if (nbytes < MPIR_PARAM_GATHER_INTER_SHORT_MSG_SIZE)
{
if (root == MPI_ROOT)
{
......
......@@ -65,7 +65,6 @@ int MPIR_Gatherv (
MPI_Aint extent;
int i, reqs;
int min_procs;
char *min_procs_str;
MPI_Request *reqarray;
MPI_Status *starray;
MPIU_CHKLMEM_DECL(2);
......@@ -135,20 +134,11 @@ int MPIR_Gatherv (
irrelevant here. */
comm_size = comm_ptr->local_size;
/* FIXME: Do not use getenv, particularly each time the
routine is called. Instead, use the parameter routines */
min_procs_str = getenv("MPICH2_GATHERV_MIN_PROCS");
/* FIXME: atoi does not indicate any errors and should not be
used unless there is a separate test for correctness */
if (min_procs_str != NULL)
min_procs = atoi(min_procs_str);
else
min_procs = comm_size + 1; /* Disable ssend if env not set */
min_procs = MPIR_PARAM_GATHERV_INTER_SSEND_MIN_PROCS;
if (min_procs == -1)
min_procs = comm_size + 1; /* Disable ssend */
else if (min_procs == 0)
min_procs = MPIR_GATHERV_MIN_PROCS; /* Use the default value */
else if (min_procs == 0) /* backwards compatibility, use default value */
MPIR_PARAM_GET_DEFAULT_INT(GATHERV_INTER_SSEND_MIN_PROCS,&min_procs);
if (comm_size >= min_procs) {
mpi_errno = MPIC_Ssend(sendbuf, sendcnt, sendtype, root,
......
......@@ -367,7 +367,7 @@ int MPIR_Reduce_scatter_intra (
* a user-passed in buffer */
MPID_Ensure_Aint_fits_in_pointer(total_count * MPIR_MAX(true_extent, extent));
if ((is_commutative) && (nbytes < MPIR_REDSCAT_COMMUTATIVE_LONG_MSG)) {
if ((is_commutative) && (nbytes < MPIR_PARAM_REDSCAT_COMMUTATIVE_LONG_MSG_SIZE)) {
/* commutative and short. use recursive halving algorithm */
/* allocate temp. buffer to receive incoming data */
......@@ -582,7 +582,7 @@ int MPIR_Reduce_scatter_intra (
}
}
if (is_commutative && (nbytes >= MPIR_REDSCAT_COMMUTATIVE_LONG_MSG)) {
if (is_commutative && (nbytes >= MPIR_PARAM_REDSCAT_COMMUTATIVE_LONG_MSG_SIZE)) {
/* commutative and long message, or noncommutative and long message.
use (p-1) pairwise exchanges */
......
......@@ -838,7 +838,7 @@ int MPIR_Reduce_intra (
while (pof2 <= comm_size) pof2 <<= 1;
pof2 >>=1;
if ((count*type_size > MPIR_REDUCE_SHORT_MSG) &&
if ((count*type_size > MPIR_PARAM_REDUCE_SHORT_MSG_SIZE) &&
(HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) && (count >= pof2)) {
/* do a reduce-scatter followed by gather to root. */
mpi_errno = MPIR_Reduce_redscat_gather(sendbuf, recvbuf, count, datatype, op, root, comm_ptr);
......
......@@ -424,7 +424,7 @@ int MPIR_Scatter_inter (
nbytes = recvtype_size * recvcnt * local_size;
}
if (nbytes < MPIR_SCATTER_SHORT_MSG) {
if (nbytes < MPIR_PARAM_SCATTER_INTER_SHORT_MSG_SIZE) {
if (root == MPI_ROOT) {
/* root sends all data to rank 0 on remote group and returns */
mpi_errno = MPIC_Send(sendbuf, sendcnt*remote_size,
......
......@@ -60,6 +60,115 @@ parameters:
algorithms. Setting it to 0 causes all irecvs/isends to be
posted at once.
- category : collective
name : REDSCAT_COMMUTATIVE_LONG_MSG_SIZE
type : int
default : 524288
description : >-
the long message algorithm will be used if the operation is commutative
and the send buffer size is >= this value (in bytes)
- category : collective
name : BCAST_MIN_PROCS
type : int
default : 8
description : >-
the minimum number of processes in a communicator to use a non-binomial
broadcast algorithm
- category : collective
name : BCAST_SHORT_MSG_SIZE
type : int
default : 12288
description : >-
the short message algorithm will be used if the send buffer size is <
this value (in bytes)
- category : collective
name : BCAST_LONG_MSG_SIZE
type : int
default : 524288
description : >-
the long message algorithm will be used if the send buffer size is >=
this value (in bytes)
- category : collective
name : ALLGATHER_SHORT_MSG_SIZE
type : int
default : 81920
description : >-
For MPI_Allgather and MPI_Allgatherv, the short message algorithm will
be used if the total receive buffer size (the data gathered from all
processes) is < this value (in bytes).
- category : collective
name : ALLGATHER_LONG_MSG_SIZE
type : int
default : 524288
description : >-
For MPI_Allgather and MPI_Allgatherv, the long message algorithm will be
used if the total receive buffer size (the data gathered from all
processes) is >= this value (in bytes)
- category : collective
name : REDUCE_SHORT_MSG_SIZE
type : int
default : 2048
description : >-
the short message algorithm will be used if the send buffer size is <=
this value (in bytes)
- category : collective
name : ALLREDUCE_SHORT_MSG_SIZE
type : int
default : 2048
description : >-
the short message algorithm will be used if the send buffer size is <=
this value (in bytes)
- category : collective
name : GATHER_VSMALL_MSG_SIZE
type : int
default : 1024
description : >-
use a temporary buffer for intracommunicator MPI_Gather if the send
buffer size is < this value (in bytes)
- category : collective
name : GATHER_INTER_SHORT_MSG_SIZE
type : int
default : 2048
description : >-
use the short message algorithm for intercommunicator MPI_Gather if the
send buffer size is < this value (in bytes)
- category : collective
name : GATHERV_INTER_SSEND_MIN_PROCS
# backwards compatibility
abs-alt-env :
- MPICH2_GATHERV_MIN_PROCS
type : int
default : 32
description : >-
Use Ssend (synchronous send) for intercommunicator MPI_Gatherv if the
"group B" size is >= this value. Specifying "-1" always avoids using
Ssend. For backwards compatibility, specifying "0" uses the default
value.
- category : collective
name : SCATTER_INTER_SHORT_MSG_SIZE
type : int
default : 2048
description : >-
use the short message algorithm for intercommunicator MPI_Scatter if the
send buffer size is < this value (in bytes)
- category : collective
name : ALLGATHERV_PIPELINE_MSG_SIZE
type : int
default : 32768
description : >-
The smallest message size that will be used for the pipelined, large-message,
ring algorithm in the MPI_Allgatherv implementation.
##############################################################
# intranode communication parameters
- category : intranode
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment