Commit 2af0cf67 authored by Francois Tessier

Memory-aware aggregator placement strategy

parent 06f19beb
......@@ -4,8 +4,6 @@
* Include and adapt the getopt function (miniHACC-AoS-Tapioca-W.cpp)
* Adapt the running scripts to the binary parameters (getopt)
- Implement the Memory-aware aggregators placement
- Change fprintf to the customized printMsg
- Write a getopt function to set the subfiling/output/tiers for the benchmarks
......@@ -15,6 +13,7 @@
- Ask Silvio for Vl3D for Cooley
- If NVR, reset #agg and aggregator buffer size (persistency on local storage)
- If NLS, MPI-IO VS mmap+RMA depending on processes location
- README and LICENCE
......
......@@ -294,6 +294,31 @@ char* Memory::memName ( ) {
}
char* Memory::memName ( mem_t mem ) {
switch ( mem )
{
case DDR:
return "DDR";
break;
case HBM:
return "HBM";
break;
case PFS:
return "PFS";
break;
case NLS:
return "NLS";
break;
case NVR:
return "NVR";
break;
default:
printMsg ( ERROR, "Wrong memory type!\n" );
MPI_Abort ( MPI_COMM_WORLD, -1 );
}
}
mem_t Memory::memTypeByName ( char* name ) {
if ( ! strcmp ( "DDR", name ) ) return DDR;
if ( ! strcmp ( "HBM", name ) ) return HBM;
......@@ -344,3 +369,68 @@ char* Memory::memPath ( ) {
else
return "";
}
/*
* kBps
*/
int64_t Memory::memBandwidth ( mem_t mem ) {
switch ( mem )
{
case DDR:
return 90000000;
break;
case HBM:
return 350000000;
break;
case PFS:
return 1800000;
break;
case NLS:
return 1800000;
break;
case NVR:
return 400000;
break;
default:
printMsg ( ERROR, "Wrong memory type!\n" );
MPI_Abort ( MPI_COMM_WORLD, -1 );
}
}
/*
* ms
*/
int64_t Memory::memLatency ( mem_t mem ) {
switch ( mem )
{
case DDR:
return 2;
break;
case HBM:
return 1;
break;
case PFS:
return 30;
break;
case NLS:
return 5;
break;
case NVR:
return 5;
break;
default:
printMsg ( ERROR, "Wrong memory type!\n" );
MPI_Abort ( MPI_COMM_WORLD, -1 );
}
}
int64_t Memory::memCapacity ( mem_t mem ) {
return 0;
}
bool Memory::memPersistency ( mem_t mem ) {
return false;
}
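
The figures above are the raw inputs to the new placement strategy: bandwidth is expressed in kBps and latency in ms, while memCapacity and memPersistency are still stubs. The self-contained sketch below mirrors those constants to show how they combine into a transfer-time estimate; the enum and the numbers are copied from this diff, and estimateTransferMs is a hypothetical illustration, not a TAPIOCA function.

#include <cstdint>
#include <cstdio>

enum mem_t { DDR, HBM, PFS, NLS, NVR };

/* Figures mirrored from Memory::memBandwidth (kBps) and Memory::memLatency (ms) above */
static int64_t bandwidthKBps ( mem_t m ) {
  switch ( m ) {
    case DDR: return 90000000;
    case HBM: return 350000000;
    case PFS: return 1800000;
    case NLS: return 1800000;
    case NVR: return 400000;
  }
  return 0;
}

static int64_t latencyMs ( mem_t m ) {
  switch ( m ) {
    case DDR: return 2;
    case HBM: return 1;
    case PFS: return 30;
    case NLS: return 5;
    case NVR: return 5;
  }
  return 0;
}

/* Hypothetical helper: time (ms) to move dataKB through a tier located `hops` network hops away */
static double estimateTransferMs ( mem_t m, int hops, int64_t dataKB ) {
  return hops * (double) latencyMs ( m ) + (double) dataKB / bandwidthKBps ( m );
}

int main ( ) {
  /* e.g. 1 GiB (1048576 kB) staged two hops away */
  fprintf ( stdout, "DDR: %.3f ms\n", estimateTransferMs ( DDR, 2, 1048576 ) );
  fprintf ( stdout, "NVR: %.3f ms\n", estimateTransferMs ( NVR, 2, 1048576 ) );
  return 0;
}
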
......@@ -37,6 +37,7 @@ public:
/* |-- Utils */
/**********************/
char* memName ( );
char* memName ( mem_t mem );
mem_t memTypeByName ( char* name );
mem_t memTypeByPath ( char* path );
......@@ -48,6 +49,11 @@ public:
int64_t memCapacity ( );
bool memPersistency ( );
char* memPath ( );
int64_t memBandwidth ( mem_t mem );
int64_t memLatency ( mem_t mem );
int64_t memCapacity ( mem_t mem );
bool memPersistency ( mem_t mem );
};
#endif // TP_MEMORY_H
......
......@@ -99,6 +99,15 @@ int Topology::IONodesPerFile ( char* filename, int *nodesList ) {
return nLnets;
}
int Topology::ListOfMemoryTiers ( mem_t* memList ) {
memList[0] = DDR;
memList[1] = HBM;
memList[2] = NVR;
return 3;
}
/**********************/
......@@ -118,6 +127,16 @@ int Topology::NetworkDimensions () {
}
int64_t Topology::NetworkBandwidth () {
return 1800000;
}
int64_t Topology::NetworkLatency () {
return 30;
}
/* |---- Coordinates */
void Topology::RankToCoordinates ( int rank, int* coord ) {
pmi_mesh_coord_t xyz;
......
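
ListOfMemoryTiers fills a caller-supplied array with the tiers available on the node (DDR, HBM and NVR here) and returns their count, while NetworkBandwidth and NetworkLatency expose the interconnect figures previously hard-coded as the LATENCY and BANDWIDTH macros removed further down. A short usage sketch follows; the include names and the printTierSummary helper are assumptions for illustration, not code from the repository.

#include <cstdio>
#include "tp_memory.hpp"     /* assumed header names; the Memory and Topology classes are the ones in this diff */
#include "tp_topology.hpp"

/* Hypothetical helper: list each available tier with its bandwidth (kBps) and latency (ms) */
void printTierSummary ( Topology &topology, Memory &mem ) {
  mem_t memList[10];   /* same fixed bound used by Tapioca::RankMemoryAware */
  int memCount = topology.ListOfMemoryTiers ( memList );
  for ( int m = 0; m < memCount; m++ )
    fprintf ( stdout, "%s: %lld kBps, %lld ms\n",
              mem.memName ( memList[m] ),
              (long long) mem.memBandwidth ( memList[m] ),
              (long long) mem.memLatency ( memList[m] ) );
}
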
......@@ -29,11 +29,15 @@ public:
int LocalCoreId ();
int ProcessPerNode ();
int IONodesPerFile ( char* filename, int *nodesList );
int ListOfMemoryTiers ( mem_t* memList );
/**********************/
/* |-- Network */
/**********************/
int NetworkDimensions ();
int64_t NetworkBandwidth ();
int64_t NetworkLatency ();
/* |---- Coordinates */
void RankToCoordinates ( int rank, int* coord );
......
......@@ -26,6 +26,7 @@ public:
/* |-- Utils */
/************************/
virtual char* memName ( ) = 0;
virtual char* memName ( mem_t mem ) = 0;
virtual mem_t memTypeByName ( char* name ) = 0;
virtual mem_t memTypeByPath ( char* path ) = 0;
......@@ -38,6 +39,10 @@ public:
virtual bool memPersistency ( ) = 0;
virtual char* memPath ( ) = 0;
virtual int64_t memBandwidth ( mem_t mem ) = 0;
virtual int64_t memLatency ( mem_t mem ) = 0;
virtual int64_t memCapacity ( mem_t mem ) = 0;
virtual bool memPersistency ( mem_t mem ) = 0;
/* Temporary */
void *buffer_;
......
......@@ -4,6 +4,8 @@
#include <stdio.h>
#include <stdlib.h>
#include "tp_memory_interface.hpp"
class iTopology {
public:
/**********************/
......@@ -16,11 +18,14 @@ class iTopology {
virtual int GlobalCoreId () = 0;
virtual int LocalCoreId () = 0;
virtual int ProcessPerNode () = 0;
virtual int ListOfMemoryTiers ( mem_t* memList ) = 0;
/**********************/
/* |-- Network */
/**********************/
virtual int NetworkDimensions () = 0;
virtual int64_t NetworkBandwidth () = 0;
virtual int64_t NetworkLatency () = 0;
/* |---- Coordinates */
virtual void RankToCoordinates ( int rank, int* coord ) = 0;
......
......@@ -217,6 +217,7 @@ void Tapioca::ParseEnvVariables ()
strcmp(envStrategy, "SHORTEST_PATH") ? 0 : this->strategy_ = SHORTEST_PATH;
strcmp(envStrategy, "LONGEST_PATH") ? 0 : this->strategy_ = LONGEST_PATH;
strcmp(envStrategy, "TOPOLOGY_AWARE") ? 0 : this->strategy_ = TOPOLOGY_AWARE;
strcmp(envStrategy, "MEMORY_AWARE") ? 0 : this->strategy_ = MEMORY_AWARE;
strcmp(envStrategy, "CONTENTION_AWARE") ? 0 : this->strategy_ = CONTENTION_AWARE;
strcmp(envStrategy, "UNIFORM") ? 0 : this->strategy_ = UNIFORM;
strcmp(envStrategy, "RANDOM") ? 0 : this->strategy_ = RANDOM;
......@@ -557,6 +558,9 @@ void Tapioca::ElectAggregators ()
case TOPOLOGY_AWARE:
aggrRank = this->RankTopologyAware (aggrComm, color);
break;
case MEMORY_AWARE:
aggrRank = this->RankMemoryAware (aggrComm, color);
break;
case CONTENTION_AWARE:
aggrRank = this->RankContentionAware (aggrComm, color);
break;
......@@ -646,6 +650,7 @@ const char* Tapioca::getStrategyName ()
case SHORTEST_PATH: return "Shortest path";
case LONGEST_PATH: return "Longest path";
case TOPOLOGY_AWARE: return "Topology-aware placement";
case MEMORY_AWARE: return "Memory-aware placement";
case CONTENTION_AWARE: return "Contention-aware placement";
case UNIFORM: return "Uniform placement";
case RANDOM : return "Random placement";
......
......@@ -2,8 +2,6 @@
#define TAPIOCA_H
#define MASTER 0
#define LATENCY 30
#define BANDWIDTH 1800000
#include <stdio.h>
#include <stdlib.h>
......@@ -25,6 +23,7 @@ enum MAPPING_STRATEGY
SHORTEST_PATH,
LONGEST_PATH,
TOPOLOGY_AWARE,
MEMORY_AWARE,
CONTENTION_AWARE,
UNIFORM,
RANDOM
......@@ -99,6 +98,7 @@ protected:
int RankShortestPath (MPI_Comm aggrComm, int64_t dataSize);
int RankLongestPath (MPI_Comm aggrComm, int64_t dataSize);
int RankTopologyAware (MPI_Comm aggrComm, int64_t dataSize);
int RankMemoryAware (MPI_Comm aggrComm, int64_t dataSize);
int RankContentionAware (MPI_Comm aggrComm, int64_t dataSize);
int RankUniformDistribution (MPI_Comm aggrComm, int64_t dataSize);
int RankRandom (MPI_Comm aggrComm, int64_t dataSize);
......
......@@ -89,26 +89,16 @@ int Tapioca::RankTopologyAware (MPI_Comm aggrComm, int64_t dataSize)
if ( rank != aggrCommRank ) {
distance = topology.DistanceBetweenRanks ( globalRanks[rank], worldRank );
// aggrCost.cost = std::max ( distance * LATENCY + (double)dataDistrib[rank] / BANDWIDTH,
// aggrCost.cost = std::max ( distance * this->topology.NetworkLatency () + (double)dataDistrib[rank] / this->topology.NetworkBandwidth (),
// aggrCost.cost );
aggrCost.cost += (distance * LATENCY + (double)dataDistrib[rank] / BANDWIDTH);
aggrCost.cost += (distance * this->topology.NetworkLatency ()
+ (double)dataDistrib[rank] / this->topology.NetworkBandwidth () );
}
}
// IOnodesList = (int *) malloc ( MAX_IONODES * sizeof ( int ) );
// nIOnodes = topology.IONodesPerFile (this->filename_, IOnodesList);
// if ( this->commRank_ == 0 ) {
// fprintf (stdout, "[LUSTRE] nLnet = %d\n", nIOnodes);
// fprintf (stdout, "[LUSTRE] list = ");
// for ( int i = 0; i < nIOnodes; i++ )
// fprintf (stdout, "%d ", IOnodesList[i]);
// fprintf (stdout, "\n");
// }
#ifdef BGQ
aggrCost.cost += topology.DistanceToIONode ( worldRank ) * LATENCY + (double)aggregatedData / BANDWIDTH;
#endif
if ( topology.DistanceToIONode ( worldRank ) != 0 )
aggrCost.cost += topology.DistanceToIONode ( worldRank ) * this->topology.NetworkLatency ()
+ (double)aggregatedData / this->topology.NetworkBandwidth ();
if ( this->excludedNode[this->hostId_] )
aggrCost.cost = DBL_MAX;
......@@ -134,6 +124,93 @@ int Tapioca::RankTopologyAware (MPI_Comm aggrComm, int64_t dataSize)
}
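
The removed LATENCY (30) and BANDWIDTH (1800000) macros carried the same values that Topology::NetworkLatency() and Topology::NetworkBandwidth() now return, so the topology-aware cost is numerically unchanged; the network figures simply move behind the Topology interface, next to the per-tier memory figures used by the new strategy below. The I/O-node term also loses its BGQ-only guard and is now added whenever the distance to the I/O node is non-zero.
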
int Tapioca::RankMemoryAware (MPI_Comm aggrComm, int64_t dataSize)
{
struct { double cost; int rank; } aggrCost, minCost;
double current_cost = DBL_MAX;
int aggrCommRank, aggrCommSize, worldRank, rank, distance, dim, hops, aggrRank, nIOnodes, memCount, m;
mem_t memList[10], best_mem = DDR;
Memory mem;
int64_t *dataDistrib, aggregatedData = 0, latency, bandwidth;
int *srcCoords, *destCoords, *globalRanks, *IOnodesList;
MPI_Comm_rank (aggrComm, &aggrCommRank);
MPI_Comm_size (aggrComm, &aggrCommSize);
MPI_Comm_rank (MPI_COMM_WORLD, &worldRank);
aggrCost.rank = aggrCommRank;
aggrCost.cost = 0;
dataDistrib = (int64_t*) malloc (aggrCommSize * sizeof(int64_t));
globalRanks = (int *) malloc (aggrCommSize * sizeof(int));
MPI_Allgather(&worldRank, 1, MPI_INT, globalRanks, 1, MPI_INT, aggrComm);
MPI_Allgather(&dataSize, 1, MPI_LONG_LONG, dataDistrib, 1, MPI_LONG_LONG, aggrComm);
memCount = topology.ListOfMemoryTiers ( memList );
for ( m = 0; m < memCount; m ++ ) {
for ( rank = 0; rank < aggrCommSize; rank++ ) {
/* Accumulate the total volume only on the first pass, not once per memory tier */
if ( m == 0 )
aggregatedData += dataDistrib[rank];
if ( rank != aggrCommRank ) {
distance = topology.DistanceBetweenRanks ( globalRanks[rank], worldRank );
if ( distance == 0 ) {
latency = mem.memLatency ( memList[m] );
bandwidth = mem.memBandwidth ( memList[m] );
}
else {
latency = std::max ( mem.memLatency ( memList[m] ),
this->topology.NetworkLatency () );
bandwidth = std::min ( mem.memBandwidth ( memList[m] ),
this->topology.NetworkBandwidth () );
}
// aggrCost.cost = std::max ( distance * latency + (double)dataDistrib[rank] / bandwidth,
// aggrCost.cost );
aggrCost.cost += ( distance * latency + (double)dataDistrib[rank] / bandwidth );
}
}
if ( aggrCost.cost < current_cost ) {
current_cost = aggrCost.cost;
best_mem = memList[m];
}
aggrCost.cost = 0;
}
aggrCost.cost = current_cost;
if ( topology.DistanceToIONode ( worldRank ) != 0 )
aggrCost.cost += topology.DistanceToIONode ( worldRank ) * this->topology.NetworkLatency ()
+ (double)aggregatedData / this->topology.NetworkBandwidth ();
if ( this->excludedNode[this->hostId_] )
aggrCost.cost = DBL_MAX;
MPI_Allreduce ( &aggrCost, &minCost, 1, MPI_DOUBLE_INT, MPI_MINLOC, aggrComm );
MPI_Reduce ( &dataSize, &this->aggrDataSize_, 1, MPI_LONG_LONG, MPI_SUM, minCost.rank, aggrComm );
if ( minCost.rank == aggrCommRank ) {
aggrRank = this->commRank_;
this->amAnAggr_ = true;
}
MPI_Bcast ( &aggrRank, 1, MPI_INT, minCost.rank, aggrComm );
MPI_Bcast ( &best_mem, 1, MPI_INT, minCost.rank, aggrComm );
this->memAggr_ = best_mem;
#ifdef DBG
if ( minCost.rank == aggrCommRank )
fprintf (stdout, "[DEBUG] Aggr. rank %d in aggrComm, distance to I/O node %d hops, cost: %.4f, mem: %s\n",
minCost.rank, topology.DistanceToIONode ( worldRank ), minCost.cost, mem.memName( best_mem ) );
#endif
return aggrRank;
}
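
Taken together, RankMemoryAware extends RankTopologyAware with a choice of staging tier: for every tier returned by ListOfMemoryTiers, each remote rank contributes distance * max(memLatency, NetworkLatency) + data / min(memBandwidth, NetworkBandwidth) (pure memory figures when the two ranks share a node), the cheapest tier is kept, the path to the I/O node is added, and the MPI_MINLOC reduction elects the cheapest rank, whose tier choice is then broadcast into memAggr_. As a worked example with the constants introduced in this commit, a peer two hops away holding 1048576 kB contributes 2 * max(2, 30) + 1048576 / min(90000000, 1800000) ≈ 60.6 for DDR versus 2 * max(5, 30) + 1048576 / min(400000, 1800000) ≈ 62.6 for NVR, so DDR would be preferred for that pair; for remote peers the tiers differ only through the bandwidth term, since the 30 ms network latency dominates every memory latency in the list.
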
int Tapioca::RankContentionAware (MPI_Comm aggrComm, int64_t dataSize)
{
struct { double cost; int rank; } aggrCost, minCost;
......@@ -226,14 +303,14 @@ int Tapioca::RankContentionAware (MPI_Comm aggrComm, int64_t dataSize)
if ( rank != aggrCommRank ) {
aggrCost.cost = std::max ( (double)dataDistrib[rank] / ( BANDWIDTH / routeCost[srcNode] ),
aggrCost.cost = std::max ( (double)dataDistrib[rank] / ( this->topology.NetworkBandwidth () / routeCost[srcNode] ),
aggrCost.cost );
}
}
/* I/O Node */
srcNode = this->worldRank_ / ppn;
aggrCost.cost += aggregatedData / ( BANDWIDTH / routeCost[srcNode] );
aggrCost.cost += aggregatedData / ( this->topology.NetworkBandwidth () / routeCost[srcNode] );
if ( this->excludedNode[this->hostId_] )
......