Commit 2af0cf67 authored by Francois Tessier

Memory-aware aggregator placement strategy

parent 06f19beb
@@ -4,8 +4,6 @@
 * Include and adapt the getopt function (miniHACC-AoS-Tapioca-W.cpp)
 * Adapt the running scripts to the binary parameters (getopt)
-- Implement the Memory-aware aggregators placement
 - Change fprintf to the customized printMsg
 - Write a getopt function to set the subfiling/output/tiers for the benchmarks
@@ -15,6 +13,7 @@
 - Ask Silvio for Vl3D for Cooley
 - If NVR, reset #agg and aggregator buffer size (persistency on local storage)
+- If NLS, MPI-IO VS mmap+RMA depending on processes location
 - README and LICENCE
...
@@ -294,6 +294,31 @@ char* Memory::memName ( ) {
 }
+
+char* Memory::memName ( mem_t mem ) {
+  switch ( mem )
+  {
+    case DDR:
+      return "DDR";
+      break;
+    case HBM:
+      return "HBM";
+      break;
+    case PFS:
+      return "PFS";
+      break;
+    case NLS:
+      return "NLS";
+      break;
+    case NVR:
+      return "NVR";
+      break;
+    default:
+      printMsg ( ERROR, "Wrong memory type!\n" );
+      MPI_Abort ( MPI_COMM_WORLD, -1 );
+  }
+}
 mem_t Memory::memTypeByName ( char* name ) {
   if ( ! strcmp ( "DDR", name ) ) return DDR;
   if ( ! strcmp ( "HBM", name ) ) return HBM;
@@ -344,3 +369,68 @@ char* Memory::memPath ( ) {
   else
     return "";
 }
+
+/*
+ * kBps
+ */
+int64_t Memory::memBandwidth ( mem_t mem ) {
+  switch ( mem )
+  {
+    case DDR:
+      return 90000000;
+      break;
+    case HBM:
+      return 350000000;
+      break;
+    case PFS:
+      return 1800000;
+      break;
+    case NLS:
+      return 1800000;
+      break;
+    case NVR:
+      return 400000;
+      break;
+    default:
+      printMsg ( ERROR, "Wrong memory type!\n" );
+      MPI_Abort ( MPI_COMM_WORLD, -1 );
+  }
+}
+
+/*
+ * ms
+ */
+int64_t Memory::memLatency ( mem_t mem ) {
+  switch ( mem )
+  {
+    case DDR:
+      return 2;
+      break;
+    case HBM:
+      return 1;
+      break;
+    case PFS:
+      return 30;
+      break;
+    case NLS:
+      return 5;
+      break;
+    case NVR:
+      return 5;
+      break;
+    default:
+      printMsg ( ERROR, "Wrong memory type!\n" );
+      MPI_Abort ( MPI_COMM_WORLD, -1 );
+  }
+}
+
+int64_t Memory::memCapacity ( mem_t mem ) {
+  return 0;
+}
+
+bool Memory::memPersistency ( mem_t mem ) {
+  return false;
+}
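
Note on the new accessors: memBandwidth is expressed in kB/s and memLatency in ms (see the comments above), while memCapacity and memPersistency are still placeholders returning 0 and false for every tier. As a quick illustration of what these figures mean for the placement decision, here is a minimal, self-contained sketch that turns them into a per-tier transfer-time estimate of the same latency-plus-size/bandwidth form used later in RankMemoryAware; the enum, helper names and buffer size below are illustrative only and not part of this commit.

```cpp
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Illustrative stand-ins for mem_t and the Memory accessors added in this commit.
enum mem_t { DDR, HBM, PFS, NLS, NVR };

// Same figures as Memory::memBandwidth (kB/s) and Memory::memLatency (ms) above.
static int64_t bandwidth_kBps ( mem_t m ) {
  switch ( m ) {
    case DDR: return 90000000;
    case HBM: return 350000000;
    case PFS: return 1800000;
    case NLS: return 1800000;
    case NVR: return 400000;
  }
  return 0;
}

static int64_t latency_ms ( mem_t m ) {
  switch ( m ) {
    case DDR: return 2;
    case HBM: return 1;
    case PFS: return 30;
    case NLS: return 5;
    case NVR: return 5;
  }
  return 0;
}

int main ( ) {
  const int64_t dataSize_kB = 4096;  // hypothetical 4 MB aggregation buffer
  for ( mem_t m : { DDR, HBM, NVR } ) {
    // One-hop estimate: access latency plus serialization time, in milliseconds.
    double est_ms = (double) latency_ms ( m )
                  + (double) dataSize_kB / (double) bandwidth_kBps ( m ) * 1000.0;
    printf ( "tier %d: ~%.3f ms for %lld kB\n", (int) m, est_ms, (long long) dataSize_kB );
  }
  return 0;
}
```

With these values, a 4 MB buffer costs roughly 1.0 ms on HBM, 2.0 ms on DDR and 15.2 ms on NVR, which is the spread the memory-aware election exploits.
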
@@ -37,6 +37,7 @@ public:
   /* |-- Utils */
   /**********************/
   char* memName ( );
+  char* memName ( mem_t mem );
   mem_t memTypeByName ( char* name );
   mem_t memTypeByPath ( char* path );
@@ -48,6 +49,11 @@ public:
   int64_t memCapacity ( );
   bool memPersistency ( );
   char* memPath ( );
+
+  int64_t memBandwidth ( mem_t mem );
+  int64_t memLatency ( mem_t mem );
+  int64_t memCapacity ( mem_t mem );
+  bool memPersistency ( mem_t mem );
 };
 #endif // TP_MEMORY_H
...
@@ -99,6 +99,15 @@ int Topology::IONodesPerFile ( char* filename, int *nodesList ) {
   return nLnets;
 }
+
+int Topology::ListOfMemoryTiers ( mem_t* memList ) {
+  memList[0] = DDR;
+  memList[1] = HBM;
+  memList[2] = NVR;
+  return 3;
+}
+
 /**********************/
@@ -118,6 +127,16 @@ int Topology::NetworkDimensions () {
 }
+
+int64_t Topology::NetworkBandwidth () {
+  return 1800000;
+}
+
+int64_t Topology::NetworkLatency () {
+  return 30;
+}
+
 /* |---- Coordinates */
 void Topology::RankToCoordinates ( int rank, int* coord ) {
   pmi_mesh_coord_t xyz;
...
@@ -29,11 +29,15 @@ public:
   int LocalCoreId ();
   int ProcessPerNode ();
   int IONodesPerFile ( char* filename, int *nodesList );
+  int ListOfMemoryTiers ( mem_t* memList );
 
   /**********************/
   /* |-- Network */
   /**********************/
   int NetworkDimensions ();
+  int64_t NetworkBandwidth ();
+  int64_t NetworkLatency ();
 
   /* |---- Coordinates */
   void RankToCoordinates ( int rank, int* coord );
...
@@ -26,6 +26,7 @@ public:
   /* |-- Utils */
   /************************/
   virtual char* memName ( ) = 0;
+  virtual char* memName ( mem_t mem ) = 0;
   virtual mem_t memTypeByName ( char* name ) = 0;
   virtual mem_t memTypeByPath ( char* path ) = 0;
@@ -38,6 +39,10 @@ public:
   virtual bool memPersistency ( ) = 0;
   virtual char* memPath ( ) = 0;
+  virtual int64_t memBandwidth ( mem_t mem ) = 0;
+  virtual int64_t memLatency ( mem_t mem ) = 0;
+  virtual int64_t memCapacity ( mem_t mem ) = 0;
+  virtual bool memPersistency ( mem_t mem ) = 0;
 
   /* Temporary */
   void *buffer_;
...
@@ -4,6 +4,8 @@
 #include <stdio.h>
 #include <stdlib.h>
+
+#include "tp_memory_interface.hpp"
 
 class iTopology {
 public:
   /**********************/
@@ -16,11 +18,14 @@ class iTopology {
   virtual int GlobalCoreId () = 0;
   virtual int LocalCoreId () = 0;
   virtual int ProcessPerNode () = 0;
+  virtual int ListOfMemoryTiers ( mem_t* memList ) = 0;
 
   /**********************/
   /* |-- Network */
   /**********************/
   virtual int NetworkDimensions () = 0;
+  virtual int64_t NetworkBandwidth () = 0;
+  virtual int64_t NetworkLatency () = 0;
 
   /* |---- Coordinates */
   virtual void RankToCoordinates ( int rank, int* coord ) = 0;
...
@@ -217,6 +217,7 @@ void Tapioca::ParseEnvVariables ()
   strcmp(envStrategy, "SHORTEST_PATH") ? 0 : this->strategy_ = SHORTEST_PATH;
   strcmp(envStrategy, "LONGEST_PATH") ? 0 : this->strategy_ = LONGEST_PATH;
   strcmp(envStrategy, "TOPOLOGY_AWARE") ? 0 : this->strategy_ = TOPOLOGY_AWARE;
+  strcmp(envStrategy, "MEMORY_AWARE") ? 0 : this->strategy_ = MEMORY_AWARE;
   strcmp(envStrategy, "CONTENTION_AWARE") ? 0 : this->strategy_ = CONTENTION_AWARE;
   strcmp(envStrategy, "UNIFORM") ? 0 : this->strategy_ = UNIFORM;
   strcmp(envStrategy, "RANDOM") ? 0 : this->strategy_ = RANDOM;
@@ -557,6 +558,9 @@ void Tapioca::ElectAggregators ()
     case TOPOLOGY_AWARE:
       aggrRank = this->RankTopologyAware (aggrComm, color);
       break;
+    case MEMORY_AWARE:
+      aggrRank = this->RankMemoryAware (aggrComm, color);
+      break;
     case CONTENTION_AWARE:
       aggrRank = this->RankContentionAware (aggrComm, color);
       break;
@@ -646,6 +650,7 @@ const char* Tapioca::getStrategyName ()
     case SHORTEST_PATH: return "Shortest path";
     case LONGEST_PATH: return "Longest path";
     case TOPOLOGY_AWARE: return "Topology-aware placement";
+    case MEMORY_AWARE: return "Memory-aware placement";
     case CONTENTION_AWARE: return "Contention-aware placement";
     case UNIFORM: return "Uniform placement";
     case RANDOM : return "Random placement";
...
@@ -2,8 +2,6 @@
 #define TAPIOCA_H
 
 #define MASTER 0
-#define LATENCY 30
-#define BANDWIDTH 1800000
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -25,6 +23,7 @@ enum MAPPING_STRATEGY
   SHORTEST_PATH,
   LONGEST_PATH,
   TOPOLOGY_AWARE,
+  MEMORY_AWARE,
   CONTENTION_AWARE,
   UNIFORM,
   RANDOM
@@ -99,6 +98,7 @@ protected:
   int RankShortestPath (MPI_Comm aggrComm, int64_t dataSize);
   int RankLongestPath (MPI_Comm aggrComm, int64_t dataSize);
   int RankTopologyAware (MPI_Comm aggrComm, int64_t dataSize);
+  int RankMemoryAware (MPI_Comm aggrComm, int64_t dataSize);
   int RankContentionAware (MPI_Comm aggrComm, int64_t dataSize);
   int RankUniformDistribution (MPI_Comm aggrComm, int64_t dataSize);
   int RankRandom (MPI_Comm aggrComm, int64_t dataSize);
...
@@ -89,26 +89,16 @@ int Tapioca::RankTopologyAware (MPI_Comm aggrComm, int64_t dataSize)
     if ( rank != aggrCommRank ) {
       distance = topology.DistanceBetweenRanks ( globalRanks[rank], worldRank );
-      // aggrCost.cost = std::max ( distance * LATENCY + (double)dataDistrib[rank] / BANDWIDTH,
+      // aggrCost.cost = std::max ( distance * this->topology.NetworkLatency () + (double)dataDistrib[rank] / this->topology.NetworkBandwidth (),
       //                            aggrCost.cost );
-      aggrCost.cost += (distance * LATENCY + (double)dataDistrib[rank] / BANDWIDTH);
+      aggrCost.cost += (distance * this->topology.NetworkLatency ()
+                        + (double)dataDistrib[rank] / this->topology.NetworkBandwidth () );
     }
   }
 
-  // IOnodesList = (int *) malloc ( MAX_IONODES * sizeof ( int ) );
-  // nIOnodes = topology.IONodesPerFile (this->filename_, IOnodesList);
-  // if ( this->commRank_ == 0 ) {
-  // fprintf (stdout, "[LUSTRE] nLnet = %d\n", nIOnodes);
-  // fprintf (stdout, "[LUSTRE] list = ");
-  // for ( int i = 0; i < nIOnodes; i++ )
-  // fprintf (stdout, "%d ", IOnodesList[i]);
-  // fprintf (stdout, "\n");
-  // }
-#ifdef BGQ
-  aggrCost.cost += topology.DistanceToIONode ( worldRank ) * LATENCY + (double)aggregatedData / BANDWIDTH;
-#endif
+  if ( topology.DistanceToIONode ( worldRank ) != 0 )
+    aggrCost.cost += topology.DistanceToIONode ( worldRank ) * this->topology.NetworkLatency ()
+      + (double)aggregatedData / this->topology.NetworkBandwidth ();
 
   if ( this->excludedNode[this->hostId_] )
     aggrCost.cost = DBL_MAX;
@@ -134,6 +124,93 @@ int Tapioca::RankTopologyAware (MPI_Comm aggrComm, int64_t dataSize)
 }
+
+int Tapioca::RankMemoryAware (MPI_Comm aggrComm, int64_t dataSize)
+{
+  struct { double cost; int rank; } aggrCost, minCost;
+  double current_cost = DBL_MAX;
+  int aggrCommRank, aggrCommSize, worldRank, rank, distance, dim, hops, aggrRank, nIOnodes, memCount, m;
+  mem_t memList[10], best_mem = DDR;
+  Memory mem;
+  int64_t *dataDistrib, aggregatedData = 0, latency, bandwidth;
+  int *srcCoords, *destCoords, *globalRanks, *IOnodesList;
+
+  MPI_Comm_rank (aggrComm, &aggrCommRank);
+  MPI_Comm_size (aggrComm, &aggrCommSize);
+  MPI_Comm_rank (MPI_COMM_WORLD, &worldRank);
+
+  aggrCost.rank = aggrCommRank;
+  aggrCost.cost = 0;
+
+  dataDistrib = (int64_t*) malloc (aggrCommSize * sizeof(int64_t));
+  globalRanks = (int *) malloc (aggrCommSize * sizeof(int));
+
+  MPI_Allgather(&worldRank, 1, MPI_INT, globalRanks, 1, MPI_INT, aggrComm);
+  MPI_Allgather(&dataSize, 1, MPI_LONG_LONG, dataDistrib, 1, MPI_LONG_LONG, aggrComm);
+
+  memCount = topology.ListOfMemoryTiers ( memList );
+
+  for ( m = 0; m < memCount; m ++ ) {
+    for ( rank = 0; rank < aggrCommSize; rank++ ) {
+      aggregatedData += dataDistrib[rank];
+
+      if ( rank != aggrCommRank ) {
+        distance = topology.DistanceBetweenRanks ( globalRanks[rank], worldRank );
+
+        if ( distance == 0 ) {
+          latency = mem.memLatency ( memList[m] );
+          bandwidth = mem.memBandwidth ( memList[m] );
+        }
+        else {
+          latency = std::max ( mem.memLatency ( memList[m] ),
+                               this->topology.NetworkLatency () );
+          bandwidth = std::min ( mem.memBandwidth ( memList[m] ),
+                                 this->topology.NetworkBandwidth () );
+        }
+
+        // aggrCost.cost = std::max ( distance * latency + (double)dataDistrib[rank] / bandwidth,
+        //                            aggrCost.cost );
+        aggrCost.cost += ( distance * latency + (double)dataDistrib[rank] / bandwidth );
+      }
+    }
+
+    if ( aggrCost.cost < current_cost ) {
+      current_cost = aggrCost.cost;
+      best_mem = memList[m];
+    }
+
+    aggrCost.cost = 0;
+  }
+
+  aggrCost.cost = current_cost;
+
+  if ( topology.DistanceToIONode ( worldRank ) != 0 )
+    aggrCost.cost += topology.DistanceToIONode ( worldRank ) * this->topology.NetworkLatency ()
+      + (double)aggregatedData / this->topology.NetworkBandwidth ();
+
+  if ( this->excludedNode[this->hostId_] )
+    aggrCost.cost = DBL_MAX;
+
+  MPI_Allreduce ( &aggrCost, &minCost, 1, MPI_DOUBLE_INT, MPI_MINLOC, aggrComm );
+  MPI_Reduce ( &dataSize, &this->aggrDataSize_, 1, MPI_LONG_LONG, MPI_SUM, minCost.rank, aggrComm );
+
+  if ( minCost.rank == aggrCommRank ) {
+    aggrRank = this->commRank_;
+    this->amAnAggr_ = true;
+  }
+
+  MPI_Bcast ( &aggrRank, 1, MPI_INT, minCost.rank, aggrComm );
+  MPI_Bcast ( &best_mem, 1, MPI_INT, minCost.rank, aggrComm );
+  this->memAggr_ = best_mem;
+
+#ifdef DBG
+  if ( minCost.rank == aggrCommRank )
+    fprintf (stdout, "[DEBUG] Aggr. rank %d in aggrComm, distance to I/O node %d hops, cost: %.4f, mem: %s\n",
+             minCost.rank, topology.DistanceToIONode ( worldRank ), minCost.cost, mem.memName( best_mem ) );
+#endif
+
+  return aggrRank;
+}
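
The tier loop above is the core of the new strategy: for each candidate tier, every remote sender contributes its hop distance times the larger of the tier and network latencies plus its data size over the smaller of the tier and network bandwidths, senders co-located with the candidate use the tier figures alone, and the cheapest tier is kept (current_cost / best_mem) before the usual I/O-node term and the MPI_MINLOC election. A stand-alone sketch of that per-tier cost, with illustrative names, figures and a toy data distribution that are not taken from the commit:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Illustrative per-tier and network figures (kB/s and ms), mirroring
// Memory::memBandwidth/memLatency and Topology::NetworkBandwidth/NetworkLatency.
struct Tier { const char* name; int64_t bw_kBps; int64_t lat_ms; };
static const int64_t NET_BW = 1800000, NET_LAT = 30;

// Cost of aggregating on one candidate rank through a given tier: each remote
// sender pays distance * max(tier, network latency) plus its data over
// min(tier, network bandwidth); co-located senders use the tier figures only.
static double tierCost ( const Tier& t,
                         const std::vector<std::pair<int, int64_t>>& senders ) {
  double cost = 0.0;
  for ( const auto& s : senders ) {
    int distance = s.first;    // hops to the candidate aggregator
    int64_t data = s.second;   // kB sent by this rank
    int64_t lat  = ( distance == 0 ) ? t.lat_ms : std::max ( t.lat_ms, NET_LAT );
    int64_t bw   = ( distance == 0 ) ? t.bw_kBps : std::min ( t.bw_kBps, NET_BW );
    cost += distance * lat + (double) data / bw;
  }
  return cost;
}

int main ( ) {
  // Hypothetical distribution: three senders at 0, 1 and 2 hops, 1 MB each.
  std::vector<std::pair<int, int64_t>> senders = { {0, 1024}, {1, 1024}, {2, 1024} };
  Tier tiers[] = { { "DDR", 90000000, 2 }, { "HBM", 350000000, 1 }, { "NVR", 400000, 5 } };
  for ( const Tier& t : tiers )
    printf ( "%s: cost %.4f\n", t.name, tierCost ( t, senders ) );
  return 0;
}
```

Because remote transfers are capped by the network figures (min on bandwidth, max on latency), the tier choice mainly differentiates candidates through the senders co-located with the aggregator, which is consistent with the min/max terms in the code above.
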
 
 int Tapioca::RankContentionAware (MPI_Comm aggrComm, int64_t dataSize)
 {
   struct { double cost; int rank; } aggrCost, minCost;
@@ -226,14 +303,14 @@ int Tapioca::RankContentionAware (MPI_Comm aggrComm, int64_t dataSize)
     if ( rank != aggrCommRank ) {
-      aggrCost.cost = std::max ( (double)dataDistrib[rank] / ( BANDWIDTH / routeCost[srcNode] ),
+      aggrCost.cost = std::max ( (double)dataDistrib[rank] / ( this->topology.NetworkBandwidth () / routeCost[srcNode] ),
                                  aggrCost.cost );
     }
   }
 
   /* I/O Node */
   srcNode = this->worldRank_ / ppn;
-  aggrCost.cost += aggregatedData / ( BANDWIDTH / routeCost[srcNode] );
+  aggrCost.cost += aggregatedData / ( this->topology.NetworkBandwidth () / routeCost[srcNode] );
 
   if ( this->excludedNode[this->hostId_] )
...