Commit 88287f5e authored by mubarak's avatar mubarak

Integrating DUMPI's MPI trace replay with model-net. Currently, supports...

Integrate DUMPI's MPI trace replay with model-net. Currently supports replaying MPI point-to-point messaging on top of the torus, dragonfly and simple-net network models.
parent 603bdbe3
...@@ -22,4 +22,12 @@ src_libcodes_net_a_SOURCES = \ ...@@ -22,4 +22,12 @@ src_libcodes_net_a_SOURCES = \
src/models/networks/model-net/model-net-lp.c \ src/models/networks/model-net/model-net-lp.c \
src/models/networks/model-net/model-net-sched.c \ src/models/networks/model-net/model-net-sched.c \
src/models/networks/model-net/model-net-sched-impl.h \ src/models/networks/model-net/model-net-sched-impl.h \
src/models/networks/model-net/model-net-sched-impl.c src/models/networks/model-net/model-net-sched-impl.c \
src/models/mpi-trace-replay/model-net-mpi-wrklds.c
bin_PROGRAMS += src/models/mpi-trace-replay/model-net-mpi-wrklds
src_models_mpi_trace_replay_model_net_mpi_wrklds_SOURCES = src/models/mpi-trace-replay/model-net-mpi-wrklds.c
src_models_mpi_trace_replay_model_net_mpi_wrklds_LDADD = $(testlib) $(CODES_BASE_LIBS)
src_models_mpi_trace_replay_model_net_mpi_wrklds_LDFLAGS = $(CODES_BASE_LDFLAGS)
src_models_mpi_trace_replay_model_net_mpi_wrklds_CFLAGS = ${CODES_BASE_CFLAGS}
1- Download, build and install the DUMPI software according to the
instructions available at:
http://sst.sandia.gov/about_dumpi.html
2- Configure codes-base with DUMPI. Make sure the CC environment variable
refers to an MPI compiler
./configure --with-ross=/path/to/ross/install --with-dumpi=/path/to/dumpi/install
--prefix=/path/to/codes-base/install CC=mpicc
3- Build codes-base
make clean && make && make install
4- Configure and build codes-net (See README.txt for instructions on building codes-net).
5- Download and untar the Design Forward DUMPI traces from the URL
http://portal.nersc.gov/project/CAL/designforward.htm
6- Configure model-net using its config file (Example .conf files available at src/models/mpi-trace-replay/)
Make sure the number of nw-lp LPs and the number of model-net LPs are the same in the config file.
7- From the main source directory of codes-net, run the DUMPI trace replay simulation on top of
model-net using the command below (/dumpi-2014-04-05.22.12.17.37- is the prefix shared by all the DUMPI trace files;
the final 4-digit portion of the DUMPI trace file names is omitted).
./src/models/mpi-trace-replay/model-net-mpi-wrklds --sync=1 --workload_file=/path/to/dumpi/trace/directory/dumpi-2014-04-05.22.12.17.37- --workload_type="dumpi" src/models/mpi-trace-replay/conf/modelnet-mpi-test.conf
The simulation runs in ROSS serial, conservative and optimistic modes.
8- Some example runs with small-scale traces
(i) AMG 8 MPI tasks http://portal.nersc.gov/project/CAL/designforward.htm#AMG
** Torus network model
mpirun -np 8 ./src/models/mpi-trace-replay/model-net-mpi-wrklds --sync=3 --extramem=462144 --workload_file=/home/mubarm/dumpi/df_AMG_n8_dumpi/dumpi-2014.03.03.14.12.46- --workload_type="dumpi" --batch=2 --gvt-interval=2 --num_net_traces=8 tests/conf/modelnet-mpi-test-torus.conf
** Simplenet network model
mpirun -np 8 ./src/models/mpi-trace-replay/model-net-mpi-wrklds --sync=3 --extramem=462144 --workload_file=/home/mubarm/dumpi/df_AMG_n8_dumpi/dumpi-2014.03.03.14.12.46- --workload_type="dumpi" --batch=2 --gvt-interval=2 tests/conf/modelnet-mpi-test.conf
** Dragonfly network model
mpirun -np 8 ./src/models/mpi-trace-replay/model-net-mpi-wrklds --sync=3 --extramem=462144 --workload_file=/home/mubarm/dumpi/df_AMG_n8_dumpi/dumpi-2014.03.03.14.12.46- --workload_type="dumpi" --batch=2 --gvt-interval=2 --num_net_traces=8 src/models/mpi-trace-replay//conf/modelnet-mpi-test-dragonfly.conf
Note: Dragonfly and torus networks may have more nodes in the network than the number of network traces (some network nodes will only pass messages and will not load any traces). That is why the --num_net_traces argument is used to specify the exact number of traces available in the DUMPI directory when there is a mismatch between the number of network nodes and the number of traces.
(ii) Crystal router 10 MPI tasks http://portal.nersc.gov/project/CAL/designforward.htm#CrystalRouter
** Simple-net network model
mpirun -np 10 ./src/models/mpi-trace-replay/model-net-mpi-wrklds --sync=3 --extramem=185536 --workload_file=/home/mubarm/dumpi/cry_router/dumpi--2014.04.23.12.08.27- --workload_type="dumpi" src/models/mpi-trace-replay/conf/modelnet-mpi-test-cry-router.conf
(iii) MiniFE 18 MPI tasks http://portal.nersc.gov/project/CAL/designforward.htm#MiniFE
** Simple-net network model
mpirun -np 18 ./src/models/mpi-trace-replay/model-net-mpi-wrklds --sync=3 --extramem=6185536 --workload_file=/home/mubarm/dumpi/dumpi_data_18/dumpi-2014.04.22.12.17.37- --workload_type="dumpi" src/models/mpi-trace-replay/conf/modelnet-mpi-test-mini-fe.conf
LPGROUPS
{
MODELNET_GRP
{
repetitions="10";
nw-lp="1";
modelnet_simplenet="1";
}
}
PARAMS
{
packet_size="512";
message_size="296";
modelnet_order=( "simplenet" );
# scheduler options
modelnet_scheduler="fcfs";
net_startup_ns="1.5";
net_bw_mbps="20000";
}
LPGROUPS
{
MODELNET_GRP
{
repetitions="36";
nw-lp="2";
modelnet_dragonfly="2";
dragonfly_router="1";
}
}
PARAMS
{
packet_size="512";
modelnet_order=( "dragonfly" );
# scheduler options
modelnet_scheduler="fcfs";
chunk_size="32";
# modelnet_scheduler="round-robin";
num_vcs="1";
num_routers="4";
local_vc_size="16384";
global_vc_size="32768";
cn_vc_size="16384";
local_bandwidth="5.25";
global_bandwidth="4.7";
cn_bandwidth="5.25";
message_size="512";
routing="minimal";
}
LPGROUPS
{
MODELNET_GRP
{
repetitions="18";
nw-lp="1";
modelnet_simplenet="1";
}
}
PARAMS
{
packet_size="512";
message_size="296";
modelnet_order=( "simplenet" );
# scheduler options
modelnet_scheduler="fcfs";
net_startup_ns="1.5";
net_bw_mbps="20000";
}
LPGROUPS
{
MODELNET_GRP
{
repetitions="18";
nw-lp="1";
modelnet_torus="1";
}
}
PARAMS
{
packet_size="512";
message_size="296";
modelnet_order=( "torus" );
# scheduler options
modelnet_scheduler="fcfs";
net_startup_ns="1.5";
net_bw_mbps="20000";
n_dims="3";
dim_length="3,3,2";
link_bandwidth="2.0";
buffer_size="1310720";
num_vc="1";
chunk_size="64";
}
LPGROUPS
{
MODELNET_GRP
{
repetitions="8";
nw-lp="1";
modelnet_simplenet="1";
}
}
PARAMS
{
packet_size="512";
message_size="296";
modelnet_order=( "simplenet" );
# scheduler options
modelnet_scheduler="fcfs";
net_startup_ns="1.5";
net_bw_mbps="20000";
}
This diff is collapsed.
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
/* collective specific parameters */ /* collective specific parameters */
#define TREE_DEGREE 4 #define TREE_DEGREE 4
#define LEVEL_DELAY 1000 #define LEVEL_DELAY 1000
#define TORUS_COLLECTIVE_DEBUG 1 #define TORUS_COLLECTIVE_DEBUG 0
#define NUM_COLLECTIVES 1 #define NUM_COLLECTIVES 1
#define COLLECTIVE_COMPUTATION_DELAY 5700 #define COLLECTIVE_COMPUTATION_DELAY 5700
#define TORUS_FAN_OUT_DELAY 20.0 #define TORUS_FAN_OUT_DELAY 20.0
...@@ -43,6 +43,7 @@ struct torus_param ...@@ -43,6 +43,7 @@ struct torus_param
int* dim_length; /*Length of each torus dimension*/ int* dim_length; /*Length of each torus dimension*/
double link_bandwidth;/* bandwidth for each torus link */ double link_bandwidth;/* bandwidth for each torus link */
int buffer_size; /* number of buffer slots for each vc in flits*/ int buffer_size; /* number of buffer slots for each vc in flits*/
//int num_net_traces; /* number of network traces to be mapped on torus */
int num_vc; /* number of virtual channels for each torus link */ int num_vc; /* number of virtual channels for each torus link */
float mean_process;/* mean process time for each flit */ float mean_process;/* mean process time for each flit */
int chunk_size; /* chunk is the smallest unit--default set to 32 */ int chunk_size; /* chunk is the smallest unit--default set to 32 */
...@@ -167,6 +168,7 @@ static void torus_read_config( ...@@ -167,6 +168,7 @@ static void torus_read_config(
p->buffer_size); p->buffer_size);
} }
configuration_get_value_int(&config, "PARAMS", "chunk_size", anno, &p->chunk_size); configuration_get_value_int(&config, "PARAMS", "chunk_size", anno, &p->chunk_size);
if(!p->chunk_size) { if(!p->chunk_size) {
p->chunk_size = 32; p->chunk_size = 32;
...@@ -202,7 +204,20 @@ static void torus_read_config( ...@@ -202,7 +204,20 @@ static void torus_read_config(
i++; i++;
token = strtok(NULL,","); token = strtok(NULL,",");
} }
/*int num_nodes = 1;
for( i = 0; i < p->n_dims; i++)
num_nodes *= p->dim_length[i];
configuration_get_value_int(&config, "PARAMS", "num_net_traces", anno, &p->num_net_traces);
if(!p->num_net_traces) {
p->num_net_traces = num_nodes;
fprintf(stderr, "Number of network traces not specified, setting to %d",
p->num_net_traces);
}
// Number of network traces should be <= number of torus network nodes
assert(p->num_net_traces <= num_nodes);*/
// create derived parameters // create derived parameters
// factor is an exclusive prefix product // factor is an exclusive prefix product
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment