Commit 83c55775 authored by Misbah Mubarak

Updating the network model documentation, added examples on how to run network models with dumpi traces, fixed comm time in MPI Sim layer
parent 8183c2af
@@ -124,21 +124,12 @@ performed in:
ACM SIGSIM conference on Principles of Advanced Discrete Simulations
(PADS), 2014.
The configuration parameters are as follows:
* n_dims - the number of torus dimensions.
* dim_length - the length of each torus dimension. For example, "4,2,2,2" describes a
four-dimensional, 4x2x2x2 torus.
* link_bandwidth - the bandwidth available per torus link (specified in GiB/sec).
* buffer_size - the buffer size available at each torus node. Buffer size is measured
in number of flits or chunks (flit/chunk size is configurable using chunk_size parameter).
* num_vc - number of virtual channels (currently unused - all traffic goes
through a single channel).
* chunk_size - element size per transfer, specified in bytes. Messages/packets are sent in
individual chunks. This is typically a small number (e.g., 32 bytes).
The configuration and model setup can be found at:
src/models/model-net/doc/README.torus.txt
== Dragonfly
The dragonfly model (model-net LP name: "dragonfly") is an experimental network
The dragonfly model (model-net LP name: "dragonfly") is a network
topology that utilizes the concept of virtual routers to produce systems with
very high virtual radix out of network components with a lower radix. The
topology itself and the simulation model are both described in
@@ -195,7 +195,7 @@ static void codes_exec_mpi_recv_rc(
nw_state* s, tw_bf * bf, nw_message* m, tw_lp* lp);
/* execute the computational delay */
static void codes_exec_comp_delay(
nw_state* s, tw_lp* lp, struct codes_workload_op * mpi_op);
nw_state* s, nw_message * m, tw_lp* lp, struct codes_workload_op * mpi_op);
/* gets the next MPI operation from the network-workloads API. */
static void get_next_mpi_operation(
nw_state* s, tw_bf * bf, nw_message * m, tw_lp * lp);
@@ -569,12 +569,13 @@ static void codes_issue_next_event(tw_lp* lp)
/* Simulate delays between MPI operations */
static void codes_exec_comp_delay(
nw_state* s, tw_lp* lp, struct codes_workload_op * mpi_op)
nw_state* s, nw_message * m, tw_lp* lp, struct codes_workload_op * mpi_op)
{
tw_event* e;
tw_stime ts;
nw_message* msg;
m->rc.saved_delay = s->compute_time;
s->compute_time += s_to_ns(mpi_op->u.delay.seconds);
ts = s_to_ns(mpi_op->u.delay.seconds);
@@ -583,7 +584,6 @@ static void codes_exec_comp_delay(
e = tw_event_new( lp->gid, ts , lp );
msg = tw_event_data(e);
msg->msg_type = MPI_OP_GET_NEXT;
msg->rc.saved_delay = mpi_op->u.delay.seconds;
tw_event_send(e);
}
@@ -595,9 +595,7 @@ static void codes_exec_mpi_recv_rc(
nw_message* m,
tw_lp* lp)
{
struct codes_workload_op * mpi_op = m->rc.saved_op;
num_bytes_recvd -= mpi_op->u.recv.num_bytes;
num_bytes_recvd -= m->rc.saved_num_bytes;
ns->recv_time = m->rc.saved_recv_time;
if(m->fwd.found_match >= 0)
{
@@ -658,6 +656,8 @@ static void codes_exec_mpi_recv(
receive operations. */
m->rc.saved_recv_time = s->recv_time;
m->rc.saved_num_bytes = mpi_op->u.recv.num_bytes;
num_bytes_recvd += mpi_op->u.recv.num_bytes;
mpi_msgs_queue * recv_op = (mpi_msgs_queue*) malloc(sizeof(mpi_msgs_queue));
@@ -1016,8 +1016,7 @@ void nw_test_event_handler(nw_state* s, tw_bf * bf, nw_message * m, tw_lp * lp)
static void get_next_mpi_operation_rc(nw_state* s, tw_bf * bf, nw_message * m, tw_lp * lp)
{
struct codes_workload_op * mpi_op = m->rc.saved_op;
codes_workload_get_next_rc(wrkld_id, 0, (int)s->nw_id, mpi_op);
codes_workload_get_next_rc2(wrkld_id, 0, (int)s->nw_id);
if(m->op_type == CODES_WK_END)
{
@@ -1050,6 +1049,8 @@ static void get_next_mpi_operation_rc(nw_state* s, tw_bf * bf, nw_message * m, t
{
s->num_delays--;
tw_rand_reverse_unif(lp->rng);
if(!disable_delay)
s->compute_time = m->rc.saved_delay;
}
break;
case CODES_WK_BCAST:
@@ -1093,24 +1094,24 @@ static void get_next_mpi_operation_rc(nw_state* s, tw_bf * bf, nw_message * m, t
static void get_next_mpi_operation(nw_state* s, tw_bf * bf, nw_message * m, tw_lp * lp)
{
struct codes_workload_op * mpi_op = malloc(sizeof(struct codes_workload_op));
codes_workload_get_next(wrkld_id, 0, (int)s->nw_id, mpi_op);
//struct codes_workload_op * mpi_op = malloc(sizeof(struct codes_workload_op));
struct codes_workload_op mpi_op;
codes_workload_get_next(wrkld_id, 0, (int)s->nw_id, &mpi_op);
m->op_type = mpi_op->op_type;
m->rc.saved_op = mpi_op;
m->op_type = mpi_op.op_type;
if(mpi_op->op_type == CODES_WK_END)
if(mpi_op.op_type == CODES_WK_END)
{
s->elapsed_time = tw_now(lp) - s->start_time;
return;
}
switch(mpi_op->op_type)
switch(mpi_op.op_type)
{
case CODES_WK_SEND:
case CODES_WK_ISEND:
{
s->num_sends++;
codes_exec_mpi_send(s, bf, m, lp, mpi_op);
codes_exec_mpi_send(s, bf, m, lp, &mpi_op);
}
break;
@@ -1118,7 +1119,7 @@ static void get_next_mpi_operation(nw_state* s, tw_bf * bf, nw_message * m, tw_l
case CODES_WK_IRECV:
{
s->num_recvs++;
codes_exec_mpi_recv(s, bf, m, lp, mpi_op);
codes_exec_mpi_recv(s, bf, m, lp, &mpi_op);
}
break;
@@ -1129,7 +1130,7 @@ static void get_next_mpi_operation(nw_state* s, tw_bf * bf, nw_message * m, tw_l
if(disable_delay)
codes_issue_next_event(lp);
else
codes_exec_comp_delay(s, lp, mpi_op);
codes_exec_comp_delay(s, m, lp, &mpi_op);
}
break;
@@ -1144,13 +1145,13 @@ static void get_next_mpi_operation(nw_state* s, tw_bf * bf, nw_message * m, tw_l
case CODES_WK_WAITALL:
{
s->num_waitall++;
codes_exec_mpi_wait_all(s, bf, m, lp, mpi_op);
codes_exec_mpi_wait_all(s, bf, m, lp, &mpi_op);
}
break;
case CODES_WK_WAIT:
{
s->num_wait++;
codes_exec_mpi_wait(s, lp, mpi_op);
codes_exec_mpi_wait(s, lp, &mpi_op);
}
break;
case CODES_WK_BCAST:
@@ -1167,7 +1168,7 @@ static void get_next_mpi_operation(nw_state* s, tw_bf * bf, nw_message * m, tw_l
}
break;
default:
printf("\n Invalid op type %d ", mpi_op->op_type);
printf("\n Invalid op type %d ", mpi_op.op_type);
}
return;
}
@@ -1287,10 +1288,14 @@ int main( int argc, char** argv )
tw_opt_add(app_opt);
tw_init(&argc, &argv);
if(strlen(workload_file) == 0)
if(strlen(workload_file) == 0 || strcmp(workload_type, "dumpi") != 0 || num_net_traces <= 0)
{
if(tw_ismaster())
printf("Usage: mpirun -np n ./codes-nw-test --sync=1/2/3 --workload_type=type --workload_file=workload-file-name\n");
printf("Usage: mpirun -np n ./modelnet-mpi-replay --sync=1/3"
" --workload_type=dumpi --workload_file=prefix-workload-file-name"
" --num_net_traces=n -- config-file-name\n"
"See model-net/doc/README.dragonfly.txt and model-net/doc/README.torus.txt"
" for instructions on how to run the models with network traces ");
tw_end();
return -1;
}
@@ -47,7 +47,7 @@ through events in the form of timestamped messages. In the dragonfly model, each
LP represents an individual router or node and each time-stamped message
represents a packet sent to/from a node/router.
2- Configuring ROSS dragonfly network model
2- Configuring CODES dragonfly network model
The CODES dragonfly network model can be configured using the dragonfly config file (currently
located in codes-net/tests/conf). To adjust the network size, configure the MODELNET_GRP
section of the config file as well as the 'num_routers' parameter in the PARAMS section.
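As a rough sketch (the LP names and counts below are illustrative placeholders rather than the
shipped defaults; see the dragonfly config file under codes-net/tests/conf for a working example),
the MODELNET_GRP section lists how many server, dragonfly terminal, and dragonfly router LPs make
up one repeated building block of the network, while 'num_routers' in PARAMS sets the number of
routers per dragonfly group:
MODELNET_GRP
{
   repetitions="16";
   server="2";
   modelnet_dragonfly="2";
   dragonfly_router="1";
}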
*** README file for torus network model ***
1- Model of the torus network topology
The CODES torus model uses realistic design parameters of a torus network, and we
have validated our simulation results against the existing Blue Gene torus
architecture. Similar to the Blue Gene architecture, the CODES torus model uses a
bubble escape virtual channel to prevent deadlocks. Following the
specifications of the BG/Q 5D torus network, the CODES torus model by default
uses packets with a maximum size of 512 bytes, where each packet is broken
into flits of 32 bytes for transport over the network. Our torus network
model uses dimension-order routing to route packets. In this form of routing,
the radix-k digits of the destination are used to direct network packets, one
dimension at a time.
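To make the routing rule concrete, here is a minimal, self-contained C sketch of dimension-order
routing (illustrative only; it is not the model's actual routing code, and the 4x2x2x2 dimension
lengths are just an example):

/* Illustrative dimension-order routing: a packet corrects one coordinate
 * (one radix-k digit of the destination) at a time, in a fixed order. */
#include <stdio.h>

#define N_DIMS 4
static const int dim_length[N_DIMS] = {4, 2, 2, 2};   /* example 4x2x2x2 torus */

/* Convert a linear node ID into per-dimension coordinates. */
static void to_coords(int node, int coords[N_DIMS])
{
    for (int d = 0; d < N_DIMS; d++) {
        coords[d] = node % dim_length[d];
        node /= dim_length[d];
    }
}

/* Return the first dimension whose coordinate still differs from the
 * destination; the packet keeps traveling along that dimension until the
 * digit matches, then moves on to the next dimension. */
static int next_dimension(int src, int dst)
{
    int s[N_DIMS], d[N_DIMS];
    to_coords(src, s);
    to_coords(dst, d);
    for (int dim = 0; dim < N_DIMS; dim++)
        if (s[dim] != d[dim])
            return dim;
    return -1;   /* packet has arrived */
}

int main(void)
{
    /* Node 13 in a 4x2x2x2 torus has coordinates (1,1,1,0), so a packet
     * from node 0 is first routed along dimension 0. */
    printf("first hop is along dimension %d\n", next_dimension(0, 13));
    return 0;
}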
For more details about the torus network and its design, please see
Blue Gene/L torus interconnection network by Adiga, Blumrich et al.
The torus network model has two LP types, a model-net LP and a torus node LP.
High-level messages are passed to the model-net LP by either the MPI
simulation layer or a synthetic workload generator. These messages are
scheduled onto the underlying torus node LP in the form of network packets,
which are then further broken into flits. Each torus node LP is connected to
its neighbors via channels having a fixed buffer capacity. Similar to the
dragonfly node LP, the torus network model also uses credit-based flow
control to regulate network traffic. Whenever a flit arrives at a torus
network node, a hop delay based on the router speed and port bandwidth is added
in order to simulate the processing time of the flit. Once all flits of a
packet arrive at the destination torus node LP, they are forwarded to the
receiving model-net LP, which notifies the higher level (the MPI simulation
layer or synthetic workload generator) about the message arrival.
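As a back-of-the-envelope example (the bandwidth figure here is an assumption for illustration,
not a model default): with the default 32-byte flits, a 512-byte packet is broken into
512/32 = 16 flits, and over a link configured with a link_bandwidth of 2 GiB/sec, transmitting one
flit takes roughly 32 / (2 * 2^30) seconds, i.e. about 15 nanoseconds per hop, in addition to the
modeled router processing delay.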
2- Configuring CODES torus network model
A simple config file for the CODES torus model can be found in codes-net/tests/conf.
The configuration parameters are as follows (an illustrative PARAMS stanza appears after the list):
* n_dims - the number of torus dimensions.
* dim_length - the length of each torus dimension. For example, "4,2,2,2" describes a
four-dimensional, 4x2x2x2 torus.
* link_bandwidth - the bandwidth available per torus link (specified in GiB/sec).
* buffer_size - the buffer size available at each torus node. Buffer size is measured
in number of flits or chunks (flit/chunk size is configurable using the chunk_size parameter).
* num_vc - number of virtual channels (currently unused - all traffic goes
through a single channel).
* chunk_size - element size per transfer, specified in bytes. Messages/packets are sent in
individual chunks. This is typically a small number (e.g., 32 bytes).
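For reference, a minimal sketch of a PARAMS stanza putting these parameters together is shown
below; the values are placeholders chosen only to match the descriptions above, so consult the
shipped config files (e.g., tests/conf/modelnet-test-torus.conf) for the authoritative syntax
and defaults:

PARAMS
{
   n_dims="4";
   dim_length="4,2,2,2";
   link_bandwidth="2.0";
   buffer_size="16";
   num_vc="1";
   chunk_size="32";
}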
3- Running torus model test program
- To run the torus network model with the modelnet-test program, the following
options are available:
ROSS serial mode:
./tests/modelnet-test --sync=1 -- tests/conf/modelnet-test-torus.conf
ROSS optimistic mode:
mpirun -np 4 ./tests/modelnet-test --sync=3 -- tests/conf/modelnet-test-torus.conf
4- Running torus model with DUMPI application traces
- codes-base needs to be configured with DUMPI. See
codes-base/doc/GETTING_STARTED on how to configure codes-base with DUMPI
- Design forward network traces are available at:
http://portal.nersc.gov/project/CAL/designforward.htm
For illustration purposes, we use the AMG network trace with 27 MPI processes
available for download at:
http://portal.nersc.gov/project/CAL/doe-miniapps-mpi-traces/AMG/df_AMG_n27_dumpi.tar.gz
- Note on trace reading - the input file prefix to the dumpi workload generator
should be everything up to the rank number. E.g., if the dumpi files are of the
form "dumpi-YYYY.MM.DD.HH.MM.SS-XXXX.bin", then the input should be
"dumpi-YYYY.MM.DD.HH.MM.SS-"
- Example torus model config file with network traces can be found at:
src/models/network-workloads/conf/modelnet-mpi-test-torus.conf
- Running CODES torus model with AMG 27 rank trace in optimistic mode:
mpirun -np 4 ./src/models/network-workloads/model-net-mpi-replay --sync=3
--batch=2 --workload_type="dumpi" --num_net_traces=27
--workload_file=../../df_traces/AMG/df_AMG_n27_dumpi/dumpi-2014.03.03.14.55.00-
-- ../src/models/network-workloads/conf/modelnet-mpi-test-torus.conf
[batch is a ROSS-specific parameter that specifies the number of iterations the
simulation must process before checking the top event scheduling loop for
anti-messages. A smaller batch size comes with fewer rollbacks. GVT
synchronization is done after every batch*gvt-interval epochs (gvt-interval
is 16 by default).
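For example, with --batch=2 (as in the command above) and the default gvt-interval of 16, GVT is
computed after every 2*16 = 32 such epochs.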
num_net_traces is the number of MPI processes to be simulated from the trace
file. With the torus and dragonfly networks, the number of simulated network
nodes may not exactly match the number of MPI processes. This is because the
simulated network nodes increase in specific increments: e.g., the number of
routers defines the network size in a dragonfly, while the number of
dimensions and the dimension lengths define the number of network nodes in a
torus. Due to this mismatch, we must ensure that the number of network nodes
in the config file is equal to or greater than the number of MPI processes to
be simulated from the trace.]
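As a quick sanity check (the dimension lengths here are illustrative; the actual values come from
the config file you pass in): a torus configured with dim_length "4,2,2,2" provides
4*2*2*2 = 32 network nodes, which is enough to replay the 27-rank AMG trace, whereas a "2,2,2,2"
torus would provide only 16 nodes and could not.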
- Running CODES torus model with AMG application trace, 27 ranks in serial
mode:
./src/models/network-workloads/model-net-mpi-replay --sync=1
--workload_type=dumpi
--workload_file=../../df_traces/AMG/df_AMG_n27_dumpi/dumpi-2014.03.03.14.55.00-
--num_net_traces=27 --
../src/models/network-workloads/conf/modelnet-mpi-test-torus.conf
- At small scale (usually up to a thousand simulated MPI processes), ROSS
serial mode has better performance than the optimistic mode. The benefits of
using optimistic mode usually show up for larger scale runs where the
simulation gets too big to run sequentially.