Commit 7a4ef14f authored by Misbah Mubarak's avatar Misbah Mubarak

Updates to modelnet (1) Updated the mapping configuration for dragonfly (the...

Updates to modelnet (1) Updated the mapping configuration for dragonfly (the mapping now distributes the routers equally on all the PEs. The old dragonfly mapping configuration was placing the routers on a single PE, which was causing congestion on that PE, leading to slow model performance in parallel) (2) Updated 'make check' to handle dragonfly and torus test cases as well (3) Removed the num_servers argument from the model-net test case; the number of servers is now calculated from the config file (4) Placed the ROSS mapping parameters in the codes mapping file
parent 045705e7
......@@ -67,4 +67,6 @@ includedir="${includedir}/codes"
AC_CONFIG_FILES([Makefile])
AC_CONFIG_FILES([tests/modelnet-test.sh],[chmod +x tests/modelnet-test.sh])
AC_CONFIG_FILES([tests/modelnet-test-dragonfly.sh],[chmod +x tests/modelnet-test-dragonfly.sh])
AC_CONFIG_FILES([tests/modelnet-test-torus.sh],[chmod +x tests/modelnet-test-torus.sh])
AC_OUTPUT([maint/codes-net.pc])
......@@ -6,8 +6,9 @@ end of the simulation.
2- Mapping:
- Update the mapping code to handle remainder (where number of ROSS LPs are not evenly divisible by the number of PEs).
- Modify the get_lp_info function to return the LP count of the given LP? Can help with local lp count calculation.
- Make the mem_factor argument configurable in the codes mapping file (used for g_tw_events_per_pe).
- Remove the group id and type id arguments from get_lp_info function.
Make the model-net test case running for two different types of networks?
- Use the I/O activity trace logs from BG/Q and BG/P for I/O modelling.
......@@ -4,12 +4,8 @@ will be executed in sequential mode.
You can also run the test program manually in parallel (conservative or
optimistic) mode as follows:
mpiexec -n 4 tests/modelnet-test --sync=2 --num_servers=n (optional --nkp=n) tests/modelnet-test.conf
mpiexec -n 4 tests/modelnet-test --sync=2 (optional --nkp=n) tests/modelnet-test.conf
<or>
mpiexec -n 4 tests/modelnet-test --sync=3 --num_servers=n (optional --nkp=n) tests/modelnet-test.conf
Note: The num_servers runtime argument should match the number of server repetitions specified in the modelnet-test config file.
(1)- To run the modelnet test with the simplenet network plugin, use tests/modelnet-test.conf (default setting).
(2)- To run the modelnet test with the torus network plugin, use tests/modelnet-test-torus.conf file.
mpiexec -n 4 tests/modelnet-test --sync=3 (optional --nkp=n) tests/modelnet-test.conf
- To run the modelnet test with the simplenet, torus and dragonfly network plugins, use tests/modelnet-test.conf (default setting), tests/modelnet-test-torus.conf and tests/modelnet-test-dragonfly.conf respectively.
......@@ -24,7 +24,7 @@ static void dragonfly_setup(const void* net_params)
cn_vc_size = d_param->cn_vc_size;
routing = d_param->routing;
radix = num_vcs * (num_cn + num_global_channels + num_groups);
radix = num_vcs * (num_cn + num_global_channels + num_routers);
total_routers = num_groups * num_routers;
lp_type_register("dragonfly_router", dragonfly_get_router_lp_type());
return;
......@@ -200,9 +200,11 @@ void packet_generate(terminal_state * s, tw_bf * bf, terminal_message * msg, tw_
num_chunks = msg->packet_size / CHUNK_SIZE;
msg->packet_ID = lp->gid + g_tw_nlp * s->packet_counter + tw_rand_integer(lp->rng, 0, lp->gid + g_tw_nlp * s->packet_counter);
msg->travel_start_time = tw_now(lp);
msg->my_N_hop = 0;
for(i = 0; i < num_chunks; i++)
{
// Before generating a packet, check if the input queue is available
// Before
// msg->my_N_hop = 0; generating a packet, check if the input queue is available
ts = i + tw_rand_exponential(lp->rng, MEAN_INTERVAL/200);
int chan = -1, j;
for(j = 0; j < num_vcs; j++)
......@@ -216,8 +218,7 @@ void packet_generate(terminal_state * s, tw_bf * bf, terminal_message * msg, tw_
e = tw_event_new(lp->gid, i + ts, lp);
m = tw_event_data(e);
memcpy(m, msg, dragonfly_get_msg_sz() + msg->local_event_size_bytes + msg->remote_event_size_bytes);
m->my_N_hop = 0;
memcpy(m, msg, dragonfly_get_msg_sz() + msg->remote_event_size_bytes + msg->local_event_size_bytes);
m->intm_group_id = -1;
m->saved_vc=0;
m->chunk_id = i;
......@@ -259,7 +260,7 @@ void packet_send(terminal_state * s, tw_bf * bf, terminal_message * msg, tw_lp *
s->terminal_available_time = max(s->terminal_available_time, tw_now(lp));
s->terminal_available_time += ts;
codes_mapping_get_lp_id("DRAGONFLY_ROUTER", "dragonfly_router", s->router_id, 0, &router_id);
codes_mapping_get_lp_id("MODELNET_GRP", "dragonfly_router", s->router_id, 0, &router_id);
e = tw_event_new(router_id, s->terminal_available_time - tw_now(lp), lp);
if(msg->packet_ID == TRACK && msg->chunk_id == num_chunks-1)
......@@ -355,12 +356,12 @@ if( msg->packet_ID == TRACK && msg->chunk_id == num_chunks-1)
int credit_delay = (1 / cn_bandwidth) * CREDIT_SIZE;
ts = credit_delay + tw_rand_exponential(lp->rng, credit_delay/1000);
msg->saved_available_time = s->next_credit_available_time;
msg->saved_credit_time = s->next_credit_available_time;
s->next_credit_available_time = max(s->next_credit_available_time, tw_now(lp));
s->next_credit_available_time += ts;
tw_lpid router_dest_id;
codes_mapping_get_lp_id("DRAGONFLY_ROUTER", "dragonfly_router", s->router_id, 0, &router_dest_id);
codes_mapping_get_lp_id("MODELNET_GRP", "dragonfly_router", s->router_id, 0, &router_dest_id);
buf_e = tw_event_new(router_dest_id, s->next_credit_available_time - tw_now(lp), lp);
buf_msg = tw_event_data(buf_e);
buf_msg->vc_index = msg->saved_vc;
......@@ -380,7 +381,9 @@ terminal_init( terminal_state * s,
int i;
// Assign the global router ID
codes_mapping_get_lp_info(lp->gid, lp_group_name, &mapping_grp_id, &mapping_type_id, lp_type_name, &mapping_rep_id, &mapping_offset);
s->terminal_id=mapping_rep_id + mapping_offset;
int num_lps = codes_mapping_get_lp_count("MODELNET_GRP", "modelnet_dragonfly");
s->terminal_id = (mapping_rep_id * num_lps) + mapping_offset;
s->router_id=(int)s->terminal_id / num_routers;
s->terminal_available_time = 0.0;
s->packet_counter = 0;
......@@ -443,12 +446,16 @@ terminal_event( terminal_state * s,
}
void
dragonfly_final( terminal_state * s,
dragonfly_terminal_final( terminal_state * s,
tw_lp * lp )
{
}
/* Finalize a dragonfly router LP.
 *
 * Releases every per-router array that router_setup() allocates on the
 * router state: the global channel table plus the four radix-sized arrays
 * (output/credit availability times, VC occupancy and output VC state).
 * Previously only global_channel was released, leaking the rest.
 *
 * s  - router state being torn down
 * lp - the router LP (unused)
 */
void dragonfly_router_final(router_state * s,
tw_lp * lp)
{
    (void)lp;

    free(s->global_channel);
    free(s->next_output_available_time);
    free(s->next_credit_available_time);
    free(s->vc_occupancy);
    free(s->output_vc_state);

    /* Clear the pointers so any accidental later use fails loudly
     * instead of touching freed memory. */
    s->global_channel = NULL;
    s->next_output_available_time = NULL;
    s->next_credit_available_time = NULL;
    s->vc_occupancy = NULL;
    s->output_vc_state = NULL;
}
/* get the next stop for the current packet
* determines if it is a router within a group, a router in another group
* or the destination terminal */
......@@ -460,12 +467,13 @@ get_next_stop(router_state * s,
int path)
{
int dest_lp;
tw_lpid router_dest_id;
tw_lpid router_dest_id = -1;
int i;
int dest_group_id;
codes_mapping_get_lp_info(msg->dest_terminal_id, lp_group_name, &mapping_grp_id, &mapping_type_id, lp_type_name, &mapping_rep_id, &mapping_offset);
int dest_router_id = (mapping_offset + mapping_rep_id) / num_routers;
int num_lps = codes_mapping_get_lp_count("MODELNET_GRP", "modelnet_dragonfly");
int dest_router_id = (mapping_offset + (mapping_rep_id * num_lps)) / num_routers;
codes_mapping_get_lp_info(lp->gid, lp_group_name, &mapping_grp_id, &mapping_type_id, lp_type_name, &mapping_rep_id, &mapping_offset);
int local_router_id = (mapping_offset + mapping_rep_id);
......@@ -518,7 +526,7 @@ get_next_stop(router_state * s,
}
}
}
codes_mapping_get_lp_id("DRAGONFLY_ROUTER", "dragonfly_router", dest_lp, 0, &router_dest_id);
codes_mapping_get_lp_id("MODELNET_GRP", "dragonfly_router", dest_lp, 0, &router_dest_id);
return router_dest_id;
}
......@@ -532,11 +540,14 @@ get_output_port( router_state * s,
{
int output_port = -1, i, terminal_id;
codes_mapping_get_lp_info(msg->dest_terminal_id, lp_group_name, &mapping_grp_id, &mapping_type_id, lp_type_name, &mapping_rep_id, &mapping_offset);
terminal_id = mapping_offset + mapping_rep_id;
int num_lps = codes_mapping_get_lp_count("MODELNET_GRP","modelnet_dragonfly");
terminal_id = (mapping_rep_id * num_lps) + mapping_offset;
if(next_stop == msg->dest_terminal_id)
{
output_port = num_routers + num_global_channels + ( terminal_id % num_cn);
//if(output_port > 6)
// printf("\n incorrect output port %d terminal id %d ", output_port, terminal_id);
}
else
{
......@@ -548,7 +559,7 @@ get_output_port( router_state * s,
{
for(i=0; i < num_global_channels; i++)
{
if(s->global_channel[i] == next_stop)
if(s->global_channel[i] == local_router_id)
output_port = num_routers + i;
}
}
......@@ -556,6 +567,8 @@ get_output_port( router_state * s,
{
output_port = local_router_id % num_routers;
}
if(output_port == 6)
printf("\n output port not found %d next stop %d local router id %d group id %d intm grp id %d %d", output_port, next_stop, local_router_id, s->group_id, intm_grp_id, local_router_id%num_routers);
}
return output_port;
}
......@@ -585,6 +598,8 @@ router_packet_send( router_state * s,
int global=0;
int buf_size = local_vc_size;
assert(output_port != -1);
assert(output_chan != -1);
// Allocate output Virtual Channel
if(output_port >= num_routers && output_port < num_routers + num_global_channels)
{
......@@ -598,7 +613,7 @@ router_packet_send( router_state * s,
if(s->vc_occupancy[output_chan] >= buf_size)
{
printf("\n %lf Router %ld buffers overflowed from incoming terminals channel %d occupancy %d ", tw_now(lp),(long int) lp->gid, output_chan, s->vc_occupancy[output_chan]);
printf("\n %lf Router %ld buffers overflowed from incoming terminals channel %d occupancy %d radix %d next_stop %d ", tw_now(lp),(long int) lp->gid, output_chan, s->vc_occupancy[output_chan], radix, next_stop);
bf->c3 = 1;
MPI_Finalize();
exit(-1);
......@@ -694,14 +709,14 @@ void router_setup(router_state * r, tw_lp * lp)
r->group_id=r->router_id/num_routers;
int i;
int offset=(r->router_id % num_routers) * (num_global_channels / 2) + 1;
int router_offset=(r->router_id % num_routers) * (num_global_channels / 2) + 1;
r->global_channel = (int*)malloc(num_global_channels * sizeof(int));
r->next_output_available_time = (tw_stime*)malloc(radix * sizeof(tw_stime));
r->next_credit_available_time = (tw_stime*)malloc(radix * sizeof(tw_stime));
r->vc_occupancy = (int*)malloc(radix * sizeof(int));
r->output_vc_state = (int*)malloc(radix * sizeof(int));
for(i=0; i < radix; i++)
{
// Set credit & router occupancy
......@@ -716,12 +731,12 @@ void router_setup(router_state * r, tw_lp * lp)
{
if(i % 2 != 0)
{
r->global_channel[i]=(r->router_id + (offset * num_routers))%total_routers;
offset++;
r->global_channel[i]=(r->router_id + (router_offset * num_routers))%total_routers;
router_offset++;
}
else
{
r->global_channel[i]=r->router_id - ((offset) * num_routers);
r->global_channel[i]=r->router_id - ((router_offset) * num_routers);
}
if(r->global_channel[i]<0)
{
......@@ -792,17 +807,17 @@ void terminal_rc_event_handler(terminal_state * s, tw_bf * bf, terminal_message
case T_ARRIVE:
{
tw_rand_reverse_unif(lp->rng);
msg->my_N_hop--;
s->next_credit_available_time = msg->saved_available_time;
s->next_credit_available_time = msg->saved_credit_time;
if(bf->c2)
{
N_finished_packets--;
dragonfly_total_time -= (tw_now(lp) - msg->travel_start_time);
total_hops -= msg->my_N_hop;
}
if(bf->c3)
dragonfly_max_latency = msg->saved_available_time;
}
msg->my_N_hop--;
}
break;
......@@ -880,7 +895,7 @@ tw_lptype dragonfly_lps[] =
(init_f)terminal_init,
(event_f) terminal_event,
(revent_f) terminal_rc_event_handler,
(final_f) dragonfly_final,
(final_f) dragonfly_terminal_final,
(map_f) codes_mapping,
sizeof(terminal_state)
},
......@@ -888,7 +903,7 @@ tw_lptype dragonfly_lps[] =
(init_f) router_setup,
(event_f) router_event,
(revent_f) router_rc_event_handler,
(final_f) dragonfly_final,
(final_f) dragonfly_router_final,
(map_f) codes_mapping,
sizeof(router_state),
},
......
......@@ -35,6 +35,7 @@ int model_net_setup(char* name,
{
method_array[i]->mn_setup(net_params);
method_array[i]->packet_size = packet_size;
model_net_add_lp_type(i);
return(i);
}
}
......
......@@ -260,6 +260,7 @@ static void packet_generate( nodes_state * s,
//msg->saved_available_time = s->next_flit_generate_time[(2*tmp_dim) + tmp_dir][0];
msg->travel_start_time = tw_now(lp);
msg->packet_ID = lp->gid + g_tw_nlp * s->packet_counter;
msg->my_N_hop = 0;
num_chunks = msg->packet_size/CHUNK_SIZE;
s->packet_counter++;
......@@ -284,8 +285,6 @@ static void packet_generate( nodes_state * s,
m->chunk_id = j;
// find destination dimensions using destination LP ID
m->my_N_hop = 0;
// Send the packet out
m->type = SEND;
m->source_direction = tmp_dir;
m->source_dim = tmp_dim;
......@@ -515,7 +514,6 @@ static void node_rc_handler(nodes_state * s, tw_bf * bf, nodes_message * msg, tw
case ARRIVAL:
{
msg->my_N_hop--;
tw_rand_reverse_unif(lp->rng);
tw_rand_reverse_unif(lp->rng);
int next_dim = msg->source_dim;
......@@ -528,6 +526,7 @@ static void node_rc_handler(nodes_state * s, tw_bf * bf, nodes_message * msg, tw
total_time -= tw_now( lp ) - msg->travel_start_time;
total_hops -= msg->my_N_hop;
}
msg->my_N_hop--;
}
break;
......
check_PROGRAMS += tests/modelnet-test
TESTS += tests/modelnet-test.sh
EXTRA_DIST += tests/modelnet-test.sh
TESTS += tests/modelnet-test.sh \
tests/modelnet-test-torus.sh \
tests/modelnet-test-dragonfly.sh
EXTRA_DIST += tests/modelnet-test.sh \
tests/modelnet-test-torus.sh \
tests/modelnet-test-dragonfly.sh
testlib = src/libcodes-net.a
......
......@@ -2,14 +2,10 @@ LPGROUPS
{
MODELNET_GRP
{
repetitions="72";
server="1";
modelnet_dragonfly="1";
}
DRAGONFLY_ROUTER
{
repetitions="36";
dragonfly_router="1";
repetitions="36";
server="2";
modelnet_dragonfly="2";
dragonfly_router="1";
}
}
PARAMS
......@@ -24,6 +20,6 @@ PARAMS
local_bandwidth="5.25";
global_bandwidth="4.7";
cn_bandwidth="5.25";
message_size="1024";
message_size="2048";
routing="minimal";
}
#!/bin/bash
# Run the model-net test in sequential mode (--sync=1) with the dragonfly
# configuration. The script's exit status is that of the test program.
# Shell assignments must not have spaces around '=': "srcdir = ." would try
# to execute a command named "srcdir" and leave $srcdir empty.
srcdir=.
tests/modelnet-test --sync=1 "$srcdir/tests/modelnet-test-dragonfly.conf"
#!/bin/bash
# Run the model-net test in sequential mode (--sync=1) with the dragonfly
# configuration. @srcdir@ is substituted by configure; the script's exit
# status is that of the test program.
# Shell assignments must not have spaces around '=': "srcdir = ..." would try
# to execute a command named "srcdir" and leave $srcdir empty.
srcdir=@srcdir@
tests/modelnet-test --sync=1 "$srcdir/tests/modelnet-test-dragonfly.conf"
......@@ -2,7 +2,7 @@ LPGROUPS
{
MODELNET_GRP
{
repetitions="16";
repetitions="32";
server="1";
modelnet_torus="1";
}
......@@ -13,7 +13,7 @@ PARAMS
modelnet="torus";
message_size="512";
n_dims="4";
dim_length="2,2,2,2";
dim_length="4,2,2,2";
link_bandwidth="2.0";
buffer_size="256";
num_vc="1";
......
#!/bin/bash
# Run the model-net test in sequential mode (--sync=1) with the torus
# configuration. The script's exit status is that of the test program.
# Shell assignments must not have spaces around '=': "srcdir = ." would try
# to execute a command named "srcdir" and leave $srcdir empty.
srcdir=.
tests/modelnet-test --sync=1 "$srcdir/tests/modelnet-test-torus.conf"
#!/bin/bash
# Run the model-net test in sequential mode (--sync=1) with the torus
# configuration. @srcdir@ is substituted by configure; the script's exit
# status is that of the test program.
# Shell assignments must not have spaces around '=': "srcdir = ..." would try
# to execute a command named "srcdir" and leave $srcdir empty.
srcdir=@srcdir@
tests/modelnet-test --sync=1 "$srcdir/tests/modelnet-test-torus.conf"
......@@ -24,11 +24,13 @@
#include "codes/configuration.h"
#include "codes/lp-type-lookup.h"
#define NUM_REQS 1000 /* number of requests sent by each server */
#define NUM_REQS 500 /* number of requests sent by each server */
#define PAYLOAD_SZ 2048 /* size of simulated data payload, bytes */
static int net_id = 0;
static int num_servers = 16;
static int num_routers = 0;
static int num_servers = 0;
static int offset = 2;
typedef struct svr_msg svr_msg;
typedef struct svr_state svr_state;
......@@ -75,9 +77,6 @@ static void svr_rev_event(
static void svr_finalize(
svr_state * ns,
tw_lp * lp);
/*static tw_peid svr_node_mapping(
tw_lpid gid);
*/
tw_lptype svr_lp = {
(init_f) svr_init,
......@@ -136,7 +135,6 @@ static void handle_req_rev_event(
const tw_optdef app_opt [] =
{
TWOPT_GROUP("Model net test case" ),
TWOPT_UINT("num_servers", num_servers, "num_servers"),
TWOPT_END()
};
......@@ -148,7 +146,6 @@ int main(
int rank;
int ret;
lp_io_handle handle;
int message_size=0;
//printf("\n Config count %d ",(int) config.lpgroups_count);
g_tw_ts_end = s_to_ns(60*60*24*365); /* one year, in nsecs */
......@@ -157,7 +154,7 @@ int main(
if(argc < 2)
{
printf("\n Usage: mpirun <args> --sync=2/3 mapping_file_name.conf (optional --nkp) num_servers");
printf("\n Usage: mpirun <args> --sync=2/3 mapping_file_name.conf (optional --nkp) ");
MPI_Finalize();
return 0;
}
......@@ -166,32 +163,17 @@ int main(
configuration_load(argv[2], MPI_COMM_WORLD, &config);
net_id=model_net_set_params();
configuration_get_value_int(&config, "PARAMS", "message_size", &message_size);
model_net_add_lp_type(net_id);
svr_add_lp_type();
codes_mapping_setup();
g_tw_mapping=CUSTOM;
g_tw_custom_initial_mapping=&codes_mapping_init;
g_tw_custom_lp_global_to_local_map=&codes_mapping_to_lp;
g_tw_events_per_pe = 2048 * codes_mapping_get_lps_for_pe();
if(!message_size)
num_servers = codes_mapping_get_group_reps("MODELNET_GRP") * codes_mapping_get_lp_count("MODELNET_GRP", "server");
if(net_id == DRAGONFLY)
{
message_size = 256;
printf("\n Warning: ross message size not defined, resetting it to %d", message_size);
num_routers = codes_mapping_get_group_reps("MODELNET_GRP") * codes_mapping_get_lp_count("MODELNET_GRP", "dragonfly_router");
offset = 1;
}
tw_define_lps(codes_mapping_get_lps_for_pe(), message_size, 0 );
//g_tw_events_per_pe = 2 * NUM_REQS * (codes_mapping_get_lps_for_pe());
/* NOTE: the message size defined here has to be able to handle two
* svr_msg structs and a simplenet message joined together. This allows
* the model to send a single simplenet event that will handle a)
* simplenet routing b) remote event delivery and c) local send
* completion event.
*/
ret = lp_io_prepare("simplenet-test", LP_IO_UNIQ_SUFFIX, &handle, MPI_COMM_WORLD);
if(ret < 0)
{
......@@ -205,7 +187,6 @@ int main(
assert(ret == 0);
tw_end();
return 0;
}
......@@ -233,7 +214,7 @@ static void svr_init(
* simulation
*/
// printf("\n Initializing servers %d ", (int)lp->gid);
//printf("\n Initializing servers %d ", (int)lp->gid);
/* skew each kickoff event slightly to help avoid event ties later on */
kickoff_time = g_tw_lookahead + tw_rand_unif(lp->rng);
......@@ -309,12 +290,6 @@ static void svr_finalize(
return;
}
/*static tw_peid svr_node_mapping(
tw_lpid gid)
{
return (tw_peid) gid / g_tw_nlp;
}*/
/* convert ns to seconds */
static tw_stime ns_to_s(tw_stime ns)
{
......@@ -348,8 +323,13 @@ static void handle_kickoff_event(
/* record when transfers started on this server */
ns->start_ts = tw_now(lp);
int opt_offset = 0;
if(net_id == DRAGONFLY && lp->gid % 5)
opt_offset = 3; /* optional offset due to dragonfly mapping */
/* each server sends a request to the next highest server */
model_net_event(net_id, "test", (lp->gid + 2)%(num_servers*2), PAYLOAD_SZ, sizeof(svr_msg), (const void*)m_remote, sizeof(svr_msg), (const void*)m_local, lp);
int dest_id = (lp->gid + offset + opt_offset)%(num_servers*2 + num_routers);
model_net_event(net_id, "test", dest_id, PAYLOAD_SZ, sizeof(svr_msg), (const void*)m_remote, sizeof(svr_msg), (const void*)m_local, lp);
ns->msg_sent_count++;
}
......@@ -433,7 +413,11 @@ static void handle_ack_event(
/* safety check that this request got to the right server */
// printf("\n m->src %d lp->gid %d ", m->src, lp->gid);
assert(m->src == (lp->gid + 2)%(num_servers*2));
int opt_offset = 0;
if(net_id == DRAGONFLY && lp->gid % 5)
opt_offset = 3;
assert(m->src == (lp->gid + offset + opt_offset)%(num_servers*2 + num_routers));
if(ns->msg_sent_count < NUM_REQS)
{
......@@ -469,7 +453,11 @@ static void handle_req_event(
/* safety check that this request got to the right server */
// printf("\n m->src %d lp->gid %d ", m->src, lp->gid);
assert(lp->gid == (m->src + 2)%(num_servers*2));
int opt_offset = 0;
if(net_id == DRAGONFLY && (m->src % 5))
opt_offset = 3; /* optional offset due to dragonfly mapping */
assert(lp->gid == (m->src + offset + opt_offset)%(num_servers*2 + num_routers));
ns->msg_recvd_count++;
/* send ack back */
......
......@@ -10,6 +10,7 @@ LPGROUPS
PARAMS
{
packet_size="512";
message_size="256";
modelnet="simplenet";
net_startup_ns="1.5";
net_bw_mbps="20000";
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment