Commit 80e9fef0 authored by Misbah Mubarak

Adding MPI rendezvous protocol, statistics recording for dragonfly validation

parent da5879aa
......@@ -29,12 +29,15 @@ int main(int argc, char **argv) {
int r = atoi(argv[2]);
int c = atoi(argv[3]);
int total_routers = g * r * c;
FILE *intra = fopen(argv[4], "wb");
FILE *inter = fopen(argv[5], "wb");
int router = 0;
int green = 0, black = 1;
int groups = 0;
printf("\n Rows %d Cols %d Groups %d ", r, c, g);
for(int rows = 0; rows < r; rows++) {
for(int cols = 0; cols < c; cols++) {
for(int cols1 = 0; cols1 < c; cols1++) {
......@@ -79,15 +82,19 @@ int main(int argc, char **argv) {
int dstB = (nsrcg % (gs/2)) * 2;
srcr = srcrB + srcB;
dstr = dstrB + dstB;
for(int r = 0; r < 2; r++) {
for(int block = 0; block < gsize; block++) {
fwrite(&srcr, sizeof(int), 1, inter);
fwrite(&dstr, sizeof(int), 1, inter);
printf("INTER %d %d\n", srcr, dstr);
if(srcr >= total_routers || dstr >= total_routers)
printf("\n connection between invalid routers src %d and dest %d ", srcr, dstr);
for(int r = 0; r < 2; r++) {
for(int block = 0; block < gsize; block++) {
fwrite(&srcr, sizeof(int), 1, inter);
fwrite(&dstr, sizeof(int), 1, inter);
//printf("INTER %d %d srcg %d destg %d srcrb %d dstrB %d \n", srcr, dstr, srcg, dstg, srcrB, dstrB);
}
}
srcr++;
dstr++;
}
}
}
}
......
......@@ -49,9 +49,9 @@ PARAMS
# number of global channels per router
num_global_channels="10";
# network config file for intra-group connections
intra-group-connections="@abs_srcdir@/intra-custom-small";
intra-group-connections="@abs_srcdir@/intra-theta";
# network config file for inter-group connections
inter-group-connections="@abs_srcdir@/inter-custom-small";
inter-group-connections="@abs_srcdir@/inter-theta";
# routing protocol to be used
routing="prog-adaptive";
}
......@@ -49,9 +49,9 @@ PARAMS
# number of global channels per router
num_global_channels="10";
# network config file for intra-group connections
intra-group-connections="../src/network-workloads/conf/dragonfly-custom/intra-custom";
intra-group-connections="../src/network-workloads/conf/dragonfly-custom/intra-custom-upd";
# network config file for inter-group connections
inter-group-connections="../src/network-workloads/conf/dragonfly-custom/inter-custom";
inter-group-connections="../src/network-workloads/conf/dragonfly-custom/inter-custom-upd";
# routing protocol to be used
routing="prog-adaptive";
}
......@@ -37,7 +37,7 @@ PARAMS
#bandwidth in GiB/s for local channels
local_bandwidth="5.25";
# bandwidth in GiB/s for global channels
global_bandwidth="18.75";
global_bandwidth="4.69";
# bandwidth in GiB/s for compute node-router channels
cn_bandwidth="16.0";
# ROSS message size
......@@ -52,5 +52,5 @@ PARAMS
# network config file for inter-group connections
inter-group-connections="../src/network-workloads/conf/dragonfly-custom/inter-theta";
# routing protocol to be used
routing="adaptive";
routing="prog-adaptive";
}
......@@ -35,8 +35,8 @@ static lp_io_handle io_handle;
static unsigned int lp_io_use_suffix = 0;
static int do_lp_io = 0;
static int num_msgs = 20;
static unsigned int sampling_interval = 800000;
static unsigned int sampling_end_time = 1600000;
static tw_stime sampling_interval = 800000;
static tw_stime sampling_end_time = 1600000;
typedef struct svr_msg svr_msg;
typedef struct svr_state svr_state;
......@@ -111,8 +111,8 @@ const tw_optdef app_opt [] =
TWOPT_GROUP("Model net synthetic traffic " ),
TWOPT_UINT("traffic", traffic, "UNIFORM RANDOM=1, NEAREST NEIGHBOR=2 "),
TWOPT_UINT("num_messages", num_msgs, "Number of messages to be generated per terminal "),
TWOPT_UINT("sampling-interval", sampling_interval, "the sampling interval "),
TWOPT_UINT("sampling-end-time", sampling_end_time, "sampling end time "),
TWOPT_STIME("sampling-interval", sampling_interval, "the sampling interval "),
TWOPT_STIME("sampling-end-time", sampling_end_time, "sampling end time "),
TWOPT_STIME("arrival_time", arrival_time, "INTER-ARRIVAL TIME"),
TWOPT_CHAR("lp-io-dir", lp_io_dir, "Where to place io output (unspecified -> no output"),
TWOPT_UINT("lp-io-use-suffix", lp_io_use_suffix, "Whether to append uniq suffix to lp-io directory (default 0)"),
......
......@@ -35,8 +35,8 @@ static lp_io_handle io_handle;
static unsigned int lp_io_use_suffix = 0;
static int do_lp_io = 0;
static int num_msgs = 20;
static unsigned int sampling_interval = 800000;
static unsigned int sampling_end_time = 1600000;
static tw_stime sampling_interval = 800000;
static tw_stime sampling_end_time = 1600000;
typedef struct svr_msg svr_msg;
typedef struct svr_state svr_state;
......@@ -139,8 +139,8 @@ const tw_optdef app_opt [] =
TWOPT_GROUP("Model net synthetic traffic " ),
TWOPT_UINT("traffic", traffic, "UNIFORM RANDOM=1, NEAREST NEIGHBOR=2 "),
TWOPT_UINT("num_messages", num_msgs, "Number of messages to be generated per terminal "),
TWOPT_UINT("sampling-interval", sampling_interval, "the sampling interval "),
TWOPT_UINT("sampling-end-time", sampling_end_time, "sampling end time "),
TWOPT_STIME("sampling-interval", sampling_interval, "the sampling interval "),
TWOPT_STIME("sampling-end-time", sampling_end_time, "sampling end time "),
TWOPT_STIME("arrival_time", arrival_time, "INTER-ARRIVAL TIME"),
TWOPT_CHAR("lp-io-dir", lp_io_dir, "Where to place io output (unspecified -> no output"),
TWOPT_UINT("lp-io-use-suffix", lp_io_use_suffix, "Whether to append uniq suffix to lp-io directory (default 0)"),
......
......@@ -99,6 +99,8 @@ static int sample_rtr_bytes_written = 0;
static char cn_sample_file[MAX_NAME_LENGTH];
static char router_sample_file[MAX_NAME_LENGTH];
static tw_stime mpi_soft_overhead = 150;
typedef struct terminal_custom_message_list terminal_custom_message_list;
struct terminal_custom_message_list {
terminal_custom_message msg;
......@@ -515,7 +517,7 @@ static void dragonfly_read_config(const char * anno, dragonfly_param *params){
fprintf(stderr, "Bandwidth of compute node channels not specified, setting to %lf\n", p->cn_bandwidth);
}
p->router_delay = 50;
p->router_delay = 100;
configuration_get_value_double(&config, "PARAMS", "router_delay", anno,
&p->router_delay);
......@@ -627,7 +629,10 @@ static void dragonfly_read_config(const char * anno, dragonfly_param *params){
}
FILE *systemFile = fopen(interFile, "rb");
if(!myRank)
{
printf("Reading inter-group connectivity file: %s\n", interFile);
printf("\n Total routers %d total groups %d ", p->total_routers, p->num_groups);
}
{
vector< int > offsets;
......@@ -637,6 +642,7 @@ static void dragonfly_read_config(const char * anno, dragonfly_param *params){
for(int g = 0; g < connectionList.size(); g++) {
connectionList[g].resize(p->num_groups);
}
InterGroupLink newLink;
while(fread(&newLink, sizeof(InterGroupLink), 1, systemFile) != 0) {
......@@ -668,7 +674,7 @@ static void dragonfly_read_config(const char * anno, dragonfly_param *params){
printf(" ( %d - ", it->first);
for(int l = 0; l < it->second.size(); l++) {
// offset is number of local connections
// type is blue or green according to Cray architecture
// type is black or green according to Cray architecture
printf("%d,%d ", it->second[l].offset, it->second[l].type);
}
printf(")");
......@@ -995,18 +1001,18 @@ static tw_stime dragonfly_custom_packet_event(
if(is_last_pckt) /* Its the last packet so pass in remote and local event information*/
{
if(req->remote_event_size > 0)
{
msg->remote_event_size_bytes = req->remote_event_size;
memcpy(tmp_ptr, remote_event, req->remote_event_size);
tmp_ptr += req->remote_event_size;
}
if(req->self_event_size > 0)
{
msg->local_event_size_bytes = req->self_event_size;
memcpy(tmp_ptr, self_event, req->self_event_size);
tmp_ptr += req->self_event_size;
}
if(req->remote_event_size > 0)
{
msg->remote_event_size_bytes = req->remote_event_size;
memcpy(tmp_ptr, remote_event, req->remote_event_size);
tmp_ptr += req->remote_event_size;
}
if(req->self_event_size > 0)
{
msg->local_event_size_bytes = req->self_event_size;
memcpy(tmp_ptr, self_event, req->self_event_size);
tmp_ptr += req->self_event_size;
}
}
//printf("\n dragonfly remote event %d local event %d last packet %d %lf ", msg->remote_event_size_bytes, msg->local_event_size_bytes, is_last_pckt, xfer_to_nic_time);
tw_event_send(e_new);
......@@ -1119,13 +1125,18 @@ static void packet_generate(terminal_state * s, tw_bf * bf, terminal_custom_mess
int total_event_size;
uint64_t num_chunks = msg->packet_size / p->chunk_size;
double cn_delay = s->params->cn_delay;
if (msg->packet_size % s->params->chunk_size)
num_chunks++;
if(!num_chunks)
num_chunks = 1;
nic_ts = g_tw_lookahead + (num_chunks * s->params->cn_delay) + tw_rand_unif(lp->rng);
if(msg->packet_size < s->params->chunk_size)
cn_delay = bytes_to_ns(msg->packet_size % s->params->chunk_size, s->params->cn_bandwidth);
nic_ts = g_tw_lookahead + (num_chunks * cn_delay) + tw_rand_unif(lp->rng);
msg->packet_ID = lp->gid + g_tw_nlp * s->packet_counter;
msg->my_N_hop = 0;
......@@ -1138,7 +1149,7 @@ static void packet_generate(terminal_state * s, tw_bf * bf, terminal_custom_mess
msg->packet_ID, s->terminal_id, LLU(msg->dest_terminal_id),
LLU(msg->packet_size), LLU(num_chunks));
for(uint64_t i = 0; i < num_chunks; i++)
for(int i = 0; i < num_chunks; i++)
{
terminal_custom_message_list *cur_chunk = (terminal_custom_message_list*)malloc(
sizeof(terminal_custom_message_list));
......@@ -1449,7 +1460,7 @@ static void send_remote_event(terminal_state * s, terminal_custom_message * msg,
{
void * tmp_ptr = model_net_method_get_edata(DRAGONFLY, msg);
//tw_stime ts = g_tw_lookahead + bytes_to_ns(msg->remote_event_size_bytes, (1/s->params->cn_bandwidth));
tw_stime ts = g_tw_lookahead + tw_rand_unif(lp->rng);
tw_stime ts = g_tw_lookahead + mpi_soft_overhead + tw_rand_unif(lp->rng);
if (msg->is_pull){
bf->c4 = 1;
struct codes_mctx mc_dst =
......@@ -1785,7 +1796,7 @@ void dragonfly_custom_rsample_fin(router_state * s,
"link traffic for each of the %d links (int64_t) \nsample end time (double) forward events per sample \nreverse events per sample ",
p->radix, p->radix);
fprintf(fp, "\n\nOrdering of links \n%d local (router-router same group) channels \n%d global (router-router remote group)"
" channels \n%d terminal channels", p->radix/2, p->radix/4, p->radix/4);
" channels \n%d terminal channels", p->num_col_chans * p->num_router_rows, p->num_global_channels);
fclose(fp);
}
char rt_fn[MAX_NAME_LENGTH];
......@@ -2887,7 +2898,10 @@ router_packet_send( router_state * s,
num_chunks = 1;
double bytetime = delay;
if(cur_entry->msg.packet_size == 0)
bytetime = bytes_to_ns(CREDIT_SIZE, bandwidth);
if((cur_entry->msg.packet_size % s->params->chunk_size) && (cur_entry->msg.chunk_id == num_chunks - 1))
bytetime = bytes_to_ns(cur_entry->msg.packet_size % s->params->chunk_size, bandwidth);
......
......@@ -1694,7 +1694,7 @@ static void torus_report_stats()
if(!g_tw_mynode)
{
printf(" Average number of hops traversed %f average message latency %lf us maximum message latency %lf us finished packets %lld finished hops %lld \n",
printf(" Average number of hops traversed %f average packet latency %lf us maximum packet latency %lf us finished packets %lld finished hops %lld \n",
(float)avg_hops/total_finished_packets, avg_time/(total_finished_packets*1000), max_time/1000, total_finished_packets, avg_hops);
}
}
......
......@@ -32,9 +32,12 @@ TESTS += tests/lp-io-test.sh \
tests/modelnet-test-dragonfly.sh \
tests/modelnet-test-slimfly.sh \
tests/modelnet-test-dragonfly-synthetic.sh \
tests/modelnet-test-dragonfly-traces.sh \
tests/modelnet-test-dragonfly-custom-synthetic.sh \
tests/modelnet-test-dragonfly-custom-traces.sh \
tests/modelnet-test-fattree-synthetic.sh \
tests/modelnet-test-slimfly-synthetic.sh \
tests/modelnet-test-slimfly-traces.sh \
tests/modelnet-p2p-bw-loggp.sh \
tests/modelnet-prio-sched-test.sh
......@@ -57,9 +60,16 @@ EXTRA_DIST += tests/lp-io-test.sh \
tests/expected/mapping_test.out \
tests/modelnet-test.sh \
tests/modelnet-test-torus.sh \
tests/modelnet-test-torus-traces.sh \
tests/modelnet-test-loggp.sh \
tests/modelnet-test-dragonfly.sh \
tests/modelnet-test-dragonfly-synthetic.sh \
tests/modelnet-test-dragonfly-traces.sh \
tests/modelnet-test-dragonfly-custom-synthetic.sh \
tests/modelnet-test-dragonfly-custom-traces.sh \
tests/modelnet-test-slimfly.sh \
tests/modelnet-test-slimfly-synthetic.sh \
tests/modelnet-test-slimfly-traces.sh \
tests/modelnet-p2p-bw-loggp.sh \
tests/modelnet-prio-sched-test.sh \
tests/conf/concurrent_msg_recv.conf \
......
#!/bin/bash
# Replays 27 DUMPI network traces through model-net-mpi-replay on two MPI
# ranks, using the modelnet-test-dragonfly-1728-nodes.conf configuration.
#
# Requires: srcdir must be set to the top of the source tree.
# Exit status: 1 if srcdir is unset, otherwise the status of mpirun.

# Quote the expansion so a source path containing whitespace is still tested
# as a single word instead of making `[` fail with "too many arguments".
if [ -z "$srcdir" ]; then
    echo srcdir variable not set. >&2
    exit 1
fi

# The workload files are read from /tmp/df_AMG_n27_dumpi. The other trace
# replay tests source this helper before running; presumably it fetches the
# traces into that directory (confirm against tests/download-traces.sh).
# Sourcing it here keeps this test from depending on another test running first.
source "$srcdir/tests/download-traces.sh"

# Locate the configuration through $srcdir, like the other trace replay tests,
# so the test also works when the build directory is not the source directory.
mpirun -np 2 src/network-workloads/model-net-mpi-replay --sync=3 --num_net_traces=27 --disable_compute=1 --workload_file=/tmp/df_AMG_n27_dumpi/dumpi-2014.03.03.14.55.00- --workload_type="dumpi" -- "$srcdir/src/network-workloads/conf/dragonfly-custom/modelnet-test-dragonfly-1728-nodes.conf"
#!/bin/bash
# Runs a one-message synthetic dragonfly simulation, then replays 27 DUMPI
# network traces through model-net-mpi-replay on two MPI ranks.
#
# Requires: srcdir must be set to the top of the source tree.
# Exit status: 1 if srcdir is unset, otherwise the status of the final mpirun
# (a failure of the synthetic run alone does not change the exit status).

# Quote the expansion so a source path containing whitespace is still tested
# as a single word instead of making `[` fail with "too many arguments".
if [ -z "$srcdir" ]; then
    echo srcdir variable not set. >&2
    exit 1
fi

# Helper sourced before the replay; presumably it fetches the trace files read
# from /tmp/df_AMG_n27_dumpi below (confirm against tests/download-traces.sh).
source "$srcdir/tests/download-traces.sh"

# Synthetic-traffic run with the dragonfly configuration.
src/network-workloads/model-net-synthetic --sync=1 --num_messages=1 -- "$srcdir/src/network-workloads/conf/modelnet-synthetic-dragonfly.conf"

# Trace replay with the dragonfly AMG configuration.
mpirun -np 2 src/network-workloads/model-net-mpi-replay --disable_compute=1 --sync=3 --num_net_traces=27 --workload_file=/tmp/df_AMG_n27_dumpi/dumpi-2014.03.03.14.55.00- --workload_type="dumpi" -- "$srcdir/src/network-workloads/conf/modelnet-mpi-test-dfly-amg-216.conf"
#!/bin/bash
# Replays 27 DUMPI network traces through model-net-mpi-replay on two MPI
# ranks, using the modelnet-mpi-test-dfly-amg-216.conf configuration.
#
# Requires: srcdir must be set to the top of the source tree.
# Exit status: 1 if srcdir is unset, otherwise the status of mpirun.

# Quote the expansion so a source path containing whitespace is still tested
# as a single word instead of making `[` fail with "too many arguments".
if [ -z "$srcdir" ]; then
    echo srcdir variable not set. >&2
    exit 1
fi

# Helper sourced before the replay; presumably it fetches the trace files read
# from /tmp/df_AMG_n27_dumpi below (confirm against tests/download-traces.sh).
source "$srcdir/tests/download-traces.sh"

mpirun -np 2 src/network-workloads/model-net-mpi-replay --disable_compute=1 --sync=3 --num_net_traces=27 --workload_file=/tmp/df_AMG_n27_dumpi/dumpi-2014.03.03.14.55.00- --workload_type="dumpi" -- "$srcdir/src/network-workloads/conf/modelnet-mpi-test-dfly-amg-216.conf"
#!/bin/bash
# Replays 27 DUMPI network traces through model-net-mpi-replay on two MPI
# ranks, using the modelnet-mpi-test-slimfly-min.conf configuration.
#
# Requires: srcdir must be set to the top of the source tree.
# Exit status: 1 if srcdir is unset, otherwise the status of mpirun.

# Quote the expansion so a source path containing whitespace is still tested
# as a single word instead of making `[` fail with "too many arguments".
if [ -z "$srcdir" ]; then
    echo srcdir variable not set. >&2
    exit 1
fi

# Helper sourced before the replay; presumably it fetches the trace files read
# from /tmp/df_AMG_n27_dumpi below (confirm against tests/download-traces.sh).
source "$srcdir/tests/download-traces.sh"

mpirun -np 2 src/network-workloads/model-net-mpi-replay --disable_compute=1 --sync=3 --num_net_traces=27 --workload_file=/tmp/df_AMG_n27_dumpi/dumpi-2014.03.03.14.55.00- --workload_type="dumpi" -- "$srcdir/src/network-workloads/conf/modelnet-mpi-test-slimfly-min.conf"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment