Commit dd64e83f authored by Misbah Mubarak's avatar Misbah Mubarak

Updating documentation for sampling, generating meta files for sampled data

parent ef5adb76
************ Synthetic traffic with dragonfly network model **********
- traffic patterns supported: uniform random, nearest group and nearest neighbor traffic.
- Uniform random traffic: sends messages to a randomly selected destination
- 1--> Uniform random traffic: sends messages to a randomly selected destination
node. This traffic pattern is uniformly distributed throughout the
network and gives better performance with minimal routing than with
non-minimal or adaptive routing.
- Nearest group traffic: with minimal routing, it sends traffic to the
- 2--> Nearest group traffic: with minimal routing, it sends traffic to the
single global channel connecting two groups (it congests the network when
using minimal routing). This pattern performs better with non-minimal
and adaptive routing algorithms.
- Nearest neighbor traffic: it sends traffic to the next node, potentially
- 3--> Nearest neighbor traffic: it sends traffic to the next node, potentially
connected to the same router.
SAMPLING:
- The modelnet_enable_sampling function takes a sampling interval "t" and
an end time. Over this end time, dragonfly model will collect compute
an end time in nanoseconds. Until this end time, the dragonfly model will collect compute
node and router samples after every "t" simulated nanoseconds. The
sampling output files can be specified in the config file using
names of the sampling output files can be specified in the config file using
cn_sample_file and rt_sample_file arguments. By default the compute node
and router outputs will be sent to dragonfly-cn-sampling-%d.bin and
dragonfly-router-sampling-%d.bin. Corresponding metadata files for also
dragonfly-router-sampling-%d.bin. Corresponding metadata files are also
generated that give information on the file format, the dragonfly
configuration being used, the router radix, etc.
......@@ -26,20 +26,22 @@ SAMPLING:
text format can be found at
src/networks/model-net/read-dragonfly-sample.c (Note that the router
radix aka RADIX needs to be tuned with the dragonfly configuration in the
utility to enable contiguous array allocation).
utility to enable contiguous array allocation. By default the radix is
set to 16 corresponding to a 1,056 node dragonfly network). The utility
can be built using mpicc and it expects the generated binary files to be
in the same directory when doing the translation from binary into text.
HOW TO RUN:
ROSS optimistic mode:
mpirun -np 4 ./src/models/network-workloads/model-net-synthetic --sync=3
--traffic=3 --lp-io-dir=mn_synthetic --lp-io-use-suffix=1 --arrival_time=100.0
-- ../src/models/network-workloads/conf/modelnet-synthetic-dragonfly.conf
mpirun -np 4 ./bin/model-net-synthetic --sync=3 --traffic=1
--lp-io-dir=mn_synthetic --lp-io-use-suffix=1 --arrival_time=1000.0 --
../src/network-workloads/conf/modelnet-synthetic-dragonfly.conf
ROSS serial mode:
./src/models/network-workloads/model-net-synthetic --sync=1 --traffic=3
--lp-io-dir=mn_synthetic --lp-io-use-suffix=1 --arrival_time=100.0 --
../src/models/network-workloads/conf/modelnet-synthetic-dragonfly.conf
./bin/model-net-synthetic --sync=1 --traffic=1
--lp-io-dir=mn_synthetic --lp-io-use-suffix=1 --arrival_time=1000.0 --
../src/network-workloads/conf/modelnet-synthetic-dragonfly.conf
options:
......@@ -53,11 +55,9 @@ num_msgs: number of messages generated per terminal. Each message has a size of
traffic: 1 for uniform random traffic, 2 for nearest group traffic and 3 for
nearest neighbor traffic.
sampling-interval: if time-stepped series sampling is turned on, this parameter
can be used to configure the sampling interval.
sampling-interval: this parameter can be used to configure the sampling interval.
sampling-end-time: if time-stepped series sampling is turned on, this parameter
can be used to configure end time.
sampling-end-time: this parameter can be used to configure the sampling end time.
lp-io-dir: generates network traffic information on dragonfly terminals and
routers. Here is information on individual files:
......
......@@ -718,7 +718,6 @@ static void codes_exec_mpi_recv_rc(
nw_message* m,
tw_lp* lp)
{
num_bytes_recvd -= m->rc.saved_num_bytes;
ns->recv_time = m->rc.saved_recv_time;
if(m->fwd.found_match >= 0)
{
......@@ -781,8 +780,6 @@ static void codes_exec_mpi_recv(
m->rc.saved_recv_time = s->recv_time;
m->rc.saved_num_bytes = mpi_op->u.recv.num_bytes;
num_bytes_recvd += mpi_op->u.recv.num_bytes;
mpi_msgs_queue * recv_op = (mpi_msgs_queue*) malloc(sizeof(mpi_msgs_queue));
recv_op->req_init_time = tw_now(lp);
recv_op->op_type = mpi_op->op_type;
......@@ -990,6 +987,7 @@ static void update_arrival_queue_rc(nw_state* s,
{
s->recv_time = m->rc.saved_recv_time;
s->num_bytes_recvd -= m->fwd.num_bytes;
num_bytes_recvd -= m->fwd.num_bytes;
codes_local_latency_reverse(lp);
......@@ -1045,6 +1043,7 @@ static void update_arrival_queue(nw_state* s, tw_bf * bf, nw_message * m, tw_lp
// printf("\n Dest rank %d local rank %d ", m->fwd.dest_rank, s->local_rank);
m->rc.saved_recv_time = s->recv_time;
s->num_bytes_recvd += m->fwd.num_bytes;
num_bytes_recvd += m->fwd.num_bytes;
// send a callback to the sender to increment times
// find the global id of the source
......@@ -1270,7 +1269,7 @@ static void get_next_mpi_operation_rc(nw_state* s, tw_bf * bf, nw_message * m, t
if(m->op_type == CODES_WK_ISEND)
codes_issue_next_event_rc(lp);
s->num_sends--;
s->num_bytes_sent += m->rc.saved_num_bytes;
s->num_bytes_sent -= m->rc.saved_num_bytes;
num_bytes_sent -= m->rc.saved_num_bytes;
}
break;
......@@ -1703,7 +1702,9 @@ int main( int argc, char** argv )
assert(num_net_traces);
if(!g_tw_mynode)
printf("\n Total bytes sent %llu recvd %llu \n max runtime %lf ns avg runtime %lf \n max comm time %lf avg comm time %lf \n max send time %lf avg send time %lf \n max recv time %lf avg recv time %lf \n max wait time %lf avg wait time %lf \n", total_bytes_sent, total_bytes_recvd,
printf("\n Total bytes sent %llu recvd %llu \n max runtime %lf ns avg runtime %lf \n max comm time %lf avg comm time %lf \n max send time %lf avg send time %lf \n max recv time %lf avg recv time %lf \n max wait time %lf avg wait time %lf \n",
total_bytes_sent,
total_bytes_recvd,
max_run_time, avg_run_time/num_net_traces,
max_comm_run_time, avg_comm_run_time/num_net_traces,
total_max_send_time, total_avg_send_time/num_net_traces,
......
......@@ -75,7 +75,7 @@ int main( int argc, char** argv )
long in_sz = st.st_size;
event_array = malloc(in_sz);
sprintf(buffer_write, "dragonfly-write-log.%d", my_rank);
sprintf(buffer_write, "dragonfly-write-log-%d.dat", my_rank);
writeFile = fopen(buffer_write, "w");
if(pFile == NULL || writeFile == NULL)
......@@ -83,13 +83,21 @@ int main( int argc, char** argv )
fputs("\n File error ", stderr);
return -1;
}
if(my_rank == 0)
{
char meta_filename[128];
sprintf(meta_filename, "dragonfly-write-log.meta");
FILE * fp_meta = fopen(meta_filename, "w+");
fprintf(fp_meta, "Rank_ID num_finished_chunks data_size_finished(bytes) finished_hops time_spent(ns) busy_time(ns) num_fwd_events num_rev_events sample_end_time(ns)");
fclose(fp_meta);
}
fseek(pFile, 0L, SEEK_SET);
fread(event_array, sizeof(struct dfly_samples), in_sz / sizeof(struct dfly_samples), pFile);
fprintf(writeFile, " Rank_ID Finished_chunks Data_size Finished_hops Time_spent busy_time fwd_events rev_events sample_end_time");
for(i = 0; i < in_sz / sizeof(struct dfly_samples); i++)
{
printf("\n Terminal id %ld ", event_array[i].terminal_id);
fprintf(writeFile, "\n %ld %ld %ld %lf %lf %lf %ld %ld %lf ", event_array[i].terminal_id,
fprintf(writeFile, "%ld %ld %ld %lf %lf %lf %ld %ld %lf \n", event_array[i].terminal_id,
event_array[i].fin_chunks_sample,
event_array[i].data_size_sample,
event_array[i].fin_hops_sample,
......@@ -117,7 +125,7 @@ int main( int argc, char** argv )
r_event_array = malloc(in_sz_rt);
int sample_size = sizeof(struct dfly_rtr_sample);
sprintf(buffer_rtr_write, "dragonfly-rtr-write-log.%d", my_rank);
sprintf(buffer_rtr_write, "dragonfly-rtr-write-%d.dat", my_rank);
writeRouterFile = fopen(buffer_rtr_write, "w");
if(writeRouterFile == NULL || pFile == NULL)
......@@ -127,13 +135,23 @@ int main( int argc, char** argv )
return -1;
}
fseek(pFile, 0L, SEEK_SET);
if(my_rank == 0)
{
char rtr_meta_filename[128];
sprintf(rtr_meta_filename, "dragonfly-rtr-write-log.meta");
FILE * fp_rtr_meta = fopen(rtr_meta_filename, "w+");
fprintf(fp_rtr_meta, "%d entries for busy time and link traffic \n", RADIX);
fprintf(fp_rtr_meta, "Format: Router_ID Busy_time_per_channel(ns) Link_traffic_per_channel(ns) Sample_end_time(ns) fwd_events reverse_events");
fclose(fp_rtr_meta);
}
fread(r_event_array, sample_size, in_sz_rt / sample_size, pFile);
fprintf(writeRouterFile, "\n Router_ID Busy_time_per_channel Link_traffic_per_channel Sample_end_time fwd_events reverse_events");
//printf("\n Sample size %d in_sz_rt %ld ", in_sz_rt / sample_size, in_sz_rt);
for(i = 0; i < in_sz_rt / sample_size; i++)
{
//printf("\n %ld ", r_event_array[i].router_id);
fprintf(writeRouterFile, "\n %ld ", r_event_array[i].router_id);
fprintf(writeRouterFile, "%ld ", r_event_array[i].router_id);
for(j = 0; j < RADIX; j++ )
{
......@@ -146,7 +164,7 @@ int main( int argc, char** argv )
//printf("\n link traffic %ld ", r_event_array[i].link_traffic[j]);
fprintf(writeRouterFile, " %ld ", r_event_array[i].link_traffic[j]);
}
fprintf(writeRouterFile, " %lf \n", r_event_array[i].end_time);
fprintf(writeRouterFile, " %lf ", r_event_array[i].end_time);
fprintf(writeRouterFile, " %ld ", r_event_array[i].fwd_events);
fprintf(writeRouterFile, " %ld \n", r_event_array[i].rev_events);
}
......@@ -163,10 +181,19 @@ int main( int argc, char** argv )
mpi_event_array = malloc(in_sz_mpi);
int mpi_sample_sz = sizeof(struct mpi_workload_sample);
sprintf(buffer_rtr_write, "dragonfly-mpi-write-logs-%d", my_rank);
sprintf(buffer_rtr_write, "dragonfly-mpi-write-logs-%d.dat", my_rank);
writeFile = fopen(buffer_rtr_write, "w+");
assert(writeFile);
if(my_rank == 0)
{
char ops_meta_filename[128];
sprintf(ops_meta_filename, "dragonfly-mpi-write-logs.meta");
FILE * fp_ops_meta = fopen(ops_meta_filename, "w+");
fprintf(fp_ops_meta, "network_node_id app_id num_sends num_bytes_sent num_waits sample_end_time(ns)");
fclose(fp_ops_meta);
}
fread(mpi_event_array, mpi_sample_sz, in_sz_mpi / mpi_sample_sz, pFile);
for(i = 0; i < in_sz_mpi / mpi_sample_sz; i++)
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment