Commit 81c58341 authored by Caitlin Ross
Browse files

adding some additional metrics to MPI replay sampling

parent 447e212c
...@@ -41,7 +41,7 @@ PARAMS ...@@ -41,7 +41,7 @@ PARAMS
# bandwidth in GiB/s for compute node-router channels # bandwidth in GiB/s for compute node-router channels
cn_bandwidth="16.0"; cn_bandwidth="16.0";
# ROSS message size # ROSS message size
message_size="720"; message_size="736";
# number of compute nodes connected to router, dictated by dragonfly config # number of compute nodes connected to router, dictated by dragonfly config
# file # file
num_cns_per_router="2"; num_cns_per_router="2";
......
...@@ -23,6 +23,6 @@ PARAMS ...@@ -23,6 +23,6 @@ PARAMS
local_bandwidth="5.25"; local_bandwidth="5.25";
global_bandwidth="4.7"; global_bandwidth="4.7";
cn_bandwidth="5.25"; cn_bandwidth="5.25";
message_size="720"; message_size="736";
routing="adaptive"; routing="adaptive";
} }
...@@ -31,6 +31,6 @@ PARAMS ...@@ -31,6 +31,6 @@ PARAMS
cn_bandwidth="9.0"; cn_bandwidth="9.0";
router_delay="0"; router_delay="0";
link_delay="0"; link_delay="0";
message_size="720"; message_size="736";
routing="minimal"; routing="minimal";
} }
...@@ -10,7 +10,7 @@ LPGROUPS ...@@ -10,7 +10,7 @@ LPGROUPS
PARAMS PARAMS
{ {
packet_size="512"; packet_size="512";
message_size="720"; message_size="736";
modelnet_order=( "torus" ); modelnet_order=( "torus" );
# scheduler options # scheduler options
modelnet_scheduler="fcfs"; modelnet_scheduler="fcfs";
......
...@@ -226,6 +226,9 @@ struct ross_model_sample ...@@ -226,6 +226,9 @@ struct ross_model_sample
double recv_time; double recv_time;
double wait_time; double wait_time;
double compute_time; double compute_time;
double comm_time;
double max_time;
double avg_msg_time;
}; };
typedef struct mpi_msgs_queue mpi_msgs_queue; typedef struct mpi_msgs_queue mpi_msgs_queue;
...@@ -363,6 +366,7 @@ struct nw_message ...@@ -363,6 +366,7 @@ struct nw_message
int64_t saved_num_bytes; int64_t saved_num_bytes;
int saved_syn_length; int saved_syn_length;
unsigned long saved_prev_switch; unsigned long saved_prev_switch;
double saved_prev_max_time;
} rc; } rc;
}; };
...@@ -848,6 +852,11 @@ void arrive_syn_tr_rc(nw_state * s, tw_bf * bf, nw_message * m, tw_lp * lp) ...@@ -848,6 +852,11 @@ void arrive_syn_tr_rc(nw_state * s, tw_bf * bf, nw_message * m, tw_lp * lp)
s->ross_sample.num_bytes_recvd -= data; s->ross_sample.num_bytes_recvd -= data;
s->send_time = m->rc.saved_send_time; s->send_time = m->rc.saved_send_time;
s->ross_sample.send_time = m->rc.saved_send_time_sample; s->ross_sample.send_time = m->rc.saved_send_time_sample;
if((tw_now(lp) - m->fwd.sim_start_time) > s->max_time)
{
s->max_time = m->rc.saved_prev_max_time;
s->ross_sample.max_time = m->rc.saved_prev_max_time;
}
} }
void arrive_syn_tr(nw_state * s, tw_bf * bf, nw_message * m, tw_lp * lp) void arrive_syn_tr(nw_state * s, tw_bf * bf, nw_message * m, tw_lp * lp)
{ {
...@@ -874,7 +883,11 @@ void arrive_syn_tr(nw_state * s, tw_bf * bf, nw_message * m, tw_lp * lp) ...@@ -874,7 +883,11 @@ void arrive_syn_tr(nw_state * s, tw_bf * bf, nw_message * m, tw_lp * lp)
m->rc.saved_send_time = s->send_time; m->rc.saved_send_time = s->send_time;
m->rc.saved_send_time_sample = s->ross_sample.send_time; m->rc.saved_send_time_sample = s->ross_sample.send_time;
if((tw_now(lp) - m->fwd.sim_start_time) > s->max_time) if((tw_now(lp) - m->fwd.sim_start_time) > s->max_time)
{
m->rc.saved_prev_max_time = s->max_time;
s->max_time = tw_now(lp) - m->fwd.sim_start_time; s->max_time = tw_now(lp) - m->fwd.sim_start_time;
s->ross_sample.max_time = tw_now(lp) - m->fwd.sim_start_time;
}
s->send_time += (tw_now(lp) - m->fwd.sim_start_time); s->send_time += (tw_now(lp) - m->fwd.sim_start_time);
s->ross_sample.send_time += (tw_now(lp) - m->fwd.sim_start_time); s->ross_sample.send_time += (tw_now(lp) - m->fwd.sim_start_time);
...@@ -2839,6 +2852,15 @@ void ross_nw_lp_sample_fn(nw_state * s, tw_bf * bf, tw_lp * lp, struct ross_mode ...@@ -2839,6 +2852,15 @@ void ross_nw_lp_sample_fn(nw_state * s, tw_bf * bf, tw_lp * lp, struct ross_mode
sample->nw_id = s->nw_id; sample->nw_id = s->nw_id;
sample->app_id = s->app_id; sample->app_id = s->app_id;
sample->local_rank = s->local_rank; sample->local_rank = s->local_rank;
sample->comm_time = s->elapsed_time - s->compute_time;
if (alloc_spec == 1)
{
struct codes_jobmap_id lid;
lid = codes_jobmap_to_local_id(s->nw_id, jobmap_ctx);
if(strncmp(file_name_of_job[lid.job], "synthetic", 9) == 0)
sample->avg_msg_time = (s->send_time / s->num_recvs);
}
memset(&s->ross_sample, 0, sizeof(s->ross_sample)); memset(&s->ross_sample, 0, sizeof(s->ross_sample));
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment