Commit 95a285af authored by Xin

add allreduce output for OSU benchmark tests

parent 3ca902db
......@@ -28,6 +28,7 @@
/* Debug print helper: forwards its arguments to printf only when CS_LP_DBG
 * evaluates true; otherwise it does nothing. Wrapped in do { } while (0) so
 * it behaves as a single statement.
 * NOTE(review): __VA_ARGS__ is used without any comma-swallowing extension,
 * so a call with only a format string expands to printf(_fmt, ) -- every
 * call appears to need at least one argument after _fmt; confirm at call
 * sites. */
#define lprintf(_fmt, ...) \
do {if (CS_LP_DBG) printf(_fmt, __VA_ARGS__);} while (0)
/* NOTE(review): no use of MAX_STATS is visible in this excerpt. */
#define MAX_STATS 65536
/* Number of slots in the per-LP collective arrays col_latency[] and
 * col_msizes[] of struct nw_state; also scales the col_stats text buffer
 * (100*MAX_MSGS bytes) and bounds the output loop in nw_test_finalize. */
#define MAX_MSGS 50
/* Forward declaration; the definition is not visible in this excerpt.
 * NOTE(review): presumably the qhash comparison callback keyed on message
 * size -- verify against the definition. */
static int msg_size_hash_compare(
void *key, struct qhash_head *link);
......@@ -236,6 +237,10 @@ struct nw_state
double all_reduce_time;
int num_all_reduce;
double col_latency[MAX_MSGS];
uint64_t col_msizes[MAX_MSGS];
int num_msg_sizes;
double elapsed_time;
/* time spent in compute operations */
double compute_time;
......@@ -274,7 +279,7 @@ struct nw_state
int max_arr_size;
struct mpi_workload_sample * mpi_wkld_samples;
char output_buf[512];
char col_stats[64];
char col_stats[100*MAX_MSGS];
};
/* data for handling reverse computation.
......@@ -1757,6 +1762,7 @@ void nw_test_init(nw_state* s, tw_lp* lp)
s->num_reduce = 0;
s->reduce_time = 0;
s->all_reduce_time = 0;
s->num_msg_sizes = 0;
if(!num_net_traces)
num_net_traces = num_mpi_lps;
......@@ -1849,7 +1855,7 @@ void nw_test_init(nw_state* s, tw_lp* lp)
}
else
{
printf("\n Trace %s job id %d %d ", file_name_of_job[lid.job], s->app_id, s->local_rank);
printf("\n Trace %s job id %d %d on nid %d", file_name_of_job[lid.job], s->app_id, s->local_rank, s->nw_id);
strcpy(params_d.file_name, file_name_of_job[lid.job]);
params_d.num_net_traces = num_traces_of_job[lid.job];
params = (char*)&params_d;
......@@ -2029,6 +2035,9 @@ static void get_next_mpi_operation_rc(nw_state* s, tw_bf * bf, nw_message * m, t
{
if(bf->c1)
{
s->col_latency[s->num_msg_sizes] = 0;
s->col_msizes[s->num_msg_sizes] = 0;
//todo: reverse handler for num_msg_sizes
s->num_all_reduce--;
s->col_time = m->rc.saved_send_time;
s->all_reduce_time -= s->col_time;
......@@ -2174,6 +2183,16 @@ static void get_next_mpi_operation(nw_state* s, tw_bf * bf, nw_message * m, tw_l
{
bf->c1 = 1;
m->rc.saved_delay = s->all_reduce_time;
if(s->num_msg_sizes == 0)
{
s->num_msg_sizes += 1;
}
else if(s->col_msizes[s->num_msg_sizes-1] != mpi_op.u.collective.num_bytes)
{
s->num_msg_sizes += 1;
}
s->col_latency[s->num_msg_sizes-1] += (tw_now(lp) - s->col_time);
s->col_msizes[s->num_msg_sizes-1] = mpi_op.u.collective.num_bytes;
s->all_reduce_time += (tw_now(lp) - s->col_time);
m->rc.saved_send_time = s->col_time;
s->col_time = 0;
......@@ -2296,8 +2315,14 @@ void nw_test_finalize(nw_state* s, tw_lp* lp)
written = 0;
if(debug_cols)
written += sprintf(s->col_stats + written, "%lld \t %lf \n", s->nw_id, ns_to_s(s->all_reduce_time / s->num_all_reduce));
{
for (int i=0; i< MAX_MSGS; i++)
{
written += sprintf(s->col_stats + written, "\n %d %d %lld %lf",
s->local_rank, s->num_all_reduce, s->col_msizes[i], s->col_latency[i]);
}
//written += sprintf(s->col_stats + written, "%lld \t %lf \n", s->nw_id, ns_to_s(s->all_reduce_time / s->num_all_reduce));
}
lp_io_write(lp->gid, "avg-all-reduce-time", written, s->col_stats);
avg_time += s->elapsed_time;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment