Commit 66ba3dda authored by Misbah Mubarak

Adding statistics collection using lp-io in model-net API (comes from the...

Add statistics collection using lp-io to the model-net API (adapted from the simple-net statistics collection); it works for the simple-net, torus and dragonfly network models.
parent 36405e04
......@@ -19,6 +19,10 @@
#define MEAN_PROCESS 1.0
#define MAX_NAME_LENGTH 256
#define CATEGORY_NAME_MAX 16
#define CATEGORY_MAX 12
// debugging parameters
#define TRACK 235221
#define PRINT_ROUTER_TABLE 1
......@@ -109,6 +113,7 @@ struct terminal_state
// Terminal generate, sends and arrival T_SEND, T_ARRIVAL, T_GENERATE
// Router-Router Intra-group sends and receives RR_LSEND, RR_LARRIVE
// Router-Router Inter-group sends and receives RR_GSEND, RR_GARRIVE
struct mn_stats dragonfly_stats_array[CATEGORY_MAX];
};
/* terminal event type (1-4) */
enum event_t
......
......@@ -10,12 +10,16 @@
#include "ross.h"
#include "codes/lp-type-lookup.h"
#include "codes/configuration.h"
#include "codes/lp-io.h"
#define MAX_NAME_LENGTH 256
#define CATEGORY_NAME_MAX 16
#define CATEGORY_MAX 12
typedef struct simplenet_param simplenet_param;
typedef struct dragonfly_param dragonfly_param;
typedef struct torus_param torus_param;
typedef struct mn_stats mn_stats;
enum NETWORKS
{
......@@ -24,6 +28,19 @@ enum NETWORKS
DRAGONFLY
};
/* Per-category network statistics record.
 * The network LP states in this change each hold an array of CATEGORY_MAX of
 * these; entries are looked up by name with model_net_find_stats(). */
struct mn_stats
{
/* category name; an empty string marks an unused entry */
char category[CATEGORY_NAME_MAX];
/* number of sends recorded for this category */
long send_count;
/* sum of the message/packet sizes of the recorded sends */
long send_bytes;
/* accumulated send time; the amount added per send is computed by each
 * network model (units presumably match tw_stime simulation time — TODO confirm) */
tw_stime send_time;
/* number of receives recorded for this category */
long recv_count;
/* sum of the message/packet sizes of the recorded receives */
long recv_bytes;
/* accumulated receive time; the amount added per receive is computed by each
 * network model */
tw_stime recv_time;
/* largest total event size seen so far (only ever raised, see the
 * packet_generate handlers) */
long max_event_size;
};
/* structs for initializing a network/ specifying network parameters */
struct simplenet_param
{
......@@ -130,6 +147,15 @@ int model_net_get_packet_size(int net_id);
void model_net_add_lp_type(int net_id);
void model_net_report_stats(int net_id);
/* Format one statistics record for LP `lpid` as a text line and write it
 * through lp-io under an identifier built from stat->category. */
void model_net_write_stats(tw_lpid lpid, mn_stats* stat);
/* Write every in-use entry (non-empty category name) of mn_stats_array for
 * LP `lpid`, followed by one "all" record that aggregates those entries.
 * mn_stats_array must have CATEGORY_MAX entries. */
void model_net_print_stats(tw_lpid lpid, mn_stats mn_stats_array[]);
/* Return the entry of mn_stats_array whose name equals `category`; if the
 * scan reaches an unused entry first, that entry is given the name and
 * returned. mn_stats_array must have CATEGORY_MAX entries. */
mn_stats* model_net_find_stats(const char* category, mn_stats mn_stats_array[]);
#endif /* MODELNET_H */
/*
......
/*
* Copyright (C) 2013 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*
*/
* * Copyright (C) 2013, University of Chicago
* *
* * See COPYRIGHT notice in top-level directory.
* */
#ifndef INC_torus_h
#define INC_torus_h
......@@ -10,6 +10,7 @@
#include <ross.h>
#include <assert.h>
#include "codes/lp-io.h"
#include "codes/codes_mapping.h"
#include "codes/codes.h"
#include "codes/model-net.h"
......@@ -18,10 +19,12 @@
#define CHUNK_SIZE 32
#define DEBUG 1
#define MEAN_INTERVAL 100
#define CATEGORY_NAME_MAX 16
#define MAX_NAME_LENGTH 256
#define TRACE -1
#define CATEGORY_NAME_MAX 16
#define CATEGORY_MAX 12
/* Torus network model implementation of codes, implements the modelnet API */
// Total number of nodes in torus, calculate in main
......@@ -115,6 +118,9 @@ struct nodes_state
/* neighbor LP ids for this torus node */
int* neighbour_minus_lpID;
int* neighbour_plus_lpID;
/* records torus statistics for this LP having different communication categories */
struct mn_stats torus_stats_array[CATEGORY_MAX];
};
struct nodes_message
......
......@@ -202,7 +202,7 @@ void packet_generate(terminal_state * s, tw_bf * bf, terminal_message * msg, tw_
tw_stime ts;
tw_event *e;
terminal_message *m;
int i;
int i, total_event_size;
num_chunks = msg->packet_size / CHUNK_SIZE;
msg->packet_ID = lp->gid + g_tw_nlp * s->packet_counter + tw_rand_integer(lp->rng, 0, lp->gid + g_tw_nlp * s->packet_counter);
msg->travel_start_time = tw_now(lp);
......@@ -246,6 +246,14 @@ void packet_generate(terminal_state * s, tw_bf * bf, terminal_message * msg, tw_
exit(-1);
} //else
} // for
total_event_size = dragonfly_get_msg_sz() + msg->remote_event_size_bytes + msg->local_event_size_bytes;
mn_stats* stat;
stat = model_net_find_stats(msg->category, s->dragonfly_stats_array);
stat->send_count++;
stat->send_bytes += msg->packet_size;
stat->send_time += (1/cn_bandwidth) * msg->packet_size;
if(stat->max_event_size < total_event_size)
stat->max_event_size = total_event_size;
return;
}
......@@ -336,6 +344,11 @@ if( msg->packet_ID == TRACK && msg->chunk_id == num_chunks-1)
if(msg->chunk_id == num_chunks-1)
{
bf->c2 = 1;
mn_stats* stat = model_net_find_stats(msg->category, s->dragonfly_stats_array);
stat->recv_count++;
stat->recv_bytes += msg->packet_size;
stat->recv_time += tw_now(lp) - msg->travel_start_time;
N_finished_packets++;
dragonfly_total_time += tw_now( lp ) - msg->travel_start_time;
total_hops += msg->my_N_hop;
......@@ -455,6 +468,7 @@ void
dragonfly_terminal_final( terminal_state * s,
tw_lp * lp )
{
model_net_print_stats(lp->gid, s->dragonfly_stats_array);
}
void dragonfly_router_final(router_state * s,
......@@ -796,6 +810,11 @@ void terminal_rc_event_handler(terminal_state * s, tw_bf * bf, terminal_message
for(i = 0; i < num_chunks; i++)
tw_rand_reverse_unif(lp->rng);
mn_stats* stat;
stat = model_net_find_stats(msg->category, s->dragonfly_stats_array);
stat->send_count--;
stat->send_bytes -= msg->packet_size;
stat->send_time -= (1/cn_bandwidth) * msg->packet_size;
}
break;
......@@ -816,6 +835,11 @@ void terminal_rc_event_handler(terminal_state * s, tw_bf * bf, terminal_message
s->next_credit_available_time = msg->saved_credit_time;
if(bf->c2)
{
mn_stats* stat;
stat = model_net_find_stats(msg->category, s->dragonfly_stats_array);
stat->recv_count--;
stat->recv_bytes -= msg->packet_size;
stat->recv_time -= tw_now(lp) - msg->travel_start_time;
N_finished_packets--;
dragonfly_total_time -= (tw_now(lp) - msg->travel_start_time);
total_hops -= msg->my_N_hop;
......
......@@ -24,11 +24,13 @@ static struct model_net_method* method_array[] =
static int model_net_get_msg_sz(int net_id);
static lp_io_handle handle;
int model_net_setup(char* name,
int packet_size,
const void* net_params)
{
int i;
int i, ret;
/* find struct for underlying method (according to configuration file) */
for(i=0; method_array[i] != NULL; i++)
{
......@@ -37,6 +39,11 @@ int model_net_setup(char* name,
method_array[i]->mn_setup(net_params);
method_array[i]->packet_size = packet_size;
model_net_add_lp_type(i);
ret = lp_io_prepare(name, LP_IO_UNIQ_SUFFIX, &handle, MPI_COMM_WORLD);
if(ret < 0)
{
return -1;
}
return(i);
}
}
......@@ -44,6 +51,88 @@ int model_net_setup(char* name,
return -1; // indicating error
}
/* Format one statistics record as a tab-separated text line and hand it to
 * lp-io.
 *
 * lpid: global id of the LP the record belongs to; it is printed in the
 *       record and passed to lp_io_write().
 * stat: record to emit; stat->category selects the lp-io identifier
 *       "model-net-category-<category>".
 *
 * The identifier buffer is sized from CATEGORY_NAME_MAX. The
 * "model-net-category-" prefix alone is 19 characters, so a fixed 32-byte
 * buffer is overrun by any category name longer than 12 characters.
 * NOTE(review): lp-io may impose its own identifier length limit -- confirm
 * against lp-io.h.
 *
 * A failing lp_io_write() is treated as fatal via assert(). */
void model_net_write_stats(tw_lpid lpid, struct mn_stats* stat)
{
    /* sizeof the literal includes its terminating NUL, so this holds the
     * prefix plus a category of up to CATEGORY_NAME_MAX characters. */
    char id[sizeof("model-net-category-") + CATEGORY_NAME_MAX];
    char data[1024];
    int ret;

    snprintf(id, sizeof(id), "model-net-category-%s", stat->category);
    snprintf(data, sizeof(data),
             "lp:%ld\tsend_count:%ld\tsend_bytes:%ld\tsend_time:%f\t"
             "recv_count:%ld\trecv_bytes:%ld\trecv_time:%f\tmax_event_size:%ld\n",
             (long)lpid,
             stat->send_count,
             stat->send_bytes,
             stat->send_time,
             stat->recv_count,
             stat->recv_bytes,
             stat->recv_time,
             stat->max_event_size);

    /* the terminating NUL is not written, only the text of the line */
    ret = lp_io_write(lpid, id, strlen(data), data);
    assert(ret == 0);
    return;
}
/* Emit the statistics of one LP.
 *
 * Every entry of mn_stats_array (CATEGORY_MAX entries) that has a non-empty
 * category name is written with model_net_write_stats() and folded into a
 * running total. The total is written last, under the category name "all".
 * Counters and times are summed; max_event_size is the maximum over the
 * entries. */
void model_net_print_stats(tw_lpid lpid, mn_stats mn_stats_array[])
{
    struct mn_stats total;
    int slot;

    memset(&total, 0, sizeof(total));
    sprintf(total.category, "all");

    for(slot = 0; slot < CATEGORY_MAX; slot++)
    {
        mn_stats *entry = &mn_stats_array[slot];

        /* an empty name marks an entry that was never claimed */
        if(entry->category[0] == '\0')
            continue;

        total.send_count += entry->send_count;
        total.send_bytes += entry->send_bytes;
        total.send_time  += entry->send_time;
        total.recv_count += entry->recv_count;
        total.recv_bytes += entry->recv_bytes;
        total.recv_time  += entry->recv_time;
        total.max_event_size = (entry->max_event_size > total.max_event_size)
                                   ? entry->max_event_size
                                   : total.max_event_size;

        model_net_write_stats(lpid, entry);
    }

    model_net_write_stats(lpid, &total);
}
/* Look up, or create on first use, the statistics entry for a category.
 *
 * category:       NUL-terminated category name. It must be shorter than
 *                 CATEGORY_NAME_MAX so that it fits, terminator included, in
 *                 an entry's name field; this is asserted.
 * mn_stats_array: table of CATEGORY_MAX entries. An entry with an empty name
 *                 is unused, and the scan stops at the first such entry.
 *
 * Returns the entry whose name equals `category`, or the first unused entry
 * after giving it that name. If every entry is in use by another category the
 * assertion fires; when assertions are compiled out NULL is returned rather
 * than a pointer past the end of the table. */
struct mn_stats* model_net_find_stats(const char* category, mn_stats mn_stats_array[])
{
    int i;

    /* An overlong name would be written past the end of the entry's
     * category field, so reject it before touching the table. */
    assert(strlen(category) < CATEGORY_NAME_MAX);

    for(i = 0; i < CATEGORY_MAX; i++)
    {
        struct mn_stats *entry = &mn_stats_array[i];

        if(entry->category[0] == '\0')
        {
            /* first unused entry: claim it for this category; the copy is
             * bounded by the field size even if the assertion is disabled */
            snprintf(entry->category, sizeof(entry->category), "%s", category);
            return entry;
        }
        if(strcmp(category, entry->category) == 0)
        {
            return entry;
        }
    }

    /* all CATEGORY_MAX entries belong to other categories */
    assert(0 && "model_net_find_stats: statistics table is full");
    return NULL;
}
void model_net_event(
int net_id,
char* category,
......@@ -348,7 +437,11 @@ void model_net_report_stats(int net_id)
// TODO: ADd checks by network names
// // Add dragonfly and torus network models
return method_array[net_id]->mn_report_stats();
method_array[net_id]->mn_report_stats();
int ret = lp_io_flush(handle, MPI_COMM_WORLD);
assert(ret == 0);
return;
}
/* registers the lp type */
void model_net_add_lp_type(int net_id)
......
......@@ -30,25 +30,12 @@ enum sn_event_type
};
/* Per-category statistics record kept by each simplenet LP
 * (sn_state holds CATEGORY_MAX of these). */
struct sn_stats
{
/* category name; an empty string marks an unused entry */
char category[CATEGORY_NAME_MAX];
/* number of message-start events recorded for this category */
long send_count;
/* sum of net_msg_size_bytes over the recorded sends */
long send_bytes;
/* accumulated send time: startup cost plus transfer time per send
 * (presumably nanoseconds, going by the *_ns helpers — TODO confirm) */
tw_stime send_time;
/* number of message-ready events recorded for this category */
long recv_count;
/* sum of net_msg_size_bytes over the recorded receives */
long recv_bytes;
/* accumulated transfer time of the recorded receives */
tw_stime recv_time;
/* largest event size recorded; taken as a maximum when entries are aggregated */
long max_event_size;
};
/* State of one simplenet LP. */
struct sn_state
{
/* next idle times for network card, both inbound and outbound */
tw_stime net_send_next_idle;
tw_stime net_recv_next_idle;
/* per-category statistics, looked up by category name in the event handlers.
 * NOTE(review): two declarations of sn_stats_array appear here (element type
 * sn_stats and mn_stats); a struct cannot have two members with the same
 * name, so only one of them can be intended — presumably the mn_stats one,
 * since the handlers pass this array to model_net_find_stats(). Confirm. */
struct sn_stats sn_stats_array[CATEGORY_MAX];
struct mn_stats sn_stats_array[CATEGORY_MAX];
};
struct sn_message
......@@ -184,7 +171,6 @@ static void handle_msg_start_event(
tw_bf * b,
sn_message * m,
tw_lp * lp);
static struct sn_stats* find_stats(const char* category, sn_state *ns);
/* returns pointer to LP information for simplenet module */
static const tw_lptype* sn_get_lp_type()
......@@ -279,59 +265,11 @@ static void sn_rev_event(
return;
}
/* Write one simplenet statistics record through lp-io.
 *
 * The record is a single tab-separated text line tagged with the LP's global
 * id, stored under the identifier "sn-category-<category>". A failing
 * lp_io_write() trips the assertion. */
static void write_stats(tw_lp* lp, struct sn_stats* stat)
{
    char record[1024];
    char ident[32];
    int rc;

    sprintf(ident, "sn-category-%s", stat->category);

    sprintf(record, "lp:%ld\tsend_count:%ld\tsend_bytes:%ld\tsend_time:%f\t"
        "recv_count:%ld\trecv_bytes:%ld\trecv_time:%f\tmax_event_size:%ld\n",
        (long)lp->gid,
        stat->send_count, stat->send_bytes, stat->send_time,
        stat->recv_count, stat->recv_bytes, stat->recv_time,
        stat->max_event_size);

    /* only the text is written; the terminating NUL is left out */
    rc = lp_io_write(lp->gid, ident, strlen(record), record);
    assert(rc == 0);
}
/* Finalize hook for a simplenet LP: reports the LP's per-category statistics.
 *
 * ns: state of the LP, holding the sn_stats_array table
 * lp: the LP being finalized; lp->gid tags the written records
 *
 * NOTE(review): as listed, this body both aggregates and writes the
 * statistics locally via write_stats() and then calls
 * model_net_print_stats(), which performs the same aggregation and writing.
 * This looks like the before and after lines of one change shown together;
 * confirm which of the two paths is meant to remain. */
static void sn_finalize(
sn_state * ns,
tw_lp * lp)
{
int i;
/* running total over all in-use categories, reported as category "all" */
struct sn_stats all;
memset(&all, 0, sizeof(all));
sprintf(all.category, "all");
for(i=0; i<CATEGORY_MAX; i++)
{
/* entries with an empty name were never used */
if(strlen(ns->sn_stats_array[i].category) > 0)
{
all.send_count += ns->sn_stats_array[i].send_count;
all.send_bytes += ns->sn_stats_array[i].send_bytes;
all.send_time += ns->sn_stats_array[i].send_time;
all.recv_count += ns->sn_stats_array[i].recv_count;
all.recv_bytes += ns->sn_stats_array[i].recv_bytes;
all.recv_time += ns->sn_stats_array[i].recv_time;
/* max_event_size is a maximum, not a sum */
if(ns->sn_stats_array[i].max_event_size > all.max_event_size)
all.max_event_size = ns->sn_stats_array[i].max_event_size;
write_stats(lp, &ns->sn_stats_array[i]);
}
}
write_stats(lp, &all);
/* shared model-net reporting of the same table */
model_net_print_stats(lp->gid, &ns->sn_stats_array[0]);
return;
}
......@@ -362,11 +300,11 @@ static void handle_msg_ready_rev_event(
sn_message * m,
tw_lp * lp)
{
struct sn_stats* stat;
struct mn_stats* stat;
ns->net_recv_next_idle = m->net_recv_next_idle_saved;
stat = find_stats(m->category, ns);
stat = model_net_find_stats(m->category, ns->sn_stats_array);
stat->recv_count--;
stat->recv_bytes -= m->net_msg_size_bytes;
stat->recv_time -= rate_to_ns(m->net_msg_size_bytes, global_net_bw_mbs);
......@@ -386,11 +324,11 @@ static void handle_msg_ready_event(
tw_stime recv_queue_time = 0;
tw_event *e_new;
sn_message *m_new;
struct sn_stats* stat;
struct mn_stats* stat;
//printf("handle_msg_ready_event(), lp %llu.\n", (unsigned long long)lp->gid);
/* add statistics */
stat = find_stats(m->category, ns);
stat = model_net_find_stats(m->category, ns->sn_stats_array);
stat->recv_count++;
stat->recv_bytes += m->net_msg_size_bytes;
stat->recv_time += rate_to_ns(m->net_msg_size_bytes, global_net_bw_mbs);
......@@ -431,8 +369,6 @@ static void handle_msg_start_rev_event(
sn_message * m,
tw_lp * lp)
{
struct sn_stats* stat;
ns->net_send_next_idle = m->net_send_next_idle_saved;
if(m->local_event_size_bytes > 0)
......@@ -440,7 +376,8 @@ static void handle_msg_start_rev_event(
codes_local_latency_reverse(lp);
}
stat = find_stats(m->category, ns);
mn_stats* stat;
stat = model_net_find_stats(m->category, ns->sn_stats_array);
stat->send_count--;
stat->send_bytes -= m->net_msg_size_bytes;
stat->send_time -= global_net_startup_ns + rate_to_ns(m->net_msg_size_bytes, global_net_bw_mbs);
......@@ -460,7 +397,7 @@ static void handle_msg_start_event(
tw_event *e_new;
sn_message *m_new;
tw_stime send_queue_time = 0;
struct sn_stats* stat;
mn_stats* stat;
int mapping_grp_id, mapping_type_id, mapping_rep_id, mapping_offset;
tw_lpid dest_id;
char lp_type_name[MAX_NAME_LENGTH], lp_group_name[MAX_NAME_LENGTH];
......@@ -470,7 +407,7 @@ static void handle_msg_start_event(
//printf("handle_msg_start_event(), lp %llu.\n", (unsigned long long)lp->gid);
/* add statistics */
stat = find_stats(m->category, ns);
stat = model_net_find_stats(m->category, ns->sn_stats_array);
stat->send_count++;
stat->send_bytes += m->net_msg_size_bytes;
stat->send_time += global_net_startup_ns + rate_to_ns(m->net_msg_size_bytes, global_net_bw_mbs);
......@@ -526,36 +463,6 @@ static void handle_msg_start_event(
return;
}
/* Look up, or create on first use, the statistics entry for a category in a
 * simplenet LP's table.
 *
 * category: NUL-terminated category name. It must be shorter than
 *           CATEGORY_NAME_MAX so that it fits, terminator included, in an
 *           entry's name field; this is asserted.
 * ns:       LP state holding sn_stats_array (CATEGORY_MAX entries). An entry
 *           with an empty name is unused, and the scan stops at the first
 *           such entry.
 *
 * Returns the entry whose name equals `category`, or the first unused entry
 * after giving it that name. If every entry is in use by another category the
 * assertion fires; when assertions are compiled out NULL is returned rather
 * than a pointer past the end of the table. */
static struct sn_stats* find_stats(const char* category, sn_state *ns)
{
    int i;

    /* An overlong name would be written past the end of the entry's
     * category field, so reject it before touching the table. */
    assert(strlen(category) < CATEGORY_NAME_MAX);

    for(i = 0; i < CATEGORY_MAX; i++)
    {
        struct sn_stats *entry = &ns->sn_stats_array[i];

        if(entry->category[0] == '\0')
        {
            /* first unused entry: claim it for this category; the copy is
             * bounded by the field size even if the assertion is disabled */
            snprintf(entry->category, sizeof(entry->category), "%s", category);
            return entry;
        }
        if(strcmp(category, entry->category) == 0)
        {
            return entry;
        }
    }

    /* all CATEGORY_MAX entries belong to other categories */
    assert(0 && "find_stats: statistics table is full");
    return NULL;
}
/* Model-net function calls */
/*This method will serve as an intermediate layer between simplenet and modelnet.
......
......@@ -250,7 +250,7 @@ static void packet_generate( nodes_state * s,
tw_lp * lp )
{
// printf("\n msg local event size %d remote event size %d ", msg->local_event_size_bytes, msg->remote_event_size_bytes);
int j, tmp_dir=-1, tmp_dim=-1;
int j, tmp_dir=-1, tmp_dim=-1, total_event_size;
tw_stime ts;
// event triggered when packet head is sent
......@@ -303,6 +303,17 @@ static void packet_generate( nodes_state * s,
exit(-1);
}
}
total_event_size = torus_get_msg_sz() + msg->remote_event_size_bytes + msg->local_event_size_bytes;
/* record the statistics of the generated packets */
mn_stats* stat;
stat = model_net_find_stats(msg->category, s->torus_stats_array);
stat->send_count++;
stat->send_bytes += msg->packet_size;
stat->send_time += (1/link_bandwidth) * msg->packet_size;
/* record the maximum ROSS event size */
if(stat->max_event_size < total_event_size)
stat->max_event_size = total_event_size;
}
/*Sends a 8-byte credit back to the torus node LP that sent the message */
static void credit_send( nodes_state * s,
......@@ -333,7 +344,6 @@ static void credit_send( nodes_state * s,
m->type = CREDIT;
tw_event_send( buf_e );
}
/* send a packet from one torus node to another torus node
A packet can be up to 256 bytes on BG/L and BG/P and up to 512 bytes on BG/Q */
......@@ -420,8 +430,9 @@ static void packet_arrive( nodes_state * s,
tw_event *e;
tw_stime ts;
nodes_message *m;
mn_stats* stat;
credit_send( s, bf, lp, msg); // Commented on May 22nd to check if the credit needs to be sent from the final destination or not
credit_send( s, bf, lp, msg);
msg->my_N_hop++;
ts = 0.1 + tw_rand_exponential(lp->rng, MEAN_INTERVAL/200);
......@@ -432,6 +443,12 @@ static void packet_arrive( nodes_state * s,
if( msg->chunk_id == num_chunks - 1 )
{
bf->c2 = 1;
stat = model_net_find_stats(msg->category, s->torus_stats_array);
stat->recv_count++;
stat->recv_bytes += msg->packet_size;
stat->recv_time += tw_now( lp ) - msg->travel_start_time;
/*count the number of packets completed overall*/
N_finished_packets++;
total_time += tw_now( lp ) - msg->travel_start_time;
total_hops += msg->my_N_hop;
......@@ -488,6 +505,7 @@ static void torus_report_stats()
void
final( nodes_state * s, tw_lp * lp )
{
model_net_print_stats(lp->gid, &s->torus_stats_array[0]);
free(s->next_link_available_time);
free(s->next_credit_available_time);
free(s->next_flit_generate_time);
......@@ -514,7 +532,12 @@ static void node_rc_handler(nodes_state * s, tw_bf * bf, nodes_message * msg, tw
//s->next_flit_generate_time[(saved_dim * 2) + saved_dir][0] = msg->saved_available_time;
for(i=0; i < num_chunks; i++)
tw_rand_reverse_unif(lp->rng);
tw_rand_reverse_unif(lp->rng);
mn_stats* stat;
stat = model_net_find_stats(msg->category, s->torus_stats_array);
stat->send_count--;
stat->send_bytes -= msg->packet_size;
stat->send_time -= (1/link_bandwidth) * msg->packet_size;
}
break;
......@@ -528,6 +551,11 @@ static void node_rc_handler(nodes_state * s, tw_bf * bf, nodes_message * msg, tw
s->next_credit_available_time[next_dir + ( next_dim * 2 )][0] = msg->saved_available_time;
if(bf->c2)
{
struct mn_stats* stat;
stat = model_net_find_stats(msg->category, s->torus_stats_array);
stat->recv_count--;
stat->recv_bytes -= msg->packet_size;
stat->recv_time -= tw_now(lp) - msg->travel_start_time;
N_finished_packets--;
total_time -= tw_now( lp ) - msg->travel_start_time;
total_hops -= msg->my_N_hop;
......
......@@ -144,8 +144,6 @@ int main(
{
int nprocs;
int rank;
int ret;
lp_io_handle handle;
//printf("\n Config count %d ",(int) config.lpgroups_count);
g_tw_ts_end = s_to_ns(60*60*24*365); /* one year, in nsecs */
......@@ -174,18 +172,9 @@ int main(
offset = 1;
}
ret = lp_io_prepare("simplenet-test", LP_IO_UNIQ_SUFFIX, &handle, MPI_COMM_WORLD);
if(ret < 0)
{
return(-1);
}
tw_run();
model_net_report_stats(net_id);
ret = lp_io_flush(handle, MPI_COMM_WORLD);
assert(ret == 0);
tw_end();
return 0;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment