Commit 77e85ef9 authored by Caitlin Ross
Browse files

changes related to the ROSS model-level sampling feature

parent 1bc73715
......@@ -21,8 +21,8 @@ const tw_lptype* lp_type_lookup(const char* name);
/* register an LP with CODES/ROSS */
void lp_type_register(const char* name, const tw_lptype* type);
void trace_type_register(const char* name, const st_trace_type* type);
const st_trace_type* trace_type_lookup(const char* name);
void st_model_type_register(const char* name, const st_model_types* type);
const st_model_types* st_model_type_lookup(const char* name);
#ifdef __cplusplus
}
#endif
......
......@@ -63,8 +63,8 @@ struct model_net_method
revent_f mn_sample_rc_fn;
init_f mn_sample_init_fn;
final_f mn_sample_fini_fn;
void (*mn_trace_register)(st_trace_type *base_type);
const st_trace_type* (*mn_get_trace_type)();
void (*mn_model_stat_register)(st_model_types *base_type);
const st_model_types* (*mn_get_model_stat_types)();
};
extern struct model_net_method * method_array[];
......
......@@ -388,7 +388,7 @@ void model_net_set_msg_param(
/* returns pointer to LP information for simplenet module */
const tw_lptype* model_net_get_lp_type(int net_id);
const st_trace_type* model_net_get_trace_type(int net_id);
const st_model_types* model_net_get_model_stat_type(int net_id);
DEPRECATED
uint64_t model_net_get_packet_size(int net_id);
......
......@@ -91,6 +91,8 @@ struct terminal_message
tw_stime saved_hist_start_time;
tw_stime saved_sample_time;
tw_stime msg_start_time;
tw_stime saved_busy_time_ross;
tw_stime saved_fin_chunks_ross;
int saved_hist_num;
int saved_occupancy;
......
......@@ -1569,22 +1569,29 @@ void nw_lp_event_collect(nw_message *m, tw_lp *lp, char *buffer, int *collect_fl
memcpy(buffer, &type, sizeof(type));
}
st_trace_type nw_lp_trace_types[] = {
/* Model-level stats callback for the "nw-lp" LP type.
 * Currently a stub: it writes nothing to buffer (the type is registered with
 * a model-stat size of 0) and leaves the LP state untouched. */
void nw_lp_model_stat_collect(nw_state *s, tw_lp *lp, char *buffer)
{
    /* all arguments are unused for now */
    (void)s;
    (void)lp;
    (void)buffer;
}
st_model_types nw_lp_model_types[] = {
{(rbev_trace_f) nw_lp_event_collect,
sizeof(int),
(ev_trace_f) nw_lp_event_collect,
sizeof(int)},
sizeof(int),
(model_stat_f) nw_lp_model_stat_collect,
0},
{0}
};
static const st_trace_type *nw_lp_get_trace_types(void)
static const st_model_types *nw_lp_get_model_stat_types(void)
{
return(&nw_lp_trace_types[0]);
return(&nw_lp_model_types[0]);
}
void nw_lp_register_trace()
void nw_lp_register_model()
{
trace_type_register("nw-lp", nw_lp_get_trace_types());
st_model_type_register("nw-lp", nw_lp_get_model_stat_types());
}
/* end of ROSS event tracing setup */
......@@ -1659,8 +1666,8 @@ int main( int argc, char** argv )
nw_add_lp_type();
model_net_register();
if (g_st_ev_trace)
nw_lp_register_trace();
if (g_st_ev_trace || g_st_model_stats)
nw_lp_register_model();
net_ids = model_net_configure(&num_nets);
// assert(num_nets == 1);
......
......@@ -130,22 +130,29 @@ void ft_svr_event_collect(svr_msg *m, tw_lp *lp, char *buffer, int *collect_flag
memcpy(buffer, &type, sizeof(type));
}
st_trace_type ft_svr_trace_types[] = {
/* Model-level stats callback for the fat-tree test "server" LP type.
 * Currently a stub: it writes nothing to buffer (the type is registered with
 * a model-stat size of 0) and leaves the LP state untouched. */
void ft_svr_model_stat_collect(svr_state *s, tw_lp *lp, char *buffer)
{
    /* all arguments are unused for now */
    (void)s;
    (void)lp;
    (void)buffer;
}
st_model_types ft_svr_model_types[] = {
{(rbev_trace_f) ft_svr_event_collect,
sizeof(int),
(ev_trace_f) ft_svr_event_collect,
sizeof(int)},
sizeof(int),
(model_stat_f) ft_svr_model_stat_collect,
0},
{0}
};
static const st_trace_type *ft_svr_get_trace_types(void)
static const st_model_types *ft_svr_get_model_stat_types(void)
{
return(&ft_svr_trace_types[0]);
return(&ft_svr_model_types[0]);
}
void ft_svr_register_trace()
void ft_svr_register_model_stats()
{
trace_type_register("server", ft_svr_get_trace_types());
st_model_type_register("server", ft_svr_get_model_stat_types());
}
const tw_optdef app_opt [] =
......@@ -437,7 +444,7 @@ int main(
svr_add_lp_type();
if (g_st_ev_trace)
ft_svr_register_trace();
ft_svr_register_model_stats();
codes_mapping_setup();
......
......@@ -116,22 +116,29 @@ void svr_event_collect(svr_msg *m, tw_lp *lp, char *buffer, int *collect_flag)
memcpy(buffer, &type, sizeof(type));
}
st_trace_type svr_trace_types[] = {
/* Model-level stats callback for the "server" LP type.
 * Currently a stub: it writes nothing to buffer (the type is registered with
 * a model-stat size of 0) and leaves the LP state untouched. */
void svr_model_stat_collect(svr_state *s, tw_lp *lp, char *buffer)
{
    /* all arguments are unused for now */
    (void)s;
    (void)lp;
    (void)buffer;
}
st_model_types svr_model_types[] = {
{(rbev_trace_f) svr_event_collect,
sizeof(int),
(ev_trace_f) svr_event_collect,
sizeof(int)},
sizeof(int),
(model_stat_f) svr_model_stat_collect,
0},
{0}
};
static const st_trace_type *svr_get_trace_types(void)
static const st_model_types *svr_get_model_stat_types(void)
{
return(&svr_trace_types[0]);
return(&svr_model_types[0]);
}
void svr_register_trace()
void svr_register_model_types()
{
trace_type_register("server", svr_get_trace_types());
st_model_type_register("server", svr_get_model_stat_types());
}
const tw_optdef app_opt [] =
......@@ -393,8 +400,8 @@ int main(
model_net_register();
svr_add_lp_type();
if (g_st_ev_trace)
svr_register_trace();
if (g_st_ev_trace || g_st_model_stats)
svr_register_model_types();
codes_mapping_setup();
......
......@@ -261,6 +261,13 @@ struct terminal_state
/* for logging forward and reverse events */
long fwd_events;
long rev_events;
/* following used for ROSS model-level stats collection */
long fin_chunks_ross_sample;
long data_size_ross_sample;
long fin_hops_ross_sample;
tw_stime fin_chunks_time_ross_sample;
tw_stime busy_time_ross_sample;
};
/* terminal event type (1-4) */
......@@ -346,7 +353,33 @@ struct router_state
long fwd_events;
long rev_events;
/* following used for ROSS model-level stats collection */
tw_stime* busy_time_ross_sample;
int64_t * link_traffic_ross_sample;
};
/* had to pull some of the ROSS model stats collection stuff up here */
void dragonfly_event_collect(terminal_message *m, tw_lp *lp, char *buffer, int *collect_flag);
void dragonfly_model_stat_collect(terminal_state *s, tw_lp *lp, char *buffer);
void dfly_router_model_stat_collect(router_state *s, tw_lp *lp, char *buffer);
/*
 * ROSS instrumentation callbacks for the dragonfly LPs.
 *
 * Each entry lists, in order: the rollback/commit event-trace callback and
 * its record size, the full event-trace callback and its record size, and
 * the model-level stats callback and its record size.
 *
 *   [0] terminal LPs (returned by dragonfly_get_model_types())
 *   [1] router LPs   (returned by dfly_router_get_model_types())
 */
st_model_types dragonfly_model_types[] = {
    {(rbev_trace_f) dragonfly_event_collect,
     sizeof(int),
     (ev_trace_f) dragonfly_event_collect,
     sizeof(int),
     (model_stat_f) dragonfly_model_stat_collect,
     /* Must match what dragonfly_model_stat_collect() writes: the terminal
      * id (tw_lpid), three long counters (finished chunks, data size, hops)
      * and two tw_stime values (chunk travel time, busy time). */
     sizeof(tw_lpid) + sizeof(long) * 3 + sizeof(tw_stime) * 2},
    {(rbev_trace_f) dragonfly_event_collect,
     sizeof(int),
     (ev_trace_f) dragonfly_event_collect,
     sizeof(int),
     (model_stat_f) dfly_router_model_stat_collect,
     0}, //updated in router_setup() since it's based on the radix
    {0}
};
/* End of ROSS model stats collection */
static short routing = MINIMAL;
......@@ -769,6 +802,13 @@ terminal_init( terminal_state * s,
s->in_send_loop = 0;
s->issueIdle = 0;
/* set up for ROSS stats sampling */
s->fin_chunks_ross_sample = 0;
s->data_size_ross_sample = 0;
s->fin_hops_ross_sample = 0;
s->fin_chunks_time_ross_sample = 0.0;
s->busy_time_ross_sample = 0.0;
dragonfly_collective_init(s, lp);
return;
}
......@@ -842,6 +882,12 @@ static void router_setup(router_state * r, tw_lp * lp)
r->busy_time = (tw_stime*)malloc(p->radix * sizeof(tw_stime));
r->busy_time_sample = (tw_stime*)malloc(p->radix * sizeof(tw_stime));
/* set up for ROSS stats sampling */
r->link_traffic_ross_sample = (int64_t*)calloc(p->radix, sizeof(int64_t));
r->busy_time_ross_sample = (tw_stime*)calloc(p->radix, sizeof(tw_stime));
if (g_st_model_stats)
lp->model_types->mstat_sz = sizeof(tw_lpid) + (sizeof(int64_t) + sizeof(tw_stime)) * p->radix;
rc_stack_create(&r->st);
for(int i=0; i < p->radix; i++)
{
......@@ -1250,6 +1296,7 @@ static void packet_send_rc(terminal_state * s, tw_bf * bf, terminal_message * ms
s->busy_time = msg->saved_total_time;
s->last_buf_full = msg->saved_busy_time;
s->busy_time_sample = msg->saved_sample_time;
s->busy_time_ross_sample = msg->saved_busy_time_ross;
}
}
return;
......@@ -1364,9 +1411,11 @@ static void packet_send(terminal_state * s, tw_bf * bf, terminal_message * msg,
msg->saved_total_time = s->busy_time;
msg->saved_busy_time = s->last_buf_full;
msg->saved_sample_time = s->busy_time_sample;
msg->saved_busy_time_ross = s->busy_time_ross_sample;
s->busy_time += (tw_now(lp) - s->last_buf_full);
s->busy_time_sample += (tw_now(lp) - s->last_buf_full);
s->busy_time_ross_sample += (tw_now(lp) - s->last_buf_full);
s->last_buf_full = 0.0;
}
}
......@@ -1389,12 +1438,15 @@ static void packet_arrive_rc(terminal_state * s, tw_bf * bf, terminal_message *
N_finished_chunks--;
s->finished_chunks--;
s->fin_chunks_sample--;
s->fin_chunks_ross_sample--;
total_hops -= msg->my_N_hop;
s->total_hops -= msg->my_N_hop;
s->fin_hops_sample -= msg->my_N_hop;
s->fin_hops_ross_sample -= msg->my_N_hop;
dragonfly_total_time = msg->saved_total_time;
s->fin_chunks_time = msg->saved_sample_time;
s->fin_chunks_time_ross_sample = msg->saved_fin_chunks_ross;
s->total_time = msg->saved_avg_time;
struct qhash_head * hash_link = NULL;
......@@ -1431,6 +1483,7 @@ static void packet_arrive_rc(terminal_state * s, tw_bf * bf, terminal_message *
total_msg_sz -= msg->total_size;
s->total_msg_size -= msg->total_size;
s->data_size_sample -= msg->total_size;
s->data_size_ross_sample -= msg->total_size;
struct dfly_qhash_entry * d_entry_pop = rc_stack_pop(s->st);
qhash_add(s->rank_tbl, &key, &(d_entry_pop->hash_link));
......@@ -1552,6 +1605,7 @@ static void packet_arrive(terminal_state * s, tw_bf * bf, terminal_message * msg
s->finished_chunks++;
/* Finished chunks per sample */
s->fin_chunks_sample++;
s->fin_chunks_ross_sample++;
/* WE do not allow self messages through dragonfly */
assert(lp->gid != msg->src_terminal_id);
......@@ -1581,6 +1635,8 @@ static void packet_arrive(terminal_state * s, tw_bf * bf, terminal_message * msg
/* save the sample time */
msg->saved_sample_time = s->fin_chunks_time;
s->fin_chunks_time += (tw_now(lp) - msg->travel_start_time);
msg->saved_fin_chunks_ross = s->fin_chunks_time_ross_sample;
s->fin_chunks_time_ross_sample += (tw_now(lp) - msg->travel_start_time);
/* save the total time per LP */
msg->saved_avg_time = s->total_time;
......@@ -1591,6 +1647,7 @@ static void packet_arrive(terminal_state * s, tw_bf * bf, terminal_message * msg
total_hops += msg->my_N_hop;
s->total_hops += msg->my_N_hop;
s->fin_hops_sample += msg->my_N_hop;
s->fin_hops_ross_sample += msg->my_N_hop;
mn_stats* stat = model_net_find_stats(msg->category, s->dragonfly_stats_array);
msg->saved_rcv_time = stat->recv_time;
......@@ -1673,6 +1730,7 @@ static void packet_arrive(terminal_state * s, tw_bf * bf, terminal_message * msg
s->total_msg_size += msg->total_size;
s->finished_msgs++;
s->data_size_sample += msg->total_size;
s->data_size_ross_sample += msg->total_size;
if(tmp->remote_event_data && tmp->remote_event_size > 0) {
bf->c8 = 1;
......@@ -2874,11 +2932,13 @@ static void router_packet_send_rc(router_state * s,
{
s->link_traffic[output_port] -= cur_entry->msg.packet_size % s->params->chunk_size;
s->link_traffic_sample[output_port] -= cur_entry->msg.packet_size % s->params->chunk_size;
s->link_traffic_ross_sample[output_port] -= cur_entry->msg.packet_size % s->params->chunk_size;
}
if(bf->c12)
{
s->link_traffic[output_port] -= s->params->chunk_size;
s->link_traffic_sample[output_port] -= s->params->chunk_size;
s->link_traffic_ross_sample[output_port] -= s->params->chunk_size;
}
s->next_output_available_time[output_port] = msg->saved_available_time;
......@@ -3002,10 +3062,13 @@ router_packet_send( router_state * s,
s->params->chunk_size);
s->link_traffic_sample[output_port] += (cur_entry->msg.packet_size %
s->params->chunk_size);
s->link_traffic_ross_sample[output_port] += (cur_entry->msg.packet_size %
s->params->chunk_size);
} else {
bf->c12 = 1;
s->link_traffic[output_port] += s->params->chunk_size;
s->link_traffic_sample[output_port] += s->params->chunk_size;
s->link_traffic_ross_sample[output_port] += s->params->chunk_size;
}
if(routing == PROG_ADAPTIVE)
......@@ -3074,6 +3137,7 @@ static void router_buf_update_rc(router_state * s,
{
s->busy_time[indx] = msg->saved_rcv_time;
s->busy_time_sample[indx] = msg->saved_sample_time;
s->busy_time_ross_sample[indx] = msg->saved_busy_time_ross;
s->last_buf_full[indx] = msg->saved_busy_time;
}
if(bf->c1) {
......@@ -3103,8 +3167,10 @@ static void router_buf_update(router_state * s, tw_bf * bf, terminal_message * m
msg->saved_rcv_time = s->busy_time[indx];
msg->saved_busy_time = s->last_buf_full[indx];
msg->saved_sample_time = s->busy_time_sample[indx];
msg->saved_busy_time_ross = s->busy_time_ross_sample[indx];
s->busy_time[indx] += (tw_now(lp) - s->last_buf_full[indx]);
s->busy_time_sample[indx] += (tw_now(lp) - s->last_buf_full[indx]);
s->busy_time_ross_sample[indx] += (tw_now(lp) - s->last_buf_full[indx]);
s->last_buf_full[indx] = 0.0;
}
if(s->queued_msgs[indx][output_chan] != NULL) {
......@@ -3271,27 +3337,91 @@ void dragonfly_event_collect(terminal_message *m, tw_lp *lp, char *buffer, int *
memcpy(buffer, &type, sizeof(type));
}
st_trace_type dragonfly_trace_types[] = {
{(rbev_trace_f) dragonfly_event_collect,
sizeof(int),
(ev_trace_f) dragonfly_event_collect,
sizeof(int)},
{0}
};
/*
 * Model-level stats callback for dragonfly terminal LPs.
 *
 * Serializes the terminal's per-sample counters into buffer, back to back
 * and without padding, in this order:
 *   tw_lpid   terminal id
 *   long      finished chunks
 *   long      data size
 *   long      finished hops
 *   tw_stime  finished-chunk travel time
 *   tw_stime  busy time
 * Every counter is reset to zero right after it has been copied, so each
 * call reports only what accumulated since the previous call.
 *
 * s      - terminal state holding the *_ross_sample counters
 * lp     - unused
 * buffer - destination; the caller must provide room for all fields above
 */
void dragonfly_model_stat_collect(terminal_state *s, tw_lp *lp, char *buffer)
{
    char *out = buffer;   /* write cursor into the output record */

    const tw_lpid term_id = s->terminal_id;
    memcpy(out, &term_id, sizeof(term_id));
    out += sizeof(term_id);

    const long chunks = s->fin_chunks_ross_sample;
    memcpy(out, &chunks, sizeof(chunks));
    out += sizeof(chunks);
    s->fin_chunks_ross_sample = 0;

    const long data_size = s->data_size_ross_sample;
    memcpy(out, &data_size, sizeof(data_size));
    out += sizeof(data_size);
    s->data_size_ross_sample = 0;

    const long hops = s->fin_hops_ross_sample;
    memcpy(out, &hops, sizeof(hops));
    out += sizeof(hops);
    s->fin_hops_ross_sample = 0;

    const tw_stime chunk_time = s->fin_chunks_time_ross_sample;
    memcpy(out, &chunk_time, sizeof(chunk_time));
    out += sizeof(chunk_time);
    s->fin_chunks_time_ross_sample = 0;

    const tw_stime busy = s->busy_time_ross_sample;
    memcpy(out, &busy, sizeof(busy));
    s->busy_time_ross_sample = 0;
}
/*
 * Model-level stats callback for dragonfly router LPs.
 *
 * Writes the router id (as tw_lpid) into buffer, followed by one
 * (tw_stime busy time, int64_t link traffic) pair per port for all
 * s->params->radix ports, packed without padding. Each per-port sample is
 * zeroed once it has been copied out.
 *
 * s      - router state holding the per-port *_ross_sample arrays
 * lp     - unused
 * buffer - destination; the caller must provide room for the id plus
 *          radix pairs
 */
void dfly_router_model_stat_collect(router_state *s, tw_lp *lp, char *buffer)
{
    const dragonfly_param *params = s->params;
    char *out = buffer;   /* write cursor into the output record */

    const tw_lpid rtr_id = s->router_id;
    memcpy(out, &rtr_id, sizeof(rtr_id));
    out += sizeof(rtr_id);

    for (int port = 0; port < params->radix; port++)
    {
        const tw_stime busy = s->busy_time_ross_sample[port];
        memcpy(out, &busy, sizeof(busy));
        out += sizeof(busy);
        s->busy_time_ross_sample[port] = 0;

        const int64_t traffic = s->link_traffic_ross_sample[port];
        memcpy(out, &traffic, sizeof(traffic));
        out += sizeof(traffic);
        s->link_traffic_ross_sample[port] = 0;
    }
}
/* Returns the model-types entry used by dragonfly terminal LPs, i.e. the
 * first element of dragonfly_model_types. */
static const st_model_types *dragonfly_get_model_types(void)
{
    return dragonfly_model_types;
}
static const st_trace_type *dragonfly_get_trace_types(void)
static const st_model_types *dfly_router_get_model_types(void)
{
return(&dragonfly_trace_types[0]);
return(&dragonfly_model_types[1]);
}
static void dragonfly_register_trace(st_trace_type *base_type)
static void dragonfly_register_model_types(st_model_types *base_type)
{
trace_type_register(LP_CONFIG_NM_TERM, base_type);
st_model_type_register(LP_CONFIG_NM_TERM, base_type);
}
static void router_register_trace(st_trace_type *base_type)
static void router_register_model_types(st_model_types *base_type)
{
trace_type_register(LP_CONFIG_NM_ROUT, base_type);
st_model_type_register(LP_CONFIG_NM_ROUT, base_type);
}
/*** END of ROSS event tracing additions */
......@@ -3331,8 +3461,8 @@ struct model_net_method dragonfly_method =
.mn_sample_rc_fn = (void*)dragonfly_sample_rc_fn,
.mn_sample_init_fn = (void*)dragonfly_sample_init,
.mn_sample_fini_fn = (void*)dragonfly_sample_fin,
.mn_trace_register = dragonfly_register_trace,
.mn_get_trace_type = dragonfly_get_trace_types,
.mn_model_stat_register = dragonfly_register_model_types,
.mn_get_model_stat_types = dragonfly_get_model_types,
};
struct model_net_method dragonfly_router_method =
......@@ -3352,6 +3482,6 @@ struct model_net_method dragonfly_router_method =
.mn_sample_rc_fn = (void*)dragonfly_rsample_rc_fn,
.mn_sample_init_fn = (void*)dragonfly_rsample_init,
.mn_sample_fini_fn = (void*)dragonfly_rsample_fin,
.mn_trace_register = router_register_trace,
.mn_get_trace_type = dragonfly_get_trace_types,
.mn_model_stat_register = router_register_model_types,
.mn_get_model_stat_types = dfly_router_get_model_types,
};
......@@ -1811,6 +1811,7 @@ void switch_packet_receive_rc(switch_state * s,
s_arrive_r++;
#endif
int output_port = msg->saved_vc;
tw_rand_reverse_unif(lp->rng);
if(bf->c1)
{
tw_rand_reverse_unif(lp->rng);
......@@ -2636,10 +2637,13 @@ int ft_get_output_port( switch_state * s, tw_bf * bf, fattree_message * msg,
assert(end_port > start_port);
outport = start_port;
//outport = start_port;
// when occupancy is same, just choose random port
outport = tw_rand_integer(lp->rng, start_port, end_port-1);
int load = s->vc_occupancy[outport] + s->queued_length[outport];
if(load != 0) {
for(int port = start_port + 1; port < end_port; port++) {
//for(int port = start_port + 1; port < end_port; port++) {
for(int port = start_port; port < end_port; port++) {
if(s->vc_occupancy[port] + s->queued_length[port] < load) {
load = s->vc_occupancy[port] + s->queued_length[port];
outport = port;
......@@ -2968,23 +2972,27 @@ void fattree_event_collect(fattree_message *m, tw_lp *lp, char *buffer, int *col
memcpy(buffer, &type, sizeof(type));
}
st_trace_type fattree_trace_types[] = {
// TODO will need to separate fattree_method into one for terminal and one for switch
// in order to use the ROSS model stats collection
st_model_types fattree_model_types[] = {
{(rbev_trace_f) fattree_event_collect,
sizeof(int),
(ev_trace_f) fattree_event_collect,
sizeof(int)},
sizeof(int),
NULL,
0},
{0}
};
static const st_trace_type *fattree_get_trace_types(void)
static const st_model_types *fattree_get_model_stat_types(void)
{
return(&fattree_trace_types[0]);
return(&fattree_model_types[0]);
}
static void fattree_register_trace(st_trace_type *base_type)
static void fattree_register_model_stats(st_model_types *base_type)
{
trace_type_register(LP_CONFIG_NM, base_type);
trace_type_register("fattree_switch", &fattree_trace_types[0]);
st_model_type_register(LP_CONFIG_NM, base_type);
st_model_type_register("fattree_switch", &fattree_model_types[0]);
//trace_type_register("fattree_switch", base_type);
}
/*** END of ROSS event tracing additions */
......@@ -3018,7 +3026,7 @@ struct model_net_method fattree_method =
// .model_net_method_find_local_device = NULL,
.mn_collective_call = NULL,
.mn_collective_call_rc = NULL,
.mn_trace_register = fattree_register_trace,
.mn_get_trace_type = fattree_get_trace_types
.mn_model_stat_register = fattree_register_model_stats,
.mn_get_model_stat_types = fattree_get_model_stat_types
};
......@@ -53,7 +53,7 @@ typedef struct model_net_base_state {
// lp type and state of underlying model net method - cache here so we
// don't have to constantly look up
const tw_lptype *sub_type;
const st_trace_type *sub_trace_type;
const st_model_types *sub_model_type;
void *sub_state;
} model_net_base_state;
......@@ -139,10 +139,10 @@ void mn_event_collect(model_net_wrap_msg *m, tw_lp *lp, char *buffer, int *colle
break;
case MN_BASE_PASS:
sub_msg = ((char*)m)+msg_offsets[((model_net_base_state*)lp->cur_state)->net_id];
if (g_st_ev_trace == RB_TRACE)
(((model_net_base_state*)lp->cur_state)->sub_trace_type->rbev_trace)(sub_msg, lp, buffer, collect_flag);
if (g_st_ev_trace == RB_TRACE || g_st_ev_trace == COMMIT_TRACE)
(((model_net_base_state*)lp->cur_state)->sub_model_type->rbev_trace)(sub_msg, lp, buffer, collect_flag);
else if (g_st_ev_trace == FULL_TRACE)
(((model_net_base_state*)lp->cur_state)->sub_trace_type->ev_trace)(sub_msg, lp, buffer, collect_flag);
(((model_net_base_state*)lp->cur_state)->sub_model_type->ev_trace)(sub_msg, lp, buffer, collect_flag);
break;
default: // this shouldn't happen, but can help detect an issue
type = 9004;
......@@ -150,11 +150,22 @@ void mn_event_collect(model_net_wrap_msg *m, tw_lp *lp, char *buffer, int *colle
}
}
st_trace_type mn_trace_types = {
/*
 * Model-level stats callback for the model-net base LP.
 *
 * Forwards the request to the model_stat_fn of the underlying network
 * method, handing it that method's own state (s->sub_state) rather than the
 * base-LP state.
 *
 * A network method may register no model-stat callback (a NULL
 * model_stat_fn) or no model types at all; in that case nothing is written
 * to buffer instead of calling through a NULL pointer.
 *
 * s      - model-net base LP state
 * lp     - LP being sampled, passed through unchanged
 * buffer - destination buffer, passed through unchanged
 */
void mn_model_stat_collect(model_net_base_state *s, tw_lp *lp, char *buffer)
{
    const st_model_types *sub = s->sub_model_type;

    if (sub == NULL || sub->model_stat_fn == NULL)
        return;

    // need to call the model level stats collection fn
    (*sub->model_stat_fn)(s->sub_state, lp, buffer);
}