Commit b13d30f4 authored by Misbah Mubarak's avatar Misbah Mubarak
Browse files

Adding the torus network plugin with modelnet API, see...

Adding the torus network plugin with modelnet API, see model-net/README_MN_TEST.txt for instructions to run the test
parent 0f6e417d
......@@ -20,8 +20,8 @@ typedef struct torus_param torus_param;
enum NETWORKS
{
SIMPLENET,
DRAGONFLY,
TORUS
TORUS,
DRAGONFLY
};
/* structs for initializing a network/ specifying network parameters */
......@@ -42,9 +42,12 @@ struct dragonfly_param
struct torus_param
{
char* name;
int n_dim; /*Dimension of the torus network, 5-D, 7-D or any other*/
int n_dims; /*Dimension of the torus network, 5-D, 7-D or any other*/
int* dim_length; /*Length of each torus dimension*/
double link_bandwidth;/* bandwidth for each torus link */
int buffer_size; /* number of buffer slots for each vc in flits*/
int num_vc; /* number of virtual channels for each torus link */
float mean_process;/* mean process time for each flit */
};
/* NOTE: the following auxiliary functions are probably wrong; just leaving
* these examples from simplenet for reference purposes.
......
#ifndef INC_torus_h
#define INC_torus_h
#include <ross.h>
#include <assert.h>
#include "codes/codes_mapping.h"
#include "codes/codes.h"
#include "codes/model-net.h"
#include "codes/model-net-method.h"
#define CHUNK_SIZE 32
#define DEBUG 1
#define MEAN_INTERVAL 100
#define CATEGORY_NAME_MAX 16
#define MAX_NAME_LENGTH 256
#define TRACE -1
// Total number of nodes in torus, calculate in main
int N_nodes = 1;
double link_bandwidth;
int buffer_size;
int num_vc;
int n_dims;
int * dim_length;
int * factor;
int * half_length;
char grp_name[MAX_NAME_LENGTH], type_name[MAX_NAME_LENGTH];
int grp_id, lp_type_id, rep_id, offset;
typedef enum nodes_event_t nodes_event_t;
typedef struct nodes_state nodes_state;
typedef struct nodes_message nodes_message;
/* Issues a torus packet event call */
static void torus_packet_event(
char* category,
tw_lpid final_dest_lp,
int packet_size,
int remote_event_size,
const void* remote_event,
int self_event_size,
const void* self_event,
tw_lp *sender,
int is_last_pckt);
static void torus_packet_event_rc(tw_lp *sender);
static void torus_setup(const void* net_params);
static int torus_get_msg_sz(void);
static const tw_lptype* torus_get_lp_type(void);
/* data structure for torus statistics */
struct model_net_method torus_method =
{
.method_name = "torus",
.mn_setup = torus_setup,
.model_net_method_packet_event = torus_packet_event,
.model_net_method_packet_event_rc = torus_packet_event_rc,
.mn_get_lp_type = torus_get_lp_type,
.mn_get_msg_sz = torus_get_msg_sz,
};
enum nodes_event_t
{
GENERATE = 1,
ARRIVAL,
SEND,
CREDIT,
};
struct nodes_state
{
unsigned long long packet_counter;
tw_stime** next_link_available_time;
tw_stime** next_credit_available_time;
tw_stime** next_flit_generate_time;
int** buffer;
int* dim_position;
int* neighbour_minus_lpID;
int* neighbour_plus_lpID;
int source_dim;
int direction;
};
struct nodes_message
{
char category[CATEGORY_NAME_MAX];
tw_stime travel_start_time;
tw_stime saved_available_time;
unsigned long long packet_ID;
nodes_event_t type;
int saved_src_dim;
int saved_src_dir;
int* dest;
tw_lpid final_dest_gid;
tw_lpid dest_lp;
tw_lpid sender_lp;
int my_N_hop;
int source_dim;
int source_direction;
int next_stop;
int packet_size;
short chunk_id;
// for codes local and remote events
int local_event_size_bytes;
int remote_event_size_bytes;
};
tw_stime average_travel_time = 0;
tw_stime total_time = 0;
tw_stime max_latency = 0;
float head_delay=0.0;
float credit_delay = 0.0;
static unsigned long long N_finished_packets = 0;
static unsigned long long total_hops = 0;
// run time arguments
int num_packets;
int num_chunks;
int packet_offset = 0;
#endif
lib_LIBRARIES += src/libcodes-net.a
nobase_include_HEADERS = \
codes/model-net.h
codes/model-net.h \
codes/torus.h \
codes/model-net-method.h
src_libcodes_net_a_SOURCES = \
src/models/networks/model-net/model-net-method.h \
src/models/networks/model-net/model-net.c \
src/models/networks/model-net/simplenet-upd.c
src/models/networks/model-net/simplenet-upd.c \
src/models/networks/model-net/torus.c
......@@ -8,3 +8,5 @@ mpiexec -n 4 tests/modelnet-test --sync=2 tests/modelnet-test.conf
<or>
mpiexec -n 4 tests/modelnet-test --sync=3 tests/modelnet-test.conf
(1)- To run the modelnet test with the simplenet network plugin, use tests/modelnet-test.conf (default setting).
(2)- To run the modelnet test with the torus network plugin, use tests/modelnet-test-torus.conf file.
......@@ -7,19 +7,20 @@
#include <assert.h>
#include "codes/model-net.h"
#include "model-net-method.h"
#include "codes/model-net-method.h"
#define STR_SIZE 16
#define PROC_TIME 10.0
#define NUM_NETS 1
extern struct model_net_method simplenet_method;
extern struct model_net_method torus_method;
//extern struct dragonfly_method dragonfly_method;
//extern struct torus_method torus_method;
/* Global array initialization, terminated with a NULL entry */
static struct model_net_method* method_array[] =
{&simplenet_method, NULL};
{&simplenet_method, &torus_method, NULL};
int model_net_setup(char* name,
int packet_size,
......@@ -63,7 +64,7 @@ void model_net_event(
/*Determine the network name*/
if(net_id < 0 || net_id > NUM_NETS)
{
fprintf(stderr, "Error: undefined network ID %d (Available options 0 (simplenet), 1 (dragonfly) 2 (torus) ) \n", net_id);
fprintf(stderr, "Error: undefined network ID %d (Available options 0 (simplenet), 1 (torus) 2 (dragonfly) ) \n", net_id);
exit(-1);
}
......@@ -114,8 +115,8 @@ int model_net_set_params()
configuration_get_value_double(&config, "PARAMS", "net_startup_ns", &net_startup_ns);
configuration_get_value_double(&config, "PARAMS", "net_bw_mbps", &net_bw_mbps);
net_params.net_startup_ns = 1.5;
net_params.net_bw_mbps = 20000;
net_params.net_startup_ns = net_startup_ns;
net_params.net_bw_mbps = net_bw_mbps;
net_id = model_net_setup("simplenet", packet_size, (const void*)&net_params); /* Sets the network as simplenet and packet size 512 */
}
else if(strcmp("dragonfly", mn_name)==0)
......@@ -124,7 +125,61 @@ int model_net_set_params()
}
else if(strcmp("torus", mn_name)==0)
{
printf("\n not supported yet ");
torus_param net_params;
char dim_length[MAX_NAME_LENGTH];
int n_dims=0, buffer_size=0, num_vc=0, i=0;
double link_bandwidth=0;
configuration_get_value_int(&config, "PARAMS", "n_dims", &n_dims);
if(!n_dims)
{
n_dims = 4; /* a 4-D torus */
printf("\n Number of dimensions not specified, setting to %d ", n_dims);
}
configuration_get_value_double(&config, "PARAMS", "link_bandwidth", &link_bandwidth);
if(!link_bandwidth)
{
link_bandwidth = 2.0; /*default bg/q configuration */
printf("\n Link bandwidth not specified, setting to %lf ", link_bandwidth);
}
configuration_get_value_int(&config, "PARAMS", "buffer_size", &buffer_size);
if(!buffer_size)
{
buffer_size = 2048;
printf("\n Buffer size not specified, setting to %d ",buffer_size);
}
configuration_get_value_int(&config, "PARAMS", "num_vc", &num_vc);
if(!num_vc)
{
num_vc = 1; /*by default, we have one for taking packets, another for taking credit*/
printf("\n num_vc not specified, setting to %d ", num_vc);
}
configuration_get_value(&config, "PARAMS", "dim_length", dim_length, MAX_NAME_LENGTH);
char* token;
net_params.n_dims=n_dims;
net_params.num_vc=num_vc;
net_params.buffer_size=buffer_size;
net_params.link_bandwidth=link_bandwidth;
net_params.dim_length=malloc(n_dims*sizeof(int));
token = strtok(dim_length, ",");
while(token != NULL)
{
sscanf(token, "%d", &net_params.dim_length[i]);
if(!net_params.dim_length[i])
{
printf("\n Invalid torus dimension specified %d, exitting... ", net_params.dim_length[i]);
MPI_Finalize();
exit(-1);
}
i++;
token = strtok(NULL,",");
}
net_id = model_net_setup("torus", packet_size, (const void*)&net_params);
}
else
printf("\n Invalid network argument %s ", mn_name);
......@@ -203,6 +258,9 @@ void model_net_add_lp_type(int net_id)
lp_type_register("modelnet_simplenet", model_net_get_lp_type(net_id));
break;
case TORUS:
lp_type_register("modelnet_torus", model_net_get_lp_type(net_id));
break;
default:
{
printf("\n Invalid net_id specified ");
......
......@@ -10,7 +10,7 @@
#include "codes/lp-io.h"
#include "codes/jenkins-hash.h"
#include "model-net-method.h"
#include "codes/model-net-method.h"
#include "codes/model-net.h"
#include "codes/codes_mapping.h"
#include "codes/codes.h"
......
#include "codes/torus.h"
/* setup the torus model */
static void torus_setup(const void* net_params)
{
int i;
torus_param* t_param = (torus_param*)net_params;
n_dims = t_param->n_dims;
link_bandwidth = t_param->link_bandwidth;
buffer_size = t_param->buffer_size;
num_vc = t_param->num_vc;
head_delay = (1 / link_bandwidth) * CHUNK_SIZE;
credit_delay = (1 / link_bandwidth) * 8;
dim_length = malloc(n_dims * sizeof(int));
factor = malloc(n_dims * sizeof(int));
half_length = malloc(n_dims * sizeof(int));
for(i = 0; i < n_dims; i++)
dim_length[i] = t_param->dim_length[i]; /* TODO, read comma separated values from files */
}
static void torus_packet_event_rc(tw_lp *sender)
{
codes_local_latency_reverse(sender);
return;
}
/* torus mode packet event */
static void torus_packet_event(char* category, tw_lpid final_dest_lp, int packet_size, int remote_event_size, const void* remote_event, int self_event_size, const void* self_event, tw_lp *sender, int is_last_pckt)
{
tw_event * e_new;
tw_stime xfer_to_nic_time;
nodes_message * msg;
tw_lpid local_nic_id, dest_nic_id;
char* tmp_ptr;
char lp_type_name[MAX_NAME_LENGTH], lp_group_name[MAX_NAME_LENGTH];
int mapping_grp_id, mapping_rep_id, mapping_type_id, mapping_offset;
codes_mapping_get_lp_info(sender->gid, lp_group_name, &mapping_grp_id, &mapping_type_id, lp_type_name, &mapping_rep_id, &mapping_offset);
codes_mapping_get_lp_id("MODELNET_GRP", "modelnet_torus", mapping_rep_id, mapping_offset, &local_nic_id);
codes_mapping_get_lp_info(final_dest_lp, lp_group_name, &mapping_grp_id, &mapping_type_id, lp_type_name, &mapping_rep_id, &mapping_offset);
codes_mapping_get_lp_id("MODELNET_GRP", "modelnet_torus", mapping_rep_id, mapping_offset, &dest_nic_id);
/* TODO: Should send the packets in correct sequence. Currently the last packet is being sent first due to codes_local_latency offset. */
xfer_to_nic_time = 0.01 + codes_local_latency(sender); /* Throws an error of found last KP time > current event time otherwise */
e_new = codes_event_new(local_nic_id, xfer_to_nic_time, sender);
msg = tw_event_data(e_new);
strcpy(msg->category, category);
msg->final_dest_gid = final_dest_lp;
msg->dest_lp = dest_nic_id;
msg->sender_lp=sender->gid;
msg->packet_size = packet_size;
msg->remote_event_size_bytes = 0;
msg->local_event_size_bytes = 0;
msg->type = GENERATE;
if(is_last_pckt) /* Its the last packet so pass in remote event information*/
{
tmp_ptr = (char*)msg;
tmp_ptr += torus_get_msg_sz();
if(remote_event_size > 0)
{
msg->remote_event_size_bytes = remote_event_size;
memcpy(tmp_ptr, remote_event, remote_event_size);
tmp_ptr += remote_event_size;
}
if(self_event_size > 0)
{
msg->local_event_size_bytes = self_event_size;
memcpy(tmp_ptr, self_event, self_event_size);
tmp_ptr += self_event_size;
}
}
//printf("\n torus remote event %d local event %d last packet %d %lf ", msg->remote_event_size_bytes, msg->local_event_size_bytes, is_last_pckt, xfer_to_nic_time);
tw_event_send(e_new);
}
/*Initialize the torus model, this initialization part is borrowed from Ning's torus model */
static void torus_init( nodes_state * s,
tw_lp * lp )
{
int i, j;
int dim_N[ n_dims + 1 ];
codes_mapping_get_lp_info(lp->gid, grp_name, &grp_id, &lp_type_id, type_name, &rep_id, &offset);
dim_N[ 0 ]=rep_id + offset;
s->neighbour_minus_lpID = (int*)malloc(n_dims * sizeof(int));
s->neighbour_plus_lpID = (int*)malloc(n_dims * sizeof(int));
s->dim_position = (int*)malloc(n_dims * sizeof(int));
s->buffer = (int**)malloc(2*n_dims * sizeof(int*));
s->next_link_available_time = (tw_stime**)malloc(2*n_dims * sizeof(tw_stime*));
s->next_credit_available_time = (tw_stime**)malloc(2*n_dims * sizeof(tw_stime*));
s->next_flit_generate_time = (tw_stime**)malloc(2*n_dims*sizeof(tw_stime*));
for(i=0; i < 2*n_dims; i++)
{
s->buffer[i] = (int*)malloc(num_vc * sizeof(int));
s->next_link_available_time[i] = (tw_stime*)malloc(num_vc * sizeof(tw_stime));
s->next_credit_available_time[i] = (tw_stime*)malloc(num_vc * sizeof(tw_stime));
s->next_flit_generate_time[i] = (tw_stime*)malloc(num_vc * sizeof(tw_stime));
}
//printf("\n LP ID %d ", (int)lp->gid);
// calculate my torus co-ordinates
for ( i=0; i < n_dims; i++ )
{
s->dim_position[ i ] = dim_N[ i ]%dim_length[ i ];
//printf(" dim position %d ", s->dim_position[i]);
dim_N[ i + 1 ] = ( dim_N[ i ] - s->dim_position[ i ] )/dim_length[ i ];
half_length[ i ] = dim_length[ i ] / 2;
}
//printf("\n");
factor[ 0 ] = 1;
for ( i=1; i < n_dims; i++ )
{
factor[ i ] = 1;
for ( j = 0; j < i; j++ )
factor[ i ] *= dim_length[ j ];
}
int temp_dim_pos[ n_dims ];
for ( i = 0; i < n_dims; i++ )
temp_dim_pos[ i ] = s->dim_position[ i ];
tw_lpid neighbor_id;
// calculate minus neighbour's lpID
for ( j = 0; j < n_dims; j++ )
{
temp_dim_pos[ j ] = (s->dim_position[ j ] -1 + dim_length[ j ]) % dim_length[ j ];
s->neighbour_minus_lpID[ j ] = 0;
for ( i = 0; i < n_dims; i++ )
s->neighbour_minus_lpID[ j ] += factor[ i ] * temp_dim_pos[ i ];
codes_mapping_get_lp_id("MODELNET_GRP", "modelnet_torus", s->neighbour_minus_lpID[ j ], 0, &neighbor_id);
//printf("\n neighbor %d lp id %d ", (int)s->neighbour_minus_lpID[ j ], (int)neighbor_id);
temp_dim_pos[ j ] = s->dim_position[ j ];
}
// calculate plus neighbour's lpID
for ( j = 0; j < n_dims; j++ )
{
temp_dim_pos[ j ] = ( s->dim_position[ j ] + 1 + dim_length[ j ]) % dim_length[ j ];
s->neighbour_plus_lpID[ j ] = 0;
for ( i = 0; i < n_dims; i++ )
s->neighbour_plus_lpID[ j ] += factor[ i ] * temp_dim_pos[ i ];
codes_mapping_get_lp_id("MODELNET_GRP", "modelnet_torus", s->neighbour_plus_lpID[ j ], 0, &neighbor_id);
//printf("\n neighbor %d lp id %d ", (int)s->neighbour_plus_lpID[ j ], (int)neighbor_id);
temp_dim_pos[ j ] = s->dim_position[ j ];
}
//printf("\n");
for( j=0; j < 2 * n_dims; j++ )
{
for( i = 0; i < num_vc; i++ )
{
s->buffer[ j ][ i ] = 0;
s->next_link_available_time[ j ][ i ] = 0.0;
s->next_credit_available_time[j][i] = 0.0;
}
}
// record LP time
s->packet_counter = 0;
}
static int torus_get_msg_sz(void)
{
return sizeof(nodes_message);
}
/*Returns the next neighbor to which the packet should be routed by using DOR (Taken from Ning's code of the torus model)*/
static void dimension_order_routing( nodes_state * s,
tw_lpid * dst_lp,
int * dim,
int * dir )
{
int dim_N[n_dims],
dest[n_dims],
i,
dest_id=0;
codes_mapping_get_lp_info(*dst_lp, grp_name, &grp_id, &lp_type_id, type_name, &rep_id, &offset);
dim_N[ 0 ]=rep_id + offset;
// find destination dimensions using destination LP ID
for ( i = 0; i < n_dims; i++ )
{
dest[ i ] = dim_N[ i ] % dim_length[ i ];
dim_N[ i + 1 ] = ( dim_N[ i ] - dest[ i ] ) / dim_length[ i ];
}
for( i = 0; i < n_dims; i++ )
{
if ( s->dim_position[ i ] - dest[ i ] > half_length[ i ] )
{
dest_id = s->neighbour_plus_lpID[ i ];
*dim = i;
*dir = 1;
break;
}
if ( s->dim_position[ i ] - dest[ i ] < -half_length[ i ] )
{
dest_id = s->neighbour_minus_lpID[ i ];
*dim = i;
*dir = 0;
break;
}
if ( ( s->dim_position[ i ] - dest[ i ] <= half_length[ i ] ) && ( s->dim_position[ i ] - dest[ i ] > 0 ) )
{
dest_id = s->neighbour_minus_lpID[ i ];
*dim = i;
*dir = 0;
break;
}
if (( s->dim_position[ i ] - dest[ i ] >= -half_length[ i ] ) && ( s->dim_position[ i ] - dest[ i ] < 0) )
{
dest_id = s->neighbour_plus_lpID[ i ];
*dim = i;
*dir = 1;
break;
}
}
codes_mapping_get_lp_id("MODELNET_GRP", "modelnet_torus", dest_id, 0, dst_lp);
}
/*Generates a packet. If there are two buffer slots available, then the packet is
injected in the network. Else, the packet is placed in the injection queue */
static void packet_generate( nodes_state * s,
tw_bf * bf,
nodes_message * msg,
tw_lp * lp )
{
// printf("\n msg local event size %d remote event size %d ", msg->local_event_size_bytes, msg->remote_event_size_bytes);
int j, tmp_dir=-1, tmp_dim=-1;
tw_stime ts;
// event triggered when packet head is sent
tw_event * e_h;
nodes_message *m;
tw_lpid dst_lp = msg->dest_lp;
dimension_order_routing( s, &dst_lp, &tmp_dim, &tmp_dir );
msg->saved_src_dim = tmp_dim;
msg->saved_src_dir = tmp_dir;
//msg->saved_available_time = s->next_flit_generate_time[(2*tmp_dim) + tmp_dir][0];
msg->travel_start_time = tw_now(lp);
msg->packet_ID = lp->gid + g_tw_nlp * s->packet_counter;
num_chunks = msg->packet_size/CHUNK_SIZE;
s->packet_counter++;
if(msg->packet_ID == TRACE)
printf("\n packet generated %lld at lp %d dest %d final dest %d", msg->packet_ID, (int)lp->gid, (int)dst_lp, (int)msg->dest_lp);
for(j = 0; j < num_chunks; j++)
{
if(s->buffer[ tmp_dir + ( tmp_dim * 2 ) ][ 0 ] < buffer_size)
{
ts = j + tw_rand_exponential(lp->rng, MEAN_INTERVAL/200);
//s->next_flit_generate_time[(2*tmp_dim) + tmp_dir][0] = max(s->next_flit_generate_time[(2*tmp_dim) + tmp_dir][0], tw_now(lp));
//s->next_flit_generate_time[(2*tmp_dim) + tmp_dir][0] += ts;
//e_h = tw_event_new( lp->gid, s->next_flit_generate_time[(2*tmp_dim) + tmp_dir][0] - tw_now(lp), lp);
e_h = tw_event_new(lp->gid, ts, lp);
msg->source_direction = tmp_dir;
msg->source_dim = tmp_dim;
m = tw_event_data( e_h );
memcpy(m, msg, torus_get_msg_sz() + msg->local_event_size_bytes + msg->remote_event_size_bytes);
m->next_stop = dst_lp;
m->chunk_id = j;
// find destination dimensions using destination LP ID
m->my_N_hop = 0;
// Send the packet out
m->type = SEND;
m->source_direction = tmp_dir;
m->source_dim = tmp_dim;
tw_event_send(e_h);
}
else
{
printf("\n %d Packet queued in line increase buffer space, dir %d dim %d buffer space %d dest LP %d ", (int)lp->gid, tmp_dir, tmp_dim, s->buffer[ tmp_dir + ( tmp_dim * 2 ) ][ 0 ], (int)msg->dest_lp);
}
}
}
/*Sends a 8-byte credit back to the torus node LP that sent the message */
static void credit_send( nodes_state * s,
tw_bf * bf,
tw_lp * lp,
nodes_message * msg)
{
#if DEBUG
//if(lp->gid == TRACK_LP)