...
 
Commits (39)
......@@ -75,9 +75,17 @@ struct terminal_custom_message
model_net_event_return event_rc;
int is_pull;
uint32_t pull_size;
int path_type;
/* for reverse computation */
int path_type;
short num_rngs;
short num_cll;
int qos_index;
short last_saved_qos;
short qos_reset1;
short qos_reset2;
tw_stime saved_available_time;
tw_stime saved_avg_time;
tw_stime saved_rcv_time;
......
......@@ -83,6 +83,16 @@ struct terminal_plus_message
int is_pull;
uint32_t pull_size;
/* for counting reverse calls */
short num_rngs;
short num_cll;
/* qos related attributes */
int qos_index;
short last_saved_qos;
short qos_reset1;
short qos_reset2;
/* for reverse computation */
int path_type;
tw_stime saved_available_time;
......
......@@ -12,8 +12,10 @@ LPGROUPS
}
PARAMS
{
adaptive_threshold="8192";
# minimal-bias="1";
adaptive_threshold="131072";
num_qos_levels="2";
qos_bandwidth="30,70";
minimal-bias="1";
df-dally-vc = "1";
# packet size in the network
packet_size="4096";
......@@ -31,32 +33,32 @@ PARAMS
# number of groups in the network
num_groups="65";
# buffer size in bytes for local virtual channels
local_vc_size="32768";
local_vc_size="16384";
#buffer size in bytes for global virtual channels
global_vc_size="32768";
global_vc_size="16384";
#buffer size in bytes for compute node virtual channels
cn_vc_size="32768";
#bandwidth in GiB/s for local channels
local_bandwidth="25.0";
local_bandwidth="2.0";
# bandwidth in GiB/s for global channels
global_bandwidth="25.0";
global_bandwidth="2.0";
# bandwidth in GiB/s for compute node-router channels
cn_bandwidth="25.0";
cn_bandwidth="2.0";
# Number of row channels
num_row_chans="1";
# Number of column channels
num_col_chans="1";
# ROSS message size
message_size="640";
message_size="656";
# number of compute nodes connected to router, dictated by dragonfly config
# file
num_cns_per_router="8";
# number of global channels per router
num_global_channels="8";
# network config file for intra-group connections
intra-group-connections="/home/mubarak/codes-online/codes/src/network-workloads/conf/dragonfly-custom/dfdally_8k_intra";
intra-group-connections="../src/network-workloads/conf/dragonfly-custom/dfdally_8k_intra";
# network config file for inter-group connections
inter-group-connections="/home/mubarak/codes-online/codes/src/network-workloads/conf/dragonfly-custom/dfdally_8k_inter";
inter-group-connections="../src/network-workloads/conf/dragonfly-custom/dfdally_8k_inter";
# routing protocol to be used
routing="prog-adaptive";
}
......@@ -13,6 +13,9 @@ LPGROUPS
PARAMS
{
# packet size in the network
adaptive_threshold="131072";
num_qos_levels="2";
qos_bandwidth="30,70";
packet_size="4096";
modelnet_order=( "dragonfly_plus","dragonfly_plus_router" );
# scheduler options
......@@ -43,19 +46,19 @@ PARAMS
# bandwidth in GiB/s for compute node-router channels
cn_bandwidth="25.0";
# ROSS message size
message_size="624";
message_size="656";
# number of compute nodes connected to router, dictated by dragonfly config
# file
num_cns_per_router="16";
# number of global channels per router
num_global_connections="16";
# network config file for intra-group connections
intra-group-connections="/home/mmubarak/codes-online/codes/src/network-workloads/conf/dragonfly-custom/dfp_8k_intra";
intra-group-connections="/home/mubarak/codes-online/codes/src/network-workloads/conf/dragonfly-custom/dfp_8k_intra";
# network config file for inter-group connections
inter-group-connections="/home/mmubarak/codes-online/codes/src/network-workloads/conf/dragonfly-custom/dfp_8k_inter";
inter-group-connections="/home/mubarak/codes-online/codes/src/network-workloads/conf/dragonfly-custom/dfp_8k_inter";
# routing protocol to be used
routing="prog-adaptive";
# route scoring protocol to be used - options are 'alpha', 'beta', or 'delta'
route_scoring_metric="alpha";
route_scoring_metric="delta";
}
LPGROUPS
{
MODELNET_GRP
{
repetitions="168";
# name of this lp changes according to the model
nw-lp="6";
# these lp names will be the same for dragonfly-custom model
modelnet_dragonfly_plus="6";
modelnet_dragonfly_plus_router="1";
}
}
PARAMS
{
# packet size in the network
packet_size="4096";
# order of LPs, mapping for modelnet grp
modelnet_order=( "dragonfly_plus","dragonfly_plus_router" );
# scheduler options
modelnet_scheduler="fcfs";
# chunk size in the network (when chunk size = packet size, packets will not be divided into chunks)
chunk_size="4096";
# number of spine routers per group
num_router_spine="12";
# number of leaf routers per group
num_router_leaf="12";
# number of links connecting between group levels per router
num_level_chans="1";
# number of groups in the network
num_groups="7";
# buffer size in bytes for local virtual channels
local_vc_size="32768";
# buffer size in bytes for global virtual channels
global_vc_size="32768";
# buffer size in bytes for compute node virtual channels
cn_vc_size="32768";
# bandwidth in GiB/s for local channels
local_bandwidth="25.0";
# bandwidth in GiB/s for global channels
global_bandwidth="25.0";
# bandwidth in GiB/s for compute node-router channels
cn_bandwidth="25.0";
# ROSS message size
message_size="656";
# number of compute nodes connected to router, dictated by dragonfly config file
num_cns_per_router="12";
# number of global channels per router
num_global_connections="12";
# network config file for intra-group connections
intra-group-connections="../src/network-workloads/conf/dragonfly-plus/dfp_1k_intra";
# network config file for inter-group connections
inter-group-connections="../src/network-workloads/conf/dragonfly-plus/dfp_1k_inter";
# routing protocol to be used - 'minimal', 'non-minimal-spine', 'non-minimal-leaf', 'prog-adaptive'
routing="prog-adaptive";
# route scoring protocol to be used - options are 'alpha', 'beta', or 'delta' - 'gamma' has been deprecated
route_scoring_metric="delta";
# minimal route threshold before considering non-minimal paths
adaptive_threshold="131072"; # 4 x 32768
}
This diff is collapsed.
This diff is collapsed.
......@@ -108,6 +108,11 @@ static void handle_sched_next_rc(
tw_bf *b,
model_net_wrap_msg * m,
tw_lp * lp);
static void model_net_commit_event(
model_net_base_state * ns,
tw_bf *b,
model_net_wrap_msg * m,
tw_lp * lp);
/* ROSS function pointer table for this LP */
tw_lptype model_net_base_lp = {
......@@ -115,12 +120,23 @@ tw_lptype model_net_base_lp = {
(pre_run_f) NULL,
(event_f) model_net_base_event,
(revent_f) model_net_base_event_rc,
(commit_f) NULL,
(commit_f) model_net_commit_event,
(final_f) model_net_base_finalize,
(map_f) codes_mapping,
sizeof(model_net_base_state),
};
/* Commit handler for the model-net base LP.
 *
 * Only MN_BASE_PASS events are forwarded. For those, the network-specific
 * sub-message is located msg_offsets[ns->net_id] bytes past the start of the
 * wrapper message and handed to the sub-type's commit callback, if the
 * sub-type provides one. Every other event type is ignored.
 *
 * ns - base LP state (supplies net_id, sub_type and sub_state)
 * b  - bitfield passed through unchanged to the sub-type callback
 * m  - wrapper message being committed
 * lp - LP the event belongs to, passed through unchanged
 */
static void model_net_commit_event(model_net_base_state * ns, tw_bf *b, model_net_wrap_msg * m, tw_lp * lp)
{
    void * payload;

    /* only pass-through events carry a sub-message to commit */
    if(m->h.event_type != MN_BASE_PASS)
        return;

    /* the sub-type did not register a commit callback */
    if(ns->sub_type->commit == NULL)
        return;

    /* the sub-message lives inside the wrapper at a per-network byte offset */
    payload = ((char*)m) + msg_offsets[ns->net_id];
    ns->sub_type->commit(ns->sub_state, b, payload, lp);
}
/* setup for the ROSS event tracing
*/
void mn_event_collect(model_net_wrap_msg *m, tw_lp *lp, char *buffer, int *collect_flag)
......@@ -532,7 +548,7 @@ void model_net_base_event(
tw_lp * lp){
if(m->h.magic != model_net_base_magic)
printf("\n LP ID mismatched %llu ", lp->gid);
printf("\n LP ID mismatched %llu %d ", lp->gid);
assert(m->h.magic == model_net_base_magic);
......
......@@ -271,12 +271,12 @@ static model_net_event_return model_net_noop_event(
model_net_event_return num_rng_calls = 0;
tw_stime poffset = mn_in_sequence ? mn_msg_offset : 0.0;
tw_stime delay = codes_local_latency(sender);
num_rng_calls++; // rng call is in codes_local_latency
tw_stime sendTime = message_size * codes_cn_delay;
if (self_event_size && self_event != NULL) {
poffset += delay;
num_rng_calls++;
tw_event *e = tw_event_new(sender->gid, poffset+offset+sendTime, sender);
memcpy(tw_event_data(e), self_event, self_event_size);
tw_event_send(e);
......@@ -284,7 +284,6 @@ static model_net_event_return model_net_noop_event(
if (remote_event_size && remote_event != NULL) {
poffset += delay;
num_rng_calls++;
/* special case - in a "pull" event, the "remote" message is actually
* to self */
tw_event *e = tw_event_new(is_pull ? sender->gid : final_dest_lp,
......@@ -314,6 +313,7 @@ static model_net_event_return model_net_event_impl_base(
void const * self_event,
tw_lp *sender) {
if (remote_event_size + self_event_size + sizeof(model_net_wrap_msg)
> g_tw_msg_sz){
tw_error(TW_LOC, "Error: model_net trying to transmit an event of size "
......@@ -328,11 +328,12 @@ static model_net_event_return model_net_event_impl_base(
tw_lpid dest_mn_lp = model_net_find_local_device_mctx(net_id, recv_map_ctx,
final_dest_lp);
if (src_mn_lp == dest_mn_lp && message_size < (uint64_t)codes_node_eager_limit)
if ( src_mn_lp == dest_mn_lp && message_size < (uint64_t)codes_node_eager_limit)
{
return model_net_noop_event(final_dest_lp, is_pull, offset, message_size,
remote_event_size, remote_event, self_event_size, self_event,
sender);
}
tw_stime poffset = codes_local_latency(sender);
if (mn_in_sequence){
tw_stime tmp = mn_msg_offset;
......
......@@ -23,6 +23,7 @@
#include "lammps.h"
#include "nekbone_swm_user_code.h"
#include "nearest_neighbor_swm_user_code.h"
#include "all_to_one_swm_user_code.h"
#define ALLREDUCE_SHORT_MSG_SIZE 2048
......@@ -754,7 +755,6 @@ static void workload_caller(void * arg)
{
shared_context* sctx = static_cast<shared_context*>(arg);
//printf("\n workload name %s ", sctx->workload_name);
if(strcmp(sctx->workload_name, "lammps") == 0)
{
LAMMPS_SWM * lammps_swm = static_cast<LAMMPS_SWM*>(sctx->swm_obj);
......@@ -770,6 +770,11 @@ static void workload_caller(void * arg)
NearestNeighborSWMUserCode * nn_swm = static_cast<NearestNeighborSWMUserCode*>(sctx->swm_obj);
nn_swm->call();
}
else if(strcmp(sctx->workload_name, "incast") == 0 || strcmp(sctx->workload_name, "incast1") == 0 || strcmp(sctx->workload_name, "incast2") == 0)
{
AllToOneSWMUserCode * incast_swm = static_cast<AllToOneSWMUserCode*>(sctx->swm_obj);
incast_swm->call();
}
}
static int comm_online_workload_load(const char * params, int app_id, int rank)
{
......@@ -807,10 +812,21 @@ static int comm_online_workload_load(const char * params, int app_id, int rank)
{
path.append("/skeleton.json");
}
else if(strcmp(o_params->workload_name, "incast") == 0)
{
path.append("/incast.json");
}
else if(strcmp(o_params->workload_name, "incast1") == 0)
{
path.append("/incast1.json");
}
else if(strcmp(o_params->workload_name, "incast2") == 0)
{
path.append("/incast2.json");
}
else
tw_error(TW_LOC, "\n Undefined workload type %s ", o_params->workload_name);
//printf("\n path %s ", path.c_str());
try {
std::ifstream jsonFile(path.c_str());
boost::property_tree::json_parser::read_json(jsonFile, root);
......@@ -837,6 +853,11 @@ static int comm_online_workload_load(const char * params, int app_id, int rank)
NearestNeighborSWMUserCode * nn_swm = new NearestNeighborSWMUserCode(root, generic_ptrs);
my_ctx->sctx.swm_obj = (void*)nn_swm;
}
else if(strcmp(o_params->workload_name, "incast") == 0 || strcmp(o_params->workload_name, "incast1") == 0 || strcmp(o_params->workload_name, "incast2") == 0)
{
AllToOneSWMUserCode * incast_swm = new AllToOneSWMUserCode(root, generic_ptrs);
my_ctx->sctx.swm_obj = (void*)incast_swm;
}
if(global_prod_thread == NULL)
{
......