Commit 67dc477b authored by Misbah Mubarak's avatar Misbah Mubarak

Merge branch 'dragonfly-validation' into 'master'

Dragonfly validation

See merge request !26
parents ae514ea2 5735e4a2
......@@ -3,8 +3,10 @@ Nikhil Jain, Abhinav Bhatele (LLNL)
- Addition of direct scheme for setting up dragonfly network topology.
- Network configuration setup for custom dragonfly model.
- Topology generation scripts for custom dragonfly model.
- Bug fix for virtual channel deadlocks in custom dragonfly model.
- Bug reporter for CODES network models.
- Fat tree network setup and adaptive routing.
- Pending: Merging Express mesh model to master.
Jens Domke (U. of Dresden)
- Static routing in fat tree network model.
......
......@@ -22,6 +22,7 @@ extern "C" {
#include "model-net.h"
#include "model-net-sched.h"
#include "net/dragonfly.h"
#include "net/dragonfly-custom.h"
#include "net/slimfly.h"
#include "net/fattree.h"
#include "net/loggp.h"
......@@ -124,6 +125,7 @@ typedef struct model_net_wrap_msg {
union {
model_net_base_msg m_base; // base lp
terminal_message m_dfly; // dragonfly
terminal_custom_message m_custom_dfly; // dragonfly-custom
slim_terminal_message m_slim; // slimfly
fattree_message m_fat; // fattree
loggp_message m_loggp; // loggp
......
......@@ -29,12 +29,15 @@ int main(int argc, char **argv) {
int r = atoi(argv[2]);
int c = atoi(argv[3]);
int total_routers = g * r * c;
FILE *intra = fopen(argv[4], "wb");
FILE *inter = fopen(argv[5], "wb");
int router = 0;
int green = 0, black = 1;
int groups = 0;
printf("\n Rows %d Cols %d Groups %d ", r, c, g);
for(int rows = 0; rows < r; rows++) {
for(int cols = 0; cols < c; cols++) {
for(int cols1 = 0; cols1 < c; cols1++) {
......@@ -71,7 +74,7 @@ int main(int argc, char **argv) {
} else {
ndstg--;
}
int gsize = 2, gs = 16;
int gsize = 2, gs = c;
for(int row = 0; row < r; row++) {
int srcrB = srcg * r * c + row * c, srcr;
int dstrB = dstg * r * c + row * c, dstr;
......@@ -79,15 +82,19 @@ int main(int argc, char **argv) {
int dstB = (nsrcg % (gs/2)) * 2;
srcr = srcrB + srcB;
dstr = dstrB + dstB;
for(int r = 0; r < 2; r++) {
for(int block = 0; block < gsize; block++) {
fwrite(&srcr, sizeof(int), 1, inter);
fwrite(&dstr, sizeof(int), 1, inter);
printf("INTER %d %d\n", srcr, dstr);
if(srcr >= total_routers || dstr >= total_routers)
printf("\n connection between invalid routers src %d and dest %d ", srcr, dstr);
for(int r = 0; r < 2; r++) {
for(int block = 0; block < gsize; block++) {
fwrite(&srcr, sizeof(int), 1, inter);
fwrite(&dstr, sizeof(int), 1, inter);
printf("INTER %d %d srcg %d destg %d srcrb %d dstrB %d \n", srcr, dstr, srcg, dstg, srcrB, dstrB);
}
srcr++;
dstr++;
}
}
}
}
}
......
//////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2014, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory.
//
// Written by:
// Nikhil Jain <nikhil.jain@acm.org>
// Abhinav Bhatele <bhatele@llnl.gov>
// Peer-Timo Bremer <ptbremer@llnl.gov>
//
// LLNL-CODE-678961. All rights reserved.
//
// This file is part of Damselfly. For details, see:
// https://github.com/scalability-llnl/damselfly
// Please also read the LICENSE file for our notice and the LGPL.
//////////////////////////////////////////////////////////////////////////////
#include "stdio.h"
#include "stdlib.h"
//Usage ./binary num_groups num_rows num_columns cons_across_groups cons_in_row cons_in_col intra_file inter_file
/* Append one int to a binary topology file; terminate on a short write so a
 * truncated connection file is never silently produced. */
static void write_int(FILE *out, int value) {
    if(fwrite(&value, sizeof(int), 1, out) != 1) {
        perror("fwrite");
        exit(EXIT_FAILURE);
    }
}

/*
 * Generates the intra-group and inter-group connection files for a custom
 * dragonfly topology.
 *
 * Arguments (all required):
 *   argv[1] num_group           number of groups
 *   argv[2] num_rows            router rows per group
 *   argv[3] num_cols            router columns per group
 *   argv[4] cons_across_groups  links written per ordered pair of groups
 *   argv[5] cons_in_row         links written per router pair sharing a column
 *                               (used for the "black" connections below)
 *   argv[6] cons_in_col         links written per router pair sharing a row
 *                               (used for the "green" connections below)
 *   argv[7] intra_file          output: (src, dest, colour) int triples
 *   argv[8] inter_file          output: (src, dest) int pairs
 *
 * Every record is also echoed to stdout as an "INTRA"/"INTER" line.
 * Returns 0 on success; exits with a failure status on bad usage or I/O error.
 */
int main(int argc, char **argv) {
    /* argv[8] is read below, so all eight positional arguments must exist. */
    if(argc < 9) {
        fprintf(stderr, "Correct usage: %s <num_group> <num_rows> <num_cols> <cons_across_groups> <cons_in_row> <cons_in_col> <intra_file> <inter_file>\n",
                argc > 0 ? argv[0] : "binary");
        return EXIT_FAILURE;
    }

    int g = atoi(argv[1]);
    int r = atoi(argv[2]);
    int c = atoi(argv[3]);
    int g_p = atoi(argv[4]);
    int r_p = atoi(argv[5]);
    int c_p = atoi(argv[6]);

    /* routers_per_g is used as a modulus below, so it must be non-zero. */
    if(g <= 0 || r <= 0 || c <= 0 || g_p < 0 || r_p < 0 || c_p < 0) {
        fprintf(stderr, "Invalid topology parameters: groups, rows and columns must be positive and link counts non-negative\n");
        return EXIT_FAILURE;
    }

    int total_routers = g * r * c;
    int routers_per_g = r * c;

    FILE *intra = fopen(argv[7], "wb");
    if(intra == NULL) {
        perror(argv[7]);
        return EXIT_FAILURE;
    }
    FILE *inter = fopen(argv[8], "wb");
    if(inter == NULL) {
        perror(argv[8]);
        fclose(intra);
        return EXIT_FAILURE;
    }

    int router = 0;
    const int green = 0, black = 1;

    /* Intra-group links: each router connects to every other router in its
     * row (green) and to every other router in its column (black). */
    for(int rows = 0; rows < r; rows++) {
        for(int cols = 0; cols < c; cols++) {
            for(int cols1 = 0; cols1 < c; cols1++) {
                if(cols1 != cols) {
                    int dest = (rows * c) + cols1;
                    for(int link = 0; link < c_p; link++) {
                        write_int(intra, router);
                        write_int(intra, dest);
                        write_int(intra, green);
                        printf("INTRA %d %d %d\n", router, dest, green);
                    }
                }
            }
            for(int rows1 = 0; rows1 < r; rows1++) {
                if(rows1 != rows) {
                    int dest = (rows1 * c) + cols;
                    for(int link = 0; link < r_p; link++) {
                        write_int(intra, router);
                        write_int(intra, dest);
                        write_int(intra, black);
                        printf("INTRA %d %d %d\n", router, dest, black);
                    }
                }
            }
            router++;
        }
    }

    /* Inter-group links: for every ordered pair of distinct groups, write g_p
     * links whose endpoints are offset within each group by the peer group's
     * index (skipping the group's own slot), wrapping at routers_per_g. */
    for(int srcg = 0; srcg < g; srcg++) {
        for(int dstg = 0; dstg < g; dstg++) {
            if(srcg != dstg) {
                int nsrcg = srcg;
                int ndstg = dstg;
                if(srcg > dstg) {
                    nsrcg--;
                } else {
                    ndstg--;
                }
                int startSrc = ndstg * g_p;
                int startDst = nsrcg * g_p;
                for(int link = 0; link < g_p; link++) {
                    int srcrB = srcg * routers_per_g;
                    int dstrB = dstg * routers_per_g;
                    int srcr = srcrB + (startSrc + link) % routers_per_g;
                    int dstr = dstrB + (startDst + link) % routers_per_g;

                    if(srcr >= total_routers || dstr >= total_routers)
                        printf("\n connection between invalid routers src %d and dest %d ", srcr, dstr);

                    write_int(inter, srcr);
                    write_int(inter, dstr);
                    printf("INTER %d %d srcg %d destg %d\n", srcr, dstr, srcg, dstg);
                }
            }
        }
    }

    /* fclose flushes buffered data; a failure here means the file is incomplete. */
    int status = EXIT_SUCCESS;
    if(fclose(intra) != 0) {
        perror(argv[7]);
        status = EXIT_FAILURE;
    }
    if(fclose(inter) != 0) {
        perror(argv[8]);
        status = EXIT_FAILURE;
    }
    return status;
}
......@@ -49,9 +49,9 @@ PARAMS
# number of global channels per router
num_global_channels="10";
# network config file for intra-group connections
intra-group-connections="@abs_srcdir@/intra-custom-small";
intra-group-connections="@abs_srcdir@/intra-theta";
# network config file for inter-group connections
inter-group-connections="@abs_srcdir@/inter-custom-small";
inter-group-connections="@abs_srcdir@/inter-theta";
# routing protocol to be used
routing="prog-adaptive";
}
......@@ -2,11 +2,11 @@ LPGROUPS
{
MODELNET_GRP
{
repetitions="1600";
repetitions="1520";
# name of this lp changes according to the model
nw-lp="4";
nw-lp="8";
# these lp names will be the same for dragonfly-custom model
modelnet_dragonfly_custom="4";
modelnet_dragonfly_custom="8";
modelnet_dragonfly_custom_router="1";
}
}
......@@ -24,11 +24,11 @@ PARAMS
# number of routers within each group
# this is dictated by the dragonfly configuration files
# intra-group rows for routers
num_router_rows="4";
num_router_rows="1";
# intra-group columns for routers
num_router_cols="20";
num_router_cols="40";
# number of groups in the network
num_groups="20";
num_groups="38";
# buffer size in bytes for local virtual channels
local_vc_size="8192";
#buffer size in bytes for global virtual channels
......@@ -38,20 +38,24 @@ PARAMS
#bandwidth in GiB/s for local channels
local_bandwidth="5.25";
# bandwidth in GiB/s for global channels
global_bandwidth="18.75";
global_bandwidth="4.69";
# bandwidth in GiB/s for compute node-router channels
cn_bandwidth="8.0";
# Number of row channels
num_row_chans="2";
# Number of column channels
num_col_chans="1";
# ROSS message size
message_size="592";
# number of compute nodes connected to router, dictated by dragonfly config
# file
num_cns_per_router="4";
num_cns_per_router="8";
# number of global channels per router
num_global_channels="10";
num_global_channels="4";
# network config file for intra-group connections
intra-group-connections="../src/network-workloads/conf/dragonfly-custom/intra-custom";
intra-group-connections="/Users/mmubarak/Documents/software_development/codes/scripts/gen-cray-topo/intratest";
# network config file for inter-group connections
inter-group-connections="../src/network-workloads/conf/dragonfly-custom/inter-custom";
inter-group-connections="/Users/mmubarak/Documents/software_development/codes/scripts/gen-cray-topo/intertest";
# routing protocol to be used
routing="prog-adaptive";
}
......@@ -37,7 +37,7 @@ PARAMS
#bandwidth in GiB/s for local channels
local_bandwidth="5.25";
# bandwidth in GiB/s for global channels
global_bandwidth="18.75";
global_bandwidth="4.69";
# bandwidth in GiB/s for compute node-router channels
cn_bandwidth="16.0";
# ROSS message size
......@@ -46,7 +46,7 @@ PARAMS
# file
num_cns_per_router="4";
# number of global channels per router
num_global_channels="10";
num_global_channels="4";
# network config file for intra-group connections
intra-group-connections="../src/network-workloads/conf/dragonfly-custom/intra-theta";
# network config file for inter-group connections
......
......@@ -13,26 +13,42 @@
#include "codes/model-net.h"
#include "codes/rc-stack.h"
#include "codes/quicklist.h"
#include "codes/quickhash.h"
#include "codes/codes-jobmap.h"
/* turning on track lp will generate a lot of output messages */
#define MN_LP_NM "modelnet_dragonfly_custom"
#define CONTROL_MSG_SZ 64
#define TRACK_LP -1
#define TRACE -1
#define MAX_WAIT_REQS 512
#define CS_LP_DBG 0
#define EAGER_THRESHOLD 81920000
#define RANK_HASH_TABLE_SZ 2000
#define NOISE 3.0
#define NW_LP_NM "nw-lp"
#define lprintf(_fmt, ...) \
do {if (CS_LP_DBG) printf(_fmt, __VA_ARGS__);} while (0)
#define MAX_STATS 65536
#define PAYLOAD_SZ 1024
static int msg_size_hash_compare(
void *key, struct qhash_head *link);
int enable_msg_tracking = 0;
int unmatched = 0;
char workload_type[128];
char workload_file[8192];
char offset_file[8192];
static int wrkld_id;
static int num_net_traces = 0;
static int num_dumpi_traces = 0;
static int alloc_spec = 0;
static double self_overhead = 10.0;
static tw_stime self_overhead = 10.0;
static tw_stime mean_interval = 100000;
/* Doing LP IO*/
static char lp_io_dir[256] = {'\0'};
......@@ -41,15 +57,16 @@ static unsigned int lp_io_use_suffix = 0;
static int do_lp_io = 0;
/* variables for loading multiple applications */
/* Xu's additions start */
char workloads_conf_file[8192];
char alloc_file[8192];
int num_traces_of_job[5];
tw_stime soft_delay_mpi = 2500;
tw_stime nic_delay = 1000;
tw_stime copy_per_byte_eager = 0.55;
char file_name_of_job[5][8192];
struct codes_jobmap_ctx *jobmap_ctx;
struct codes_jobmap_params_list jobmap_p;
/* Xu's additions end */
/* Variables for Cortex Support */
/* Matthieu's additions start */
......@@ -66,9 +83,12 @@ typedef int32_t dumpi_req_id;
static int net_id = 0;
static float noise = 5.0;
static int num_net_lps = 0, num_mpi_lps = 0;
static int num_nw_lps = 0, num_mpi_lps = 0;
static int num_syn_clients;
FILE * workload_log = NULL;
FILE * msg_size_log = NULL;
FILE * workload_agg_log = NULL;
FILE * workload_meta_log = NULL;
......@@ -77,6 +97,9 @@ static uint64_t sample_bytes_written = 0;
long long num_bytes_sent=0;
long long num_bytes_recvd=0;
long long num_syn_bytes_sent = 0;
long long num_syn_bytes_recvd = 0;
double max_time = 0, max_comm_time = 0, max_wait_time = 0, max_send_time = 0, max_recv_time = 0;
double avg_time = 0, avg_comm_time = 0, avg_wait_time = 0, avg_send_time = 0, avg_recv_time = 0;
......@@ -103,6 +126,12 @@ enum MPI_NW_EVENTS
MPI_SEND_ARRIVED,
MPI_SEND_ARRIVED_CB, // for tracking message times on sender
MPI_SEND_POSTED,
MPI_REND_ARRIVED,
MPI_REND_ACK_ARRIVED,
CLI_BCKGND_FIN,
CLI_BCKGND_ARRIVE,
CLI_BCKGND_GEN,
CLI_NBR_FINISH,
};
struct mpi_workload_sample
......@@ -146,6 +175,15 @@ struct pending_waits
struct qlist_head ql;
};
struct msg_size_info
{
int64_t msg_size;
int num_msgs;
tw_stime agg_latency;
tw_stime avg_latency;
struct qhash_head * hash_link;
struct qlist_head ql;
};
typedef struct mpi_msgs_queue mpi_msgs_queue;
typedef struct completed_requests completed_requests;
typedef struct pending_waits pending_waits;
......@@ -159,6 +197,9 @@ struct nw_state
int app_id;
int local_rank;
int is_finished;
int neighbor_completed;
struct rc_stack * processed_ops;
struct rc_stack * matched_reqs;
......@@ -195,9 +236,18 @@ struct nw_state
/* Pending wait operation */
struct pending_waits * wait_op;
/* Message size latency information */
struct qhash_table * msg_sz_table;
struct qlist_head msg_sz_list;
/* quick hash for maintaining message latencies */
unsigned long num_bytes_sent;
unsigned long num_bytes_recvd;
unsigned long syn_data;
unsigned long gen_data;
/* For sampling data */
int sampling_indx;
int max_arr_size;
......@@ -243,9 +293,12 @@ struct nw_message
} rc;
};
static void send_ack_back(nw_state* s, tw_bf * bf, nw_message * m, tw_lp * lp, mpi_msgs_queue * mpi_op);
static void send_ack_back_rc(nw_state* s, tw_bf * bf, nw_message * m, tw_lp * lp);
/* executes MPI isend and send operations */
static void codes_exec_mpi_send(
nw_state* s, tw_bf * bf, nw_message * m, tw_lp* lp, struct codes_workload_op * mpi_op);
nw_state* s, tw_bf * bf, nw_message * m, tw_lp* lp, struct codes_workload_op * mpi_op, int is_rend);
/* execute MPI irecv operation */
static void codes_exec_mpi_recv(
nw_state* s, tw_bf * bf, nw_message * m, tw_lp * lp, struct codes_workload_op * mpi_op);
......@@ -293,6 +346,286 @@ static void update_message_time_rc(
/* conversion from seconds to nanoseconds */
static tw_stime s_to_ns(tw_stime ns);
/* Reverse-computation counterpart (by name) of update_message_size().
 * The body is intentionally empty: nothing is undone here.
 * NOTE(review): update_message_size() mutates the per-size table; confirm
 * whether that bookkeeping needs to be rolled back. */
static void update_message_size_rc(
        struct nw_state * ns,
        tw_lp * lp,
        tw_bf * bf,
        struct nw_message * m)
{
    /* No state to restore; mark the parameters as deliberately unused. */
    (void)ns;
    (void)lp;
    (void)bf;
    (void)m;
}
/* Record the latency of one message in the LP's per-message-size table.
 *
 * ns       LP state owning msg_sz_table (created lazily here) and msg_sz_list
 * lp       LP whose current time (tw_now) marks the end of the latency interval
 * bf       unused
 * m        message; m->fwd.sim_start_time is the start time when is_send != 0
 * qitem    queue entry supplying the size key (num_bytes) and, when
 *          is_send == 0, the start time (req_init_time)
 * is_eager unused
 * is_send  non-zero to measure from m->fwd.sim_start_time instead of
 *          qitem->req_init_time
 *
 * The first message of a given size allocates a new msg_size_info entry and
 * links it into both the hash table and msg_sz_list; later messages of the
 * same size update the existing entry's count, aggregate and average.
 * Aborts the process if the table or a new entry cannot be allocated.
 */
static void update_message_size(
        struct nw_state * ns,
        tw_lp * lp,
        tw_bf * bf,
        struct nw_message * m,
        mpi_msgs_queue * qitem,
        int is_eager,
        int is_send)
{
    (void)bf;
    (void)is_eager;

    struct qhash_head * hash_link = NULL;
    tw_stime msg_init_time = qitem->req_init_time;

    if(!ns->msg_sz_table)
    {
        ns->msg_sz_table = qhash_init(msg_size_hash_compare, quickhash_64bit_hash, RANK_HASH_TABLE_SZ);
        if(!ns->msg_sz_table)
        {
            fprintf(stderr, "update_message_size: unable to allocate message size table\n");
            abort();
        }
    }

    hash_link = qhash_search(ns->msg_sz_table, &(qitem->num_bytes));

    if(is_send)
        msg_init_time = m->fwd.sim_start_time;

    tw_stime latency = tw_now(lp) - msg_init_time;

    /* update hash table */
    if(!hash_link)
    {
        struct msg_size_info * msg_info = malloc(sizeof(struct msg_size_info));
        if(!msg_info)
        {
            fprintf(stderr, "update_message_size: unable to allocate message size entry\n");
            abort();
        }

        msg_info->msg_size = qitem->num_bytes;
        msg_info->num_msgs = 1;
        msg_info->agg_latency = latency;
        msg_info->avg_latency = msg_info->agg_latency;

        qhash_add(ns->msg_sz_table, &(msg_info->msg_size), &(msg_info->hash_link));
        qlist_add(&msg_info->ql, &ns->msg_sz_list);
    }
    else
    {
        struct msg_size_info * tmp = qhash_entry(hash_link, struct msg_size_info, hash_link);
        tmp->num_msgs++;
        tmp->agg_latency += latency;
        tmp->avg_latency = (tmp->agg_latency / tmp->num_msgs);
    }
}
/* Reverse handler for notify_background_traffic().
 *
 * The forward handler draws one random delay for every job other than the
 * caller's own, so the same number of draws must be rolled back here;
 * reversing a single draw leaves the RNG stream wrong whenever the number of
 * jobs differs from two. The loop mirrors the forward iteration exactly.
 * Assumes each tw_rand_exponential() call consumes one uniform draw, as the
 * other forward/reverse pairings in this file do.
 *
 * ns  LP state; nw_id identifies the caller's own job
 * lp  LP whose RNG stream is rewound
 * bf, m  unused
 */
static void notify_background_traffic_rc(
        struct nw_state * ns,
        tw_lp * lp,
        tw_bf * bf,
        struct nw_message * m)
{
    (void)bf;
    (void)m;

    struct codes_jobmap_id jid = codes_jobmap_to_local_id(ns->nw_id, jobmap_ctx);
    int num_jobs = codes_jobmap_get_num_jobs(jobmap_ctx);

    for(int other_id = 0; other_id < num_jobs; other_id++)
    {
        if(other_id == jid.job)
            continue;

        tw_rand_reverse_unif(lp->rng);
    }
}
/* Send a CLI_BCKGND_FIN event to every rank of every job other than the
 * caller's own.
 *
 * One random delay is drawn per other job and shared by all of that job's
 * ranks; the number of RNG draws therefore equals the number of other jobs.
 *
 * ns  LP state; nw_id identifies the caller's own job
 * lp  sending LP (source of the RNG stream and of the events)
 * bf, m  unused
 */
static void notify_background_traffic(
        struct nw_state * ns,
        tw_lp * lp,
        tw_bf * bf,
        struct nw_message * m)
{
    (void)bf;
    (void)m;

    struct codes_jobmap_id jid;
    jid = codes_jobmap_to_local_id(ns->nw_id, jobmap_ctx);

    int num_jobs = codes_jobmap_get_num_jobs(jobmap_ctx);

    for(int other_id = 0; other_id < num_jobs; other_id++)
    {
        if(other_id == jid.job)
            continue;

        struct codes_jobmap_id other_jid;
        other_jid.job = other_id;

        int num_other_ranks = codes_jobmap_get_num_ranks(other_id, jobmap_ctx);

        /* num_other_ranks is an int, so it is printed with %d */
        lprintf("\n Other ranks %d ", num_other_ranks);

        tw_stime ts = (1.1 * g_tw_lookahead) + tw_rand_exponential(lp->rng, mean_interval/10000);
        tw_lpid global_dest_id;

        for(int k = 0; k < num_other_ranks; k++)
        {
            other_jid.rank = k;
            int intm_dest_id = codes_jobmap_to_global_id(other_jid, jobmap_ctx);
            global_dest_id = codes_mapping_get_lpid_from_relative(intm_dest_id, NULL, NW_LP_NM, NULL, 0);

            tw_event * e;
            struct nw_message * m_new;
            e = tw_event_new(global_dest_id, ts, lp);
            m_new = tw_event_data(e);
            m_new->msg_type = CLI_BCKGND_FIN;
            tw_event_send(e);
        }
    }
}
/* Reverse handler for notify_neighbor(): undoes whichever branch the forward
 * handler recorded in the bitfield.
 *   c0 set -> the background-traffic notification ran; delegate its rollback
 *   c1 set -> a neighbor notification drew one random delay; rewind it
 */
static void notify_neighbor_rc(
        struct nw_state * ns,
        tw_lp * lp,
        tw_bf * bf,
        struct nw_message * m)
{
    if(bf->c0)
    {
        notify_background_traffic_rc(ns, lp, bf, m);
    }
    else if(bf->c1)
    {
        tw_rand_reverse_unif(lp->rng);
    }
}
/* Propagate workload completion.
 *
 * If this LP's local rank is num_dumpi_traces - 1 and both it and its
 * neighbor have completed, the background-traffic jobs are notified
 * (bf->c0 is set). Otherwise, if this LP has finished and either its
 * neighbor has completed or it is local rank 0, a CLI_NBR_FINISH event is
 * sent to local rank + 1 of the same application after a random delay
 * (bf->c1 is set). In all other cases nothing happens.
 */
static void notify_neighbor(
        struct nw_state * ns,
        tw_lp * lp,
        tw_bf * bf,
        struct nw_message * m)
{
    const int self_done = (ns->is_finished == 1);
    const int nbr_done = (ns->neighbor_completed == 1);

    if(ns->local_rank == num_dumpi_traces - 1 && self_done && nbr_done)
    {
        printf("\n All workloads completed, notifying background traffic ");
        bf->c0 = 1;
        notify_background_traffic(ns, lp, bf, m);
        return;
    }

    /* Nothing to forward unless we are done and the chain has reached us. */
    if(!self_done || !(nbr_done || ns->local_rank == 0))
        return;

    bf->c1 = 1;

    printf("\n Local rank %d notifying neighbor %d ", ns->local_rank, ns->local_rank+1);
    tw_stime delay = (1.1 * g_tw_lookahead) + tw_rand_exponential(lp->rng, mean_interval/10000);

    /* Address the next local rank within the same application. */
    struct codes_jobmap_id next_rank;
    next_rank.job = ns->app_id;
    next_rank.rank = ns->local_rank + 1;

    /* Send a notification to the neighbor about completion */
    int relative_id = codes_jobmap_to_global_id(next_rank, jobmap_ctx);
    tw_lpid dest_gid = codes_mapping_get_lpid_from_relative(relative_id, NULL, NW_LP_NM, NULL, 0);

    tw_event * ev = tw_event_new(dest_gid, delay, lp);
    struct nw_message * note = tw_event_data(ev);
    note->msg_type = CLI_NBR_FINISH;
    tw_event_send(ev);
}
/* Reverse handler for finish_bckgnd_traffic(): clears the finished flag that
 * the forward handler set. The remaining parameters are unused. */
void finish_bckgnd_traffic_rc(
        struct nw_state * ns,
        tw_bf * b,
        struct nw_message * msg,
        tw_lp * lp)
{
    (void)b;
    (void)msg;
    (void)lp;

    ns->is_finished = 0;
}
/* Mark this LP as finished (is_finished = 1); gen_synthetic_tr() checks the
 * flag and stops generating once it is set. Emits a debug line when
 * CS_LP_DBG is enabled.
 *
 * ns   LP state to update
 * lp   LP whose id and current time are logged
 * b, msg  unused
 */
void finish_bckgnd_traffic(
        struct nw_state * ns,
        tw_bf * b,
        struct nw_message * msg,
        tw_lp * lp)
{
    (void)b;
    (void)msg;

    ns->is_finished = 1;

    /* gen_data is unsigned long, so it is printed with %lu; the LP id is
     * cast so that it matches %llu whatever the underlying id type is. */
    lprintf("\n LP %llu completed sending data %lu completed at time %lf ", (unsigned long long)lp->gid, ns->gen_data, tw_now(lp));
}
/* Reverse handler for finish_nbr_wkld(): clears the neighbor-completed flag
 * and then rolls back the notification step through notify_neighbor_rc(). */
void finish_nbr_wkld_rc(
        struct nw_state * ns,
        tw_bf * b,
        struct nw_message * msg,
        tw_lp * lp)
{
    /* Undo the flag first, then whatever notify_neighbor() recorded in b. */
    ns->neighbor_completed = 0;

    notify_neighbor_rc(ns, lp, b, msg);
}
/* Record that the neighbor has completed its workload and pass the
 * completion on through notify_neighbor(). */
void finish_nbr_wkld(
        struct nw_state * ns,
        tw_bf * b,
        struct nw_message * msg,
        tw_lp * lp)
{
    printf("\n Workload completed, notifying neighbor ");

    /* The flag must be set before notify_neighbor() inspects it. */
    ns->neighbor_completed = 1;

    notify_neighbor(ns, lp, b, msg);
}
/* Reverse handler for gen_synthetic_tr().
 * When bf->c0 is set the forward handler returned early without doing
 * anything, so there is nothing to undo. Otherwise the model-net event is
 * reversed, the byte counters are restored and the two random draws made by
 * the forward handler (destination choice and next-event delay) are rewound. */
static void gen_synthetic_tr_rc(nw_state * s, tw_bf * bf, nw_message * m, tw_lp * lp)
{
    if(!bf->c0)
    {
        model_net_event_rc2(lp, &m->event_rc);

        s->gen_data -= PAYLOAD_SZ;
        num_syn_bytes_sent -= PAYLOAD_SZ;

        /* one draw per forward RNG call */
        for(int draw = 0; draw < 2; draw++)
            tw_rand_reverse_unif(lp->rng);
    }
}
/* Generate one synthetic (background) message and schedule the next one.
 *
 * If the LP is already finished, only bf->c0 is set and nothing is sent.
 * Otherwise a destination rank is drawn uniformly from the LP's own job
 * (bumped to the next rank, modulo job size, if the draw hits this LP), a
 * PAYLOAD_SZ-byte CLI_BCKGND_ARRIVE message is handed to model-net, the byte
 * counters are advanced, and a CLI_BCKGND_GEN self-event is scheduled
 * mean_interval plus exponential noise from now.
 */
static void gen_synthetic_tr(nw_state * s, tw_bf * bf, nw_message * m, tw_lp * lp)
{
    if(s->is_finished == 1)
    {
        bf->c0 = 1;
        return;
    }

    /* Locate this LP's job and pick a peer rank inside it. */
    struct codes_jobmap_id job_loc = codes_jobmap_to_local_id(s->nw_id, jobmap_ctx);
    int ranks_in_job = codes_jobmap_get_num_ranks(job_loc.job, jobmap_ctx);

    int target_rank = tw_rand_integer(lp->rng, 0, ranks_in_job - 1);
    if(target_rank == s->local_rank)
        target_rank = (s->local_rank + 1) % ranks_in_job;
    job_loc.rank = target_rank;

    /* Translate (job, rank) into the destination LP id. */
    int relative_id = codes_jobmap_to_global_id(job_loc, jobmap_ctx);
    tw_lpid dest_gid = codes_mapping_get_lpid_from_relative(relative_id, NULL, NW_LP_NM, NULL, 0);

    /* Payload delivered to the destination LP on arrival. */
    nw_message payload;
    payload.msg_type = CLI_BCKGND_ARRIVE;
    payload.fwd.sim_start_time = tw_now(lp);
    payload.fwd.dest_rank = target_rank;
    payload.fwd.num_bytes = PAYLOAD_SZ;
    payload.fwd.app_id = s->app_id;
    payload.fwd.src_rank = s->local_rank;

    m->event_rc = model_net_event(net_id, "synthetic-tr", dest_gid, PAYLOAD_SZ, 0.0,
            sizeof(nw_message), (const void*)&payload,
            0, NULL, lp);

    s->gen_data += PAYLOAD_SZ;
    num_syn_bytes_sent += PAYLOAD_SZ;

    /* New event after MEAN_INTERVAL */
    tw_stime next_gap = mean_interval + tw_rand_exponential(lp->rng, NOISE);
    tw_event * next_ev = tw_event_new(lp->gid, next_gap, lp);
    nw_message * next_msg = tw_event_data(next_ev);
    next_msg->msg_type = CLI_BCKGND_GEN;
    tw_event_send(next_ev);
}
void arrive_syn_tr_rc(nw_state * s, tw_bf * bf, nw_message * m, tw_lp * lp)
{
// printf("\n Data arrived %d total data %ld ", m->fwd.num_bytes, s->syn_data);
int data = m->fwd.num_bytes;