Commit 2533fc20 authored by Misbah Mubarak's avatar Misbah Mubarak
Browse files

Adding first pass of dragonfly custom model: routings (adaptive, progressive...

Adding the first pass of the dragonfly custom model. The routing algorithms (adaptive, progressive adaptive, minimal, non-minimal) should be functional with DF traces; the model still needs to be updated to make optimistic mode functional.
parent ef14e173
/*
* Copyright (C) 2014 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*
*/
#ifndef DRAGONFLY_H
#define DRAGONFLY_H
#ifdef __cplusplus
extern "C" {
#endif
#include <ross.h>
typedef struct terminal_message terminal_message;
/* Event payload of the dragonfly model. The same layout is used for events
 * delivered to compute-node (terminal) LPs and to router LPs.
 * NOTE(review): CATEGORY_NAME_MAX, uint32_t and model_net_event_return are
 * used below but this header only includes <ross.h> -- presumably the
 * including file pulls in the model-net headers first; confirm. */
struct terminal_message
{
/* magic number -- presumably checked by the receiving LP to validate the
 * event; confirm against the handlers */
int magic;
/* flit travel start time */
tw_stime travel_start_time;
/* packet ID of the flit */
unsigned long long packet_ID;
/* event type of the flit */
short type;
/* category: comes from codes */
char category[CATEGORY_NAME_MAX];
/* category hash, stored in the event */
uint32_t category_hash;
/* final destination LP ID; this comes from codes and can be a server or
 * any other LP type */
tw_lpid final_dest_gid;
/* sending LP ID from CODES; can be a server or any other LP type */
tw_lpid sender_lp;
tw_lpid sender_mn_lp; // source modelnet id
/* destination terminal ID of the dragonfly */
tw_lpid dest_terminal_id;
/* source terminal ID of the dragonfly.
 * NOTE(review): narrower type (unsigned int) than dest_terminal_id
 * (tw_lpid) -- confirm this asymmetry is intended */
unsigned int src_terminal_id;
/* message originating router id. MM: Can we calculate it through
 * sender_mn_lp?? */
unsigned int origin_router_id;
/* number of hops traversed by the packet */
short my_N_hop;
/* presumably the local (intra-group) and global (inter-group) parts of
 * the hop count -- TODO confirm */
short my_l_hop, my_g_hop;
/* saved channel / virtual channel; the saved_ prefix suggests these hold
 * prior values for reverse computation -- confirm */
short saved_channel;
short saved_vc;
/* presumably a flag set once the non-minimal leg of a route is complete
 * -- TODO confirm */
short nonmin_done;
/* Intermediate LP ID from which this message is coming */
unsigned int intm_lp_id;
/* last hop of the message, can be a terminal, local router or global router */
short last_hop;
/* For routing */
/* presumably the intermediate router chosen for non-minimal routing --
 * TODO confirm */
int intm_rtr_id;
int saved_src_dest;
int saved_src_chan;
/* chunk / packet / message bookkeeping; exact semantics are defined by the
 * model implementation, which is not visible from this header */
uint32_t chunk_id;
uint32_t packet_size;
uint32_t message_id;
uint32_t total_size;
/* sizes in bytes of the remote and local events, as the names state;
 * presumably the caller-supplied events attached to this message -- confirm */
int remote_event_size_bytes;
int local_event_size_bytes;
// For buffer message
short vc_index;
int output_chan;
model_net_event_return event_rc;
/* presumably pull-type message flag and the size of the pull -- confirm */
int is_pull;
uint32_t pull_size;
/* for reverse computation */
int path_type;
tw_stime saved_available_time;
tw_stime saved_avg_time;
tw_stime saved_rcv_time;
tw_stime saved_busy_time;
tw_stime saved_total_time;
tw_stime saved_sample_time;
tw_stime msg_start_time;
};
#ifdef __cplusplus
}
#endif
#endif /* end of include guard: DRAGONFLY_H */
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* End:
*
* vim: ft=c ts=8 sts=4 sw=4 expandtab
*/
......@@ -28,10 +28,8 @@ struct terminal_message
short type;
/* category: comes from codes */
char category[CATEGORY_NAME_MAX];
/* store category hash in the event */
uint32_t category_hash;
/* final destination LP ID, this comes from codes can be a server or any other LP type*/
tw_lpid final_dest_gid;
/*sending LP ID from CODES, can be a server or any other LP type */
......@@ -41,36 +39,37 @@ struct terminal_message
tw_lpid dest_terminal_id;
/* source terminal ID of the dragonfly */
unsigned int src_terminal_id;
/* local LP ID to calculate the radix of the sender node/router */
unsigned int local_id;
/* message originating router id */
/* message originating router id. MM: Can we calculate it through
* sender_mn_lp??*/
unsigned int origin_router_id;
/* number of hops traversed by the packet */
short my_N_hop;
short my_l_hop, my_g_hop;
short saved_channel;
short saved_vc;
short nonmin_done;
/* Intermediate LP ID from which this message is coming */
unsigned int intm_lp_id;
short new_vc;
short saved_vc;
/* last hop of the message, can be a terminal, local router or global router */
int last_hop;
short last_hop;
/* For routing */
int intm_group_id;
uint64_t chunk_id;
uint64_t packet_size;
uint64_t message_id;
uint64_t total_size;
int intm_rtr_id;
int intm_group_id;
int saved_src_dest;
int saved_src_chan;
uint32_t chunk_id;
uint32_t packet_size;
uint32_t message_id;
uint32_t total_size;
int saved_remote_esize;
int remote_event_size_bytes;
int local_event_size_bytes;
// For buffer message
int vc_index;
int sender_radix;
int output_chan;
model_net_event_return event_rc;
int is_pull;
......@@ -83,20 +82,10 @@ struct terminal_message
tw_stime saved_rcv_time;
tw_stime saved_busy_time;
tw_stime saved_total_time;
tw_stime saved_hist_start_time;
tw_stime saved_sample_time;
tw_stime msg_start_time;
int saved_hist_num;
int saved_occupancy;
/* for reverse computation of a node's fan in*/
int saved_fan_nodes;
tw_lpid sender_svr;
/* LP ID of the sending node, has to be a network node in the dragonfly */
tw_lpid sender_node;
tw_lpid next_stop;
};
......
//////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2014, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory.
//
// Written by:
// Nikhil Jain <nikhil.jain@acm.org>
// Abhinav Bhatele <bhatele@llnl.gov>
// Peer-Timo Bremer <ptbremer@llnl.gov>
//
// LLNL-CODE-678961. All rights reserved.
//
// This file is part of Damselfly. For details, see:
// https://github.com/LLNL/damselfly
// Please also read the LICENSE file for our notice and the LGPL.
//////////////////////////////////////////////////////////////////////////////
#include "stdio.h"
#include "stdlib.h"
#include <errno.h>
#include <limits.h>
/*
 * Usage: ./binary num_groups num_rows num_columns intra_file inter_file
 *
 * Generates two raw binary connection files for a dragonfly whose groups are
 * num_rows x num_columns grids of routers:
 *   - intra_file: records of three ints (source router, destination router,
 *     colour), written for the routers of group 0 only. Colour 0 (green)
 *     links a router to every other router in its row, colour 1 (black) to
 *     every other router in its column.
 *   - inter_file: records of two ints (source router, destination router)
 *     for the global links; every such record is written twice.
 * Every emitted link is also echoed to stdout.
 */

/* Parse a strictly positive decimal int from `text` into `*out`.
 * Returns 0 on success, -1 if the text is empty, has trailing characters,
 * is not positive or does not fit in an int. */
static int parse_positive_int(const char *text, int *out)
{
    char *end = NULL;
    long value;

    errno = 0;
    value = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE ||
        value <= 0 || value > INT_MAX) {
        return -1;
    }
    *out = (int)value;
    return 0;
}

/* Write `count` ints to `fp` as one raw record.
 * Returns 0 on success, -1 (after reporting on stderr) on a short write. */
static int write_record(FILE *fp, const int *fields, size_t count)
{
    if (fwrite(fields, sizeof fields[0], count, fp) != count) {
        fprintf(stderr, "error: short write to connection file\n");
        return -1;
    }
    return 0;
}

int main(int argc, char **argv) {
    const int green = 0;     /* same-row link colour */
    const int black = 1;     /* same-column link colour */
    const int blue = 2;      /* global link colour; echoed to stdout only */
    const int num_globs = 4; /* only reported in the banner line */
    int g = 0, r = 0, c = 0;
    int per_group = 0;       /* routers per group (r * c) */
    int router = 0;          /* running global router id */
    long long cells = 0;
    int status = EXIT_FAILURE;
    FILE *intra = NULL;
    FILE *inter = NULL;

    if (argc < 6) {
        fprintf(stderr,
                "Usage: %s num_groups num_rows num_columns intra_file inter_file\n",
                argc > 0 ? argv[0] : "binary");
        return EXIT_FAILURE;
    }
    if (parse_positive_int(argv[1], &g) != 0 ||
        parse_positive_int(argv[2], &r) != 0 ||
        parse_positive_int(argv[3], &c) != 0) {
        fprintf(stderr,
                "error: num_groups, num_rows and num_columns must be positive integers\n");
        return EXIT_FAILURE;
    }

    /* Conservative bound: the largest id computed below is smaller than
     * (g + r*c + 1) * r*c, so requiring that to fit keeps all int
     * arithmetic in range. */
    cells = (long long)r * c;
    if (cells > INT_MAX || ((long long)g + cells + 1) * cells > INT_MAX) {
        fprintf(stderr, "error: topology too large for int router ids\n");
        return EXIT_FAILURE;
    }
    per_group = (int)cells;

    /* The files hold raw ints, so open them in binary mode. */
    intra = fopen(argv[4], "wb");
    if (intra == NULL) {
        perror(argv[4]);
        goto done;
    }
    inter = fopen(argv[5], "wb");
    if (inter == NULL) {
        perror(argv[5]);
        goto done;
    }

    printf("\n Num groups %d num_global_chans %d num_rows %d num_cols %d ", g, num_globs, r, c);

    for (int group = 0; group < g; group++) {
        for (int row = 0; row < r; row++) {
            const int row_base = c * row;

            for (int col = 0; col < c; col++) {
                /* position of this router inside its group */
                const int local = router % per_group;

                if (group == 0) {
                    /* Intra-group links are emitted for group 0 only. */
                    for (int k = 0; k < c; k++) {
                        const int dest = row_base + k;
                        if (local != dest) {
                            const int rec[3] = { router, dest, green };
                            if (write_record(intra, rec, 3) != 0) {
                                goto done;
                            }
                            printf("\n INTRA Same row %d %d ", router, dest);
                        }
                    }
                    for (int k = 0; k < r; k++) {
                        const int dest = (c * k) + (router % c);
                        if (local != dest) {
                            const int rec[3] = { router, dest, black };
                            if (write_record(intra, rec, 3) != 0) {
                                goto done;
                            }
                            printf("\n INTRA Same col %d %d ", router, dest);
                        }
                    }
                }

                /* Global links: groups are dealt out to the routers of a
                 * group by offset; the first (g % per_group) offsets get one
                 * extra link when g is not a multiple of per_group. */
                int num_links = g / per_group;
                if (local < g % per_group) {
                    num_links++;
                }

                const int my_group = router / per_group;
                for (int k = 0; k < num_links; k++) {
                    const int dest_group = (k * per_group) + local;
                    if (dest_group != my_group) {
                        const int dest_router =
                            (dest_group * per_group) + (my_group % per_group);
                        const int rec[2] = { router, dest_router };

                        /* every global link is recorded twice */
                        for (int pair = 0; pair < 2; pair++) {
                            if (write_record(inter, rec, 2) != 0) {
                                goto done;
                            }
                            printf("INTER %d %d %d \n", router, dest_router, blue);
                        }
                    }
                }
                router++;
            }
        }
    }
    status = EXIT_SUCCESS;

done:
    /* fclose flushes buffered data, so a failure here means lost output. */
    if (intra != NULL && fclose(intra) != 0) {
        perror(argv[4]);
        status = EXIT_FAILURE;
    }
    if (inter != NULL && fclose(inter) != 0) {
        perror(argv[5]);
        status = EXIT_FAILURE;
    }
    return status;
}
#!/usr/bin/env python
##############################################################################
# Copyright (c) 2014, Lawrence Livermore National Security, LLC.
# Produced at the Lawrence Livermore National Laboratory.
#
# Written by:
# Nikhil Jain <nikhil.jain@acm.org>
# Abhinav Bhatele <bhatele@llnl.gov>
# Peer-Timo Bremer <ptbremer@llnl.gov>
#
# LLNL-CODE-678961. All rights reserved.
#
# This file is part of Damselfly. For details, see:
# https://github.com/LLNL/damselfly
# Please also read the LICENSE file for our notice and the LGPL.
##############################################################################
import sys
import re
import numpy as np
import struct
# Command line: argv[1] is the text file to parse; argv[2] and argv[3] are the
# intra-group and inter-group connection files, both created in binary mode.
filename = sys.argv[1]
intracon = open(sys.argv[2], "wb")
intercon = open(sys.argv[3], "wb")
def router(group, row, col, routers_per_group=96, routers_per_row=16):
    """Return the flat router id for a (group, row, col) coordinate.

    The id is ``group * routers_per_group + row * routers_per_row + col``.
    The defaults (96 routers per group, 16 routers per row) reproduce the
    values that used to be hard-coded, so existing three-argument calls are
    unaffected; pass other values to index a differently sized machine.
    """
    return group * routers_per_group + row * routers_per_row + col
# Link counters per router pair, indexed [source router][destination router].
# 1440 is the fixed matrix size; it equals 15 * 96 -- presumably 15 groups of
# 96 routers, TODO confirm against the machine the input describes.
numblack = np.zeros((1440,1440), dtype=np.int)
numblue = np.zeros((1440,1440), dtype=np.int)
# Every regex match yields seven captures: source (group:row:col), the link
# colour word, then destination (group:row:col).
with open(filename) as ofile:
matches = re.findall('c\d-\dc\ds\d+a0l\d+\((\d+):(\d):(\d+)\).(\w+).->.c\d-\dc\ds\d+a0l\d+\((\d+):(\d):(\d+)\)', ofile.read(), re.MULTILINE)
for match in matches:
# Group ids greater than 12 are decremented by one (presumably one group id
# is unused in the input so ids are compacted -- confirm).
srcgrp = int(match[0])
if(srcgrp > 12):
srcgrp = srcgrp - 1
srcrow = int(match[1])
srccol = int(match[2])
srcrouter = router(srcgrp, srcrow, srccol)
color = match[3]
# Same id compaction for the destination group.
dstgrp = int(match[4])
if(dstgrp > 12):
dstgrp = dstgrp - 1
dstrow = int(match[5])
dstcol = int(match[6])
dstrouter = router(dstgrp, dstrow, dstcol)
# count number of black and blue links per router pair
if color == 'black':
numblack[srcrouter][dstrouter] += 1
if color == 'blue':
numblue[srcrouter][dstrouter] += 1
# Output records are packed native ints: two per inter-group (blue) link,
# three per intra-group link with a trailing type of 0 (green) or 1 (black).
# Intra-group records appear to be written only for sources in group 0, and
# type-1 records only while the pair's black count is below 4.
# NOTE(review): the type-1 branch tests the count, not the colour -- confirm
# that no other colour words occur in the input.
if srcgrp == 0:
if color == 'blue':
# write to inter-con file
intercon.write(struct.pack('2i', srcrouter, dstrouter))
print 'BLUE', srcrouter, dstrouter
else:
# write to intra-con file
if color == 'green':
intracon.write(struct.pack('3i', srcrouter, dstrouter, 0))
print 'GREEN', srcrouter, dstrouter, 0
elif numblack[srcrouter][dstrouter] < 4:
intracon.write(struct.pack('3i', srcrouter, dstrouter, 1))
print 'BLACK', srcrouter, dstrouter, 1
else:
if color == 'blue':
# only write the inter-con file
intercon.write(struct.pack('2i', srcrouter, dstrouter))
print 'BLUE', srcrouter, dstrouter
# Dump every non-zero black-link count; the trailing comma keeps the values
# on one output line (Python 2 print statement).
for i in range(0, 1440):
for j in range(0, 1440):
if(numblack[i][j] != 0):
print numblack[i][j],
print "\n"
# Dump every non-zero blue-link count the same way.
for i in range(0, 1440):
for j in range(0, 1440):
if(numblue[i][j] != 0):
print numblue[i][j],
intracon.close()
intercon.close()
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -87,6 +87,7 @@ nobase_include_HEADERS = \
codes/model-net-sched.h \
codes/model-net-inspect.h \
codes/net/dragonfly.h \
codes/net/dragonfly-custom.h \
codes/net/slimfly.h \
codes/net/loggp.h \
codes/net/simplenet-upd.h \
......
LPGROUPS
{
MODELNET_GRP
{
# 1440 repetitions = num_routers (96) * num_groups (15) from PARAMS below;
# each repetition holds one workload LP, one terminal and one router.
repetitions="1440";
nw-lp="1";
modelnet_dragonfly_custom="1";
modelnet_dragonfly_custom_router="1";
}
}
PARAMS
{
packet_size="512";
# names correspond to the modelnet_* LP entries in MODELNET_GRP above
modelnet_order=( "dragonfly_custom","dragonfly_custom_router" );
# scheduler options
modelnet_scheduler="fcfs";
chunk_size="256";
# modelnet_scheduler="round-robin";
# routers per group and number of groups (96 * 15 = 1440 routers in total)
num_routers="96";
num_groups="15";
local_vc_size="2048";
global_vc_size="8192";
cn_vc_size="1024";
local_bandwidth="5.25";
global_bandwidth="4.7";
cn_bandwidth="5.25";
message_size="584";
num_cns_per_router="1";
num_global_channels="4";
# binary connection files describing the links inside a group and between
# groups
intra-group-connections="intracray2";
inter-group-connections="intercray2";
routing="prog-adaptive";
}
......@@ -16,7 +16,7 @@
#include "codes/codes-jobmap.h"
/* turning on track lp will generate a lot of output messages */
#define MN_LP_NM "modelnet_dragonfly"
#define MN_LP_NM "modelnet_dragonfly_custom"
#define TRACK_LP -1
#define TRACE -1
......@@ -229,7 +229,7 @@ struct nw_message
double saved_recv_time;
double saved_wait_time;
double saved_delay;
int64_t saved_num_bytes;
int16_t saved_num_bytes;
struct codes_workload_op * saved_op;
} rc;
};
......
......@@ -13,7 +13,7 @@
#include "codes/model-net.h"
#include "codes/model-net-method.h"
#include "codes/model-net-lp.h"
#include "codes/net/dragonfly.h"
#include "codes/net/dragonfly-custom.h"
#include "sys/file.h"
#include "codes/quickhash.h"
#include "codes/rc-stack.h"
......@@ -23,7 +23,7 @@
#define GREEN 0
#define BLACK 1
#define BLUE 2
#define DUMP_CONNECTIONS 1
#define DUMP_CONNECTIONS 0
using namespace std;
struct Link {
......@@ -55,13 +55,6 @@ struct InterGroupLink {
#define CREDIT_SIZE 8
#define MEAN_PROCESS 1.0
/* collective specific parameters */
#define TREE_DEGREE 4
#define LEVEL_DELAY 1000
#define DRAGONFLY_COLLECTIVE_DEBUG 0
#define NUM_COLLECTIVES 1
#define COLLECTIVE_COMPUTATION_DELAY 5700
#define DRAGONFLY_FAN_OUT_DELAY 20.0
#define WINDOW_LENGTH 0
#define DFLY_HASH_TABLE_SIZE 262144
......@@ -222,39 +215,12 @@ struct terminal_state
terminal_message_list **terminal_msgs;
terminal_message_list **terminal_msgs_tail;
int in_send_loop;
// Terminal generate, sends and arrival T_SEND, T_ARRIVAL, T_GENERATE
// Router-Router Intra-group sends and receives RR_LSEND, RR_LARRIVE
// Router-Router Inter-group sends and receives RR_GSEND, RR_GARRIVE
struct mn_stats dragonfly_stats_array[CATEGORY_MAX];
/* collective init time */
tw_stime collective_init_time;
/* node ID in the tree */
tw_lpid node_id;
/* messages sent & received in collectives may get interchanged several times so we have to save the
origin server information in the node's state */
tw_lpid origin_svr;
/* parent node ID of the current node */
tw_lpid parent_node_id;
/* array of children to be allocated in terminal_init*/
tw_lpid* children;
/* children of a node can be less than or equal to the tree degree */
int num_children;
short is_root;
short is_leaf;
struct rc_stack * st;
int issueIdle;
int terminal_length;
/* to maintain a count of child nodes that have fanned in at the parent during the collective
fan-in phase*/
int num_fan_nodes;
const char * anno;
const dragonfly_param *params;
......@@ -301,25 +267,15 @@ typedef enum event_t
R_SEND,
R_ARRIVE,
R_BUFFER,
D_COLLECTIVE_INIT,
D_COLLECTIVE_FAN_IN,
D_COLLECTIVE_FAN_OUT
} event_t;
/* status of a virtual channel can be idle, active, allocated or wait for credit */
enum vc_status
{
VC_IDLE,
VC_ACTIVE,
VC_ALLOC,
VC_CREDIT
};
/* whether the last hop of a packet was global, local or a terminal */
enum last_hop
{
GLOBAL,
LOCAL,
TERMINAL
TERMINAL,
ROOT
};
/* three forms of routing algorithms available, adaptive routing is not
......@@ -380,7 +336,6 @@ static short routing = MINIMAL;
static tw_stime dragonfly_total_time = 0;
static tw_stime dragonfly_max_latency = 0;
static tw_stime max_collective = 0;
static long long total_hops = 0;
......@@ -519,7 +474,6 @@ static void dragonfly_read_config(const char * anno, dragonfly_param *params){
p->num_routers);
}
p->num_vcs = 3;
rc = configuration_get_value_int(&config, "PARAMS", "local_vc_size", anno, &p->local_vc_size);
if(rc) {
......@@ -591,6 +545,11 @@ static void dragonfly_read_config(const char * anno, dragonfly_param *params){
routing = -1;
}
if(routing == PROG_ADAPTIVE)
p->num_vcs = 10;
else
p->num_vcs = 8;
rc = configuration_get_value_int(&config, "PARAMS", "num_groups", anno, &p->num_groups);
if(rc) {
printf("Number of groups not specified. Aborting");
......@@ -606,7 +565,7 @@ static void dragonfly_read_config(const char * anno, dragonfly_param *params){
printf("\n Number of global channels per router not specified, setting to %d ", p->num_routers/2);
p->num_global_channels = p->num_routers/2;
}
p->radix = p->num_cn + p->num_routers + p->num_global_channels;
p->radix = p->num_routers + p->num_global_channels + p->num_cn;
p->total_routers = p->num_groups * p->num_routers;
p->total_terminals = p->total_routers * p->num_cn;
......@@ -619,6 +578,9 @@ static void dragonfly_read_config(const char * anno, dragonfly_param *params){
tw_error(TW_LOC, "Intra group connections file not specified. Aborting");
}
FILE *groupFile = fopen(intraFile, "rb");
if(!groupFile)
tw_error(TW_LOC, "intra-group file not found ");
if(!myRank)
printf("Reading intra-group connectivity file: %s\n", intraFile);
......@@ -688,6 +650,8 @@ static void dragonfly_read_config(const char * anno, dragonfly_param *params){
for(; it != curMap.end(); it++) {
printf(" ( %d - ", it->first);
for(int l = 0; l < it->second.size(); l++) {
// offset is number of local connections
// type is blue or green according to Cray architecture
printf("%d,%d ", it->second[l].offset, it->second[l].type);
}
printf(")");
......@@ -702,8 +666,11 @@ static void dragonfly_read_config(const char * anno, dragonfly_param *params){
map< int, vector<bLink> > &curMap = interGroupLinks[a];
map< int, vector<bLink> >::iterator it = curMap.begin();
for(; it != curMap.end(); it++) {
// dest group ID
printf(" ( %d - ", it->first);
for(int l = 0; l < it->second.size(); l++) {
// dest is dest router ID
// offset is number of global connections
printf("%d,%d ", it->second[l].offset, it->second[l].dest);
}
printf(")");
......@@ -734,7 +701,7 @@ static void dragonfly_read_config(const char * anno, dragonfly_param *params){
p->cn_delay = bytes_to_ns(p->chunk_size, p->cn_bandwidth);
p->local_delay = bytes_to_ns(p->chunk_size, p->local_bandwidth);
p->global_delay = bytes_to_ns(p->chunk_size, p->global_bandwidth);
p->credit_delay = bytes_to_ns(8.0, p->local_bandwidth); //assume 8 bytes packet
p->credit_delay = bytes_to_ns(CREDIT_SIZE, p->local_bandwidth); //assume 8 bytes packet
}
void dragonfly_custom_configure(){
......@@ -791,63 +758,6 @@ void dragonfly_custom_report_stats()
return;
}
static void dragonfly_collective_init(terminal_state * s,
tw_lp * lp)