Commit 2533fc20 authored by Misbah Mubarak's avatar Misbah Mubarak
Browse files

Adding first pass of dragonfly custom model: routings (adaptive, progressive...

Adding first pass of dragonfly custom model: the routing algorithms (adaptive, progressive adaptive, minimal, non-minimal) should be functional with DF traces; the model still needs to be updated to make optimistic mode functional
parent ef14e173
/*
* Copyright (C) 2014 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*
*/
#ifndef DRAGONFLY_H
#define DRAGONFLY_H
#ifdef __cplusplus
extern "C" {
#endif
#include <ross.h>
typedef struct terminal_message terminal_message;
/*
 * Event payload used for both dragonfly compute nodes (terminals) and
 * routers; a single layout serves both kinds of LP.
 *
 * NOTE(review): this header only includes <ross.h>, yet the struct uses
 * CATEGORY_NAME_MAX, uint32_t and model_net_event_return. Presumably the
 * including file is expected to pull in the declaring headers first --
 * confirm, or add the includes here.
 */
struct terminal_message
{
/* magic number */
int magic;
/* flit travel start time */
tw_stime travel_start_time;
/* packet ID of the flit */
unsigned long long packet_ID;
/* event type of the flit */
short type;
/* category: comes from codes */
char category[CATEGORY_NAME_MAX];
/* store category hash in the event */
uint32_t category_hash;
/* final destination LP ID; this comes from codes and can be a server or any
 * other LP type */
tw_lpid final_dest_gid;
/* sending LP ID from CODES; can be a server or any other LP type */
tw_lpid sender_lp;
tw_lpid sender_mn_lp; // source modelnet id
/* destination terminal ID of the dragonfly */
tw_lpid dest_terminal_id;
/* source terminal ID of the dragonfly */
unsigned int src_terminal_id;
/* message originating router id. MM: Can we calculate it through
* sender_mn_lp??*/
unsigned int origin_router_id;
/* number of hops traversed by the packet */
short my_N_hop;
/* presumably separate local (intra-group) and global (inter-group) hop
 * counters -- TODO confirm against the routing code */
short my_l_hop, my_g_hop;
/* presumably the channel / virtual channel saved for rollback -- TODO confirm */
short saved_channel;
short saved_vc;
/* presumably a flag recording that the non-minimal leg of the route has been
 * completed -- TODO confirm */
short nonmin_done;
/* Intermediate LP ID from which this message is coming */
unsigned int intm_lp_id;
/* last hop of the message, can be a terminal, local router or global router */
short last_hop;
/* For routing */
/* presumably the intermediate router chosen for non-minimal routing -- TODO confirm */
int intm_rtr_id;
int saved_src_dest;
int saved_src_chan;
/* NOTE(review): names suggest chunk index within a packet, packet size,
 * message id and total message size; units are not visible here -- confirm */
uint32_t chunk_id;
uint32_t packet_size;
uint32_t message_id;
uint32_t total_size;
/* sizes in bytes (per the field names) of the remote and local events that
 * accompany this message */
int remote_event_size_bytes;
int local_event_size_bytes;
// For buffer message
short vc_index;
int output_chan;
model_net_event_return event_rc;
/* pull-request flag and size; presumably follows the model-net pull
 * semantics -- TODO confirm */
int is_pull;
uint32_t pull_size;
/* for reverse computation */
int path_type;
/* values stashed in the event, per the "saved_" prefix, so that a rollback
 * can restore them */
tw_stime saved_available_time;
tw_stime saved_avg_time;
tw_stime saved_rcv_time;
tw_stime saved_busy_time;
tw_stime saved_total_time;
tw_stime saved_sample_time;
tw_stime msg_start_time;
};
#ifdef __cplusplus
}
#endif
#endif /* end of include guard: DRAGONFLY_H */
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* End:
*
* vim: ft=c ts=8 sts=4 sw=4 expandtab
*/
......@@ -28,10 +28,8 @@ struct terminal_message
short type;
/* category: comes from codes */
char category[CATEGORY_NAME_MAX];
/* store category hash in the event */
uint32_t category_hash;
/* final destination LP ID, this comes from codes can be a server or any other LP type*/
tw_lpid final_dest_gid;
/*sending LP ID from CODES, can be a server or any other LP type */
......@@ -41,36 +39,37 @@ struct terminal_message
tw_lpid dest_terminal_id;
/* source terminal ID of the dragonfly */
unsigned int src_terminal_id;
/* local LP ID to calculate the radix of the sender node/router */
unsigned int local_id;
/* message originating router id */
/* message originating router id. MM: Can we calculate it through
* sender_mn_lp??*/
unsigned int origin_router_id;
/* number of hops traversed by the packet */
short my_N_hop;
short my_l_hop, my_g_hop;
short saved_channel;
short saved_vc;
short nonmin_done;
/* Intermediate LP ID from which this message is coming */
unsigned int intm_lp_id;
short new_vc;
short saved_vc;
/* last hop of the message, can be a terminal, local router or global router */
int last_hop;
short last_hop;
/* For routing */
int intm_rtr_id;
int intm_group_id;
uint64_t chunk_id;
uint64_t packet_size;
uint64_t message_id;
uint64_t total_size;
int saved_src_dest;
int saved_src_chan;
uint32_t chunk_id;
uint32_t packet_size;
uint32_t message_id;
uint32_t total_size;
int saved_remote_esize;
int remote_event_size_bytes;
int local_event_size_bytes;
// For buffer message
int vc_index;
int sender_radix;
int output_chan;
model_net_event_return event_rc;
int is_pull;
......@@ -83,20 +82,10 @@ struct terminal_message
tw_stime saved_rcv_time;
tw_stime saved_busy_time;
tw_stime saved_total_time;
tw_stime saved_hist_start_time;
tw_stime saved_sample_time;
tw_stime msg_start_time;
int saved_hist_num;
int saved_occupancy;
/* for reverse computation of a node's fan in*/
int saved_fan_nodes;
tw_lpid sender_svr;
/* LP ID of the sending node, has to be a network node in the dragonfly */
tw_lpid sender_node;
tw_lpid next_stop;
};
......
//////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2014, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory.
//
// Written by:
// Nikhil Jain <nikhil.jain@acm.org>
// Abhinav Bhatele <bhatele@llnl.gov>
// Peer-Timo Bremer <ptbremer@llnl.gov>
//
// LLNL-CODE-678961. All rights reserved.
//
// This file is part of Damselfly. For details, see:
// https://github.com/LLNL/damselfly
// Please also read the LICENSE file for our notice and the LGPL.
//////////////////////////////////////////////////////////////////////////////
#include "stdio.h"
#include "stdlib.h"
/*
 * Generates the two binary connection files for a dragonfly whose groups are
 * a num_rows x num_columns grid of routers.
 *
 * Usage: ./binary num_groups num_rows num_columns intra_file inter_file
 *
 * intra_file: records of three ints (source router, destination router,
 *             colour) written for the routers of group 0 only; colour 0
 *             ("green") links routers in the same row, colour 1 ("black")
 *             links routers in the same column.
 * inter_file: records of two ints (source router, destination router) for
 *             the global links; every record is written twice.
 *
 * The same information is echoed to stdout in text form.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the arguments are missing or a
 * file cannot be opened, written or closed.
 */
int main(int argc, char **argv) {
    /* All five arguments are dereferenced below, so refuse to run without them. */
    if (argc < 6) {
        fprintf(stderr,
                "Usage: %s num_groups num_rows num_columns intra_file inter_file\n",
                (argc > 0 && argv[0] != NULL) ? argv[0] : "binary");
        return EXIT_FAILURE;
    }

    const int g = atoi(argv[1]);
    const int r = atoi(argv[2]);
    const int c = atoi(argv[3]);

    /* The records are raw ints: open in binary mode so that no platform
     * performs newline translation on the data. */
    FILE *intra = fopen(argv[4], "wb");
    if (intra == NULL) {
        perror(argv[4]);
        return EXIT_FAILURE;
    }
    FILE *inter = fopen(argv[5], "wb");
    if (inter == NULL) {
        perror(argv[5]);
        fclose(intra);
        return EXIT_FAILURE;
    }

    int router = 0;
    const int num_globs = 4;
    const int green = 0;
    const int black = 1;
    const int blue = 2;

    printf("\n Num groups %d num_global_chans %d num_rows %d num_cols %d ", g, num_globs, r, c);

    for (int groups = 0; groups < g; groups++) {
        for (int rows = 0; rows < r; rows++) {
            const int offset = c * rows;
            /* One iteration per router of the current row. */
            for (int out_col = 0; out_col < c; out_col++) {
                const int group_size = r * c;
                const int myOff = router % group_size; /* router index inside its group */

                if (groups == 0) {
                    /* Intra-group links are emitted for group 0 only. */

                    /* Connect the router to every other router in its row. */
                    for (int cols = 0; cols < c; cols++) {
                        const int row_dest = offset + cols;
                        if (myOff != row_dest) {
                            fwrite(&router, sizeof(int), 1, intra);
                            fwrite(&row_dest, sizeof(int), 1, intra);
                            fwrite(&green, sizeof(int), 1, intra);
                            printf("\n INTRA Same row %d %d ", router, row_dest);
                        }
                    }
                    /* Connect the router to every other router in its column. */
                    for (int r_up = 0; r_up < r; r_up++) {
                        const int col_dest = (c * r_up) + (router % c);
                        if (myOff != col_dest) {
                            fwrite(&router, sizeof(int), 1, intra);
                            fwrite(&col_dest, sizeof(int), 1, intra);
                            fwrite(&black, sizeof(int), 1, intra);
                            printf("\n INTRA Same col %d %d ", router, col_dest);
                        }
                    }
                }

                /* Global connections: every router gets g / group_size links,
                 * and the first g % group_size routers of a group get one more. */
                int numLink = g / group_size;
                if (myOff < (g % group_size)) {
                    numLink++;
                }
                const int myG = router / group_size; /* group this router belongs to */

                for (int blues = 0; blues < numLink; blues++) {
                    const int dest_group = (blues * group_size) + myOff;
                    if (dest_group != myG) {
                        const int dest_router =
                            (dest_group * group_size) + (myG % group_size);
                        /* Each global link is recorded twice. */
                        for (int pair = 0; pair < 2; pair++) {
                            fwrite(&router, sizeof(int), 1, inter);
                            fwrite(&dest_router, sizeof(int), 1, inter);
                            printf("INTER %d %d %d \n", router, dest_router, blue);
                        }
                    }
                }
                router++;
            }
        }
    }

    int status = EXIT_SUCCESS;
    /* The stream error flag is sticky, so a single check covers every fwrite. */
    if (ferror(intra) || ferror(inter)) {
        fprintf(stderr, "error while writing the connection files\n");
        status = EXIT_FAILURE;
    }
    /* fclose flushes buffered data; a failure here means the file is incomplete. */
    if (fclose(intra) != 0) {
        perror(argv[4]);
        status = EXIT_FAILURE;
    }
    if (fclose(inter) != 0) {
        perror(argv[5]);
        status = EXIT_FAILURE;
    }
    return status;
}
#!/usr/bin/env python
##############################################################################
# Copyright (c) 2014, Lawrence Livermore National Security, LLC.
# Produced at the Lawrence Livermore National Laboratory.
#
# Written by:
# Nikhil Jain <nikhil.jain@acm.org>
# Abhinav Bhatele <bhatele@llnl.gov>
# Peer-Timo Bremer <ptbremer@llnl.gov>
#
# LLNL-CODE-678961. All rights reserved.
#
# This file is part of Damselfly. For details, see:
# https://github.com/LLNL/damselfly
# Please also read the LICENSE file for our notice and the LGPL.
##############################################################################
import sys
import re
import numpy as np
import struct
# Command line: argv[1] is the text file that is scanned for link descriptions,
# argv[2] and argv[3] are the binary intra-group and inter-group connection
# files that this script writes.
# NOTE(review): the print statements below are Python 2 syntax.
filename = sys.argv[1]
intracon = open(sys.argv[2], "wb")
intercon = open(sys.argv[3], "wb")
# Flatten a (group, row, col) triple into a single router index:
# 96 routers per group and 16 columns per row.
def router(group, row, col):
return group*96 + row*16 + col
# Per router-pair counters of black and blue links; 1440 presumably equals
# 15 groups * 96 routers -- TODO confirm.
# NOTE(review): the np.int alias was deprecated in NumPy 1.20 and removed in
# 1.24; newer NumPy needs plain int here.
numblack = np.zeros((1440,1440), dtype=np.int)
numblue = np.zeros((1440,1440), dtype=np.int)
# Each match yields seven captures: the source "(a:b:c)" triple, a word that
# the code below treats as the link colour, and the destination "(a:b:c)"
# triple. The triples are interpreted as (group:row:col).
with open(filename) as ofile:
matches = re.findall('c\d-\dc\ds\d+a0l\d+\((\d+):(\d):(\d+)\).(\w+).->.c\d-\dc\ds\d+a0l\d+\((\d+):(\d):(\d+)\)', ofile.read(), re.MULTILINE)
for match in matches:
srcgrp = int(match[0])
# Group ids above 12 are shifted down by one; presumably id 12 never occurs
# in the input, so this makes the group numbering contiguous -- TODO confirm.
if(srcgrp > 12):
srcgrp = srcgrp - 1
srcrow = int(match[1])
srccol = int(match[2])
srcrouter = router(srcgrp, srcrow, srccol)
color = match[3]
dstgrp = int(match[4])
# Same renumbering as for the source group.
if(dstgrp > 12):
dstgrp = dstgrp - 1
dstrow = int(match[5])
dstcol = int(match[6])
dstrouter = router(dstgrp, dstrow, dstcol)
# count number of black and blue links per router pair
if color == 'black':
numblack[srcrouter][dstrouter] += 1
if color == 'blue':
numblue[srcrouter][dstrouter] += 1
# Links whose source is in group 0: blue links go to the inter-con file as
# two ints, everything else to the intra-con file as three ints (source,
# destination, 0 for green / 1 for black).
if srcgrp == 0:
if color == 'blue':
# write to inter-con file
intercon.write(struct.pack('2i', srcrouter, dstrouter))
print 'BLUE', srcrouter, dstrouter
else:
# write to intra-con file
if color == 'green':
intracon.write(struct.pack('3i', srcrouter, dstrouter, 0))
print 'GREEN', srcrouter, dstrouter, 0
# NOTE(review): the counter was already incremented above for this link,
# so "< 4" lets through at most three black links per router pair --
# confirm that three (not four) is intended.
elif numblack[srcrouter][dstrouter] < 4:
intracon.write(struct.pack('3i', srcrouter, dstrouter, 1))
print 'BLACK', srcrouter, dstrouter, 1
else:
if color == 'blue':
# only write the inter-con file
intercon.write(struct.pack('2i', srcrouter, dstrouter))
print 'BLUE', srcrouter, dstrouter
# Dump every non-zero black-link count (debug output on stdout).
for i in range(0, 1440):
for j in range(0, 1440):
if(numblack[i][j] != 0):
print numblack[i][j],
print "\n"
# Dump every non-zero blue-link count (debug output on stdout).
for i in range(0, 1440):
for j in range(0, 1440):
if(numblue[i][j] != 0):
print numblue[i][j],
intracon.close()
intercon.close()
This diff is collapsed.
......@@ -87,6 +87,7 @@ nobase_include_HEADERS = \
codes/model-net-sched.h \
codes/model-net-inspect.h \
codes/net/dragonfly.h \
codes/net/dragonfly-custom.h \
codes/net/slimfly.h \
codes/net/loggp.h \
codes/net/simplenet-upd.h \
......
LPGROUPS
{
MODELNET_GRP
{
repetitions="1440";
nw-lp="1";
modelnet_dragonfly_custom="1";
modelnet_dragonfly_custom_router="1";
}
}
PARAMS
{
packet_size="512";
modelnet_order=( "dragonfly_custom","dragonfly_custom_router" );
# scheduler options
modelnet_scheduler="fcfs";
chunk_size="256";
# modelnet_scheduler="round-robin";
num_routers="96";
num_groups="15";
local_vc_size="2048";
global_vc_size="8192";
cn_vc_size="1024";
local_bandwidth="5.25";
global_bandwidth="4.7";
cn_bandwidth="5.25";
message_size="584";
num_cns_per_router="1";
num_global_channels="4";
intra-group-connections="intracray2";
inter-group-connections="intercray2";
routing="prog-adaptive";
}
......@@ -16,7 +16,7 @@
#include "codes/codes-jobmap.h"
/* turning on track lp will generate a lot of output messages */
#define MN_LP_NM "modelnet_dragonfly"
#define MN_LP_NM "modelnet_dragonfly_custom"
#define TRACK_LP -1
#define TRACE -1
......@@ -229,7 +229,7 @@ struct nw_message
double saved_recv_time;
double saved_wait_time;
double saved_delay;
int64_t saved_num_bytes;
int16_t saved_num_bytes;
struct codes_workload_op * saved_op;
} rc;
};
......
This diff is collapsed.
......@@ -23,16 +23,6 @@
#include "codes/rc-stack.h"
#define CREDIT_SIZE 8
#define MEAN_PROCESS 1.0
/* collective specific parameters */
#define TREE_DEGREE 4
#define LEVEL_DELAY 1000
#define DRAGONFLY_COLLECTIVE_DEBUG 0
#define NUM_COLLECTIVES 1
#define COLLECTIVE_COMPUTATION_DELAY 5700
#define DRAGONFLY_FAN_OUT_DELAY 20.0
#define WINDOW_LENGTH 0
#define DFLY_HASH_TABLE_SIZE 262144
// debugging parameters
......@@ -197,26 +187,6 @@ struct terminal_state
// Router-Router Intra-group sends and receives RR_LSEND, RR_LARRIVE
// Router-Router Inter-group sends and receives RR_GSEND, RR_GARRIVE
struct mn_stats dragonfly_stats_array[CATEGORY_MAX];
/* collective init time */
tw_stime collective_init_time;
/* node ID in the tree */
tw_lpid node_id;
/* messages sent & received in collectives may get interchanged several times so we have to save the
origin server information in the node's state */
tw_lpid origin_svr;
/* parent node ID of the current node */
tw_lpid parent_node_id;
/* array of children to be allocated in terminal_init*/
tw_lpid* children;
/* children of a node can be less than or equal to the tree degree */
int num_children;
short is_root;
short is_leaf;
struct rc_stack * st;
int issueIdle;
......@@ -271,10 +241,7 @@ enum event_t
T_BUFFER,
R_SEND,
R_ARRIVE,
R_BUFFER,
D_COLLECTIVE_INIT,
D_COLLECTIVE_FAN_IN,
D_COLLECTIVE_FAN_OUT
R_BUFFER
};
/* status of a virtual channel can be idle, active, allocated or wait for credit */
enum vc_status
......@@ -351,7 +318,6 @@ static short routing = MINIMAL;
static tw_stime dragonfly_total_time = 0;
static tw_stime dragonfly_max_latency = 0;
static tw_stime max_collective = 0;
static long long total_hops = 0;
......@@ -488,7 +454,7 @@ static void dragonfly_read_config(const char * anno, dragonfly_param *params){
p->num_routers);
}
p->num_vcs = 3;
p->num_vcs = 8;
rc = configuration_get_value_int(&config, "PARAMS", "local_vc_size", anno, &p->local_vc_size);
if(rc) {
......@@ -635,64 +601,6 @@ static void dragonfly_report_stats()
return;
}
/*
 * Positions a terminal inside the tree used for collective operations.
 *
 * The terminal's node id is derived from its codes mapping (repetition and
 * offset); its parent id, root/leaf flags and TREE_DEGREE child slots are
 * then filled in. Child slots beyond the last existing node are set to -1.
 *
 * s  - terminal state to initialise (s->children is allocated here)
 * lp - LP that owns the state; its gid is used for the mapping lookup
 */
void dragonfly_collective_init(terminal_state * s,
                               tw_lp * lp)
{
    int child_slot;
    int terminals_per_rep;
    int group_reps;

    // TODO: be annotation-aware
    codes_mapping_get_lp_info(lp->gid, lp_group_name, &mapping_grp_id, NULL,
            &mapping_type_id, NULL, &mapping_rep_id, &mapping_offset);
    terminals_per_rep = codes_mapping_get_lp_count(lp_group_name, 1,
            LP_CONFIG_NM_TERM, NULL, 1);
    group_reps = codes_mapping_get_group_reps(lp_group_name);

    s->node_id = (mapping_rep_id * terminals_per_rep) + mapping_offset;

    /* all LPs form one tree; node 0 is its root and has no parent */
    s->is_root = (s->node_id == 0) ? 1 : 0;
    if(s->is_root)
        s->parent_node_id = -1;
    else
        s->parent_node_id = (s->node_id - ((s->node_id - 1) % TREE_DEGREE)) / TREE_DEGREE;

    s->children = (tw_lpid*)malloc(TREE_DEGREE * sizeof(tw_lpid));

    /* assume the node is a leaf until a valid child is found */
    s->is_leaf = 1;
    s->num_children = 0;

    /* a child id is valid only if it is below the total number of nodes */
    const tw_lpid total_nodes = (tw_lpid)terminals_per_rep * (tw_lpid)group_reps;
    for(child_slot = 0; child_slot < TREE_DEGREE; ++child_slot)
    {
        tw_lpid candidate = (TREE_DEGREE * s->node_id) + child_slot + 1;

        if(candidate >= total_nodes)
        {
            /* no such node: mark the slot as empty */
            s->children[child_slot] = -1;
            continue;
        }

        s->children[child_slot] = candidate;
        s->num_children++;
        s->is_leaf = 0;
    }

#if DRAGONFLY_COLLECTIVE_DEBUG == 1
    printf("\n LP %ld parent node id ", s->node_id);

    for(child_slot = 0; child_slot < TREE_DEGREE; ++child_slot)
        printf(" child node ID %ld ", s->children[child_slot]);
    printf("\n");

    if(s->is_leaf)
        printf("\n LP %ld is leaf ", s->node_id);
#endif
}
/* initialize a dragonfly compute node terminal */
void
terminal_init( terminal_state * s,
......@@ -766,7 +674,6 @@ terminal_init( terminal_state * s,
s->in_send_loop = 0;
s->issueIdle = 0;
dragonfly_collective_init(s, lp);
return;
}
......@@ -1645,273 +1552,6 @@ void packet_arrive(terminal_state * s, tw_bf * bf, terminal_message * msg,
return;
}
/* collective operation for the dragonfly network */
void dragonfly_collective(char const * category, int message_size, int remote_event_size, const void* remote_event, tw_lp* sender)
{
tw_event * e_new;
tw_stime xfer_to_nic_time;
terminal_message * msg;
tw_lpid local_nic_id;
char* tmp_ptr;
codes_mapping_get_lp_info(sender->gid, lp_group_name, &mapping_grp_id,
NULL, &mapping_type_id, NULL, &mapping_rep_id, &mapping_offset);
codes_mapping_get_lp_id(lp_group_name, LP_CONFIG_NM_TERM, NULL, 1,
mapping_rep_id, mapping_offset, &local_nic_id);
xfer_to_nic_time = codes_local_latency(sender);
e_new = model_net_method_event_new(local_nic_id, xfer_to_nic_time,
sender, DRAGONFLY, (void**)&msg, (void**)&tmp_ptr);
msg->remote_event_size_bytes = message_size;
strcpy(msg->category, category);
msg->sender_svr=sender->gid;
msg->type = D_COLLECTIVE_INIT;
tmp_ptr = (char*)msg;
tmp_ptr += dragonfly_get_msg_sz();
if(remote_event_size > 0)
{
msg->remote_event_size_bytes = remote_event_size;