Commit 2533fc20 authored by Misbah Mubarak

Adding first pass of dragonfly custom model: routings (adaptive, progressive...

Adding first pass of the dragonfly custom model. The routing algorithms (adaptive, progressive adaptive, minimal, non-minimal) should be functional with DF traces; optimistic mode still needs to be updated before it is functional.
parent ef14e173
/*
* Copyright (C) 2014 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*
*/
#ifndef DRAGONFLY_H
#define DRAGONFLY_H
#ifdef __cplusplus
extern "C" {
#endif
#include <ross.h>
typedef struct terminal_message terminal_message;
/*
 * Event message used by both dragonfly compute nodes (terminals) and routers.
 *
 * NOTE(review): CATEGORY_NAME_MAX and model_net_event_return are not declared
 * in this header; presumably a CODES header included earlier provides them --
 * confirm the required include order.
 */
struct terminal_message
{
/* magic number */
int magic;
/* flit travel start time */
tw_stime travel_start_time;
/* packet ID of the flit */
unsigned long long packet_ID;
/* event type of the flit */
short type;
/* category: comes from CODES */
char category[CATEGORY_NAME_MAX];
/* store category hash in the event */
uint32_t category_hash;
/* final destination LP ID; this comes from CODES and can be a server or any other LP type */
tw_lpid final_dest_gid;
/* sending LP ID from CODES; can be a server or any other LP type */
tw_lpid sender_lp;
tw_lpid sender_mn_lp; // source modelnet id
/* destination terminal ID of the dragonfly */
tw_lpid dest_terminal_id;
/* source terminal ID of the dragonfly */
unsigned int src_terminal_id;
/* message originating router id. MM: Can we calculate it through
* sender_mn_lp?? */
unsigned int origin_router_id;
/* number of hops traversed by the packet */
short my_N_hop;
/* presumably separate hop counters for local and global channels -- TODO confirm */
short my_l_hop, my_g_hop;
/* presumably the channel / virtual channel saved for rollback -- TODO confirm */
short saved_channel;
short saved_vc;
/* presumably set once the non-minimal leg of the route is complete -- TODO confirm */
short nonmin_done;
/* Intermediate LP ID from which this message is coming */
unsigned int intm_lp_id;
/* last hop of the message, can be a terminal, local router or global router */
short last_hop;
/* For routing */
/* presumably the intermediate router chosen for non-minimal routing -- TODO confirm */
int intm_rtr_id;
int saved_src_dest;
int saved_src_chan;
/* chunk / packet / message bookkeeping; units of the size fields are not stated here */
uint32_t chunk_id;
uint32_t packet_size;
uint32_t message_id;
uint32_t total_size;
/* sizes, in bytes, of the remote and local events associated with this message */
int remote_event_size_bytes;
int local_event_size_bytes;
// For buffer message
short vc_index;
int output_chan;
model_net_event_return event_rc;
/* pull-mode flag and its associated size; exact semantics are defined by the callers */
int is_pull;
uint32_t pull_size;
/* for reverse computation */
int path_type;
tw_stime saved_available_time;
tw_stime saved_avg_time;
tw_stime saved_rcv_time;
tw_stime saved_busy_time;
tw_stime saved_total_time;
tw_stime saved_sample_time;
tw_stime msg_start_time;
};
#ifdef __cplusplus
}
#endif
#endif /* end of include guard: DRAGONFLY_H */
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* End:
*
* vim: ft=c ts=8 sts=4 sw=4 expandtab
*/
......@@ -28,10 +28,8 @@ struct terminal_message
short type;
/* category: comes from codes */
char category[CATEGORY_NAME_MAX];
/* store category hash in the event */
uint32_t category_hash;
/* final destination LP ID, this comes from codes can be a server or any other LP type*/
tw_lpid final_dest_gid;
/*sending LP ID from CODES, can be a server or any other LP type */
......@@ -41,36 +39,37 @@ struct terminal_message
tw_lpid dest_terminal_id;
/* source terminal ID of the dragonfly */
unsigned int src_terminal_id;
/* local LP ID to calculate the radix of the sender node/router */
unsigned int local_id;
/* message originating router id */
/* message originating router id. MM: Can we calculate it through
* sender_mn_lp??*/
unsigned int origin_router_id;
/* number of hops traversed by the packet */
short my_N_hop;
short my_l_hop, my_g_hop;
short saved_channel;
short saved_vc;
short nonmin_done;
/* Intermediate LP ID from which this message is coming */
unsigned int intm_lp_id;
short new_vc;
short saved_vc;
/* last hop of the message, can be a terminal, local router or global router */
int last_hop;
short last_hop;
/* For routing */
int intm_group_id;
uint64_t chunk_id;
uint64_t packet_size;
uint64_t message_id;
uint64_t total_size;
int intm_rtr_id;
int intm_group_id;
int saved_src_dest;
int saved_src_chan;
uint32_t chunk_id;
uint32_t packet_size;
uint32_t message_id;
uint32_t total_size;
int saved_remote_esize;
int remote_event_size_bytes;
int local_event_size_bytes;
// For buffer message
int vc_index;
int sender_radix;
int output_chan;
model_net_event_return event_rc;
int is_pull;
......@@ -83,20 +82,10 @@ struct terminal_message
tw_stime saved_rcv_time;
tw_stime saved_busy_time;
tw_stime saved_total_time;
tw_stime saved_hist_start_time;
tw_stime saved_sample_time;
tw_stime msg_start_time;
int saved_hist_num;
int saved_occupancy;
/* for reverse computation of a node's fan in*/
int saved_fan_nodes;
tw_lpid sender_svr;
/* LP ID of the sending node, has to be a network node in the dragonfly */
tw_lpid sender_node;
tw_lpid next_stop;
};
......
//////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2014, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory.
//
// Written by:
// Nikhil Jain <nikhil.jain@acm.org>
// Abhinav Bhatele <bhatele@llnl.gov>
// Peer-Timo Bremer <ptbremer@llnl.gov>
//
// LLNL-CODE-678961. All rights reserved.
//
// This file is part of Damselfly. For details, see:
// https://github.com/LLNL/damselfly
// Please also read the LICENSE file for our notice and the LGPL.
//////////////////////////////////////////////////////////////////////////////
#include "stdio.h"
#include "stdlib.h"
/*
 * Generates the binary intra-group and inter-group connection files for a
 * dragonfly network made of num_groups groups, each a num_rows x num_columns
 * grid of routers.
 *
 * Usage: ./binary num_groups num_rows num_columns intra_file inter_file
 *
 * intra_file receives (router, dest, colour) int triples, for group 0 only:
 *   colour 0 (green) links a router to the routers in its row,
 *   colour 1 (black) links a router to the routers in its column.
 * inter_file receives (router, dest) int pairs; every pair is written twice.
 * Each record written is also echoed to stdout.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the arguments are missing, an
 * output file cannot be opened, or writing/closing an output file fails.
 */
int main(int argc, char **argv) {
    /* All five positional arguments are required; without this check the
     * argv[] accesses below would read past the argument vector. */
    if (argc < 6) {
        fprintf(stderr,
                "Usage: %s num_groups num_rows num_columns intra_file inter_file\n",
                (argc > 0 && argv[0] != NULL) ? argv[0] : "binary");
        return EXIT_FAILURE;
    }

    int g = atoi(argv[1]);
    int r = atoi(argv[2]);
    int c = atoi(argv[3]);

    /* The files hold raw ints, so open them in binary mode; the streams are
     * never read back, so plain write mode is sufficient. */
    FILE *intra = fopen(argv[4], "wb");
    if (intra == NULL) {
        perror(argv[4]);
        return EXIT_FAILURE;
    }
    FILE *inter = fopen(argv[5], "wb");
    if (inter == NULL) {
        perror(argv[5]);
        fclose(intra);
        return EXIT_FAILURE;
    }

    int router = 0;
    int num_globs = 4;
    int green = 0;
    int black = 1;
    int blue = 2;
    /* Routers per group; only used inside the loops, where r > 0 and c > 0. */
    int per_group = r * c;

    printf("\n Num groups %d num_global_chans %d num_rows %d num_cols %d ", g, num_globs, r, c);

    for (int groups = 0; groups < g; groups++) {
        for (int rows = 0; rows < r; rows++) {
            int offset = c * rows;
            for (int out_col = 0; out_col < c; out_col++) {
                /* Position of the current router inside its own group. */
                int local = router % per_group;

                if (groups == 0) {
                    /* Intra-group links are emitted for group 0 only. */

                    /* Connect the router to every other router in its row. */
                    for (int cols = 0; cols < c; cols++) {
                        int dest = offset + cols;
                        if (local != dest) {
                            fwrite(&router, sizeof(int), 1, intra);
                            fwrite(&dest, sizeof(int), 1, intra);
                            fwrite(&green, sizeof(int), 1, intra);
                            printf("\n INTRA Same row %d %d ", router, dest);
                        }
                    }

                    /* Connect the router to every other router in its column. */
                    for (int r_up = 0; r_up < r; r_up++) {
                        int dest = (c * r_up) + (router % c);
                        if (local != dest) {
                            fwrite(&router, sizeof(int), 1, intra);
                            fwrite(&dest, sizeof(int), 1, intra);
                            fwrite(&black, sizeof(int), 1, intra);
                            printf("\n INTRA Same col %d %d ", router, dest);
                        }
                    }
                }

                /* Global connections: groups are dealt out across the routers
                 * of a group; the first (g % per_group) routers take one
                 * extra link when g is not a multiple of per_group. */
                int numLink = g / per_group;
                if (local < (g % per_group)) {
                    numLink++;
                }

                int myG = router / per_group;
                for (int blues = 0; blues < numLink; blues++) {
                    int dest_group = (blues * per_group) + local;
                    if (dest_group != myG) {
                        int dest = (dest_group * per_group) + (myG % per_group);
                        /* Every global link is recorded twice. */
                        for (int pair = 0; pair < 2; pair++) {
                            fwrite(&router, sizeof(int), 1, inter);
                            fwrite(&dest, sizeof(int), 1, inter);
                            printf("INTER %d %d %d \n", router, dest, blue);
                        }
                    }
                }
                router++;
            }
        }
    }

    /* Report short writes and flush failures instead of silently producing
     * truncated connection files. */
    int status = EXIT_SUCCESS;
    if (ferror(intra) || ferror(inter)) {
        fprintf(stderr, "error while writing connection files\n");
        status = EXIT_FAILURE;
    }
    if (fclose(intra) != 0) {
        perror(argv[4]);
        status = EXIT_FAILURE;
    }
    if (fclose(inter) != 0) {
        perror(argv[5]);
        status = EXIT_FAILURE;
    }
    return status;
}
#!/usr/bin/env python
##############################################################################
# Copyright (c) 2014, Lawrence Livermore National Security, LLC.
# Produced at the Lawrence Livermore National Laboratory.
#
# Written by:
# Nikhil Jain <nikhil.jain@acm.org>
# Abhinav Bhatele <bhatele@llnl.gov>
# Peer-Timo Bremer <ptbremer@llnl.gov>
#
# LLNL-CODE-678961. All rights reserved.
#
# This file is part of Damselfly. For details, see:
# https://github.com/LLNL/damselfly
# Please also read the LICENSE file for our notice and the LGPL.
##############################################################################
import sys
import re
import numpy as np
import struct
# Command line: <input file to parse> <intra-group output file> <inter-group output file>
filename = sys.argv[1]
# Both outputs are opened in binary mode; the records written to them later
# in this script are produced with struct.pack.
intracon = open(sys.argv[2], "wb")
intercon = open(sys.argv[3], "wb")
def router(group, row, col):
    """Return the flat router index for a (group, row, col) coordinate.

    The index is laid out with a stride of 96 per group and 16 per row,
    with the column as the final offset.
    """
    group_base = group * 96
    row_base = row * 16
    return group_base + row_base + col
# NOTE(review): indentation was lost in this copy of the script; the nesting
# described in the comments below is inferred from the if/else pairing and
# must be confirmed against the original file.
#
# Link counters indexed [source router][destination router].
# 1440 is presumably 15 groups x 96 routers per group (router() uses a
# stride of 96 per group) -- confirm.
numblack = np.zeros((1440,1440), dtype=np.int)
numblue = np.zeros((1440,1440), dtype=np.int)
with open(filename) as ofile:
# Each match is a 7-tuple:
# (src group, src row, src col, colour, dst group, dst row, dst col).
matches = re.findall('c\d-\dc\ds\d+a0l\d+\((\d+):(\d):(\d+)\).(\w+).->.c\d-\dc\ds\d+a0l\d+\((\d+):(\d):(\d+)\)', ofile.read(), re.MULTILINE)
for match in matches:
srcgrp = int(match[0])
# Group numbers above 12 are shifted down by one; presumably one group
# number is unused in the input numbering -- confirm.
if(srcgrp > 12):
srcgrp = srcgrp - 1
srcrow = int(match[1])
srccol = int(match[2])
srcrouter = router(srcgrp, srcrow, srccol)
color = match[3]
dstgrp = int(match[4])
# Same renumbering as for the source group.
if(dstgrp > 12):
dstgrp = dstgrp - 1
dstrow = int(match[5])
dstcol = int(match[6])
dstrouter = router(dstgrp, dstrow, dstcol)
# count number of black and blue links per router pair
if color == 'black':
numblack[srcrouter][dstrouter] += 1
if color == 'blue':
numblue[srcrouter][dstrouter] += 1
# Links whose source is in group 0 may go to either output file; links from
# any other group are only ever written to the inter-con file.
if srcgrp == 0:
if color == 'blue':
# write to inter-con file
intercon.write(struct.pack('2i', srcrouter, dstrouter))
print 'BLUE', srcrouter, dstrouter
else:
# write to intra-con file
if color == 'green':
intracon.write(struct.pack('3i', srcrouter, dstrouter, 0))
print 'GREEN', srcrouter, dstrouter, 0
# NOTE(review): for a black link the counter was already incremented above,
# so this test passes for at most three black links per router pair --
# confirm whether four were intended.
elif numblack[srcrouter][dstrouter] < 4:
intracon.write(struct.pack('3i', srcrouter, dstrouter, 1))
print 'BLACK', srcrouter, dstrouter, 1
else:
if color == 'blue':
# only write the inter-con file
intercon.write(struct.pack('2i', srcrouter, dstrouter))
print 'BLUE', srcrouter, dstrouter
# Dump every non-zero black-link count; the trailing comma keeps them on one line.
for i in range(0, 1440):
for j in range(0, 1440):
if(numblack[i][j] != 0):
print numblack[i][j],
print "\n"
# Dump every non-zero blue-link count in the same way.
for i in range(0, 1440):
for j in range(0, 1440):
if(numblue[i][j] != 0):
print numblue[i][j],
intracon.close()
intercon.close()
This diff is collapsed.
......@@ -87,6 +87,7 @@ nobase_include_HEADERS = \
codes/model-net-sched.h \
codes/model-net-inspect.h \
codes/net/dragonfly.h \
codes/net/dragonfly-custom.h \
codes/net/slimfly.h \
codes/net/loggp.h \
codes/net/simplenet-upd.h \
......
# LP layout: one group, MODELNET_GRP, declared with 1440 repetitions and a
# count of 1 for each of nw-lp, modelnet_dragonfly_custom and
# modelnet_dragonfly_custom_router.
# NOTE(review): 1440 equals num_routers (96) x num_groups (15) from PARAMS;
# presumably these must be kept in sync -- confirm.
LPGROUPS
{
MODELNET_GRP
{
repetitions="1440";
nw-lp="1";
modelnet_dragonfly_custom="1";
modelnet_dragonfly_custom_router="1";
}
}
# Network model parameters for the dragonfly custom model.
# NOTE(review): this file does not state the units of the size and bandwidth
# values -- confirm against the model-net documentation before changing them.
PARAMS
{
packet_size="512";
# order of the model-net LP types: terminal model first, then its router
modelnet_order=( "dragonfly_custom","dragonfly_custom_router" );
# scheduler options
modelnet_scheduler="fcfs";
chunk_size="256";
# modelnet_scheduler="round-robin";
# topology: 96 routers and 15 groups (96 x 15 = 1440, the repetitions value in LPGROUPS)
num_routers="96";
num_groups="15";
# buffer sizes for local, global and compute-node virtual channels
local_vc_size="2048";
global_vc_size="8192";
cn_vc_size="1024";
# bandwidths of local, global and compute-node links
local_bandwidth="5.25";
global_bandwidth="4.7";
cn_bandwidth="5.25";
message_size="584";
num_cns_per_router="1";
num_global_channels="4";
# presumably the names of the intra-group and inter-group connection files
# read by the model -- confirm how the paths are resolved
intra-group-connections="intracray2";
inter-group-connections="intercray2";
routing="prog-adaptive";
}
......@@ -16,7 +16,7 @@
#include "codes/codes-jobmap.h"
/* turning on track lp will generate a lot of output messages */
#define MN_LP_NM "modelnet_dragonfly"
#define MN_LP_NM "modelnet_dragonfly_custom"
#define TRACK_LP -1
#define TRACE -1
......@@ -229,7 +229,7 @@ struct nw_message
double saved_recv_time;
double saved_wait_time;
double saved_delay;
int64_t saved_num_bytes;
int16_t saved_num_bytes;
struct codes_workload_op * saved_op;
} rc;
};
......
This diff is collapsed.
This diff is collapsed.
......@@ -387,6 +387,10 @@ void model_net_base_event(
tw_bf * b,
model_net_wrap_msg * m,
tw_lp * lp){
if(m->h.magic != model_net_base_magic)
printf("\n LP ID mismatched %d ", lp->gid);
assert(m->h.magic == model_net_base_magic);
void * sub_msg;
......
......@@ -3,8 +3,8 @@ LPGROUPS
MODELNET_GRP
{
repetitions="288";
server="2";
modelnet_dragonfly_custom="2";
server="4";
modelnet_dragonfly_custom="4";
modelnet_dragonfly_custom_router="1";
}
}
......@@ -25,8 +25,9 @@ PARAMS
global_bandwidth="4.7";
cn_bandwidth="5.25";
message_size="368";
num_cns_per_router="4";
intra-group-connections="intra";
inter-group-connections="inter";
routing="nonminimal";
routing="minimal";
router_radix="48";
}
......@@ -24,18 +24,16 @@
#include "codes/configuration.h"
#include "codes/lp-type-lookup.h"
#define SVR_LP_NM "server"
#define NUM_REQS 2 /* number of requests sent by each server */
#define PAYLOAD_SZ 4096 /* size of simulated data payload, bytes */
static int net_id = 0;
static int num_routers = 0;
static int num_servers = 0;
static int offset = 2;
/* whether to pull instead of push */
static int do_pull = 0;
static int num_routers_per_rep = 0;
static int num_servers_per_rep = 0;
static int lps_per_rep = 0;
......@@ -60,6 +58,7 @@ struct svr_state
int local_recvd_count; /* number of local messages received */
tw_stime start_ts; /* time that we started sending requests */
tw_stime end_ts; /* time that we ended sending requests */
tw_lpid svr_rel_id; /* relative ID of the server */
};
struct svr_msg
......@@ -175,26 +174,9 @@ int main(
net_id = *net_ids;
free(net_ids);
num_servers = codes_mapping_get_lp_count("MODELNET_GRP", 0, "server",
num_servers = codes_mapping_get_lp_count("MODELNET_GRP", 0, SVR_LP_NM,
NULL, 1);
if(net_id == DRAGONFLY)
{
strcpy(router_name, "modelnet_dragonfly_router");
}
if(net_id == SLIMFLY)
{
strcpy(router_name, "slimfly_router");
}
if(net_id == SLIMFLY || net_id == DRAGONFLY)
{
num_routers = codes_mapping_get_lp_count("MODELNET_GRP", 0,
router_name, NULL, 1);
offset = 1;
}
if(lp_io_prepare("modelnet-test", LP_IO_UNIQ_SUFFIX, &handle, MPI_COMM_WORLD) < 0)
{
return(-1);
......@@ -232,11 +214,11 @@ static void svr_init(
memset(ns, 0, sizeof(*ns));
ns->svr_rel_id = codes_mapping_get_lp_relative_id(lp->gid, 0, 0);
/* each server sends a dummy event to itself that will kick off the real
* simulation
*/
//printf("\n Initializing servers %d ", (int)lp->gid);
/* skew each kickoff event slightly to help avoid event ties later on */
kickoff_time = g_tw_lookahead + tw_rand_unif(lp->rng);
......@@ -346,24 +328,9 @@ static void handle_kickoff_event(
/* record when transfers started on this server */
ns->start_ts = tw_now(lp);
num_servers_per_rep = codes_mapping_get_lp_count("MODELNET_GRP", 1,
"server", NULL, 1);
num_routers_per_rep = codes_mapping_get_lp_count("MODELNET_GRP", 1,
router_name, NULL, 1);
lps_per_rep = num_servers_per_rep * 2 + num_routers_per_rep;
int opt_offset = 0;
int total_lps = num_servers * 2 + num_routers;
if(net_id == DRAGONFLY && (lp->gid % lps_per_rep == num_servers_per_rep - 1))
opt_offset = num_servers_per_rep + num_routers_per_rep; /* optional offset due to dragonfly mapping */
if(net_id == SLIMFLY && (lp->gid % lps_per_rep == num_servers_per_rep -1))
opt_offset = num_servers_per_rep + num_routers_per_rep;
int dest_id = (ns->svr_rel_id + 1) % num_servers;
/* each server sends a request to the next highest server */
int dest_id = (lp->gid + offset + opt_offset)%total_lps;
if (do_pull){
m->ret = model_net_pull_event(net_id, "test", dest_id, PAYLOAD_SZ, 0.0,
sizeof(svr_msg), (const void*)m_remote, lp);
......@@ -449,15 +416,7 @@ static void handle_ack_event(
/* safety check that this request got to the right server */
// printf("\n m->src %d lp->gid %d ", m->src, lp->gid);
int opt_offset = 0;
if(net_id == DRAGONFLY && (lp->gid % lps_per_rep == num_servers_per_rep - 1))
opt_offset = num_servers_per_rep + num_routers_per_rep; /* optional offset due to dragonfly mapping */
if(net_id == SLIMFLY && (lp->gid % lps_per_rep == num_servers_per_rep -1))
opt_offset = num_servers_per_rep + num_routers_per_rep;
tw_lpid dest_id = (lp->gid + offset + opt_offset)%(num_servers*2 + num_routers);
tw_lpid dest_id = codes_mapping_get_lpid_from_relative(m->src, NULL, SVR_LP_NM, NULL, 0);
/* in the "pull" case, src should actually be self */
if (do_pull){
......@@ -510,15 +469,6 @@ static void handle_req_event(
/* safety check that this request got to the right server */
// printf("\n m->src %d lp->gid %d ", m->src, lp->gid);
int opt_offset = 0;
if(net_id == DRAGONFLY && (m->src % lps_per_rep == num_servers_per_rep - 1))
opt_offset = num_servers_per_rep + num_routers_per_rep; /* optional offset due to dragonfly mapping */
if(net_id == SLIMFLY && (m->src % lps_per_rep == num_servers_per_rep -1))
opt_offset = num_servers_per_rep + num_routers_per_rep;
assert(lp->gid == (m->src + offset + opt_offset)%(num_servers*2 + num_routers));
ns->msg_recvd_count++;
/* send ack back */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment