Commit 29a07ffd authored by Misbah Mubarak

Creating branch for multiapp workloads, Xu's multiapp workloads functional,...

Creating a branch for multi-app workloads. Xu's multi-app workloads are functional; this will be merged with master once things are working with the BB model.
parent 43059659
......@@ -116,6 +116,18 @@ else
AM_CONDITIONAL(USE_DUMPI, false)
fi
dnl Enable the USE_PROFILE automake conditional when --with-profiling is given.
dnl The check looks for the undumpi libtool archive under ${with_dumpi} and
dnl aborts configure if it is missing; on success the DUMPI include and link
dnl flags are exported to the Makefiles.
dnl NOTE(review): the guard tests ${with_profiling} but every path below is
dnl derived from ${with_dumpi} -- confirm that this is intended.
if test "x${with_profiling}" != "x" ; then
AC_CHECK_FILE([${with_dumpi}/lib/libundumpi.la],
[AM_CONDITIONAL(USE_PROFILE, true)],
[AC_MSG_ERROR([Could not find libundumpi.la])])
DUMPI_CFLAGS="-I${with_dumpi}/include"
DUMPI_LIBS="-L${with_dumpi}/lib/ -lundumpi"
AC_SUBST(DUMPI_LIBS)
AC_SUBST(DUMPI_CFLAGS)
else
AM_CONDITIONAL(USE_PROFILE, false)
fi
dnl ======================================================================
dnl Try harder to be valgrind safe
dnl ======================================================================
......
......@@ -2,9 +2,9 @@ LPGROUPS
{
MODELNET_GRP
{
repetitions="36";
nw-lp="2";
modelnet_dragonfly="2";
repetitions="510";
nw-lp="5";
modelnet_dragonfly="5";
modelnet_dragonfly_router="1";
}
}
......@@ -14,12 +14,12 @@ PARAMS
modelnet_order=( "dragonfly", "dragonfly_router");
# scheduler options
modelnet_scheduler="fcfs";
chunk_size="256";
chunk_size="512";
# modelnet_scheduler="round-robin";
num_routers="4";
local_vc_size="16384";
global_vc_size="32768";
cn_vc_size="16384";
num_routers="10";
local_vc_size="4096";
global_vc_size="8192";
cn_vc_size="4096";
local_bandwidth="5.25";
global_bandwidth="4.7";
cn_bandwidth="5.25";
......
......@@ -2,7 +2,7 @@ LPGROUPS
{
MODELNET_GRP
{
repetitions="32";
repetitions="64";
nw-lp="1";
modelnet_torus="1";
}
......@@ -17,7 +17,7 @@ PARAMS
net_startup_ns="1.5";
net_bw_mbps="20000";
n_dims="3";
dim_length="4,4,2";
dim_length="4,4,4";
link_bandwidth="10.0";
buffer_size="8192";
chunk_size="256";
......
......@@ -13,6 +13,7 @@
#include "codes/model-net.h"
#include "codes/rc-stack.h"
#include "codes/quicklist.h"
#include "codes/codes-jobmap.h"
/* turning on track lp will generate a lot of output messages */
#define TRACK_LP -1
......@@ -31,6 +32,17 @@ static lp_io_handle io_handle;
static unsigned int lp_io_use_suffix = 0;
static int do_lp_io = 0;
/* variables for loading multiple applications */
/* Xu's additions start */
/* path of the workloads configuration file that main() opens and parses */
char workloads_conf_file[8192];
/* path of the allocation file handed to the job map (jobmap_p.alloc_file) */
char alloc_file[8192];
/* trace count of each job as read from workloads_conf_file; fixed at 5 slots */
int num_traces_of_job[5];
/* trace file name of each job as read from workloads_conf_file; fixed at 5 slots */
char file_name_of_job[5][8192];
/* job map context returned by codes_jobmap_configure() in main() */
struct codes_jobmap_ctx *jobmap_ctx;
/* parameters passed to codes_jobmap_configure(); alloc_file is set in main() */
struct codes_jobmap_params_list jobmap_p;
/* Xu's additions end */
typedef struct nw_state nw_state;
typedef struct nw_message nw_message;
typedef int32_t dumpi_req_id;
......@@ -108,6 +120,8 @@ struct nw_state
long num_events_per_lp;
tw_lpid nw_id;
short wrkld_end;
int app_id;
int local_rank;
struct rc_stack * processed_ops;
struct rc_stack * matched_reqs;
......@@ -286,6 +300,25 @@ static void add_completed_reqs(nw_state * s,
qlist_add(&req->ql, &s->completed_reqs);
}
}
/*
 * Rewrite the ranks stored in mpi_op with the ids that
 * codes_jobmap_to_global_id() returns for them within job jp_id->job.
 *
 * mpi_op - workload operation whose u.send / u.recv dest_rank and
 *          source_rank fields are updated in place when the mapped id
 *          differs from the stored rank
 * jp_id  - job map id; the caller fills in ->job, while ->rank is used as
 *          scratch and holds the value read from u.send.source_rank on return
 */
void find_glp_for_msg( struct codes_workload_op * mpi_op , struct codes_jobmap_id *jp_id)
{
    int mapped;

    /* destination rank first */
    jp_id->rank = mpi_op->u.send.dest_rank;
    mapped = codes_jobmap_to_global_id(*jp_id, jobmap_ctx);
    if (mapped != jp_id->rank) {
        mpi_op->u.send.dest_rank = mapped;
        mpi_op->u.recv.dest_rank = mapped;
    }

    /* then the source rank, same procedure */
    jp_id->rank = mpi_op->u.send.source_rank;
    mapped = codes_jobmap_to_global_id(*jp_id, jobmap_ctx);
    if (mapped != jp_id->rank) {
        mpi_op->u.send.source_rank = mapped;
        mpi_op->u.recv.source_rank = mapped;
    }
}
/* helper function - maps an MPI rank to an LP id */
static tw_lpid rank_to_lpid(int rank)
{
......@@ -704,6 +737,17 @@ static void codes_exec_mpi_send(nw_state* s,
tw_lp* lp,
struct codes_workload_op * mpi_op)
{
struct codes_jobmap_id jid;
jid = codes_jobmap_to_local_id(s->nw_id, jobmap_ctx);
if(jid.job == -1)
{
printf("network LP nw id %d not generating events, lp gid is %ld \n", (int)s->nw_id, lp->gid);
s->app_id = -1;
s->local_rank = -1;
return;
}
m->rc.saved_num_bytes = mpi_op->u.send.num_bytes;
/* model-net event */
tw_lpid dest_rank;
......@@ -948,12 +992,28 @@ void nw_test_init(nw_state* s, tw_lp* lp)
assert(num_net_traces <= num_net_lps);
if (strcmp(workload_type, "dumpi") == 0){
strcpy(params_d.file_name, workload_file);
params_d.num_net_traces = num_net_traces;
struct codes_jobmap_id lid;
lid = codes_jobmap_to_local_id(s->nw_id, jobmap_ctx);
if(lid.job == -1)
{
printf("network LP nw id %d not generating events, lp gid is %ld \n", (int)s->nw_id, lp->gid);
s->app_id = -1;
s->local_rank = -1;
return;
}
if (strcmp(workload_type, "dumpi") == 0){
strcpy(params_d.file_name, file_name_of_job[lid.job]);
params_d.num_net_traces = num_traces_of_job[lid.job];
params = (char*)&params_d;
s->app_id = lid.job;
s->local_rank = lid.rank;
printf("lp global id: %llu, file name: %s, num traces: %d, app id: %d, local id: %d\n",
s->nw_id, params_d.file_name, params_d.num_net_traces, s->app_id, s->local_rank);
}
wrkld_id = codes_workload_load("dumpi-trace-workload", params, s->app_id, s->local_rank);
/* In this case, the LP will not generate any workload related events*/
if(s->nw_id >= params_d.num_net_traces)
return;
......@@ -1289,7 +1349,7 @@ int main( int argc, char** argv )
tw_opt_add(app_opt);
tw_init(&argc, &argv);
if(strlen(workload_file) == 0 || strcmp(workload_type, "dumpi") != 0 || num_net_traces <= 0)
if(strlen(workloads_conf_file) == 0 || strcmp(workload_type, "dumpi") != 0 || num_net_traces <= 0)
{
if(tw_ismaster())
printf("Usage: mpirun -np n ./modelnet-mpi-replay --sync=1/3"
......@@ -1301,6 +1361,38 @@ int main( int argc, char** argv )
return -1;
}
FILE *name_file = fopen(workloads_conf_file, "r");
if(!name_file)
tw_error(TW_LOC, "\n Could not open file %s ", workloads_conf_file);
int i = 0;
char ref = '\n';
while(!feof(name_file))
{
ref = fscanf(name_file, "%d %s", &num_traces_of_job[i], file_name_of_job[i]);
if(ref!=EOF)
{
printf("\n%d traces of app %s \n", num_traces_of_job[i], file_name_of_job[i]);
num_net_traces += num_traces_of_job[i];
i++;
}
}
fclose(name_file);
if(strlen(alloc_file) == 0)
{
if(tw_ismaster())
{
printf("\n Usage: mpirun -np n ./codes-nw-test --sync=1/2/3"
"--workload_type=type --workloads_conf_file = workloads.conf"
"--alloc_file=alloc.conf");
tw_end();
return -1;
}
}
jobmap_p.alloc_file = alloc_file;
jobmap_ctx = codes_jobmap_configure(CODES_JOBMAP_LIST, &jobmap_p);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
......@@ -1366,6 +1458,8 @@ int main( int argc, char** argv )
assert(ret == 0 || !"lp_io_flush failure");
}
model_net_report_stats(net_id);
codes_jobmap_destroy(jobmap_ctx);
tw_end();
return 0;
......
......@@ -12,10 +12,16 @@
#include "codes/codes_mapping.h"
#include "codes/model-net.h"
#include "codes/rc-stack.h"
#include "codes/codes-jobmap.h"
#define TRACE -1
#define TRACK 0
/* global variables for loading multiple jobs' traces */
char workloads_conf_file[8192];// file listing, for each job, its number of traces and its trace file name
char alloc_file[8192];// file that holds the preassigned LP lists for the jobs
int num_traces_of_job[5];// number of traces of each job, indexed by job id (room for 5 jobs)
char file_name_of_job[5][8192];// trace file name of each job, indexed by job id (room for 5 jobs)
char workload_type[128];
char workload_file[8192];
char offset_file[8192];
......@@ -44,6 +50,8 @@ double avg_time = 0, avg_comm_time = 0, avg_wait_time = 0, avg_send_time = 0, av
static char lp_group_name[MAX_NAME_LENGTH], lp_type_name[MAX_NAME_LENGTH], annotation[MAX_NAME_LENGTH];
static int mapping_grp_id, mapping_type_id, mapping_rep_id, mapping_offset;
struct codes_jobmap_ctx *jobmap_ctx;
struct codes_jobmap_params_list jobmap_p;
/* runtime option for disabling computation time simulation */
static int disable_delay = 0;
......@@ -99,6 +107,8 @@ struct nw_state
uint64_t num_completed;
int app_id;
int local_rank;
/* count of sends, receives, collectives and delays */
unsigned long num_sends;
unsigned long num_recvs;
......@@ -163,6 +173,8 @@ struct nw_message
double saved_wait_time;
};
/*find the global LP id of source and destination lps of each message*/
void find_glp_for_msg( struct codes_workload_op * mpi_op , struct codes_jobmap_id *jp_id);
/* executes MPI wait operation */
static void codes_exec_mpi_wait(
nw_state* s, tw_lp* lp, nw_message * m, struct codes_workload_op * mpi_op);
......@@ -853,6 +865,9 @@ static void codes_exec_mpi_recv(nw_state* s, tw_lp* lp, nw_message * m, struct c
/* Once an irecv is posted, list of completed sends is checked to find a matching isend.
If no matching isend is found, the receive operation is queued in the pending queue of
receive operations. */
struct codes_jobmap_id lid;
lid.job = s->app_id;
find_glp_for_msg(mpi_op, &lid);
m->saved_recv_time = s->recv_time;
mpi_op->sim_start_time = tw_now(lp);
......@@ -888,9 +903,31 @@ static void codes_exec_mpi_recv(nw_state* s, tw_lp* lp, nw_message * m, struct c
}
}
/*
 * Rewrite the ranks stored in mpi_op with the ids that
 * codes_jobmap_to_global_id() returns for them within job jp_id->job.
 *
 * mpi_op - workload operation whose u.send / u.recv dest_rank and
 *          source_rank fields are updated in place when the mapped id
 *          differs from the stored rank
 * jp_id  - job map id; the caller fills in ->job, while ->rank is used as
 *          scratch and holds the value read from u.send.source_rank on return
 */
void find_glp_for_msg( struct codes_workload_op * mpi_op , struct codes_jobmap_id *jp_id)
{
    int mapped;

    /* destination rank first */
    jp_id->rank = mpi_op->u.send.dest_rank;
    mapped = codes_jobmap_to_global_id(*jp_id, jobmap_ctx);
    if (mapped != jp_id->rank) {
        mpi_op->u.send.dest_rank = mapped;
        mpi_op->u.recv.dest_rank = mapped;
    }

    /* then the source rank, same procedure */
    jp_id->rank = mpi_op->u.send.source_rank;
    mapped = codes_jobmap_to_global_id(*jp_id, jobmap_ctx);
    if (mapped != jp_id->rank) {
        mpi_op->u.send.source_rank = mapped;
        mpi_op->u.recv.source_rank = mapped;
    }
}
/* executes MPI send and isend operations */
static void codes_exec_mpi_send(nw_state* s, tw_lp* lp, struct codes_workload_op * mpi_op)
{
struct codes_jobmap_id lid;
lid.job = s->app_id;
find_glp_for_msg(mpi_op, &lid);
/* model-net event */
tw_lpid dest_rank;
......@@ -1092,28 +1129,34 @@ void nw_test_init(nw_state* s, tw_lp* lp)
s->completed_reqs = NULL;
s->pending_waits = NULL;
if(!num_net_traces)
num_net_traces = num_net_lps;
struct codes_jobmap_id lid;
lid = codes_jobmap_to_local_id(s->nw_id, jobmap_ctx);
if (strcmp(workload_type, "dumpi") == 0){
strcpy(params_d.file_name, workload_file);
params_d.num_net_traces = num_net_traces;
if(lid.job == -1)
{
printf("network LP nw id %d not generating events, lp gid is %ld \n", (int)s->nw_id, lp->gid);
s->app_id = -1;
s->local_rank = -1;
return ;
}
params = (char*)&params_d;
}
/* In this case, the LP will not generate any workload related events*/
if(s->nw_id >= params_d.num_net_traces)
{
//printf("\n network LP not generating events %d ", (int)s->nw_id);
return;
}
if(strcmp(workload_type, "dumpi") == 0)
{
strcpy(params_d.file_name, file_name_of_job[lid.job]);
params_d.num_net_traces = num_traces_of_job[lid.job];
params = (char*)&params_d;
s->app_id = lid.job;
s->local_rank = lid.rank;
printf("lp global id: %llu, file name: %s, num traces: %d, app id: %d, local id: %d\n", s->nw_id, params_d.file_name, params_d.num_net_traces, s->app_id, s->local_rank);
}
wrkld_id = codes_workload_load("dumpi-trace-workload", params, s->app_id, s->local_rank);
/* Initialize the RC stack */
rc_stack_create(&s->st);
assert(s->st != NULL);
wrkld_id = codes_workload_load("dumpi-trace-workload", params, 0, (int)s->nw_id);
s->arrival_queue = queue_init();
s->pending_recvs_queue = queue_init();
......@@ -1154,7 +1197,7 @@ static void get_next_mpi_operation_rc(nw_state* s, tw_bf * bf, nw_message * m, t
struct codes_workload_op * mpi_op = m->saved_op;
//(struct codes_workload_op *)rc_stack_pop(s->st);
codes_workload_get_next_rc2(wrkld_id, 0, (int)s->nw_id);
codes_workload_get_next_rc2(wrkld_id, s->app_id, s->local_rank);
if(mpi_op->op_type == CODES_WK_END)
return;
......@@ -1232,7 +1275,7 @@ static void get_next_mpi_operation_rc(nw_state* s, tw_bf * bf, nw_message * m, t
static void get_next_mpi_operation(nw_state* s, tw_bf * bf, nw_message * m, tw_lp * lp)
{
struct codes_workload_op * mpi_op = malloc(sizeof(struct codes_workload_op));
codes_workload_get_next(wrkld_id, 0, (int)s->nw_id, mpi_op);
codes_workload_get_next(wrkld_id, s->app_id, s->local_rank, mpi_op);
s->num_completed++;
......@@ -1390,6 +1433,8 @@ const tw_optdef app_opt [] =
TWOPT_CHAR("workload_type", workload_type, "workload type (either \"scalatrace\" or \"dumpi\")"),
TWOPT_CHAR("workload_file", workload_file, "workload file name"),
TWOPT_UINT("num_net_traces", num_net_traces, "number of network traces"),
TWOPT_CHAR("workloads_conf_file", workloads_conf_file, "workload file name"),
TWOPT_CHAR("alloc_file", alloc_file, "allocation file name"),
TWOPT_UINT("disable_compute", disable_delay, "disable compute simulation"),
TWOPT_CHAR("lp-io-dir", lp_io_dir, "Where to place io output (unspecified -> no output"),
TWOPT_UINT("lp-io-use-suffix", lp_io_use_suffix, "Whether to append uniq suffix to lp-io directory (default 0)"),
......@@ -1429,13 +1474,45 @@ int main( int argc, char** argv )
tw_opt_add(app_opt);
tw_init(&argc, &argv);
if(strlen(workload_file) == 0)
if(strlen(workloads_conf_file) == 0){
if(tw_ismaster())
printf("\n Usage: mpirun -np n ./model-net-dumpi-traces-dump --sync=1/2/3 --workload_type=type"
" --workloads_conf_file = workloads.conf --alloc_file=alloc.conf");
tw_end();
return -1;
}
FILE *name_file = fopen(workloads_conf_file, "r");
if (!name_file){
printf("Could not open file %s \n", workloads_conf_file);
exit(1);
}
else{
int i=0;
char ref = '\n';
while(!feof(name_file))
{
if(tw_ismaster())
printf("Usage: mpirun -np n ./codes-nw-test --sync=1/2/3 --workload_type=type --workload_file=workload-file-name\n");
tw_end();
return -1;
ref = fscanf(name_file, "%d %s", &num_traces_of_job[i], file_name_of_job[i]);
if(ref!=EOF){
printf("\n%d traces of app %s \n", num_traces_of_job[i], file_name_of_job[i]);
num_net_traces += num_traces_of_job[i];
i++;
}
}
fclose(name_file);
}
if(strlen(alloc_file) == 0){
if(tw_ismaster())
printf("\n Usage: mpirun -np n ./codes-nw-test --sync=1/2/3 --workload_type=type --workloads_conf_file = workloads.conf --alloc_file=alloc.conf");
tw_end();
return -1;
}
jobmap_p.alloc_file = alloc_file;
jobmap_ctx = codes_jobmap_configure(CODES_JOBMAP_LIST, &jobmap_p);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
......@@ -1501,6 +1578,7 @@ int main( int argc, char** argv )
assert(ret == 0 || !"lp_io_flush failure");
}
model_net_report_stats(net_id);
codes_jobmap_destroy(jobmap_ctx);
tw_end();
return 0;
......
......@@ -1386,14 +1386,14 @@ void packet_arrive_rc(terminal_state * s, tw_bf * bf, terminal_message * msg, tw
assert(tmp);
tmp->num_chunks--;
if(bf->c5)
/*if(bf->c5)
{
assert(hash_link);
qhash_del(hash_link);
free(tmp->remote_event_data);
free(tmp);
s->rank_tbl_pop--;
}
}*/
return;
}
void send_remote_event(terminal_state * s, terminal_message * msg, tw_lp * lp, tw_bf * bf, char * event_data, int remote_event_size)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment