Commit 7feea8ea authored by Misbah Mubarak

Fixing bug in MPI sim layer

parent eaab6593
...@@ -23,7 +23,7 @@ PARAMS ...@@ -23,7 +23,7 @@ PARAMS
local_bandwidth="5.25"; local_bandwidth="5.25";
global_bandwidth="4.7"; global_bandwidth="4.7";
cn_bandwidth="5.25"; cn_bandwidth="5.25";
message_size="592"; message_size="608";
routing="minimal"; routing="minimal";
self_msg_overhead = "10"; self_msg_overhead = "10";
} }
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#define TRACE -1 #define TRACE -1
#define MAX_WAIT_REQS 512 #define MAX_WAIT_REQS 512
#define CS_LP_DBG 1 #define CS_LP_DBG 1
#define EAGER_THRESHOLD 81920000 #define EAGER_THRESHOLD 8192000
#define RANK_HASH_TABLE_SZ 2000 #define RANK_HASH_TABLE_SZ 2000
#define NOISE 3.0 #define NOISE 3.0
#define NW_LP_NM "nw-lp" #define NW_LP_NM "nw-lp"
...@@ -1056,6 +1056,8 @@ static int rm_matching_send(nw_state * ns, ...@@ -1056,6 +1056,8 @@ static int rm_matching_send(nw_state * ns,
bf->c10 = 1; bf->c10 = 1;
send_ack_back(ns, bf, m, lp, qi); send_ack_back(ns, bf, m, lp, qi);
} }
rc_stack_push(lp, qi, free, ns->processed_ops);
m->rc.saved_recv_time = ns->recv_time; m->rc.saved_recv_time = ns->recv_time;
ns->recv_time += (tw_now(lp) - qitem->req_init_time); ns->recv_time += (tw_now(lp) - qitem->req_init_time);
...@@ -1217,7 +1219,6 @@ static void codes_exec_mpi_recv( ...@@ -1217,7 +1219,6 @@ static void codes_exec_mpi_recv(
{ {
m->fwd.found_match = found_matching_sends; m->fwd.found_match = found_matching_sends;
codes_issue_next_event(lp); codes_issue_next_event(lp);
rc_stack_push(lp, recv_op, free, s->processed_ops);
} }
} }
...@@ -1245,11 +1246,16 @@ static void codes_exec_mpi_send_rc(nw_state * s, tw_bf * bf, nw_message * m, tw_ ...@@ -1245,11 +1246,16 @@ static void codes_exec_mpi_send_rc(nw_state * s, tw_bf * bf, nw_message * m, tw_
} }
} }
model_net_event_rc2(lp, &m->event_rc); model_net_event_rc2(lp, &m->event_rc);
if(m->op_type == CODES_WK_ISEND)
if(bf->c4)
codes_issue_next_event_rc(lp); codes_issue_next_event_rc(lp);
s->num_sends--; s->num_sends--;
s->num_bytes_sent -= m->rc.saved_num_bytes;
num_bytes_sent -= m->rc.saved_num_bytes; if(bf->c3)
{
s->num_bytes_sent -= m->rc.saved_num_bytes;
num_bytes_sent -= m->rc.saved_num_bytes;
}
} }
/* executes MPI send and isend operations */ /* executes MPI send and isend operations */
static void codes_exec_mpi_send(nw_state* s, static void codes_exec_mpi_send(nw_state* s,
...@@ -1267,7 +1273,6 @@ static void codes_exec_mpi_send(nw_state* s, ...@@ -1267,7 +1273,6 @@ static void codes_exec_mpi_send(nw_state* s,
global_dest_rank = get_global_id_of_job_rank(mpi_op->u.send.dest_rank, s->app_id); global_dest_rank = get_global_id_of_job_rank(mpi_op->u.send.dest_rank, s->app_id);
} }
// if(mpi_op->u.send.tag == -1006)
// printf("\n Sender rank %llu global dest rank %d dest-rank %d bytes %d Tag %d", s->nw_id, global_dest_rank, mpi_op->u.send.dest_rank, mpi_op->u.send.num_bytes, mpi_op->u.send.tag); // printf("\n Sender rank %llu global dest rank %d dest-rank %d bytes %d Tag %d", s->nw_id, global_dest_rank, mpi_op->u.send.dest_rank, mpi_op->u.send.num_bytes, mpi_op->u.send.tag);
m->rc.saved_num_bytes = mpi_op->u.send.num_bytes; m->rc.saved_num_bytes = mpi_op->u.send.num_bytes;
/* model-net event */ /* model-net event */
...@@ -1275,6 +1280,7 @@ static void codes_exec_mpi_send(nw_state* s, ...@@ -1275,6 +1280,7 @@ static void codes_exec_mpi_send(nw_state* s,
if(!is_rend) if(!is_rend)
{ {
bf->c3 = 1;
num_bytes_sent += mpi_op->u.send.num_bytes; num_bytes_sent += mpi_op->u.send.num_bytes;
s->num_bytes_sent += mpi_op->u.send.num_bytes; s->num_bytes_sent += mpi_op->u.send.num_bytes;
} }
...@@ -1371,7 +1377,10 @@ static void codes_exec_mpi_send(nw_state* s, ...@@ -1371,7 +1377,10 @@ static void codes_exec_mpi_send(nw_state* s,
} }
/* isend executed, now get next MPI operation from the queue */ /* isend executed, now get next MPI operation from the queue */
if(mpi_op->op_type == CODES_WK_ISEND && !is_rend) if(mpi_op->op_type == CODES_WK_ISEND && !is_rend)
{
bf->c4 = 1;
codes_issue_next_event(lp); codes_issue_next_event(lp);
}
} }
/* convert seconds to ns */ /* convert seconds to ns */
...@@ -1427,11 +1436,11 @@ static void update_completed_queue(nw_state* s, ...@@ -1427,11 +1436,11 @@ static void update_completed_queue(nw_state* s,
req->req_id = req_id; req->req_id = req_id;
qlist_add_tail(&req->ql, &s->completed_reqs); qlist_add_tail(&req->ql, &s->completed_reqs);
/* if(lp->gid == TRACK) // if(lp->gid == TRACK)
{ {
printf("\n Forward mode adding %ld ", req_id); printf("\n Forward mode adding %ld ", req_id);
print_completed_queue(&s->completed_reqs); print_completed_queue(&s->completed_reqs);
}*/ }
} }
else else
{ {
...@@ -1477,7 +1486,7 @@ static void send_ack_back(nw_state* s, tw_bf * bf, nw_message * m, tw_lp * lp, m ...@@ -1477,7 +1486,7 @@ static void send_ack_back(nw_state* s, tw_bf * bf, nw_message * m, tw_lp * lp, m
remote_m.fwd.num_bytes = mpi_op->num_bytes; remote_m.fwd.num_bytes = mpi_op->num_bytes;
remote_m.fwd.req_id = mpi_op->req_id; remote_m.fwd.req_id = mpi_op->req_id;
// printf("\n Op type %d dest rank %d ", mpi_op->op_type, mpi_op->dest_rank); printf("\n Op type %d dest rank %d ", mpi_op->op_type, mpi_op->dest_rank);
m->event_rc = model_net_event_mctx(net_id, &group_ratio, &group_ratio, m->event_rc = model_net_event_mctx(net_id, &group_ratio, &group_ratio,
"test", dest_rank, CONTROL_MSG_SZ, (self_overhead + soft_delay_mpi + nic_delay), "test", dest_rank, CONTROL_MSG_SZ, (self_overhead + soft_delay_mpi + nic_delay),
sizeof(nw_message), (const void*)&remote_m, 0, NULL, lp); sizeof(nw_message), (const void*)&remote_m, 0, NULL, lp);
...@@ -1780,14 +1789,15 @@ void nw_test_event_handler(nw_state* s, tw_bf * bf, nw_message * m, tw_lp * lp) ...@@ -1780,14 +1789,15 @@ void nw_test_event_handler(nw_state* s, tw_bf * bf, nw_message * m, tw_lp * lp)
mpi_op.source_rank = m->fwd.src_rank; mpi_op.source_rank = m->fwd.src_rank;
mpi_op.dest_rank = m->fwd.dest_rank; mpi_op.dest_rank = m->fwd.dest_rank;
mpi_op.req_init_time = m->fwd.sim_start_time; mpi_op.req_init_time = m->fwd.sim_start_time;
update_message_size(s, lp, bf, m, &mpi_op, 0, 1); if(enable_msg_tracking)
update_message_size(s, lp, bf, m, &mpi_op, 0, 1);
int global_src_id = m->fwd.src_rank; int global_src_id = m->fwd.src_rank;
if(alloc_spec) if(alloc_spec)
{ global_src_id = get_global_id_of_job_rank(m->fwd.src_rank, s->app_id);
global_src_id = get_global_id_of_job_rank(m->fwd.src_rank, s->app_id);
}
tw_event *e_callback = tw_event *e_callback =
tw_event_new(rank_to_lpid(global_src_id), tw_event_new(rank_to_lpid(global_src_id),
codes_local_latency(lp), lp); codes_local_latency(lp), lp);
...@@ -1811,6 +1821,7 @@ void nw_test_event_handler(nw_state* s, tw_bf * bf, nw_message * m, tw_lp * lp) ...@@ -1811,6 +1821,7 @@ void nw_test_event_handler(nw_state* s, tw_bf * bf, nw_message * m, tw_lp * lp)
mpi_op.u.send.dest_rank = m->fwd.dest_rank; mpi_op.u.send.dest_rank = m->fwd.dest_rank;
mpi_op.sim_start_time = m->fwd.sim_start_time; mpi_op.sim_start_time = m->fwd.sim_start_time;
mpi_op.u.send.req_id = m->fwd.req_id; mpi_op.u.send.req_id = m->fwd.req_id;
printf("\n Global dest rank %d ", m->fwd.dest_rank);
codes_exec_mpi_send(s, bf, m, lp, &mpi_op, is_rend); codes_exec_mpi_send(s, bf, m, lp, &mpi_op, is_rend);
} }
......
...@@ -472,7 +472,8 @@ int handleDUMPISend(const dumpi_send *prm, uint16_t thread, ...@@ -472,7 +472,8 @@ int handleDUMPISend(const dumpi_send *prm, uint16_t thread,
wrkld_per_rank.u.send.source_rank = myctx->my_rank; wrkld_per_rank.u.send.source_rank = myctx->my_rank;
wrkld_per_rank.u.send.req_id = -1; wrkld_per_rank.u.send.req_id = -1;
update_times_and_insert(&wrkld_per_rank, wall, myctx);
update_times_and_insert(&wrkld_per_rank, wall, myctx);
return 0; return 0;
} }
...@@ -524,7 +525,8 @@ int handleDUMPISendrecv(const dumpi_sendrecv* prm, uint16_t thread, ...@@ -524,7 +525,8 @@ int handleDUMPISendrecv(const dumpi_sendrecv* prm, uint16_t thread,
wrkld_per_rank.u.send.data_type = prm->sendtype; wrkld_per_rank.u.send.data_type = prm->sendtype;
wrkld_per_rank.u.send.num_bytes = prm->sendcount * get_num_bytes(myctx,prm->sendtype); wrkld_per_rank.u.send.num_bytes = prm->sendcount * get_num_bytes(myctx,prm->sendtype);
assert(wrkld_per_rank.u.send.num_bytes >= 0);
assert(wrkld_per_rank.u.send.num_bytes >= 0);
wrkld_per_rank.u.send.dest_rank = prm->dest; wrkld_per_rank.u.send.dest_rank = prm->dest;
wrkld_per_rank.u.send.source_rank = myctx->my_rank; wrkld_per_rank.u.send.source_rank = myctx->my_rank;
wrkld_per_rank.u.send.req_id = -1; wrkld_per_rank.u.send.req_id = -1;
...@@ -538,7 +540,8 @@ int handleDUMPISendrecv(const dumpi_sendrecv* prm, uint16_t thread, ...@@ -538,7 +540,8 @@ int handleDUMPISendrecv(const dumpi_sendrecv* prm, uint16_t thread,
wrkld_per_rank.u.recv.count = prm->recvcount; wrkld_per_rank.u.recv.count = prm->recvcount;
wrkld_per_rank.u.recv.data_type = prm->recvtype; wrkld_per_rank.u.recv.data_type = prm->recvtype;
wrkld_per_rank.u.recv.num_bytes = prm->recvcount * get_num_bytes(myctx,prm->recvtype); wrkld_per_rank.u.recv.num_bytes = prm->recvcount * get_num_bytes(myctx,prm->recvtype);
assert(wrkld_per_rank.u.recv.num_bytes >= 0);
assert(wrkld_per_rank.u.recv.num_bytes >= 0);
wrkld_per_rank.u.recv.source_rank = prm->source; wrkld_per_rank.u.recv.source_rank = prm->source;
wrkld_per_rank.u.recv.dest_rank = -1; wrkld_per_rank.u.recv.dest_rank = -1;
update_times_and_insert(&wrkld_per_rank, wall, myctx); update_times_and_insert(&wrkld_per_rank, wall, myctx);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment