Commit c4ba141e authored by Zhang Jingwang's avatar Zhang Jingwang Committed by Dries Kimpe
Browse files

Fix a race condition which may lead to client hung up.

The correctness of BMI depends on the order of two events: the
completion event of the work request for the MSG_CTS message, the
arrival of the message with type MSG_RTS_DONE.

If the MSG_RTS_DONE messages arrives first, then there would be no
request in the state of RQ_RTS_WAITING_RTS_DONE, so it can't advance
the state machine. As a result, a log message is printed and the
message is lost.
parent 6d96c9ba
......@@ -224,8 +224,8 @@ static int ib_check_cq(void)
struct ib_work *rq = sq; /* rename */
rq_state_t state = rq->state.recv;
if (state == RQ_RTS_WAITING_CTS_SEND_COMPLETION)
rq->state.recv = RQ_RTS_WAITING_RTS_DONE;
if (state & RQ_RTS_WAITING_CTS_SEND_COMPLETION)
rq->state.recv &= ~RQ_RTS_WAITING_CTS_SEND_COMPLETION;
else if (state == RQ_CANCELLED)
;
else
......@@ -589,8 +589,11 @@ encourage_recv_incoming(struct buf_head *bh, msg_type_t type, u_int32_t byte_len
if (rq->state.recv == RQ_RTS_WAITING_CTS_BUFFER) {
int ret;
ret = send_cts(rq);
if (ret == 0)
rq->state.recv = RQ_RTS_WAITING_CTS_SEND_COMPLETION;
if (ret == 0) {
rq->state.recv = RQ_RTS_WAITING_RTS_DONE |
RQ_RTS_WAITING_CTS_SEND_COMPLETION |
RQ_RTS_WAITING_USER_TEST;
}
/* else keep waiting until we can send that cts */
}
......@@ -608,7 +611,7 @@ encourage_recv_incoming(struct buf_head *bh, msg_type_t type, u_int32_t byte_len
rq = NULL;
qlist_for_each_entry(rqt, &ib_device->recvq, list) {
if (rqt->c == c && rqt->rts_mop_id == mh_rts_done.mop_id &&
rqt->state.recv == RQ_RTS_WAITING_RTS_DONE) {
rqt->state.recv & RQ_RTS_WAITING_RTS_DONE) {
rq = rqt;
break;
}
......@@ -626,7 +629,7 @@ encourage_recv_incoming(struct buf_head *bh, msg_type_t type, u_int32_t byte_len
post_rr(c, bh);
rq->state.recv = RQ_RTS_WAITING_USER_TEST;
rq->state.recv &= ~RQ_RTS_WAITING_RTS_DONE;
} else if (type == MSG_BYE) {
/*
......@@ -1032,8 +1035,11 @@ post_recv(bmi_op_id_t *id, struct bmi_method_addr *remote_map,
memcache_register(ib_device->memcache, &rq->buflist);
#endif
sret = send_cts(rq);
if (sret == 0)
rq->state.recv = RQ_RTS_WAITING_CTS_SEND_COMPLETION;
if (sret == 0) {
rq->state.recv = RQ_RTS_WAITING_RTS_DONE |
RQ_RTS_WAITING_CTS_SEND_COMPLETION |
RQ_RTS_WAITING_USER_TEST;
}
goto out;
}
......@@ -1167,9 +1173,11 @@ test_rq(struct ib_work *rq, bmi_op_id_t *outid, bmi_error_code_t *err,
debug(2, "%s: rq %p %s, encouraging", __func__, rq,
rq_state_name(rq->state.recv));
ret = send_cts(rq);
if (ret == 0)
rq->state.recv = RQ_RTS_WAITING_CTS_SEND_COMPLETION;
/* else keep waiting until we can send that cts */
if (ret == 0) {
rq->state.recv = RQ_RTS_WAITING_RTS_DONE |
RQ_RTS_WAITING_CTS_SEND_COMPLETION |
RQ_RTS_WAITING_USER_TEST;
} /* else keep waiting until we can send that cts */
debug(2, "%s: rq %p now %s", __func__, rq, rq_state_name(rq->state.recv));
} else if (rq->state.recv == RQ_CANCELLED && complete) {
debug(2, "%s: rq %p cancelled", __func__, rq);
......@@ -1485,12 +1493,10 @@ BMI_ib_cancel(bmi_op_id_t id, bmi_context_id context_id __unused)
struct ib_work *rq = qlist_upcast(l);
if (rq->c != c) continue;
#if !MEMCACHE_BOUNCEBUF
if (rq->state.recv == RQ_RTS_WAITING_RTS_DONE)
if (rq->state.recv & RQ_RTS_WAITING_RTS_DONE)
memcache_deregister(ib_device->memcache, &rq->buflist);
# if MEMCACHE_EARLY_REG
/* pin on post, dereg all these */
if (rq->state.recv == RQ_RTS_WAITING_CTS_SEND_COMPLETION)
memcache_deregister(ib_device->memcache, &rq->buflist);
if (rq->state.recv == RQ_WAITING_INCOMING
&& rq->buflist.tot_len > ib_device->eager_buf_payload)
memcache_deregister(ib_device->memcache, &rq->buflist);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment