Commit c229b00e authored by Charles J Archer
Browse files

OFI Netmod: Improve *msg functionality and fix bugs



 * Fix 32-bit integer overflow
 * Add non-contig support for contig *Msg functions when OFI permits
 * Replace asserts with error returns so out-of-memory conditions are reported properly

Change-Id: I0eaa848c9919b7f4b3088b64b9fef79fd5ad2406
Signed-off-by: Charles J Archer <charles.j.archer@intel.com>
parent 125304f7
......@@ -166,10 +166,10 @@ static inline int MPID_nem_ofi_handle_packet(cq_tagged_entry_t * wc ATTRIBUTE((u
BEGIN_FUNC(FCNAME);
if (MPID_cc_get(rreq->cc) == 1) {
vc = REQ_OFI(rreq)->vc;
MPIU_Assert(vc);
MPI_RC(MPID_nem_handle_pkt(vc, REQ_OFI(rreq)->pack_buffer, REQ_OFI(rreq)->pack_buffer_size))
MPIU_Free(REQ_OFI(rreq)->pack_buffer);
vc = REQ_OFI(rreq)->vc;
MPIU_Assert(vc);
MPI_RC(MPID_nem_handle_pkt(vc, REQ_OFI(rreq)->pack_buffer, REQ_OFI(rreq)->pack_buffer_size));
MPIU_Free(REQ_OFI(rreq)->pack_buffer);
}
MPIDI_CH3U_Request_complete(rreq);
END_FUNC_RC(FCNAME);
......@@ -215,10 +215,13 @@ static inline int MPID_nem_ofi_preposted_callback(cq_tagged_entry_t * wc, MPID_R
MPIU_Assert(vc);
VC_READY_CHECK(vc);
pkt_len = rreq->dev.user_count;
pkt_len = REQ_OFI(rreq)->msg_bytes;
pack_buffer = (char *) MPIU_Malloc(pkt_len);
MPIU_ERR_CHKANDJUMP1(pack_buffer == NULL, mpi_errno, MPI_ERR_OTHER,
"**nomem", "**nomem %s", "Pack Buffer alloc");
/* If the pack buffer is NULL, let OFI handle the truncation
* in the progress loop
*/
if(pack_buffer == NULL)
pkt_len = 0;
c = 1;
MPID_nem_ofi_create_req(&new_rreq, 1);
MPID_cc_incr(new_rreq->cc_ptr, &c);
......@@ -248,15 +251,17 @@ static inline int MPID_nem_ofi_preposted_callback(cq_tagged_entry_t * wc, MPID_R
wc->tag | MPID_MSG_CTS, &(REQ_OFI(sreq)->ofi_context)), tsend);
MPIU_Assert(gl_data.persistent_req == rreq);
rreq->dev.user_count = 0;
FI_RC_RETRY(fi_trecv(gl_data.endpoint,
&rreq->dev.user_count,
sizeof rreq->dev.user_count,
&REQ_OFI(rreq)->msg_bytes,
sizeof REQ_OFI(rreq)->msg_bytes,
gl_data.mr,
FI_ADDR_UNSPEC,
MPID_MSG_RTS,
GET_RCD_IGNORE_MASK(),
&(REQ_OFI(rreq)->ofi_context)), trecv);
/* Return a proper error to MPI to indicate out of memory condition */
MPIU_ERR_CHKANDJUMP1(pack_buffer == NULL, mpi_errno, MPI_ERR_OTHER,
"**nomem", "**nomem %s", "Pack Buffer alloc");
END_FUNC_RC(FCNAME);
}
......@@ -319,8 +324,8 @@ int MPID_nem_ofi_cm_init(MPIDI_PG_t * pg_p, int pg_rank ATTRIBUTE((unused)))
REQ_OFI(persistent_req)->vc = NULL;
REQ_OFI(persistent_req)->event_callback = MPID_nem_ofi_preposted_callback;
FI_RC_RETRY(fi_trecv(gl_data.endpoint,
&persistent_req->dev.user_count,
sizeof persistent_req->dev.user_count,
&REQ_OFI(persistent_req)->msg_bytes,
sizeof REQ_OFI(persistent_req)->msg_bytes,
gl_data.mr,
FI_ADDR_UNSPEC,
MPID_MSG_RTS,
......
......@@ -57,6 +57,7 @@ typedef struct {
MPID_Request *persistent_req; /* Unexpected request queue */
MPID_Request *conn_req; /* Connection request */
MPIDI_Comm_ops_t comm_ops;
size_t iov_limit; /* Max send iovec limit */
int rts_cts_in_flight;
int api_set;
} MPID_nem_ofi_global_t;
......@@ -80,13 +81,17 @@ typedef struct {
/*
 * Per-request state kept by the OFI netmod.  The REQ_OFI() macro overlays
 * this structure on the request's ch.netmod_area.padding storage.
 *
 * Each member is declared exactly once.  Sizes are size_t (not int) so that
 * packet lengths computed from header + payload sizes do not overflow a
 * 32-bit integer.
 */
typedef struct {
    context_t ofi_context;              /* Context Object */
    void *addr;                         /* OFI Address */
    event_callback_fn event_callback;   /* Callback Event */
    char *pack_buffer;                  /* MPI Pack Buffer; NULL when the
                                         * iovec send path is used instead */
    size_t pack_buffer_size;            /* Pack buffer size */
    size_t msg_bytes;                   /* msg api bytes: receive target of the
                                         * RTS fi_trecv, later read as the
                                         * incoming packet length */
    int iov_count;                      /* Number of iovecs */
    void *real_hdr;                     /* Extended header: heap copy of the
                                         * packet header (plus any extended
                                         * header) referenced by iov[] */
    int match_state;                    /* State of the match */
    int req_started;                    /* Request state */
    MPIDI_VC_t *vc;                     /* VC paired with this request */
    uint64_t tag;                       /* 64 bit tag request */
    struct iovec iov[3];                /* scatter gather list: at most
                                         * header, extended header, data */
    MPID_Request *parent;               /* Parent request */
} MPID_nem_ofi_req_t;
#define REQ_OFI(req) ((MPID_nem_ofi_req_t *)((req)->ch.netmod_area.padding))
......
......@@ -126,8 +126,8 @@ int MPID_nem_ofi_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_
&gl_data.fabric, /* Out: Fabric descriptor */
NULL), openfabric); /* Context: fabric events */
gl_data.iov_limit = prov_use->tx_attr->iov_limit;
gl_data.api_set = API_SET_1;
/* ------------------------------------------------------------------------ */
/* Create the access domain, which is the physical or virtual network or */
/* hardware port/collection of ports. Returns a domain object that can be */
......
......@@ -125,18 +125,36 @@ static int MPID_nem_ofi_data_callback(cq_tagged_entry_t * wc, MPID_Request * sre
switch (wc->tag & MPID_PROTOCOL_MASK) {
case MPID_MSG_CTS | MPID_MSG_RTS:
vc = REQ_OFI(sreq)->vc;
FI_RC_RETRY(fi_tsend(gl_data.endpoint,
REQ_OFI(sreq)->pack_buffer,
REQ_OFI(sreq)->pack_buffer_size,
gl_data.mr,
VC_OFI(vc)->direct_addr,
wc->tag | MPID_MSG_DATA, (void *) &(REQ_OFI(sreq)->ofi_context)), tsend);
if(REQ_OFI(sreq)->pack_buffer) {
FI_RC_RETRY(fi_tsend(gl_data.endpoint,
REQ_OFI(sreq)->pack_buffer,
REQ_OFI(sreq)->pack_buffer_size,
gl_data.mr,
VC_OFI(vc)->direct_addr,
wc->tag | MPID_MSG_DATA,
(void *) &(REQ_OFI(sreq)->ofi_context)), tsend);
} else {
struct fi_msg_tagged msg;
void *desc = NULL;
msg.msg_iov = REQ_OFI(sreq)->iov;
msg.desc = &desc;
msg.iov_count = REQ_OFI(sreq)->iov_count;
msg.addr = VC_OFI(vc)->direct_addr;
msg.tag = wc->tag | MPID_MSG_DATA,
msg.ignore = 0ULL;
msg.context = &(REQ_OFI(sreq)->ofi_context);
msg.data = 0ULL;
FI_RC_RETRY(fi_tsendmsg(gl_data.endpoint,&msg,0ULL),tsend);
}
MPIDI_CH3U_Request_complete(sreq);
break;
case MPID_MSG_CTS | MPID_MSG_RTS | MPID_MSG_DATA:
if (REQ_OFI(sreq)->pack_buffer)
MPIU_Free(REQ_OFI(sreq)->pack_buffer);
if (REQ_OFI(sreq)->real_hdr)
MPIU_Free(REQ_OFI(sreq)->real_hdr);
reqFn = sreq->dev.OnDataAvail;
if (!reqFn) {
MPIDI_CH3U_Request_complete(sreq);
......@@ -171,11 +189,7 @@ static int MPID_nem_ofi_cts_recv_callback(cq_tagged_entry_t * wc, MPID_Request *
/* ------------------------------------------------------------------------ */
/* The nemesis API implementations: */
/* These functions currently memory copy into a pack buffer before sending */
/* To improve performance, we can replace the memory copy with a non-contig */
/* send (using tsendmsg) */
/* For now, the memory copy is the simplest implementation of these */
/* functions over a tagged msg interface */
/* Use packing if iovecs are not supported by the OFI provider */
/* ------------------------------------------------------------------------ */
#undef FCNAME
#define FCNAME DECL_FUNC(MPID_nem_ofi_iSendContig)
......@@ -183,25 +197,54 @@ int MPID_nem_ofi_iSendContig(MPIDI_VC_t * vc,
MPID_Request * sreq,
void *hdr, MPIDI_msg_sz_t hdr_sz, void *data, MPIDI_msg_sz_t data_sz)
{
int pgid, c, pkt_len, mpi_errno = MPI_SUCCESS;
char *pack_buffer;
int pgid, c, mpi_errno = MPI_SUCCESS;
char *pack_buffer = NULL;
uint64_t match_bits;
MPID_Request *cts_req;
MPIDI_msg_sz_t buf_offset = 0;
size_t pkt_len;
BEGIN_FUNC(FCNAME);
MPIU_Assert(hdr_sz <= (MPIDI_msg_sz_t) sizeof(MPIDI_CH3_Pkt_t));
MPID_nem_ofi_init_req(sreq);
pkt_len = sizeof(MPIDI_CH3_Pkt_t) + sreq->dev.ext_hdr_sz + data_sz;
pack_buffer = MPIU_Malloc(pkt_len);
MPIU_Assert(pack_buffer);
MPIU_Memcpy(pack_buffer, hdr, hdr_sz);
buf_offset += sizeof(MPIDI_CH3_Pkt_t);
if (sreq->dev.ext_hdr_sz > 0) {
if (sreq->dev.ext_hdr_sz > 0 && gl_data.iov_limit > 2) {
REQ_OFI(sreq)->real_hdr = MPIU_Malloc(sizeof(MPIDI_CH3_Pkt_t)+sreq->dev.ext_hdr_sz);
MPIU_ERR_CHKANDJUMP1(REQ_OFI(sreq)->real_hdr == NULL, mpi_errno, MPI_ERR_OTHER,
"**nomem", "**nomem %s", "iSendContig extended header allocation");
REQ_OFI(sreq)->iov[0].iov_base = REQ_OFI(sreq)->real_hdr;
REQ_OFI(sreq)->iov[0].iov_len = hdr_sz;
REQ_OFI(sreq)->iov[1].iov_base = REQ_OFI(sreq)->real_hdr+sizeof(MPIDI_CH3_Pkt_t);
REQ_OFI(sreq)->iov[1].iov_len = sreq->dev.ext_hdr_sz;
REQ_OFI(sreq)->iov[2].iov_base = data;
REQ_OFI(sreq)->iov[2].iov_len = data_sz;
REQ_OFI(sreq)->iov_count = 3;
MPIU_Memcpy(REQ_OFI(sreq)->real_hdr, hdr, hdr_sz);
MPIU_Memcpy(REQ_OFI(sreq)->real_hdr + sizeof(MPIDI_CH3_Pkt_t),
sreq->dev.ext_hdr_ptr, sreq->dev.ext_hdr_sz);
}
else if(sreq->dev.ext_hdr_sz == 0 && gl_data.iov_limit > 1) {
REQ_OFI(sreq)->real_hdr = MPIU_Malloc(sizeof(MPIDI_CH3_Pkt_t));
MPIU_ERR_CHKANDJUMP1(REQ_OFI(sreq)->real_hdr == NULL, mpi_errno, MPI_ERR_OTHER,
"**nomem", "**nomem %s", "iSendContig header allocation");
MPIU_Memcpy(REQ_OFI(sreq)->real_hdr, hdr, hdr_sz);
REQ_OFI(sreq)->iov[0].iov_base = REQ_OFI(sreq)->real_hdr;
REQ_OFI(sreq)->iov[0].iov_len = sizeof(MPIDI_CH3_Pkt_t);
REQ_OFI(sreq)->iov[1].iov_base = data;
REQ_OFI(sreq)->iov[1].iov_len = data_sz;
REQ_OFI(sreq)->iov_count = 2;
}
else {
pack_buffer = MPIU_Malloc(pkt_len);
MPIU_ERR_CHKANDJUMP1(pack_buffer == NULL, mpi_errno, MPI_ERR_OTHER,
"**nomem", "**nomem %s", "iSendContig pack buffer allocation");
MPIU_Memcpy(pack_buffer, hdr, hdr_sz);
buf_offset += sizeof(MPIDI_CH3_Pkt_t);
if (sreq->dev.ext_hdr_sz > 0) {
MPIU_Memcpy(pack_buffer + buf_offset, sreq->dev.ext_hdr_ptr, sreq->dev.ext_hdr_sz);
buf_offset += sreq->dev.ext_hdr_sz;
}
MPIU_Memcpy(pack_buffer + buf_offset, data, data_sz);
}
MPIU_Memcpy(pack_buffer + buf_offset, data, data_sz);
START_COMM();
END_FUNC_RC(FCNAME);
}
......@@ -211,23 +254,25 @@ int MPID_nem_ofi_iSendContig(MPIDI_VC_t * vc,
int MPID_nem_ofi_SendNoncontig(MPIDI_VC_t * vc,
MPID_Request * sreq, void *hdr, MPIDI_msg_sz_t hdr_sz)
{
int c, pgid, pkt_len, mpi_errno = MPI_SUCCESS;
int c, pgid, mpi_errno = MPI_SUCCESS;
char *pack_buffer;
MPI_Aint data_sz;
uint64_t match_bits;
MPID_Request *cts_req;
MPIDI_msg_sz_t first, last;
MPIDI_msg_sz_t buf_offset = 0;
void *data = NULL;
size_t pkt_len;
BEGIN_FUNC(FCNAME);
MPIU_Assert(hdr_sz <= (MPIDI_msg_sz_t) sizeof(MPIDI_CH3_Pkt_t));
MPID_nem_ofi_init_req(sreq);
first = sreq->dev.segment_first;
last = sreq->dev.segment_size;
data_sz = sreq->dev.segment_size - sreq->dev.segment_first;
pkt_len = sizeof(MPIDI_CH3_Pkt_t) + sreq->dev.ext_hdr_sz + data_sz;
pack_buffer = MPIU_Malloc(pkt_len);
MPIU_Assert(pack_buffer);
MPIU_ERR_CHKANDJUMP1(pack_buffer == NULL, mpi_errno, MPI_ERR_OTHER,
"**nomem", "**nomem %s", "SendNonContig pack buffer allocation");
MPIU_Memcpy(pack_buffer, hdr, hdr_sz);
buf_offset += sizeof(MPIDI_CH3_Pkt_t);
if (sreq->dev.ext_hdr_sz > 0) {
......@@ -247,11 +292,12 @@ int MPID_nem_ofi_iStartContigMsg(MPIDI_VC_t * vc,
MPIDI_msg_sz_t hdr_sz,
void *data, MPIDI_msg_sz_t data_sz, MPID_Request ** sreq_ptr)
{
int pkt_len, c, pgid, mpi_errno = MPI_SUCCESS;
int c, pgid, mpi_errno = MPI_SUCCESS;
MPID_Request *sreq;
MPID_Request *cts_req;
char *pack_buffer;
char *pack_buffer = NULL;
uint64_t match_bits;
size_t pkt_len;
BEGIN_FUNC(FCNAME);
MPIU_Assert(hdr_sz <= (MPIDI_msg_sz_t) sizeof(MPIDI_CH3_Pkt_t));
......@@ -260,11 +306,23 @@ int MPID_nem_ofi_iStartContigMsg(MPIDI_VC_t * vc,
sreq->dev.OnDataAvail = NULL;
sreq->dev.next = NULL;
pkt_len = sizeof(MPIDI_CH3_Pkt_t) + data_sz;
pack_buffer = MPIU_Malloc(pkt_len);
MPIU_Assert(pack_buffer);
MPIU_Memcpy((void *) pack_buffer, hdr, hdr_sz);
if (data_sz)
if(gl_data.iov_limit > 1) {
REQ_OFI(sreq)->real_hdr = MPIU_Malloc(sizeof(MPIDI_CH3_Pkt_t));
MPIU_Memcpy(REQ_OFI(sreq)->real_hdr, hdr, hdr_sz);
REQ_OFI(sreq)->iov[0].iov_base = REQ_OFI(sreq)->real_hdr;
REQ_OFI(sreq)->iov[0].iov_len = sizeof(MPIDI_CH3_Pkt_t);
REQ_OFI(sreq)->iov[1].iov_base = data;
REQ_OFI(sreq)->iov[1].iov_len = data_sz;
REQ_OFI(sreq)->iov_count = 2;
}
else {
pack_buffer = MPIU_Malloc(pkt_len);
MPIU_ERR_CHKANDJUMP1(pack_buffer == NULL, mpi_errno, MPI_ERR_OTHER,
"**nomem", "**nomem %s", "iStartContig pack buffer allocation");
MPIU_Memcpy((void *) pack_buffer, hdr, hdr_sz);
if (data_sz)
MPIU_Memcpy((void *) (pack_buffer + sizeof(MPIDI_CH3_Pkt_t)), data, data_sz);
}
START_COMM();
*sreq_ptr = sreq;
END_FUNC_RC(FCNAME);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment