/*
 *  (C) 2006 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 *
 *  Portions of this code were written by Intel Corporation.
 *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
 *  to Argonne National Laboratory subject to Software Grant and Corporate
 *  Contributor License Agreement dated February 8, 2012.
 */
#include "ofi_impl.h"

/* ------------------------------------------------------------------------ */
/* GET_PGID_AND_SET_MATCH computes the tag bits of an RTS message.          */
/*                                                                          */
/* The expanding function must have `vc`, `pgid` and `match_bits` in scope. */
/*   - pgid is NO_PGID when the VC has no process group; otherwise it is    */
/*     filled in by MPIDI_PG_IdToNum() from gl_data.pg_p.                   */
/*   - match_bits holds this process' comm_world rank shifted by            */
/*     MPID_PORT_SHIFT, the VC's port_name_tag (only when pgid is 0)        */
/*     shifted by MPID_PORT_SHIFT+MPID_PSOURCE_SHIFT, the pgid itself and   */
/*     the MPID_MSG_RTS protocol bit.                                       */
/*                                                                          */
/* Why: the "contigmsg" APIs operate on a global scope rather than on a     */
/* communicator scope (unlike tagged MPI), so the process group id is       */
/* folded into the tag to keep it unique when several global world spaces   */
/* with similar ranks coexist (intercomms, dynamic process management).     */
/* ------------------------------------------------------------------------ */
#define GET_PGID_AND_SET_MATCH()                                        \
do {                                                                    \
  if (!vc->pg) {                                                        \
    pgid = NO_PGID;                                                     \
  } else {                                                              \
    MPIDI_PG_IdToNum(gl_data.pg_p, &pgid);                              \
  }                                                                     \
  /* Source rank occupies the bits above MPID_PORT_SHIFT. */            \
  match_bits = (uint64_t)MPIR_Process.comm_world->rank                  \
               << (MPID_PORT_SHIFT);                                    \
  /* The port name tag is only mixed in when pgid is zero. */           \
  if (pgid == 0) {                                                      \
    match_bits |= (uint64_t)vc->port_name_tag                           \
                  << (MPID_PORT_SHIFT + MPID_PSOURCE_SHIFT);            \
  }                                                                     \
  match_bits |= pgid;                                                   \
  match_bits |= MPID_MSG_RTS;                                           \
} while (0)

/* ------------------------------------------------------------------------ */
/* START_COMM is common code used by the nemesis netmod functions:          */
/* iSendContig                                                              */
/* SendNoncontig                                                            */
/* iStartContigMsg                                                          */
/* These routines differ slightly in their behaviors, but can share common  */
/* code to perform the send.  START_COMM provides that common code, which   */
/* is based on a tagged rendezvous message.                                 */
/* The rendezvous is implemented with an RTS-CTS-Data send protocol:        */
/* CTS_POST()   |                                  |                        */
/* RTS_SEND()   | -------------------------------> | ue_callback()(ofi_cm.c)*/
/*              |                                  |   pack_buffer()        */
/*              |                                  |   DATA_POST()          */
/*              |                                  |   RTS_POST()           */
/*              |                                  |   CTS_SEND()           */
/* CTS_MATCH()  | <------------------------------- |                        */
/* DATA_SEND()  | ===============================> | handle_packet()        */
/*              |                                  |   notify_ch3_pkt()     */
/*              v                                  v                        */
/*                                                                          */
/* The macro reads or writes these names, which must be in scope where it   */
/* is expanded: vc, sreq, pack_buffer, pkt_len, c, pgid, match_bits and     */
/* cts_req.  It records the pack buffer, its size, the VC and the tag in    */
/* the send request, creates a CTS request whose parent is sreq, posts the  */
/* zero-length tagged receive for the CTS, and then sends the RTS, whose    */
/* payload is the pack buffer size.                                         */
/* ------------------------------------------------------------------------ */
#define START_COMM()                                                    \
  ({                                                                    \
    GET_PGID_AND_SET_MATCH();                                           \
    VC_READY_CHECK(vc);                                                 \
    /* Two increments of sreq's completion counter; presumably one */   \
    /* for the RTS send event and one for the CTS receive event.   */   \
    c = 1;                                                              \
    MPID_cc_incr(sreq->cc_ptr, &c);                                     \
    MPID_cc_incr(sreq->cc_ptr, &c);                                     \
    REQ_OFI(sreq)->event_callback   = MPID_nem_ofi_data_callback;       \
    REQ_OFI(sreq)->pack_buffer      = pack_buffer;                      \
    REQ_OFI(sreq)->pack_buffer_size = pkt_len;                          \
    REQ_OFI(sreq)->vc               = vc;                               \
    REQ_OFI(sreq)->tag              = match_bits;                       \
                                                                        \
    MPID_nem_ofi_create_req(&cts_req, 1);                               \
    cts_req->dev.OnDataAvail         = NULL;                            \
    cts_req->dev.next                = NULL;                            \
    REQ_OFI(cts_req)->event_callback = MPID_nem_ofi_cts_recv_callback;  \
    REQ_OFI(cts_req)->parent         = sreq;                            \
                                                                        \
    /* Post the CTS receive before the RTS goes out. */                 \
    FI_RC(fi_trecv(gl_data.endpoint,                                    \
                       NULL,                                            \
                       0,                                               \
                       gl_data.mr,                                      \
                       VC_OFI(vc)->direct_addr,                         \
                       match_bits | MPID_MSG_CTS,                       \
                       0, /* Exact tag match, no ignore bits */         \
                       &(REQ_OFI(cts_req)->ofi_context)),trecv);        \
    /* RTS: the message body is the size of the packed payload. */      \
    FI_RC(fi_tsend(gl_data.endpoint,                                    \
                     &REQ_OFI(sreq)->pack_buffer_size,                  \
                     sizeof(REQ_OFI(sreq)->pack_buffer_size),           \
                     gl_data.mr,                                        \
                     VC_OFI(vc)->direct_addr,                           \
                     match_bits,                                        \
                     &(REQ_OFI(sreq)->ofi_context)),tsend);             \
  })


/* ------------------------------------------------------------------------ */
/* General handler for RTS-CTS-Data protocol.  Waits for the cc counter     */
/* to hit two (send RTS and receive CTS decrementers) before kicking off the*/
/* bulk data transfer.  On data send completion, the request can be freed   */
/* ------------------------------------------------------------------------ */
#undef FCNAME
100
101
#define FCNAME DECL_FUNC(MPID_nem_ofi_data_callback)
static int MPID_nem_ofi_data_callback(cq_tagged_entry_t * wc, MPID_Request * sreq)
102
103
104
105
106
107
108
{
    int complete = 0, mpi_errno = MPI_SUCCESS;
    MPIDI_VC_t *vc;
    req_fn reqFn;
    uint64_t tag = 0;
    BEGIN_FUNC(FCNAME);
    if (sreq->cc == 2) {
109
110
        vc = REQ_OFI(sreq)->vc;
        REQ_OFI(sreq)->tag = tag | MPID_MSG_DATA;
111
        FI_RC(fi_tsend(gl_data.endpoint,
112
113
                         REQ_OFI(sreq)->pack_buffer,
                         REQ_OFI(sreq)->pack_buffer_size,
114
                         gl_data.mr,
115
116
                         VC_OFI(vc)->direct_addr,
                         wc->tag | MPID_MSG_DATA, (void *) &(REQ_OFI(sreq)->ofi_context)), tsend);
117
118
    }
    if (sreq->cc == 1) {
119
120
        if (REQ_OFI(sreq)->pack_buffer)
            MPIU_Free(REQ_OFI(sreq)->pack_buffer);
121
122
123
124
125
126

        reqFn = sreq->dev.OnDataAvail;
        if (!reqFn) {
            MPIDI_CH3U_Request_complete(sreq);
        }
        else {
127
            vc = REQ_OFI(sreq)->vc;
128
129
130
131
132
133
134
135
136
137
            MPI_RC(reqFn(vc, sreq, &complete));
        }
    }
    else {
        MPIDI_CH3U_Request_complete(sreq);
    }
    END_FUNC_RC(FCNAME);
}

/* ------------------------------------------------------------------------ */
138
/* Signals the CTS has been received.  Call MPID_nem_ofi_data_callback on   */
139
140
141
/* the parent send request to kick off the bulk data transfer               */
/* ------------------------------------------------------------------------ */
#undef FCNAME
142
143
#define FCNAME DECL_FUNC(MPID_nem_ofi_cts_recv_callback)
static int MPID_nem_ofi_cts_recv_callback(cq_tagged_entry_t * wc, MPID_Request * rreq)
144
145
146
{
    int mpi_errno = MPI_SUCCESS;
    BEGIN_FUNC(FCNAME);
147
    MPI_RC(MPID_nem_ofi_data_callback(wc, REQ_OFI(rreq)->parent));
148
149
150
151
152
153
154
155
156
157
158
159
160
    MPIDI_CH3U_Request_complete(rreq);
    END_FUNC_RC(FCNAME);
}

/* ------------------------------------------------------------------------ */
/* The nemesis API implementations:                                         */
/* These functions currently memory copy into a pack buffer before sending  */
/* To improve performance, we can replace the memory copy with a non-contig */
/* send (using tsendmsg)                                                    */
/* For now, the memory copy is the simplest implementation of these         */
/* functions over a tagged msg interface                                    */
/* ------------------------------------------------------------------------ */
#undef FCNAME
161
162
#define FCNAME DECL_FUNC(MPID_nem_ofi_iSendContig)
int MPID_nem_ofi_iSendContig(MPIDI_VC_t * vc,
163
164
165
166
167
168
169
170
171
172
                             MPID_Request * sreq,
                             void *hdr, MPIDI_msg_sz_t hdr_sz, void *data, MPIDI_msg_sz_t data_sz)
{
    int pgid, c, pkt_len, mpi_errno = MPI_SUCCESS;
    char *pack_buffer;
    uint64_t match_bits;
    MPID_Request *cts_req;

    BEGIN_FUNC(FCNAME);
    MPIU_Assert(hdr_sz <= (MPIDI_msg_sz_t) sizeof(MPIDI_CH3_Pkt_t));
173
    MPID_nem_ofi_init_req(sreq);
174
175
176
177
178
179
180
181
182
183
    pkt_len = sizeof(MPIDI_CH3_Pkt_t) + data_sz;
    pack_buffer = MPIU_Malloc(pkt_len);
    MPIU_Assert(pack_buffer);
    MPIU_Memcpy(pack_buffer, hdr, hdr_sz);
    MPIU_Memcpy(pack_buffer + sizeof(MPIDI_CH3_Pkt_t), data, data_sz);
    START_COMM();
    END_FUNC_RC(FCNAME);
}

#undef FCNAME
184
185
#define FCNAME DECL_FUNC(MPID_nem_ofi_SendNoncontig)
int MPID_nem_ofi_SendNoncontig(MPIDI_VC_t * vc,
186
187
188
189
190
191
192
                               MPID_Request * sreq, void *hdr, MPIDI_msg_sz_t hdr_sz)
{
    int c, pgid, pkt_len, mpi_errno = MPI_SUCCESS;
    char *pack_buffer;
    MPI_Aint data_sz;
    uint64_t match_bits;
    MPID_Request *cts_req;
193
    MPIDI_msg_sz_t first, last;
194
195
196
197

    BEGIN_FUNC(FCNAME);
    MPIU_Assert(hdr_sz <= (MPIDI_msg_sz_t) sizeof(MPIDI_CH3_Pkt_t));

198
199
200
    first = sreq->dev.segment_first;
    last = sreq->dev.segment_size;
    data_sz = sreq->dev.segment_size - sreq->dev.segment_first;
201
202
203
204
    pkt_len = sizeof(MPIDI_CH3_Pkt_t) + data_sz;
    pack_buffer = MPIU_Malloc(pkt_len);
    MPIU_Assert(pack_buffer);
    MPIU_Memcpy(pack_buffer, hdr, hdr_sz);
205
    MPID_Segment_pack(sreq->dev.segment_ptr, first, &last, pack_buffer + sizeof(MPIDI_CH3_Pkt_t));
206
    START_COMM();
207
    MPID_nem_ofi_poll(MPID_NONBLOCKING_POLL);
208
209
210
211
    END_FUNC_RC(FCNAME);
}

#undef FCNAME
212
213
#define FCNAME DECL_FUNC(MPID_nem_ofi_iStartContigMsg)
int MPID_nem_ofi_iStartContigMsg(MPIDI_VC_t * vc,
214
215
216
217
218
219
220
221
222
223
224
225
                                 void *hdr,
                                 MPIDI_msg_sz_t hdr_sz,
                                 void *data, MPIDI_msg_sz_t data_sz, MPID_Request ** sreq_ptr)
{
    int pkt_len, c, pgid, mpi_errno = MPI_SUCCESS;
    MPID_Request *sreq;
    MPID_Request *cts_req;
    char *pack_buffer;
    uint64_t match_bits;
    BEGIN_FUNC(FCNAME);
    MPIU_Assert(hdr_sz <= (MPIDI_msg_sz_t) sizeof(MPIDI_CH3_Pkt_t));

226
    MPID_nem_ofi_create_req(&sreq, 2);
227
228
229
230
231
232
233
234
235
236
237
238
239
    sreq->kind = MPID_REQUEST_SEND;
    sreq->dev.OnDataAvail = NULL;
    sreq->dev.next = NULL;
    pkt_len = sizeof(MPIDI_CH3_Pkt_t) + data_sz;
    pack_buffer = MPIU_Malloc(pkt_len);
    MPIU_Assert(pack_buffer);
    MPIU_Memcpy((void *) pack_buffer, hdr, hdr_sz);
    if (data_sz)
        MPIU_Memcpy((void *) (pack_buffer + sizeof(MPIDI_CH3_Pkt_t)), data, data_sz);
    START_COMM();
    *sreq_ptr = sreq;
    END_FUNC_RC(FCNAME);
}