/*
 *  (C) 2006 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 *
 *  Portions of this code were written by Intel Corporation.
 *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
 *  to Argonne National Laboratory subject to Software Grant and Corporate
 *  Contributor License Agreement dated February 8, 2012.
 */
#include "ofi_impl.h"

/* ------------------------------------------------------------------------ */
/* GET_PGID_AND_SET_MATCH macro looks up the process group to find the      */
/* correct rank in multiple process groups.  The "contigmsg" family of apis */
/* work on a global scope, not on a communicator scope(like tagged MPI.)    */
/* The pgid matching is used for uniquely scoping the tag, usually in       */
/* intercomms and dynamic process management where there are multiple       */
/* global world spaces with similar ranks in the global space               */
/* ------------------------------------------------------------------------ */
/* The macro takes no arguments; it reads and writes names that must exist  */
/* in the expanding scope:                                                  */
/*   vc         (read)    - connection whose pg / port_name_tag are used    */
/*   pgid       (written) - set through MPIDI_PG_IdToNum() when vc->pg is   */
/*                          non-NULL, otherwise set to NO_PGID              */
/*   match_bits (written) - tag bits for the RTS message                    */
/* match_bits layout as built below:                                        */
/*   - API_SET_1 only: pgid shifted by MPID_PGID_SHIFT (otherwise 0)        */
/*   - NO_PGID: vc->port_name_tag shifted by MPID_PORT_SHIFT                */
/*   - otherwise: this process's comm_world rank shifted by                 */
/*     MPID_PSOURCE_SHIFT                                                   */
/*   - MPID_MSG_RTS is always OR'ed in last                                 */
#define GET_PGID_AND_SET_MATCH()                                        \
({                                                                      \
  if (vc->pg) {                                                         \
    MPIDI_PG_IdToNum(gl_data.pg_p, &pgid);                              \
  } else {                                                              \
    pgid = NO_PGID;                                                     \
  }                                                                     \
  if (gl_data.api_set == API_SET_1){                                    \
      match_bits = ((uint64_t)pgid << MPID_PGID_SHIFT);                 \
  }else{                                                                \
      match_bits = 0;                                                   \
  }                                                                     \
  if (NO_PGID == pgid) {                                                \
    match_bits |= (uint64_t)vc->port_name_tag<<                         \
        (MPID_PORT_SHIFT);                                              \
  }else{                                                                \
      match_bits |= (uint64_t)MPIR_Process.comm_world->rank <<          \
          (MPID_PSOURCE_SHIFT);                                         \
  }                                                                     \
  match_bits |= MPID_MSG_RTS;                                           \
})

/* ------------------------------------------------------------------------ */
/* START_COMM is common code used by the nemesis netmod functions:          */
/* iSendContig                                                              */
/* SendNoncontig                                                            */
/* iStartContigMsg                                                          */
/* These routines differ slightly in their behaviors, but can share common  */
/* code to perform the send.  START_COMM provides that common code, which   */
/* is based on a tagged rendezvous message.                                 */
/* The rendezvous is implemented with an RTS-CTS-Data send protocol:        */
/* CTS_POST()   |                                  |                        */
/* RTS_SEND()   | -------------------------------> | ue_callback()(ofi_cm.c)*/
/*              |                                  |   pack_buffer()        */
/*              |                                  |   DATA_POST()          */
/*              |                                  |   RTS_POST()           */
/*              |                                  |   CTS_SEND()           */
/* CTS_MATCH()  | <------------------------------- |                        */
/* DATA_SEND()  | ===============================> | handle_packet()        */
/*              |                                  |   notify_ch3_pkt()     */
/*              v                                  v                        */
/* ------------------------------------------------------------------------ */
/* START_COMM takes no arguments; the expanding function must declare:      */
/*   vc, sreq      - connection and send request (sreq must be valid)       */
/*   pack_buffer   - heap buffer holding header + payload                   */
/*   pkt_len       - size of pack_buffer in bytes                           */
/*   c, pgid, match_bits, cts_req - scratch variables written here          */
/* Steps performed, in order:                                               */
/*   1. count one more RTS/CTS exchange in gl_data.rts_cts_in_flight        */
/*   2. compute pgid / match_bits and run VC_READY_CHECK on the connection  */
/*   3. increment sreq's completion counter twice (c is passed by address   */
/*      to MPID_cc_incr)                                                    */
/*   4. record callback, pack buffer, size, vc and tag in REQ_OFI(sreq)     */
/*   5. create cts_req, link it to sreq through ->parent, and post a        */
/*      zero-length tagged receive for match_bits | MPID_MSG_CTS            */
/*   6. send the RTS: the payload is REQ_OFI(sreq)->pack_buffer_size, sent  */
/*      with fi_tsend for API_SET_1, or fi_tsenddata carrying pgid as the   */
/*      remote data otherwise                                               */
/* FI_RC wraps each libfabric call; its failure handling is defined         */
/* outside this file section.                                               */
#define START_COMM()                                                    \
    ({                                                                  \
        gl_data.rts_cts_in_flight++;                                    \
        GET_PGID_AND_SET_MATCH();                                       \
        VC_READY_CHECK(vc);                                             \
        c = 1;                                                          \
        MPID_cc_incr(sreq->cc_ptr, &c);                                 \
        MPID_cc_incr(sreq->cc_ptr, &c);                                 \
        REQ_OFI(sreq)->event_callback   = MPID_nem_ofi_data_callback;   \
        REQ_OFI(sreq)->pack_buffer      = pack_buffer;                  \
        REQ_OFI(sreq)->pack_buffer_size = pkt_len;                      \
        REQ_OFI(sreq)->vc               = vc;                           \
        REQ_OFI(sreq)->tag              = match_bits;                   \
                                                                        \
        MPID_nem_ofi_create_req(&cts_req, 1);                           \
        cts_req->dev.OnDataAvail         = NULL;                        \
        cts_req->dev.next                = NULL;                        \
        REQ_OFI(cts_req)->event_callback = MPID_nem_ofi_cts_recv_callback; \
        REQ_OFI(cts_req)->parent         = sreq;                        \
                                                                        \
        FI_RC(fi_trecv(gl_data.endpoint,                                \
                       NULL,                                            \
                       0,                                               \
                       gl_data.mr,                                      \
                       VC_OFI(vc)->direct_addr,                         \
                       match_bits | MPID_MSG_CTS,                       \
                       0, /* Exact tag match, no ignore bits */         \
                       &(REQ_OFI(cts_req)->ofi_context)),trecv);        \
        if (gl_data.api_set == API_SET_1){                              \
            FI_RC(fi_tsend(gl_data.endpoint,                            \
                           &REQ_OFI(sreq)->pack_buffer_size,            \
                           sizeof(REQ_OFI(sreq)->pack_buffer_size),     \
                           gl_data.mr,                                  \
                           VC_OFI(vc)->direct_addr,                     \
                           match_bits,                                  \
                           &(REQ_OFI(sreq)->ofi_context)),tsend);       \
        }else{                                                          \
            FI_RC(fi_tsenddata(gl_data.endpoint,                        \
                               &REQ_OFI(sreq)->pack_buffer_size,        \
                               sizeof(REQ_OFI(sreq)->pack_buffer_size), \
                               gl_data.mr,                              \
                               pgid,                                    \
                               VC_OFI(vc)->direct_addr,                 \
                               match_bits,                              \
                               &(REQ_OFI(sreq)->ofi_context)),tsend);   \
        }                                                               \
    })


/* ------------------------------------------------------------------------ */
/* General handler for RTS-CTS-Data protocol.  Waits for the cc counter     */
/* to hit two (send RTS and receive CTS decrementers) before kicking off the*/
/* bulk data transfer.  On data send completion, the request can be freed   */
/* ------------------------------------------------------------------------ */
/* Event callback installed on the send request by START_COMM.              */
/*   wc   - completion queue entry for the event (not referenced in the     */
/*          body as it appears here)                                        */
/*   sreq - the send request whose REQ_OFI() area carries the pack buffer,  */
/*          its size and the vc                                             */
/* NOTE(review): this block looks damaged. The bare integer lines are       */
/* line-number residue, and the `break;` / `case` labels below sit inside   */
/* an `if` with no enclosing `switch` in view, so the statement that        */
/* dispatches between the three branches appears to be missing. Restore it  */
/* from the repository copy before relying on this function; the code is    */
/* deliberately left untouched here.                                        */
#undef FCNAME
117
118
#define FCNAME DECL_FUNC(MPID_nem_ofi_data_callback)
static int MPID_nem_ofi_data_callback(cq_tagged_entry_t * wc, MPID_Request * sreq)
119
120
121
122
123
124
{
    int complete = 0, mpi_errno = MPI_SUCCESS;
    MPIDI_VC_t *vc;
    req_fn reqFn;
    uint64_t tag = 0;
    BEGIN_FUNC(FCNAME);
125
    /* Branch 1: completion counter reads 2 -> send the packed buffer. */
    if (MPID_cc_get(sreq->cc) == 2) {
126
127
        vc = REQ_OFI(sreq)->vc;
        /* tag is still 0 here, so the stored tag becomes MPID_MSG_DATA. */
        REQ_OFI(sreq)->tag = tag | MPID_MSG_DATA;
128
        FI_RC(fi_tsend(gl_data.endpoint,
129
130
131
132
133
134
135
                       REQ_OFI(sreq)->pack_buffer,
                       REQ_OFI(sreq)->pack_buffer_size,
                       gl_data.mr,
                       VC_OFI(vc)->direct_addr,
                       MPID_MSG_DATA, (void *) &(REQ_OFI(sreq)->ofi_context)), tsend);
        break;
    /* Branch 2: release the pack buffer, then either complete the request */
    /* or hand it to the request's OnDataAvail handler.                    */
    case MPID_MSG_DATA:
136
137
        if (REQ_OFI(sreq)->pack_buffer)
            MPIU_Free(REQ_OFI(sreq)->pack_buffer);
138
139
140
141
142
143

        reqFn = sreq->dev.OnDataAvail;
        if (!reqFn) {
            MPIDI_CH3U_Request_complete(sreq);
        }
        else {
144
            vc = REQ_OFI(sreq)->vc;
145
146
            MPI_RC(reqFn(vc, sreq, &complete));
        }
147
148
149
        /* Balances the increment done in START_COMM. */
        gl_data.rts_cts_in_flight--;
        break;
    /* Branch 3: only signals completion on the request. */
    case MPID_MSG_RTS:
150
151
152
153
154
155
        MPIDI_CH3U_Request_complete(sreq);
    }
    END_FUNC_RC(FCNAME);
}

/* ------------------------------------------------------------------------ */
/* Signals the CTS has been received.  Call MPID_nem_ofi_data_callback on   */
/* the parent send request to kick off the bulk data transfer               */
/* ------------------------------------------------------------------------ */
#undef FCNAME
160
161
#define FCNAME DECL_FUNC(MPID_nem_ofi_cts_recv_callback)
static int MPID_nem_ofi_cts_recv_callback(cq_tagged_entry_t * wc, MPID_Request * rreq)
162
163
164
{
    int mpi_errno = MPI_SUCCESS;
    BEGIN_FUNC(FCNAME);
165
    MPI_RC(MPID_nem_ofi_data_callback(wc, REQ_OFI(rreq)->parent));
166
167
168
169
170
171
172
173
174
175
176
177
178
    MPIDI_CH3U_Request_complete(rreq);
    END_FUNC_RC(FCNAME);
}

/* ------------------------------------------------------------------------ */
/* The nemesis API implementations:                                         */
/* These functions currently memory copy into a pack buffer before sending  */
/* To improve performance, we can replace the memory copy with a non-contig */
/* send (using tsendmsg)                                                    */
/* For now, the memory copy is the simplest implementation of these         */
/* functions over a tagged msg interface                                    */
/* ------------------------------------------------------------------------ */
#undef FCNAME
179
180
#define FCNAME DECL_FUNC(MPID_nem_ofi_iSendContig)
int MPID_nem_ofi_iSendContig(MPIDI_VC_t * vc,
181
182
183
184
185
186
187
188
189
190
                             MPID_Request * sreq,
                             void *hdr, MPIDI_msg_sz_t hdr_sz, void *data, MPIDI_msg_sz_t data_sz)
{
    int pgid, c, pkt_len, mpi_errno = MPI_SUCCESS;
    char *pack_buffer;
    uint64_t match_bits;
    MPID_Request *cts_req;

    BEGIN_FUNC(FCNAME);
    MPIU_Assert(hdr_sz <= (MPIDI_msg_sz_t) sizeof(MPIDI_CH3_Pkt_t));
191
    MPID_nem_ofi_init_req(sreq);
192
193
194
195
196
197
198
199
200
201
    pkt_len = sizeof(MPIDI_CH3_Pkt_t) + data_sz;
    pack_buffer = MPIU_Malloc(pkt_len);
    MPIU_Assert(pack_buffer);
    MPIU_Memcpy(pack_buffer, hdr, hdr_sz);
    MPIU_Memcpy(pack_buffer + sizeof(MPIDI_CH3_Pkt_t), data, data_sz);
    START_COMM();
    END_FUNC_RC(FCNAME);
}

#undef FCNAME
202
203
#define FCNAME DECL_FUNC(MPID_nem_ofi_SendNoncontig)
int MPID_nem_ofi_SendNoncontig(MPIDI_VC_t * vc,
204
205
206
207
208
209
210
                               MPID_Request * sreq, void *hdr, MPIDI_msg_sz_t hdr_sz)
{
    int c, pgid, pkt_len, mpi_errno = MPI_SUCCESS;
    char *pack_buffer;
    MPI_Aint data_sz;
    uint64_t match_bits;
    MPID_Request *cts_req;
211
    MPIDI_msg_sz_t first, last;
212
213
214
215

    BEGIN_FUNC(FCNAME);
    MPIU_Assert(hdr_sz <= (MPIDI_msg_sz_t) sizeof(MPIDI_CH3_Pkt_t));

216
217
218
    first = sreq->dev.segment_first;
    last = sreq->dev.segment_size;
    data_sz = sreq->dev.segment_size - sreq->dev.segment_first;
219
220
221
222
    pkt_len = sizeof(MPIDI_CH3_Pkt_t) + data_sz;
    pack_buffer = MPIU_Malloc(pkt_len);
    MPIU_Assert(pack_buffer);
    MPIU_Memcpy(pack_buffer, hdr, hdr_sz);
223
    MPID_Segment_pack(sreq->dev.segment_ptr, first, &last, pack_buffer + sizeof(MPIDI_CH3_Pkt_t));
224
    START_COMM();
225
    MPID_nem_ofi_poll(MPID_NONBLOCKING_POLL);
226
227
228
229
    END_FUNC_RC(FCNAME);
}

#undef FCNAME
230
231
#define FCNAME DECL_FUNC(MPID_nem_ofi_iStartContigMsg)
int MPID_nem_ofi_iStartContigMsg(MPIDI_VC_t * vc,
232
233
234
235
236
237
238
239
240
241
242
243
                                 void *hdr,
                                 MPIDI_msg_sz_t hdr_sz,
                                 void *data, MPIDI_msg_sz_t data_sz, MPID_Request ** sreq_ptr)
{
    int pkt_len, c, pgid, mpi_errno = MPI_SUCCESS;
    MPID_Request *sreq;
    MPID_Request *cts_req;
    char *pack_buffer;
    uint64_t match_bits;
    BEGIN_FUNC(FCNAME);
    MPIU_Assert(hdr_sz <= (MPIDI_msg_sz_t) sizeof(MPIDI_CH3_Pkt_t));

244
    MPID_nem_ofi_create_req(&sreq, 2);
245
246
247
248
249
250
251
252
253
254
255
256
257
    sreq->kind = MPID_REQUEST_SEND;
    sreq->dev.OnDataAvail = NULL;
    sreq->dev.next = NULL;
    pkt_len = sizeof(MPIDI_CH3_Pkt_t) + data_sz;
    pack_buffer = MPIU_Malloc(pkt_len);
    MPIU_Assert(pack_buffer);
    MPIU_Memcpy((void *) pack_buffer, hdr, hdr_sz);
    if (data_sz)
        MPIU_Memcpy((void *) (pack_buffer + sizeof(MPIDI_CH3_Pkt_t)), data, data_sz);
    START_COMM();
    *sreq_ptr = sreq;
    END_FUNC_RC(FCNAME);
}