/*
 *  (C) 2006 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 *
 *  Portions of this code were written by Intel Corporation.
 *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
 *  to Argonne National Laboratory subject to Software Grant and Corporate
 *  Contributor License Agreement dated February 8, 2012.
 */
#ifndef OFI_IMPL_H
#define OFI_IMPL_H
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31

#include "mpid_nem_impl.h"
#include "mpihandlemem.h"
#include "pmi.h"
#include <rdma/fabric.h>
#include <rdma/fi_errno.h>
#include <rdma/fi_endpoint.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_tagged.h>
#include <rdma/fi_cm.h>
#include <netdb.h>

/* ************************************************************************** */
/* Type Definitions                                                           */
/* ************************************************************************** */
/* Short aliases for the system and libfabric (fi_*) structures used below. */
typedef struct iovec iovec_t;
typedef struct fi_info info_t;
typedef struct fi_cq_attr cq_attr_t;
typedef struct fi_av_attr av_attr_t;
typedef struct fi_domain_attr domain_attr_t;
typedef struct fi_ep_attr ep_attr_t;
typedef struct fi_tx_attr tx_attr_t;
typedef struct fi_cq_tagged_entry cq_tagged_entry_t;
typedef struct fi_cq_err_entry cq_err_entry_t;
typedef struct fi_context context_t;

/* Completion callback: receives a tagged completion-queue entry and a request,
 * and returns an int (presumably an MPI error code -- confirm with callers). */
typedef int (*event_callback_fn) (cq_tagged_entry_t * wc, MPID_Request *);

/* Request handler taking a VC, a request and an int out-parameter. */
typedef int (*req_fn) (MPIDI_VC_t *, MPID_Request *, int *);

/* ******************************** */
/* Global Object for state tracking */
/* ******************************** */
typedef struct {
44
    char bound_addr[128];       /* This ranks bound address    */
45
46
47
48
49
50
51
52
53
54
55
56
57
    fi_addr_t any_addr;         /* Specifies any source        */
    size_t bound_addrlen;       /* length of the bound address */
    struct fid_fabric *fabric;  /* fabric object               */
    struct fid_domain *domain;  /* domain object               */
    struct fid_ep *endpoint;    /* endpoint object             */
    struct fid_cq *cq;          /* completion queue            */
    struct fid_av *av;          /* address vector              */
    struct fid_mr *mr;          /* memory region               */
    MPIDI_PG_t *pg_p;           /* MPI Process group           */
    MPIDI_VC_t *cm_vcs;         /* temporary VC's              */
    MPID_Request *persistent_req;       /* Unexpected request queue    */
    MPID_Request *conn_req;     /* Connection request          */
    MPIDI_Comm_ops_t comm_ops;
58
} MPID_nem_ofi_global_t;
59
60
61
62
63
64

/* ******************************** */
/* Device channel specific data     */
/* This is per destination          */
/* ******************************** */
typedef struct {
65
    fi_addr_t direct_addr;      /* Remote OFI address */
66
67
68
    int ready;                  /* VC ready state     */
    int is_cmvc;                /* Cleanup VC         */
    MPIDI_VC_t *next;           /* VC queue           */
69
} MPID_nem_ofi_vc_t;
70
/* View the netmod scratch area (ch.netmod_area.padding) of VC `vc` as this
 * netmod's per-VC state. The argument is parenthesized so that expressions
 * such as `base + i` or `&some_vc` expand correctly. */
#define VC_OFI(vc) ((MPID_nem_ofi_vc_t *)((vc)->ch.netmod_area.padding))

/* ******************************** */
/* Per request object data          */
/* OFI/Netmod specific              */
/* ******************************** */
typedef struct {
77
    context_t ofi_context;      /* Context Object              */
78
    void *addr;                 /* OFI Address                 */
79
80
81
82
83
84
85
86
    event_callback_fn event_callback;   /* Callback Event              */
    char *pack_buffer;          /* MPI Pack Buffer             */
    int pack_buffer_size;       /* Pack buffer size            */
    int match_state;            /* State of the match          */
    int req_started;            /* Request state               */
    MPIDI_VC_t *vc;             /* VC paired with this request */
    uint64_t tag;               /* 64 bit tag request          */
    MPID_Request *parent;       /* Parent request              */
87
} MPID_nem_ofi_req_t;
88
/* View the netmod scratch area (ch.netmod_area.padding) of request `req_`
 * as this netmod's per-request state, MPID_nem_ofi_req_t. */
#define REQ_OFI(req_) \
    ((MPID_nem_ofi_req_t *) ((req_)->ch.netmod_area.padding))

/* ******************************** */
/* Logging and function macros      */
/* ******************************** */
/* FUNCNAME is reset to a placeholder token; the macros below receive the
 * function identifier as an explicit argument instead. */
#undef FUNCNAME
#define FUNCNAME nothing

/* Function prologue: expands MPIDI_STATE_DECL followed by MPIDI_FUNC_ENTER
 * for `fn_`. The expansion already ends in ';'. */
#define BEGIN_FUNC(fn_)        \
    MPIDI_STATE_DECL(fn_);     \
    MPIDI_FUNC_ENTER(fn_);

/* Function epilogue for routines that write their own return statement. */
#define END_FUNC(fn_)          \
    MPIDI_FUNC_EXIT(fn_);

/* Function epilogue that also provides the fn_exit / fn_fail labels:
 * fn_exit runs MPIDI_FUNC_EXIT and returns mpi_errno, fn_fail jumps to
 * fn_exit. The enclosing function must declare mpi_errno. */
#define END_FUNC_RC(fn_)       \
    fn_exit:                   \
    MPIDI_FUNC_EXIT(fn_);      \
    return mpi_errno;          \
    fn_fail:                   \
    goto fn_exit;

/* Base name of the current source file: the part of __FILE__ after its last
 * '/', or all of __FILE__ when it contains no '/'. Requires strrchr to be
 * declared at the point of use. */
#define __SHORT_FILE__                                              \
    (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1            \
                            : __FILE__)

/* Pass a function identifier through MPIU_QUOTE (presumably stringification
 * -- confirm against the MPIU_QUOTE definition). */
#define DECL_FUNC(fn_)  MPIU_QUOTE(fn_)
/* Compile-time assertion usable wherever a statement is allowed. `expr_`
 * must be an integer constant expression: when it evaluates to 0 the switch
 * contains two `case 0:` labels and the translation unit fails to compile;
 * any non-zero value compiles to a no-op. */
#define OFI_COMPILE_TIME_ASSERT(expr_)                                  \
  do { switch(0) { case 0: case (expr_): default: break; } } while (0)

/* Evaluate the libfabric call FUNC once and, when its result is negative,
 * invoke MPIU_ERR_CHKANDJUMP4 with mpi_errno / MPI_ERR_OTHER, the message
 * keys "**ofi_<STR>" and "**ofi_<STR> %s %d %s %s", and the arguments
 * short file name, line, FCNAME and fi_strerror(-result).
 * The enclosing function must provide mpi_errno and FCNAME (and, presumably,
 * the jump label the MPIU error macro targets -- confirm).
 * NOTE(review): the error macro name is assembled with ## on purpose or by
 * accident; it is kept verbatim because tooling may depend on the spelling. */
#define FI_RC(FUNC,STR)                                         \
  do                                                            \
    {                                                           \
      ssize_t _ret = FUNC;                                      \
      MPIU_ERR_##CHKANDJUMP4(_ret<0,                            \
                           mpi_errno,                           \
                           MPI_ERR_OTHER,                       \
                           "**ofi_"#STR,                        \
                           "**ofi_"#STR" %s %d %s %s",          \
                           __SHORT_FILE__,                      \
                           __LINE__,                            \
                           FCNAME,                              \
                           fi_strerror(-_ret));                 \
    } while (0)

/* Store the result of the PMI call FUNC in pmi_errno and, when it differs
 * from PMI_SUCCESS, invoke MPIU_ERR_CHKANDJUMP4 with mpi_errno /
 * MPI_ERR_OTHER, the message keys "**ofi_<STR>" and
 * "**ofi_<STR> %s %d %s %s", and the arguments short file name, line, FCNAME
 * and the stringified STR.
 * The enclosing function must provide pmi_errno, mpi_errno and FCNAME. */
#define PMI_RC(FUNC,STR)                                        \
  do                                                            \
    {                                                           \
      pmi_errno  = FUNC;                                        \
      MPIU_ERR_##CHKANDJUMP4(pmi_errno!=PMI_SUCCESS,            \
                           mpi_errno,                           \
                           MPI_ERR_OTHER,                       \
                           "**ofi_"#STR,                        \
                           "**ofi_"#STR" %s %d %s %s",          \
                           __SHORT_FILE__,                      \
                           __LINE__,                            \
                           FCNAME,                              \
                           #STR);                               \
    } while (0)

/* Store the result of FUNC in mpi_errno and, when it is non-zero, hand it to
 * MPIU_ERR_POP. The enclosing function must declare mpi_errno.
 * The expansion deliberately has no trailing ';' so the macro behaves like a
 * single statement: `if (c) MPI_RC(f()); else ...` now compiles. Callers
 * supply the semicolon themselves. */
#define MPI_RC(FUNC)                                        \
  do                                                        \
    {                                                       \
      mpi_errno  = (FUNC);                                  \
      if (mpi_errno) MPIU_ERR_POP(mpi_errno);               \
    } while (0)

/* Connect VC `vc` on demand: when its netmod `ready` flag is not 1, call
 * MPID_nem_ofi_vc_connect(vc) through MPI_RC, so a failure follows MPI_RC's
 * error path in the enclosing function. Written as a GNU statement
 * expression, as in the rest of this header. */
#define VC_READY_CHECK(vc)                      \
({                                              \
  if (1 != VC_OFI(vc)->ready) {                 \
    MPI_RC(MPID_nem_ofi_vc_connect(vc));        \
  }                                             \
})

/* Resolve the libfabric address to use for peer `src`:
 *  - src != MPI_ANY_SOURCE: `vc` must be non-NULL; the VC is connected on
 *    demand (VC_READY_CHECK) and `remote_proc` receives its direct_addr.
 *  - src == MPI_ANY_SOURCE: `vc` must be NULL and `remote_proc` receives
 *    gl_data.any_addr.
 * `remote_proc` must be an lvalue. Because VC_READY_CHECK goes through
 * MPI_RC, the enclosing function needs mpi_errno and MPI_RC's error path. */
#define OFI_ADDR_INIT(src, vc, remote_proc) \
({                                          \
  if (MPI_ANY_SOURCE != (src)) {            \
    MPIU_Assert(vc != NULL);                \
    VC_READY_CHECK(vc);                     \
    (remote_proc) = VC_OFI(vc)->direct_addr;\
  } else {                                  \
    MPIU_Assert(vc == NULL);                \
    (remote_proc) = gl_data.any_addr;       \
  }                                         \
})


#define NO_PGID 0

/* **************************************************************************
 *  match/ignore bit manipulation
 * **************************************************************************
 * 0123 4567 01234567 0123 4567 01234567 0123 4567 01234567 01234567 01234567
 *     |                  |                  |
 * ^   |    context id    |       source     |       message tag
 * |   |                  |                  |
 * +---- protocol
 * ************************************************************************** */
/* Field masks of the 64-bit match word: 4 protocol bits, 16 context-id bits,
 * 16 source bits and 28 tag bits (most to least significant). */
#define MPID_PROTOCOL_MASK       (0xF000000000000000ULL)
#define MPID_CONTEXT_MASK        (0x0FFFF00000000000ULL)
#define MPID_SOURCE_MASK         (0x00000FFFF0000000ULL)
#define MPID_TAG_MASK            (0x000000000FFFFFFFULL)
/* Alternate field masks read by get_pgid(), get_psource() and get_port(). */
#define MPID_PGID_MASK           (0x00000000FFFFFFFFULL)
#define MPID_PSOURCE_MASK        (0x0000FFFF00000000ULL)
#define MPID_PORT_NAME_MASK      (0x0FFF000000000000ULL)
/* Protocol codes stored in the top four bits. */
#define MPID_SYNC_SEND           (0x1000000000000000ULL)
#define MPID_SYNC_SEND_ACK       (0x2000000000000000ULL)
#define MPID_MSG_RTS             (0x3000000000000000ULL)
#define MPID_MSG_CTS             (0x4000000000000000ULL)
#define MPID_MSG_DATA            (0x5000000000000000ULL)
#define MPID_CONN_REQ            (0x6000000000000000ULL)
/* Shift counts used while packing and unpacking the fields. */
#define MPID_SOURCE_SHIFT        (16)
#define MPID_TAG_SHIFT           (28)
#define MPID_PSOURCE_SHIFT       (16)
#define MPID_PORT_SHIFT          (32)
#define OFI_KVSAPPSTRLEN         1024

/* ******************************** */
/* Request manipulation inlines     */
/* ******************************** */
206
static inline void MPID_nem_ofi_init_req(MPID_Request * req)
207
{
208
    memset(REQ_OFI(req), 0, sizeof(MPID_nem_ofi_req_t));
209
210
}

211
static inline int MPID_nem_ofi_create_req(MPID_Request ** request, int refcnt)
212
213
214
215
216
217
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *req;
    req = MPID_Request_create();
    MPIU_Assert(req);
    MPIU_Object_set_ref(req, refcnt);
218
    MPID_nem_ofi_init_req(req);
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
    *request = req;
    return mpi_errno;
}

/* ******************************** */
/* Tag Manipulation inlines         */
/* ******************************** */
/* Build the 64-bit match word for a send: the context id is shifted left by
 * MPID_SOURCE_SHIFT, OR-ed with `source`, shifted left by MPID_TAG_SHIFT and
 * finally OR-ed with the masked `tag` and the protocol bits in `type`.
 * `source` is not masked before being merged. */
static inline uint64_t init_sendtag(MPIR_Context_id_t contextid, int source, int tag, uint64_t type)
{
    uint64_t bits = contextid;

    bits <<= MPID_SOURCE_SHIFT;
    bits |= source;
    bits <<= MPID_TAG_SHIFT;
    bits |= type | (MPID_TAG_MASK & tag);

    return bits;
}

/* receive posting */
/* Build the match word and the ignore mask for a posted receive.
 *
 * Returns the match bits (context id, then source, then tag, laid out as in
 * init_sendtag). `*mask_bits` receives the bits to ignore: always
 * MPID_SYNC_SEND, plus MPID_SOURCE_MASK when `source` is MPI_ANY_SOURCE and
 * MPID_TAG_MASK when `tag` is MPI_ANY_TAG. Wildcard fields are left as zero
 * in the returned match bits. */
static inline uint64_t init_recvtag(uint64_t * mask_bits,
                                    MPIR_Context_id_t contextid, int source, int tag)
{
    uint64_t ignore = MPID_SYNC_SEND;
    uint64_t bits = contextid;

    bits <<= MPID_SOURCE_SHIFT;
    if (MPI_ANY_SOURCE != source)
        bits |= source;
    else
        ignore |= MPID_SOURCE_MASK;
    bits <<= MPID_TAG_SHIFT;

    if (MPI_ANY_TAG != tag)
        bits |= (MPID_TAG_MASK & tag);
    else
        ignore |= MPID_TAG_MASK;

    *mask_bits = ignore;
    return bits;
}

/* Extract the message tag: the bits of `match_bits` selected by MPID_TAG_MASK. */
static inline int get_tag(uint64_t match_bits)
{
    const uint64_t field = match_bits & MPID_TAG_MASK;

    return (int) field;
}

/* Extract the source field: the bits selected by MPID_SOURCE_MASK, shifted
 * down by MPID_TAG_SHIFT. */
static inline int get_source(uint64_t match_bits)
{
    const uint64_t field = match_bits & MPID_SOURCE_MASK;

    return (int) (field >> MPID_TAG_SHIFT);
}

/* Extract the field selected by MPID_PSOURCE_MASK, shifted down by
 * MPID_PORT_SHIFT. */
static inline int get_psource(uint64_t match_bits)
{
    const uint64_t field = match_bits & MPID_PSOURCE_MASK;

    return (int) (field >> MPID_PORT_SHIFT);
}

/* Extract the field selected by MPID_PGID_MASK (the low 32 bits) and convert
 * it to int. */
static inline int get_pgid(uint64_t match_bits)
{
    const uint64_t field = match_bits & MPID_PGID_MASK;

    return (int) field;
}

/* Extract the field selected by MPID_PORT_NAME_MASK, shifted down by
 * MPID_TAG_SHIFT.
 * NOTE(review): the mask covers bits 48-59 while the shift is only 28, so the
 * result is not right-aligned and may not fit a positive int; confirm against
 * the code that encodes the port before relying on this value. */
static inline int get_port(uint64_t match_bits)
{
    const uint64_t field = match_bits & MPID_PORT_NAME_MASK;

    return (int) (field >> MPID_TAG_SHIFT);
}

/* ************************************************************************** */
/* MPICH Comm Override and Netmod functions                                   */
/* ************************************************************************** */
289
290
int MPID_nem_ofi_recv_posted(struct MPIDI_VC *vc, struct MPID_Request *req);
int MPID_nem_ofi_send(struct MPIDI_VC *vc, const void *buf, int count,
291
292
                      MPI_Datatype datatype, int dest, int tag, MPID_Comm * comm,
                      int context_offset, struct MPID_Request **request);
293
int MPID_nem_ofi_isend(struct MPIDI_VC *vc, const void *buf, int count,
294
295
                       MPI_Datatype datatype, int dest, int tag, MPID_Comm * comm,
                       int context_offset, struct MPID_Request **request);
296
int MPID_nem_ofi_ssend(struct MPIDI_VC *vc, const void *buf, int count,
297
298
                       MPI_Datatype datatype, int dest, int tag, MPID_Comm * comm,
                       int context_offset, struct MPID_Request **request);
299
int MPID_nem_ofi_issend(struct MPIDI_VC *vc, const void *buf, int count,
300
301
                        MPI_Datatype datatype, int dest, int tag, MPID_Comm * comm,
                        int context_offset, struct MPID_Request **request);
302
303
304
int MPID_nem_ofi_cancel_send(struct MPIDI_VC *vc, struct MPID_Request *sreq);
int MPID_nem_ofi_cancel_recv(struct MPIDI_VC *vc, struct MPID_Request *rreq);
int MPID_nem_ofi_iprobe(struct MPIDI_VC *vc, int source, int tag, MPID_Comm * comm,
305
                        int context_offset, int *flag, MPI_Status * status);
306
int MPID_nem_ofi_improbe(struct MPIDI_VC *vc, int source, int tag, MPID_Comm * comm,
307
308
                         int context_offset, int *flag, MPID_Request ** message,
                         MPI_Status * status);
309
int MPID_nem_ofi_anysource_iprobe(int tag, MPID_Comm * comm, int context_offset,
310
                                  int *flag, MPI_Status * status);
311
int MPID_nem_ofi_anysource_improbe(int tag, MPID_Comm * comm, int context_offset,
312
                                   int *flag, MPID_Request ** message, MPI_Status * status);
313
314
315
316
void MPID_nem_ofi_anysource_posted(MPID_Request * rreq);
int MPID_nem_ofi_anysource_matched(MPID_Request * rreq);
int MPID_nem_ofi_send_data(cq_tagged_entry_t * wc, MPID_Request * sreq);
int MPID_nem_ofi_SendNoncontig(MPIDI_VC_t * vc, MPID_Request * sreq,
317
                               void *hdr, MPIDI_msg_sz_t hdr_sz);
318
int MPID_nem_ofi_iStartContigMsg(MPIDI_VC_t * vc, void *hdr, MPIDI_msg_sz_t hdr_sz,
319
                                 void *data, MPIDI_msg_sz_t data_sz, MPID_Request ** sreq_ptr);
320
int MPID_nem_ofi_iSendContig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
321
322
323
                             MPIDI_msg_sz_t hdr_sz, void *data, MPIDI_msg_sz_t data_sz);

/* ************************************************************************** */
/* OFI utility functions : not exposed as a netmod public API                 */
/* ************************************************************************** */
/* Poll-mode selectors; presumably the values passed as `in_blocking_poll`
 * to MPID_nem_ofi_poll() -- TODO confirm against callers. */
#define MPID_NONBLOCKING_POLL 0
#define MPID_BLOCKING_POLL 1
/* Netmod lifecycle, progress and connection-management entry points. */
int MPID_nem_ofi_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_max_sz_p);
int MPID_nem_ofi_finalize(void);
int MPID_nem_ofi_vc_init(MPIDI_VC_t * vc);
int MPID_nem_ofi_get_business_card(int my_rank, char **bc_val_p, int *val_max_sz_p);
int MPID_nem_ofi_poll(int in_blocking_poll);
int MPID_nem_ofi_vc_terminate(MPIDI_VC_t * vc);
int MPID_nem_ofi_vc_connect(MPIDI_VC_t * vc);
int MPID_nem_ofi_connect_to_root(const char *business_card, MPIDI_VC_t * new_vc);
int MPID_nem_ofi_vc_destroy(MPIDI_VC_t * vc);
int MPID_nem_ofi_cm_init(MPIDI_PG_t * pg_p, int pg_rank);
/* Declared with (void) so this is a real prototype, like MPID_nem_ofi_finalize. */
int MPID_nem_ofi_cm_finalize(void);

extern MPID_nem_ofi_global_t gl_data;
341
342
343
extern MPIDI_Comm_ops_t _g_comm_ops;

#endif