/*
 *  (C) 2006 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 *
 *  Portions of this code were written by Intel Corporation.
 *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
 *  to Argonne National Laboratory subject to Software Grant and Corporate
 *  Contributor License Agreement dated February 8, 2012.
 */
#include "ofi_impl.h"

static inline int dump_and_choose_providers(info_t * prov, info_t ** prov_use);
static inline int compile_time_checking(void);

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_OFI_USE_PROVIDER
      category    : DEVELOPER
      type        : string
      default     : NULL
      class       : device
      verbosity   : MPI_T_VERBOSITY_MPIDEV_DETAIL
      scope       : MPI_T_SCOPE_LOCAL
      description : >-
        If non-null, choose an OFI provider by name

    - name        : MPIR_CVAR_OFI_DUMP_PROVIDERS
      category    : DEVELOPER
      type        : boolean
      default     : false
      class       : device
      verbosity   : MPI_T_VERBOSITY_MPIDEV_DETAIL
      scope       : MPI_T_SCOPE_LOCAL
      description : >-
        If true, dump provider information at init

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/
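
/* Usage sketch (values are examples, not defaults): MPICH exposes each CVAR
 * as an environment variable of the same name, so a provider can be forced
 * and the provider dump enabled from the launch command, e.g.
 *
 *     MPIR_CVAR_OFI_USE_PROVIDER=sockets MPIR_CVAR_OFI_DUMP_PROVIDERS=1 \
 *         mpiexec -n 2 ./app
 */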
#undef FCNAME
#define FCNAME DECL_FUNC(MPID_nem_ofi_init)
int MPID_nem_ofi_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_max_sz_p)
{
    int ret, fi_version, i, len, pmi_errno;
    int mpi_errno = MPI_SUCCESS;
    info_t *hints, *prov_tagged, *prov_use;
    cq_attr_t cq_attr;
    av_attr_t av_attr;
    char kvsname[OFI_KVSAPPSTRLEN], key[OFI_KVSAPPSTRLEN], bc[OFI_KVSAPPSTRLEN];
    char *my_bc, *addrs;
    fi_addr_t *fi_addrs = NULL;
    MPIDI_VC_t *vc;

    BEGIN_FUNC(FCNAME);
    MPIU_CHKLMEM_DECL(2);

    compile_time_checking();
    /* ------------------------------------------------------------------------ */
    /* Hints to filter providers                                                */
    /* See man fi_getinfo for a list                                            */
    /* of all filters                                                           */
    /* mode:  Select capabilities netmod is prepared to support.                */
    /*        In this case, netmod will pass in context into                    */
    /*        communication calls.                                              */
    /*        Note that we do not fill in FI_LOCAL_MR, which means this netmod  */
    /*        does not support exchange of memory regions on communication calls */
    /*        OFI requires that all communication calls use a registered mr     */
    /*        but in our case this netmod is written to only support transfers  */
    /*        on a dynamic memory region that spans all of memory.  So, we do   */
    /*        not set the FI_LOCAL_MR mode bit, and we set the FI_DYNAMIC_MR    */
    /*        bit to tell OFI our requirement and filter providers appropriately */
    /* ep_type:  reliable datagram operation                                    */
    /* caps:     Capabilities required from the provider.  The bits specified   */
    /*           with buffered receive, cancel, and remote completion implement */
    /*           MPI semantics.  Tagged is used to support tag matching.        */
    /*           We expect to register all memory up front for use with this    */
    /*           endpoint, so the netmod requires dynamic memory regions        */
    /* ------------------------------------------------------------------------ */
    hints                   = fi_allocinfo();
    hints->mode             = FI_CONTEXT;
    hints->ep_attr->type    = FI_EP_RDM;      /* Reliable datagram         */
    hints->caps             = FI_TAGGED;      /* Tag matching interface    */

    hints->ep_attr->mem_tag_format = MEM_TAG_FORMAT;
    MPIU_Assert(pg_p->size < ((1 << MPID_RANK_BITS) - 1));
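    /* Illustrative sketch (the actual packing is defined by MEM_TAG_FORMAT in
     * the netmod headers): the 64-bit match bits carry both the source rank
     * and the MPI tag, roughly
     *
     *     match_bits = ((uint64_t) rank << MPID_TAG_BITS) | tag;
     *
     * The assertion above guarantees that every rank in the process group
     * fits in the MPID_RANK_BITS-wide rank field.
     */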

    /* ------------------------------------------------------------------------ */
    /* FI_VERSION provides binary backward and forward compatibility support    */
    /* Specify the version of OFI this netmod is coded to; the provider will   */
    /* select struct layouts that are compatible with this version.            */
    /* ------------------------------------------------------------------------ */
    fi_version = FI_VERSION(1, 0);

    /* ------------------------------------------------------------------------ */
    /* fi_getinfo:  returns information about fabric  services for reaching a   */
    /* remote node or service.  This does not necessarily allocate resources.  */
    /* Pass NULL for name/service because we want a list of providers supported */
    /* ------------------------------------------------------------------------ */
    hints->domain_attr->threading        = FI_THREAD_ENDPOINT;
    hints->domain_attr->control_progress = FI_PROGRESS_AUTO;
    hints->domain_attr->data_progress    = FI_PROGRESS_AUTO;
    char *provname = MPIR_CVAR_OFI_USE_PROVIDER ?
        MPIU_Strdup(MPIR_CVAR_OFI_USE_PROVIDER) : NULL;
    hints->fabric_attr->prov_name = provname;
    FI_RC(fi_getinfo(fi_version,    /* Interface version requested               */
                     NULL,          /* Optional name or fabric to resolve        */
                     NULL,          /* Service name or port number to request    */
                     0ULL,          /* Flag:  node/service specify local address */
                     hints,         /* In:  Hints to filter available providers  */
                     &prov_tagged), /* Out: List of providers that match hints   */
          getinfo);
    MPIU_ERR_CHKANDJUMP4(prov_tagged == NULL, mpi_errno, MPI_ERR_OTHER,
                         "**ofi_getinfo", "**ofi_getinfo %s %d %s %s",
                         __SHORT_FILE__, __LINE__, FCNAME, "No tag matching provider found");
    /* ------------------------------------------------------------------------ */
    /* Open fabric                                                              */
    /* The getinfo struct returns a fabric attribute struct that can be used to */
    /* instantiate the virtual or physical network.  This opens a "fabric       */
    /* provider".   We choose the first available fabric, but getinfo           */
    /* returns a list.  see man fi_fabric for details                           */
    /* ------------------------------------------------------------------------ */
    dump_and_choose_providers(prov_tagged, &prov_use);
    FI_RC(fi_fabric(prov_use->fabric_attr,      /* In:   Fabric attributes */
                    &gl_data.fabric,    /* Out:  Fabric descriptor */
                    NULL), openfabric); /* Context: fabric events  */

    gl_data.api_set = API_SET_1;

    /* ------------------------------------------------------------------------ */
    /* Create the access domain, which is the physical or virtual network or    */
    /* hardware port/collection of ports.  Returns a domain object that can be  */
    /* used to create endpoints.  See man fi_domain for details.                */
    /* Refine get_info filter for additional capabilities                       */
    /* threading:  Disable locking, MPICH handles locking model                 */
    /* control_progress:  enable async progress                                 */
    /* op_flags:  Specifies default operation to set on all communication.      */
    /*            In this case, we want remote completion to be set by default  */
    /* ------------------------------------------------------------------------ */
    FI_RC(fi_domain(gl_data.fabric,     /* In:  Fabric object             */
                    prov_use,   /* In:  default domain attributes */
                    &gl_data.domain,    /* Out: domain object             */
                    NULL), opendomain); /* Context: Domain events         */

    /* ------------------------------------------------------------------------ */
    /* Create a transport level communication endpoint.  Before the endpoint   */
    /* can be used, it must be bound to the resources it consumes, such as     */
    /* completion queues, counters, and address vectors, and then enabled.     */
    /* See man fi_endpoint for more details.                                   */
    /* ------------------------------------------------------------------------ */
    FI_RC(fi_endpoint(gl_data.domain,   /* In: Domain Object        */
                      prov_use, /* In: Configuration object */
                      &gl_data.endpoint,        /* Out: Endpoint Object     */
                      NULL), openep);   /* Context: endpoint events */

    /* ------------------------------------------------------------------------ */
    /* Create the objects that will be bound to the endpoint.                   */
    /* The objects include:                                                     */
    /*     * completion queue for events                                        */
    /*     * address vector of other endpoint addresses                         */
    /* Other objects could be created but are unused in this netmod, e.g.:     */
    /*     * counters for incoming writes                                       */
    /*     * completion counters for put and get                                */
    /* ------------------------------------------------------------------------ */
    gl_data.mr = NULL;
    memset(&cq_attr, 0, sizeof(cq_attr));
    cq_attr.format = FI_CQ_FORMAT_TAGGED;
    FI_RC(fi_cq_open(gl_data.domain,    /* In:  Domain Object         */
                     &cq_attr,  /* In:  Configuration object  */
                     &gl_data.cq,       /* Out: CQ Object             */
                     NULL), opencq);    /* Context: CQ events         */

    memset(&av_attr, 0, sizeof(av_attr));
    av_attr.type = FI_AV_MAP;   /* Mapped addressing mode     */
    FI_RC(fi_av_open(gl_data.domain,    /* In:  Domain Object         */
                     &av_attr,  /* In:  Configuration object  */
                     &gl_data.av,       /* Out: AV Object             */
                     NULL), avopen);    /* Context: AV events         */

    /* --------------------------------------------- */
    /* Bind the MR, CQ and AV to the endpoint object */
    /* --------------------------------------------- */
    FI_RC(fi_ep_bind(gl_data.endpoint, (fid_t) gl_data.cq, FI_SEND | FI_RECV), bind);
    FI_RC(fi_ep_bind(gl_data.endpoint, (fid_t) gl_data.av, 0), bind);

    /* ------------------------------------- */
    /* Enable the endpoint for communication */
    /* This commits the bind operations      */
    /* ------------------------------------- */
    FI_RC(fi_enable(gl_data.endpoint), ep_enable);

    /* --------------------------- */
    /* Free providers info         */
    /* --------------------------- */
    if(provname) {
      MPIU_Free(provname);
      hints->fabric_attr->prov_name = NULL;
    }

    fi_freeinfo(hints);
    fi_freeinfo(prov_use);

    /* ---------------------------------------------------- */
    /* Exchange endpoint addresses using scalable database  */
    /* or job launcher, in this case, use PMI interfaces    */
    /* ---------------------------------------------------- */
    gl_data.bound_addrlen = sizeof(gl_data.bound_addr);
    FI_RC(fi_getname((fid_t) gl_data.endpoint, &gl_data.bound_addr,
                     &gl_data.bound_addrlen), getname);

    /* -------------------------------- */
    /* Get our business card            */
    /* -------------------------------- */
    my_bc = *bc_val_p;
    MPI_RC(MPID_nem_ofi_get_business_card(pg_rank, bc_val_p, val_max_sz_p));

    /* -------------------------------- */
    /* Publish the business card        */
    /* to the KVS                       */
    /* -------------------------------- */
    PMI_RC(PMI_KVS_Get_my_name(kvsname, OFI_KVSAPPSTRLEN), pmi);
    sprintf(key, "OFI-%d", pg_rank);

    PMI_RC(PMI_KVS_Put(kvsname, key, my_bc), pmi);
    PMI_RC(PMI_KVS_Commit(kvsname), pmi);
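
    /* Resulting KVS layout (a sketch): each rank r publishes key "OFI-<r>"
     * whose value encodes the endpoint address obtained from fi_getname(),
     * e.g. after the commit the database holds
     *     OFI-0 -> <business card of rank 0>
     *     OFI-1 -> <business card of rank 1>
     *     ...
     * which every rank reads back after the barrier below.
     */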

    /* -------------------------------- */
    /* Set the MPI maximum tag value    */
    /* -------------------------------- */
    MPIR_Process.attrs.tag_ub = (1 << MPID_TAG_BITS) - 1;
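    /* For example, with a hypothetical MPID_TAG_BITS of 20, tag_ub would be
     * (1 << 20) - 1 = 1048575, the largest tag value a user may pass. */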

    /* --------------------------------- */
    /* Wait for all the ranks to publish */
    /* their business card               */
    /* --------------------------------- */
    gl_data.rts_cts_in_flight = 0;
    PMI_Barrier();

    /* --------------------------------- */
    /* Retrieve every rank's address     */
    /* from KVS and store them in local  */
    /* table                             */
    /* --------------------------------- */
    MPIU_CHKLMEM_MALLOC(addrs, char *, pg_p->size * gl_data.bound_addrlen, mpi_errno, "addrs");

    for (i = 0; i < pg_p->size; ++i) {
        sprintf(key, "OFI-%d", i);
        PMI_RC(PMI_KVS_Get(kvsname, key, bc, OFI_KVSAPPSTRLEN), pmi);
        ret = MPIU_Str_get_binary_arg(bc, "OFI",
                                      (char *) &addrs[i * gl_data.bound_addrlen],
                                      gl_data.bound_addrlen, &len);
        MPIU_ERR_CHKANDJUMP((ret != MPIU_STR_SUCCESS && ret != MPIU_STR_NOMEM) ||
                            (size_t) len != gl_data.bound_addrlen,
                            mpi_errno, MPI_ERR_OTHER, "**badbusinesscard");
    }

    /* ---------------------------------------------------- */
    /* Map the addresses into an address vector             */
    /* The addressing mode is "map", so we must provide     */
    /* storage to store the per destination addresses       */
    /* ---------------------------------------------------- */
    fi_addrs = MPIU_Malloc(pg_p->size * sizeof(fi_addr_t));
    FI_RC(fi_av_insert(gl_data.av, addrs, pg_p->size, fi_addrs, 0ULL, NULL), avmap);
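
    /* Illustrative use of the mapped handles (a sketch; buf, len, match_bits,
     * and ctx are hypothetical): a tagged send to rank i passes fi_addrs[i]
     * as the destination, e.g.
     *
     *     fi_tsend(gl_data.endpoint, buf, len, NULL, fi_addrs[i],
     *              match_bits, &ctx);
     */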

    /* ---------------------------------------------------- */
    /* Insert the ANY_SRC address                           */
    /* ---------------------------------------------------- */

    gl_data.any_addr = FI_ADDR_UNSPEC;

    /* --------------------------------- */
    /* Store the direct addresses in     */
    /* the ranks' respective VCs         */
    /* --------------------------------- */
    for (i = 0; i < pg_p->size; ++i) {
        MPIDI_PG_Get_vc(pg_p, i, &vc);
        VC_OFI(vc)->direct_addr = fi_addrs[i];
        VC_OFI(vc)->ready = 1;
    }

    /* --------------------------------------------- */
    /* Initialize the connection management routines */
    /* This completes any function handlers and      */
    /* global data structures, and posts any         */
    /* persistent communication requests that are    */
    /* required, like connection management and      */
    /* startcontig messages                          */
    /* --------------------------------------------- */
    MPI_RC(MPID_nem_ofi_cm_init(pg_p, pg_rank));
  fn_exit:
    if (fi_addrs)
        MPIU_Free(fi_addrs);
    MPIU_CHKLMEM_FREEALL();
    END_FUNC(FCNAME);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FCNAME
#define FCNAME DECL_FUNC(MPID_nem_ofi_finalize)
int MPID_nem_ofi_finalize(void)
{
    int mpi_errno = MPI_SUCCESS;
    mpir_errflag_t ret = MPIR_ERR_NONE;
    BEGIN_FUNC(FCNAME);

    while (gl_data.rts_cts_in_flight) {
        MPID_nem_ofi_poll(0);
    }
    /* --------------------------------------------- */
    /* Finalize connection management routines       */
    /* Cancels any persistent/global requests and    */
    /* frees any resources from cm_init()            */
    /* --------------------------------------------- */
    MPI_RC(MPID_nem_ofi_cm_finalize());

    FI_RC(fi_close((fid_t) gl_data.endpoint), epclose);
    FI_RC(fi_close((fid_t) gl_data.av), avclose);
    FI_RC(fi_close((fid_t) gl_data.cq), cqclose);
    FI_RC(fi_close((fid_t) gl_data.domain), domainclose);
    FI_RC(fi_close((fid_t) gl_data.fabric), fabricclose);
    END_FUNC_RC(FCNAME);
}

static inline int compile_time_checking(void)
{
    OFI_COMPILE_TIME_ASSERT(sizeof(MPID_nem_ofi_vc_t) <= MPID_NEM_VC_NETMOD_AREA_LEN);
    OFI_COMPILE_TIME_ASSERT(sizeof(MPID_nem_ofi_req_t) <= MPID_NEM_REQ_NETMOD_AREA_LEN);
    OFI_COMPILE_TIME_ASSERT(sizeof(iovec_t) == sizeof(MPID_IOV));
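
    /* For reference, OFI_COMPILE_TIME_ASSERT is typically implemented with a
     * switch-on-constant trick so that a false expression yields a duplicate
     * case label and a compile error (a sketch; the real definition lives in
     * the netmod headers):
     *
     *     #define OFI_COMPILE_TIME_ASSERT(expr) \
     *         do { switch(0) { case 0: case (expr): break; } } while (0)
     */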
    MPIU_Assert(((void *) &(((iovec_t *) 0)->iov_base)) ==
                ((void *) &(((MPID_IOV *) 0)->MPID_IOV_BUF)));
    MPIU_Assert(((void *) &(((iovec_t *) 0)->iov_len)) ==
                ((void *) &(((MPID_IOV *) 0)->MPID_IOV_LEN)));
    MPIU_Assert(sizeof(((iovec_t *) 0)->iov_len) == sizeof(((MPID_IOV *) 0)->MPID_IOV_LEN));

    /* ------------------------------------------------------------------------ */
    /* Generate the MPICH catalog files                                         */
    /* The high level mpich build scripts inspect MPIU_ERR_ macros to generate  */
    /* the message catalog.  However, this netmod buries the messages under the */
    /* FI_RC macros, so the catalog doesn't get generated.  The build system    */
    /* likely needs a MPIU_ERR_REGISTER macro                                   */
    /* ------------------------------------------------------------------------ */
#if 0
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_avmap", "**ofi_avmap %s %d %s %s", a, b, a, a);
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_tsend", "**ofi_tsend %s %d %s %s", a, b, a, a);
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_trecv", "**ofi_trecv %s %d %s %s", a, b, a, a);
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_getinfo", "**ofi_getinfo %s %d %s %s", a, b, a, a);
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_openep", "**ofi_openep %s %d %s %s", a, b, a, a);
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_openfabric", "**ofi_openfabric %s %d %s %s", a, b, a, a);
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_opendomain", "**ofi_opendomain %s %d %s %s", a, b, a, a);
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_opencq", "**ofi_opencq %s %d %s %s", a, b, a, a);
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_avopen", "**ofi_avopen %s %d %s %s", a, b, a, a);
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_bind", "**ofi_bind %s %d %s %s", a, b, a, a);
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_ep_enable", "**ofi_ep_enable %s %d %s %s", a, b, a, a);
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_getname", "**ofi_getname %s %d %s %s", a, b, a, a);
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_avclose", "**ofi_avclose %s %d %s %s", a, b, a, a);
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_epclose", "**ofi_epclose %s %d %s %s", a, b, a, a);
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_cqclose", "**ofi_cqclose %s %d %s %s", a, b, a, a);
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_fabricclose", "**ofi_fabricclose %s %d %s %s", a, b, a, a);
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_domainclose", "**ofi_domainclose %s %d %s %s", a, b, a, a);
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_peek", "**ofi_peek %s %d %s %s", a, b, a, a);
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_poll", "**ofi_poll %s %d %s %s", a, b, a, a);
    MPIU_ERR_SET2(e, MPI_ERR_OTHER, "**ofi_cancel", "**ofi_cancel %s %d %s %s", a, b, a, a);
#endif
    return 0;
}


static inline int dump_and_choose_providers(info_t * prov, info_t ** prov_use)
{
    info_t *p = prov;
    int i = 0;

    *prov_use = prov;

    if (MPIR_CVAR_OFI_DUMP_PROVIDERS) {
        fprintf(stdout, "Dumping Providers(first=%p):\n", (void *) prov);
        while (p) {
            fprintf(stdout, "%s", fi_tostr(p, FI_TYPE_INFO));
            p = p->next;
            i++;        /* count the providers as they are dumped */
        }
    }
    return i;
}