/*
 *  (C) 2006 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 *
 *  Portions of this code were written by Intel Corporation.
 *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
 *  to Argonne National Laboratory subject to Software Grant and Corporate
 *  Contributor License Agreement dated February 8, 2012.
 */
#include "ofi_impl.h"
11
12

/* ------------------------------------------------------------------------ */
13
/* ofi_tag_to_vc                                                            */
14
15
16
17
18
19
20
21
22
23
/* This routine converts tag information from an incoming preposted receive */
/* into the VC that uses the routine.  There is a possibility of a small    */
/* list of temporary VC's that are used during dynamic task management      */
/* to create the VC's.  This search is linear, but should be a small number */
/* of temporary VC's that will eventually be destroyed by the upper layers  */
/* Otherwise the tag is split into a PG "number", which is a hash of the    */
/* data contained in the process group, and a source.  The source/pg number */
/* is enough to look up the VC.                                             */
/* ------------------------------------------------------------------------ */
#undef FCNAME
24
#define FCNAME DECL_FUNC(ofi_tag_to_vc)
25
static inline MPIDI_VC_t *ofi_wc_to_vc(cq_tagged_entry_t * wc)
26
27
28
29
{
    int pgid = 0, port = 0;
    MPIDI_VC_t *vc = NULL;
    MPIDI_PG_t *pg = NULL;
30
31
    uint64_t match_bits = wc->tag;
    int wc_pgid;
32
    BEGIN_FUNC(FCNAME);
33
34
35
36
37
38
39
    if (gl_data.api_set == API_SET_1) {
        wc_pgid = get_pgid(match_bits);
    } else {
        wc_pgid = wc->data;
    }

    if (NO_PGID == wc_pgid) {
40
41
42
43
44
45
46
47
        /* -------------------------------------------------------------------- */
        /* Dynamic path -- This uses a linear search, but number of cm vc's is  */
        /* a small number, and they should be ephemeral.  This lookup should    */
        /* be fast yet not normally on the critical path.                       */
        /* -------------------------------------------------------------------- */
        port = get_port(match_bits);
        vc = gl_data.cm_vcs;
        while (vc && vc->port_name_tag != port) {
48
            vc = VC_OFI(vc)->next;
49
50
51
52
53
54
55
56
57
58
59
60
61
62
        }
        if (NULL == vc) {
            MPIU_Assertp(0);
        }
    }
    else {
        /* -------------------------------------------------------------------- */
        /* If there are no connection management VC's, this is the normal path  */
        /* Generate the PG number has from each known process group compare to  */
        /* the pg number in the tag.  The number of PG's should be small        */
        /* -------------------------------------------------------------------- */
        pg = gl_data.pg_p;
        while (pg) {
            MPIDI_PG_IdToNum(pg, &pgid);
63
            if (wc_pgid == pgid) {
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
                break;
            }
            pg = pg->next;
        }
        if (pg) {
            MPIDI_PG_Get_vc(pg, get_psource(match_bits), &vc);
        }
        else {
            MPIU_Assert(0);
        }
    }
    END_FUNC(FCNAME);
    return vc;
}

/* ------------------------------------------------------------------------ */
80
/* MPID_nem_ofi_conn_req_callback                                           */
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
/* A new process has been created and is connected to the current world     */
/* The address of the new process is exchanged via the business card        */
/* instead of being exchanged up front during the creation of the first     */
/* world.  The new connection routine is usually invoked when two worlds    */
/* are started via dynamic tasking.                                         */
/* This routine:                                                            */
/*     * repost the persistent connection management receive request        */
/*     * malloc/create/initialize the VC                                    */
/*     * grabs the address name from the business card                      */
/*     * uses fi_av_insert to insert the addr into the address vector.      */
/* This is marked as a "connection management" vc, and may be destroyed     */
/* by the upper layers.  We handle the cm vc's slightly differently than    */
/* other VC's because they may not be part of a process group.              */
/* ------------------------------------------------------------------------ */
#undef FCNAME
96
97
#define FCNAME DECL_FUNC(MPID_nem_ofi_conn_req_callback)
static inline int MPID_nem_ofi_conn_req_callback(cq_tagged_entry_t * wc, MPID_Request * rreq)
98
99
{
    int ret, len, mpi_errno = MPI_SUCCESS;
100
    char bc[OFI_KVSAPPSTRLEN];
101
102
103
104
105
106
107
108
109
110

    MPIDI_VC_t *vc;
    char *addr = NULL;
    fi_addr_t direct_addr;

    BEGIN_FUNC(FCNAME);

    MPIU_Memcpy(bc, rreq->dev.user_buf, wc->len);
    bc[wc->len] = '\0';
    MPIU_Assert(gl_data.conn_req == rreq);
111
    FI_RC(fi_trecv(gl_data.endpoint,
112
113
114
115
116
117
118
                   gl_data.conn_req->dev.user_buf,
                   OFI_KVSAPPSTRLEN,
                   gl_data.mr,
                   FI_ADDR_UNSPEC,
                   MPID_CONN_REQ,
                   GET_RCD_IGNORE_MASK(),
                   (void *) &(REQ_OFI(gl_data.conn_req)->ofi_context)), trecv);
119
120
121
122
123
124
125
126
127

    addr = MPIU_Malloc(gl_data.bound_addrlen);
    MPIU_Assertp(addr);

    vc = MPIU_Malloc(sizeof(MPIDI_VC_t));
    MPIU_Assertp(vc);

    MPIDI_VC_Init(vc, NULL, 0);
    MPI_RC(MPIDI_GetTagFromPort(bc, &vc->port_name_tag));
128
    ret = MPIU_Str_get_binary_arg(bc, "OFI", addr, gl_data.bound_addrlen, &len);
129
130
131
132
133
    MPIU_ERR_CHKANDJUMP((ret != MPIU_STR_SUCCESS && ret != MPIU_STR_NOMEM) ||
                        (size_t) len != gl_data.bound_addrlen,
                        mpi_errno, MPI_ERR_OTHER, "**badbusinesscard");

    FI_RC(fi_av_insert(gl_data.av, addr, 1, &direct_addr, 0ULL, NULL), avmap);
134
135
136
137
    VC_OFI(vc)->direct_addr = direct_addr;
    VC_OFI(vc)->ready = 1;
    VC_OFI(vc)->is_cmvc = 1;
    VC_OFI(vc)->next = gl_data.cm_vcs;
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
    gl_data.cm_vcs = vc;

    MPIDI_CH3I_Acceptq_enqueue(vc, vc->port_name_tag);
    MPIDI_CH3I_INCR_PROGRESS_COMPLETION_COUNT;
  fn_exit:
    MPIU_Free(addr);
    END_FUNC(FCNAME);
    return mpi_errno;
  fn_fail:
    if (vc)
        MPIU_Free(vc);
    goto fn_exit;
}

/* ------------------------------------------------------------------------ */
153
/* MPID_nem_ofi_handle_packet                                               */
154
155
156
157
158
159
/* The "parent" request tracks the state of the entire rendezvous           */
/* As "child" requests complete, the cc counter is decremented              */
/* Notify CH3 that we have an incoming packet (if cc hits 1).  Otherwise    */
/* decrement the ref counter via request completion                         */
/* ------------------------------------------------------------------------ */
#undef FCNAME
160
161
#define FCNAME DECL_FUNC(MPID_nem_ofi_handle_packet)
static inline int MPID_nem_ofi_handle_packet(cq_tagged_entry_t * wc ATTRIBUTE((unused)),
162
163
164
165
166
167
                                             MPID_Request * rreq)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_VC_t *vc;

    BEGIN_FUNC(FCNAME);
168
    if (MPID_cc_get(rreq->cc) == 1) {
169
        vc = REQ_OFI(rreq)->vc;
170
        MPIU_Assert(vc);
171
172
        MPI_RC(MPID_nem_handle_pkt(vc, REQ_OFI(rreq)->pack_buffer, REQ_OFI(rreq)->pack_buffer_size))
            MPIU_Free(REQ_OFI(rreq)->pack_buffer);
173
174
175
176
177
178
    }
    MPIDI_CH3U_Request_complete(rreq);
    END_FUNC_RC(FCNAME);
}

/* ------------------------------------------------------------------------ */
179
180
/* MPID_nem_ofi_cts_send_callback                                           */
/* A wrapper around MPID_nem_ofi_handle_packet that decrements              */
181
182
183
/* the parent request's counter, and cleans up the CTS request              */
/* ------------------------------------------------------------------------ */
#undef FCNAME
184
185
#define FCNAME DECL_FUNC(MPID_nem_ofi_cts_send_callback)
static inline int MPID_nem_ofi_cts_send_callback(cq_tagged_entry_t * wc, MPID_Request * sreq)
186
187
188
{
    int mpi_errno = MPI_SUCCESS;
    BEGIN_FUNC(FCNAME);
189
    MPI_RC(MPID_nem_ofi_handle_packet(wc, REQ_OFI(sreq)->parent));
190
191
192
193
194
    MPIDI_CH3U_Request_complete(sreq);
    END_FUNC_RC(FCNAME);
}

/* ------------------------------------------------------------------------ */
195
196
/* MPID_nem_ofi_preposted_callback                                          */
/* This callback handles incoming "SendContig" messages (see ofi_msg.c)     */
197
198
199
200
201
202
203
/* for the send routines.  This implements the CTS response and the RTS     */
/* handler.  The steps are as follows:                                      */
/*   * Create a parent data request and post a receive into a pack buffer   */
/*   * Create a child request and send the CTS packet                       */
/*   * Re-Post the RTS receive and handler to handle the next message       */
/* ------------------------------------------------------------------------ */
#undef FCNAME
204
205
#define FCNAME DECL_FUNC(MPID_nem_ofi_preposted_callback)
static inline int MPID_nem_ofi_preposted_callback(cq_tagged_entry_t * wc, MPID_Request * rreq)
206
207
208
209
210
211
212
213
{
    int c, mpi_errno = MPI_SUCCESS;
    size_t pkt_len;
    char *pack_buffer = NULL;
    MPIDI_VC_t *vc;
    MPID_Request *new_rreq, *sreq;
    BEGIN_FUNC(FCNAME);

214
    vc = ofi_wc_to_vc(wc);
215
216
217
218
219
220
221
222
    MPIU_Assert(vc);
    VC_READY_CHECK(vc);

    pkt_len = rreq->dev.user_count;
    pack_buffer = (char *) MPIU_Malloc(pkt_len);
    MPIU_ERR_CHKANDJUMP1(pack_buffer == NULL, mpi_errno, MPI_ERR_OTHER,
                         "**nomem", "**nomem %s", "Pack Buffer alloc");
    c = 1;
223
    MPID_nem_ofi_create_req(&new_rreq, 1);
224
225
226
    MPID_cc_incr(new_rreq->cc_ptr, &c);
    new_rreq->dev.OnDataAvail = NULL;
    new_rreq->dev.next = NULL;
227
228
229
230
    REQ_OFI(new_rreq)->event_callback = MPID_nem_ofi_handle_packet;
    REQ_OFI(new_rreq)->vc = vc;
    REQ_OFI(new_rreq)->pack_buffer = pack_buffer;
    REQ_OFI(new_rreq)->pack_buffer_size = pkt_len;
231
    FI_RC(fi_trecv(gl_data.endpoint,
232
233
                       REQ_OFI(new_rreq)->pack_buffer,
                       REQ_OFI(new_rreq)->pack_buffer_size,
234
                       gl_data.mr,
235
                       VC_OFI(vc)->direct_addr,
236
                       MPID_MSG_DATA, 0, &(REQ_OFI(new_rreq)->ofi_context)), trecv);
237

238
    MPID_nem_ofi_create_req(&sreq, 1);
239
240
    sreq->dev.OnDataAvail = NULL;
    sreq->dev.next = NULL;
241
242
    REQ_OFI(sreq)->event_callback = MPID_nem_ofi_cts_send_callback;
    REQ_OFI(sreq)->parent = new_rreq;
243
    FI_RC(fi_tsend(gl_data.endpoint,
244
245
246
                     NULL,
                     0,
                     gl_data.mr,
247
                     VC_OFI(vc)->direct_addr,
248
                     MPID_MSG_CTS, &(REQ_OFI(sreq)->ofi_context)), tsend);
249
250
251
    MPIU_Assert(gl_data.persistent_req == rreq);

    rreq->dev.user_count = 0;
252
    FI_RC(fi_trecv(gl_data.endpoint,
253
254
255
256
257
258
259
                   &rreq->dev.user_count,
                   sizeof rreq->dev.user_count,
                   gl_data.mr,
                   FI_ADDR_UNSPEC,
                   MPID_MSG_RTS,
                   GET_RCD_IGNORE_MASK(),
                   &(REQ_OFI(rreq)->ofi_context)), trecv);
260
261
262
263
    END_FUNC_RC(FCNAME);
}

/* ------------------------------------------------------------------------ */
264
/* MPID_nem_ofi_connect_to_root_callback                                    */
265
266
267
/* Complete and clean up the request                                        */
/* ------------------------------------------------------------------------ */
#undef FCNAME
268
269
#define FCNAME DECL_FUNC(MPID_nem_ofi_connect_to_root_callback)
int MPID_nem_ofi_connect_to_root_callback(cq_tagged_entry_t * wc ATTRIBUTE((unused)),
270
271
272
273
274
                                          MPID_Request * sreq)
{
    int mpi_errno = MPI_SUCCESS;
    BEGIN_FUNC(FCNAME);

275
276
    if (REQ_OFI(sreq)->pack_buffer)
        MPIU_Free(REQ_OFI(sreq)->pack_buffer);
277
278
279
280
281
282
283
    MPIDI_CH3U_Request_complete(sreq);

    END_FUNC(FCNAME);
    return mpi_errno;
}

/* ------------------------------------------------------------------------ */
284
/* MPID_nem_ofi_cm_init                                                     */
285
286
287
288
289
/* This is a utility routine that sets up persistent connection management  */
/* requests and a persistent data request to handle rendezvous SendContig   */
/* messages.                                                                */
/* ------------------------------------------------------------------------ */
#undef FCNAME
290
291
#define FCNAME DECL_FUNC(MPID_nem_ofi_cm_init)
int MPID_nem_ofi_cm_init(MPIDI_PG_t * pg_p, int pg_rank ATTRIBUTE((unused)))
292
293
294
295
296
297
298
299
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *persistent_req, *conn_req;
    BEGIN_FUNC(FCNAME);

    /* ------------------------------------- */
    /* Set up CH3 and netmod data structures */
    /* ------------------------------------- */
300
301
302
303
304
305
306
307
308
309
310
    if (gl_data.api_set == API_SET_1) {
        MPI_RC(MPIDI_CH3I_Register_anysource_notification(MPID_nem_ofi_anysource_posted,
                                                          MPID_nem_ofi_anysource_matched));
        MPIDI_Anysource_iprobe_fn = MPID_nem_ofi_anysource_iprobe;
        MPIDI_Anysource_improbe_fn = MPID_nem_ofi_anysource_improbe;
    } else {
        MPI_RC(MPIDI_CH3I_Register_anysource_notification(MPID_nem_ofi_anysource_posted_2,
                                                          MPID_nem_ofi_anysource_matched));
        MPIDI_Anysource_iprobe_fn = MPID_nem_ofi_anysource_iprobe_2;
        MPIDI_Anysource_improbe_fn = MPID_nem_ofi_anysource_improbe_2;
    }
311
312
313
314
315
    gl_data.pg_p = pg_p;

    /* ----------------------------------- */
    /* Post a persistent request to handle */
    /* ----------------------------------- */
316
    MPID_nem_ofi_create_req(&persistent_req, 1);
317
318
    persistent_req->dev.OnDataAvail = NULL;
    persistent_req->dev.next = NULL;
319
320
    REQ_OFI(persistent_req)->vc = NULL;
    REQ_OFI(persistent_req)->event_callback = MPID_nem_ofi_preposted_callback;
321
    FI_RC(fi_trecv(gl_data.endpoint,
322
323
324
325
326
327
328
                   &persistent_req->dev.user_count,
                   sizeof persistent_req->dev.user_count,
                   gl_data.mr,
                   FI_ADDR_UNSPEC,
                   MPID_MSG_RTS,
                   GET_RCD_IGNORE_MASK(),
                   (void *) &(REQ_OFI(persistent_req)->ofi_context)), trecv);
329
330
331
332
333
    gl_data.persistent_req = persistent_req;

    /* --------------------------------- */
    /* Post recv for connection requests */
    /* --------------------------------- */
334
    MPID_nem_ofi_create_req(&conn_req, 1);
335
    conn_req->dev.user_buf = MPIU_Malloc(OFI_KVSAPPSTRLEN * sizeof(char));
336
337
    conn_req->dev.OnDataAvail = NULL;
    conn_req->dev.next = NULL;
338
339
    REQ_OFI(conn_req)->vc = NULL;       /* We don't know the source yet */
    REQ_OFI(conn_req)->event_callback = MPID_nem_ofi_conn_req_callback;
340
    FI_RC(fi_trecv(gl_data.endpoint,
341
342
343
344
345
346
347
                   conn_req->dev.user_buf,
                   OFI_KVSAPPSTRLEN,
                   gl_data.mr,
                   FI_ADDR_UNSPEC,
                   MPID_CONN_REQ,
                   GET_RCD_IGNORE_MASK(),
                   (void *) &(REQ_OFI(conn_req)->ofi_context)), trecv);
348
349
350
351
352
353
354
355
356
357
358
359
    gl_data.conn_req = conn_req;


  fn_exit:
    END_FUNC(FCNAME);
    return mpi_errno;

  fn_fail:
    goto fn_exit;
}

/* ------------------------------------------------------------------------ */
360
/* MPID_nem_ofi_cm_finalize                                                 */
361
362
363
/* Clean up and cancle the requests initiated by the cm_init routine        */
/* ------------------------------------------------------------------------ */
#undef FCNAME
364
365
#define FCNAME DECL_FUNC(MPID_nem_ofi_cm_finalize)
int MPID_nem_ofi_cm_finalize()
366
367
368
369
{
    int mpi_errno = MPI_SUCCESS;
    BEGIN_FUNC(FCNAME);
    FI_RC(fi_cancel((fid_t) gl_data.endpoint,
370
                    &(REQ_OFI(gl_data.persistent_req)->ofi_context)), cancel);
371
372
373
374
    MPIR_STATUS_SET_CANCEL_BIT(gl_data.persistent_req->status, TRUE);
    MPIR_STATUS_SET_COUNT(gl_data.persistent_req->status, 0);
    MPIDI_CH3U_Request_complete(gl_data.persistent_req);

375
    FI_RC(fi_cancel((fid_t) gl_data.endpoint, &(REQ_OFI(gl_data.conn_req)->ofi_context)), cancel);
376
377
378
379
380
381
382
383
384
385
386
387
    MPIU_Free(gl_data.conn_req->dev.user_buf);
    MPIR_STATUS_SET_CANCEL_BIT(gl_data.conn_req->status, TRUE);
    MPIR_STATUS_SET_COUNT(gl_data.conn_req->status, 0);
    MPIDI_CH3U_Request_complete(gl_data.conn_req);
  fn_exit:
    END_FUNC(FCNAME);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/* ------------------------------------------------------------------------ */
388
/* MPID_nem_ofi_vc_connect                                                  */
389
390
391
/* Handle CH3/Nemesis VC connections                                        */
/*   * Query the VC address information.  In particular we are looking for  */
/*     the fabric address name.                                             */
392
/*   * Use fi_av_insert to register the address name with OFI               */
393
394
/* ------------------------------------------------------------------------ */
#undef FCNAME
395
396
#define FCNAME DECL_FUNC(MPID_nem_ofi_vc_connect)
int MPID_nem_ofi_vc_connect(MPIDI_VC_t * vc)
397
398
{
    int len, ret, mpi_errno = MPI_SUCCESS;
399
    char bc[OFI_KVSAPPSTRLEN], *addr = NULL;
400
401
402
403

    BEGIN_FUNC(FCNAME);
    addr = MPIU_Malloc(gl_data.bound_addrlen);
    MPIU_Assert(addr);
404
    MPIU_Assert(1 != VC_OFI(vc)->ready);
405
406
407
408
409

    if (!vc->pg || !vc->pg->getConnInfo) {
        goto fn_exit;
    }

410
411
    MPI_RC(vc->pg->getConnInfo(vc->pg_rank, bc, OFI_KVSAPPSTRLEN, vc->pg));
    ret = MPIU_Str_get_binary_arg(bc, "OFI", addr, gl_data.bound_addrlen, &len);
412
413
414
    MPIU_ERR_CHKANDJUMP((ret != MPIU_STR_SUCCESS && ret != MPIU_STR_NOMEM) ||
                        (size_t) len != gl_data.bound_addrlen,
                        mpi_errno, MPI_ERR_OTHER, "**badbusinesscard");
415
416
    FI_RC(fi_av_insert(gl_data.av, addr, 1, &(VC_OFI(vc)->direct_addr), 0ULL, NULL), avmap);
    VC_OFI(vc)->ready = 1;
417
418
419
420
421
422
423
424
425
426
427
428

  fn_exit:
    if (addr)
        MPIU_Free(addr);
    END_FUNC(FCNAME);
    return mpi_errno;

  fn_fail:
    goto fn_exit;
}

#undef FCNAME
429
430
#define FCNAME DECL_FUNC(MPID_nem_ofi_vc_init)
int MPID_nem_ofi_vc_init(MPIDI_VC_t * vc)
431
432
433
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3I_VC *const vc_ch = &vc->ch;
434
    MPID_nem_ofi_vc_t *const vc_ofi = VC_OFI(vc);
435
436

    BEGIN_FUNC(FCNAME);
437
438
439
    vc->sendNoncontig_fn = MPID_nem_ofi_SendNoncontig;
    vc_ch->iStartContigMsg = MPID_nem_ofi_iStartContigMsg;
    vc_ch->iSendContig = MPID_nem_ofi_iSendContig;
440
441
    vc_ch->next = NULL;
    vc_ch->prev = NULL;
442
    vc_ofi->is_cmvc = 0;
443
444
445
446
447
    vc->comm_ops = &_g_comm_ops;

    MPIDI_CHANGE_VC_STATE(vc, ACTIVE);

    if (NULL == vc->pg) {
448
        vc_ofi->is_cmvc = 1;
449
450
451
452
453
454
455
456
    }
    else {
    }
    END_FUNC(FCNAME);
    return mpi_errno;
}

/* ------------------------------------------------------------------------ */
457
458
/* MPID_nem_ofi_vc_destroy                                                  */
/* MPID_nem_ofi_vc_terminate                                                */
459
460
461
/* TODO:  Verify this code has no leaks                                     */
/* ------------------------------------------------------------------------ */
#undef FCNAME
462
463
#define FCNAME DECL_FUNC(MPID_nem_ofi_vc_destroy)
int MPID_nem_ofi_vc_destroy(MPIDI_VC_t * vc)
464
465
{
    BEGIN_FUNC(FCNAME);
466
    if (vc && (VC_OFI(vc)->is_cmvc == 1) && (VC_OFI(vc)->ready == 1)) {
467
468
469
470
        if (vc->pg != NULL) {
            printf("ERROR: VC Destroy (%p) pg = %s\n", vc, (char *) vc->pg->id);
        }
        MPIDI_VC_t *prev = gl_data.cm_vcs;
471
472
        while (prev && prev != vc && VC_OFI(prev)->next != vc) {
            prev = VC_OFI(vc)->next;
473
        }
474
475
        if (VC_OFI(prev)->next == vc) {
            VC_OFI(prev)->next = VC_OFI(vc)->next;
476
477
        }
        else if (vc == gl_data.cm_vcs) {
478
            gl_data.cm_vcs = VC_OFI(vc)->next;
479
480
481
482
483
        }
        else {
            MPIU_Assert(0);
        }
    }
484
    VC_OFI(vc)->ready = 0;
485
486
487
488
489
    END_FUNC(FCNAME);
    return MPI_SUCCESS;
}

#undef FCNAME
490
491
#define FCNAME DECL_FUNC(MPID_nem_ofi_vc_terminate)
int MPID_nem_ofi_vc_terminate(MPIDI_VC_t * vc)
492
493
494
495
{
    int mpi_errno = MPI_SUCCESS;
    BEGIN_FUNC(FCNAME);
    MPI_RC(MPIDI_CH3U_Handle_connection(vc, MPIDI_VC_EVENT_TERMINATED));
496
    VC_OFI(vc)->ready = 0;
497
498
499
500
501
502
    END_FUNC_RC(FCNAME);
}



/* ------------------------------------------------------------------------ */
503
/* MPID_nem_ofi_connect_to_root                                             */
504
505
506
507
508
509
510
511
512
513
514
515
516
517
/*  * A new unconnected VC (cm/ephemeral VC) has been created.  This code   */
/*    connects the new VC to a rank in another process group.  The parent   */
/*    address is obtained by an out of band method and given to this        */
/*    routine as a business card                                            */
/*  * Read the business card address and insert the address                 */
/*  * Send a connection request to the parent.  The parent has posted a     */
/*    persistent request to handle incoming connection requests             */
/*    The connect message has the child's business card.                    */
/*  * Add the new VC to the list of ephemeral BC's (cm_vc's).  These VC's   */
/*    are not part of the process group, so they require special handling   */
/*    during the SendContig family of routines.                             */
/* ------------------------------------------------------------------------ */
#undef FCNAME
#define FCNAME DECL_FUNC(nm_connect_to_root)
518
int MPID_nem_ofi_connect_to_root(const char *business_card, MPIDI_VC_t * new_vc)
519
520
{
    int len, ret, mpi_errno = MPI_SUCCESS, str_errno = MPI_SUCCESS;
521
    int my_bc_len = OFI_KVSAPPSTRLEN;
522
523
524
525
526
527
    char *addr = NULL, *bc = NULL, *my_bc = NULL;
    MPID_Request *sreq;
    uint64_t conn_req_send_bits;

    BEGIN_FUNC(FCNAME);
    addr = MPIU_Malloc(gl_data.bound_addrlen);
528
    bc = MPIU_Malloc(OFI_KVSAPPSTRLEN);
529
530
531
532
533
534
535
536
    MPIU_Assertp(addr);
    MPIU_Assertp(bc);
    my_bc = bc;
    if (!business_card || business_card[0] != 't') {
        mpi_errno = MPI_ERR_OTHER;
        goto fn_fail;
    }
    MPI_RC(MPIDI_GetTagFromPort(business_card, &new_vc->port_name_tag));
537
    ret = MPIU_Str_get_binary_arg(business_card, "OFI", addr, gl_data.bound_addrlen, &len);
538
539
540
    MPIU_ERR_CHKANDJUMP((ret != MPIU_STR_SUCCESS && ret != MPIU_STR_NOMEM) ||
                        (size_t) len != gl_data.bound_addrlen,
                        mpi_errno, MPI_ERR_OTHER, "**badbusinesscard");
541
    FI_RC(fi_av_insert(gl_data.av, addr, 1, &(VC_OFI(new_vc)->direct_addr), 0ULL, NULL), avmap);
542

543
    VC_OFI(new_vc)->ready = 1;
544
545
    str_errno = MPIU_Str_add_int_arg(&bc, &my_bc_len, "tag", new_vc->port_name_tag);
    MPIU_ERR_CHKANDJUMP(str_errno, mpi_errno, MPI_ERR_OTHER, "**argstr_port_name_tag");
546
    MPI_RC(MPID_nem_ofi_get_business_card(MPIR_Process.comm_world->rank, &bc, &my_bc_len));
547
    my_bc_len = OFI_KVSAPPSTRLEN - my_bc_len;
548

549
    MPID_nem_ofi_create_req(&sreq, 1);
550
551
552
    sreq->kind = MPID_REQUEST_SEND;
    sreq->dev.OnDataAvail = NULL;
    sreq->dev.next = NULL;
553
554
    REQ_OFI(sreq)->event_callback = MPID_nem_ofi_connect_to_root_callback;
    REQ_OFI(sreq)->pack_buffer = my_bc;
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
    if (gl_data.api_set == API_SET_1) {
        conn_req_send_bits = init_sendtag(0, MPIR_Process.comm_world->rank, 0, MPID_CONN_REQ);
        FI_RC(fi_tsend(gl_data.endpoint,
                       REQ_OFI(sreq)->pack_buffer,
                       my_bc_len,
                       gl_data.mr,
                       VC_OFI(new_vc)->direct_addr,
                       conn_req_send_bits, &(REQ_OFI(sreq)->ofi_context)), tsend);
    } else {
        conn_req_send_bits = init_sendtag_2(0, 0, MPID_CONN_REQ);
        FI_RC(fi_tsenddata(gl_data.endpoint,
                           REQ_OFI(sreq)->pack_buffer,
                           my_bc_len,
                           gl_data.mr,
                           MPIR_Process.comm_world->rank,
                           VC_OFI(new_vc)->direct_addr,
                           conn_req_send_bits, &(REQ_OFI(sreq)->ofi_context)), tsend);
    }
573
    MPID_nem_ofi_poll(MPID_NONBLOCKING_POLL);
574
575
    VC_OFI(new_vc)->is_cmvc = 1;
    VC_OFI(new_vc)->next = gl_data.cm_vcs;
576
577
578
579
580
581
582
583
584
585
586
587
588
    gl_data.cm_vcs = new_vc;
  fn_exit:
    if (addr)
        MPIU_Free(addr);
    END_FUNC(FCNAME);
    return mpi_errno;
  fn_fail:
    if (my_bc)
        MPIU_Free(my_bc);
    goto fn_exit;
}

#undef FCNAME
589
590
#define FCNAME DECL_FUNC(MPID_nem_ofi_get_business_card)
int MPID_nem_ofi_get_business_card(int my_rank ATTRIBUTE((unused)),
591
592
593
594
595
596
                                   char **bc_val_p, int *val_max_sz_p)
{
    int mpi_errno = MPI_SUCCESS, str_errno = MPIU_STR_SUCCESS;
    BEGIN_FUNC(FCNAME);
    str_errno = MPIU_Str_add_binary_arg(bc_val_p,
                                        val_max_sz_p,
597
                                        "OFI",
598
599
600
601
602
603
604
                                        (char *) &gl_data.bound_addr, sizeof(gl_data.bound_addr));
    if (str_errno) {
        MPIU_ERR_CHKANDJUMP(str_errno == MPIU_STR_NOMEM, mpi_errno, MPI_ERR_OTHER, "**buscard_len");
        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**buscard");
    }
    END_FUNC_RC(FCNAME);
}