/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2012 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "ptl_impl.h"
#include "rptl.h"

#undef FUNCNAME
#define FUNCNAME dequeue_req
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
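/* Remove the matched request from the posted-receive queue and fill in its
   status (source, tag, count) from the Portals match bits and header data.
   Called by the dequeue handlers once a PUT/PUT_OVERFLOW event has matched
   this request's match-list entry. */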
static void dequeue_req(const ptl_event_t *e)
{
    int found;
    MPID_Request *const rreq = e->user_ptr;
    int s_len, r_len;

    /* At this point we know the ME is unlinked. Invalidate the handle to
       prevent further accesses, e.g. an attempted cancel. */
    REQ_PTL(rreq)->put_me = PTL_INVALID_HANDLE;

    found = MPIDI_CH3U_Recvq_DP(rreq);
    MPIU_Assert(found);

    rreq->status.MPI_ERROR = MPI_SUCCESS;
    rreq->status.MPI_SOURCE = NPTL_MATCH_GET_RANK(e->match_bits);
    rreq->status.MPI_TAG = NPTL_MATCH_GET_TAG(e->match_bits);

    MPID_Datatype_get_size_macro(rreq->dev.datatype, r_len);
    r_len *= rreq->dev.user_count;

    s_len = NPTL_HEADER_GET_LENGTH(e->hdr_data);

    if (s_len > r_len) {
        /* truncated data */
        MPIR_STATUS_SET_COUNT(rreq->status, r_len);
        MPIU_ERR_SET2(rreq->status.MPI_ERROR, MPI_ERR_TRUNCATE, "**truncate", "**truncate %d %d", s_len, r_len);
    }
    else
        MPIR_STATUS_SET_COUNT(rreq->status, s_len);
}

#undef FUNCNAME
#define FUNCNAME handler_recv_complete
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
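/* Common completion path for all receive flavors: release the MD bound for
   GET operations (if any), free any temporary chunk buffers, and complete
   the CH3 request. */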
static int handler_recv_complete(const ptl_event_t *e)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *const rreq = e->user_ptr;
    int ret;
    int i;
    MPIDI_STATE_DECL(MPID_STATE_HANDLER_RECV_COMPLETE);

    MPIDI_FUNC_ENTER(MPID_STATE_HANDLER_RECV_COMPLETE);
    
    MPIU_Assert(e->type == PTL_EVENT_REPLY || e->type == PTL_EVENT_PUT || e->type == PTL_EVENT_PUT_OVERFLOW);

    if (REQ_PTL(rreq)->md != PTL_INVALID_HANDLE) {
        ret = PtlMDRelease(REQ_PTL(rreq)->md);
        MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmdrelease", "**ptlmdrelease %s", MPID_nem_ptl_strerror(ret));
    }

    for (i = 0; i < MPID_NEM_PTL_NUM_CHUNK_BUFFERS; ++i)
        if (REQ_PTL(rreq)->chunk_buffer[i])
            MPIU_Free(REQ_PTL(rreq)->chunk_buffer[i]);
    
    MPIDI_CH3U_Request_complete(rreq);

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_HANDLER_RECV_COMPLETE);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME handler_recv_dequeue_complete
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
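/* Event handler for small (eager) receives posted directly on the user
   buffer or an IOV: dequeue the request, and if the message landed in the
   unexpected (overflow) buffer, copy or unpack it into the user buffer
   before completing. */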
static int handler_recv_dequeue_complete(const ptl_event_t *e)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *const rreq = e->user_ptr;
    int is_contig;
    MPI_Aint last;
    MPI_Aint dt_true_lb;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype *dt_ptr ATTRIBUTE((unused));

    MPIDI_STATE_DECL(MPID_STATE_HANDLER_RECV_DEQUEUE_COMPLETE);

    MPIDI_FUNC_ENTER(MPID_STATE_HANDLER_RECV_DEQUEUE_COMPLETE);

    MPIU_Assert(e->type == PTL_EVENT_PUT || e->type == PTL_EVENT_PUT_OVERFLOW);

    MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype, is_contig, data_sz, dt_ptr, dt_true_lb);
    
    dequeue_req(e);

    if (e->type == PTL_EVENT_PUT_OVERFLOW) {
        /* unpack the data from unexpected buffer */
        MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "is_contig = %d", is_contig);

        if (is_contig) {
            MPIU_Memcpy((char *)rreq->dev.user_buf + dt_true_lb, e->start, e->mlength);
        } else {
            last = e->mlength;
            MPID_Segment_unpack(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, e->start);
            if (last != e->mlength)
                MPIU_ERR_SET(rreq->status.MPI_ERROR, MPI_ERR_TYPE, "**dtypemismatch");
        }
    } else {
        if (!is_contig && data_sz != e->mlength)
            MPIU_ERR_SET(rreq->status.MPI_ERROR, MPI_ERR_TYPE, "**dtypemismatch");
    }
    
    mpi_errno = handler_recv_complete(e);

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_HANDLER_RECV_DEQUEUE_COMPLETE);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME handler_recv_big_get
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
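/* REPLY handler for the multi-GET path set up by big_get(): count down the
   outstanding GETs and, once the last one completes, unpack from the
   temporary buffer (if one was used) and complete the request. */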
static int handler_recv_big_get(const ptl_event_t *e)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *const rreq = e->user_ptr;
    MPI_Aint last;

    MPIDI_STATE_DECL(MPID_STATE_HANDLER_RECV_UNPACK);

    MPIDI_FUNC_ENTER(MPID_STATE_HANDLER_RECV_UNPACK);

    MPIU_Assert(e->type == PTL_EVENT_REPLY);

    /* decrement the number of remaining gets */
    REQ_PTL(rreq)->num_gets--;
    if (REQ_PTL(rreq)->num_gets == 0) {
        /* if we used a temporary buffer, unpack the data */
        if (REQ_PTL(rreq)->chunk_buffer[0]) {
            last = rreq->dev.segment_size;
            MPID_Segment_unpack(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, REQ_PTL(rreq)->chunk_buffer[0]);
            MPIU_Assert(last == rreq->dev.segment_size);
        }
        mpi_errno = handler_recv_complete(e);
    }

    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_HANDLER_RECV_UNPACK);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME big_get
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
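/* Fetch the remaining message data from the sender by issuing a series of
   GETs, each limited to the NI's max_msg_size; num_gets tracks how many
   REPLY events handler_recv_big_get() still has to see. */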
static void big_get(void *buf, ptl_size_t left_to_get, MPIDI_VC_t *vc, ptl_match_bits_t match_bits, MPID_Request *rreq)
{
    int ret;
    MPID_nem_ptl_vc_area *vc_ptl;
    ptl_size_t start, get_sz;

    vc_ptl = VC_PTL(vc);
    start = (ptl_size_t)buf;

    /* every GET issued below generates a REPLY event; route them all to
       handler_recv_big_get, which completes the request after the last one */
    REQ_PTL(rreq)->event_handler = handler_recv_big_get;

    while (left_to_get > 0) {
        /* get up to the maximum allowed by the portals interface */
        if (left_to_get > MPIDI_nem_ptl_ni_limits.max_msg_size)
            get_sz = MPIDI_nem_ptl_ni_limits.max_msg_size;
        else
            get_sz = left_to_get;

        ret = MPID_nem_ptl_rptl_get(MPIDI_nem_ptl_global_md, start, get_sz, vc_ptl->id, vc_ptl->ptg, match_bits, 0, rreq);
        DBG_MSG_GET("global", get_sz, vc->pg_rank, match_bits);
        MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "   buf=%p", (char *)start);
        MPIU_Assert(ret == 0);

        /* advance past the chunk just requested */
        start += get_sz;
        left_to_get -= get_sz;
        REQ_PTL(rreq)->num_gets++;
    }
}

#undef FUNCNAME
#define FUNCNAME handler_recv_unpack_complete
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
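/* Completion handler for receives whose data landed in a temporary buffer
   (or the overflow list): unpack the packed bytes into the user buffer via
   the request's segment, then run the common completion path. */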
static int handler_recv_unpack_complete(const ptl_event_t *e)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *const rreq = e->user_ptr;
    void *buf;
    MPIDI_STATE_DECL(MPID_STATE_HANDLER_RECV_UNPACK_COMPLETE);

    MPIDI_FUNC_ENTER(MPID_STATE_HANDLER_RECV_UNPACK_COMPLETE);
    
    MPIU_Assert(e->type == PTL_EVENT_REPLY || e->type == PTL_EVENT_PUT || e->type == PTL_EVENT_PUT_OVERFLOW);

    if (e->type == PTL_EVENT_PUT_OVERFLOW)
        buf = e->start;
    else
        buf = REQ_PTL(rreq)->chunk_buffer[0];

    mpi_errno = MPID_nem_ptl_unpack_byte(rreq->dev.segment_ptr, rreq->dev.segment_first,
                                         rreq->dev.segment_first + e->mlength, buf,
                                         &REQ_PTL(rreq)->overflow[0]);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    
    mpi_errno = handler_recv_complete(e);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_HANDLER_RECV_UNPACK_COMPLETE);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME handler_recv_dequeue_unpack_complete
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
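/* Small noncontiguous receive that used a bounce buffer: dequeue the request,
   then hand off to handler_recv_unpack_complete to unpack and finish. */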
static int handler_recv_dequeue_unpack_complete(const ptl_event_t *e)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_HANDLER_RECV_DEQUEUE_UNPACK_COMPLETE);

    MPIDI_FUNC_ENTER(MPID_STATE_HANDLER_RECV_DEQUEUE_UNPACK_COMPLETE);
    
    MPIU_Assert(e->type == PTL_EVENT_PUT || e->type == PTL_EVENT_PUT_OVERFLOW);

    dequeue_req(e);
    mpi_errno = handler_recv_unpack_complete(e);

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_HANDLER_RECV_DEQUEUE_UNPACK_COMPLETE);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME handler_recv_dequeue_large
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
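/* First event for a large receive: dequeue the request, absorb the eager
   chunk (possibly from the overflow buffer), then GET the rest either
   straight into the user buffer, through an IOV-backed MD, or into a
   bounce buffer that is unpacked on completion. */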
static int handler_recv_dequeue_large(const ptl_event_t *e)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *const rreq = e->user_ptr;
    MPIDI_VC_t *vc;
    MPID_nem_ptl_vc_area *vc_ptl;
    int ret;
    int dt_contig;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype *dt_ptr;
    MPI_Aint dt_true_lb;
    MPI_Aint last;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_HANDLER_RECV_DEQUEUE_LARGE);

    MPIDI_FUNC_ENTER(MPID_STATE_HANDLER_RECV_DEQUEUE_LARGE);
    
    MPIU_Assert(e->type == PTL_EVENT_PUT || e->type == PTL_EVENT_PUT_OVERFLOW);

    MPIDI_Comm_get_vc(rreq->comm, NPTL_MATCH_GET_RANK(e->match_bits), &vc);
    vc_ptl = VC_PTL(vc);
    
    dequeue_req(e);

    MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);

    /* unpack data from unexpected buffer first */
    if (e->type == PTL_EVENT_PUT_OVERFLOW) {
        if (dt_contig) {
            MPIU_Memcpy((char *)rreq->dev.user_buf + dt_true_lb, e->start, e->mlength);
        } else {
            last = e->mlength;
            MPID_Segment_unpack(rreq->dev.segment_ptr, 0, &last, e->start);
            MPIU_Assert(last == e->mlength);
            rreq->dev.segment_first = e->mlength;
        }
    }
    
    if (!(e->hdr_data & NPTL_LARGE)) {
        /* all data has already been received; we're done */
        mpi_errno = handler_recv_complete(e);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        goto fn_exit;
    }
        
    MPIU_Assert (e->mlength == PTL_LARGE_THRESHOLD);

    /* we need to GET the rest of the data from the sender's buffer */
    if (dt_contig) {
        big_get((char *)rreq->dev.user_buf + dt_true_lb + PTL_LARGE_THRESHOLD, data_sz - PTL_LARGE_THRESHOLD,
                vc, e->match_bits, rreq);
        goto fn_exit;
    }

    /* noncontig recv buffer */
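    /* Try to describe the remaining data with a single IOV of at most
       MPID_IOV_LIMIT entries; if that is not possible, fall back to a
       bounce buffer that is unpacked on completion. */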
    
    last = rreq->dev.segment_size;
    rreq->dev.iov_count = MPID_IOV_LIMIT;
    MPID_Segment_pack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, rreq->dev.iov, &rreq->dev.iov_count);

    if (last == rreq->dev.segment_size && rreq->dev.segment_size <= MPIDI_nem_ptl_ni_limits.max_msg_size + PTL_LARGE_THRESHOLD) {
        /* Rest of message fits in one IOV */
        ptl_md_t md;

        md.start = rreq->dev.iov;
        md.length = rreq->dev.iov_count;
        md.options = PTL_IOVEC;
        md.eq_handle = MPIDI_nem_ptl_origin_eq;
        md.ct_handle = PTL_CT_NONE;
        ret = PtlMDBind(MPIDI_nem_ptl_ni, &md, &REQ_PTL(rreq)->md);
        MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmdbind", "**ptlmdbind %s", MPID_nem_ptl_strerror(ret));

        REQ_PTL(rreq)->event_handler = handler_recv_complete;
        ret = MPID_nem_ptl_rptl_get(REQ_PTL(rreq)->md, 0, rreq->dev.segment_size - rreq->dev.segment_first, vc_ptl->id, vc_ptl->ptg,
                     e->match_bits, 0, rreq);
        MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlget", "**ptlget %s", MPID_nem_ptl_strerror(ret));
        goto fn_exit;
    }
        
    /* message won't fit in a single IOV, allocate buffer and unpack when received */
    /* FIXME: For now, allocate a single large buffer to hold entire message */
    MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, data_sz - PTL_LARGE_THRESHOLD,
                        mpi_errno, "chunk_buffer");
    big_get(REQ_PTL(rreq)->chunk_buffer[0], data_sz - PTL_LARGE_THRESHOLD, vc, e->match_bits, rreq);

 fn_exit:
    MPIU_CHKPMEM_COMMIT();
 fn_exit2:
    MPIDI_FUNC_EXIT(MPID_STATE_HANDLER_RECV_DEQUEUE_LARGE);
    return mpi_errno;
 fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit2;
}


#undef FUNCNAME
#define FUNCNAME handler_recv_dequeue_unpack_large
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
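/* Large noncontiguous receive whose first chunk went into a bounce buffer:
   dequeue, unpack the eager chunk, then GET the remainder into a freshly
   allocated buffer; handler_recv_big_get unpacks it once all GETs complete. */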
static int handler_recv_dequeue_unpack_large(const ptl_event_t *e)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *const rreq = e->user_ptr;
    MPIDI_VC_t *vc;
    MPI_Aint last;
    void *buf;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_HANDLER_RECV_DEQUEUE_UNPACK_LARGE);

    MPIDI_FUNC_ENTER(MPID_STATE_HANDLER_RECV_DEQUEUE_UNPACK_LARGE);
    MPIU_Assert(e->type == PTL_EVENT_PUT || e->type == PTL_EVENT_PUT_OVERFLOW);

    MPIDI_Comm_get_vc(rreq->comm, NPTL_MATCH_GET_RANK(e->match_bits), &vc);

    dequeue_req(e);

    if (!(e->hdr_data & NPTL_LARGE)) {
        /* all data has already been received; we're done */
        mpi_errno = handler_recv_unpack_complete(e);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        goto fn_exit;
    }

    if (e->type == PTL_EVENT_PUT_OVERFLOW)
        buf = e->start;
    else
        buf = REQ_PTL(rreq)->chunk_buffer[0];

    MPIU_Assert(e->mlength == PTL_LARGE_THRESHOLD);
    last = PTL_LARGE_THRESHOLD;
    MPID_Segment_unpack(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, buf);
    MPIU_Assert(last == PTL_LARGE_THRESHOLD);
    rreq->dev.segment_first += PTL_LARGE_THRESHOLD;
    MPIU_Free(REQ_PTL(rreq)->chunk_buffer[0]);

    MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, rreq->dev.segment_size - rreq->dev.segment_first,
                        mpi_errno, "chunk_buffer");
    big_get(REQ_PTL(rreq)->chunk_buffer[0], rreq->dev.segment_size - rreq->dev.segment_first, vc, e->match_bits, rreq);

 fn_exit:
    MPIU_CHKPMEM_COMMIT();
 fn_exit2:
    MPIDI_FUNC_EXIT(MPID_STATE_HANDLER_RECV_DEQUEUE_UNPACK_LARGE);
    return mpi_errno;
 fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit2;
}

#undef FUNCNAME
#define FUNCNAME MPID_nem_ptl_recv_posted
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
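/* Post a receive by appending a match-list entry to the priority list.  The
   ME describes the user buffer directly, an IOV, or a bounce buffer, and the
   event handler chosen here encodes the small/large and contig/noncontig
   case that the completion path has to handle. */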
int MPID_nem_ptl_recv_posted(MPIDI_VC_t *vc, MPID_Request *rreq)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_nem_ptl_vc_area *const vc_ptl = VC_PTL(vc);
    ptl_me_t me;
    int dt_contig;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype *dt_ptr;
    MPI_Aint dt_true_lb;
    MPI_Aint last;
    ptl_process_t id_any;
    int ret;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_PTL_RECV_POSTED);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_PTL_RECV_POSTED);

    id_any.phys.nid = PTL_NID_ANY;
    id_any.phys.pid = PTL_PID_ANY;

    MPID_nem_ptl_init_req(rreq);
    
    me.ct_handle = PTL_CT_NONE;
    me.uid = PTL_UID_ANY;
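    /* single-use entry that accepts PUTs; LINK/UNLINK events are suppressed,
       so the handlers only ever see PUT or PUT_OVERFLOW for this ME */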
    me.options = ( PTL_ME_OP_PUT | PTL_ME_IS_ACCESSIBLE | PTL_ME_EVENT_LINK_DISABLE |
                   PTL_ME_EVENT_UNLINK_DISABLE | PTL_ME_USE_ONCE );
    if (vc == NULL) {
        /* MPI_ANY_SOURCE receive */
        me.match_id = id_any;
    } else {
        if (!vc_ptl->id_initialized) {
            mpi_errno = MPID_nem_ptl_init_id(vc);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        }
        me.match_id = vc_ptl->id;
    }

    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "tag=%#x ctx=%#x rank=%#x", rreq->dev.match.parts.tag, rreq->dev.match.parts.context_id, rreq->dev.match.parts.rank));
    me.match_bits = NPTL_MATCH(rreq->dev.match.parts.tag, rreq->dev.match.parts.context_id,
                               rreq->dev.match.parts.rank);
    if (rreq->dev.match.parts.tag == MPI_ANY_TAG)
        me.ignore_bits = NPTL_MATCH_IGNORE_ANY_TAG;
    else
        me.ignore_bits = NPTL_MATCH_IGNORE;

    me.min_free = 0;

    MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);
    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "count=%d datatype=%#x contig=%d data_sz=%lu", rreq->dev.user_count, rreq->dev.datatype, dt_contig, data_sz));

    if (data_sz <= PTL_LARGE_THRESHOLD) {
        if (dt_contig) {
            /* small contig message */
            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Small contig message");
            me.start = (char *)rreq->dev.user_buf + dt_true_lb;
            me.length = data_sz;
            REQ_PTL(rreq)->event_handler = handler_recv_dequeue_complete;
        } else {
            /* small noncontig */
            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Small noncontig message");
            rreq->dev.segment_ptr = MPID_Segment_alloc();
            MPIU_ERR_CHKANDJUMP1(rreq->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
            MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype, rreq->dev.segment_ptr, 0);
            rreq->dev.segment_first = 0;
            rreq->dev.segment_size = data_sz;

            last = rreq->dev.segment_size;
            rreq->dev.iov_count = MPID_IOV_LIMIT;
            MPID_Segment_pack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, rreq->dev.iov, &rreq->dev.iov_count);

            if (last == rreq->dev.segment_size) {
                /* entire message fits in IOV */
                MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "    entire message fits in IOV");
                me.start = rreq->dev.iov;
                me.length = rreq->dev.iov_count;
                me.options |= PTL_IOVEC;
                REQ_PTL(rreq)->event_handler = handler_recv_dequeue_complete;
            } else {
                /* IOV is not long enough to describe entire message: recv into
                   buffer and unpack later */
                MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "    IOV too long: using bounce buffer");
                MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, data_sz, mpi_errno, "chunk_buffer");
                me.start = REQ_PTL(rreq)->chunk_buffer[0];
                me.length = data_sz;
                REQ_PTL(rreq)->event_handler = handler_recv_dequeue_unpack_complete;
            }
        }
    } else {
        /* Large message: Create an ME for the first chunk of data, then do a GET for the rest */
        if (dt_contig) {
            /* large contig message */
            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Large contig message");
            me.start = (char *)rreq->dev.user_buf + dt_true_lb;
            me.length = PTL_LARGE_THRESHOLD;
            REQ_PTL(rreq)->event_handler = handler_recv_dequeue_large;
        } else {
            /* large noncontig */
            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Large noncontig message");
            rreq->dev.segment_ptr = MPID_Segment_alloc();
            MPIU_ERR_CHKANDJUMP1(rreq->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
            MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype, rreq->dev.segment_ptr, 0);
            rreq->dev.segment_first = 0;
            rreq->dev.segment_size = data_sz;

            last = PTL_LARGE_THRESHOLD;
            rreq->dev.iov_count = MPID_IOV_LIMIT;
            MPID_Segment_pack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, rreq->dev.iov, &rreq->dev.iov_count);

            if (last == PTL_LARGE_THRESHOLD) {
                /* first chunk fits in IOV */
                MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "    first chunk fits in IOV");
                rreq->dev.segment_first = last;
                me.start = rreq->dev.iov;
                me.length = rreq->dev.iov_count;
                me.options |= PTL_IOVEC;
                REQ_PTL(rreq)->event_handler = handler_recv_dequeue_large;
            } else {
                /* IOV is not long enough to describe the first chunk: recv into
                   buffer and unpack later */
                MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "    IOV too long: using bounce buffer for first chunk");
                MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, PTL_LARGE_THRESHOLD, mpi_errno, "chunk_buffer");
                me.start = REQ_PTL(rreq)->chunk_buffer[0];
                me.length = PTL_LARGE_THRESHOLD;
                REQ_PTL(rreq)->event_handler = handler_recv_dequeue_unpack_large;
            }
        }
        
    }

    /* if there is no space to append the entry, process outstanding events and try again */
    while (1) {
        ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_pt, &me, PTL_PRIORITY_LIST, rreq, &REQ_PTL(rreq)->put_me);
        if (ret != PTL_NO_SPACE)
            break;
        MPID_nem_ptl_poll(1);
    }
    MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmeappend", "**ptlmeappend %s", MPID_nem_ptl_strerror(ret));
    DBG_MSG_MEAPPEND("REG", vc ? vc->pg_rank : MPI_ANY_SOURCE, me, rreq);
    MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "    buf=%p", me.start);
    MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "MPIDI_nem_ptl_pt = %d", MPIDI_nem_ptl_pt);

 fn_exit:
    MPIU_CHKPMEM_COMMIT();
 fn_exit2:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_PTL_RECV_POSTED);
    return mpi_errno;
 fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit2;
}

#undef FUNCNAME
#define FUNCNAME MPID_nem_ptl_anysource_posted
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
void MPID_nem_ptl_anysource_posted(MPID_Request *rreq)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_PTL_ANYSOURCE_POSTED);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_PTL_ANYSOURCE_POSTED);

    mpi_errno = MPID_nem_ptl_recv_posted(NULL, rreq);

    /* FIXME: This function is void, so we cannot return an error: the queue
       functions (where the posted_recv hooks are called) return no error
       code. */
    MPIU_Assertp(mpi_errno == MPI_SUCCESS);

    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_PTL_ANYSOURCE_POSTED);
}

#undef FUNCNAME
#define FUNCNAME cancel_recv
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
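/* Attempt to cancel a posted receive by unlinking its match-list entry.
   Sets *cancelled to TRUE only if PtlMEUnlink() succeeds; if the entry has
   already matched (handle invalidated or unlink fails), the receive cannot
   be cancelled. */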
static int cancel_recv(MPID_Request *rreq, int *cancelled)
{
    int mpi_errno = MPI_SUCCESS;
    int ptl_err   = PTL_OK;
    MPIDI_STATE_DECL(MPID_STATE_CANCEL_RECV);

    MPIDI_FUNC_ENTER(MPID_STATE_CANCEL_RECV);

    *cancelled = FALSE;

    /* An invalid handle indicates the operation has been completed
       and the matching list entry unlinked. At that point, the operation
       cannot be cancelled. */
    if (REQ_PTL(rreq)->put_me != PTL_INVALID_HANDLE) {
        ptl_err = PtlMEUnlink(REQ_PTL(rreq)->put_me);
        if (ptl_err == PTL_OK)
            *cancelled = TRUE;
        /* FIXME: if we properly invalidate matching list entry handles, we should be
           able to ensure an unlink operation results in either PTL_OK or PTL_IN_USE.
           Anything else would be an error. For now, though, we assume anything but PTL_OK
           is uncancelable and return. */
    }

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_CANCEL_RECV);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}


#undef FUNCNAME
#define FUNCNAME MPID_nem_ptl_anysource_matched
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
int MPID_nem_ptl_anysource_matched(MPID_Request *rreq)
{
    int mpi_errno, cancelled;

    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_PTL_ANYSOURCE_MATCHED);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_PTL_ANYSOURCE_MATCHED);

    mpi_errno = cancel_recv(rreq, &cancelled);
    /* FIXME: This function does not return an error because the queue
       functions (where the posted_recv hooks are called) return no error
       code. See also the comment on cancel_recv. */
    MPIU_Assertp(mpi_errno == MPI_SUCCESS);

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_PTL_ANYSOURCE_MATCHED);
    return !cancelled;
 fn_fail:
    goto fn_exit;
}



#undef FUNCNAME
#define FUNCNAME MPID_nem_ptl_cancel_recv
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
int MPID_nem_ptl_cancel_recv(MPIDI_VC_t *vc,  MPID_Request *rreq)
{
    int mpi_errno, cancelled;

    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_PTL_CANCEL_RECV);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_PTL_CANCEL_RECV);

    mpi_errno = cancel_recv(rreq, &cancelled);
    /* FIXME: This function does not return an error because the queue
       functions (where the posted_recv hooks are called) return no error
       code. */
    MPIU_Assertp(mpi_errno == MPI_SUCCESS);

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_PTL_CANCEL_RECV);
    return !cancelled;
 fn_fail:
    goto fn_exit;
}



#undef FUNCNAME
#define FUNCNAME MPID_nem_ptl_lmt_start_recv
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
int MPID_nem_ptl_lmt_start_recv(MPIDI_VC_t *vc,  MPID_Request *rreq, MPID_IOV s_cookie)
{
    /* This function should only be called as a result of an Mrecv, because of the CH3 protocol for
       rendezvous Mrecvs. The regular CH3 protocol is not optimal for Portals, since we don't need
       to exchange RTS/CTS. We need this code here because at the time of the Mprobe we don't know
       the target buffer, but we dequeue (and lose) the Portals entry. This doesn't happen on
       regular large transfers because we handle them directly in the netmod. */
    int mpi_errno = MPI_SUCCESS;
    int dt_contig;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype *dt_ptr;
    MPI_Aint dt_true_lb;
    ptl_match_bits_t match_bits;
    int was_incomplete;
    int ret;
    MPID_nem_ptl_vc_area *vc_ptl = VC_PTL(vc);
    MPIU_CHKPMEM_DECL(1);

    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_PTL_LMT_START_RECV);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_PTL_LMT_START_RECV);

    /* This rendezvous protocol does not exchange RTS/CTS: the sender has already exposed the
       data, so we simply GET the remainder directly. */
    /* The following code mirrors handler_recv_dequeue_large. */

    match_bits = NPTL_MATCH(rreq->dev.match.parts.tag, rreq->dev.match.parts.context_id,
                            rreq->dev.match.parts.rank);
    MPIDI_CH3U_Request_increment_cc(rreq, &was_incomplete);
    MPIU_Assert(was_incomplete == 0);
    MPIU_Object_add_ref(rreq);

    MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype, dt_contig, data_sz, dt_ptr,
                            dt_true_lb);
    if (dt_contig) {
        void * real_user_buf = (char *)rreq->dev.user_buf + dt_true_lb;

        big_get((char *)real_user_buf + PTL_LARGE_THRESHOLD, data_sz - PTL_LARGE_THRESHOLD, vc, match_bits, rreq);

        /* The memcpy is done after the get purposely for overlapping */
        MPIU_Memcpy(real_user_buf, rreq->dev.tmpbuf, PTL_LARGE_THRESHOLD);
    }
    else {
        MPI_Aint last;

        rreq->dev.segment_ptr = MPID_Segment_alloc();
        MPIU_ERR_CHKANDJUMP1(rreq->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem",
                             "**nomem %s", "MPID_Segment_alloc");
        MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype,
                          rreq->dev.segment_ptr, 0);
        rreq->dev.segment_first = 0;
        rreq->dev.segment_size = data_sz;
        last = PTL_LARGE_THRESHOLD;
        MPID_Segment_unpack(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, rreq->dev.tmpbuf);
        MPIU_Assert(last == PTL_LARGE_THRESHOLD);
        rreq->dev.segment_first = PTL_LARGE_THRESHOLD;
        last = rreq->dev.segment_size;
        rreq->dev.iov_count = MPID_IOV_LIMIT;
        MPID_Segment_pack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, rreq->dev.iov,
                                 &rreq->dev.iov_count);
        if (last == rreq->dev.segment_size && last <= MPIDI_nem_ptl_ni_limits.max_msg_size + PTL_LARGE_THRESHOLD) {
            /* Rest of message fits in one IOV */
            ptl_md_t md;

            md.start = rreq->dev.iov;
            md.length = rreq->dev.iov_count;
            md.options = PTL_IOVEC;
            md.eq_handle = MPIDI_nem_ptl_origin_eq;
            md.ct_handle = PTL_CT_NONE;
            ret = PtlMDBind(MPIDI_nem_ptl_ni, &md, &REQ_PTL(rreq)->md);
            MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmdbind", "**ptlmdbind %s",
                                 MPID_nem_ptl_strerror(ret));

            REQ_PTL(rreq)->event_handler = handler_recv_complete;
            ret = MPID_nem_ptl_rptl_get(REQ_PTL(rreq)->md, 0, rreq->dev.segment_size - rreq->dev.segment_first,
                                        vc_ptl->id, vc_ptl->ptg, match_bits, 0, rreq);
            MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlget", "**ptlget %s",
                                 MPID_nem_ptl_strerror(ret));
        }
        else {
            /* message won't fit in a single IOV, allocate buffer and unpack when received */
            /* FIXME: For now, allocate a single large buffer to hold entire message */
            MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, rreq->dev.segment_size - rreq->dev.segment_first,
                                mpi_errno, "chunk_buffer");
            big_get(REQ_PTL(rreq)->chunk_buffer[0], rreq->dev.segment_size - rreq->dev.segment_first, vc, match_bits, rreq);
        }
    }
    MPIU_Free(rreq->dev.tmpbuf);
    rreq->ch.lmt_tmp_cookie.MPID_IOV_LEN = 0;  /* Required for do_cts in mpid_nem_lmt.c */

 fn_exit:
    MPIU_CHKPMEM_COMMIT();
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_PTL_LMT_START_RECV);
    return mpi_errno;
 fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
}