/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2006 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#ifndef _MPID_NEM_INLINE_H
#define _MPID_NEM_INLINE_H

/* NOTE(review): not referenced in this header; presumably the number of
 * progress polls a thread performs before yielding -- confirm at the use
 * site. */
#define MPID_NEM_THREAD_POLLS_BEFORE_YIELD 10

#include "my_papi_defs.h"
#include "mpiiov.h"
#include "mpidi_nem_statistics.h"
#include "mpit.h"

extern int MPID_nem_lmt_shm_pending;

/* When PREFETCH_CELL is defined, the send routines below refill this with a
 * cell dequeued from my_freeQ after each queue send (or set it to 0 when the
 * free queue is empty) and consume it at the start of the next send. */
extern MPID_nem_cell_ptr_t MPID_nem_prefetched_cell;

static inline int MPID_nem_mpich_send_header (void* buf, int size, MPIDI_VC_t *vc, int *again);
static inline int MPID_nem_mpich_sendv (MPID_IOV **iov, int *n_iov, MPIDI_VC_t *vc, int *again);
static inline void MPID_nem_mpich_dequeue_fastbox (int local_rank);
static inline void MPID_nem_mpich_enqueue_fastbox (int local_rank);
static inline int MPID_nem_mpich_sendv_header (MPID_IOV **iov, int *n_iov, MPIDI_VC_t *vc, int *again);
25
static inline int MPID_nem_recv_seqno_matches (MPID_nem_queue_ptr_t qhead);
26
27
28
29
30
static inline int MPID_nem_mpich_test_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox, int in_blocking_progress);
static inline int MPID_nem_mpich_blocking_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox, int completions);
static inline int MPID_nem_mpich_test_recv_wait (MPID_nem_cell_ptr_t *cell, int *in_fbox, int timeout);
static inline int MPID_nem_mpich_release_cell (MPID_nem_cell_ptr_t cell, MPIDI_VC_t *vc);
static inline void MPID_nem_mpich_send_seg_header (MPID_Segment *segment, MPIDI_msg_sz_t *segment_first,
31
32
                                                           MPIDI_msg_sz_t segment_size, void *header, MPIDI_msg_sz_t header_sz,
                                                           MPIDI_VC_t *vc, int *again);
33
static inline void MPID_nem_mpich_send_seg (MPID_Segment *segment, MPIDI_msg_sz_t *segment_first, MPIDI_msg_sz_t segment_size,
34
35
                                                    MPIDI_VC_t *vc, int *again);

36
37
/* Evaluates to TRUE if it is safe to block on recv operations in the progress
 * loop, FALSE otherwise.  It is considered unsafe while any of the following
 * holds: a local LMT is pending, a shared-memory send is active, the
 * shared-memory send queue is non-empty, or scheduled operations
 * (MPIDU_Sched) are pending. */
#define MPID_nem_safe_to_block_recv()                       \
    (!MPID_nem_local_lmt_pending &&                         \
     !MPIDI_CH3I_shm_active_send &&                         \
     !MPIDI_CH3I_Sendq_head(MPIDI_CH3I_shm_sendq) &&        \
     !MPIDU_Sched_are_pending())

#undef FUNCNAME
45
#define FUNCNAME MPID_nem_mpich_send_header
46
47
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
48
static inline int
49
MPID_nem_mpich_send_header (void* buf, int size, MPIDI_VC_t *vc, int *again)
50
51
52
53
{
    int mpi_errno = MPI_SUCCESS;
    MPID_nem_cell_ptr_t el;
    int my_rank;
54
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
55
56
57
58

    /*DO_PAPI (PAPI_reset (PAPI_EventSet)); */

    MPIU_Assert (size == sizeof(MPIDI_CH3_Pkt_t));
59
    MPIU_Assert (vc_ch->is_local);
60
61
62
63
64

    my_rank = MPID_nem_mem_region.rank;

#ifdef USE_FASTBOX
    {
65
	MPID_nem_fbox_mpich_t *pbox = vc_ch->fbox_out;
66

67
68
69
70
        /* _is_full contains acquire barrier */
        if (MPID_nem_fbox_is_full((MPID_nem_fbox_common_ptr_t)pbox))
            goto usequeue_l;

71
72
73
        pbox->cell.pkt.mpich.source  = MPID_nem_mem_region.local_rank;
        pbox->cell.pkt.mpich.datalen = size;
        pbox->cell.pkt.mpich.seqno   = vc_ch->send_seqno++;
74
        
75
        MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, pbox->cell.pkt.mpich.type = MPID_NEM_PKT_MPICH_HEAD);
76
        
77
        MPIU_Memcpy((void *)pbox->cell.pkt.mpich.p.payload, buf, size);
78
79
80

        OPA_store_release_int(&pbox->flag.value, 1);

81
82
83
84
        MPIU_DBG_MSG (CH3_CHANNEL, VERBOSE, "--> Sent fbox ");
        MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (&pbox->cell));
        
        goto return_success;
85
86
    }
 usequeue_l:
87
    MPIR_T_PVAR_COUNTER_INC_VAR(NEM, &MPID_nem_fbox_fall_back_to_queue_count[MPID_nem_mem_region.local_ranks[vc->lpid]], 1);
88

89
#endif /*USE_FASTBOX */
90

91
92
93
#ifdef PREFETCH_CELL
    DO_PAPI (PAPI_reset (PAPI_EventSet));
    el = MPID_nem_prefetched_cell;
94

95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
    if (!el)
    {
	if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	    goto return_again;
	
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &el);
    }
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues14));
#else /* PREFETCH_CELL */
    DO_PAPI (PAPI_reset (PAPI_EventSet));
    if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
    {
	goto return_again;
    }
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues14));

    DO_PAPI (PAPI_reset (PAPI_EventSet));
    MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ , &el);
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues10));
#endif /* PREFETCH_CELL */

    DO_PAPI (PAPI_reset (PAPI_EventSet));
117
118
119
120
121
    el->pkt.mpich.source  = my_rank;
    el->pkt.mpich.dest    = vc->lpid;
    el->pkt.mpich.datalen = size;
    el->pkt.mpich.seqno   = vc_ch->send_seqno++;
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, el->pkt.mpich.type = MPID_NEM_PKT_MPICH_HEAD);
122
    
123
    MPIU_Memcpy((void *)el->pkt.mpich.p.payload, buf, size);
124
125
126
127
128
129
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues11));

    MPIU_DBG_MSG (CH3_CHANNEL, VERBOSE, "--> Sent queue");
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (el));

    DO_PAPI (PAPI_reset (PAPI_EventSet));
130
131
    MPID_nem_queue_enqueue (vc_ch->recv_queue, el);
    /*MPID_nem_rel_dump_queue( vc_ch->recv_queue ); */
132
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues12));
133
    DO_PAPI (PAPI_reset (PAPI_EventSet));
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

#ifdef PREFETCH_CELL
    DO_PAPI (PAPI_reset (PAPI_EventSet));
    if (!MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &MPID_nem_prefetched_cell);
    else
	MPID_nem_prefetched_cell = 0;
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues10));
#endif /*PREFETCH_CELL */

    /*DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues14)); */

 return_success:
    *again = 0;
    goto fn_exit;
 return_again:
    *again = 1;
    goto fn_exit;
 fn_exit:
    return mpi_errno;
154
155
 fn_fail:
    goto fn_exit;
156
157
158
159
}


/*
160
  int MPID_nem_mpich_sendv (struct iovec **iov, int *n_iov, MPIDI_VC_t *vc);
161
162
163

  sends iov to vc
  Non-blocking
164
  if iov specifies more than MPID_NEM_MPICH_DATA_LEN of data, the iov will be truncated, so that after MPID_nem_mpich_sendv returns,
165
166
167
168
  iov will describe unsent data
  sets again to 1 if it can't get a free cell, 0 otherwise
*/
#undef FUNCNAME
169
#define FUNCNAME MPID_nem_mpich_sendv
170
171
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
172
static inline int
173
MPID_nem_mpich_sendv (MPID_IOV **iov, int *n_iov, MPIDI_VC_t *vc, int *again)
174
175
176
177
178
179
{
    int mpi_errno = MPI_SUCCESS;
    MPID_nem_cell_ptr_t el;
    char *cell_buf;
    MPIDI_msg_sz_t payload_len;    
    int my_rank;
180
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
181
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_MPICH_SENDV);
182

183
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_MPICH_SENDV);
184

185
    MPIU_Assert (*n_iov > 0 && (*iov)->MPID_IOV_LEN > 0);
186
187
    MPIU_Assert(vc_ch->is_local);

188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
    DO_PAPI (PAPI_reset (PAPI_EventSet));

    my_rank = MPID_nem_mem_region.rank;
	
#ifdef PREFETCH_CELL
    el = MPID_nem_prefetched_cell;
    
    if (!el)
    {
	if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	{
	    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));
            goto return_again;
	}
	
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &el);
    }
#else /*PREFETCH_CELL     */
    if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
    {
	DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));
        goto return_again;
    }

    MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ , &el);
#endif /*PREFETCH_CELL     */

215
    payload_len = MPID_NEM_MPICH_DATA_LEN;
216
    cell_buf    = (char *) el->pkt.mpich.p.payload; /* cast away volatile */
217
    
218
    while (*n_iov && payload_len >= (*iov)->MPID_IOV_LEN)
219
    {
220
	size_t _iov_len = (*iov)->MPID_IOV_LEN;
221
	MPIU_Memcpy (cell_buf, (*iov)->MPID_IOV_BUF, _iov_len);
222
223
224
225
226
227
228
229
	payload_len -= _iov_len;
	cell_buf += _iov_len;
	--(*n_iov);
	++(*iov);
    }
    
    if (*n_iov && payload_len > 0)
    {
230
	MPIU_Memcpy (cell_buf, (*iov)->MPID_IOV_BUF, payload_len);
231
232
	(*iov)->MPID_IOV_BUF = (char *)(*iov)->MPID_IOV_BUF + payload_len;
	(*iov)->MPID_IOV_LEN -= payload_len;
233
234
235
 	payload_len = 0;
    }

236
237
238
239
240
    el->pkt.mpich.source  = my_rank;
    el->pkt.mpich.dest    = vc->lpid;
    el->pkt.mpich.datalen = MPID_NEM_MPICH_DATA_LEN - payload_len;
    el->pkt.mpich.seqno   = vc_ch->send_seqno++;
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, el->pkt.mpich.type = MPID_NEM_PKT_MPICH);
241
242
243
244

    MPIU_DBG_MSG (CH3_CHANNEL, VERBOSE, "--> Sent queue");
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (el));

245
246
    MPID_nem_queue_enqueue (vc_ch->recv_queue, el);
    /*MPID_nem_rel_dump_queue( vc_ch->recv_queue ); */
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261

#ifdef PREFETCH_CELL
    if (!MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &MPID_nem_prefetched_cell);
    else
	MPID_nem_prefetched_cell = 0;
#endif /*PREFETCH_CELL */
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));

    *again = 0;
    goto fn_exit;
 return_again:
    *again = 1;
    goto fn_exit;
 fn_exit:
262
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_MPICH_SENDV);
263
    return mpi_errno;
264
265
 fn_fail:
    goto fn_exit;
266
267
}

268
269
/* MPID_nem_mpich_sendv_header (struct iovec **iov, int *n_iov, int dest)
   same as above but first iov element is an MPICH header */
270
#undef FUNCNAME
271
#define FUNCNAME MPID_nem_mpich_sendv_header
272
273
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
274
static inline int
275
MPID_nem_mpich_sendv_header (MPID_IOV **iov, int *n_iov, MPIDI_VC_t *vc, int *again)
276
277
278
279
280
281
{
    int mpi_errno = MPI_SUCCESS;
    MPID_nem_cell_ptr_t el;
    char *cell_buf;
    MPIDI_msg_sz_t payload_len;    
    int my_rank;
282
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
283
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_MPICH_SENDV_HEADER);
284
    
285
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_MPICH_SENDV_HEADER);
286

287
288
    MPIU_Assert(vc_ch->is_local);

289
    DO_PAPI (PAPI_reset (PAPI_EventSet));
290
    MPIU_Assert (*n_iov > 0 && (*iov)->MPID_IOV_LEN == sizeof(MPIDI_CH3_Pkt_t));
291
292
293
294

    my_rank = MPID_nem_mem_region.rank;

#ifdef USE_FASTBOX
295
    if (*n_iov == 2 && (*iov)[1].MPID_IOV_LEN + sizeof(MPIDI_CH3_Pkt_t) <= MPID_NEM_FBOX_DATALEN)
296
    {
297
	MPID_nem_fbox_mpich_t *pbox = vc_ch->fbox_out;
298

299
300
301
        if (MPID_nem_fbox_is_full((MPID_nem_fbox_common_ptr_t)pbox))
            goto usequeue_l;

302
303
304
305
        pbox->cell.pkt.mpich.source  = MPID_nem_mem_region.local_rank;
        pbox->cell.pkt.mpich.datalen = (*iov)[1].MPID_IOV_LEN + sizeof(MPIDI_CH3_Pkt_t);
        pbox->cell.pkt.mpich.seqno   = vc_ch->send_seqno++;
        MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, pbox->cell.pkt.mpich.type = MPID_NEM_PKT_MPICH_HEAD);
306
        
307
308
        MPIU_Memcpy((void *)pbox->cell.pkt.mpich.p.payload, (*iov)[0].MPID_IOV_BUF, (*iov)[0].MPID_IOV_LEN);
        MPIU_Memcpy ((char *)pbox->cell.pkt.mpich.p.payload + (*iov)[0].MPID_IOV_LEN, (*iov)[1].MPID_IOV_BUF, (*iov)[1].MPID_IOV_LEN);
309
        
310
        OPA_store_release_int(&pbox->flag.value, 1);
311
312
313
314
315
316
        *n_iov = 0;

        MPIU_DBG_MSG (CH3_CHANNEL, VERBOSE, "--> Sent fbox ");
        MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (&pbox->cell));
        
        goto return_success;
317
318
    }
 usequeue_l:
319
    MPIR_T_PVAR_COUNTER_INC_VAR(NEM, &MPID_nem_fbox_fall_back_to_queue_count[MPID_nem_mem_region.local_ranks[vc->lpid]], 1);
320

321
322
323
324
#endif /*USE_FASTBOX */
	
#ifdef PREFETCH_CELL
    el = MPID_nem_prefetched_cell;
325

326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
    if (!el)
    {
	if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	{
	    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));
            goto return_again;
	}
	
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &el);
    }
#else /*PREFETCH_CELL    */
    if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
    {
	DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));
        goto return_again;
    }

    MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &el);
#endif /*PREFETCH_CELL */

346
    MPIU_Memcpy((void *)el->pkt.mpich.p.payload, (*iov)->MPID_IOV_BUF, sizeof(MPIDI_CH3_Pkt_t));
347

348
    cell_buf = (char *)(el->pkt.mpich.p.payload) + sizeof(MPIDI_CH3_Pkt_t);
349
350
351
    ++(*iov);
    --(*n_iov);

352
    payload_len = MPID_NEM_MPICH_DATA_LEN - sizeof(MPIDI_CH3_Pkt_t);
353
    while (*n_iov && payload_len >= (*iov)->MPID_IOV_LEN)
354
    {
355
	size_t _iov_len = (*iov)->MPID_IOV_LEN;
356
	MPIU_Memcpy (cell_buf, (*iov)->MPID_IOV_BUF, _iov_len);
357
358
359
360
361
362
363
364
	payload_len -= _iov_len;
	cell_buf += _iov_len;
	--(*n_iov);
	++(*iov);
    }
    
    if (*n_iov && payload_len > 0)
    {
365
	MPIU_Memcpy (cell_buf, (*iov)->MPID_IOV_BUF, payload_len);
366
367
	(*iov)->MPID_IOV_BUF = (char *)(*iov)->MPID_IOV_BUF + payload_len;
	(*iov)->MPID_IOV_LEN -= payload_len;
368
369
370
	payload_len = 0;
    }

371
372
373
374
375
    el->pkt.mpich.source  = my_rank;
    el->pkt.mpich.dest    = vc->lpid;
    el->pkt.mpich.datalen = MPID_NEM_MPICH_DATA_LEN - payload_len;
    el->pkt.mpich.seqno   = vc_ch->send_seqno++;
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, el->pkt.mpich.type = MPID_NEM_PKT_MPICH_HEAD);
376
377
378
379

    MPIU_DBG_MSG (CH3_CHANNEL, VERBOSE, "--> Sent queue");
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (el));

380
381
    MPID_nem_queue_enqueue (vc_ch->recv_queue, el);	
    /*MPID_nem_rel_dump_queue( vc_ch->recv_queue ); */
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397

#ifdef PREFETCH_CELL
    if (!MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &MPID_nem_prefetched_cell);
    else
	MPID_nem_prefetched_cell = 0;
#endif /*PREFETCH_CELL */
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));

 return_success:
    *again = 0;
    goto fn_exit;
 return_again:
    *again = 1;
    goto fn_exit;
 fn_exit:
398
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_MPICH_SENDV_HEADER);
399
    return mpi_errno;
400
401
 fn_fail:
    goto fn_exit;
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
}

/* send the header and data described by the segment in one cell.  If
   there is no cell available, *again is set to 1.  If all of the data
   cannot be sent, *segment_first is set to the index of the first
   unsent byte.
   Pre condition:  This must be the first packet of a message (i.e.,
                       *segment first == 0)
                   The destination process is local
   Post conditions:  the header has been sent iff *again == 0
                     if there is data to send (segment_size > 0) then
                         (the header has been sent iff any data has
                         been sent (i.e., *segment_first > 0) )
                     i.e.: we will never send only the header
*/
#undef FUNCNAME
418
#define FUNCNAME MPID_nem_mpich_send_seg_header
419
420
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
421
static inline void
422
MPID_nem_mpich_send_seg_header (MPID_Segment *segment, MPIDI_msg_sz_t *segment_first, MPIDI_msg_sz_t segment_size,
423
424
425
                                 void *header, MPIDI_msg_sz_t header_sz, MPIDI_VC_t *vc, int *again)
{
    MPID_nem_cell_ptr_t el;
426
    MPIDI_msg_sz_t datalen;
427
428
    int my_rank;
    MPIDI_msg_sz_t last;
429
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
430
431
432
433
434
435
436
437
438
439
440
441
442

    MPIU_Assert(vc_ch->is_local); /* netmods will have their own implementation */
    MPIU_Assert(header_sz <= sizeof(MPIDI_CH3_Pkt_t));
    MPIU_Assert(*segment_first == 0); /* this routine is only called for new messages */
    
    
    DO_PAPI (PAPI_reset (PAPI_EventSet));

    my_rank = MPID_nem_mem_region.rank;

#ifdef USE_FASTBOX
    if (sizeof(MPIDI_CH3_Pkt_t) + segment_size <= MPID_NEM_FBOX_DATALEN)
    {
443
	MPID_nem_fbox_mpich_t *pbox = vc_ch->fbox_out;
444

445
446
447
        if (MPID_nem_fbox_is_full((MPID_nem_fbox_common_ptr_t)pbox))
            goto usequeue_l;

448
	{
449
450
451
452
	    pbox->cell.pkt.mpich.source  = MPID_nem_mem_region.local_rank;
	    pbox->cell.pkt.mpich.datalen = sizeof(MPIDI_CH3_Pkt_t) + segment_size;
	    pbox->cell.pkt.mpich.seqno   = vc_ch->send_seqno++;
            MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, pbox->cell.pkt.mpich.type = MPID_NEM_PKT_MPICH_HEAD);
453
454

            /* copy header */
455
            MPIU_Memcpy((void *)pbox->cell.pkt.mpich.p.payload, header, header_sz);
456
            
457
458
            /* copy data */
            last = segment_size;
459
            MPID_Segment_pack(segment, *segment_first, &last, (char *)pbox->cell.pkt.mpich.p.payload + sizeof(MPIDI_CH3_Pkt_t));
460
            MPIU_Assert(last == segment_size);
461

462
            OPA_store_release_int(&pbox->flag.value, 1);
463
464
465
466
467
468
469
470
471
472

            *segment_first = last;

	    MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "--> Sent fbox ");
	    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (&pbox->cell));

            goto return_success;
	}
    }
 usequeue_l:
473
    MPIR_T_PVAR_COUNTER_INC_VAR(NEM, &MPID_nem_fbox_fall_back_to_queue_count[MPID_nem_mem_region.local_ranks[vc->lpid]], 1);
474

475
476
477
478
#endif /*USE_FASTBOX */
	
#ifdef PREFETCH_CELL
    el = MPID_nem_prefetched_cell;
479

480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
    if (!el)
    {
	if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	{
	    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));
            goto return_again;
	}
	
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &el);
    }
#else /*PREFETCH_CELL    */
    if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
    {
	DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));
        goto return_again;
    }

    MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &el);
#endif /*PREFETCH_CELL */

    /* copy header */
501
    MPIU_Memcpy((void *)el->pkt.mpich.p.payload, header, header_sz);
502
503
    
    /* copy data */
504
    if (segment_size - *segment_first <= MPID_NEM_MPICH_DATA_LEN - sizeof(MPIDI_CH3_Pkt_t))
505
506
        last = segment_size;
    else
507
        last = *segment_first + MPID_NEM_MPICH_DATA_LEN - sizeof(MPIDI_CH3_Pkt_t);
508
    
509
    MPID_Segment_pack(segment, *segment_first, &last, (char *)el->pkt.mpich.p.payload + sizeof(MPIDI_CH3_Pkt_t));
510
511
512
    datalen = sizeof(MPIDI_CH3_Pkt_t) + last - *segment_first;
    *segment_first = last;
    
513
514
515
516
517
    el->pkt.mpich.source  = my_rank;
    el->pkt.mpich.dest    = vc->lpid;
    el->pkt.mpich.datalen = datalen;
    el->pkt.mpich.seqno   = vc_ch->send_seqno++;
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, el->pkt.mpich.type = MPID_NEM_PKT_MPICH_HEAD);
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541

    MPIU_DBG_MSG (CH3_CHANNEL, VERBOSE, "--> Sent queue");
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (el));

    MPID_nem_queue_enqueue (vc_ch->recv_queue, el);	

#ifdef PREFETCH_CELL
    if (!MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &MPID_nem_prefetched_cell);
    else
	MPID_nem_prefetched_cell = 0;
#endif /*PREFETCH_CELL */
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));

 return_success:
    *again = 0;
    goto fn_exit;
 return_again:
    *again = 1;
    goto fn_exit;
 fn_exit:
    return;
}

542
/* similar to MPID_nem_mpich_send_seg_header, except there is no
543
544
   header to send.  This need not be the first packet of a message. */
#undef FUNCNAME
545
#define FUNCNAME MPID_nem_mpich_send_seg
546
547
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
548
static inline void
549
MPID_nem_mpich_send_seg (MPID_Segment *segment, MPIDI_msg_sz_t *segment_first, MPIDI_msg_sz_t segment_size, MPIDI_VC_t *vc, int *again)
550
551
{
    MPID_nem_cell_ptr_t el;
552
    MPIDI_msg_sz_t datalen;
553
554
    int my_rank;
    MPIDI_msg_sz_t last;
555
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
556

557
    MPIU_Assert(vc_ch->is_local); /* netmods will have their own implementation */    
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
    
    DO_PAPI (PAPI_reset (PAPI_EventSet));

    my_rank = MPID_nem_mem_region.rank;
	
#ifdef PREFETCH_CELL
    el = MPID_nem_prefetched_cell;
    
    if (!el)
    {
	if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	{
	    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));
            goto return_again;
	}
	
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &el);
    }
#else /*PREFETCH_CELL    */
    if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
    {
	DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));
        goto return_again;
    }

    MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &el);
#endif /*PREFETCH_CELL */

    /* copy data */
587
    if (segment_size - *segment_first <= MPID_NEM_MPICH_DATA_LEN)
588
589
        last = segment_size;
    else
590
        last = *segment_first + MPID_NEM_MPICH_DATA_LEN;
591
    
592
    MPID_Segment_pack(segment, *segment_first, &last, (char *)el->pkt.mpich.p.payload);
593
594
595
    datalen = last - *segment_first;
    *segment_first = last;
    
596
597
598
599
600
    el->pkt.mpich.source  = my_rank;
    el->pkt.mpich.dest    = vc->lpid;
    el->pkt.mpich.datalen = datalen;
    el->pkt.mpich.seqno   = vc_ch->send_seqno++;
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, el->pkt.mpich.type = MPID_NEM_PKT_MPICH_HEAD);
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624

    MPIU_DBG_MSG (CH3_CHANNEL, VERBOSE, "--> Sent queue");
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (el));

    MPID_nem_queue_enqueue (vc_ch->recv_queue, el);	

#ifdef PREFETCH_CELL
    if (!MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &MPID_nem_prefetched_cell);
    else
	MPID_nem_prefetched_cell = 0;
#endif /*PREFETCH_CELL */
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));

    *again = 0;
    goto fn_exit;
 return_again:
    *again = 1;
    goto fn_exit;
 fn_exit:
    return;
}

/*
625
  MPID_nem_mpich_dequeue_fastbox (int local_rank)
626
627
628
629
630
631
  decrements usage count on fastbox for process with local rank local_rank and
  dequeues it from fbox queue if usage is 0.
  This function is called whenever a receive for a process on this node is matched.
  Fastboxes on fbox queue are polled regularly for incoming messages.
*/
#undef FUNCNAME
632
#define FUNCNAME MPID_nem_mpich_dequeue_fastbox
633
634
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
635
static inline void MPID_nem_mpich_dequeue_fastbox(int local_rank)
636
637
638
{
    MPID_nem_fboxq_elem_t *el;

639
640
    MPIU_Assert(local_rank < MPID_nem_mem_region.num_local);

641
    el = &MPID_nem_fboxq_elem_list[local_rank];    
642
    MPIU_Assert(el->fbox != NULL);
643

644
    MPIU_Assert(el->usage);
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665

    --el->usage;
    if (el->usage == 0)
    {
	if (el->prev == NULL)
	    MPID_nem_fboxq_head = el->next;
	else
	    el->prev->next = el->next;

	if (el->next == NULL)
	    MPID_nem_fboxq_tail = el->prev;
	else
	    el->next->prev = el->prev;

	if (el == MPID_nem_curr_fboxq_elem)
	{
	    if (el->next == NULL)
		MPID_nem_curr_fboxq_elem = MPID_nem_fboxq_head;
	    else
		MPID_nem_curr_fboxq_elem = el->next;
	}
666
    }    
667
668
669
}

/*
670
  MPID_nem_mpich_enqueue_fastbox (int local_rank)
671
672
673
674
675
  enqueues fastbox for process with local rank local_rank on fbox queue
  This function is called whenever a receive is posted for a process on this node.
  Fastboxes on fbox queue are polled regularly for incoming messages.
*/
#undef FUNCNAME
676
#define FUNCNAME MPID_nem_mpich_dequeue_fastbox
677
678
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
679
static inline void MPID_nem_mpich_enqueue_fastbox(int local_rank)
680
681
682
{
    MPID_nem_fboxq_elem_t *el;

683
684
    MPIU_Assert(local_rank < MPID_nem_mem_region.num_local);

685
    el = &MPID_nem_fboxq_elem_list[local_rank];
686
687
    MPIU_Assert(el->fbox != NULL);

688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
    if (el->usage)
    {
	++el->usage;
    }
    else
    {
	el->usage = 1;
	if (MPID_nem_fboxq_tail == NULL)
	{
	    el->prev = NULL;
	    MPID_nem_curr_fboxq_elem = MPID_nem_fboxq_head = el;
	}
	else
	{
	    el->prev = MPID_nem_fboxq_tail;
	    MPID_nem_fboxq_tail->next = el;
	}
	    
	el->next = NULL;
	MPID_nem_fboxq_tail = el;
708
    }
709
710
711
712
713
714
715
716
717
718
719
}
/*
  MPID_nem_recv_seqno_matches (MPID_nem_queue_ptr_t qhead)
  check whether the sequence number for the cell at the head of qhead is the one
  expected from the sender of that cell
  We only check these for processes in COMM_WORLD (i.e. the ones initially allocated)
*/
#undef FUNCNAME
#define FUNCNAME MPID_nem_recv_seqno_matches
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
720
static inline int
721
722
MPID_nem_recv_seqno_matches (MPID_nem_queue_ptr_t qhead)
{
723
724
    int source;
    MPID_nem_cell_ptr_t cell = MPID_nem_queue_head(qhead);
725
    source = cell->pkt.mpich.source;
726
    
727
    return (cell->pkt.mpich.seqno == MPID_nem_recv_seqno[source]);
728
729
730
}

/*
731
  int MPID_nem_mpich_test_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox);
732
733
734

  non-blocking receive
  sets cell to the received cell, or NULL if there is nothing to receive. in_fbox is true iff the cell was found in a fbox
735
  the cell must be released back to the subsystem with MPID_nem_mpich_release_cell() once the packet has been copied out
736
737
*/
#undef FUNCNAME
738
#define FUNCNAME MPID_nem_mpich_test_recv
739
740
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
741
static inline int
742
MPID_nem_mpich_test_recv(MPID_nem_cell_ptr_t *cell, int *in_fbox, int in_blocking_progress)
743
744
745
746
747
748
{
    int mpi_errno = MPI_SUCCESS;
    
    DO_PAPI (PAPI_reset (PAPI_EventSet));

#ifdef USE_FASTBOX
749
    if (poll_active_fboxes(cell)) goto fbox_l;
750
751
#endif/* USE_FASTBOX     */

752
    if (MPID_nem_num_netmods)
753
    {
754
	mpi_errno = MPID_nem_network_poll(in_blocking_progress);
755
756
757
758
759
760
        if (mpi_errno) MPIU_ERR_POP (mpi_errno);
    }

    if (MPID_nem_queue_empty (MPID_nem_mem_region.my_recvQ) || !MPID_nem_recv_seqno_matches (MPID_nem_mem_region.my_recvQ))
    {
#ifdef USE_FASTBOX
761
762
763
764
765
766
767
768
769
770
771
772
        /* check for messages from any process (even those from which
           we don't expect messages).  If we're nonblocking, check all
           fboxes at once, if we're in a blocking loop we'll keep
           iterating, so just check them one at a time. */
        if (!in_blocking_progress) {
            int found;
            found = poll_every_fbox(cell);
            if (found)
                goto fbox_l;
        } else {
            poll_next_fbox (cell, goto fbox_l);
        }
773
774
775
776
777
778
779
#endif/* USE_FASTBOX     */
	*cell = NULL;
	goto fn_exit;
    }
    
    MPID_nem_queue_dequeue (MPID_nem_mem_region.my_recvQ, cell);

780
    ++MPID_nem_recv_seqno[(*cell)->pkt.mpich.source];
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
    *in_fbox = 0;

 fn_exit:
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues6));
    
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, {
	if (*cell)
	{
	    MPIU_DBG_MSG_S (CH3_CHANNEL, VERBOSE, "<-- Recv %s", (*in_fbox) ? "fbox " : "queue");
	    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (*cell));
	}
    });

 fn_fail:
    return mpi_errno;

 fbox_l:
798
   *in_fbox = 1;
799
800
801
802
803
    goto fn_exit;

}

/*
804
  int MPID_nem_mpich_test_recv_wait (MPID_nem_cell_ptr_t *cell, int *in_fbox, int timeout);
805
806
807
808

  blocking receive with timeout
  waits up to timeout iterations to receive a cell
  sets cell to the received cell, or NULL if there is nothing to receive. in_fbox is true iff the cell was found in a fbox
809
  the cell must be released back to the subsystem with MPID_nem_mpich_release_cell() once the packet has been copied out
810
811
*/
#undef FUNCNAME
812
#define FUNCNAME MPID_nem_mpich_test_recv_wait
813
814
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
815
static inline int
816
MPID_nem_mpich_test_recv_wait (MPID_nem_cell_ptr_t *cell, int *in_fbox, int timeout)
817
818
819
820
{
    int mpi_errno = MPI_SUCCESS;
    
#ifdef USE_FASTBOX
821
    if (poll_active_fboxes(cell)) goto fbox_l;
822
823
#endif/* USE_FASTBOX     */

824
    if (MPID_nem_num_netmods)
825
    {
826
	mpi_errno = MPID_nem_network_poll(TRUE /* blocking */);
827
828
829
830
831
832
        if (mpi_errno) MPIU_ERR_POP (mpi_errno);
    }

    while ((--timeout > 0) && (MPID_nem_queue_empty (MPID_nem_mem_region.my_recvQ) || !MPID_nem_recv_seqno_matches (MPID_nem_mem_region.my_recvQ)))
    {
#ifdef USE_FASTBOX
833
	poll_next_fbox (cell, goto fbox_l);
834
835
836
837
838
839
840
#endif/* USE_FASTBOX     */
	*cell = NULL;
	goto exit_l;
    }
    
    MPID_nem_queue_dequeue (MPID_nem_mem_region.my_recvQ, cell);

841
    ++MPID_nem_recv_seqno[(*cell)->pkt.mpich.source];
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
    *in_fbox = 0;
 exit_l:
    
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, {
            if (*cell)
            {
                MPIU_DBG_MSG_S (CH3_CHANNEL, VERBOSE, "<-- Recv %s", (*in_fbox) ? "fbox " : "queue");
                MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (*cell));
            }
        });

 fn_fail:
    return mpi_errno;

 fbox_l:
    *in_fbox = 1;
    goto exit_l;
}

/*
862
  int MPID_nem_mpich_blocking_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox);
863
864
865
866

  blocking receive waits until there is something to receive, or then
  sets cell to the received cell. in_fbox is true iff the cell was
  found in a fbox the cell must be released back to the subsystem with
867
  MPID_nem_mpich_release_cell() once the packet has been copied out
868
869
*/
#undef FUNCNAME
870
#define FUNCNAME MPID_nem_mpich_blocking_recv
871
872
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
873
static inline int
874
MPID_nem_mpich_blocking_recv(MPID_nem_cell_ptr_t *cell, int *in_fbox, int completions)
875
876
877
878
879
880
881
882
883
884
885
886
887
888
{
    int mpi_errno = MPI_SUCCESS;
    DO_PAPI (PAPI_reset (PAPI_EventSet));

#ifdef MPICH_IS_THREADED
    /* We should never enter this function in a multithreaded app */
#ifdef HAVE_RUNTIME_THREADCHECK
    MPIU_Assert(!MPIR_ThreadInfo.isThreaded);
#else
    MPIU_Assert(0);
#endif
#endif

#ifdef USE_FASTBOX
889
    if (poll_active_fboxes(cell)) goto fbox_l;
890
891
#endif /*USE_FASTBOX */

892
    if (MPID_nem_num_netmods)
893
    {
894
	mpi_errno = MPID_nem_network_poll(TRUE /* blocking */);
895
896
897
898
899
900
901
902
        if (mpi_errno) MPIU_ERR_POP (mpi_errno);
    }

    while (MPID_nem_queue_empty (MPID_nem_mem_region.my_recvQ) || !MPID_nem_recv_seqno_matches (MPID_nem_mem_region.my_recvQ))
    {
	DO_PAPI (PAPI_reset (PAPI_EventSet));

#ifdef USE_FASTBOX	
903
904
	poll_next_fbox (cell, goto fbox_l);
        if (poll_active_fboxes(cell)) goto fbox_l;
905
906
#endif /*USE_FASTBOX */

907
	if (MPID_nem_num_netmods)
908
	{            
909
	    mpi_errno = MPID_nem_network_poll(TRUE /* blocking */);
910
911
            if (mpi_errno) MPIU_ERR_POP (mpi_errno);

912
            if (!MPID_nem_safe_to_block_recv())
913
            {
914
                *cell = NULL;
915
916
917
918
                *in_fbox = 0;
                goto exit_l;
            }
	}
919
920
921
922
923
924

        if (completions != OPA_load_int(&MPIDI_CH3I_progress_completion_count)) {
            *cell = NULL;
            *in_fbox = 0;
            goto exit_l;
        }
925
        MPIU_Busy_wait();
926
927
928
929
    }

    MPID_nem_queue_dequeue (MPID_nem_mem_region.my_recvQ, cell);

930
    ++MPID_nem_recv_seqno[(*cell)->pkt.mpich.source];
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
    *in_fbox = 0;

 exit_l:    

    DO_PAPI (PAPI_accum_var (PAPI_EventSet,PAPI_vvalues8));
    
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, {
            if (*cell)
            {
                MPIU_DBG_MSG_S (CH3_CHANNEL, VERBOSE, "<-- Recv %s", (*in_fbox) ? "fbox " : "queue");
                MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell(*cell));
            }
        });

 fn_fail:
    return mpi_errno;

 fbox_l:
    *in_fbox = 1;
    goto exit_l;
}

/*
954
  int MPID_nem_mpich_release_cell (MPID_nem_cell_ptr_t cell, MPIDI_VC_t *vc);
955
956
957
958

  releases the cell back to the subsystem to be used for subsequent receives
*/
#undef FUNCNAME
959
#define FUNCNAME MPID_nem_mpich_release_cell
960
961
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
962
static inline int
963
MPID_nem_mpich_release_cell (MPID_nem_cell_ptr_t cell, MPIDI_VC_t *vc)
964
965
{
    int mpi_errno = MPI_SUCCESS;
966
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
967
968
969
970
971
972
973
974
    DO_PAPI (PAPI_reset (PAPI_EventSet));
    MPID_nem_queue_enqueue (vc_ch->free_queue, cell);
    DO_PAPI (PAPI_accum_var (PAPI_EventSet,PAPI_vvalues9));
    return mpi_errno;
}

#endif /*_MPID_NEM_INLINE_H*/