/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2006 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#ifndef _MPID_NEM_INLINE_H
#define _MPID_NEM_INLINE_H

#define MPID_NEM_THREAD_POLLS_BEFORE_YIELD 10

#include "my_papi_defs.h"
#include "mpiiov.h"
#include "mpidi_nem_statistics.h"
#include "mpit.h"

extern int MPID_nem_lmt_shm_pending;
/* free cell cached by the send routines below when PREFETCH_CELL is defined */
extern MPID_nem_cell_ptr_t MPID_nem_prefetched_cell;

20
21
22
23
24
static inline int MPID_nem_mpich_send_header (void* buf, int size, MPIDI_VC_t *vc, int *again);
static inline int MPID_nem_mpich_sendv (MPID_IOV **iov, int *n_iov, MPIDI_VC_t *vc, int *again);
static inline void MPID_nem_mpich_dequeue_fastbox (int local_rank);
static inline void MPID_nem_mpich_enqueue_fastbox (int local_rank);
static inline int MPID_nem_mpich_sendv_header (MPID_IOV **iov, int *n_iov, MPIDI_VC_t *vc, int *again);
25
static inline int MPID_nem_recv_seqno_matches (MPID_nem_queue_ptr_t qhead);
26
27
28
29
30
static inline int MPID_nem_mpich_test_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox, int in_blocking_progress);
static inline int MPID_nem_mpich_blocking_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox, int completions);
static inline int MPID_nem_mpich_test_recv_wait (MPID_nem_cell_ptr_t *cell, int *in_fbox, int timeout);
static inline int MPID_nem_mpich_release_cell (MPID_nem_cell_ptr_t cell, MPIDI_VC_t *vc);
static inline void MPID_nem_mpich_send_seg_header (MPID_Segment *segment, MPIDI_msg_sz_t *segment_first,
31
32
                                                           MPIDI_msg_sz_t segment_size, void *header, MPIDI_msg_sz_t header_sz,
                                                           MPIDI_VC_t *vc, int *again);
33
static inline void MPID_nem_mpich_send_seg (MPID_Segment *segment, MPIDI_msg_sz_t *segment_first, MPIDI_msg_sz_t segment_size,
34
35
                                                    MPIDI_VC_t *vc, int *again);

/* evaluates to TRUE if it is safe to block on recv operations in the progress
 * loop, FALSE otherwise.  Every term below must be zero/NULL for the macro to
 * evaluate to TRUE. */
#define MPID_nem_safe_to_block_recv()           \
    (!MPID_nem_local_lmt_pending &&             \
     !MPIDI_CH3I_shm_active_send &&             \
     !MPIDI_CH3I_Sendq_head(MPIDI_CH3I_shm_sendq) &&       \
     !MPIDU_Sched_are_pending() &&              \
     !num_active_issued_win && !num_passive_win)

#undef FUNCNAME
46
#define FUNCNAME MPID_nem_mpich_send_header
47
48
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
49
static inline int
50
MPID_nem_mpich_send_header (void* buf, int size, MPIDI_VC_t *vc, int *again)
51
52
53
54
{
    int mpi_errno = MPI_SUCCESS;
    MPID_nem_cell_ptr_t el;
    int my_rank;
55
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
56
57
58
59

    /*DO_PAPI (PAPI_reset (PAPI_EventSet)); */

    MPIU_Assert (size == sizeof(MPIDI_CH3_Pkt_t));
60
    MPIU_Assert (vc_ch->is_local);
61
62
63
64
65

    my_rank = MPID_nem_mem_region.rank;

#ifdef USE_FASTBOX
    {
66
	MPID_nem_fbox_mpich_t *pbox = vc_ch->fbox_out;
67

68
69
70
71
        /* _is_full contains acquire barrier */
        if (MPID_nem_fbox_is_full((MPID_nem_fbox_common_ptr_t)pbox))
            goto usequeue_l;

72
73
74
        pbox->cell.pkt.mpich.source  = MPID_nem_mem_region.local_rank;
        pbox->cell.pkt.mpich.datalen = size;
        pbox->cell.pkt.mpich.seqno   = vc_ch->send_seqno++;
75
        
76
        MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, pbox->cell.pkt.mpich.type = MPID_NEM_PKT_MPICH_HEAD);
77
        
78
        MPIU_Memcpy((void *)pbox->cell.pkt.mpich.p.payload, buf, size);
79
80
81

        OPA_store_release_int(&pbox->flag.value, 1);

82
83
84
85
        MPIU_DBG_MSG (CH3_CHANNEL, VERBOSE, "--> Sent fbox ");
        MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (&pbox->cell));
        
        goto return_success;
86
87
    }
 usequeue_l:
88
    MPIR_T_PVAR_COUNTER_INC_VAR(NEM, &MPID_nem_fbox_fall_back_to_queue_count[MPID_nem_mem_region.local_ranks[vc->lpid]], 1);
89

90
#endif /*USE_FASTBOX */
91

92
93
94
#ifdef PREFETCH_CELL
    DO_PAPI (PAPI_reset (PAPI_EventSet));
    el = MPID_nem_prefetched_cell;
95

96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
    if (!el)
    {
	if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	    goto return_again;
	
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &el);
    }
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues14));
#else /* PREFETCH_CELL */
    DO_PAPI (PAPI_reset (PAPI_EventSet));
    if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
    {
	goto return_again;
    }
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues14));

    DO_PAPI (PAPI_reset (PAPI_EventSet));
    MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ , &el);
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues10));
#endif /* PREFETCH_CELL */

    DO_PAPI (PAPI_reset (PAPI_EventSet));
118
119
120
121
122
    el->pkt.mpich.source  = my_rank;
    el->pkt.mpich.dest    = vc->lpid;
    el->pkt.mpich.datalen = size;
    el->pkt.mpich.seqno   = vc_ch->send_seqno++;
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, el->pkt.mpich.type = MPID_NEM_PKT_MPICH_HEAD);
123
    
124
    MPIU_Memcpy((void *)el->pkt.mpich.p.payload, buf, size);
125
126
127
128
129
130
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues11));

    MPIU_DBG_MSG (CH3_CHANNEL, VERBOSE, "--> Sent queue");
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (el));

    DO_PAPI (PAPI_reset (PAPI_EventSet));
131
132
    MPID_nem_queue_enqueue (vc_ch->recv_queue, el);
    /*MPID_nem_rel_dump_queue( vc_ch->recv_queue ); */
133
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues12));
134
    DO_PAPI (PAPI_reset (PAPI_EventSet));
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154

#ifdef PREFETCH_CELL
    DO_PAPI (PAPI_reset (PAPI_EventSet));
    if (!MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &MPID_nem_prefetched_cell);
    else
	MPID_nem_prefetched_cell = 0;
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues10));
#endif /*PREFETCH_CELL */

    /*DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues14)); */

 return_success:
    *again = 0;
    goto fn_exit;
 return_again:
    *again = 1;
    goto fn_exit;
 fn_exit:
    return mpi_errno;
155
156
 fn_fail:
    goto fn_exit;
157
158
159
160
}


/*
161
  int MPID_nem_mpich_sendv (struct iovec **iov, int *n_iov, MPIDI_VC_t *vc);
162
163
164

  sends iov to vc
  Non-blocking
165
  if iov specifies more than MPID_NEM_MPICH_DATA_LEN of data, the iov will be truncated, so that after MPID_nem_mpich_sendv returns,
166
167
168
169
  iov will describe unsent data
  sets again to 1 if it can't get a free cell, 0 otherwise
*/
#undef FUNCNAME
170
#define FUNCNAME MPID_nem_mpich_sendv
171
172
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
173
static inline int
174
MPID_nem_mpich_sendv (MPID_IOV **iov, int *n_iov, MPIDI_VC_t *vc, int *again)
175
176
177
178
179
180
{
    int mpi_errno = MPI_SUCCESS;
    MPID_nem_cell_ptr_t el;
    char *cell_buf;
    MPIDI_msg_sz_t payload_len;    
    int my_rank;
181
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
182
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_MPICH_SENDV);
183

184
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_MPICH_SENDV);
185

186
    MPIU_Assert (*n_iov > 0 && (*iov)->MPID_IOV_LEN > 0);
187
188
    MPIU_Assert(vc_ch->is_local);

189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
    DO_PAPI (PAPI_reset (PAPI_EventSet));

    my_rank = MPID_nem_mem_region.rank;
	
#ifdef PREFETCH_CELL
    el = MPID_nem_prefetched_cell;
    
    if (!el)
    {
	if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	{
	    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));
            goto return_again;
	}
	
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &el);
    }
#else /*PREFETCH_CELL     */
    if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
    {
	DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));
        goto return_again;
    }

    MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ , &el);
#endif /*PREFETCH_CELL     */

216
    payload_len = MPID_NEM_MPICH_DATA_LEN;
217
    cell_buf    = (char *) el->pkt.mpich.p.payload; /* cast away volatile */
218
    
219
    while (*n_iov && payload_len >= (*iov)->MPID_IOV_LEN)
220
    {
221
	size_t _iov_len = (*iov)->MPID_IOV_LEN;
222
	MPIU_Memcpy (cell_buf, (*iov)->MPID_IOV_BUF, _iov_len);
223
224
225
226
227
228
229
230
	payload_len -= _iov_len;
	cell_buf += _iov_len;
	--(*n_iov);
	++(*iov);
    }
    
    if (*n_iov && payload_len > 0)
    {
231
	MPIU_Memcpy (cell_buf, (*iov)->MPID_IOV_BUF, payload_len);
232
233
	(*iov)->MPID_IOV_BUF = (char *)(*iov)->MPID_IOV_BUF + payload_len;
	(*iov)->MPID_IOV_LEN -= payload_len;
234
235
236
 	payload_len = 0;
    }

237
238
239
240
241
    el->pkt.mpich.source  = my_rank;
    el->pkt.mpich.dest    = vc->lpid;
    el->pkt.mpich.datalen = MPID_NEM_MPICH_DATA_LEN - payload_len;
    el->pkt.mpich.seqno   = vc_ch->send_seqno++;
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, el->pkt.mpich.type = MPID_NEM_PKT_MPICH);
242
243
244
245

    MPIU_DBG_MSG (CH3_CHANNEL, VERBOSE, "--> Sent queue");
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (el));

246
247
    MPID_nem_queue_enqueue (vc_ch->recv_queue, el);
    /*MPID_nem_rel_dump_queue( vc_ch->recv_queue ); */
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262

#ifdef PREFETCH_CELL
    if (!MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &MPID_nem_prefetched_cell);
    else
	MPID_nem_prefetched_cell = 0;
#endif /*PREFETCH_CELL */
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));

    *again = 0;
    goto fn_exit;
 return_again:
    *again = 1;
    goto fn_exit;
 fn_exit:
263
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_MPICH_SENDV);
264
    return mpi_errno;
265
266
 fn_fail:
    goto fn_exit;
267
268
}

269
270
/* MPID_nem_mpich_sendv_header (struct iovec **iov, int *n_iov, int dest)
   same as above but first iov element is an MPICH header */
271
#undef FUNCNAME
272
#define FUNCNAME MPID_nem_mpich_sendv_header
273
274
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
275
static inline int
276
MPID_nem_mpich_sendv_header (MPID_IOV **iov, int *n_iov, MPIDI_VC_t *vc, int *again)
277
278
279
280
281
282
{
    int mpi_errno = MPI_SUCCESS;
    MPID_nem_cell_ptr_t el;
    char *cell_buf;
    MPIDI_msg_sz_t payload_len;    
    int my_rank;
283
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
284
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_MPICH_SENDV_HEADER);
285
    
286
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_MPICH_SENDV_HEADER);
287

288
289
    MPIU_Assert(vc_ch->is_local);

290
    DO_PAPI (PAPI_reset (PAPI_EventSet));
291
    MPIU_Assert (*n_iov > 0 && (*iov)->MPID_IOV_LEN == sizeof(MPIDI_CH3_Pkt_t));
292
293
294
295

    my_rank = MPID_nem_mem_region.rank;

#ifdef USE_FASTBOX
296
    if (*n_iov == 2 && (*iov)[1].MPID_IOV_LEN + sizeof(MPIDI_CH3_Pkt_t) <= MPID_NEM_FBOX_DATALEN)
297
    {
298
	MPID_nem_fbox_mpich_t *pbox = vc_ch->fbox_out;
299

300
301
302
        if (MPID_nem_fbox_is_full((MPID_nem_fbox_common_ptr_t)pbox))
            goto usequeue_l;

303
304
305
306
        pbox->cell.pkt.mpich.source  = MPID_nem_mem_region.local_rank;
        pbox->cell.pkt.mpich.datalen = (*iov)[1].MPID_IOV_LEN + sizeof(MPIDI_CH3_Pkt_t);
        pbox->cell.pkt.mpich.seqno   = vc_ch->send_seqno++;
        MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, pbox->cell.pkt.mpich.type = MPID_NEM_PKT_MPICH_HEAD);
307
        
308
309
        MPIU_Memcpy((void *)pbox->cell.pkt.mpich.p.payload, (*iov)[0].MPID_IOV_BUF, (*iov)[0].MPID_IOV_LEN);
        MPIU_Memcpy ((char *)pbox->cell.pkt.mpich.p.payload + (*iov)[0].MPID_IOV_LEN, (*iov)[1].MPID_IOV_BUF, (*iov)[1].MPID_IOV_LEN);
310
        
311
        OPA_store_release_int(&pbox->flag.value, 1);
312
313
314
315
316
317
        *n_iov = 0;

        MPIU_DBG_MSG (CH3_CHANNEL, VERBOSE, "--> Sent fbox ");
        MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (&pbox->cell));
        
        goto return_success;
318
319
    }
 usequeue_l:
320
    MPIR_T_PVAR_COUNTER_INC_VAR(NEM, &MPID_nem_fbox_fall_back_to_queue_count[MPID_nem_mem_region.local_ranks[vc->lpid]], 1);
321

322
323
324
325
#endif /*USE_FASTBOX */
	
#ifdef PREFETCH_CELL
    el = MPID_nem_prefetched_cell;
326

327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
    if (!el)
    {
	if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	{
	    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));
            goto return_again;
	}
	
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &el);
    }
#else /*PREFETCH_CELL    */
    if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
    {
	DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));
        goto return_again;
    }

    MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &el);
#endif /*PREFETCH_CELL */

347
    MPIU_Memcpy((void *)el->pkt.mpich.p.payload, (*iov)->MPID_IOV_BUF, sizeof(MPIDI_CH3_Pkt_t));
348

349
    cell_buf = (char *)(el->pkt.mpich.p.payload) + sizeof(MPIDI_CH3_Pkt_t);
350
351
352
    ++(*iov);
    --(*n_iov);

353
    payload_len = MPID_NEM_MPICH_DATA_LEN - sizeof(MPIDI_CH3_Pkt_t);
354
    while (*n_iov && payload_len >= (*iov)->MPID_IOV_LEN)
355
    {
356
	size_t _iov_len = (*iov)->MPID_IOV_LEN;
357
	MPIU_Memcpy (cell_buf, (*iov)->MPID_IOV_BUF, _iov_len);
358
359
360
361
362
363
364
365
	payload_len -= _iov_len;
	cell_buf += _iov_len;
	--(*n_iov);
	++(*iov);
    }
    
    if (*n_iov && payload_len > 0)
    {
366
	MPIU_Memcpy (cell_buf, (*iov)->MPID_IOV_BUF, payload_len);
367
368
	(*iov)->MPID_IOV_BUF = (char *)(*iov)->MPID_IOV_BUF + payload_len;
	(*iov)->MPID_IOV_LEN -= payload_len;
369
370
371
	payload_len = 0;
    }

372
373
374
375
376
    el->pkt.mpich.source  = my_rank;
    el->pkt.mpich.dest    = vc->lpid;
    el->pkt.mpich.datalen = MPID_NEM_MPICH_DATA_LEN - payload_len;
    el->pkt.mpich.seqno   = vc_ch->send_seqno++;
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, el->pkt.mpich.type = MPID_NEM_PKT_MPICH_HEAD);
377
378
379
380

    MPIU_DBG_MSG (CH3_CHANNEL, VERBOSE, "--> Sent queue");
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (el));

381
382
    MPID_nem_queue_enqueue (vc_ch->recv_queue, el);	
    /*MPID_nem_rel_dump_queue( vc_ch->recv_queue ); */
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398

#ifdef PREFETCH_CELL
    if (!MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &MPID_nem_prefetched_cell);
    else
	MPID_nem_prefetched_cell = 0;
#endif /*PREFETCH_CELL */
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));

 return_success:
    *again = 0;
    goto fn_exit;
 return_again:
    *again = 1;
    goto fn_exit;
 fn_exit:
399
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_MPICH_SENDV_HEADER);
400
    return mpi_errno;
401
402
 fn_fail:
    goto fn_exit;
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
}

/* send the header and data described by the segment in one cell.  If
   there is no cell available, *again is set to 1.  If all of the data
   cannot be sent, *segment_first is set to the index of the first
   unsent byte.
   Pre condition:  This must be the first packet of a message (i.e.,
                       *segment first == 0)
                   The destination process is local
   Post conditions:  the header has been sent iff *again == 0
                     if there is data to send (segment_size > 0) then
                         (the header has been sent iff any data has
                         been sent (i.e., *segment_first > 0) )
                     i.e.: we will never send only the header
*/
#undef FUNCNAME
419
#define FUNCNAME MPID_nem_mpich_send_seg_header
420
421
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
422
static inline void
423
MPID_nem_mpich_send_seg_header (MPID_Segment *segment, MPIDI_msg_sz_t *segment_first, MPIDI_msg_sz_t segment_size,
424
425
426
                                 void *header, MPIDI_msg_sz_t header_sz, MPIDI_VC_t *vc, int *again)
{
    MPID_nem_cell_ptr_t el;
427
    MPIDI_msg_sz_t datalen;
428
429
    int my_rank;
    MPIDI_msg_sz_t last;
430
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
431
432
433
434
435
436
437
438
439
440
441
442

    MPIU_Assert(vc_ch->is_local); /* netmods will have their own implementation */
    MPIU_Assert(header_sz <= sizeof(MPIDI_CH3_Pkt_t));
    
    
    DO_PAPI (PAPI_reset (PAPI_EventSet));

    my_rank = MPID_nem_mem_region.rank;

#ifdef USE_FASTBOX
    if (sizeof(MPIDI_CH3_Pkt_t) + segment_size <= MPID_NEM_FBOX_DATALEN)
    {
443
	MPID_nem_fbox_mpich_t *pbox = vc_ch->fbox_out;
444

445
446
447
        if (MPID_nem_fbox_is_full((MPID_nem_fbox_common_ptr_t)pbox))
            goto usequeue_l;

448
	{
449
450
451
452
	    pbox->cell.pkt.mpich.source  = MPID_nem_mem_region.local_rank;
	    pbox->cell.pkt.mpich.datalen = sizeof(MPIDI_CH3_Pkt_t) + segment_size;
	    pbox->cell.pkt.mpich.seqno   = vc_ch->send_seqno++;
            MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, pbox->cell.pkt.mpich.type = MPID_NEM_PKT_MPICH_HEAD);
453
454

            /* copy header */
455
            MPIU_Memcpy((void *)pbox->cell.pkt.mpich.p.payload, header, header_sz);
456
            
457
458
            /* copy data */
            last = segment_size;
459
            MPID_Segment_pack(segment, *segment_first, &last, (char *)pbox->cell.pkt.mpich.p.payload + sizeof(MPIDI_CH3_Pkt_t));
460
            MPIU_Assert(last == segment_size);
461

462
            OPA_store_release_int(&pbox->flag.value, 1);
463
464
465
466
467
468
469
470
471
472

            *segment_first = last;

	    MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "--> Sent fbox ");
	    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (&pbox->cell));

            goto return_success;
	}
    }
 usequeue_l:
473
    MPIR_T_PVAR_COUNTER_INC_VAR(NEM, &MPID_nem_fbox_fall_back_to_queue_count[MPID_nem_mem_region.local_ranks[vc->lpid]], 1);
474

475
476
477
478
#endif /*USE_FASTBOX */
	
#ifdef PREFETCH_CELL
    el = MPID_nem_prefetched_cell;
479

480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
    if (!el)
    {
	if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	{
	    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));
            goto return_again;
	}
	
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &el);
    }
#else /*PREFETCH_CELL    */
    if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
    {
	DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));
        goto return_again;
    }

    MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &el);
#endif /*PREFETCH_CELL */

    /* copy header */
501
    MPIU_Memcpy((void *)el->pkt.mpich.p.payload, header, header_sz);
502
503
    
    /* copy data */
504
    if (segment_size - *segment_first <= MPID_NEM_MPICH_DATA_LEN - sizeof(MPIDI_CH3_Pkt_t))
505
506
        last = segment_size;
    else
507
        last = *segment_first + MPID_NEM_MPICH_DATA_LEN - sizeof(MPIDI_CH3_Pkt_t);
508
    
509
    MPID_Segment_pack(segment, *segment_first, &last, (char *)el->pkt.mpich.p.payload + sizeof(MPIDI_CH3_Pkt_t));
510
511
512
    datalen = sizeof(MPIDI_CH3_Pkt_t) + last - *segment_first;
    *segment_first = last;
    
513
514
515
516
517
    el->pkt.mpich.source  = my_rank;
    el->pkt.mpich.dest    = vc->lpid;
    el->pkt.mpich.datalen = datalen;
    el->pkt.mpich.seqno   = vc_ch->send_seqno++;
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, el->pkt.mpich.type = MPID_NEM_PKT_MPICH_HEAD);
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541

    MPIU_DBG_MSG (CH3_CHANNEL, VERBOSE, "--> Sent queue");
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (el));

    MPID_nem_queue_enqueue (vc_ch->recv_queue, el);	

#ifdef PREFETCH_CELL
    if (!MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &MPID_nem_prefetched_cell);
    else
	MPID_nem_prefetched_cell = 0;
#endif /*PREFETCH_CELL */
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));

 return_success:
    *again = 0;
    goto fn_exit;
 return_again:
    *again = 1;
    goto fn_exit;
 fn_exit:
    return;
}

542
/* similar to MPID_nem_mpich_send_seg_header, except there is no
543
544
   header to send.  This need not be the first packet of a message. */
#undef FUNCNAME
545
#define FUNCNAME MPID_nem_mpich_send_seg
546
547
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
548
static inline void
549
MPID_nem_mpich_send_seg (MPID_Segment *segment, MPIDI_msg_sz_t *segment_first, MPIDI_msg_sz_t segment_size, MPIDI_VC_t *vc, int *again)
550
551
{
    MPID_nem_cell_ptr_t el;
552
    MPIDI_msg_sz_t datalen;
553
554
    int my_rank;
    MPIDI_msg_sz_t last;
555
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
556

557
    MPIU_Assert(vc_ch->is_local); /* netmods will have their own implementation */    
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
    
    DO_PAPI (PAPI_reset (PAPI_EventSet));

    my_rank = MPID_nem_mem_region.rank;
	
#ifdef PREFETCH_CELL
    el = MPID_nem_prefetched_cell;
    
    if (!el)
    {
	if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	{
	    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));
            goto return_again;
	}
	
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &el);
    }
#else /*PREFETCH_CELL    */
    if (MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
    {
	DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));
        goto return_again;
    }

    MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &el);
#endif /*PREFETCH_CELL */

    /* copy data */
587
    if (segment_size - *segment_first <= MPID_NEM_MPICH_DATA_LEN)
588
589
        last = segment_size;
    else
590
        last = *segment_first + MPID_NEM_MPICH_DATA_LEN;
591
    
592
    MPID_Segment_pack(segment, *segment_first, &last, (char *)el->pkt.mpich.p.payload);
593
594
595
    datalen = last - *segment_first;
    *segment_first = last;
    
596
597
598
599
600
    el->pkt.mpich.source  = my_rank;
    el->pkt.mpich.dest    = vc->lpid;
    el->pkt.mpich.datalen = datalen;
    el->pkt.mpich.seqno   = vc_ch->send_seqno++;
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, el->pkt.mpich.type = MPID_NEM_PKT_MPICH_HEAD);
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624

    MPIU_DBG_MSG (CH3_CHANNEL, VERBOSE, "--> Sent queue");
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (el));

    MPID_nem_queue_enqueue (vc_ch->recv_queue, el);	

#ifdef PREFETCH_CELL
    if (!MPID_nem_queue_empty (MPID_nem_mem_region.my_freeQ))
	MPID_nem_queue_dequeue (MPID_nem_mem_region.my_freeQ, &MPID_nem_prefetched_cell);
    else
	MPID_nem_prefetched_cell = 0;
#endif /*PREFETCH_CELL */
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues5));

    *again = 0;
    goto fn_exit;
 return_again:
    *again = 1;
    goto fn_exit;
 fn_exit:
    return;
}

/*
625
  MPID_nem_mpich_dequeue_fastbox (int local_rank)
626
627
628
629
630
631
  decrements usage count on fastbox for process with local rank local_rank and
  dequeues it from fbox queue if usage is 0.
  This function is called whenever a receive for a process on this node is matched.
  Fastboxes on fbox queue are polled regularly for incoming messages.
*/
#undef FUNCNAME
632
#define FUNCNAME MPID_nem_mpich_dequeue_fastbox
633
634
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
635
static inline void MPID_nem_mpich_dequeue_fastbox(int local_rank)
636
637
638
{
    MPID_nem_fboxq_elem_t *el;

639
640
    MPIU_Assert(local_rank < MPID_nem_mem_region.num_local);

641
    el = &MPID_nem_fboxq_elem_list[local_rank];    
642
    MPIU_Assert(el->fbox != NULL);
643

644
    MPIU_Assert(el->usage);
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665

    --el->usage;
    if (el->usage == 0)
    {
	if (el->prev == NULL)
	    MPID_nem_fboxq_head = el->next;
	else
	    el->prev->next = el->next;

	if (el->next == NULL)
	    MPID_nem_fboxq_tail = el->prev;
	else
	    el->next->prev = el->prev;

	if (el == MPID_nem_curr_fboxq_elem)
	{
	    if (el->next == NULL)
		MPID_nem_curr_fboxq_elem = MPID_nem_fboxq_head;
	    else
		MPID_nem_curr_fboxq_elem = el->next;
	}
666
    }    
667
668
669
}

/*
670
  MPID_nem_mpich_enqueue_fastbox (int local_rank)
671
672
673
674
675
  enqueues fastbox for process with local rank local_rank on fbox queue
  This function is called whenever a receive is posted for a process on this node.
  Fastboxes on fbox queue are polled regularly for incoming messages.
*/
#undef FUNCNAME
676
#define FUNCNAME MPID_nem_mpich_dequeue_fastbox
677
678
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
679
static inline void MPID_nem_mpich_enqueue_fastbox(int local_rank)
680
681
682
{
    MPID_nem_fboxq_elem_t *el;

683
684
    MPIU_Assert(local_rank < MPID_nem_mem_region.num_local);

685
    el = &MPID_nem_fboxq_elem_list[local_rank];
686
687
    MPIU_Assert(el->fbox != NULL);

688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
    if (el->usage)
    {
	++el->usage;
    }
    else
    {
	el->usage = 1;
	if (MPID_nem_fboxq_tail == NULL)
	{
	    el->prev = NULL;
	    MPID_nem_curr_fboxq_elem = MPID_nem_fboxq_head = el;
	}
	else
	{
	    el->prev = MPID_nem_fboxq_tail;
	    MPID_nem_fboxq_tail->next = el;
	}
	    
	el->next = NULL;
	MPID_nem_fboxq_tail = el;
708
    }
709
710
711
712
713
714
715
716
717
718
719
}
/*
  MPID_nem_recv_seqno_matches (MPID_nem_queue_ptr_t qhead)
  check whether the sequence number for the cell at the head of qhead is the one
  expected from the sender of that cell
  We only check these for processes in COMM_WORLD (i.e. the ones initially allocated)
*/
#undef FUNCNAME
#define FUNCNAME MPID_nem_recv_seqno_matches
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
720
static inline int
721
722
MPID_nem_recv_seqno_matches (MPID_nem_queue_ptr_t qhead)
{
723
724
    int source;
    MPID_nem_cell_ptr_t cell = MPID_nem_queue_head(qhead);
725
    source = cell->pkt.mpich.source;
726
    
727
    return (cell->pkt.mpich.seqno == MPID_nem_recv_seqno[source]);
728
729
730
}

/*
731
  int MPID_nem_mpich_test_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox);
732
733
734

  non-blocking receive
  sets cell to the received cell, or NULL if there is nothing to receive. in_fbox is true iff the cell was found in a fbox
735
  the cell must be released back to the subsystem with MPID_nem_mpich_release_cell() once the packet has been copied out
736
737
*/
#undef FUNCNAME
738
#define FUNCNAME MPID_nem_mpich_test_recv
739
740
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
741
static inline int
742
MPID_nem_mpich_test_recv(MPID_nem_cell_ptr_t *cell, int *in_fbox, int in_blocking_progress)
743
744
745
746
747
748
{
    int mpi_errno = MPI_SUCCESS;
    
    DO_PAPI (PAPI_reset (PAPI_EventSet));

#ifdef USE_FASTBOX
749
    if (poll_active_fboxes(cell)) goto fbox_l;
750
751
#endif/* USE_FASTBOX     */

752
    if (MPID_nem_num_netmods)
753
    {
754
	mpi_errno = MPID_nem_network_poll(in_blocking_progress);
755
756
757
758
759
760
        if (mpi_errno) MPIU_ERR_POP (mpi_errno);
    }

    if (MPID_nem_queue_empty (MPID_nem_mem_region.my_recvQ) || !MPID_nem_recv_seqno_matches (MPID_nem_mem_region.my_recvQ))
    {
#ifdef USE_FASTBOX
761
762
763
764
765
766
767
768
769
770
771
772
        /* check for messages from any process (even those from which
           we don't expect messages).  If we're nonblocking, check all
           fboxes at once, if we're in a blocking loop we'll keep
           iterating, so just check them one at a time. */
        if (!in_blocking_progress) {
            int found;
            found = poll_every_fbox(cell);
            if (found)
                goto fbox_l;
        } else {
            poll_next_fbox (cell, goto fbox_l);
        }
773
774
775
776
777
778
779
#endif/* USE_FASTBOX     */
	*cell = NULL;
	goto fn_exit;
    }
    
    MPID_nem_queue_dequeue (MPID_nem_mem_region.my_recvQ, cell);

780
    ++MPID_nem_recv_seqno[(*cell)->pkt.mpich.source];
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
    *in_fbox = 0;

 fn_exit:
    DO_PAPI (PAPI_accum_var (PAPI_EventSet, PAPI_vvalues6));
    
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, {
	if (*cell)
	{
	    MPIU_DBG_MSG_S (CH3_CHANNEL, VERBOSE, "<-- Recv %s", (*in_fbox) ? "fbox " : "queue");
	    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (*cell));
	}
    });

 fn_fail:
    return mpi_errno;

 fbox_l:
798
   *in_fbox = 1;
799
800
801
802
803
    goto fn_exit;

}

/*
804
  int MPID_nem_mpich_test_recv_wait (MPID_nem_cell_ptr_t *cell, int *in_fbox, int timeout);
805
806
807
808

  blocking receive with timeout
  waits up to timeout iterations to receive a cell
  sets cell to the received cell, or NULL if there is nothing to receive. in_fbox is true iff the cell was found in a fbox
809
  the cell must be released back to the subsystem with MPID_nem_mpich_release_cell() once the packet has been copied out
810
811
*/
#undef FUNCNAME
812
#define FUNCNAME MPID_nem_mpich_test_recv_wait
813
814
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
815
static inline int
816
MPID_nem_mpich_test_recv_wait (MPID_nem_cell_ptr_t *cell, int *in_fbox, int timeout)
817
818
819
820
{
    int mpi_errno = MPI_SUCCESS;
    
#ifdef USE_FASTBOX
821
    if (poll_active_fboxes(cell)) goto fbox_l;
822
823
#endif/* USE_FASTBOX     */

824
    if (MPID_nem_num_netmods)
825
    {
826
	mpi_errno = MPID_nem_network_poll(TRUE /* blocking */);
827
828
829
830
831
832
        if (mpi_errno) MPIU_ERR_POP (mpi_errno);
    }

    while ((--timeout > 0) && (MPID_nem_queue_empty (MPID_nem_mem_region.my_recvQ) || !MPID_nem_recv_seqno_matches (MPID_nem_mem_region.my_recvQ)))
    {
#ifdef USE_FASTBOX
833
	poll_next_fbox (cell, goto fbox_l);
834
835
836
837
838
839
840
#endif/* USE_FASTBOX     */
	*cell = NULL;
	goto exit_l;
    }
    
    MPID_nem_queue_dequeue (MPID_nem_mem_region.my_recvQ, cell);

841
    ++MPID_nem_recv_seqno[(*cell)->pkt.mpich.source];
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
    *in_fbox = 0;
 exit_l:
    
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, {
            if (*cell)
            {
                MPIU_DBG_MSG_S (CH3_CHANNEL, VERBOSE, "<-- Recv %s", (*in_fbox) ? "fbox " : "queue");
                MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell (*cell));
            }
        });

 fn_fail:
    return mpi_errno;

 fbox_l:
    *in_fbox = 1;
    goto exit_l;
}

/*
862
  int MPID_nem_mpich_blocking_recv (MPID_nem_cell_ptr_t *cell, int *in_fbox);
863
864
865
866

  blocking receive waits until there is something to receive, or then
  sets cell to the received cell. in_fbox is true iff the cell was
  found in a fbox the cell must be released back to the subsystem with
867
  MPID_nem_mpich_release_cell() once the packet has been copied out
868
869
*/
#undef FUNCNAME
870
#define FUNCNAME MPID_nem_mpich_blocking_recv
871
872
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
873
static inline int
874
MPID_nem_mpich_blocking_recv(MPID_nem_cell_ptr_t *cell, int *in_fbox, int completions)
875
876
877
878
879
880
881
882
883
884
{
    int mpi_errno = MPI_SUCCESS;
    DO_PAPI (PAPI_reset (PAPI_EventSet));

#ifdef MPICH_IS_THREADED
    /* We should never enter this function in a multithreaded app */
    MPIU_Assert(!MPIR_ThreadInfo.isThreaded);
#endif

#ifdef USE_FASTBOX
885
    if (poll_active_fboxes(cell)) goto fbox_l;
886
887
#endif /*USE_FASTBOX */

888
    if (MPID_nem_num_netmods)
889
    {
890
	mpi_errno = MPID_nem_network_poll(TRUE /* blocking */);
891
892
893
894
895
896
897
898
        if (mpi_errno) MPIU_ERR_POP (mpi_errno);
    }

    while (MPID_nem_queue_empty (MPID_nem_mem_region.my_recvQ) || !MPID_nem_recv_seqno_matches (MPID_nem_mem_region.my_recvQ))
    {
	DO_PAPI (PAPI_reset (PAPI_EventSet));

#ifdef USE_FASTBOX	
899
900
	poll_next_fbox (cell, goto fbox_l);
        if (poll_active_fboxes(cell)) goto fbox_l;
901
902
#endif /*USE_FASTBOX */

903
	if (MPID_nem_num_netmods)
904
	{            
905
	    mpi_errno = MPID_nem_network_poll(TRUE /* blocking */);
906
907
            if (mpi_errno) MPIU_ERR_POP (mpi_errno);

908
            if (!MPID_nem_safe_to_block_recv())
909
            {
910
                *cell = NULL;
911
912
913
914
                *in_fbox = 0;
                goto exit_l;
            }
	}
915
916
917
918
919
920

        if (completions != OPA_load_int(&MPIDI_CH3I_progress_completion_count)) {
            *cell = NULL;
            *in_fbox = 0;
            goto exit_l;
        }
921
        MPIU_Busy_wait();
922
923
924
925
    }

    MPID_nem_queue_dequeue (MPID_nem_mem_region.my_recvQ, cell);

926
    ++MPID_nem_recv_seqno[(*cell)->pkt.mpich.source];
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
    *in_fbox = 0;

 exit_l:    

    DO_PAPI (PAPI_accum_var (PAPI_EventSet,PAPI_vvalues8));
    
    MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, {
            if (*cell)
            {
                MPIU_DBG_MSG_S (CH3_CHANNEL, VERBOSE, "<-- Recv %s", (*in_fbox) ? "fbox " : "queue");
                MPIU_DBG_STMT (CH3_CHANNEL, VERBOSE, MPID_nem_dbg_dump_cell(*cell));
            }
        });

 fn_fail:
    return mpi_errno;

 fbox_l:
    *in_fbox = 1;
    goto exit_l;
}

/*
950
  int MPID_nem_mpich_release_cell (MPID_nem_cell_ptr_t cell, MPIDI_VC_t *vc);
951
952
953
954

  releases the cell back to the subsystem to be used for subsequent receives
*/
#undef FUNCNAME
955
#define FUNCNAME MPID_nem_mpich_release_cell
956
957
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
958
static inline int
959
MPID_nem_mpich_release_cell (MPID_nem_cell_ptr_t cell, MPIDI_VC_t *vc)
960
961
{
    int mpi_errno = MPI_SUCCESS;
962
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
963
964
965
966
967
968
969
970
    DO_PAPI (PAPI_reset (PAPI_EventSet));
    MPID_nem_queue_enqueue (vc_ch->free_queue, cell);
    DO_PAPI (PAPI_accum_var (PAPI_EventSet,PAPI_vvalues9));
    return mpi_errno;
}

#endif /*_MPID_NEM_INLINE_H*/