/* mpid_nem_init.c */
1
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
2
3
4
5
6
/*
 *  (C) 2006 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

7
#include "mpiimpl.h"
8
9
10
#include "mpid_nem_impl.h"
#include "mpid_nem_nets.h"
#include <errno.h>
11
#include "mpidi_nem_statistics.h"
12
#include "mpit.h"
13

14
15
16
17
/* constants for configure time selection of local LMT implementations */
#define MPID_NEM_LOCAL_LMT_NONE 0
#define MPID_NEM_LOCAL_LMT_SHM_COPY 1
#define MPID_NEM_LOCAL_LMT_DMA 2
18
#define MPID_NEM_LOCAL_LMT_VMSPLICE 3
19

20
21
22
23
24
25
26
27
#ifdef MEM_REGION_IN_HEAP
MPID_nem_mem_region_t *MPID_nem_mem_region_ptr = 0;
#else /* MEM_REGION_IN_HEAP */
MPID_nem_mem_region_t MPID_nem_mem_region = {{0}};
#endif /* MEM_REGION_IN_HEAP */

char MPID_nem_hostname[MAX_HOSTNAME_LEN] = "UNKNOWN";

28
29
30
static int get_local_procs(MPIDI_PG_t *pg, int our_pg_rank, int *num_local_p,
                           int **local_procs_p, int *local_rank_p);

31
#ifndef MIN
/* FIX: fully parenthesize the whole expansion.  The previous definition
   lacked outer parentheses, so an expression such as `2 * MIN(a, b)`
   parsed as `(2 * ((a) > (b))) ? (b) : (a)` and yielded the wrong value.
   NOTE: both arguments may still be evaluated twice -- do not pass
   expressions with side effects. */
#define MIN( a , b ) (((a) >  (b)) ? (b) : (a))
#endif /* MIN */

#ifndef MAX
#define MAX( a , b ) (((a) >= (b)) ? (a) : (b))
#endif /* MAX */
38
39
40

char *MPID_nem_asymm_base_addr = 0;

Dave Goodell's avatar
Dave Goodell committed
41
/* used by mpid_nem_inline.h and mpid_nem_finalize.c */
42
unsigned long long *MPID_nem_fbox_fall_back_to_queue_count = NULL;
43
44
45
46
47
48
49
50
51

#undef FUNCNAME
#define FUNCNAME MPID_nem_init_stats
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static int MPID_nem_init_stats(int n_local_ranks)
{
    int mpi_errno = MPI_SUCCESS;

52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#ifdef ENABLE_PVAR_NEM
    MPID_nem_fbox_fall_back_to_queue_count = MPIU_Calloc(n_local_ranks, sizeof(unsigned long long));
#endif

    MPIR_T_PVAR_COUNTER_REGISTER_DYNAMIC(
        NEM,
        MPI_UNSIGNED_LONG_LONG,
        nem_fbox_fall_back_to_queue_count, /* name */
        MPID_nem_fbox_fall_back_to_queue_count, /* address */
        n_local_ranks, /* count, known at pvar registeration time */
        MPI_T_VERBOSITY_USER_DETAIL,
        MPI_T_BIND_NO_OBJECT,
        MPIR_T_PVAR_FLAG_CONTINUOUS, /* flags */
        NULL, /* get_value */
        NULL, /* get_count */
        "NEMESIS", /* category */
        "Array counting how many times nemesis had to fall back to the regular queue when sending messages between pairs of local processes");

fn_exit:
71
    return mpi_errno;
72
73
fn_fail:
    goto fn_exit;
74
75
}

76
77
78
79
#undef FUNCNAME
#define FUNCNAME MPID_nem_init
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
80
int
81
MPID_nem_init(int pg_rank, MPIDI_PG_t *pg_p, int has_parent ATTRIBUTE((unused)))
82
83
84
85
86
87
88
{
    int    mpi_errno       = MPI_SUCCESS;
    int    num_procs       = pg_p->size;
    int    ret;
    int    num_local       = -1;
    int   *local_procs     = NULL;
    int    local_rank      = -1;
89
    int    idx;
90
91
92
93
    int    i;
    char  *publish_bc_orig = NULL;
    char  *bc_val          = NULL;
    int    val_max_remaining;
94
95
96
97
98
    int    grank;
    MPID_nem_fastbox_t *fastboxes_p = NULL;
    MPID_nem_cell_t (*cells_p)[MPID_NEM_NUM_CELLS];
    MPID_nem_queue_t *recv_queues_p = NULL;
    MPID_nem_queue_t *free_queues_p = NULL;
99

100
    MPIU_CHKPMEM_DECL(9);
101

102
103
    /* TODO add compile-time asserts (rather than run-time) and convert most of these */

104
105
106
107
    /* Make sure the nemesis packet is no larger than the generic
       packet.  This is needed because we no longer include channel
       packet types in the CH3 packet types to allow dynamic channel
       loading. */
108
    MPIU_Assert(sizeof(MPIDI_CH3_nem_pkt_t) <= sizeof(MPIDI_CH3_Pkt_t));
109

110
    /* The MPID_nem_cell_rel_ptr_t defined in mpid_nem_datatypes.h
111
112
113
114
       should only contain a OPA_ptr_t.  This is to check that
       absolute pointers are exactly the same size as relative
       pointers. */
    MPIU_Assert(sizeof(MPID_nem_cell_rel_ptr_t) == sizeof(OPA_ptr_t));
115

116
117
118
119
    /* Make sure the cell structure looks like it should */
    MPIU_Assert(MPID_NEM_CELL_PAYLOAD_LEN + MPID_NEM_CELL_HEAD_LEN == sizeof(MPID_nem_cell_t));
    MPIU_Assert(sizeof(MPID_nem_cell_t) == sizeof(MPID_nem_abs_cell_t));
    /* Make sure payload is aligned on a double */
120
    MPIU_Assert(MPID_NEM_ALIGNED(&((MPID_nem_cell_t*)0)->pkt.mpich.p.payload[0], sizeof(double)));
121

122
123
124
125
    /* Initialize the business card */
    mpi_errno = MPIDI_CH3I_BCInit( &bc_val, &val_max_remaining );
    if (mpi_errno) MPIU_ERR_POP (mpi_errno);
    publish_bc_orig = bc_val;
126

127
    ret = gethostname (MPID_nem_hostname, MAX_HOSTNAME_LEN);
128
    MPIU_ERR_CHKANDJUMP2 (ret == -1, mpi_errno, MPI_ERR_OTHER, "**sock_gethost", "**sock_gethost %s %d", MPIU_Strerror (errno), errno);
129
130
131

    MPID_nem_hostname[MAX_HOSTNAME_LEN-1] = '\0';

132
    mpi_errno = get_local_procs(pg_p, pg_rank, &num_local, &local_procs, &local_rank);
133
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
134

135
136
137
#ifdef MEM_REGION_IN_HEAP
    MPIU_CHKPMEM_MALLOC (MPID_nem_mem_region_ptr, MPID_nem_mem_region_t *, sizeof(MPID_nem_mem_region_t), mpi_errno, "mem_region");
#endif /* MEM_REGION_IN_HEAP */
138

139
140
141
142
143
144
145
146
    MPID_nem_mem_region.num_seg        = 7;
    MPIU_CHKPMEM_MALLOC (MPID_nem_mem_region.seg, MPID_nem_seg_info_ptr_t, MPID_nem_mem_region.num_seg * sizeof(MPID_nem_seg_info_t), mpi_errno, "mem_region segments");
    MPID_nem_mem_region.rank           = pg_rank;
    MPID_nem_mem_region.num_local      = num_local;
    MPID_nem_mem_region.num_procs      = num_procs;
    MPID_nem_mem_region.local_procs    = local_procs;
    MPID_nem_mem_region.local_rank     = local_rank;
    MPIU_CHKPMEM_MALLOC (MPID_nem_mem_region.local_ranks, int *, num_procs * sizeof(int), mpi_errno, "mem_region local ranks");
147
    MPID_nem_mem_region.ext_procs      = num_procs - num_local ;
148
149
    MPIU_CHKPMEM_MALLOC (MPID_nem_mem_region.ext_ranks, int *, MPID_nem_mem_region.ext_procs * sizeof(int), mpi_errno, "mem_region ext ranks");
    MPID_nem_mem_region.next           = NULL;
150

151
    for (idx = 0 ; idx < num_procs; idx++)
152
    {
153
	MPID_nem_mem_region.local_ranks[idx] = MPID_NEM_NON_LOCAL;
154
    }
155
    for (idx = 0; idx < num_local; idx++)
156
    {
157
158
	grank = local_procs[idx];
	MPID_nem_mem_region.local_ranks[grank] = idx;
159
160
    }

161
    idx = 0;
162
    for(grank = 0 ; grank < num_procs ; grank++)
163
    {
164
	if(!MPID_NEM_IS_LOCAL(grank))
165
	{
166
	    MPID_nem_mem_region.ext_ranks[idx++] = grank;
167
168
169
170
171
172
173
174
175
176
177
	}
    }

#ifdef FORCE_ASYM
    {
        /* this is used for debugging
           each process allocates a different sized piece of shared
           memory so that when the shared memory segment used for
           communication is allocated it will probably be mapped at a
           different location for each process
        */
178
        MPIU_SHMW_Hnd_t handle;
179
180
181
	int size = (local_rank * 65536) + 65536;
	char *base_addr;

182
183
184
185
        mpi_errno = MPIU_SHMW_Hnd_init(&handle);
        if(mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }

        mpi_errno = MPIU_SHMW_Seg_create_and_attach(handle, size, &base_addr, 0);
186
187
188
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno)
        {
189
190
            MPIU_SHMW_Seg_remove(handle);
            MPIU_SHMW_Hnd_finalize(&handle);
191
192
193
            MPIU_ERR_POP (mpi_errno);
        }
        /* --END ERROR HANDLING-- */
194

195
        mpi_errno = MPIU_SHMW_Seg_remove(handle);
196
197
198
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno)
        {
199
            MPIU_SHMW_Hnd_finalize(&handle);
200
201
202
203
            MPIU_ERR_POP (mpi_errno);
        }
        /* --END ERROR HANDLING-- */

204
        MPIU_SHMW_Hnd_finalize(&handle);
205
206
207
208
    }
    /*fprintf(stderr,"[%i] -- address shift ok \n",pg_rank); */
#endif  /*FORCE_ASYM */

209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
    /* Request fastboxes region */
    mpi_errno = MPIDI_CH3I_Seg_alloc(MAX((num_local*((num_local-1)*sizeof(MPID_nem_fastbox_t))), MPID_NEM_ASYMM_NULL_VAL),
                                     (void **)&fastboxes_p);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    
    /* Request data cells region */
    mpi_errno = MPIDI_CH3I_Seg_alloc(num_local * MPID_NEM_NUM_CELLS * sizeof(MPID_nem_cell_t), (void **)&cells_p);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    /* Request free q region */
    mpi_errno = MPIDI_CH3I_Seg_alloc(num_local * sizeof(MPID_nem_queue_t), (void **)&free_queues_p);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    /* Request recv q region */
    mpi_errno = MPIDI_CH3I_Seg_alloc(num_local * sizeof(MPID_nem_queue_t), (void **)&recv_queues_p);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    /* Request shared collectives barrier vars region */
    mpi_errno = MPIDI_CH3I_Seg_alloc(MPID_NEM_NUM_BARRIER_VARS * sizeof(MPID_nem_barrier_vars_t),
                                     (void **)&MPID_nem_mem_region.barrier_vars);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    /* Actually allocate the segment and assign regions to the pointers */
    mpi_errno = MPIDI_CH3I_Seg_commit(&MPID_nem_mem_region.memory, num_local, local_rank);
233
234
    if (mpi_errno) MPIU_ERR_POP (mpi_errno);

235
236
    /* init shared collectives barrier region */
    mpi_errno = MPID_nem_barrier_vars_init(MPID_nem_mem_region.barrier_vars);
237
238
    if (mpi_errno) MPIU_ERR_POP (mpi_errno);

239
    /* local procs barrier */
240
    mpi_errno = MPID_nem_barrier();
241
242
    if (mpi_errno) MPIU_ERR_POP (mpi_errno);

Darius Buntinas's avatar
Darius Buntinas committed
243
    /* find our cell region */
244
    MPID_nem_mem_region.Elements = cells_p[local_rank];
245

246
247
248
    /* Tables of pointers to shared memory Qs */
    MPIU_CHKPMEM_MALLOC(MPID_nem_mem_region.FreeQ, MPID_nem_queue_ptr_t *, num_procs * sizeof(MPID_nem_queue_ptr_t), mpi_errno, "FreeQ");
    MPIU_CHKPMEM_MALLOC(MPID_nem_mem_region.RecvQ, MPID_nem_queue_ptr_t *, num_procs * sizeof(MPID_nem_queue_ptr_t), mpi_errno, "RecvQ");
249

250
251
252
    /* Init table entry for our Qs */
    MPID_nem_mem_region.FreeQ[pg_rank] = &free_queues_p[local_rank];
    MPID_nem_mem_region.RecvQ[pg_rank] = &recv_queues_p[local_rank];
253

254
255
256
257
258
    /* Init our queues */
    MPID_nem_queue_init(MPID_nem_mem_region.RecvQ[pg_rank]);
    MPID_nem_queue_init(MPID_nem_mem_region.FreeQ[pg_rank]);
    
    /* Init and enqueue our free cells */
259
    for (idx = 0; idx < MPID_NEM_NUM_CELLS; ++idx)
260
    {
261
262
	MPID_nem_cell_init(&(MPID_nem_mem_region.Elements[idx]));
	MPID_nem_queue_enqueue(MPID_nem_mem_region.FreeQ[pg_rank], &(MPID_nem_mem_region.Elements[idx]));
263
264
    }

265
266
267
268
269
270
271
272
273
274
    mpi_errno = MPID_nem_coll_init();
    if (mpi_errno) MPIU_ERR_POP (mpi_errno);
    
    /* This must be done before initializing the netmod so that the nemesis
       communicator creation hooks get registered (and therefore called) before
       the netmod hooks, giving the netmod an opportunity to override the
       nemesis collective function table. */
    mpi_errno = MPIDI_CH3U_Comm_register_create_hook(MPIDI_CH3I_comm_create, NULL);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

275
    /* network init */
276
    if (MPID_nem_num_netmods)
277
    {
278
279
        mpi_errno = MPID_nem_choose_netmod();
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
Darius Buntinas's avatar
Darius Buntinas committed
280
	mpi_errno = MPID_nem_netmod_func->init(pg_p, pg_rank, &bc_val, &val_max_remaining);
281
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
282
283
    }

284
285
286
287
288
    /* Register detroy hooks after netmod init so the netmod hooks get called
       before nemesis hooks. */
    mpi_errno = MPIDI_CH3U_Comm_register_destroy_hook(MPIDI_CH3I_comm_destroy, NULL);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    
289
    /* set default route for external processes through network */
290
    for (idx = 0 ; idx < MPID_nem_mem_region.ext_procs ; idx++)
291
    {
292
	grank = MPID_nem_mem_region.ext_ranks[idx];
Darius Buntinas's avatar
Darius Buntinas committed
293
	MPID_nem_mem_region.FreeQ[grank] = NULL;
294
	MPID_nem_mem_region.RecvQ[grank] = NULL;
295
296
    }

297
298

    /* set route for local procs through shmem */
299
    for (idx = 0; idx < num_local; idx++)
300
    {
301
302
303
304
	grank = local_procs[idx];
	MPID_nem_mem_region.FreeQ[grank] = &free_queues_p[idx];
	MPID_nem_mem_region.RecvQ[grank] = &recv_queues_p[idx];

305
306
	MPIU_Assert(MPID_NEM_ALIGNED(MPID_nem_mem_region.FreeQ[grank], MPID_NEM_CACHE_LINE_LEN));
	MPIU_Assert(MPID_NEM_ALIGNED(MPID_nem_mem_region.RecvQ[grank], MPID_NEM_CACHE_LINE_LEN));
307
308
309
    }

    /* make pointers to our queues global so we don't have to dereference the array */
310
311
    MPID_nem_mem_region.my_freeQ = MPID_nem_mem_region.FreeQ[pg_rank];
    MPID_nem_mem_region.my_recvQ = MPID_nem_mem_region.RecvQ[pg_rank];
312

313
314
    
    /* local barrier */
315
    mpi_errno = MPID_nem_barrier();
316
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
317

318
319
320
321
    
    /* Allocate table of pointers to fastboxes */
    MPIU_CHKPMEM_MALLOC(MPID_nem_mem_region.mailboxes.in,  MPID_nem_fastbox_t **, num_local * sizeof(MPID_nem_fastbox_t *), mpi_errno, "fastboxes");
    MPIU_CHKPMEM_MALLOC(MPID_nem_mem_region.mailboxes.out, MPID_nem_fastbox_t **, num_local * sizeof(MPID_nem_fastbox_t *), mpi_errno, "fastboxes");
322

323
    MPIU_Assert(num_local > 0);
324
325
326
327

#define MAILBOX_INDEX(sender, receiver) ( ((sender) > (receiver)) ? ((num_local-1) * (sender) + (receiver)) :		\
                                          (((sender) < (receiver)) ? ((num_local-1) * (sender) + ((receiver)-1)) : 0) )

328
    /* fill in tables */
329
330
331
332
    for (i = 0; i < num_local; ++i)
    {
	if (i == local_rank)
	{
333
            /* No fastboxs to myself */
334
335
336
337
338
	    MPID_nem_mem_region.mailboxes.in [i] = NULL ;
	    MPID_nem_mem_region.mailboxes.out[i] = NULL ;
	}
	else
	{
339
340
	    MPID_nem_mem_region.mailboxes.in [i] = &fastboxes_p[MAILBOX_INDEX(i, local_rank)];
	    MPID_nem_mem_region.mailboxes.out[i] = &fastboxes_p[MAILBOX_INDEX(local_rank, i)];
341
342
	    OPA_store_int(&MPID_nem_mem_region.mailboxes.in [i]->common.flag.value, 0);
	    OPA_store_int(&MPID_nem_mem_region.mailboxes.out[i]->common.flag.value, 0);
343
344
	}
    }
345
#undef MAILBOX_INDEX
346

347
348
349
350
351
352
353
354
355
356
357
358
359
    /* setup local LMT */
#if MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_SHM_COPY
        MPID_nem_local_lmt_progress = MPID_nem_lmt_shm_progress;
#elif MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_DMA
        MPID_nem_local_lmt_progress = MPID_nem_lmt_dma_progress;
#elif MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_VMSPLICE
        MPID_nem_local_lmt_progress = MPID_nem_lmt_vmsplice_progress;
#elif MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_NONE
        MPID_nem_local_lmt_progress = NULL;
#else
#  error Must select a valid local LMT implementation!
#endif

360
361
362
    /* publish business card */
    mpi_errno = MPIDI_PG_SetConnInfo(pg_rank, (const char *)publish_bc_orig);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
363
364
    MPIU_Free(publish_bc_orig);

365

366
    mpi_errno = MPID_nem_barrier();
367
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
368
    mpi_errno = MPID_nem_mpich_init();
369
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
370
371
372
373
    mpi_errno = MPID_nem_barrier();
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
#ifdef ENABLE_CHECKPOINTING
    mpi_errno = MPIDI_nem_ckpt_init();
374
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
375
#endif
376

377
378
#ifdef PAPI_MONITOR
    my_papi_start( pg_rank );
379
#endif /*PAPI_MONITOR   */
380

381
382
    MPID_nem_init_stats(num_local);

383
384
385
386
387
388
389
390
    MPIU_CHKPMEM_COMMIT();
 fn_exit:
    return mpi_errno;
 fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
391

392
393
394
395
396
397
398
399
}

/* MPID_nem_vc_init initialize nemesis' part of the vc.
   Decides whether the peer is on-node (shared-memory fastboxes/queues,
   local LMT) or off-node (delegates to the selected netmod), and sets up
   the channel-private fields of vc->ch accordingly.  Returns MPI_SUCCESS
   or an MPI error code; on failure CHKPMEM allocations are reaped. */
#undef FUNCNAME
#define FUNCNAME MPID_nem_vc_init
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int
MPID_nem_vc_init (MPIDI_VC_t *vc)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_VC_INIT);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_VC_INIT);

    /* no channel-specific packet handlers registered by default */
    vc_ch->pkt_handler = NULL;
    vc_ch->num_pkt_handlers = 0;

    vc_ch->send_seqno         = 0;
#ifdef ENABLE_CHECKPOINTING
    /* checkpoint/restart bookkeeping starts out empty */
    vc_ch->ckpt_msg_len       = 0;
    vc_ch->ckpt_msg_buf       = NULL;
    vc_ch->ckpt_pause_send_vc = NULL;
    vc_ch->ckpt_continue_vc   = NULL;
    vc_ch->ckpt_restart_vc    = NULL;
#endif
    vc_ch->pending_pkt_len    = 0;
    /* staging buffer for a partially received CH3 packet header;
       freed in MPID_nem_vc_destroy */
    MPIU_CHKPMEM_MALLOC (vc_ch->pending_pkt, MPIDI_CH3_Pkt_t *, sizeof (MPIDI_CH3_Pkt_t), mpi_errno, "pending_pkt");

    /* We do different things for vcs in the COMM_WORLD pg vs other pgs
       COMM_WORLD vcs may use shared memory, and already have queues allocated
    */
    if (vc->lpid < MPID_nem_mem_region.num_procs)
    {
	/* This vc is in COMM_WORLD */
	vc_ch->is_local = MPID_NEM_IS_LOCAL (vc->lpid);
	vc_ch->free_queue = MPID_nem_mem_region.FreeQ[vc->lpid]; /* networks and local procs have free queues */
    }
    else
    {
	/* this vc is the result of a connect */
	vc_ch->is_local = 0;
	vc_ch->free_queue = NULL;
    }

    /* MT we acquire the LMT CS here, b/c there is at least a theoretical race
     * on some fields, such as lmt_copy_buf.  In practice it's not an issue, but
     * this will keep DRD happy. */
    MPIU_THREAD_CS_ENTER(LMT,);

    /* override rendezvous functions */
    vc->rndvSend_fn = MPID_nem_lmt_RndvSend;
    vc->rndvRecv_fn = MPID_nem_lmt_RndvRecv;

    if (vc_ch->is_local)
    {
        MPIDI_CHANGE_VC_STATE(vc, ACTIVE);

        /* wire up the fastbox pair shared with this on-node peer */
	vc_ch->fbox_out = &MPID_nem_mem_region.mailboxes.out[MPID_nem_mem_region.local_ranks[vc->lpid]]->mpich;
	vc_ch->fbox_in = &MPID_nem_mem_region.mailboxes.in[MPID_nem_mem_region.local_ranks[vc->lpid]]->mpich;
	vc_ch->recv_queue = MPID_nem_mem_region.RecvQ[vc->lpid];

        /* override nocontig send function */
        vc->sendNoncontig_fn = MPIDI_CH3I_SendNoncontig;

        /* local processes use the default method */
        vc_ch->iStartContigMsg = NULL;
        vc_ch->iSendContig     = NULL;

        /* install the compile-time-selected local LMT implementation */
#if MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_SHM_COPY
        vc_ch->lmt_initiate_lmt  = MPID_nem_lmt_shm_initiate_lmt;
        vc_ch->lmt_start_recv    = MPID_nem_lmt_shm_start_recv;
        vc_ch->lmt_start_send    = MPID_nem_lmt_shm_start_send;
        vc_ch->lmt_handle_cookie = MPID_nem_lmt_shm_handle_cookie;
        vc_ch->lmt_done_send     = MPID_nem_lmt_shm_done_send;
        vc_ch->lmt_done_recv     = MPID_nem_lmt_shm_done_recv;
        vc_ch->lmt_vc_terminated = MPID_nem_lmt_shm_vc_terminated;
#elif MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_DMA
        vc_ch->lmt_initiate_lmt  = MPID_nem_lmt_dma_initiate_lmt;
        vc_ch->lmt_start_recv    = MPID_nem_lmt_dma_start_recv;
        vc_ch->lmt_start_send    = MPID_nem_lmt_dma_start_send;
        vc_ch->lmt_handle_cookie = MPID_nem_lmt_dma_handle_cookie;
        vc_ch->lmt_done_send     = MPID_nem_lmt_dma_done_send;
        vc_ch->lmt_done_recv     = MPID_nem_lmt_dma_done_recv;
        vc_ch->lmt_vc_terminated = MPID_nem_lmt_dma_vc_terminated;
#elif MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_VMSPLICE
        vc_ch->lmt_initiate_lmt  = MPID_nem_lmt_vmsplice_initiate_lmt;
        vc_ch->lmt_start_recv    = MPID_nem_lmt_vmsplice_start_recv;
        vc_ch->lmt_start_send    = MPID_nem_lmt_vmsplice_start_send;
        vc_ch->lmt_handle_cookie = MPID_nem_lmt_vmsplice_handle_cookie;
        vc_ch->lmt_done_send     = MPID_nem_lmt_vmsplice_done_send;
        vc_ch->lmt_done_recv     = MPID_nem_lmt_vmsplice_done_recv;
        vc_ch->lmt_vc_terminated = MPID_nem_lmt_vmsplice_vc_terminated;
#elif MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_NONE
        vc_ch->lmt_initiate_lmt  = NULL;
        vc_ch->lmt_start_recv    = NULL;
        vc_ch->lmt_start_send    = NULL;
        vc_ch->lmt_handle_cookie = NULL;
        vc_ch->lmt_done_send     = NULL;
        vc_ch->lmt_done_recv     = NULL;
        vc_ch->lmt_vc_terminated = NULL;
#else
#  error Must select a valid local LMT implementation!
#endif

        vc_ch->lmt_copy_buf        = NULL;
        mpi_errno = MPIU_SHMW_Hnd_init(&(vc_ch->lmt_copy_buf_handle));
        if(mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
        mpi_errno = MPIU_SHMW_Hnd_init(&(vc_ch->lmt_recv_copy_buf_handle));
        if(mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
        vc_ch->lmt_queue.head      = NULL;
        vc_ch->lmt_queue.tail      = NULL;
        vc_ch->lmt_active_lmt      = NULL;
        vc_ch->lmt_enqueued        = FALSE;

        /* -1 means "use the built-in default" for this CVAR */
        if (MPIR_CVAR_NEMESIS_SHM_EAGER_MAX_SZ == -1)
            vc->eager_max_msg_sz = MPID_NEM_MPICH_DATA_LEN - sizeof(MPIDI_CH3_Pkt_t);
        else
            vc->eager_max_msg_sz = MPIR_CVAR_NEMESIS_SHM_EAGER_MAX_SZ;

        if (MPIR_CVAR_NEMESIS_SHM_READY_EAGER_MAX_SZ == -2)
            vc->ready_eager_max_msg_sz = vc->eager_max_msg_sz; /* force local ready sends to use LMT */
        else
            vc->ready_eager_max_msg_sz = MPIR_CVAR_NEMESIS_SHM_READY_EAGER_MAX_SZ;

        MPIU_DBG_MSG(VC, VERBOSE, "vc using shared memory");
    }
    else
    {
        /* off-node peer: no fastboxes, shared queues or local LMT; the
           netmod provides the transport */
	vc_ch->fbox_out   = NULL;
	vc_ch->fbox_in    = NULL;
	vc_ch->recv_queue = NULL;

        vc_ch->lmt_initiate_lmt  = NULL;
        vc_ch->lmt_start_recv    = NULL;
        vc_ch->lmt_start_send    = NULL;
        vc_ch->lmt_handle_cookie = NULL;
        vc_ch->lmt_done_send     = NULL;
        vc_ch->lmt_done_recv     = NULL;
        vc_ch->lmt_vc_terminated = NULL;

        /* FIXME: DARIUS set these to default for now */
        vc_ch->iStartContigMsg = NULL;
        vc_ch->iSendContig     = NULL;

        MPIU_DBG_MSG_FMT(VC, VERBOSE, (MPIU_DBG_FDEST, "vc using %s netmod for rank %d pg %s",
                                       MPID_nem_netmod_strings[MPID_nem_netmod_id], vc->pg_rank,
                                       ((vc->pg == MPIDI_Process.my_pg) 
                                        ? "my_pg" 
                                        :   ((vc->pg)
                                             ? ((char *)vc->pg->id)
                                             : "unknown"
                                            )
                                           )
                             ));

        mpi_errno = MPID_nem_netmod_func->vc_init(vc);
	if (mpi_errno) MPIU_ERR_POP(mpi_errno);

/* FIXME: DARIUS -- enable this assert once these functions are implemented */
/*         /\* iStartContigMsg iSendContig and sendNoncontig_fn must */
/*            be set for nonlocal processes.  Default functions only */
/*            support shared-memory communication. *\/ */
/*         MPIU_Assert(vc_ch->iStartContigMsg && vc_ch->iSendContig && vc->sendNoncontig_fn); */

    }

    MPIU_THREAD_CS_EXIT(LMT,);

    /* FIXME: ch3 assumes there is a field called sendq_head in the ch
       portion of the vc.  This is unused in nemesis and should be set
       to NULL */
    vc_ch->sendq_head = NULL;

    MPIU_CHKPMEM_COMMIT();
 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_VC_INIT);
    return mpi_errno;
 fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPID_nem_vc_destroy
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/* Tear down nemesis' portion of a VC: release the pending-packet staging
   buffer and let the active netmod destroy its per-VC state. */
int
MPID_nem_vc_destroy(MPIDI_VC_t *vc)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_VC_DESTROY);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_VC_DESTROY);

    /* buffer was CHKPMEM-allocated in MPID_nem_vc_init */
    MPIU_Free(vc_ch->pending_pkt);

    mpi_errno = MPID_nem_netmod_func->vc_destroy(vc);
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_VC_DESTROY);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}

int
MPID_nem_get_business_card (int my_rank, char *value, int length)
{
605
    return MPID_nem_netmod_func->get_business_card (my_rank, &value, &length);
606
607
608
609
}

/* Connect to the root process of another group identified by its
   published business card; pure delegation to the active netmod. */
int MPID_nem_connect_to_root (const char *business_card, MPIDI_VC_t *new_vc)
{
    int mpi_errno = MPID_nem_netmod_func->connect_to_root (business_card, new_vc);
    return mpi_errno;
}
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667

/* get_local_procs() determines which processes are local and
   should use shared memory
 
   If an output variable pointer is NULL, it won't be set.

   Caller should NOT free any returned buffers.

   Note that this is really only a temporary solution as it only
   calculates these values for processes MPI_COMM_WORLD, i.e., not for
   spawned or attached processes.
*/
#undef FUNCNAME
#define FUNCNAME get_local_procs
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/* Scan the process group and report every rank whose node id matches
   ours.  Outputs (each skipped when its pointer is NULL):
   num_local_p   -- number of on-node ranks (including us)
   local_procs_p -- array of their global ranks, ascending; owned by the
                    callee's allocation, caller must NOT free
   local_rank_p  -- our index within that array */
static int get_local_procs(MPIDI_PG_t *pg, int our_pg_rank, int *num_local_p,
                           int **local_procs_p, int *local_rank_p)
{
    int mpi_errno = MPI_SUCCESS;
    int *node_peers;
    int rank;
    int peer_count = 0;
    MPID_Node_id_t our_node_id;
    MPIU_CHKPMEM_DECL(1);

    MPIU_Assert(our_pg_rank < pg->size);
    our_node_id = pg->vct[our_pg_rank].node_id;

    MPIU_CHKPMEM_MALLOC(node_peers, int *, pg->size * sizeof(int), mpi_errno, "local process index array");

    /* walk the whole pg; every rank sharing our node id is a local peer */
    for (rank = 0; rank < pg->size; ++rank) {
        if (pg->vct[rank].node_id != our_node_id)
            continue;
        if (rank == our_pg_rank && local_rank_p != NULL)
            *local_rank_p = peer_count;   /* our position among the locals */
        node_peers[peer_count] = rank;
        ++peer_count;
    }

    MPIU_CHKPMEM_COMMIT();

    if (num_local_p != NULL)
        *num_local_p = peer_count;
    if (local_procs_p != NULL)
        *local_procs_p = node_peers;
fn_exit:
    return mpi_errno;
fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}