/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*  
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 *
 * Portions of this code were written by Microsoft. Those portions are
 * Copyright (c) 2007 Microsoft Corporation. Microsoft grants
 * permission to use, reproduce, prepare derivative works, and to
 * redistribute to others. The code is licensed "as is." The User
 * bears the risk of using it. Microsoft gives no express warranties,
 * guarantees or conditions. To the extent permitted by law, Microsoft
 * excludes the implied warranties of merchantability, fitness for a
 * particular purpose and non-infringement.
 */
#ifndef MPIIMPL_H_INCLUDED
#define MPIIMPL_H_INCLUDED

/*
 * This file is the temporary home of most of the definitions used to 
 * implement MPICH.  We will eventually divide this file into logical
 * pieces once we are certain of the relationships between the components.
 */

/* style: define:vsnprintf:1 sig:0 */
/* style: allow:printf:3 sig:0 */

/* Include the mpi definitions */
#include "mpi.h"

30
31
32
33
/* There are a few definitions that must be made *before* the mpichconf.h
   file is included.  These include the definitions of the error levels and some
   thread granularity constants */
#include "mpichconfconst.h"
34

35
36
37
38
/* Data computed by configure.  This is included *after* mpi.h because we
   do not want mpi.h to depend on any other files or configure flags */
#include "mpichconf.h"

39
40
41
42
43
44
/* if we are defining this, we must define it before including mpl.h */
#if defined(MPICH_DEBUG_MEMINIT)
#define MPL_VG_ENABLED 1
#endif
#include "mpl.h"

45
#include <stdio.h>
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#ifdef STDC_HEADERS
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#else
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_STDARG_H
#include <stdarg.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#endif

#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif

66
67
68
69
70
/* for MAXHOSTNAMELEN under Linux and OSX */
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif

71
72
73
74
75
76
77
#if defined (HAVE_USLEEP)
#include <unistd.h>
#if defined (NEEDS_USLEEP_DECL)
int usleep(useconds_t usec);
#endif
#endif

78
79
80
81
82
83
#if (!defined MAXHOSTNAMELEN) && (!defined MAX_HOSTNAME_LEN)
#define MAX_HOSTNAME_LEN 256
#elif !defined MAX_HOSTNAME_LEN
#define MAX_HOSTNAME_LEN MAXHOSTNAMELEN
#endif

84
85
86
/* Default PMI version to use */
#define MPIU_DEFAULT_PMI_VERSION 1
#define MPIU_DEFAULT_PMI_SUBVERSION 1
87

88
89
90
91
92
93
94
95
/* This allows us to keep names local to a single file when we can use
   weak symbols */
#ifdef  USE_WEAK_SYMBOLS
#define PMPI_LOCAL static
#else
#define PMPI_LOCAL 
#endif

96
97
98
99
100
101
102
103
104
105
/* Fix for universal endianess added in autoconf 2.62 */
#ifdef WORDS_UNIVERSAL_ENDIAN
#if defined(__BIG_ENDIAN__)
#elif defined(__LITTLE_ENDIAN__)
#define WORDS_LITTLEENDIAN
#else
#error 'Universal endianess defined without __BIG_ENDIAN__ or __LITTLE_ENDIAN__'
#endif
#endif

106
107
108
109
110
111
112
113
114
115
116
117
118
/* Include some basic (and easily shared) definitions */
#include "mpibase.h"

/* FIXME: The code base should not define two of these */
/* This is used to quote a name in a definition (see FUNCNAME/FCNAME below) */
#ifndef MPIDI_QUOTE
#define MPIDI_QUOTE(A) MPIDI_QUOTE2(A)
#define MPIDI_QUOTE2(A) #A
#endif

/* 
   Include the implementation definitions (e.g., error reporting, thread
   portability)
   More detailed documentation is contained in the MPICH and ADI3 manuals.
 */
/* FIXME: ... to do ... */
#include "mpitypedefs.h"

124
125
126
/* This is the default implementation of MPIU_Memcpy.  We define this
   before including mpidpre.h so that it can be used when a device or
   channel can use it if it's overriding MPIU_Memcpy.  */
127
128
129
MPIU_DBG_ATTRIBUTE_NOINLINE
ATTRIBUTE((unused))
static MPIU_DBG_INLINE_KEYWORD void MPIUI_Memcpy(void * dst, const void * src, size_t len)
130
131
132
133
{
    memcpy(dst, src, len);
}

134
135
136
137
138
139
140
141
/* Include definitions from the device which must exist before items in this
   file (mpiimpl.h) can be defined. mpidpre.h must be included before any
   files that allow the device to override or extend any terms; this includes
   mpiimplthread.h and mpiutil.h */
/* ------------------------------------------------------------------------- */
#include "mpidpre.h"
/* ------------------------------------------------------------------------- */

142
143
144
145
146
147
148
149
150
/* Overriding memcpy:
   Devices and channels can override the default implementation of
   MPIU_Memcpy by defining the MPIU_Memcpy macro.  The implementation
   can call MPIUI_Memcpy for the default memcpy implementation.   
   Note that MPIU_Memcpy and MPIUI_Memcpy return void rather than a
   pointer to the destination buffer.  This is different from C89
   memcpy.

   The default definition below first runs MPIU_MEM_CHECK_MEMCPY on the
   arguments and then performs the copy.  Each argument therefore appears
   twice in the expansion; do not pass expressions with side effects.
   The do { } while (0) wrapper makes the macro usable as a single
   statement (e.g. as the body of an unbraced if/else).
*/
#ifndef MPIU_Memcpy
#define MPIU_Memcpy(dst, src, len)                \
    do {                                          \
        MPIU_MEM_CHECK_MEMCPY((dst),(src),(len)); \
        MPIUI_Memcpy((dst), (src), (len));        \
    } while (0)
#endif

158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#include "mpiimplthread.h"
#include "mpiutil.h"

/* ------------------------------------------------------------------------- */
/* mpidebug.h */
/* ------------------------------------------------------------------------- */
/* Debugging and printf control */
/* Use these *only* for debugging output intended for the implementors
   and maintainers of MPICH.  Do *not* use these for any output that
   general users may normally see.  Use either the error code creation
   routines for error messages or MPIU_msg_printf etc. for general messages 
   (MPIU_msg_printf will go through gettext).  

   FIXME: Document all of these macros

   NOTE: These macros and values are deprecated.  See 
   www.mcs.anl.gov/mpi/mpich/developer/design/debugmsg.htm for 
   the new design (only partially implemented at this time).
   
   The implementation is in mpidbg.h
*/
#include "mpidbg.h"

#if defined(MPICH_DBG_OUTPUT)
/* MPIU_DBG_PRINTF(e): 'e' is a complete, parenthesized argument list, e.g.
   MPIU_DBG_PRINTF(("x=%d\n", x));  it is pasted directly after
   MPIU_dbg_printf.  The call is made only when MPIU_dbg_state is not
   MPIU_DBG_STATE_NONE.  The expansion is a brace block, not a
   do/while(0), so take care when using it as the body of an unbraced
   if that has an else. */
#define MPIU_DBG_PRINTF(e)                          \
{                                                   \
    if (MPIU_dbg_state != MPIU_DBG_STATE_NONE)      \
    {                                               \
        MPIU_dbg_printf e;                          \
    }                                               \
}
/* The first argument is a place holder to allow the selection of a subset
   of debugging events.  The second is a placeholder to allow a numeric
   level of debugging within that class.  The third is the debugging text */
#define MPIU_DBG_PRINTF_CLASS(_c,_l,_e) MPIU_DBG_PRINTF(_e)
#else
/* Debug output disabled: both macros expand to nothing, so their
   arguments are never evaluated. */
#define MPIU_DBG_PRINTF(e)
#define MPIU_DBG_PRINTF_CLASS(_c,_l,_e)
#endif
197

198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
/* The following is temporarily provided for backward compatibility.  Any code
   using dbg_printf should be updated to use MPIU_DBG_PRINTF. */
#define dbg_printf MPIU_dbg_printf

/* ------------------------------------------------------------------------- */
/* end of mpidebug.h */
/* ------------------------------------------------------------------------- */

/* Routines for memory management */
#include "mpimem.h"

/*
 * Use MPIU_SYSCALL to wrap system calls; this provides a convenient point
 * for timing the calls and keeping track of the use of system calls.
 * This macro simply invokes the system call and does not even handle
 * EINTR.
 * To use, 
 *    MPIU_SYSCALL( return-value, name-of-call, args-in-parenthesis )
 * e.g., change "n = read(fd,buf,maxn);" into
 *    MPIU_SYSCALL( n,read,(fd,buf,maxn) );
 * An example that prints each syscall to stdout is shown below. 
 */
#ifdef USE_LOG_SYSCALLS
/* Logging variant: prints a line to stdout before the call and another
   after it, each prefixed with the rank read from
   MPIR_Process.comm_world.  errno is cleared before the call; when the
   result is negative and errno is non-zero afterwards, the errno value and
   its MPIU_Strerror text are printed as well.
   The expansion references printf, fflush, errno, MPIR_Process and
   MPIU_Strerror, all of which must be visible where the macro is used.
   The result a_ is printed with %d and appears several times in the
   expansion. */
#define MPIU_SYSCALL(a_,b_,c_) { \
    printf( "[%d]about to call %s\n", MPIR_Process.comm_world->rank,#b_);\
    fflush(stdout); errno = 0;\
    a_ = b_ c_; \
    if ((a_)>=0 || errno==0) {\
        printf( "[%d]%s returned %d\n", \
                MPIR_Process.comm_world->rank, #b_, a_ );\
    } \
    else { \
        printf( "[%d]%s returned %d (errno = %d,%s)\n", \
                MPIR_Process.comm_world->rank, \
                #b_, a_, errno, MPIU_Strerror(errno));\
    };           fflush(stdout);}
#else
/* Plain variant: just perform the call and store its result in a_. */
#define MPIU_SYSCALL(a_,b_,c_) a_ = b_ c_
#endif

/*TDSOverview.tex
  
  MPI has a number of data structures, most of which are represented by 
  an opaque handle in an MPI program.  In the MPICH implementation of MPI, 
  these handles are represented
  as integers; this makes implementation of the C/Fortran handle transfer 
  calls (part of MPI-2) easy.  
 
  MPID objects (again with the possible exception of 'MPI_Request's) 
  are allocated by a common set of object allocation functions.
  These are 
.vb
    void *MPIU_Handle_obj_create( MPIU_Object_alloc_t *objmem )
    void MPIU_Handle_obj_destroy( MPIU_Object_alloc_t *objmem, void *object )
.ve
  where 'objmem' is a pointer to a memory allocation object that knows 
  enough to allocate objects, including the
  size of the object and the location of preallocated memory, as well 
  as the type of memory allocator.  By providing the routines to allocate and
  free the memory, we make it easy to use the same interface to allocate both
  local and shared memory for objects (always using the same kind for each 
  type of object).

  The names create/destroy were chosen because they are different from 
  new/delete (C++ operations) and malloc/free.  
  Any name choice will have some conflicts with other uses, of course.

  Reference Counts:
  Many MPI objects have reference count semantics.  
  The semantics of MPI require that many objects that have been freed by the 
  user 
  (e.g., with 'MPI_Type_free' or 'MPI_Comm_free') remain valid until all 
  pending
  references to that object (e.g., by an 'MPI_Irecv') are complete.  There
  are several ways to implement this; MPICH uses `reference counts` in the
  objects.  To support the 'MPI_THREAD_MULTIPLE' level of thread-safety, these
  reference counts must be accessed and updated atomically.  
  A reference count for
  `any` object can be incremented (atomically) 
  with 'MPIU_Object_add_ref(objptr)'
  and decremented with 'MPIU_Object_release_ref(objptr,inuse_ptr)'.  
  These have been designed so that they can be implemented as inlined 
  macros rather than function calls, even in the multithreaded case, and
  can use special processor instructions that guarantee atomicity to 
  avoid thread locks.
  The decrement routine sets the value pointed at by 'inuse_ptr' to 0 if 
  the postdecrement value of the reference counter is zero, and to a non-zero
  value otherwise.  If this value is zero, then the routine that decremented 
  the
  reference count should free the object.  This may be as simple as 
  calling 'MPIU_Handle_obj_destroy' (for simple objects with no other allocated
  storage) or may require calling a separate routine to destroy the object.
  Because MPI uses 'MPI_xxx_free' to both decrement the reference count and 
  free the object if the reference count is zero, we avoid the use of 'free'
  in the MPID routines.

  The 'inuse_ptr' approach is used rather than requiring the post-decrement
  value because, for reference-count semantics, all that is necessary is
  to know when the reference count reaches zero, and this can sometimes
  be implemented more cheaply than requiring the post-decrement value (e.g.,
  on IA32, there is an instruction for this operation).

  Question:
  Should we state that this is a macro so that we can use a register for
  the output value?  That avoids a store.  Alternately, have the macro 
  return the value as if it was a function?

  Structure Definitions:
  The structure definitions in this document define `only` that part of
  a structure that may be used by code that is making use of the ADI.
  Thus, some structures, such as 'MPID_Comm', have many defined fields;
  these are used to support MPI routines such as 'MPI_Comm_size' and
  'MPI_Comm_remote_group'.  Other structures may have few or no defined
  members; these structures have no fields used outside of the ADI.  
  In C++ terms,  all members of these structures are 'private'.  

  For the initial implementation, we expect that the structure definitions 
  will be designed for the multimethod device.  However, all items that are
  specific to a particular device (including the multi-method device) 
  will be placed at the end of the structure;
  the document will clearly identify the members that all implementations
  will provide.  This simplifies much of the code in both the ADI and the 
  implementation of the MPI routines because structure member can be directly
  accessed rather than using some macro or C++ style method interface.
  
 T*/

325
326
327
328
/* mpi_lang.h - Prototypes for language specific routines. Currently used to
 * set keyval attribute callbacks
 */
#include "mpi_lang.h"
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
/* Known language bindings */
/*E
  MPID_Lang_t - Known language bindings for MPI

  A few operations in MPI need to know what language they were called from
  or created by.  This type enumerates the possible languages so that
  the MPI implementation can choose the correct behavior.  An example of this
  are the keyval attribute copy and delete functions.

  Module:
  Attribute-DS
  E*/
typedef enum MPID_Lang_t {
    /* C is always present and is the first enumerator. */
    MPID_LANG_C = 0
#ifdef HAVE_FORTRAN_BINDING
    /* Only present when the Fortran binding is configured. */
    , MPID_LANG_FORTRAN
    , MPID_LANG_FORTRAN90
#endif
#ifdef HAVE_CXX_BINDING
    /* Only present when the C++ binding is configured. */
    , MPID_LANG_CXX
#endif
} MPID_Lang_t;

/* Macros for the MPI handles (e.g., the object that encodes an
   MPI_Datatype) */
#include "mpihandlemem.h"

355
356
357
358
/* This routine is used to install an attribute free routine for datatypes
   at finalize-time */
void MPIR_DatatypeAttrFinalize( void );

359
360
361
362
363
364
365
366
367
368
369
/* ------------------------------------------------------------------------- */
/* Should the following be moved into mpihandlemem.h ?*/
/* ------------------------------------------------------------------------- */

/* Routines to initialize handle allocations */
/* These are now internal to the handlemem package
void *MPIU_Handle_direct_init( void *, int, int, int );
void *MPIU_Handle_indirect_init( void *(**)[], int *, int, int, int, int );
int MPIU_Handle_free( void *((*)[]), int );
*/
/* Convert Handles to objects for MPI types that have predefined objects */
/* TODO examine generated assembly for this construct, it's probably suboptimal
 * on Blue Gene.  An if/else if/else might help the compiler out.  It also lets
 * us hint that one case is likely(), usually the BUILTIN case. */
/* MPID_Getb_ptr(kind,a,bmsk,ptr)
 *   kind - object type suffix; pasted into MPID_<kind>, MPID_<kind>_builtin,
 *          MPID_<kind>_direct and MPID_<kind>_mem
 *   a    - the handle value
 *   bmsk - mask applied to a builtin handle to index MPID_<kind>_builtin
 *   ptr  - lvalue that receives the object pointer; set to 0 when the
 *          handle kind is invalid or unrecognized
 * The expansion is a brace block, and 'a' may appear more than once in it. */
#define MPID_Getb_ptr(kind,a,bmsk,ptr)                                  \
{                                                                       \
   switch (HANDLE_GET_KIND(a)) {                                        \
      case HANDLE_KIND_BUILTIN:                                         \
          ptr=MPID_##kind##_builtin+((a)&(bmsk));                       \
          break;                                                        \
      case HANDLE_KIND_DIRECT:                                          \
          ptr=MPID_##kind##_direct+HANDLE_INDEX(a);                     \
          break;                                                        \
      case HANDLE_KIND_INDIRECT:                                        \
          ptr=((MPID_##kind*)                                           \
               MPIU_Handle_get_ptr_indirect(a,&MPID_##kind##_mem));     \
          break;                                                        \
      case HANDLE_KIND_INVALID:                                         \
      default:                                                          \
          ptr=0;                                                        \
          break;                                                        \
    }                                                                   \
}

/* Handle-to-object lookup for MPI types with no predefined (builtin)
   objects.  'kind' is pasted into MPID_<kind>, MPID_<kind>_direct and
   MPID_<kind>_mem; 'ptr' receives the object pointer, or 0 when the handle
   is builtin, invalid, or of an unrecognized kind. */
#define MPID_Get_ptr(kind,a,ptr)                                        \
{                                                                       \
    switch (HANDLE_GET_KIND(a)) {                                       \
    case HANDLE_KIND_INDIRECT:                                          \
        ptr = ((MPID_##kind *)                                          \
               MPIU_Handle_get_ptr_indirect(a, &MPID_##kind##_mem));    \
        break;                                                          \
    case HANDLE_KIND_DIRECT:                                            \
        ptr = MPID_##kind##_direct + HANDLE_INDEX(a);                   \
        break;                                                          \
    case HANDLE_KIND_BUILTIN:                                           \
    case HANDLE_KIND_INVALID:                                           \
    default:                                                            \
        ptr = 0;                                                        \
        break;                                                          \
    }                                                                   \
}

/* FIXME: the masks should be defined with the handle definitions instead
   of inserted here as literals */
/* Per-type handle-to-pointer wrappers.  Types routed through MPID_Getb_ptr
   (Comm, Group, Errhandler, Op) pass a literal mask that is applied to a
   builtin handle to index the type's _builtin array; the remaining types
   go through MPID_Get_ptr, which yields 0 for builtin handles. */
#define MPID_Comm_get_ptr(a,ptr)       MPID_Getb_ptr(Comm,a,0x03ffffff,ptr)
#define MPID_Group_get_ptr(a,ptr)      MPID_Getb_ptr(Group,a,0x03ffffff,ptr)
#define MPID_File_get_ptr(a,ptr)       MPID_Get_ptr(File,a,ptr)
#define MPID_Errhandler_get_ptr(a,ptr) MPID_Getb_ptr(Errhandler,a,0x3,ptr)
#define MPID_Op_get_ptr(a,ptr)         MPID_Getb_ptr(Op,a,0x000000ff,ptr)
#define MPID_Info_get_ptr(a,ptr)       MPID_Get_ptr(Info,a,ptr)
#define MPID_Win_get_ptr(a,ptr)        MPID_Get_ptr(Win,a,ptr)
#define MPID_Request_get_ptr(a,ptr)    MPID_Get_ptr(Request,a,ptr)
#define MPID_Grequest_class_get_ptr(a,ptr) MPID_Get_ptr(Grequest_class,a,ptr)
/* Keyvals have a special format.  This is roughly MPID_Getb_ptr, but
   the handle index is in a smaller bit field.  In addition,
   there is no storage for the builtin keyvals, so a builtin handle
   yields a null pointer just like an invalid one.
   For the indirect case, we mask off the part of the keyval that is
   in the bits normally used for the indirect block index.
*/
#define MPID_Keyval_get_ptr(a,ptr)                                      \
{                                                                       \
    switch (HANDLE_GET_KIND(a)) {                                       \
    case HANDLE_KIND_INDIRECT:                                          \
        ptr = ((MPID_Keyval *)                                          \
               MPIU_Handle_get_ptr_indirect((a)&0xfc3fffff,             \
                                            &MPID_Keyval_mem));         \
        break;                                                          \
    case HANDLE_KIND_DIRECT:                                            \
        ptr = MPID_Keyval_direct + ((a)&0x3fffff);                      \
        break;                                                          \
    case HANDLE_KIND_BUILTIN:                                           \
    case HANDLE_KIND_INVALID:                                           \
    default:                                                            \
        ptr = 0;                                                        \
        break;                                                          \
    }                                                                   \
}

/* Valid pointer checks */
/* This test is lame.  Should eventually include cookie test 
   and in-range addresses */
/* MPID_Valid_ptr(kind,ptr,err): if ptr is null, assign to err an error code
   built by MPIR_Err_create_code with class MPI_ERR_OTHER and the stringized
   'kind' as the message argument; err is left untouched otherwise.
   The expansion uses FCNAME and __LINE__, so FCNAME must be defined at the
   point of use. */
#define MPID_Valid_ptr(kind,ptr,err) \
  {if (!(ptr)) { err = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, \
                                             "**nullptrtype", "**nullptrtype %s", #kind ); } }
/* Same as MPID_Valid_ptr, but the caller supplies the error class. */
#define MPID_Valid_ptr_class(kind,ptr,errclass,err) \
  {if (!(ptr)) { err = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, errclass, \
                                             "**nullptrtype", "**nullptrtype %s", #kind ); } }

/* Null check for info objects; reports class MPI_ERR_INFO. */
#define MPID_Info_valid_ptr(ptr,err) MPID_Valid_ptr_class(Info,ptr,MPI_ERR_INFO,err)
/* Check not only for a null pointer but for an invalid communicator,
   such as one that has been freed.  Let's try the ref_count as the test
   for now */
/* ticket #1441: check (refcount<=0) to cover the case of 0, an "over-free" of
 * -1 or similar, and the 0xecec... case when --enable-g=mem is used */
/* On a null ptr, err is set via MPID_Valid_ptr_class (class MPI_ERR_COMM).
 * On a non-null ptr whose reference count is <= 0, err is set via
 * MPIU_ERR_SET and ptr itself is reset to 0, so ptr must be a modifiable
 * lvalue.  ptr appears several times in the expansion. */
#define MPID_Comm_valid_ptr(ptr,err) {                \
     MPID_Valid_ptr_class(Comm,ptr,MPI_ERR_COMM,err); \
     if ((ptr) && MPIU_Object_get_ref(ptr) <= 0) {    \
         MPIU_ERR_SET(err,MPI_ERR_COMM,"**comm");     \
         ptr = 0;                                     \
     }                                                \
}
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
/* Null-pointer checks for the remaining object types.  Each forwards to
   MPID_Valid_ptr_class with the error class shown; only a null test is
   performed here. */
#define MPID_Group_valid_ptr(ptr,err) MPID_Valid_ptr_class(Group,ptr,MPI_ERR_GROUP,err)
#define MPID_Win_valid_ptr(ptr,err) MPID_Valid_ptr_class(Win,ptr,MPI_ERR_WIN,err)
#define MPID_Op_valid_ptr(ptr,err) MPID_Valid_ptr_class(Op,ptr,MPI_ERR_OP,err)
#define MPID_Errhandler_valid_ptr(ptr,err) MPID_Valid_ptr_class(Errhandler,ptr,MPI_ERR_ARG,err)
#define MPID_File_valid_ptr(ptr,err) MPID_Valid_ptr_class(File,ptr,MPI_ERR_FILE,err)
#define MPID_Request_valid_ptr(ptr,err) MPID_Valid_ptr_class(Request,ptr,MPI_ERR_REQUEST,err)
#define MPID_Keyval_valid_ptr(ptr,err) MPID_Valid_ptr_class(Keyval,ptr,MPI_ERR_KEYVAL,err)

/* FIXME: 
   Generic pointer test.  This is applied to any address, not just one from
   an MPI object.
   Currently unimplemented (returns success except for null pointers).
   With a little work, could check that the pointer is properly aligned,
   using something like 
   ((p) == 0 || (((char *)(p) & MPID_Alignbits[alignment]) != 0))
   where MPID_Alignbits is set with a mask whose bits must be zero in a 
   properly aligned quantity.  For systems with no alignment rules, 
   all of these masks are zero, and this part of test can be eliminated.
   Note that the 'alignment' argument is currently ignored.
 */
#define MPID_Pointer_is_invalid(p,alignment) ((p) == 0)
/* FIXME: The following MPID_ALIGNED_xxx values are temporary.  They 
   need to be computed by configure and included in the mpichconf.h file.
   Note that they cannot be set conservatively (i.e., as sizeof(object)),
   since the runtime system may generate objects with lesser alignment
   rules if the processor allows them.
 */
#define MPID_ALIGNED_PTR_INT   1
#define MPID_ALIGNED_PTR_LONG  1
#define MPID_ALIGNED_PTR_VOIDP 1
/* ------------------------------------------------------------------------- */
/* end of code that should the following be moved into mpihandlemem.h ?*/
/* ------------------------------------------------------------------------- */

/* ------------------------------------------------------------------------- */
/* Info */
/*TInfoOverview.tex

  'MPI_Info' provides a way to create a list of '(key,value)' pairs
  where the 'key' and 'value' are both strings.  Because many routines, both
  in the MPI implementation and in related APIs such as the PMI process
  management interface, require 'MPI_Info' arguments, we define a simple 
  structure for each 'MPI_Info' element.  Elements are allocated by the 
  generic object allocator; the head element is always empty (no 'key'
  or 'value' is defined on the head element).  
  
  For simplicity, we have not abstracted the info data structures;
  routines that want to work with the linked list may do so directly.
  Because the 'MPI_Info' type is a handle and not a pointer, an MPIU
  (utility) routine is provided to handle the 
  deallocation of 'MPID_Info' elements.  See the implementation of
  'MPI_Info_create' for how an Info type is allocated.

  Thread Safety:

  The info interface itself is not thread-robust.  In particular, the routines
  'MPI_INFO_GET_NKEYS' and 'MPI_INFO_GET_NTHKEY' assume that no other 
  thread modifies the info key.  (If the info routines had the concept
  of a next value, they would not be thread safe.  As it stands, a user
  must be careful if several threads have access to the same info object.) 
  Further, 'MPI_INFO_DUP', while not 
  explicitly advising implementers to be careful of one thread modifying the
  'MPI_Info' structure while 'MPI_INFO_DUP' is copying it, requires that the
  operation take place in a thread-safe manner.
  There isn''t much that we can do about these cases.  There are other cases
  that must be handled.  In particular, multiple threads are allowed to 
  update the same info value.  Thus, all of the update routines must be thread
  safe; the simple implementation used in the MPICH implementation uses locks.
  Note that the 'MPI_Info_delete' call does not need a lock; the definition of
  thread-safety means that any order of the calls functions correctly; since
  it is invalid either to delete the same 'MPI_Info' twice or to modify an
  'MPI_Info' that has been deleted, only one thread at a time can call 
  'MPI_Info_free' on any particular 'MPI_Info' value.  

  T*/
/*S
  MPID_Info - Structure of an MPID info

  Notes:
  There is no reference count because 'MPI_Info' values, unlike other MPI 
  objects, may be changed after they are passed to a routine without 
  changing the routine''s behavior.  In other words, any routine that uses
  an 'MPI_Info' object must make a copy or otherwise act on any info value
  that it needs.

  A linked list is used because the typical 'MPI_Info' list will be short
  and a simple linked list is easy to implement and to maintain.  Similarly,
  a single structure rather than separate header and element structures are
  defined for simplicity.  No separate thread lock is provided because
  info routines are not performance critical; they may use the single
  critical section lock in the 'MPIR_Process' structure when they need a
  thread lock.
  
  This particular form of linked list (in particular, with this particular
  choice of the first two members) is used because it allows us to use 
  the same routines to manage this list as are used to manage the 
  list of free objects (in the file 'src/util/mem/handlemem.c').  In 
  particular, if lock-free routines for updating a linked list are 
  provided, they can be used for managing the 'MPID_Info' structure as well.

  The MPI standard requires that keys can be no less than 32 characters and
  no more than 255 characters.  There is no mandated limit on the size 
  of values.

  Module:
  Info-DS
  S*/
typedef struct MPID_Info {
580
    MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
    struct MPID_Info   *next;
    char               *key;
    char               *value;
} MPID_Info;
extern MPIU_Object_alloc_t MPID_Info_mem;
/* Preallocated info objects */
extern MPID_Info MPID_Info_direct[];
/* ------------------------------------------------------------------------- */

/* ------------------------------------------------------------------------- */
/* Error Handlers */
/*E
  MPID_Errhandler_fn - MPID Structure to hold an error handler function

  Notes:
  The MPI-1 Standard declared only the C version of this, implicitly 
  assuming that 'int' and 'MPI_Fint' were the same. 

  Since Fortran does not have a C-style variable number of arguments 
  interface, the Fortran interface simply accepts two arguments.  Some
  calling conventions for Fortran (particularly under Windows) require
  this.

  Module:
  ErrHand-DS
  
  Questions:
  What do we want to do about C++?  Do we want a hook for a routine that can
  be called to throw an exception in C++, particularly if we give C++ access
  to this structure?  Does the C++ handler need to be different (not part
  of the union)?

  E*/
/* Only one member is meaningful for a given handler; which one is not
   recorded in the union itself.  Member order is part of the layout seen by
   brace initializers, so do not reorder. */
typedef union MPID_Errhandler_fn {
   void (*C_Comm_Handler_function) ( MPI_Comm *, int *, ... );  /* C, communicator handle */
   void (*F77_Handler_function) ( MPI_Fint *, MPI_Fint * );     /* Fortran: exactly two arguments */
   void (*C_Win_Handler_function) ( MPI_Win *, int *, ... );    /* C, window handle */
   void (*C_File_Handler_function) ( MPI_File *, int *, ... );  /* C, file handle */
} MPID_Errhandler_fn;

/*S
  MPID_Errhandler - Description of the error handler structure

  Notes:
  Device-specific information may indicate whether the error handler is active;
  this can help prevent infinite recursion in error handlers caused by 
  user-error without requiring the user to be as careful.  We might want to 
  make this part of the interface so that the 'MPI_xxx_call_errhandler' 
  routines would check.

  It is useful to have a way to indicate that the errhandler is no longer
  valid, to help catch the case where the user has freed the errhandler but
  is still using a copy of the 'MPI_Errhandler' value.  We may want to 
  define the 'id' value for deleted errhandlers.

  Module:
  ErrHand-DS
  S*/
typedef struct MPID_Errhandler {
640
  MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
641
642
643
644
645
646
647
648
649
650
651
652
653
  MPID_Lang_t        language;
  MPID_Object_kind   kind;
  MPID_Errhandler_fn errfn;
  /* Other, device-specific information */
#ifdef MPID_DEV_ERRHANDLER_DECL
    MPID_DEV_ERRHANDLER_DECL
#endif
} MPID_Errhandler;
extern MPIU_Object_alloc_t MPID_Errhandler_mem;
/* Preallocated errhandler objects */
extern MPID_Errhandler MPID_Errhandler_builtin[];
extern MPID_Errhandler MPID_Errhandler_direct[];

654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
/* We never reference count the builtin error handler objects, regardless of how
 * we decide to reference count the other predefined objects.  If we get to the
 * point where we never reference count *any* of the builtin objects then we
 * should probably remove these checks and let them fall through to the checks
 * for BUILTIN down in the MPIU_Object_* routines. */
/* Take a reference on an error handler.  Builtin handlers are left alone. */
#define MPIR_Errhandler_add_ref( _errhand )                               \
    do {                                                                  \
        if (HANDLE_GET_KIND((_errhand)->handle) == HANDLE_KIND_BUILTIN) { \
            /* builtin: never reference counted */                        \
        }                                                                 \
        else {                                                            \
            MPIU_Object_add_ref( _errhand );                              \
        }                                                                 \
    } while (0)
/* Drop a reference on an error handler.  For a builtin handler the count is
 * not touched and *_inuse is forced to 1, so the caller never sees the
 * "now unused" result for it. */
#define MPIR_Errhandler_release_ref( _errhand, _inuse )                   \
    do {                                                                  \
        if (HANDLE_GET_KIND((_errhand)->handle) == HANDLE_KIND_BUILTIN) { \
            *(_inuse) = 1;                                                \
        }                                                                 \
        else {                                                            \
            MPIU_Object_release_ref( (_errhand), (_inuse) );              \
        }                                                                 \
    } while (0)
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
/* ------------------------------------------------------------------------- */

/* ------------------------------------------------------------------------- */
/* Keyvals and attributes */
/*TKyOverview.tex

  Keyvals are MPI objects that, unlike most MPI objects, are defined to be
  integers rather than a handle (e.g., 'MPI_Comm').  However, they really
  `are` MPI opaque objects and are handled by the MPICH implementation in
  the same way as all other MPI opaque objects.  The only difference is that
  there is no 'typedef int MPI_Keyval;' in 'mpi.h'.  In particular, keyvals
  are encoded (for direct and indirect references) in the same way that 
  other MPI opaque objects are

  Each keyval has a copy and a delete function associated with it.
  Unfortunately, these have a slightly different calling sequence for
  each language, particularly when the size of a pointer is 
  different from the size of a Fortran integer.  The unions 
  'MPID_Copy_function' and 'MPID_Delete_function' capture the differences
  in a single union type.

695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
  The above comment is out of date but has never been updated as it should
  have to match the introduction of a different interface.  Beware!

  Notes: 
  
  In the original design, retrieving an attribute from a different
  language than the one that set it was thought to be an error.  The MPI Forum
  decided that this should be allowed, and after much discussion, the
  behavior was defined.  Thus, we need to record what sort of
  attribute was provided, and be able to properly return the correct
  value in each case.  See MPI 2.2, Section 16.3.7 (Attributes) for
  specific requirements.  One consequence of this is that the value
  that is returned may have a different length than when it was set.
  On little-endian platforms (e.g., x86), this doesn't cause much of a
  problem, because the address is that of the least significant byte,
  and the lower bytes have the data that is needed in the case that
  the desired attribute type is shorter than the stored attribute.
  However, on a big-endian platform (e.g., IBM POWER), since the most
  significant bytes are stored first, depending on the length of the
  result type, the address of the result may not be the beginning of
  the memory area.  For example, assume that an MPI_Fint is 4 bytes
  and a void * (and a Fortran INTEGER of kind MPI_ADDRESS_KIND) is 8
  bytes, and let the attribute store the value in an 8 byte integer in
  a field named "value".  On a little-endian platform, the address of
  the value is always the beginning of the field "value".  On a
  big-endian platform, the address of the value is the beginning of
  the field if the return type is a pointer (e.g., from C) or Fortran
  (KIND=MPI_ADDRESS_KIND), and the address of the beginning of the
  field + 4 if the return type is a Fortran 77 integer (and, as
  specified above, an MPI_Fint is 4 bytes shorter than a void *).

  For the big-endian case, it is possible to manage these shifts (using
  WORDS_LITTLEENDIAN to detect the big-endian case).  Alternatively,
  at a small cost in space, copies in variables of the correct length
  can be maintained.  At this writing, the code in src/mpi/attr makes
  use of WORDS_LITTLEENDIAN to provide the appropriate code for the most
  common cases.
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778

  T*/
/*TAttrOverview.tex
 *
 * The MPI standard allows `attributes`, essentially an '(integer,pointer)'
 * pair, to be attached to communicators, windows, and datatypes.  
 * The integer is a `keyval`, which is allocated by a call (at the MPI level)
 * to 'MPI_Comm/Type/Win_create_keyval'.  The pointer is the value of 
 * the attribute.
 * Attributes are primarily intended for use by the user, for example, to save
 * information on a communicator, but can also be used to pass data to the
 * MPI implementation.  For example, an attribute may be used to pass 
 * Quality of Service information to an implementation to be used with 
 * communication on a particular communicator.  
 * To provide the most general access by the ADI to all attributes, the
 * ADI defines a collection of routines that are used by the implementation
 * of the MPI attribute routines (such as 'MPI_Comm_get_attr').
 * In addition, the MPI routines involving attributes will invoke the 
 * corresponding 'hook' functions (e.g., 'MPID_Dev_comm_attr_set_hook') 
 * should the device define them.
 *
 * Attributes on windows and datatypes are defined by MPI but not of 
 * interest (as yet) to the device.
 *
 * In addition, there are seven predefined attributes that the device must
 * supply to the implementation.  This is accomplished through 
 * data values that are part of the 'MPIR_Process' data block.
 *  The predefined keyvals on 'MPI_COMM_WORLD' are\:
 *.vb
 * Keyval                     Related Module
 * MPI_APPNUM                 Dynamic
 * MPI_HOST                   Core
 * MPI_IO                     Core
 * MPI_LASTUSEDCODE           Error
 * MPI_TAG_UB                 Communication
 * MPI_UNIVERSE_SIZE          Dynamic
 * MPI_WTIME_IS_GLOBAL        Timer
 *.ve
 * The values stored in the 'MPIR_Process' block are the actual values.  For 
 * example, the value of 'MPI_TAG_UB' is the integer value of the largest tag.
 * The
 * value of 'MPI_WTIME_IS_GLOBAL' is a '1' for true and '0' for false.  Likely
 * values for 'MPI_IO' and 'MPI_HOST' are 'MPI_ANY_SOURCE' and 'MPI_PROC_NULL'
 * respectively.
 *
 T*/

779
780
781
782
/* Include the attribute access routines that permit access to the 
   attribute or its pointer, needed for cross-language access to attributes */
#include "mpi_attr.h"

783
784
785
786
787
788
789
790
791
792
793
794
/* Because Comm, Datatype, and Win handles are all ints, and because
   attributes are otherwise identical between the three types, we
   only store generic copy and delete functions.  This allows us to use
   common code for the attribute set, delete, and dup functions */
/*E
  MPID_Copy_function - MPID Structure to hold an attribute copy function

  Notes:
  The appropriate element of this union is selected by using the language
  field of the 'keyval'.

  Because 'MPI_Comm', 'MPI_Win', and 'MPI_Datatype' are all 'int's in 
795
  MPICH, we use a single C copy function rather than have separate
796
797
798
799
800
801
802
803
804
805
  ones for the Communicator, Window, and Datatype attributes.

  There are no corresponding typedefs for the Fortran functions.  The 
  F77 function corresponds to the Fortran 77 binding used in MPI-1 and the
  F90 function corresponds to the Fortran 90 binding used in MPI-2.

  Module:
  Attribute-DS

  E*/
806
807
808
809
810
811
812
813
814
815
816
817
818
/* Proxy for C-language attribute copy callbacks (declaration only; the
 * definition lives elsewhere).  NOTE(review): judging by the parameter list it
 * forwards handle/keyval/extra_state/attrib to user_function and reports the
 * result through attrib_copy and flag -- confirm against the definition. */
int MPIR_Attr_copy_c_proxy(MPI_Comm_copy_attr_function *user_function,
                           int handle,
                           int keyval,
                           void *extra_state,
                           MPIR_AttrType attrib_type,
                           void *attrib,
                           void **attrib_copy,
                           int *flag);

typedef struct MPID_Copy_function {
819
820
821
822
823
  int  (*C_CopyFunction)( int, int, void *, void *, void *, int * );
  void (*F77_CopyFunction)  ( MPI_Fint *, MPI_Fint *, MPI_Fint *, MPI_Fint *, 
                              MPI_Fint *, MPI_Fint *, MPI_Fint * );
  void (*F90_CopyFunction)  ( MPI_Fint *, MPI_Fint *, MPI_Aint *, MPI_Aint *,
                              MPI_Aint *, MPI_Fint *, MPI_Fint * );
824
825
826
827
828
829
  /* The generic lang-independent user_function and proxy will
   * replace the lang dependent copy funcs above
   * Currently the lang-indpendent funcs are used only for keyvals
   */
  MPI_Comm_copy_attr_function *user_function;
  MPID_Attr_copy_proxy *proxy;
830
831
832
833
834
835
836
837
838
839
840
  /* The C++ function is the same as the C function */
} MPID_Copy_function;

/*E
  MPID_Delete_function - MPID Structure to hold an attribute delete function

  Notes:
  The appropriate element of this union is selected by using the language
  field of the 'keyval'.

  Because 'MPI_Comm', 'MPI_Win', and 'MPI_Datatype' are all 'int's in 
841
  MPICH, we use a single C delete function rather than have separate
842
843
844
845
846
847
848
849
850
851
  ones for the Communicator, Window, and Datatype attributes.

  There are no corresponding typedefs for the Fortran functions.  The 
  F77 function corresponds to the Fortran 77 binding used in MPI-1 and the
  F90 function corresponds to the Fortran 90 binding used in MPI-2.

  Module:
  Attribute-DS

  E*/
852
853
854
855
856
857
858
859
860
861
862
/* Proxy for C-language attribute delete callbacks (declaration only; the
 * definition lives elsewhere).  NOTE(review): judging by the parameter list it
 * forwards handle/keyval/attrib/extra_state to user_function -- confirm
 * against the definition. */
int MPIR_Attr_delete_c_proxy(MPI_Comm_delete_attr_function *user_function,
                             int handle,
                             int keyval,
                             MPIR_AttrType attrib_type,
                             void *attrib,
                             void *extra_state);

typedef struct MPID_Delete_function {
863
864
865
866
867
  int  (*C_DeleteFunction)  ( int, int, void *, void * );
  void (*F77_DeleteFunction)( MPI_Fint *, MPI_Fint *, MPI_Fint *, MPI_Fint *, 
                              MPI_Fint * );
  void (*F90_DeleteFunction)( MPI_Fint *, MPI_Fint *, MPI_Aint *, MPI_Aint *, 
                              MPI_Fint * );
868
869
870
871
872
873
  /* The generic lang-independent user_function and proxy will
   * replace the lang dependent copy funcs above
   * Currently the lang-indpendent funcs are used only for keyvals
   */
  MPI_Comm_delete_attr_function *user_function;
  MPID_Attr_delete_proxy *proxy;
874
875
876
877
878
879
880
881
882
883
} MPID_Delete_function;

/*S
  MPID_Keyval - Structure of an MPID keyval

  Module:
  Attribute-DS

  S*/
typedef struct MPID_Keyval {
884
    MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
885
    MPID_Object_kind     kind;
886
    int                  was_freed;
887
888
889
890
891
892
893
894
895
    void                 *extra_state;
    MPID_Copy_function   copyfn;
    MPID_Delete_function delfn;
  /* other, device-specific information */
#ifdef MPID_DEV_KEYVAL_DECL
    MPID_DEV_KEYVAL_DECL
#endif
} MPID_Keyval;

896
897
898
899
900
901
902
903
904
/* Take one reference on a keyval; a thin wrapper over MPIU_Object_add_ref. */
#define MPIR_Keyval_add_ref( _keyval ) \
    do { MPIU_Object_add_ref( _keyval ); } while(0)

/* Drop one reference on a keyval; MPIU_Object_release_ref is handed _inuse
 * unchanged and is responsible for filling it in. */
#define MPIR_Keyval_release_ref( _keyval, _inuse ) \
    do { MPIU_Object_release_ref( _keyval, _inuse ); } while(0)
905

906
907
908
909
910
911
912
913
914
915
916
917

/* Attribute values in C/C++ are void * and in Fortran are ADDRESS_SIZED
   integers.  Normally, these are the same size, but in at least one
   case, the address-sized integer was selected to be longer than void *
   to work with the datatype code used in the I/O library.  This
   is really a limitation in the current Datatype implementation. */
/* Storage type for an attribute value: MPI_Aint when the build defines
 * USE_AINT_FOR_ATTRVAL, a plain pointer otherwise. */
#if defined(USE_AINT_FOR_ATTRVAL)
typedef MPI_Aint MPID_AttrVal_t;
#else
typedef void *MPID_AttrVal_t;
#endif

918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
/* Attributes need no ref count or handle, but since we want to use the
   common block allocator for them, we must provide those elements 
*/
/*S
  MPID_Attribute - Structure of an MPID attribute

  Notes:
  Attributes don''t have 'ref_count's because they don''t have reference
  count semantics.  That is, there are no shallow copies or duplicates
  of an attribute.  An attribute is copied when the communicator that
  it is attached to is duplicated.  Subsequent operations, such as
  'MPI_Comm_attr_free', can change the attribute list for one of the
  communicators but not the other, making it impractical to keep the
  same list.  (We could defer making the copy until the list is changed,
  but even then, there would be no reference count on the individual
  attributes.)
 
  A pointer to the keyval, rather than the (integer) keyval itself is
  used since there is no need within the attribute structure to make
  it any harder to find the keyval structure.

  The attribute value is a 'void *'.  If 'sizeof(MPI_Fint)' > 'sizeof(void*)',
  then this must be changed (no such system has been encountered yet).
  For the Fortran 77 routines in the case where 'sizeof(MPI_Fint)' < 
  'sizeof(void*)', the high end of the 'void *' value is used.  That is,
  we cast it to 'MPI_Fint *' and use that value.
944
945
946
947
948
949
950
951
952
953

  MPI defines three kinds of attributes (see MPI 2.1, Section 16.3, pages
  487-488; the standard says two, but there are really three, as discussed
  below).  These are pointer-valued attributes and two types of integer-valued
  attributes.
  Pointer-valued attributes are used in C.
  Integer-valued attributes are used in Fortran.  These are of type either
  INTEGER or INTEGER(KIND=MPI_ADDRESS_KIND).

  The predefined attributes are a combination of INTEGER and pointers.
954
955
956
957
958
959
 
  Module:
  Attribute-DS

 S*/
typedef struct MPID_Attribute {
960
    MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
961
    MPID_Keyval  *keyval;           /* Keyval structure for this attribute */
962

963
    struct MPID_Attribute *next;    /* Pointer to next in the list */
964
    MPIR_AttrType attrType;         /* Type of the attribute */
965
966
    long        pre_sentinal;       /* Used to detect user errors in accessing
				       the value */
967
968
969
    MPID_AttrVal_t value;           /* Stored value. An Aint must be at least
				       as large as an address - some builds
				       may make an Aint larger than a void * */
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
    long        post_sentinal;      /* Like pre_sentinal */
    /* other, device-specific information */
#ifdef MPID_DEV_ATTR_DECL
    MPID_DEV_ATTR_DECL
#endif
} MPID_Attribute;
/* ------------------------------------------------------------------------- */

/*---------------------------------------------------------------------------
 * Groups are *not* a major data structure in MPICH-2.  They are provided
 * only because they are required for the group operations (e.g., 
 * MPI_Group_intersection) and for the scalable RMA synchronization
 *---------------------------------------------------------------------------*/
/* This structure is used to implement the group operations such as 
   MPI_Group_translate_ranks */
/* One entry of a group's rank map; consumed by the group operations. */
struct MPID_Group_pmap_t {
    int lrank;      /* Local rank in group (between 0 and size-1) */
    int lpid;       /* local process id, from VCONN */
    int next_lpid;  /* Index of next lpid (in lpid order) */
    int flag;       /* marker, used to implement group operations */
};
typedef struct MPID_Group_pmap_t MPID_Group_pmap_t;

/* Any changes in the MPID_Group structure must be made to the
   predefined value in MPID_Group_builtin for MPI_GROUP_EMPTY in 
   src/mpi/group/grouputil.c */
/*S
 MPID_Group - Description of the Group data structure

 The processes in the group of 'MPI_COMM_WORLD' have lpid values 0 to 'size'-1,
 where 'size' is the size of 'MPI_COMM_WORLD'.  Processes created by 
 'MPI_Comm_spawn' or 'MPI_Comm_spawn_multiple' or added by 'MPI_Comm_attach' 
 or  
 'MPI_Comm_connect'
 are numbered greater than 'size - 1' (on the calling process). See the 
 discussion of LocalPID values.

 Note that when dynamic process creation is used, the pids are `not` unique
 across the universe of connected MPI processes.  This is ok, as long as
 pids are interpreted `only` on the process that owns them.

 Only for MPI-1 are the lpid''s equal to the `global` pids.  The local pids
 can be thought of as a reference not to the remote process itself, but
 how the remote process can be reached from this process.  We may want to 
 have a structure 'MPID_Lpid_t' that contains information on the remote
 process, such as (for TCP) the hostname, ip address (it may be different if
 multiple interfaces are supported; we may even want plural ip addresses for
 striping communication), and port (or ports).  For shared memory connected
 processes, it might have the address of a remote queue.  The lpid number 
 is an index into a table of 'MPID_Lpid_t'''s that contain this (device- and
 method-specific) information.

 Module:
 Group-DS

 S*/
typedef struct MPID_Group {
1026
    MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
1027
1028
1029
1030
1031
1032
    int          size;           /* Size of a group */
    int          rank;           /* rank of this process relative to this 
				    group */
    int          idx_of_first_lpid;
    MPID_Group_pmap_t *lrank_to_lpid; /* Array mapping a local rank to local 
					 process number */
1033
1034
    int          is_local_dense_monotonic; /* see NOTE-G1 */

1035
1036
1037
1038
1039
1040
1041
    /* We may want some additional data for the RMA syncrhonization calls */
  /* Other, device-specific information */
#ifdef MPID_DEV_GROUP_DECL
    MPID_DEV_GROUP_DECL
#endif
} MPID_Group;

1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
/* NOTE-G1: is_local_dense_monotonic will be true iff the group meets the
 * following criteria:
 * 1) the lpids are all in the range [0,size-1], i.e. a subset of comm world
 * 2) the pids are sequentially numbered in increasing order, without any gaps,
 *    stride, or repetitions
 *
 * This additional information allows us to handle the common case (insofar as
 * group ops are common) for MPI_Group_translate_ranks where group2 is
 * group_of(MPI_COMM_WORLD), or some simple subset.  This is an important use
 * case for many MPI tool libraries, such as Scalasca.
 */

1054
1055
1056
1057
1058
1059
/* Object-allocator descriptor (MPIU_Object_alloc_t) for MPID_Group objects;
   defined in another translation unit. */
extern MPIU_Object_alloc_t MPID_Group_mem;
/* Preallocated group objects.  MPID_GROUP_N_BUILTIN is the length of the
   builtin array (currently a single entry); the length of the direct array
   is fixed where that array is defined. */
#define MPID_GROUP_N_BUILTIN 1
extern MPID_Group MPID_Group_builtin[MPID_GROUP_N_BUILTIN];
extern MPID_Group MPID_Group_direct[];

1060
1061
1062
/* Object for empty group.  The pointer itself is const (it cannot be
   re-seated); the MPID_Group it points at is not const-qualified. */
extern MPID_Group * const MPID_Group_empty;

1063
/* Take one reference on a group; a thin wrapper over MPIU_Object_add_ref. */
#define MPIR_Group_add_ref( _group ) \
    do { MPIU_Object_add_ref( _group ); } while (0)

/* Drop one reference on a group; MPIU_Object_release_ref is handed _inuse
 * unchanged and is responsible for filling it in. */
#define MPIR_Group_release_ref( _group, _inuse ) \
     do { MPIU_Object_release_ref( _group, _inuse ); } while (0)
1068

1069
1070
1071
/* Group helper routines; declared here, defined elsewhere.
 * NOTE(review): parameter meanings are not visible from this header --
 * consult the definitions before relying on them. */
void MPIR_Group_setup_lpid_list(MPID_Group *group_ptr);
int MPIR_GroupCheckVCRSubset(MPID_Group *group_ptr,
                             int vsize,
                             MPID_VCR *vcr,
                             int *idx);

1072
1073
1074
1075
1076
1077
1078
1079
/* ------------------------------------------------------------------------- */

/*E
  MPID_Comm_kind_t - Name the two types of communicators
  E*/
enum MPID_Comm_kind_t {
    MPID_INTRACOMM = 0,  /* intracommunicator */
    MPID_INTERCOMM = 1   /* intercommunicator */
};
typedef enum MPID_Comm_kind_t MPID_Comm_kind_t;
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089

/* ideally we could add these to MPID_Comm_kind_t, but there's too much existing
 * code that assumes that the only valid values are INTRACOMM or INTERCOMM */
/* Position of a communicator within the node-aware hierarchy. */
enum MPID_Comm_hierarchy_kind_t {
    /* no hierarchy */
    MPID_HIERARCHY_FLAT = 0,
    /* has subcommunicators */
    MPID_HIERARCHY_PARENT = 1,
    /* is the subcomm for node roots */
    MPID_HIERARCHY_NODE_ROOTS = 2,
    /* is the subcomm for a node */
    MPID_HIERARCHY_NODE = 3,
    /* cardinality of this enum; must stay last */
    MPID_HIERARCHY_SIZE
};
typedef enum MPID_Comm_hierarchy_kind_t MPID_Comm_hierarchy_kind_t;
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
/* Communicators */

/*S
  MPID_Comm - Description of the Communicator data structure

  Notes:
  Note that the size and rank duplicate data in the groups that
  make up this communicator.  These are used often enough that this
  optimization is valuable.  

  This definition provides only a 16-bit integer for context id''s .
  This should be sufficient for most applications.  However, extending
  this to a 32-bit (or longer) integer should be easy.

  There are two context ids.  One is used for sending and one for 
  receiving.  In the case of an Intracommunicator, they are the same
  context id.  They differ in the case of intercommunicators, where 
  they may come from processes in different comm worlds (in the
  case of MPI-2 dynamic process intercomms).  

  The virtual connection table is an explicit member of this structure.
  This contains the information used to contact a particular process,
  indexed by the rank relative to this communicator.

  Groups are allocated lazily.  That is, the group pointers may be
  null, created only when needed by a routine such as 'MPI_Comm_group'.
  The local process ids needed to form the group are available within
  the virtual connection table.
  For intercommunicators, we may want to always have the groups.  If not, 
  we either need the 'local_group' or we need a virtual connection table
  corresponding to the 'local_group' (we may want this anyway to simplify
  the implementation of the intercommunicator collective routines).

  The pointer to the structure 'MPID_Collops' containing pointers to the 
  collective  
  routines allows an implementation to replace each routine on a 
  routine-by-routine basis.  By default, this pointer is null, as are the 
  pointers within the structure.  If either pointer is null, the implementation
  uses the generic provided implementation.  This choice, rather than
  initializing the table with pointers to all of the collective routines,
  is made to reduce the space used in the communicators and to eliminate the
  need to include the implementation of all collective routines in all MPI 
  executables, even if the routines are not used.

  The macro 'MPID_HAS_HETERO' may be defined by a device to indicate that
  the device supports MPI programs that must communicate between processes with
  different data representations (e.g., different sized integers or different
  byte orderings).  If the device does need to define this value, it should
  be defined in the file 'mpidpre.h'. 

1140
1141
1142
1143
1144
  Please note that the local_size and remote_size fields can be confusing.  For
  intracommunicators both fields are always equal to the size of the
  communicator.  For intercommunicators local_size is equal to the size of
  local_group while remote_size is equal to the size of remote_group.

1145
1146
1147
1148
1149
1150
1151
1152
  Module:
  Communicator-DS

  Question:
  For fault tolerance, do we want to have a standard field for communicator 
  health?  For example, ok, failure detected, all (live) members of failed 
  communicator have acked.
  S*/
1153
1154
typedef struct MPID_Comm {
    MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
    MPIR_Context_id_t context_id; /* Send context id.  See notes */
    MPIR_Context_id_t recvcontext_id; /* Receive context id.  See notes */
    int           remote_size;   /* Value of MPI_Comm_(remote)_size */
    int           rank;          /* Value of MPI_Comm_rank */
    MPID_VCRT     vcrt;          /* virtual connection reference table */
    MPID_VCR *    vcr;           /* alias to the array of virtual connections
                                    in vcrt */
    MPID_VCRT     local_vcrt;    /* local virtual connection reference table */
    MPID_VCR *    local_vcr;     /* alias to the array of local virtual
                                    connections in local vcrt */
    MPID_Attribute *attributes;  /* List of attributes */
    int           local_size;    /* Value of MPI_Comm_size for local group */
    MPID_Group   *local_group,   /* Groups in communicator. */
                 *remote_group;  /* The local and remote groups are the
                                    same for intra communicators */
    MPID_Comm_kind_t comm_kind;  /* MPID_INTRACOMM or MPID_INTERCOMM */
    char          name[MPI_MAX_OBJECT_NAME];  /* Required for MPI-2 */
    MPID_Errhandler *errhandler; /* Pointer to the error handler structure */
    struct MPID_Comm    *local_comm; /* Defined only for intercomms, holds
                                        an intracomm for the local group */

    MPID_Comm_hierarchy_kind_t hierarchy_kind; /* flat, parent, node, or node_roots */
    struct MPID_Comm *node_comm; /* Comm of processes in this comm that are on
                                    the same node as this process. */
    struct MPID_Comm *node_roots_comm; /* Comm of root processes for other nodes. */
    int *intranode_table;        /* intranode_table[i] gives the rank in
                                    node_comm of rank i in this comm or -1 if i
                                    is not in this process' node_comm.
                                    It is of size 'local_size'. */
    int *internode_table;        /* internode_table[i] gives the rank in
                                    node_roots_comm of rank i in this comm.
                                    It is of size 'local_size'. */

    int           is_low_group;  /* For intercomms only, this boolean is
                                    set for all members of one of the
                                    two groups of processes and clear for
                                    the other.  It enables certain
                                    intercommunicator collective operations
                                    that wish to use half-duplex operations
                                    to implement a full-duplex operation */
    struct MPID_Comm     *comm_next;/* Provides a chain through all active
                                       communicators */
    struct MPID_Collops  *coll_fns; /* Pointer to a table of functions
                                       implementing the collective
                                       routines */
    struct MPID_TopoOps  *topo_fns; /* Pointer to a table of functions
                                       implementing the topology routines
                                    */
    int next_sched_tag;             /* used by the NBC schedule code to allocate tags */
#ifdef MPID_HAS_HETERO
    int is_hetero;
#endif
  /* Other, device-specific information */
#ifdef MPID_DEV_COMM_DECL
    MPID_DEV_COMM_DECL
#endif
} MPID_Comm;
/* Object-allocator descriptor (MPIU_Object_alloc_t) for MPID_Comm objects;
   defined in another translation unit. */
extern MPIU_Object_alloc_t MPID_Comm_mem;

1214
1215
/* this function should not be called by normal code!  It is invoked from
   MPIR_Comm_release once the last reference to comm_ptr has been dropped;
   isDisconnect is passed through from that caller.  Returns an MPI error
   code. */
int MPIR_Comm_delete_internal(MPID_Comm * comm_ptr, int isDisconnect);
1216
1217

/* Take one reference on a communicator; a thin wrapper over
 * MPIU_Object_add_ref. */
#define MPIR_Comm_add_ref(_comm) \
    do { MPIU_Object_add_ref((_comm)); } while (0)
/* Drop one reference on a communicator; MPIU_Object_release_ref is handed
 * _inuse unchanged and is responsible for filling it in. */
#define MPIR_Comm_release_ref( _comm, _inuse ) \
    do { MPIU_Object_release_ref( _comm, _inuse ); } while (0)
1221

1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258

/* Release a reference to a communicator.  If there are no pending
   references, delete the communicator and recover all storage and
   context ids.

   This routine has been inlined because keeping it as a separate routine
   results in a >5% performance hit for the SQMR benchmark.
*/
#undef FUNCNAME
#define FUNCNAME MPIR_Comm_release
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
/* Drop one reference to comm_ptr.  When MPIR_Comm_release_ref reports that
 * the object is no longer in use, the communicator is handed to
 * MPIR_Comm_delete_internal and that routine's result is returned; otherwise
 * MPI_SUCCESS is returned.  isDisconnect is forwarded unchanged. */
static inline int MPIR_Comm_release(MPID_Comm * comm_ptr, int isDisconnect)
{
    int still_in_use;

    MPIR_Comm_release_ref(comm_ptr, &still_in_use);

    if (unlikely(!still_in_use)) {
        /* MPIR_Comm_delete_internal should only be called by this function
         * and its "_always" variant.  The error is deliberately returned
         * as-is (no ERR_POP) to permit simpler inlining; our caller will
         * still report the error from the comm_delete level. */
        return MPIR_Comm_delete_internal(comm_ptr, isDisconnect);
    }

    return MPI_SUCCESS;
}
#undef FUNCNAME
#undef FCNAME

/* MPIR_Comm_release_always is the same as MPIR_Comm_release except it uses
   MPIR_Comm_release_ref_always instead.  Unlike MPIR_Comm_release it is not
   inlined in this header; only the prototype appears here.  Takes the same
   arguments and returns an MPI error code.
*/
int MPIR_Comm_release_always(MPID_Comm *comm_ptr, int isDisconnect);


1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
/* Preallocated comm objects.  There are 3: comm_world, comm_self, and
   a private (non-user accessible) dup of comm world that is provided
   if needed in MPI_Finalize.  Having a separate version of comm_world
   avoids possible interference with user code.  MPID_COMM_N_BUILTIN is the
   length of the builtin array. */
#define MPID_COMM_N_BUILTIN 3
extern MPID_Comm MPID_Comm_builtin[MPID_COMM_N_BUILTIN];
extern MPID_Comm MPID_Comm_direct[];
/* This is the handle for the internal MPI_COMM_WORLD.  The "2" at the end
   of the handle is 3-1 (i.e., the index in the builtin array), so this value
   must be kept in step with MPID_COMM_N_BUILTIN. */
#define MPIR_ICOMM_WORLD  ((MPI_Comm)0x44000002)

1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
/* The following preprocessor macros provide bitfield access information for
 * context ID values.  They follow a uniform naming pattern:
 *
 * MPID_CONTEXT_foo_WIDTH - the width in bits of the field
 * MPID_CONTEXT_foo_MASK  - A valid bit mask for bit-wise AND and OR operations
 *                          with exactly all of the bits in the field set.
 * MPID_CONTEXT_foo_SHIFT - The number of bits that the field should be shifted
 *                          rightwards to place it in the least significant bits
 *                          of the ID.  There may still be higher order bits
 *                          from other fields, so the _MASK should be used first
 *                          if you want to reliably retrieve the exact value of
 *                          the field.
 */

/* Yields an rvalue that is the value of field_name_ taken from id_ and moved
 * down into the least significant bits.  field_name_ is pasted into
 * MPID_CONTEXT_<field_name_>_MASK and MPID_CONTEXT_<field_name_>_SHIFT, so
 * both of those must be defined for the field. */
#define MPID_CONTEXT_READ_FIELD(field_name_,id_) \
    (((id_) & MPID_CONTEXT_##field_name_##_MASK) >> MPID_CONTEXT_##field_name_##_SHIFT)
/* Yields an rvalue that is old_id_ with field_name_ set to field_val_.
 * Every argument is parenthesized so that compound expressions such as
 * `a | b` are masked as a whole.  field_val_ is shifted but NOT masked:
 * the caller must pass a value that fits in the field. */
#define MPID_CONTEXT_SET_FIELD(field_name_,old_id_,field_val_) \
    (((old_id_) & ~MPID_CONTEXT_##field_name_##_MASK) | ((field_val_) << MPID_CONTEXT_##field_name_##_SHIFT))

1291
/* Context suffixes for separating pt2pt and collective communication */
1292
1293
1294
1295
1296
1297
1298
/* Suffix field: a single bit at the bottom of the context ID. */
#define MPID_CONTEXT_SUFFIX_WIDTH (1)
#define MPID_CONTEXT_SUFFIX_SHIFT (0)
#define MPID_CONTEXT_SUFFIX_MASK \
    ((1 << MPID_CONTEXT_SUFFIX_WIDTH) - 1)

/* Values of the suffix field: 0 selects point-to-point traffic and 1 selects
 * collective traffic, for intra- and intercommunicators alike. */
#define MPID_CONTEXT_INTRA_PT2PT  (0)
#define MPID_CONTEXT_INTRA_COLL   (1)
#define MPID_CONTEXT_INTER_PT2PT  (0)
#define MPID_CONTEXT_INTER_COLL   (1)
1299

1300
1301
1302
/* Used to derive context IDs for sub-communicators from a parent communicator's
   context ID value.  This field comes after the one bit suffix.
   values are shifted left by 1. */
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
/* SUBCOMM field: two bits placed directly after the suffix field. */
#define MPID_CONTEXT_SUBCOMM_WIDTH (2)
#define MPID_CONTEXT_SUBCOMM_SHIFT \
    (MPID_CONTEXT_SUFFIX_WIDTH + MPID_CONTEXT_SUFFIX_SHIFT)
#define MPID_CONTEXT_SUBCOMM_MASK \
    (((1 << MPID_CONTEXT_SUBCOMM_WIDTH) - 1) << MPID_CONTEXT_SUBCOMM_SHIFT)

/* These values may be added/subtracted directly to/from an existing context
 * ID in order to determine the context ID of the child/parent. */
#define MPID_CONTEXT_PARENT_OFFSET \
    (0 << MPID_CONTEXT_SUBCOMM_SHIFT)
#define MPID_CONTEXT_INTRANODE_OFFSET \
    (1 << MPID_CONTEXT_SUBCOMM_SHIFT)
#define MPID_CONTEXT_INTERNODE_OFFSET \
    (2 << MPID_CONTEXT_SUBCOMM_SHIFT)

/* IS_LOCALCOMM field: one bit placed directly after the SUBCOMM field.  It is
 * used to derive a context ID for local communicators of intercommunicators
 * without communication. */
#define MPID_CONTEXT_IS_LOCALCOMM_WIDTH (1)
#define MPID_CONTEXT_IS_LOCALCOMM_SHIFT \
    (MPID_CONTEXT_SUBCOMM_SHIFT + MPID_CONTEXT_SUBCOMM_WIDTH)
#define MPID_CONTEXT_IS_LOCALCOMM_MASK \
    (((1 << MPID_CONTEXT_IS_LOCALCOMM_WIDTH) - 1) << MPID_CONTEXT_IS_LOCALCOMM_SHIFT)
1318
1319
1320
1321

/* MPIR_MAX_CONTEXT_MASK is the number of ints that make up the bit vector that
 * describes the context ID prefix space.
 *
 * The following must hold:
 * (num_bits_in_vector) <= (maximum_context_id_prefix)
 *   which is the following in concrete terms:
 * MPIR_MAX_CONTEXT_MASK*MPIR_CONTEXT_INT_BITS <= 2**(MPIR_CONTEXT_ID_BITS - (MPID_CONTEXT_PREFIX_SHIFT + MPID_CONTEXT_DYNAMIC_PROC_WIDTH))
 *
 * We currently always assume MPIR_CONTEXT_INT_BITS is 32, regardless of the
 * value of sizeof(int)*CHAR_BIT.  We also make the assumption that CHAR_BIT==8.
 *
 * For a 16-bit context id field and CHAR_BIT==8, this implies MPIR_MAX_CONTEXT_MASK <= 256
 */
1332
1333

/* number of bits to shift right by in order to obtain the context ID prefix */
1334
1335
1336
1337
1338
1339
1340
/* Prefix field: starts directly above the IS_LOCALCOMM field and takes every
 * remaining bit of the context ID except the topmost
 * MPID_CONTEXT_DYNAMIC_PROC_WIDTH bit(s).  MPID_CONTEXT_DYNAMIC_PROC_WIDTH is
 * defined a few lines further down; that is fine because macros are only
 * expanded at their point of use. */
#define MPID_CONTEXT_PREFIX_SHIFT (MPID_CONTEXT_IS_LOCALCOMM_SHIFT + MPID_CONTEXT_IS_LOCALCOMM_WIDTH)
#define MPID_CONTEXT_PREFIX_WIDTH (MPIR_CONTEXT_ID_BITS - (MPID_CONTEXT_PREFIX_SHIFT + MPID_CONTEXT_DYNAMIC_PROC_WIDTH))
#define MPID_CONTEXT_PREFIX_MASK (((1 << MPID_CONTEXT_PREFIX_WIDTH) - 1) << MPID_CONTEXT_PREFIX_SHIFT)

/* Dynamic process field: the most significant bit of the context ID. */
#define MPID_CONTEXT_DYNAMIC_PROC_WIDTH (1) /* the upper half is reserved for dynamic procs */
#define MPID_CONTEXT_DYNAMIC_PROC_SHIFT (MPIR_CONTEXT_ID_BITS - MPID_CONTEXT_DYNAMIC_PROC_WIDTH) /* the upper half is reserved for dynamic procs */
#define MPID_CONTEXT_DYNAMIC_PROC_MASK (((1 << MPID_CONTEXT_DYNAMIC_PROC_WIDTH) - 1) << MPID_CONTEXT_DYNAMIC_PROC_SHIFT)
1341
1342
1343
1344
1345

/* should probably be (sizeof(int)*CHAR_BIT) once we make the code CHAR_BIT-clean */
#define MPIR_CONTEXT_INT_BITS (32)
/* Width of a context ID in bits; the literal 8 stands in for CHAR_BIT. */
#define MPIR_CONTEXT_ID_BITS (sizeof(MPIR_Context_id_t)*8)
/* Number of MPIR_CONTEXT_INT_BITS-wide words needed to hold one bit for every
 * possible context ID prefix, i.e. 2**MPID_CONTEXT_PREFIX_WIDTH bits in total.
 * Expressed through MPID_CONTEXT_PREFIX_WIDTH so the width of the prefix field
 * is computed in exactly one place. */
#define MPIR_MAX_CONTEXT_MASK \
    ((1 << MPID_CONTEXT_PREFIX_WIDTH) / MPIR_CONTEXT_INT_BITS)
1347

1348
1349
1350
1351
/* Utility routines.  Where possible, these are kept in the source directory
   with the other comm routines (src/mpi/comm, in mpicomm.h).  However,
   to create a new communicator after a spawn or connect-accept operation, 
   the device may need to create a new contextid */
1352
int MPIR_Get_contextid( MPID_Comm *, MPIR_Context_id_t *context_id );
1353
int MPIR_Get_contextid_sparse(MPID_Comm *comm_ptr, MPIR_Context_id_t *context_id, int ignore_id);
1354
int MPIR_Get_contextid_sparse_group(MPID_Comm *comm_ptr, MPID_Group *group_ptr, int tag, MPIR_Context_id_t *context_id, int ignore_id);
1355
void MPIR_Free_contextid( MPIR_Context_id_t );
1356
1357
1358
1359
1360
1361

/* ------------------------------------------------------------------------- */

/* Requests */
/* This currently defines a single structure type for all requests.  
   Eventually, we may want a union type, as used in MPICH-1 */
1362
1363
1364
1365
1366
1367
/* NOTE-R1: MPID_REQUEST_MPROBE signifies that this is a request created by
 * MPI_Mprobe or MPI_Improbe.  Since we use MPI_Request objects as our
 * MPI_Message objects, we use this separate kind in order to provide stronger
 * error checking.  Once a message (backed by a request) is promoted to a real
 * request by calling MPI_Mrecv/MPI_Imrecv, we actually modify the kind to be
 * MPID_REQUEST_RECV in order to keep completion logic as simple as possible. */
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
/*E
  MPID_Request_kind - Kinds of MPI Requests

  Notes:
  'MPID_LAST_REQUEST_KIND' follows the last device-independent kind.  A device
  may append further kinds after it by defining 'MPID_DEV_REQUEST_KIND_DECL'
  as a comma-separated list of enumerators.

  Module:
  Request-DS

  E*/
typedef enum MPID_Request_kind_t {
    MPID_REQUEST_UNDEFINED,
    MPID_REQUEST_SEND,
    MPID_REQUEST_RECV,
    MPID_PREQUEST_SEND,
    MPID_PREQUEST_RECV,
    MPID_UREQUEST,
    MPID_COLL_REQUEST,
    MPID_REQUEST_MPROBE, /* see NOTE-R1 */
    MPID_LAST_REQUEST_KIND
#ifdef MPID_DEV_REQUEST_KIND_DECL
    , MPID_DEV_REQUEST_KIND_DECL
#endif
} MPID_Request_kind_t;

/* Typedefs for Fortran generalized requests */
1391
1392
1393
/* These name function types, not pointer types: declare a callback as
 * 'MPIR_Grequest_f77_cancel_function *fn'.  Every integer argument is passed
 * by address as an 'MPI_Fint *'. */
typedef void (MPIR_Grequest_f77_cancel_function)(void *, MPI_Fint*, MPI_Fint *); 
typedef void (MPIR_Grequest_f77_free_function)(void *, MPI_Fint *); 
typedef void (MPIR_Grequest_f77_query_function)(void *, MPI_Fint *, MPI_Fint *); 
1394

1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
/* Function-pointer table plus associated state for generalized requests.
 * A request refers to one of these through a single pointer instead of
 * embedding the members directly, which saves ~48 bytes in pt2pt requests
 * on many platforms. */
struct MPID_Grequest_fns {
    /* callbacks */
    MPI_Grequest_cancel_function *cancel_fn;
    MPI_Grequest_free_function *free_fn;
    MPI_Grequest_query_function *query_fn;
    MPIX_Grequest_poll_function *poll_fn;
    MPIX_Grequest_wait_function *wait_fn;

    /* state */
    void *grequest_extra_state;
    MPIX_Grequest_class greq_class;
    MPID_Lang_t greq_lang;  /* language that defined the generalized req */
};
1408
1409
1410
1411

/* see mpiimplthread.h for the def of MPID_cc_t and related functions/macros */
/* Applies MPID_cc_is_complete to the completion counter that req_ refers to
 * through its cc_ptr member.  req_ must be a pointer to a request. */
#define MPID_Request_is_complete(req_) (MPID_cc_is_complete((req_)->cc_ptr))

1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
/*S
  MPID_Request - Description of the Request data structure

  Module:
  Request-DS

  Notes:
  If it is necessary to remember the MPI datatype, this information is 
  saved within the device-specific fields provided by 'MPID_DEV_REQUEST_DECL'.

  Requests come in many flavors, as stored in the 'kind' field.  It is 
  expected that each kind of request will have its own structure type 
  (e.g., 'MPID_Request_send_t') that extends the 'MPID_Request'.
  
  S*/
typedef struct MPID_Request {
1428
    MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
1429
1430
1431
1432
    MPID_Request_kind_t kind;
    /* pointer to the completion counter */
    /* This is necessary for the case when an operation is described by a 
       list of requests */
1433
    MPID_cc_t *cc_ptr;
1434
1435
    /* A comm is needed to find the proper error handler */
    MPID_Comm *comm;
1436
1437
1438
1439
    /* completion counter.  Ensure cc and status are in the same cache
       line, assuming the cache line size is a multiple of 32 bytes
       and 32-bit integers */
    MPID_cc_t cc;
1440
1441
1442
1443
1444
    /* Status is needed for wait/test/recv */
    MPI_Status status;
    /* Persistent requests have their own "real" requests.  Receive requests
       have partnering send requests when src=dest. etc. */
    struct MPID_Request *partner_request;
1445
1446
1447
1448
1449

    /* User-defined request support via a "vtable".  Saves space in the already
     * bloated request for regular pt2pt and NBC requests. */
    struct MPID_Grequest_fns *greq_fns;

1450
1451
1452
1453
    /* Other, device-specific information */
#ifdef MPID_DEV_REQUEST_DECL
    MPID_DEV_REQUEST_DECL
#endif
1454
} MPID_Request ATTRIBUTE((__aligned__(32)));
1455

1456
1457
1458
1459
1460
/* Object-allocation state for requests; only declared here, so the
   definition lives in some source file. */
extern MPIU_Object_alloc_t MPID_Request_mem;
/* Preallocated request objects */
extern MPID_Request MPID_Request_direct[];

/* Adds one reference to the request _req by forwarding to MPIU_Object_add_ref.
 * Wrapped in do { } while (0) so that it behaves as a single statement. */
#define MPIR_Request_add_ref( _req ) \
    do { MPIU_Object_add_ref( _req ); } while (0)
1462
1463

/* Drops one reference on the request _req by forwarding to
 * MPIU_Object_release_ref, which reports through _inuse whether the object is
 * still referenced.  Wrapped in do { } while (0) so that it behaves as a
 * single statement. */
#define MPIR_Request_release_ref( _req, _inuse ) \
    do { MPIU_Object_release_ref( _req, _inuse ); } while (0)
1465
1466
1467
1468
1469
1470
1471

/* These macros allow us to implement a sendq when debugger support is
   selected.  As there is extra overhead for this, we only do this
   when specifically requested
*/
#ifdef HAVE_DEBUGGER_SUPPORT
void MPIR_WaitForDebugger( void );
void MPIR_DebuggerSetAborting( const char * );
void MPIR_Sendq_remember(MPID_Request *, int, int, int );
void MPIR_Sendq_forget(MPID_Request *);
void MPIR_CommL_remember( MPID_Comm * );
void MPIR_CommL_forget( MPID_Comm * );

/* With debugger support each macro forwards to the routine of the same name */
#define MPIR_SENDQ_REMEMBER(_a,_b,_c,_d) MPIR_Sendq_remember(_a,_b,_c,_d)
#define MPIR_SENDQ_FORGET(_a) MPIR_Sendq_forget(_a)
#define MPIR_COMML_REMEMBER(_a) MPIR_CommL_remember( _a )
#define MPIR_COMML_FORGET(_a) MPIR_CommL_forget( _a )
#else
/* Without debugger support the macros expand to nothing, so their arguments
   are never evaluated; do not pass expressions with needed side effects. */
#define MPIR_SENDQ_REMEMBER(_a,_b,_c,_d)
#define MPIR_SENDQ_FORGET(_a)
#define MPIR_COMML_REMEMBER(_a)
#define MPIR_COMML_FORGET(_a)
#endif

1489
1490
1491
/* must come after MPID_Comm is declared/defined */
/* Nonblocking context ID acquisition: both routines take the parent
   communicator and the new communicator, and hand back a request through
   req. */
int MPIR_Get_contextid_nonblock(MPID_Comm *comm_ptr, MPID_Comm *newcommp, MPID_Request **req);
int MPIR_Get_intercomm_contextid_nonblock(MPID_Comm *comm_ptr, MPID_Comm *newcommp, MPID_Request **req);
1492
1493

/* ------------------------------------------------------------------------- */
1494
1495
1496
1497
1498
1499
1500
/* Prototypes and definitions for the node ID code.  This is used to support
   hierarchical collectives in a (mostly) device-independent way.
   Both routines are declared only when the device defines MPID_USE_NODE_IDS,
   and both deliver their result through the trailing output pointer. */
#if defined(MPID_USE_NODE_IDS)
/* MPID_Node_id_t is a signed integer type defined by the device in mpidpre.h. */
int MPID_Get_node_id(MPID_Comm *comm, int rank, MPID_Node_id_t *id_p);
int MPID_Get_max_node_id(MPID_Comm *comm, MPID_Node_id_t *max_id_p);
#endif
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524

/* ------------------------------------------------------------------------- */
/*S
  MPID_Progress_state - object to hold progress state when using the blocking
  progress routines.

  Module:
  Misc

  Notes:
  The device must define MPID_PROGRESS_STATE_DECL.  It should  include any state
  that needs to be maintained between calls to MPID_Progress_{start,wait,end}.
  S*/
typedef struct MPID_Progress_state {
    /* members are supplied entirely by the device */
    MPID_PROGRESS_STATE_DECL
} MPID_Progress_state;
/* ------------------------------------------------------------------------- */

/* ------------------------------------------------------------------------- */
/* end of mpirma.h (in src/mpi/rma?) */
/* ------------------------------------------------------------------------- */

1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
/*
 * To provide more flexibility in the handling of RMA operations, we provide
 * these options:
 *
 *  Statically defined ADI routines
 *      MPID_Put etc, provided by the ADI
 *  Dynamically defined routines
 *      A function table is used, initialized during window creation
 *
 * Which of these is used is selected by the device.  If USE_MPID_RMA_TABLE is
 * defined, then the function table is used.  Otherwise, the calls turn into
 * MPID_<Rma operation>, e.g., MPID_Put or MPID_Win_create.
 */

/* We need to export this header file (at least the struct) to the
   device, so that it can implement the init routine. */
/* MPIU_RMA_CALL(winptr, Fn(args)) issues the RMA operation Fn for a window:
 *   - with USE_MPID_RMA_TABLE it expands to (winptr)->RMAFns.Fn(args);
 *   - otherwise it expands to MPID_Fn(args) and winptr is not used at all.
 * funccall is the call expression without the MPID_ prefix. */
#ifdef USE_MPID_RMA_TABLE
#define MPIU_RMA_CALL(winptr,funccall) (winptr)->RMAFns.funccall

#else
/* Just use the MPID_<fcn> version of the function */
#define MPIU_RMA_CALL(winptr,funccall) MPID_##funccall

#endif /* USE_MPID_RMA_TABLE */

1550
/* Windows */
1551
1552
#ifdef USE_MPID_RMA_TABLE
struct MPID_Win;
/* Table of RMA entry points, one function pointer per operation.  Every
   entry returns an int.  An instance is embedded in each window as the
   'RMAFns' member, which is what MPIU_RMA_CALL dispatches through. */
typedef struct MPID_RMA_Ops {
    int (*Win_free)(struct MPID_Win **);

    /* Communication calls */
    int (*Put)(const void *, int, MPI_Datatype, int, MPI_Aint, int, MPI_Datatype,
               struct MPID_Win *);
    int (*Get)(void *, int, MPI_Datatype, int, MPI_Aint, int, MPI_Datatype,
               struct MPID_Win *);
    int (*Accumulate)(const void *, int, MPI_Datatype, int, MPI_Aint, int,
                      MPI_Datatype, MPI_Op, struct MPID_Win *);

    /* Fence and post/start/complete/wait synchronization */
    int (*Win_fence)(int, struct MPID_Win *);
    int (*Win_post)(MPID_Group *, int, struct MPID_Win *);
    int (*Win_start)(MPID_Group *, int, struct MPID_Win *);
    int (*Win_complete)(struct MPID_Win *);
    int (*Win_wait)(struct MPID_Win *);
    int (*Win_test)(struct MPID_Win *, int *);

    /* Lock/unlock synchronization */
    int (*Win_lock)(int, int, int, struct MPID_Win *);
    int (*Win_unlock)(int, struct MPID_Win *);

    /* MPI-3 Functions */
    int (*Win_attach)(struct MPID_Win *, void *, MPI_Aint);
    int (*Win_detach)(struct MPID_Win *, const void *);
    int (*Win_shared_query)(struct MPID_Win *, int, MPI_Aint *, int *, void *);

    int (*Win_set_info)(struct MPID_Win *, MPID_Info *);
    int (*Win_get_info)(struct MPID_Win *, MPID_Info **);

    int (*Win_lock_all)(int, struct MPID_Win *);
    int (*Win_unlock_all)(struct MPID_Win *);

    int (*Win_flush)(int, struct MPID_Win *);
    int (*Win_flush_all)(struct MPID_Win *);
    int (*Win_flush_local)(int, struct MPID_Win *);
    int (*Win_flush_local_all)(struct MPID_Win *);
    int (*Win_sync)(struct MPID_Win *);

    int (*Get_accumulate)(const void *, int , MPI_Datatype, void *, int,
                          MPI_Datatype, int, MPI_Aint, int, MPI_Datatype, MPI_Op,
                          struct MPID_Win *);
    int (*Fetch_and_op)(const void *, void *, MPI_Datatype, int, MPI_Aint, MPI_Op,
                        struct MPID_Win *);
    int (*Compare_and_swap)(const void *, const void *, void *, MPI_Datatype, int,
                            MPI_Aint, struct MPID_Win *);

    /* Request-based variants: same leading arguments as the plain operation,
       plus a trailing request output pointer */
    int (*Rput)(const void *, int, MPI_Datatype, int, MPI_Aint, int, MPI_Datatype,
                struct MPID_Win *, MPID_Request**);
    int (*Rget)(void *, int, MPI_Datatype, int, MPI_Aint, int, MPI_Datatype,
                struct MPID_Win *, MPID_Request**);
    int (*Raccumulate)(const void *, int, MPI_Datatype, int, MPI_Aint, int,
                       MPI_Datatype, MPI_Op, struct MPID_Win *, MPID_Request**);
    int (*Rget_accumulate)(const void *, int , MPI_Datatype, void *, int,
                           MPI_Datatype, int, MPI_Aint, int, MPI_Datatype, MPI_Op,
                           struct MPID_Win *, MPID_Request**);

} MPID_RMAFns;
#define MPID_RMAFNS_VERSION 2
/* Note that the memory allocation/free routines do not take a window,
   so they must be initialized separately, and are a per-run, not per-window
   object.  If the device can manage different kinds of memory allocations,
   these routines must internally provide that flexibility. */
/*
    void *(*Alloc_mem)(size_t, MPID_Info *);
    int (*Free_mem)(void *);
*/
#endif

1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
/*S
  MPID_Win - Description of the Window Object data structure.

  Module:
  Win-DS

  Notes:
  The following 3 keyvals are defined for attributes on all MPI 
  Window objects\:
.vb
 MPI_WIN_SIZE
 MPI_WIN_BASE
 MPI_WIN_DISP_UNIT
.ve
  These correspond to the values in 'length', 'start_address', and 
  'disp_unit'.

  The communicator in the window is the same communicator that the user
  provided to 'MPI_Win_create' (not a dup).  However, each intracommunicator
  has a special context id that may be used if MPI communication is used 
  by the implementation to implement the RMA operations.

  There is no separate window group; the group of the communicator should be
  used.

  Question:
  Should a 'MPID_Win' be defined after 'MPID_Segment' in case the device 
  wants to 
  store a queue of pending put/get operations, described with 'MPID_Segment'
  (or 'MPID_Request')s?

  S*/
typedef struct MPID_Win {
1653
    MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
1654
1655
1656
1657
1658
1659
1660
1661
1662
    int fence_cnt;     /* 0 = no fence has been called; 
                          1 = fence has been called */ 
    MPID_Errhandler *errhandler;  /* Pointer to the error handler structure */
    void *base;
    MPI_Aint    size;        
    int          disp_unit;      /* Displacement unit of *local* window */
    MPID_Attribute *attributes;
    MPID_Group *start_group_ptr; /* group passed in MPI_Win_start */
    int start_assert;            /* assert passed to MPI_Win_start */
1663
1664
1665
    MPID_Comm *comm_ptr;         /* Pointer to comm of window (dup) */
    int         myrank;          /* Rank of this process in comm (used to 
				    detect operations on self) */
1666
1667
    int lockRank;                /* If within an MPI_Win_lock epoch, 
				    the rank that we locked */
1668
1669
#ifdef USE_THREADED_WINDOW_CODE
    /* These were causing compilation errors.  We need to figure out how to
1670
       integrate threads into MPICH before including these fields. */
1671
1672
1673
1674
1675
1676
1677
1678
1679
    /* FIXME: The test here should be within a test for threaded support */
#ifdef HAVE_PTHREAD_H
    pthread_t wait_thread_id; /* id of thread handling MPI_Win_wait */
    pthread_t passive_target_thread_id; /* thread for passive target RMA */
#elif defined(HAVE_WINTHREADS)
    HANDLE wait_thread_id;
    HANDLE passive_target_thread_id;
#endif
#endif
1680
1681
    /* */
#ifdef USE_MPID_RMA_TABLE
1682
    MPID_RMAFns RMAFns;
1683
#endif    
1684
1685
    /* These are COPIES of the values so that addresses to them
       can be returned as attributes.  They are initialized by the
1686
1687
1688
1689
1690
       MPI_Win_get_attr function.
     
       These values are constant for the lifetime of the window, so
       this is thread-safe.
     */
1691
1692
1693
1694
    int  copyDispUnit;
    MPI_Aint copySize;
    
    char          name[MPI_MAX_OBJECT_NAME];  
1695

1696
1697
1698
1699
    MPIR_Win_flavor_t create_flavor;
    MPIR_Win_model_t  model;
    MPIR_Win_flavor_t copyCreateFlavor;
    MPIR_Win_model_t  copyModel;
1700

1701
1702
1703
1704
1705
1706
1707
1708
1709
  /* Other, device-specific information */
#ifdef MPID_DEV_WIN_DECL
    MPID_DEV_WIN_DECL
#endif
} MPID_Win;
/* Object-allocation state for windows; only declared here, so the definition
   lives in some source file. */
extern MPIU_Object_alloc_t MPID_Win_mem;
/* Preallocated win objects */
extern MPID_Win MPID_Win_direct[];

1710
1711
1712
1713
1714
/* Special, negative lock-state values.  Per the original note, values
 * 0, 1, ... mean "locked"; presumably they are the locked rank (compare
 * 'lockRank' in MPID_Win) -- TODO confirm. */
enum MPID_Win_lock_states {
    MPID_WIN_STATE_UNLOCKED = -1,
    MPID_WIN_STATE_LOCKED_ALL = -2
    /* LOCKED = 0, 1, ... */
};
1715

1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
/* ------------------------------------------------------------------------- */
/* also in mpirma.h ?*/
/* ------------------------------------------------------------------------- */

/*
 * Good Memory (may be required for passive target operations on MPI_Win)
 */

/*@
  MPID_Alloc_mem - Allocate memory suitable for passive target RMA operations

  Input Parameter:
+ size - Number of bytes to allocate.
- info - Info object

  Return value:
  Pointer to the allocated memory.  If the memory is not available, 
  returns null.

  Notes:
  This routine is used to implement 'MPI_Alloc_mem'.  It is for that reason
  that there is no communicator argument.  

  This memory may `only` be freed with 'MPID_Free_mem'.

  This is a `local`, not a collective operation.  It functions more like a
  good form of 'malloc' than collective shared-memory allocators such as
  the 'shmalloc' found on SGI systems.

  Implementations of this routine may wish to use 'MPID_Memory_register'.  
  However, this routine has slightly different requirements, so a separate
  entry point is provided.

  Question:
  Since this takes an info object, should there be an error routine in the 
  case that the info object contains an error?

  Module:
  Win
  @*/
void *MPID_Alloc_mem( size_t size, MPID_Info *info );

/*@
  MPID_Free_mem - Frees memory allocated with 'MPID_Alloc_mem'

  Input Parameter:
. ptr - Pointer to memory allocated by 'MPID_Alloc_mem'.

  Return value:
  'MPI_SUCCESS' if memory was successfully freed; an MPI error code otherwise.

  Notes:
  The return value is provided because it may not be easy to validate the
  value of 'ptr' without attempting to free the memory.

  Module:
  Win
  @*/
int MPID_Free_mem( void *ptr );

/*@
  MPID_Mem_was_alloced - Return true if this memory was allocated with 
  'MPID_Alloc_mem'

  Input Parameters:
+ ptr  - Address of memory
- size - Size of region in bytes.

  Return value:
  True if the memory was allocated with 'MPID_Alloc_mem', false otherwise.

  Notes:
  This routine may be needed by 'MPI_Win_create' to ensure that the memory 
  for passive target RMA operations was allocated with 'MPI_Alloc_mem'.
  This may be used, for example, for ensuring that memory used with
  passive target operations was allocated with 'MPID_Alloc_mem'.

  NOTE(review): the prototype below takes only 'ptr'; the 'size' parameter
  documented above is not part of the declared signature.

  Module:
  Win
  @*/
int MPID_Mem_was_alloced( void *ptr );  /* brad : this isn't used or implemented anywhere */

/* ------------------------------------------------------------------------- */
/* end of also in mpirma.h ? */
/* ------------------------------------------------------------------------- */

/* ------------------------------------------------------------------------- */
/* Reduction and accumulate operations */
/*E
  MPID_Op_kind - Enumerates types of MPI_Op types

  Notes:
  These are needed for implementing 'MPI_Accumulate', since only predefined
  operations are allowed for that operation.  

  A gap in the enum values was made to allow additional predefined operations
  to be inserted.  This might include future additions to MPI or experimental
  extensions (such as a Read-Modify-Write operation).

  Module:
  Collective-DS
  E*/
typedef enum MPID_Op_kind {
    MPID_OP_NULL    = 0,
    MPID_OP_MAX     = 1,
    MPID_OP_MIN     = 2,
    MPID_OP_SUM     = 3,
    MPID_OP_PROD    = 4,
    MPID_OP_LAND    = 5,
    MPID_OP_BAND    = 6,
    MPID_OP_LOR     = 7,
    MPID_OP_BOR     = 8,
    MPID_OP_LXOR    = 9,
    MPID_OP_BXOR    = 10,
    MPID_OP_MAXLOC  = 11,
    MPID_OP_MINLOC  = 12,
    MPID_OP_REPLACE = 13,
    MPID_OP_NO_OP   = 14,
    /* 15..31: the gap left for additional predefined operations */
    MPID_OP_USER_NONCOMMUTE = 32,
    MPID_OP_USER            = 33
} MPID_Op_kind;

/*S
  MPID_User_function - Definition of a user function for MPI_Op types.

  Notes:
  This includes a 'const' to make clear which is the 'in' argument and 
  which the 'inout' argument, and to indicate that the 'count' and 'datatype'
  arguments are unchanged (they are addresses in an attempt to allow 
  interoperation with Fortran).  It includes 'restrict' to emphasize that 
  no overlapping operations are allowed.

  We need to include a Fortran version, since those arguments will
  have type 'MPI_Fint *' instead.  We also need to add a test to the
  test suite for this case; in fact, we need tests for each of the handle
  types to ensure that the transferred handle works correctly.

  This is part of the collective module because user-defined operations
  are valid only for the collective computation routines and not for 
  RMA accumulate.

  Yes, the 'restrict' is in the correct location.  C compilers that 
  support 'restrict' should be able to generate code that is as good as a
  Fortran compiler would for these functions.

  We should note on the manual pages for user-defined operations that
  'restrict' should be used when available, and that a cast may be 
  required when passing such a function to 'MPI_Op_create'.

  Question:
  Should each of these function types have an associated typedef?

  Should there be a C++ function here?

  Module:
  Collective-DS
  S*/
typedef union MPID_User_function {
    /* C form: (in, inout, count, datatype) */
    void (*c_function)(const void *in, void *inout,
                       const int *count, const MPI_Datatype *datatype);
    /* Fortran 77 form: same argument roles, count and datatype as MPI_Fint */
    void (*f77_function)(const void *in, void *inout,
                         const MPI_Fint *count, const MPI_Fint *datatype);
} MPID_User_function;
/* FIXME: Should there be "restrict" in the definitions above, e.g.,
   (*c_function)( const void restrict * , void restrict *, ... )? */

/*S
  MPID_Op - MPI_Op structure

  Notes:
  All of the predefined functions are commutative.  Only user functions may 
  be noncommutative, so there are two separate op types for commutative and
  non-commutative user-defined operations.

  Operations do not require reference counts because there are no nonblocking
  operations that accept user-defined operations.  Thus, there is no way that
  a valid program can free an 'MPI_Op' while it is in use.

  Module:
  Collective-DS
  S*/
typedef struct MPID_Op {
1887
     MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
1888
1889
1890
1891
     MPID_Op_kind       kind;
     MPID_Lang_t        language;
     MPID_User_function function;
  } MPID_Op;
1892
/* Length of the MPID_Op_builtin array */
#define MPID_OP_N_BUILTIN 15
1893
1894
1895
1896
/* Op object storage, only declared here: a fixed-size array with
   MPID_OP_N_BUILTIN entries, an array whose size is set by its definition,
   and the object-allocation state for ops. */
extern MPID_Op MPID_Op_builtin[MPID_OP_N_BUILTIN];
extern MPID_Op MPID_Op_direct[];
extern MPIU_Object_alloc_t MPID_Op_mem;

1897
1898
/* Adds one reference to the op _op by forwarding to MPIU_Object_add_ref. */
#define MPIR_Op_add_ref(_op) \
    do { MPIU_Object_add_ref(_op); } while (0)
/* Drops one reference on the op _op by forwarding to MPIU_Object_release_ref,
 * which reports through _inuse whether the object is still referenced.
 * Both macros are wrapped in do { } while (0) so that they behave as single
 * statements. */
#define MPIR_Op_release_ref( _op, _inuse ) \
    do { MPIU_Object_release_ref( _op, _inuse ); } while (0)
1901

1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
/* release and free-if-not-in-use helper.
 *
 * Drops one reference on op_p_ with MPIR_Op_release_ref and, when that
 * reports the object as no longer in use, hands it back to MPID_Op_mem via
 * MPIU_Handle_obj_free.  op_p_ appears twice in the expansion, so the
 * argument must not have side effects. */
#define MPIR_Op_release(op_p_)                           \
    do {                                                 \
        int in_use_;                                     \
        MPIR_Op_release_ref((op_p_), &in_use_);          \
        if (!in_use_) {                                  \
            MPIU_Handle_obj_free(&MPID_Op_mem, (op_p_)); \
        }                                                \
    } while (0)

1912
1913
1914
/* ------------------------------------------------------------------------- */

/* ------------------------------------------------------------------------- */
1915
/* mpicoll.h (in src/mpi/coll?) */
1916
1917
1918
1919
1920
1921
/* ------------------------------------------------------------------------- */

/* Collective operations */
typedef struct MPID_Collops {
    int ref_count;   /* Supports lazy copies */
    /* Contains pointers to the functions for the MPI collectives */
1922
    int (*Barrier) (MPID_Comm *, int *);
1923
    int (*Bcast) (void*, int, MPI_Datatype, int, MPID_Comm *, int *);
1924
    int (*Gather) (const void*, int, MPI_Datatype, void*, int, MPI_Datatype,
1925
                   int, MPID_Comm *, int *); 
1926
    int (*Gatherv) (const void*, int, MPI_Datatype, void*, const int *, const int *,
1927
                    MPI_Datatype, int, MPID_Comm *, int *); 
1928
    int (*Scatter) (const void*, int, MPI_Datatype, void*, int, MPI_Datatype,
1929
                    int, MPID_Comm *, int *);
1930
1931
1932
    int (*Scatterv) (const void*, const int *, const int *, MPI_Datatype,
                     void*, int, MPI_Datatype, int, MPID_Comm *, int *);
    int (*Allgather) (const void*, int, MPI_Datatype, void*, int,
1933
                      MPI_Datatype, MPID_Comm *, int *);
1934
1935
1936
    int (*Allgatherv) (const void*, int, MPI_Datatype, void*, const int *,
                       const int *, MPI_Datatype, MPID_Comm *, int *);
    int (*Alltoall) (const void*, int, MPI_Datatype, void*, int, MPI_Datatype,
1937
                               MPID_Comm *, int *);
1938
1939
1940
1941
1942
1943
    int (*Alltoallv) (const void*, const int *, const int *, MPI_Datatype,
                      void*, const int *, const int *, MPI_Datatype, MPID_Comm *,
                      int *);
    int (*Alltoallw) (const void*, const int *, const int *, const MPI_Datatype *, void*,
                      const int *, const int *, const MPI_Datatype *, MPID_Comm *, int *);
    int (*Reduce) (const void*, void*, int, MPI_Datatype, MPI_Op, int,
1944
                   MPID_Comm *, int *);
1945
    int (*Allreduce) (const void*, void*, int, MPI_Datatype, MPI_Op,
1946
                      MPID_Comm *, int *);
1947
    int (*Reduce_scatter) (const void*, void*, const int *, MPI_Datatype, MPI_Op,
1948
                           MPID_Comm *, int *);
1949
1950
    int (*Scan) (const void*, void*, int, MPI_Datatype, MPI_Op, MPID_Comm *, int * );
    int (*Exscan) (const void*, void*, int, MPI_Datatype, MPI_Op, MPID_Comm *, int * );
1951
    int (*Reduce_scatter_block) (const void*, void*, int, MPI_Datatype, MPI_Op,
1952
                           MPID_Comm *, int *);
1953
1954

    /* MPI-3 nonblocking collectives */
1955
1956
1957
    int (*Ibarrier)(MPID_Comm *comm_ptr, MPID_Sched_t s);
    int (*Ibcast)(void *buffer, int count, MPI_Datatype datatype, int root,
                  MPID_Comm *comm_ptr, MPID_Sched_t s);
1958
    int (*Igather)(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
1959
                   int recvcount, MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr,
1960
                   MPID_Sched_t s);
1961
1962
    int (*Igatherv)(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
                    const int *recvcounts, const int *displs, MPI_Datatype recvtype, int root,
1963
                    MPID_Comm *comm_ptr, MPID_Sched_t s);
1964
    int (*Iscatter)(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
1965
                    int recvcount, MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr,
1966
                    MPID_Sched_t s);
1967
1968
1969
1970
    int (*Iscatterv)(const void *sendbuf, const int *sendcounts, const int *displs,
                     MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype,
                     int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
    int (*<