/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */
#if !defined(MPICH_MPIDRMA_H_INCLUDED)
#define MPICH_MPIDRMA_H_INCLUDED

9
#include "mpl_utlist.h"
10
#include "mpidi_ch3_impl.h"
11

12
/* Kinds of RMA operation that can be recorded in an MPIDI_RMA_Op_t.
 * Every enumerator carries an explicit value; 26 is intentionally unused
 * (it belonged to the removed MPIDI_RMA_LOCK). */
typedef enum MPIDI_RMA_Op_type {
    MPIDI_RMA_PUT = 23,
    MPIDI_RMA_GET = 24,
    MPIDI_RMA_ACCUMULATE = 25,
    /* REMOVED: MPIDI_RMA_LOCK     = 26, */
    MPIDI_RMA_ACC_CONTIG = 27,
    MPIDI_RMA_GET_ACCUMULATE = 28,
    MPIDI_RMA_COMPARE_AND_SWAP = 29,
    MPIDI_RMA_FETCH_AND_OP = 30
} MPIDI_RMA_Op_type_t;

23 24
/* Special case RMA operations */

25
/* Tags for the two datatype categories named by the enumerators (basic and
 * derived); both values are explicit. */
enum MPIDI_RMA_Datatype {
    MPIDI_RMA_DATATYPE_BASIC = 50,
    MPIDI_RMA_DATATYPE_DERIVED = 51
};

Pavan Balaji's avatar
Pavan Balaji committed
30 31 32
/* We start with an arbitrarily chosen number (42), to help with
 * debugging when a lock state is not initialized or wrongly
 * initialized.  MPID_LOCK_SHARED_ALL has no explicit value and therefore
 * takes the next one (43). */
enum MPID_Lock_state {
    MPID_LOCK_NONE = 42,
    MPID_LOCK_SHARED_ALL
};

38 39 40 41
/*
 * RMA Declarations.  We should move these into something separate from
 * a Request.
 */
42

43
/* to send derived datatype across in RMA ops */
44 45 46 47 48 49 50 51 52 53 54 55
typedef struct MPIDI_RMA_dtype_info {   /* for derived datatypes */
    int is_contig;
    int max_contig_blocks;
    MPI_Aint size;
    MPI_Aint extent;
    int dataloop_size;          /* not needed because this info is sent in
                                 * packet header. remove it after lock/unlock
                                 * is implemented in the device */
    void *dataloop;             /* pointer needed to update pointers
                                 * within dataloop on remote side */
    int dataloop_depth;
    int eltype;
56 57 58 59 60
    MPI_Aint ub, lb, true_ub, true_lb;
    int has_sticky_ub, has_sticky_lb;
} MPIDI_RMA_dtype_info;

/* for keeping track of RMA ops, which will be executed at the next sync call */
61 62 63
typedef struct MPIDI_RMA_Op {
    struct MPIDI_RMA_Op *prev;  /* pointer to next element in list */
    struct MPIDI_RMA_Op *next;  /* pointer to next element in list */
64 65 66 67
    /* FIXME: It would be better to setup the packet that will be sent, at
     * least in most cases (if, as a result of the sync/ops/sync sequence,
     * a different packet type is needed, it can be extracted from the
     * information otherwise stored). */
68
    MPIDI_RMA_Op_type_t type;
69 70 71
    void *origin_addr;
    int origin_count;
    MPI_Datatype origin_datatype;
72 73 74 75
    int target_rank;
    MPI_Aint target_disp;
    int target_count;
    MPI_Datatype target_datatype;
76
    MPI_Op op;                  /* for accumulate */
77
    /* Used to complete operations */
Pavan Balaji's avatar
Pavan Balaji committed
78 79 80
    struct MPID_Request *request;
    MPIDI_RMA_dtype_info dtype_info;
    void *dataloop;
81 82 83 84 85 86
    void *result_addr;
    int result_count;
    MPI_Datatype result_datatype;
    void *compare_addr;
    int compare_count;
    MPI_Datatype compare_datatype;
87
} MPIDI_RMA_Op_t;
88 89

typedef struct MPIDI_PT_single_op {
90
    int type;                   /* put, get, or accum. */
91 92 93 94
    void *addr;
    int count;
    MPI_Datatype datatype;
    MPI_Op op;
95 96 97
    void *data;                 /* for queued puts and accumulates, data is copied here */
    MPI_Request request_handle; /* for gets */
    int data_recd;              /* to indicate if the data has been received */
98
    MPIDI_CH3_Pkt_flags_t flags;
99 100 101 102 103 104
} MPIDI_PT_single_op;

typedef struct MPIDI_Win_lock_queue {
    struct MPIDI_Win_lock_queue *next;
    int lock_type;
    MPI_Win source_win_handle;
105 106 107
    MPIDI_VC_t *vc;
    struct MPIDI_PT_single_op *pt_single_op;    /* to store info for
                                                 * lock-put-unlock optimization */
108
} MPIDI_Win_lock_queue;
109 110

/* Routine used to tune RMA optimizations */
void MPIDI_CH3_RMA_SetAccImmed(int flag);
112

113 114
/*** RMA OPS LIST HELPER ROUTINES ***/

115
/* An RMA ops list is represented by a pointer to its first MPIDI_RMA_Op_t
 * element; a NULL pointer is an empty list. */
typedef MPIDI_RMA_Op_t *MPIDI_RMA_Ops_list_t;
116

117 118 119
/* Return nonzero if the RMA operations list is empty.
 */
#undef FUNCNAME
120
#define FUNCNAME MPIDI_CH3I_RMA_Ops_isempty
121 122
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
123
static inline int MPIDI_CH3I_RMA_Ops_isempty(MPIDI_RMA_Ops_list_t * list)
124
{
125 126 127 128 129 130 131 132 133 134
    return *list == NULL;
}


/* Return a pointer to the first element in the list.
 */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Ops_head
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
135
static inline MPIDI_RMA_Op_t *MPIDI_CH3I_RMA_Ops_head(MPIDI_RMA_Ops_list_t * list)
136 137 138 139 140 141 142 143 144 145 146
{
    return *list;
}


/* Return a pointer to the last element in the list.
 */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Ops_tail
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
147
static inline MPIDI_RMA_Op_t *MPIDI_CH3I_RMA_Ops_tail(MPIDI_RMA_Ops_list_t * list)
148 149 150 151 152 153 154 155 156 157 158 159 160 161
{
    return (*list) ? (*list)->prev : NULL;
}


/* Append an element to the tail of the RMA ops list
 *
 * @param IN    list      Pointer to the RMA ops list
 * @param IN    elem      Pointer to the element to be appended
 */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Ops_append
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
162
static inline void MPIDI_CH3I_RMA_Ops_append(MPIDI_RMA_Ops_list_t * list, MPIDI_RMA_Op_t * elem)
163 164
{
    MPL_DL_APPEND(*list, elem);
165 166 167 168 169
}


/* Allocate a new element on the tail of the RMA operations list.
 *
170 171
 * @param IN    list      Pointer to the RMA ops list
 * @param OUT   new_ptr   Pointer to the element that was allocated
172 173 174
 * @return                MPI error class
 */
#undef FUNCNAME
175
#define FUNCNAME MPIDI_CH3I_RMA_Ops_alloc_tail
176 177
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
178 179
static inline int MPIDI_CH3I_RMA_Ops_alloc_tail(MPIDI_RMA_Ops_list_t * list,
                                                MPIDI_RMA_Op_t ** new_elem)
180 181
{
    int mpi_errno = MPI_SUCCESS;
182
    MPIDI_RMA_Op_t *tmp_ptr;
183 184 185
    MPIU_CHKPMEM_DECL(1);

    /* FIXME: We should use a pool allocator here */
186
    MPIU_CHKPMEM_MALLOC(tmp_ptr, MPIDI_RMA_Op_t *, sizeof(MPIDI_RMA_Op_t),
187 188 189 190
                        mpi_errno, "RMA operation entry");

    tmp_ptr->next = NULL;
    tmp_ptr->dataloop = NULL;
191
    tmp_ptr->request = NULL;
192

193
    MPL_DL_APPEND(*list, tmp_ptr);
194

195
    *new_elem = tmp_ptr;
196

197
  fn_exit:
198 199
    MPIU_CHKPMEM_COMMIT();
    return mpi_errno;
200
  fn_fail:
201
    MPIU_CHKPMEM_REAP();
202
    *new_elem = NULL;
203 204 205 206
    goto fn_exit;
}


207
/* Unlink an element from the RMA ops list
208
 *
209 210
 * @param IN    list      Pointer to the RMA ops list
 * @param IN    elem      Pointer to the element to be unlinked
211 212
 */
#undef FUNCNAME
213
#define FUNCNAME MPIDI_CH3I_RMA_Ops_unlink
214 215
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
216
static inline void MPIDI_CH3I_RMA_Ops_unlink(MPIDI_RMA_Ops_list_t * list, MPIDI_RMA_Op_t * elem)
217
{
218 219
    MPL_DL_DELETE(*list, elem);
}
220 221


222 223 224 225 226 227 228 229 230
/* Free an element in the RMA operations list.
 *
 * @param IN    list      Pointer to the RMA ops list
 * @param IN    curr_ptr  Pointer to the element to be freed.
 */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Ops_free_elem
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
231 232
static inline void MPIDI_CH3I_RMA_Ops_free_elem(MPIDI_RMA_Ops_list_t * list,
                                                MPIDI_RMA_Op_t * curr_ptr)
233
{
234
    MPIDI_RMA_Op_t *tmp_ptr = curr_ptr;
235

236
    MPIU_Assert(curr_ptr != NULL);
237

238
    MPL_DL_DELETE(*list, curr_ptr);
239 240 241 242

    /* Check if we allocated a dataloop for this op (see send/recv_rma_msg) */
    if (tmp_ptr->dataloop != NULL)
        MPIU_Free(tmp_ptr->dataloop);
243
    MPIU_Free(tmp_ptr);
244 245
}

246

247 248 249 250 251 252
/* Free an element in the RMA operations list.
 *
 * @param IN    list      Pointer to the RMA ops list
 * @param INOUT curr_ptr  Pointer to the element to be freed.  Will be updated
 *                        to point to the element following the element that
 *                        was freed.
253 254
 */
#undef FUNCNAME
255
#define FUNCNAME MPIDI_CH3I_RMA_Ops_free_and_next
256 257
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
258 259
static inline void MPIDI_CH3I_RMA_Ops_free_and_next(MPIDI_RMA_Ops_list_t * list,
                                                    MPIDI_RMA_Op_t ** curr_ptr)
260
{
261
    MPIDI_RMA_Op_t *next_ptr = (*curr_ptr)->next;
262

263 264 265
    MPIDI_CH3I_RMA_Ops_free_elem(list, *curr_ptr);
    *curr_ptr = next_ptr;
}
266 267


268 269 270 271 272 273
/* Free the entire RMA operations list.
 */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Ops_free
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
274
static inline void MPIDI_CH3I_RMA_Ops_free(MPIDI_RMA_Ops_list_t * list)
275
{
276
    MPIDI_RMA_Op_t *curr_ptr, *tmp_ptr;
277

278 279 280
    MPL_DL_FOREACH_SAFE(*list, curr_ptr, tmp_ptr) {
        MPIDI_CH3I_RMA_Ops_free_elem(list, curr_ptr);
    }
281 282 283
}


284 285 286 287
/* Retrieve the RMA ops list pointer from the window.  This routine detects
 * whether we are in an active or passive target epoch and returns the correct
 * ops list; we use a shared list for active target and separate per-target
 * lists for passive target.
288 289
 */
#undef FUNCNAME
290
#define FUNCNAME MPIDI_CH3I_RMA_Get_ops_list
291 292
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
293
static inline MPIDI_RMA_Ops_list_t *MPIDI_CH3I_RMA_Get_ops_list(MPID_Win * win_ptr, int target)
294
{
295
    if (win_ptr->epoch_state == MPIDI_EPOCH_FENCE ||
296
        win_ptr->epoch_state == MPIDI_EPOCH_START || win_ptr->epoch_state == MPIDI_EPOCH_PSCW) {
297 298 299 300
        return &win_ptr->at_rma_ops_list;
    }
    else {
        return &win_ptr->targets[target].rma_ops_list;
301 302 303
    }
}

304

305 306 307 308 309 310
/* ------------------------------------------------------------------------ */
/*
 * Followings are new routines for origin completion for RMA operations.
 */
/* ------------------------------------------------------------------------ */

311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340
/* Copy `count` elements of C type `type` from `src` to `dest` by elementwise
 * assignment, then jump to the enclosing function's `fn_exit` label (the
 * macro can only be used inside a function that defines that label).
 *
 * Every argument is parenthesized so that expression arguments (for example
 * `buf + off` or `cond ? a : b`) are cast and compared as a whole, and the
 * loop index has a macro-private name so that it cannot capture a caller
 * variable named `i`. */
#define ASSIGN_COPY(src, dest, count, type)                             \
    {                                                                   \
        type *src_ = (type *) (src);                                    \
        type *dest_ = (type *) (dest);                                  \
        int assign_copy_i_;                                             \
        for (assign_copy_i_ = 0; assign_copy_i_ < (count); assign_copy_i_++) \
            dest_[assign_copy_i_] = src_[assign_copy_i_];               \
        goto fn_exit;                                                   \
    }

/* Copy data between two local buffers.
 *
 * When both datatypes are predefined and scount does not exceed
 * SHM_OPS_THRESHOLD, and stype is one of the C types handled in the switch
 * below, the data is copied with a loop of element assignments (ASSIGN_COPY,
 * which jumps straight to fn_exit, so the cases never fall through).  On that
 * fast path only src, dest, scount and stype are used; dcount and dtype are
 * not consulted beyond the "is predefined" check.  In every other case the
 * copy is delegated to MPIR_Localcopy.
 *
 * @param IN    src, scount, stype    Source buffer, element count, datatype
 * @param OUT   dest                  Destination buffer
 * @param IN    dcount, dtype         Destination element count and datatype
 * @return                            MPI_SUCCESS or the error from
 *                                    MPIR_Localcopy
 */
static inline int shm_copy(const void *src, int scount, MPI_Datatype stype,
                           void *dest, int dcount, MPI_Datatype dtype)
{
    int mpi_errno = MPI_SUCCESS;

    /* We use a threshold of operations under which a for loop of assignments is
     * used.  Even though this happens at smaller block lengths, making it
     * potentially inefficient, it can take advantage of some vectorization
     * available on most modern processors. */
#define SHM_OPS_THRESHOLD  (16)

    if (MPIR_DATATYPE_IS_PREDEFINED(stype) && MPIR_DATATYPE_IS_PREDEFINED(dtype) &&
        scount <= SHM_OPS_THRESHOLD) {

        /* FIXME: We currently only optimize a few predefined datatypes, which
         * have a direct C datatype mapping. */

        /* The below list of datatypes is based on those specified in the MPI-3
         * standard on page 665. */
        switch (stype) {
        case MPI_CHAR:
            ASSIGN_COPY(src, dest, scount, char);

        case MPI_SHORT:
            ASSIGN_COPY(src, dest, scount, signed short int);

        case MPI_INT:
            ASSIGN_COPY(src, dest, scount, signed int);

        case MPI_LONG:
            ASSIGN_COPY(src, dest, scount, signed long int);

        case MPI_LONG_LONG_INT:        /* covers MPI_LONG_LONG too */
            ASSIGN_COPY(src, dest, scount, signed long long int);

        case MPI_SIGNED_CHAR:
            ASSIGN_COPY(src, dest, scount, signed char);

        case MPI_UNSIGNED_CHAR:
            ASSIGN_COPY(src, dest, scount, unsigned char);

        case MPI_UNSIGNED_SHORT:
            ASSIGN_COPY(src, dest, scount, unsigned short int);

        case MPI_UNSIGNED:
            ASSIGN_COPY(src, dest, scount, unsigned int);

        case MPI_UNSIGNED_LONG:
            ASSIGN_COPY(src, dest, scount, unsigned long int);

        case MPI_UNSIGNED_LONG_LONG:
            ASSIGN_COPY(src, dest, scount, unsigned long long int);

        case MPI_FLOAT:
            ASSIGN_COPY(src, dest, scount, float);

        case MPI_DOUBLE:
            ASSIGN_COPY(src, dest, scount, double);

        case MPI_LONG_DOUBLE:
            ASSIGN_COPY(src, dest, scount, long double);

#if 0
            /* FIXME: we need a configure check to define HAVE_WCHAR_T before
             * this can be enabled */
        case MPI_WCHAR:
            ASSIGN_COPY(src, dest, scount, wchar_t);
#endif

#if 0
            /* FIXME: we need a configure check to define HAVE_C_BOOL before
             * this can be enabled */
        case MPI_C_BOOL:
            ASSIGN_COPY(src, dest, scount, _Bool);
#endif

#if HAVE_INT8_T
        case MPI_INT8_T:
            ASSIGN_COPY(src, dest, scount, int8_t);
#endif /* HAVE_INT8_T */

#if HAVE_INT16_T
        case MPI_INT16_T:
            ASSIGN_COPY(src, dest, scount, int16_t);
#endif /* HAVE_INT16_T */

#if HAVE_INT32_T
        case MPI_INT32_T:
            ASSIGN_COPY(src, dest, scount, int32_t);
#endif /* HAVE_INT32_T */

#if HAVE_INT64_T
        case MPI_INT64_T:
            ASSIGN_COPY(src, dest, scount, int64_t);
#endif /* HAVE_INT64_T */

#if HAVE_UINT8_T
        case MPI_UINT8_T:
            ASSIGN_COPY(src, dest, scount, uint8_t);
#endif /* HAVE_UINT8_T */

#if HAVE_UINT16_T
        case MPI_UINT16_T:
            ASSIGN_COPY(src, dest, scount, uint16_t);
#endif /* HAVE_UINT16_T */

#if HAVE_UINT32_T
        case MPI_UINT32_T:
            ASSIGN_COPY(src, dest, scount, uint32_t);
#endif /* HAVE_UINT32_T */

#if HAVE_UINT64_T
        case MPI_UINT64_T:
            ASSIGN_COPY(src, dest, scount, uint64_t);
#endif /* HAVE_UINT64_T */

        case MPI_AINT:
            ASSIGN_COPY(src, dest, scount, MPI_Aint);

        case MPI_COUNT:
            ASSIGN_COPY(src, dest, scount, MPI_Count);

        case MPI_OFFSET:
            ASSIGN_COPY(src, dest, scount, MPI_Offset);

#if 0
            /* FIXME: we need a configure check to define HAVE_C_COMPLEX before
             * this can be enabled */
        case MPI_C_COMPLEX:    /* covers MPI_C_FLOAT_COMPLEX as well */
            ASSIGN_COPY(src, dest, scount, float _Complex);
#endif

#if 0
            /* FIXME: we need a configure check to define HAVE_C_DOUBLE_COMPLEX
             * before this can be enabled */
        case MPI_C_DOUBLE_COMPLEX:
            ASSIGN_COPY(src, dest, scount, double _Complex);
#endif

#if 0
            /* FIXME: we need a configure check to define
             * HAVE_C_LONG_DOUBLE_COMPLEX before this can be enabled */
        case MPI_C_LONG_DOUBLE_COMPLEX:
            ASSIGN_COPY(src, dest, scount, long double _Complex);
#endif

#if 0
            /* Types that don't have a direct equivalent */
        case MPI_BYTE:
        case MPI_PACKED:
#endif

#if 0   /* Fortran types */
        case MPI_INTEGER:
        case MPI_REAL:
        case MPI_DOUBLE_PRECISION:
        case MPI_COMPLEX:
        case MPI_LOGICAL:
        case MPI_CHARACTER:
#endif

#if 0   /* C++ types */
        case MPI_CXX_BOOL:
        case MPI_CXX_FLOAT_COMPLEX:
        case MPI_CXX_DOUBLE_COMPLEX:
        case MPI_CXX_LONG_DOUBLE_COMPLEX:
#endif

#if 0   /* Optional Fortran types */
        case MPI_DOUBLE_COMPLEX:
        case MPI_INTEGER1:
        case MPI_INTEGER2:
        case MPI_INTEGER4:
        case MPI_INTEGER8:
        case MPI_INTEGER16:
        case MPI_REAL2:
        case MPI_REAL4:
        case MPI_REAL8:
        case MPI_REAL16:
        case MPI_COMPLEX4:
        case MPI_COMPLEX8:
        case MPI_COMPLEX16:
        case MPI_COMPLEX32:
#endif

#if 0   /* C datatypes for reduction functions */
        case MPI_FLOAT_INT:
        case MPI_DOUBLE_INT:
        case MPI_LONG_INT:
        case MPI_2INT:
        case MPI_LONG_DOUBLE_INT:
#endif

#if 0   /* Fortran datatypes for reduction functions */
        case MPI_2REAL:
        case MPI_2DOUBLE_PRECISION:
        case MPI_2INTEGER:
#endif

#if 0   /* Random types not present in the standard */
        case MPI_2COMPLEX:
        case MPI_2DOUBLE_COMPLEX:
#endif

        default:
            /* Just to make sure the switch statement is not empty */
            ;
        }
    }

    /* General path: anything not handled by the assignment loop above */
    mpi_errno = MPIR_Localcopy(src, scount, stype, dest, dcount, dtype);
    if (mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

  fn_exit:
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

544 545 546 547 548 549 550
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Shm_put_op
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Shm_put_op(const void *origin_addr, int origin_count, MPI_Datatype
                                        origin_datatype, int target_rank, MPI_Aint target_disp,
                                        int target_count, MPI_Datatype target_datatype,
551
                                        MPID_Win * win_ptr)
552 553 554 555 556 557 558 559
{
    int mpi_errno = MPI_SUCCESS;
    void *base = NULL;
    int disp_unit;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_PUT_OP);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_PUT_OP);

560
    if (win_ptr->shm_allocated == TRUE) {
561 562 563 564 565 566 567 568
        base = win_ptr->shm_base_addrs[target_rank];
        disp_unit = win_ptr->disp_units[target_rank];
    }
    else {
        base = win_ptr->base;
        disp_unit = win_ptr->disp_unit;
    }

569 570
    mpi_errno = shm_copy(origin_addr, origin_count, origin_datatype,
                         (char *) base + disp_unit * target_disp, target_count, target_datatype);
571 572 573
    if (mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }
574

575
  fn_exit:
576 577 578
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SHM_PUT_OP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
579
  fn_fail:
580 581 582 583 584 585 586 587 588 589 590 591
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Shm_acc_op
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Shm_acc_op(const void *origin_addr, int origin_count, MPI_Datatype
                                        origin_datatype, int target_rank, MPI_Aint target_disp,
                                        int target_count, MPI_Datatype target_datatype, MPI_Op op,
592
                                        MPID_Win * win_ptr)
593 594 595 596 597 598 599 600 601 602 603
{
    void *base = NULL;
    int disp_unit, shm_op = 0;
    MPI_User_function *uop = NULL;
    MPID_Datatype *dtp;
    int mpi_errno = MPI_SUCCESS;
    MPIU_CHKLMEM_DECL(2);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_ACC_OP);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_ACC_OP);

604
    if (win_ptr->shm_allocated == TRUE) {
605 606 607 608 609 610 611 612 613
        shm_op = 1;
        base = win_ptr->shm_base_addrs[target_rank];
        disp_unit = win_ptr->disp_units[target_rank];
    }
    else {
        base = win_ptr->base;
        disp_unit = win_ptr->disp_unit;
    }

614 615 616
    if (op == MPI_REPLACE) {
        if (shm_op)
            MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
617
        mpi_errno = shm_copy(origin_addr, origin_count, origin_datatype,
618 619 620 621 622 623 624
                             (char *) base + disp_unit * target_disp, target_count,
                             target_datatype);
        if (shm_op)
            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
        if (mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }
625 626 627 628
        goto fn_exit;
    }

    MPIU_ERR_CHKANDJUMP1((HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN),
629
                         mpi_errno, MPI_ERR_OP, "**opnotpredefined", "**opnotpredefined %d", op);
630 631 632 633

    /* get the function by indexing into the op table */
    uop = MPIR_OP_HDL_TO_FN(op);

634 635
    if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
        MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
636 637
        /* Cast away const'ness for origin_address in order to
         * avoid changing the prototype for MPI_User_function */
638 639 640 641 642 643
        if (shm_op)
            MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
        (*uop) ((void *) origin_addr, (char *) base + disp_unit * target_disp,
                &target_count, &target_datatype);
        if (shm_op)
            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
644
    }
645
    else {
646 647 648 649 650 651 652 653
        /* derived datatype */

        MPID_Segment *segp;
        DLOOP_VECTOR *dloop_vec;
        MPI_Aint first, last;
        int vec_len, i, type_size, count;
        MPI_Datatype type;
        MPI_Aint true_lb, true_extent, extent;
654
        void *tmp_buf = NULL, *target_buf;
655 656
        const void *source_buf;

657
        if (origin_datatype != target_datatype) {
658
            /* first copy the data into a temporary buffer with
659 660
             * the same datatype as the target. Then do the
             * accumulate operation. */
661 662 663 664 665

            MPIR_Type_get_true_extent_impl(target_datatype, &true_lb, &true_extent);
            MPID_Datatype_get_extent_macro(target_datatype, extent);

            MPIU_CHKLMEM_MALLOC(tmp_buf, void *,
666
                                target_count * (MPIR_MAX(extent, true_extent)),
667 668
                                mpi_errno, "temporary buffer");
            /* adjust for potential negative lower bound in datatype */
669
            tmp_buf = (void *) ((char *) tmp_buf - true_lb);
670 671

            mpi_errno = MPIR_Localcopy(origin_addr, origin_count,
672 673 674 675
                                       origin_datatype, tmp_buf, target_count, target_datatype);
            if (mpi_errno) {
                MPIU_ERR_POP(mpi_errno);
            }
676 677
        }

678
        if (MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
679 680
            /* target predefined type, origin derived datatype */

681 682 683 684 685 686
            if (shm_op)
                MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
            (*uop) (tmp_buf, (char *) base + disp_unit * target_disp,
                    &target_count, &target_datatype);
            if (shm_op)
                MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
687 688 689 690 691
        }
        else {

            segp = MPID_Segment_alloc();
            MPIU_ERR_CHKANDJUMP1((!segp), mpi_errno, MPI_ERR_OTHER,
692
                                 "**nomem", "**nomem %s", "MPID_Segment_alloc");
693 694
            MPID_Segment_init(NULL, target_count, target_datatype, segp, 0);
            first = 0;
695
            last = SEGMENT_IGNORE_LAST;
696 697 698 699 700

            MPID_Datatype_get_ptr(target_datatype, dtp);
            vec_len = dtp->max_contig_blocks * target_count + 1;
            /* +1 needed because Rob says so */
            MPIU_CHKLMEM_MALLOC(dloop_vec, DLOOP_VECTOR *,
701
                                vec_len * sizeof(DLOOP_VECTOR), mpi_errno, "dloop vector");
702 703 704

            MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len);

705
            source_buf = (tmp_buf != NULL) ? (const void *) tmp_buf : origin_addr;
706 707 708
            target_buf = (char *) base + disp_unit * target_disp;
            type = dtp->eltype;
            type_size = MPID_Datatype_get_basic_size(type);
709 710 711 712 713 714 715 716
            if (shm_op)
                MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
            for (i = 0; i < vec_len; i++) {
                MPIU_Assign_trunc(count, (dloop_vec[i].DLOOP_VECTOR_LEN) / type_size, int);

                (*uop) ((char *) source_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
                        (char *) target_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
                        &count, &type);
717
            }
718 719
            if (shm_op)
                MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
720 721 722 723 724

            MPID_Segment_free(segp);
        }
    }

725
  fn_exit:
726 727 728 729
    MPIU_CHKLMEM_FREEALL();
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SHM_ACC_OP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
730 731 732
  fn_fail:
    if (shm_op)
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
733 734 735 736 737 738 739 740 741 742 743 744
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


/* Perform a get-accumulate directly on the target's window memory.
 *
 * The target address is base + disp_unit * target_disp.  When
 * win_ptr->shm_allocated is TRUE, base/disp_unit are taken from the per-rank
 * shm tables and the SHM mutex is held from before the get until after the
 * accumulate.
 *
 * The current target contents are first copied into the result buffer with
 * shm_copy.  Then:
 *  - MPI_NO_OP: nothing else is done.
 *  - MPI_REPLACE: the origin data is copied to the target with shm_copy.
 *  - Any other op must be a builtin op (otherwise MPI_ERR_OP is returned) and
 *    is applied as in MPIDI_CH3I_Shm_acc_op, going through a temporary buffer
 *    when origin and target datatypes differ and block by block for a derived
 *    target datatype.
 *
 * The mutex (when held) and the segment (when allocated) are released at
 * fn_exit, which every success and error path goes through.
 *
 * @return MPI_SUCCESS or an MPI error code
 */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Shm_get_acc_op
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Shm_get_acc_op(const void *origin_addr, int origin_count, MPI_Datatype
                                            origin_datatype, void *result_addr, int result_count,
                                            MPI_Datatype result_datatype, int target_rank, MPI_Aint
                                            target_disp, int target_count,
                                            MPI_Datatype target_datatype, MPI_Op op,
                                            MPID_Win * win_ptr)
{
    int disp_unit, shm_locked = 0;
    void *base = NULL;
    MPI_User_function *uop = NULL;
    MPID_Datatype *dtp;
    MPID_Segment *segp = NULL;
    int mpi_errno = MPI_SUCCESS;
    MPIU_CHKLMEM_DECL(2);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_GET_ACC_OP);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_GET_ACC_OP);

    if (win_ptr->shm_allocated == TRUE) {
        base = win_ptr->shm_base_addrs[target_rank];
        disp_unit = win_ptr->disp_units[target_rank];
        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
        shm_locked = 1;
    }
    else {
        base = win_ptr->base;
        disp_unit = win_ptr->disp_unit;
    }

    /* Perform the local get first, then the accumulate */
    mpi_errno = shm_copy((char *) base + disp_unit * target_disp, target_count, target_datatype,
                         result_addr, result_count, result_datatype);
    if (mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    /* NO_OP: Don't perform the accumulate */
    if (op == MPI_NO_OP) {
        goto fn_exit;
    }

    if (op == MPI_REPLACE) {
        mpi_errno = shm_copy(origin_addr, origin_count, origin_datatype,
                             (char *) base + disp_unit * target_disp, target_count,
                             target_datatype);

        if (mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }

        goto fn_exit;
    }

    MPIU_ERR_CHKANDJUMP1((HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN),
                         mpi_errno, MPI_ERR_OP, "**opnotpredefined", "**opnotpredefined %d", op);

    /* get the function by indexing into the op table */
    uop = MPIR_OP_HDL_TO_FN(op);

    /* MPI_NO_OP has already returned above, so only the datatypes matter here */
    if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
        MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
        /* Cast away const'ness for origin_address in order to
         * avoid changing the prototype for MPI_User_function */
        (*uop) ((void *) origin_addr, (char *) base + disp_unit * target_disp,
                &target_count, &target_datatype);
    }
    else {
        /* derived datatype */

        DLOOP_VECTOR *dloop_vec;
        MPI_Aint first, last;
        int vec_len, i, type_size, count;
        MPI_Datatype type;
        MPI_Aint true_lb, true_extent, extent;
        void *tmp_buf = NULL, *target_buf;
        const void *source_buf;

        if (origin_datatype != target_datatype) {
            /* first copy the data into a temporary buffer with
             * the same datatype as the target. Then do the
             * accumulate operation. */

            MPIR_Type_get_true_extent_impl(target_datatype, &true_lb, &true_extent);
            MPID_Datatype_get_extent_macro(target_datatype, extent);

            MPIU_CHKLMEM_MALLOC(tmp_buf, void *,
                                target_count * (MPIR_MAX(extent, true_extent)),
                                mpi_errno, "temporary buffer");
            /* adjust for potential negative lower bound in datatype */
            tmp_buf = (void *) ((char *) tmp_buf - true_lb);

            mpi_errno = MPIR_Localcopy(origin_addr, origin_count,
                                       origin_datatype, tmp_buf, target_count, target_datatype);
            if (mpi_errno) {
                MPIU_ERR_POP(mpi_errno);
            }
        }

        if (MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            /* target predefined type, origin derived datatype */

            (*uop) (tmp_buf, (char *) base + disp_unit * target_disp,
                    &target_count, &target_datatype);
        }
        else {

            segp = MPID_Segment_alloc();
            MPIU_ERR_CHKANDJUMP1((!segp), mpi_errno, MPI_ERR_OTHER,
                                 "**nomem", "**nomem %s", "MPID_Segment_alloc");
            MPID_Segment_init(NULL, target_count, target_datatype, segp, 0);
            first = 0;
            last = SEGMENT_IGNORE_LAST;

            MPID_Datatype_get_ptr(target_datatype, dtp);
            vec_len = dtp->max_contig_blocks * target_count + 1;
            /* +1 needed because Rob says so */
            MPIU_CHKLMEM_MALLOC(dloop_vec, DLOOP_VECTOR *,
                                vec_len * sizeof(DLOOP_VECTOR), mpi_errno, "dloop vector");

            MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len);

            source_buf = (tmp_buf != NULL) ? (const void *) tmp_buf : origin_addr;
            target_buf = (char *) base + disp_unit * target_disp;
            type = dtp->eltype;
            type_size = MPID_Datatype_get_basic_size(type);

            for (i = 0; i < vec_len; i++) {
                MPIU_Assign_trunc(count, (dloop_vec[i].DLOOP_VECTOR_LEN) / type_size, int);
                (*uop) ((char *) source_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
                        (char *) target_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
                        &count, &type);
            }
        }
    }

  fn_exit:
    /* Single release point for the mutex: reached by every success and
     * error path, and done first so the lock is not held during cleanup. */
    if (shm_locked) {
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
        shm_locked = 0;
    }
    /* Released here (not inline) so the segment is also freed on error paths */
    if (segp != NULL)
        MPID_Segment_free(segp);
    MPIU_CHKLMEM_FREEALL();
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SHM_GET_ACC_OP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Shm_get_op
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
912 913 914 915
/*
 * MPIDI_CH3I_Shm_get_op - perform a GET by copying directly out of the
 * target's window memory into the origin buffer.
 *
 * The target location is computed as base + disp_unit * target_disp, where
 * base and disp_unit are taken from the per-rank arrays
 * win_ptr->shm_base_addrs[] / win_ptr->disp_units[] when
 * win_ptr->shm_allocated is TRUE, and from win_ptr->base /
 * win_ptr->disp_unit otherwise.  The data movement itself is delegated to
 * shm_copy(), with the target region as the source and the origin buffer
 * as the destination.
 *
 * No lock is taken in this function.
 *
 * Returns MPI_SUCCESS, or a non-success mpi_errno if shm_copy() reports an
 * error.
 */
static inline int MPIDI_CH3I_Shm_get_op(void *origin_addr, int origin_count,
                                        MPI_Datatype origin_datatype, int target_rank,
                                        MPI_Aint target_disp, int target_count,
                                        MPI_Datatype target_datatype, MPID_Win * win_ptr)
{
    void *base = NULL;
    int disp_unit;
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_GET_OP);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_GET_OP);

    if (win_ptr->shm_allocated == TRUE) {
        /* Shared-memory window: address the target rank's segment. */
        base = win_ptr->shm_base_addrs[target_rank];
        disp_unit = win_ptr->disp_units[target_rank];
    }
    else {
        /* NOTE(review): target_rank is not consulted on this path; presumably
         * callers only reach it when the target is the calling process --
         * confirm against the call sites. */
        base = win_ptr->base;
        disp_unit = win_ptr->disp_unit;
    }

    /* GET direction: target window memory is the source, origin is the
     * destination. */
    mpi_errno = shm_copy((char *) base + disp_unit * target_disp, target_count, target_datatype,
                         origin_addr, origin_count, origin_datatype);
    if (mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SHM_GET_OP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Shm_cas_op
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Shm_cas_op(const void *origin_addr, const void *compare_addr,
                                        void *result_addr, MPI_Datatype datatype, int target_rank,
955
                                        MPI_Aint target_disp, MPID_Win * win_ptr)
956 957 958
{
    void *base = NULL, *dest_addr = NULL;
    int disp_unit;
959 960
    MPI_Aint len;
    int shm_locked = 0;
961 962 963 964 965
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_CAS_OP);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_CAS_OP);

966
    if (win_ptr->shm_allocated == TRUE) {
967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991
        base = win_ptr->shm_base_addrs[target_rank];
        disp_unit = win_ptr->disp_units[target_rank];

        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
        shm_locked = 1;
    }
    else {
        base = win_ptr->base;
        disp_unit = win_ptr->disp_unit;
    }

    dest_addr = (char *) base + disp_unit * target_disp;

    MPID_Datatype_get_size_macro(datatype, len);
    MPIU_Memcpy(result_addr, dest_addr, len);

    if (MPIR_Compare_equal(compare_addr, dest_addr, datatype)) {
        MPIU_Memcpy(dest_addr, origin_addr, len);
    }

    if (shm_locked) {
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
        shm_locked = 0;
    }

992
  fn_exit:
993 994 995
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SHM_CAS_OP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
996
  fn_fail:
997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010
    if (shm_locked) {
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
    }
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Shm_fop_op
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Shm_fop_op(const void *origin_addr, void *result_addr,
                                        MPI_Datatype datatype, int target_rank,
1011
                                        MPI_Aint target_disp, MPI_Op op, MPID_Win * win_ptr)
1012 1013 1014 1015
{
    void *base = NULL, *dest_addr = NULL;
    MPI_User_function *uop = NULL;
    int disp_unit;
1016 1017
    MPI_Aint len;
    int one, shm_locked = 0;
1018 1019 1020 1021 1022
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_FOP_OP);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_FOP_OP);

1023
    if (win_ptr->shm_allocated == TRUE) {
1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042
        base = win_ptr->shm_base_addrs[target_rank];
        disp_unit = win_ptr->disp_units[target_rank];

        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
        shm_locked = 1;
    }
    else {
        base =