ch3u_rma_sync.c 204 KB
Newer Older
1
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
2 3 4 5 6 7 8 9
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidimpl.h"
#include "mpidrma.h"

10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

categories:
    - name        : CH3
      description : cvars that control behavior of ch3

cvars:
    - name        : MPIR_CVAR_CH3_RMA_ACC_IMMED
      category    : CH3
      type        : boolean
      default     : true
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
        Use the immediate accumulate optimization

    - name        : MPIR_CVAR_CH3_RMA_NREQUEST_THRESHOLD
      category    : CH3
      type        : int
      default     : 4000
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
        Threshold at which the RMA implementation attempts to complete requests
        while completing RMA operations and while using the lazy synchronization
        approach.  Change this value if programs fail because they run out of
        requests or other internal resources

    - name        : MPIR_CVAR_CH3_RMA_NREQUEST_NEW_THRESHOLD
      category    : CH3
      type        : int
44
      default     : 0
45 46 47 48 49 50 51 52 53
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
        Threshold for the number of new requests since the last attempt to
        complete pending requests.  Higher values can increase performance,
        but may run the risk of exceeding the available number of requests
        or other internal resources.

54 55 56
    - name        : MPIR_CVAR_CH3_RMA_GC_NUM_COMPLETED
      category    : CH3
      type        : int
57
      default     : (-1)
58 59 60 61 62 63 64
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
        Threshold for the number of completed requests the runtime finds
        before it stops trying to find more completed requests in garbage
        collection function.
65 66 67 68
        Note that it works with MPIR_CVAR_CH3_RMA_GC_NUM_TESTED as an OR
        relation, which means runtime will stop checking when either one
        of its following conditions is satisfied or one of conditions of
        MPIR_CVAR_CH3_RMA_GC_NUM_TESTED is satisfied.
69 70 71 72 73 74 75 76 77 78 79
        When it is set to negative value, it means runtime will not stop
        checking the operation list until it reaches the end of the list.
        When it is set to positive value, it means runtime will not stop
        checking the operation list until it finds certain number of
        completed requests. When it is set to zero value, the outcome is
        undefined.
        Note that in garbage collection function, if runtime finds a chain
        of completed RMA requests, it will temporarily ignore this CVAR
        and try to find continuous completed requests as many as possible,
        until it meets an incomplete request.

80 81 82
    - name        : MPIR_CVAR_CH3_RMA_GC_NUM_TESTED
      category    : CH3
      type        : int
83
      default     : 100
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
        Threshold for the number of RMA requests the runtime tests before
        it stops trying to check more requests in garbage collection
        routine.
        Note that it works with MPIR_CVAR_CH3_RMA_GC_NUM_COMPLETED as an
        OR relation, which means runtime will stop checking when either
        one of its following conditions is satisfied or one of conditions
        of MPIR_CVAR_CH3_RMA_GC_NUM_COMPLETED is satisfied.
        When it is set to negative value, runtime will not stop checking
        operation list until runtime reaches the end of the list. It has
        the risk of O(N) traversing overhead if there is no completed
        request in the list. When it is set to positive value, it means
        runtime will not stop checking the operation list until it visits
        such number of requests. Higher values may make more completed
        requests to be found, but it has the risk of visiting too many
        requests, leading to significant performance overhead. When it is
        set to zero value, runtime will stop checking the operation list
        immediately, which may cause weird performance in practice.
        Note that in garbage collection function, if runtime finds a chain
        of completed RMA requests, it will temporarily ignore this CVAR and
        try to find continuous completed requests as many as possible, until
        it meets an incomplete request.

110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
    - name        : MPIR_CVAR_CH3_RMA_LOCK_IMMED
      category    : CH3
      type        : boolean
      default     : false
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
        Issue a request for the passive target RMA lock immediately.  Default
        behavior is to defer the lock request until the call to MPI_Win_unlock.

    - name        : MPIR_CVAR_CH3_RMA_MERGE_LOCK_OP_UNLOCK
      category    : CH3
      type        : boolean
      default     : true
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
        Enable/disable an optimization that merges lock, op, and unlock
        messages, for single-operation passive target epochs.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336
/* Notes for memory barriers in RMA synchronizations

   When SHM is allocated for RMA window, we need to add memory barriers at proper
   places in RMA synchronization routines to guarantee the ordering of read/write
   operations, so that any operations after synchronization calls will see the
   correct data.

   There are four kinds of operations involved in the following explanation:

   1. Local loads/stores: any operations happening outside RMA epoch and accessing
      each process's own window memory.

   2. SHM operations: any operations happening inside RMA epoch. They may access
      any processes' window memory, which include direct loads/stores, and
      RMA operations that are internally implemented as direct loads/stores in
      MPI implementation.

   3. PROC_SYNC: synchronizations among processes by sending/receiving messages.

   4. MEM_SYNC: a full memory barrier. It ensures the ordering of read/write
      operations on each process.

   (1) FENCE synchronization

              RANK 0                           RANK 1

       (local loads/stores)             (local loads/stores)

           WIN_FENCE {                    WIN_FENCE {
               MEM_SYNC                       MEM_SYNC
               PROC_SYNC -------------------- PROC_SYNC
               MEM_SYNC                       MEM_SYNC
           }                              }

        (SHM operations)                  (SHM operations)

           WIN_FENCE {                     WIN_FENCE {
               MEM_SYNC                        MEM_SYNC
               PROC_SYNC --------------------- PROC_SYNC
               MEM_SYNC                        MEM_SYNC
           }                               }

      (local loads/stores)              (local loads/stores)

       We need MEM_SYNC before and after PROC_SYNC for both starting WIN_FENCE
       and ending WIN_FENCE, to ensure the ordering between local loads/stores
       and PROC_SYNC in starting WIN_FENCE (and vice versa in ending WIN_FENCE),
       and the ordering between PROC_SYNC and SHM operations in starting WIN_FENCE
       (and vice versa for ending WIN_FENCE).

       In starting WIN_FENCE, the MEM_SYNC before PROC_SYNC essentially exposes
       previous local loads/stores to other processes; after PROC_SYNC, each
       process knows that everyone else already exposed their local loads/stores;
       the MEM_SYNC after PROC_SYNC ensures that my following SHM operations will
       happen after PROC_SYNC and will see the latest data on other processes.

       In ending WIN_FENCE, the MEM_SYNC before PROC_SYNC essentially exposes
       previous SHM operations to other processes; after PROC_SYNC, each process
       knows everyone else already exposed their SHM operations; the MEM_SYNC
       after PROC_SYNC ensures that my following local loads/stores will happen
       after PROC_SYNC and will see the latest data in my memory region.

   (2) POST-START-COMPLETE-WAIT synchronization

              RANK 0                           RANK 1

                                          (local loads/stores)

           WIN_START {                      WIN_POST {
                                                MEM_SYNC
               PROC_SYNC ---------------------- PROC_SYNC
               MEM_SYNC
           }                                }

         (SHM operations)

           WIN_COMPLETE {                  WIN_WAIT/TEST {
               MEM_SYNC
               PROC_SYNC --------------------- PROC_SYNC
                                               MEM_SYNC
           }                               }

                                          (local loads/stores)

       We need MEM_SYNC before PROC_SYNC for WIN_POST and WIN_COMPLETE, and
       MEM_SYNC after PROC_SYNC in WIN_START and WIN_WAIT/TEST, to ensure the
       ordering between local loads/stores and PROC_SYNC in WIN_POST (and
       vice versa in WIN_WAIT/TEST), and the ordering between PROC_SYNC and SHM
       operations in WIN_START (and vice versa in WIN_COMPLETE).

       In WIN_POST, the MEM_SYNC before PROC_SYNC essentially exposes previous
       local loads/stores to group of origin processes; after PROC_SYNC, origin
       processes know that all target processes have already exposed their local
       loads/stores; in WIN_START, the MEM_SYNC after PROC_SYNC ensures that
       following SHM operations will happen after PROC_SYNC and will see the
       latest data on target processes.

       In WIN_COMPLETE, the MEM_SYNC before PROC_SYNC essentially exposes previous
       SHM operations to group of target processes; after PROC_SYNC, target
       processes know that all origin processes have already exposed their SHM operations;
       in WIN_WAIT/TEST, the MEM_SYNC after PROC_SYNC ensures that following local
       loads/stores will happen after PROC_SYNC and will see the latest data in
       my memory region.

   (3) Passive target synchronization

              RANK 0                          RANK 1

                                        WIN_LOCK(target=1) {
                                            PROC_SYNC (lock granted)
                                            MEM_SYNC
                                        }

                                        (SHM operations)

                                        WIN_UNLOCK(target=1) {
                                            MEM_SYNC
                                            PROC_SYNC (lock released)
                                        }

         PROC_SYNC -------------------- PROC_SYNC

         WIN_LOCK (target=1) {
             PROC_SYNC (lock granted)
             MEM_SYNC
         }

         (SHM operations)

         WIN_UNLOCK (target=1) {
             MEM_SYNC
             PROC_SYNC (lock released)
         }

         PROC_SYNC -------------------- PROC_SYNC

                                        WIN_LOCK(target=1) {
                                            PROC_SYNC (lock granted)
                                            MEM_SYNC
                                        }

                                        (SHM operations)

                                        WIN_UNLOCK(target=1) {
                                            MEM_SYNC
                                            PROC_SYNC (lock released)
                                        }

         We need MEM_SYNC after PROC_SYNC in WIN_LOCK, and MEM_SYNC before
         PROC_SYNC in WIN_UNLOCK, to ensure the ordering between SHM operations
         and PROC_SYNC and vice versa.

         In WIN_LOCK, the MEM_SYNC after PROC_SYNC guarantees two things:
         (a) it guarantees that following SHM operations will happen after
         lock is granted; (b) it guarantees that following SHM operations
         will happen after any PROC_SYNC with target before WIN_LOCK is called,
         which means those SHM operations will see the latest data on target
         process.

         In WIN_UNLOCK, the MEM_SYNC before PROC_SYNC also guarantees two
         things: (a) it guarantees that SHM operations will happen before
         lock is released; (b) it guarantees that SHM operations will happen
         before any PROC_SYNC with target after WIN_UNLOCK is returned, which
         means following SHM operations on that target will see the latest data.

         WIN_LOCK_ALL/UNLOCK_ALL are same with WIN_LOCK/UNLOCK.

              RANK 0                          RANK 1

         WIN_LOCK_ALL

         (SHM operations)

         WIN_FLUSH(target=1) {
             MEM_SYNC
         }

         PROC_SYNC ------------------------PROC_SYNC

                                           WIN_LOCK(target=1) {
                                               PROC_SYNC (lock granted)
                                               MEM_SYNC
                                           }

                                           (SHM operations)

                                           WIN_UNLOCK(target=1) {
                                               MEM_SYNC
                                               PROC_SYNC (lock released)
                                           }

         WIN_UNLOCK_ALL

         We need MEM_SYNC in WIN_FLUSH to ensure the ordering between SHM
         operations and PROC_SYNC.

         The MEM_SYNC in WIN_FLUSH guarantees that all SHM operations before
         this WIN_FLUSH will happen before any PROC_SYNC with target after
         this WIN_FLUSH, which means SHM operations on target process after
         PROC_SYNC with origin will see the latest data.
*/

/* Initialize MPI_T performance variables (PVARs) for the CH3 RMA layer.
   Currently no PVARs are registered, so this is an intentionally empty
   placeholder hook called during device initialization. */
void MPIDI_CH3_RMA_Init_Pvars(void)
{
}

/* These are used to use a common routine to complete lists of RMA
   operations with a single routine, while collecting data that
   distinguishes between different synchronization modes.  This is not
   thread-safe; the best choice for thread-safety is to eliminate this
   ability to discriminate between the different types of RMA synchronization.
*/

/*
 * These routines provide a default implementation of the MPI RMA operations
 * in terms of the low-level, two-sided channel operations.  A channel
 * may override these functions, on a per-window basis, by overriding
 * the MPID functions in the RMAFns section of MPID_Win object.
 */

/* Tag used for the sync message sent at MPI_Win_post time. */
#define SYNC_POST_TAG 100

/* Passive-target lock protocol helpers. */
static int send_lock_msg(int dest, int lock_type, MPID_Win * win_ptr);
static int send_unlock_msg(int dest, MPID_Win * win_ptr);
/* static int send_flush_msg(int dest, MPID_Win *win_ptr); */
static int wait_for_lock_granted(MPID_Win * win_ptr, int target_rank);
static int acquire_local_lock(MPID_Win * win_ptr, int lock_mode);

/* Issue a single queued RMA operation over the two-sided channel.
   send_rma_msg handles origin->target data movement (PUT/ACC),
   recv_rma_msg handles target->origin data movement (GET). */
static int send_rma_msg(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
                        MPIDI_CH3_Pkt_flags_t flags,
                        MPI_Win source_win_handle,
                        MPI_Win target_win_handle,
                        MPIDI_RMA_dtype_info * dtype_info,
                        void **dataloop, MPID_Request ** request);
static int recv_rma_msg(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
                        MPIDI_CH3_Pkt_flags_t flags,
                        MPI_Win source_win_handle,
                        MPI_Win target_win_handle,
                        MPIDI_RMA_dtype_info * dtype_info,
                        void **dataloop, MPID_Request ** request);
static int send_contig_acc_msg(MPIDI_RMA_Op_t *, MPID_Win *,
                               MPIDI_CH3_Pkt_flags_t flags, MPI_Win, MPI_Win, MPID_Request **);
static int send_immed_rmw_msg(MPIDI_RMA_Op_t *, MPID_Win *,
                              MPIDI_CH3_Pkt_flags_t flags, MPI_Win, MPI_Win, MPID_Request **);
static int do_passive_target_rma(MPID_Win * win_ptr, int target_rank,
                                 int *wait_for_rma_done_pkt, MPIDI_CH3_Pkt_flags_t sync_flags);
static int send_lock_put_or_acc(MPID_Win *, int);
static int send_lock_get(MPID_Win *, int);

/* Progress / request garbage-collection helpers used while completing
   long operation lists. */
static inline int poke_progress_engine(void);
static inline int rma_list_complete(MPID_Win * win_ptr, MPIDI_RMA_Ops_list_t * ops_list);
static inline int rma_list_gc(MPID_Win * win_ptr,
                              MPIDI_RMA_Ops_list_t * ops_list,
                              MPIDI_RMA_Op_t * last_elm, int *nDone);

static int create_datatype(const MPIDI_RMA_dtype_info * dtype_info,
                           const void *dataloop, MPI_Aint dataloop_sz,
                           const void *o_addr, int o_count,
                           MPI_Datatype o_datatype, MPID_Datatype ** combined_dtp);
/* Issue the queued RMA operation (op_ptr_) to its target over the channel,
   dispatching on the operation type.  On failure the macro jumps to the
   enclosing function's fn_fail label via MPIU_ERR_POP/MPIU_ERR_SETANDJUMP,
   so it may only be used inside functions that define that label.
   A GET_ACCUMULATE with MPI_NO_OP is rewritten in place into a plain GET
   (origin fields are redirected to the result buffer) before issuing. */
#define MPIDI_CH3I_ISSUE_RMA_OP(op_ptr_, win_ptr_, flags_, source_win_handle_, target_win_handle_,err_) \
    do {                                                                \
        switch ((op_ptr_)->type)                                        \
        {                                                               \
        case (MPIDI_RMA_PUT):                                           \
        case (MPIDI_RMA_ACCUMULATE):                                    \
            (err_) = send_rma_msg((op_ptr_), (win_ptr_), (flags_), (source_win_handle_), \
                                  (target_win_handle_), &(op_ptr_)->dtype_info, \
                                  &(op_ptr_)->dataloop, &(op_ptr_)->request); \
            if (err_) { MPIU_ERR_POP(err_); }                           \
            break;                                                      \
        case (MPIDI_RMA_GET_ACCUMULATE):                                \
            if ((op_ptr_)->op == MPI_NO_OP) {                           \
                /* Note: Origin arguments are ignored for NO_OP, so we don't \
                 * need to release a ref to the origin datatype. */     \
                                                                        \
                /* Convert the GAcc to a Get */                         \
                (op_ptr_)->type            = MPIDI_RMA_GET;             \
                (op_ptr_)->origin_addr     = (op_ptr_)->result_addr;    \
                (op_ptr_)->origin_count    = (op_ptr_)->result_count;   \
                (op_ptr_)->origin_datatype = (op_ptr_)->result_datatype; \
                                                                        \
                (err_) = recv_rma_msg((op_ptr_), (win_ptr_), (flags_), (source_win_handle_), \
                                      (target_win_handle_), &(op_ptr_)->dtype_info, \
                                      &(op_ptr_)->dataloop, &(op_ptr_)->request); \
            } else {                                                    \
                (err_) = send_rma_msg((op_ptr_), (win_ptr_), (flags_), (source_win_handle_), \
                                      (target_win_handle_), &(op_ptr_)->dtype_info, \
                                      &(op_ptr_)->dataloop, &(op_ptr_)->request); \
            }                                                           \
            if (err_) { MPIU_ERR_POP(err_); }                           \
            break;                                                      \
        case MPIDI_RMA_ACC_CONTIG:                                      \
            (err_) = send_contig_acc_msg((op_ptr_), (win_ptr_), (flags_), \
                                         (source_win_handle_), (target_win_handle_), \
                                         &(op_ptr_)->request);          \
            if (err_) { MPIU_ERR_POP(err_); }                           \
            break;                                                      \
        case (MPIDI_RMA_GET):                                           \
            (err_) = recv_rma_msg((op_ptr_), (win_ptr_), (flags_),      \
                                  (source_win_handle_), (target_win_handle_), \
                                  &(op_ptr_)->dtype_info,               \
                                  &(op_ptr_)->dataloop, &(op_ptr_)->request); \
            if (err_) { MPIU_ERR_POP(err_); }                           \
            break;                                                      \
        case (MPIDI_RMA_COMPARE_AND_SWAP):                              \
        case (MPIDI_RMA_FETCH_AND_OP):                                  \
            (err_) = send_immed_rmw_msg((op_ptr_), (win_ptr_), (flags_), \
                                        (source_win_handle_), (target_win_handle_), \
                                        &(op_ptr_)->request);           \
            if (err_) { MPIU_ERR_POP(err_); }                           \
            break;                                                      \
                                                                        \
        default:                                                        \
            MPIU_ERR_SETANDJUMP(err_,MPI_ERR_OTHER,"**winInvalidOp");   \
        }                                                               \
    } while (0)


#undef FUNCNAME
#define FUNCNAME MPIDI_Win_fence
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/* MPIDI_Win_fence - device-level implementation of MPI_Win_fence.
 *
 * assert  - MPI assertion bits (MPI_MODE_NOPRECEDE / MPI_MODE_NOSUCCEED, ...)
 * win_ptr - the window being synchronized
 *
 * Returns MPI_SUCCESS or an MPI error code.
 *
 * On a "closing" fence (i.e. one preceded by an earlier fence that opened
 * an epoch), this issues all queued active-target RMA operations, uses a
 * reduce_scatter_block to learn how many processes target this rank, and
 * waits on the window's at_completion_counter for all incoming operations
 * to complete.  Memory barriers (OPA_read_write_barrier) are inserted when
 * the window uses shared memory -- see the large comment block above for
 * the ordering argument.
 */
int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int comm_size;
    int *rma_target_proc, *nops_to_proc, i, total_op_count, *curr_ops_cnt;
    MPIDI_RMA_Op_t *curr_ptr;
    MPIDI_RMA_Ops_list_t *ops_list;
    MPID_Comm *comm_ptr;
    MPI_Win source_win_handle, target_win_handle;
    MPID_Progress_state progress_state;
    int errflag = FALSE;
    MPIU_CHKLMEM_DECL(3);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FENCE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FENCE);

    /* A fence is only legal outside any epoch or inside a fence epoch. */
    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_NONE &&
                        win_ptr->epoch_state != MPIDI_EPOCH_FENCE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    /* Note that the NOPRECEDE and NOSUCCEED must be specified by all processes
     * in the window's group if any specify it */
    if (assert & MPI_MODE_NOPRECEDE) {
        /* Error: Operations were issued and the user claimed NOPRECEDE */
        MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_FENCE,
                            mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

        /* Nothing precedes this fence; just record whether it opens an
         * epoch (no NOSUCCEED) and skip straight to the SHM barrier. */
        win_ptr->fence_issued = (assert & MPI_MODE_NOSUCCEED) ? 0 : 1;
        goto shm_barrier;
    }

    if (win_ptr->fence_issued == 0) {
        /* win_ptr->fence_issued == 0 means either this is the very first
         * call to fence or the preceding fence had the
         * MPI_MODE_NOSUCCEED assert.
         *
         * If this fence has MPI_MODE_NOSUCCEED, do nothing and return.
         * Otherwise just increment the fence count and return. */

        if (!(assert & MPI_MODE_NOSUCCEED))
            win_ptr->fence_issued = 1;
    }
    else {
        /* Counters used to throttle the request garbage collector:
         * nRequest is the number of outstanding issued requests,
         * nRequestNew is its value at the last GC attempt. */
        int nRequest = 0;
        int nRequestNew = 0;

        /* Ensure ordering of load/store operations. */
        if (win_ptr->shm_allocated == TRUE) {
            OPA_read_write_barrier();
        }

        /* This is the second or later fence. Do all the preceding RMA ops. */
        comm_ptr = win_ptr->comm_ptr;
        /* First inform every process whether it is a target of RMA
         * ops from this process */
        comm_size = comm_ptr->local_size;

        MPIU_CHKLMEM_MALLOC(rma_target_proc, int *, comm_size * sizeof(int),
                            mpi_errno, "rma_target_proc");
        for (i = 0; i < comm_size; i++)
            rma_target_proc[i] = 0;

        /* keep track of no. of ops to each proc. Needed for knowing
         * whether or not to decrement the completion counter. The
         * completion counter is decremented only on the last
         * operation. */
        MPIU_CHKLMEM_MALLOC(nops_to_proc, int *, comm_size * sizeof(int),
                            mpi_errno, "nops_to_proc");
        for (i = 0; i < comm_size; i++)
            nops_to_proc[i] = 0;

        /* Note, active target uses the following ops list, and passive
         * target uses win_ptr->targets[..] */
        ops_list = &win_ptr->at_rma_ops_list;

        /* set rma_target_proc[i] to 1 if rank i is a target of RMA
         * ops from this process */
        total_op_count = 0;
        curr_ptr = MPIDI_CH3I_RMA_Ops_head(ops_list);
        while (curr_ptr != NULL) {
            total_op_count++;
            rma_target_proc[curr_ptr->target_rank] = 1;
            nops_to_proc[curr_ptr->target_rank]++;
            curr_ptr = curr_ptr->next;
        }

        MPIU_CHKLMEM_MALLOC(curr_ops_cnt, int *, comm_size * sizeof(int),
                            mpi_errno, "curr_ops_cnt");
        for (i = 0; i < comm_size; i++)
            curr_ops_cnt[i] = 0;
        /* do a reduce_scatter_block (with MPI_SUM) on rma_target_proc.
         * As a result,
         * each process knows how many other processes will be doing
         * RMA ops on its window */

        /* first initialize the completion counter. */
        win_ptr->at_completion_counter += comm_size;

        mpi_errno = MPIR_Reduce_scatter_block_impl(MPI_IN_PLACE, rma_target_proc, 1,
                                                   MPI_INT, MPI_SUM, comm_ptr, &errflag);
        /* result is stored in rma_target_proc[0] */
        if (mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }
        MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* Ensure ordering of load/store operations. */
        if (win_ptr->shm_allocated == TRUE) {
            OPA_read_write_barrier();
        }

        /* Set the completion counter */
        /* FIXME: MT: this needs to be done atomically because other
         * procs have the address and could decrement it. */
        win_ptr->at_completion_counter -= comm_size;
        win_ptr->at_completion_counter += rma_target_proc[0];

        i = 0;
        curr_ptr = MPIDI_CH3I_RMA_Ops_head(ops_list);
        while (curr_ptr != NULL) {
            MPIDI_CH3_Pkt_flags_t flags = MPIDI_CH3_PKT_FLAG_NONE;

            /* The completion counter at the target is decremented only on
             * the last RMA operation. */
            if (curr_ops_cnt[curr_ptr->target_rank] == nops_to_proc[curr_ptr->target_rank] - 1) {
                flags = MPIDI_CH3_PKT_FLAG_RMA_AT_COMPLETE;
            }

            source_win_handle = win_ptr->handle;
            target_win_handle = win_ptr->all_win_handles[curr_ptr->target_rank];

            /* May jump to fn_fail on error (see macro definition). */
            MPIDI_CH3I_ISSUE_RMA_OP(curr_ptr, win_ptr, flags,
                                    source_win_handle, target_win_handle, mpi_errno);

            i++;
            curr_ops_cnt[curr_ptr->target_rank]++;
            /* If the request is null, we can remove it immediately */
            if (!curr_ptr->request) {
                MPIDI_CH3I_RMA_Ops_free_and_next(ops_list, &curr_ptr);
            }
            else {
                nRequest++;
                curr_ptr = curr_ptr->next;
                /* The test on the difference is to reduce the number
                 * of times the partial complete routine is called. Without
                 * this, significant overhead is added once the
                 * number of requests exceeds the threshold, since the
                 * number that are completed in a call may be small. */
                if (nRequest > MPIR_CVAR_CH3_RMA_NREQUEST_THRESHOLD &&
                    nRequest - nRequestNew > MPIR_CVAR_CH3_RMA_NREQUEST_NEW_THRESHOLD) {
                    int nDone = 0;
                    mpi_errno = poke_progress_engine();
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);

                    /* Reclaim completed requests so we don't exhaust
                     * internal request resources on long op lists. */
                    mpi_errno = rma_list_gc(win_ptr, ops_list, curr_ptr, &nDone);
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);
                    nRequest -= nDone;
                    nRequestNew = nRequest;
                }
            }
        }

        /* We replaced a loop over an array of requests with a list of the
         * incomplete requests.  The reason to do
         * that is for long lists - processing the entire list until
         * all are done introduces a potentially n^2 time.  In
         * testing with test/mpi/perf/manyrma.c , the number of iterations
         * within the "while (total_op_count) was O(total_op_count).
         *
         * Another alternative is to create a more compressed list (storing
         * only the necessary information, reducing the number of cache lines
         * needed while looping through the requests.
         */
        if (total_op_count) {
            mpi_errno = rma_list_complete(win_ptr, ops_list);
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }

        /* MT: avoid processing unissued operations enqueued by other threads
           in rma_list_complete() */
        curr_ptr = MPIDI_CH3I_RMA_Ops_head(ops_list);
        if (curr_ptr && !curr_ptr->request)
            goto finish_up;
        MPIU_Assert(MPIDI_CH3I_RMA_Ops_isempty(ops_list));

 finish_up:
        /* wait for all operations from other processes to finish */
        if (win_ptr->at_completion_counter) {
            MPID_Progress_start(&progress_state);
            while (win_ptr->at_completion_counter) {
                mpi_errno = MPID_Progress_wait(&progress_state);
                /* --BEGIN ERROR HANDLING-- */
                if (mpi_errno != MPI_SUCCESS) {
                    MPID_Progress_end(&progress_state);
                    MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
                }
                /* --END ERROR HANDLING-- */
            }
            MPID_Progress_end(&progress_state);
        }

        /* NOSUCCEED means this fence closes the epoch without opening one. */
        if (assert & MPI_MODE_NOSUCCEED) {
            win_ptr->fence_issued = 0;
        }

        win_ptr->epoch_state = MPIDI_EPOCH_NONE;
    }

 shm_barrier:
    if (!(assert & MPI_MODE_NOSUCCEED)) {
        /* In a FENCE without MPI_MODE_NOSUCCEED (which means this FENCE
           might start a new Active epoch), if SHM is allocated, perform
           a barrier among processes on the same node, to prevent one
           process modifying another process's memory before that process
           starts an epoch. */

        if (win_ptr->shm_allocated == TRUE) {
            MPID_Comm *node_comm_ptr = win_ptr->comm_ptr->node_comm;

            /* Ensure ordering of load/store operations. */
            OPA_read_write_barrier();

            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno) {goto fn_fail;}

            /* Ensure ordering of load/store operations. */
            OPA_read_write_barrier();
        }
    }

  fn_exit:
    MPIU_CHKLMEM_FREEALL();
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FENCE);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

700 701 702 703 704 705
/* create_datatype() creates a new struct datatype for the dtype_info
   and the dataloop of the target datatype together with the user data */
#undef FUNCNAME
#define FUNCNAME create_datatype
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
706
static int create_datatype(const MPIDI_RMA_dtype_info * dtype_info,
707 708
                           const void *dataloop, MPI_Aint dataloop_sz,
                           const void *o_addr, int o_count, MPI_Datatype o_datatype,
709
                           MPID_Datatype ** combined_dtp)
710 711 712
{
    int mpi_errno = MPI_SUCCESS;
    /* datatype_set_contents wants an array 'ints' which is the
713 714 715
     * blocklens array with count prepended to it.  So blocklens
     * points to the 2nd element of ints to avoid having to copy
     * blocklens into ints later. */
716 717 718 719 720 721 722 723 724 725 726 727 728 729
    int ints[4];
    int *blocklens = &ints[1];
    MPI_Aint displaces[3];
    MPI_Datatype datatypes[3];
    const int count = 3;
    MPI_Datatype combined_datatype;
    MPIDI_STATE_DECL(MPID_STATE_CREATE_DATATYPE);

    MPIDI_FUNC_ENTER(MPID_STATE_CREATE_DATATYPE);

    /* create datatype */
    displaces[0] = MPIU_PtrToAint(dtype_info);
    blocklens[0] = sizeof(*dtype_info);
    datatypes[0] = MPI_BYTE;
730

731
    displaces[1] = MPIU_PtrToAint(dataloop);
732
    MPIU_Assign_trunc(blocklens[1], dataloop_sz, int);
733
    datatypes[1] = MPI_BYTE;
734

735 736 737
    displaces[2] = MPIU_PtrToAint(o_addr);
    blocklens[2] = o_count;
    datatypes[2] = o_datatype;
738 739 740 741 742

    mpi_errno = MPID_Type_struct(count, blocklens, displaces, datatypes, &combined_datatype);
    if (mpi_errno)
        MPIU_ERR_POP(mpi_errno);

743 744
    ints[0] = count;

745 746 747 748 749 750 751
    MPID_Datatype_get_ptr(combined_datatype, *combined_dtp);
    mpi_errno = MPID_Datatype_set_contents(*combined_dtp, MPI_COMBINER_STRUCT, count + 1,       /* ints (cnt,blklen) */
                                           count,       /* aints (disps) */
                                           count,       /* types */
                                           ints, displaces, datatypes);
    if (mpi_errno)
        MPIU_ERR_POP(mpi_errno);
752 753

    /* Commit datatype */
754

755 756 757
    MPID_Dataloop_create(combined_datatype,
                         &(*combined_dtp)->dataloop,
                         &(*combined_dtp)->dataloop_size,
758 759
                         &(*combined_dtp)->dataloop_depth, MPID_DATALOOP_HOMOGENEOUS);

760 761 762 763
    /* create heterogeneous dataloop */
    MPID_Dataloop_create(combined_datatype,
                         &(*combined_dtp)->hetero_dloop,
                         &(*combined_dtp)->hetero_dloop_size,
764 765 766
                         &(*combined_dtp)->hetero_dloop_depth, MPID_DATALOOP_HETEROGENEOUS);

  fn_exit:
767 768
    MPIDI_FUNC_EXIT(MPID_STATE_CREATE_DATATYPE);
    return mpi_errno;
769
  fn_fail:
770 771 772
    goto fn_exit;
}

773 774

#undef FUNCNAME
Xin Zhao's avatar
Xin Zhao committed
775
#define FUNCNAME send_rma_msg
776 777
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
778 779 780 781 782
static int send_rma_msg(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
                        MPIDI_CH3_Pkt_flags_t flags,
                        MPI_Win source_win_handle,
                        MPI_Win target_win_handle,
                        MPIDI_RMA_dtype_info * dtype_info, void **dataloop, MPID_Request ** request)
783
{
784 785 786
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_put_t *put_pkt = &upkt.put;
    MPIDI_CH3_Pkt_accum_t *accum_pkt = &upkt.accum;
787
    MPID_IOV iov[MPID_IOV_LIMIT];
788
    int mpi_errno = MPI_SUCCESS;
789 790
    int origin_dt_derived, target_dt_derived, iovcnt;
    MPI_Aint origin_type_size;
791
    MPIDI_VC_t *vc;
792
    MPID_Comm *comm_ptr;
793 794
    MPID_Datatype *target_dtp = NULL, *origin_dtp = NULL;
    MPID_Request *resp_req = NULL;
795
    MPIU_CHKPMEM_DECL(1);
Xin Zhao's avatar
Xin Zhao committed
796
    MPIDI_STATE_DECL(MPID_STATE_SEND_RMA_MSG);
797 798
    MPIDI_STATE_DECL(MPID_STATE_MEMCPY);

Xin Zhao's avatar
Xin Zhao committed
799
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_SEND_RMA_MSG);
800

801
    *request = NULL;
802

803
    if (rma_op->type == MPIDI_RMA_PUT) {
804 805 806
        MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
        put_pkt->addr = (char *) win_ptr->base_addrs[rma_op->target_rank] +
            win_ptr->disp_units[rma_op->target_rank] * rma_op->target_disp;
807
        put_pkt->flags = flags;
808 809 810 811 812
        put_pkt->count = rma_op->target_count;
        put_pkt->datatype = rma_op->target_datatype;
        put_pkt->dataloop_size = 0;
        put_pkt->target_win_handle = target_win_handle;
        put_pkt->source_win_handle = source_win_handle;
813

814 815 816
        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) put_pkt;
        iov[0].MPID_IOV_LEN = sizeof(*put_pkt);
    }
817
    else if (rma_op->type == MPIDI_RMA_GET_ACCUMULATE) {
818
        /* Create a request for the GACC response.  Store the response buf, count, and
819 820
         * datatype in it, and pass the request's handle in the GACC packet. When the
         * response comes from the target, it will contain the request handle. */
821 822 823 824 825 826 827 828
        resp_req = MPID_Request_create();
        MPIU_ERR_CHKANDJUMP(resp_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");

        MPIU_Object_set_ref(resp_req, 2);

        resp_req->dev.user_buf = rma_op->result_addr;
        resp_req->dev.user_count = rma_op->result_count;
        resp_req->dev.datatype = rma_op->result_datatype;
829 830
        resp_req->dev.target_win_handle = target_win_handle;
        resp_req->dev.source_win_handle = source_win_handle;
831

832
        if (!MPIR_DATATYPE_IS_PREDEFINED(resp_req->dev.datatype)) {
833 834 835 836
            MPID_Datatype *result_dtp = NULL;
            MPID_Datatype_get_ptr(resp_req->dev.datatype, result_dtp);
            resp_req->dev.datatype_ptr = result_dtp;
            /* this will cause the datatype to be freed when the
837
             * request is freed. */
838 839 840
        }

        /* Note: Get_accumulate uses the same packet type as accumulate */
841 842 843
        MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_GET_ACCUM);
        accum_pkt->addr = (char *) win_ptr->base_addrs[rma_op->target_rank] +
            win_ptr->disp_units[rma_op->target_rank] * rma_op->target_disp;
Pavan Balaji's avatar
Pavan Balaji committed
844
        accum_pkt->flags = flags;
845 846 847 848 849 850 851 852
        accum_pkt->count = rma_op->target_count;
        accum_pkt->datatype = rma_op->target_datatype;
        accum_pkt->dataloop_size = 0;
        accum_pkt->op = rma_op->op;
        accum_pkt->target_win_handle = target_win_handle;
        accum_pkt->source_win_handle = source_win_handle;
        accum_pkt->request_handle = resp_req->handle;

853 854 855
        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) accum_pkt;
        iov[0].MPID_IOV_LEN = sizeof(*accum_pkt);
    }
856
    else {
857 858 859
        MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
        accum_pkt->addr = (char *) win_ptr->base_addrs[rma_op->target_rank] +
            win_ptr->disp_units[rma_op->target_rank] * rma_op->target_disp;
860
        accum_pkt->flags = flags;
861 862 863 864 865 866 867
        accum_pkt->count = rma_op->target_count;
        accum_pkt->datatype = rma_op->target_datatype;
        accum_pkt->dataloop_size = 0;
        accum_pkt->op = rma_op->op;
        accum_pkt->target_win_handle = target_win_handle;
        accum_pkt->source_win_handle = source_win_handle;

868 869 870 871
        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) accum_pkt;
        iov[0].MPID_IOV_LEN = sizeof(*accum_pkt);
    }

872
    /*    printf("send pkt: type %d, addr %d, count %d, base %d\n", rma_pkt->type,
873 874 875
     * rma_pkt->addr, rma_pkt->count, win_ptr->base_addrs[rma_op->target_rank]);
     * fflush(stdout);
     */
876

877
    comm_ptr = win_ptr->comm_ptr;
878
    MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
879

880
    if (!MPIR_DATATYPE_IS_PREDEFINED(rma_op->origin_datatype)) {
881 882 883
        origin_dt_derived = 1;
        MPID_Datatype_get_ptr(rma_op->origin_datatype, origin_dtp);
    }
884
    else {
885 886 887
        origin_dt_derived = 0;
    }

888
    if (!MPIR_DATATYPE_IS_PREDEFINED(rma_op->target_datatype)) {
889
        target_dt_derived = 1;
890
        MPID_Datatype_get_ptr(rma_op->target_datatype, target_dtp);
891
    }
892
    else {
893 894 895
        target_dt_derived = 0;
    }

896
    if (target_dt_derived) {
897
        /* derived datatype on target. fill derived datatype info */
898 899 900 901 902 903 904 905 906 907 908 909 910 911 912
        dtype_info->is_contig = target_dtp->is_contig;
        dtype_info->max_contig_blocks = target_dtp->max_contig_blocks;
        dtype_info->size = target_dtp->size;
        dtype_info->extent = target_dtp->extent;
        dtype_info->dataloop_size = target_dtp->dataloop_size;
        dtype_info->dataloop_depth = target_dtp->dataloop_depth;
        dtype_info->eltype = target_dtp->eltype;
        dtype_info->dataloop = target_dtp->dataloop;
        dtype_info->ub = target_dtp->ub;
        dtype_info->lb = target_dtp->lb;
        dtype_info->true_ub = target_dtp->true_ub;
        dtype_info->true_lb = target_dtp->true_lb;
        dtype_info->has_sticky_ub = target_dtp->has_sticky_ub;
        dtype_info->has_sticky_lb = target_dtp->has_sticky_lb;

913
        MPIU_CHKPMEM_MALLOC(*dataloop, void *, target_dtp->dataloop_size, mpi_errno, "dataloop");
914

915
        MPIDI_FUNC_ENTER(MPID_STATE_MEMCPY);
916
        MPIU_Memcpy(*dataloop, target_dtp->dataloop, target_dtp->dataloop_size);
917
        MPIDI_FUNC_EXIT(MPID_STATE_MEMCPY);
918 919
        /* the dataloop can have undefined padding sections, so we need to let
         * valgrind know that it is OK to pass this data to writev later on */
920
        MPL_VG_MAKE_MEM_DEFINED(*dataloop, target_dtp->dataloop_size);
921

922
        if (rma_op->type == MPIDI_RMA_PUT) {
923
            put_pkt->dataloop_size = target_dtp->dataloop_size;
924 925
        }
        else {
926
            accum_pkt->dataloop_size = target_dtp->dataloop_size;
927
        }
928 929 930 931
    }

    MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);

932
    if (!target_dt_derived) {
933
        /* basic datatype on target */
934
        if (!origin_dt_derived) {
935
            /* basic datatype on origin */
936
            iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) rma_op->origin_addr;
937 938
            iov[1].MPID_IOV_LEN = rma_op->origin_count * origin_type_size;
            iovcnt = 2;
939
            MPIU_THREAD_CS_ENTER(CH3COMM, vc);
940
            mpi_errno = MPIDI_CH3_iStartMsgv(vc, iov, iovcnt, request);
941
            MPIU_THREAD_CS_EXIT(CH3COMM, vc);
942
            MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
943
        }
944
        else {
945
            /* derived datatype on origin */
946
            *request = MPID_Request_create();
947 948
            MPIU_ERR_CHKANDJUMP(*request == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");

949 950
            MPIU_Object_set_ref(*request, 2);
            (*request)->kind = MPID_REQUEST_SEND;
951 952 953 954

            (*request)->dev.segment_ptr = MPID_Segment_alloc();
            MPIU_ERR_CHKANDJUMP1((*request)->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER,
                                 "**nomem", "**nomem %s", "MPID_Segment_alloc");
955

956
            (*request)->dev.datatype_ptr = origin_dtp;
957
            /* this will cause the datatype to be freed when the request
958
             * is freed. */
959
            MPID_Segment_init(rma_op->origin_addr, rma_op->origin_count,
960
                              rma_op->origin_datatype, (*request)->dev.segment_ptr, 0);
961 962
            (*request)->dev.segment_first = 0;
            (*request)->dev.segment_size = rma_op->origin_count * origin_type_size;
963

964 965
            (*request)->dev.OnFinal = 0;
            (*request)->dev.OnDataAvail = 0;
966

967 968 969 970
            MPIU_THREAD_CS_ENTER(CH3COMM, vc);
            mpi_errno =
                vc->sendNoncontig_fn(vc, *request, iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
            MPIU_THREAD_CS_EXIT(CH3COMM, vc);
971
            MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
972 973
        }
    }
974
    else {
975
        /* derived datatype on target */
976
        MPID_Datatype *combined_dtp = NULL;
977

978 979
        *request = MPID_Request_create();
        if (*request == NULL) {
980
            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomemreq");
981 982
        }

983 984
        MPIU_Object_set_ref(*request, 2);
        (*request)->kind = MPID_REQUEST_SEND;
985

986 987 988
        (*request)->dev.segment_ptr = MPID_Segment_alloc();
        MPIU_ERR_CHKANDJUMP1((*request)->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER,
                             "**nomem", "**nomem %s", "MPID_Segment_alloc");
989 990 991

        /* create a new datatype containing the dtype_info, dataloop, and origin data */

992 993 994 995 996
        mpi_errno =
            create_datatype(dtype_info, *dataloop, target_dtp->dataloop_size, rma_op->origin_addr,
                            rma_op->origin_count, rma_op->origin_datatype, &combined_dtp);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
997

998
        (*request)->dev.datatype_ptr = combined_dtp;
999 1000
        /* combined_datatype will be freed when request is freed */

1001
        MPID_Segment_init(MPI_BOTTOM, 1, combined_dtp->handle, (*request)->dev.segment_ptr, 0);
1002 1003
        (*request)->dev.segment_first = 0;
        (*request)->dev.segment_size = combined_dtp->size;
1004

1005 1006
        (*request)->dev.OnFinal = 0;
        (*request)->dev.OnDataAvail = 0;
1007

1008
        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
1009
        mpi_errno = vc->sendNoncontig_fn(vc, *request, iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
1010
        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
1011
        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
1012

1013 1014 1015
        /* we're done with the datatypes */
        if (origin_dt_derived)
            MPID_Datatype_release(origin_dtp);
1016
        MPID_Datatype_release(target_dtp);
1017 1018 1019
    }

    /* This operation can generate two requests; one for inbound and one for
1020
     * outbound data. */
1021
    if (resp_req != NULL) {
1022
        if (*request != NULL) {
1023
            /* If we have both inbound and outbound requests (i.e. GACC
1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035
             * operation), we need to ensure that the source buffer is
             * available and that the response data has been received before
             * informing the origin that this operation is complete.  Because
             * the update needs to be done atomically at the target, they will
             * not send back data until it has been received.  Therefore,
             * completion of the response request implies that the send request
             * has completed.
             *
             * Therefore: refs on the response request are set to two: one is
             * held by the progress engine and the other by the RMA op
             * completion code.  Refs on the outbound request are set to one;
             * it will be completed by the progress engine.
1036 1037
             */

1038 1039
            MPID_Request_release(*request);
            *request = resp_req;
1040

1041 1042
        }
        else {
1043
            *request = resp_req;
1044 1045 1046 1047 1048
        }

        /* For error checking */
        resp_req = NULL;
    }
1049

1050
  fn_exit:
1051
    MPIU_CHKPMEM_COMMIT();
Xin Zhao's avatar
Xin Zhao committed
1052
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_SEND_RMA_MSG);
1053 1054
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
1055
  fn_fail:
1056
    if (resp_req) {
1057
        MPID_Request_release(resp_req);
1058
    }
1059
    if (*request) {
1060
        MPIU_CHKPMEM_REAP();
1061 1062 1063
        if ((*request)->dev.datatype_ptr)
            MPID_Datatype_release((*request)->dev.datatype_ptr);
        MPID_Request_release(*request);
1064
    }
1065
    *request = NULL;
1066 1067 1068 1069
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

1070 1071 1072 1073
/*
 * Use this for contiguous accumulate operations
 */
#undef FUNCNAME
Xin Zhao's avatar
Xin Zhao committed
1074
#define FUNCNAME send_contig_acc_msg
1075 1076
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
1077 1078 1079 1080 1081
static int send_contig_acc_msg(MPIDI_RMA_Op_t * rma_op,
                               MPID_Win * win_ptr,
                               MPIDI_CH3_Pkt_flags_t flags,
                               MPI_Win source_win_handle,
                               MPI_Win target_win_handle, MPID_Request ** request)
1082
{
1083 1084
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_accum_t *accum_pkt = &upkt.accum;
1085
    MPID_IOV iov[MPID_IOV_LIMIT];
1086
    int mpi_errno = MPI_SUCCESS;
1087 1088
    int iovcnt;
    MPI_Aint origin_type_size;
1089
    MPIDI_VC_t *vc;
1090
    MPID_Comm *comm_ptr;
1091
    size_t len;
Xin Zhao's avatar
Xin Zhao committed
1092
    MPIDI_STATE_DECL(MPID_STATE_SEND_CONTIG_ACC_MSG);
1093

Xin Zhao's avatar
Xin Zhao committed
1094
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_SEND_CONTIG_ACC_MSG);
1095

1096
    *request = NULL;
1097 1098 1099

    MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
    /* FIXME: Make this size check efficient and match the packet type */
1100
    MPIU_Assign_trunc(len, rma_op->origin_count * origin_type_size, size_t);
1101 1102 1103 1104 1105 1106 1107
    if (MPIR_CVAR_CH3_RMA_ACC_IMMED && len <= MPIDI_RMA_IMMED_INTS * sizeof(int)) {
        MPIDI_CH3_Pkt_accum_immed_t *accumi_pkt = &upkt.accum_immed;
        void *dest = accumi_pkt->data, *src = rma_op->origin_addr;

        MPIDI_Pkt_init(accumi_pkt, MPIDI_CH3_PKT_ACCUM_IMMED);
        accumi_pkt->addr = (char *) win_ptr->base_addrs[rma_op->target_rank] +
            win_ptr->disp_units[rma_op->target_rank] * rma_op->target_disp;