/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidrma.h"

/* Tag constants; judging by their names they label passive-target RMA
 * messages.  NOTE(review): neither tag is referenced in the visible part of
 * this file -- confirm they are still used. */
#define MPIDI_PASSIVE_TARGET_DONE_TAG  348297
#define MPIDI_PASSIVE_TARGET_RMA_TAG 563924

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS
      category    : CH3
      type        : int
      default     : 100
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of number of posted operations
          when starting poking progress in operation routines.
          When the value is negative, runtime never pokes progress
          engine in operation routines; when the value is zero,
          runtime always pokes progress engine in operation
          routines; when the value is larger than zero, runtime
          starts to poke progress engine when number of posted
          operations reaches that value.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

#undef FUNCNAME
#define FUNCNAME MPIDI_Put
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
40
int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
41 42
              origin_datatype, int target_rank, MPI_Aint target_disp,
              int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr)
43 44
{
    int mpi_errno = MPI_SUCCESS;
45
    int dt_contig ATTRIBUTE((unused)), rank;
46
    MPID_Datatype *dtp;
Pavan Balaji's avatar
Pavan Balaji committed
47
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
48
    MPIDI_msg_sz_t data_sz;
49
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
50
    int made_progress = 0;
51
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_PUT);
52

53 54
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_PUT);

55 56 57
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

58 59 60 61
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

62 63
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

64
    if (data_sz == 0) {
65
        goto fn_exit;
66 67
    }

68
    rank = win_ptr->comm_ptr->rank;
69 70 71

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
72
        /* check if target is local and shared memory is allocated on window,
73
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
74 75

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
76 77 78 79 80
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
81 82 83 84
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

85
    /* If the put is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
86
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
87
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
88 89
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
90 91
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
92
    }
93
    else {
94
        MPIDI_RMA_Op_t *new_ptr = NULL;
95
        MPIDI_CH3_Pkt_put_t *put_pkt = NULL;
96

97
        /* queue it up */
98 99
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
100

101 102 103 104 105 106 107 108 109
        put_pkt = &(new_ptr->pkt.put);
        MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
        put_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
            win_ptr->disp_units[target_rank] * target_disp;
        put_pkt->count = target_count;
        put_pkt->datatype = target_datatype;
        put_pkt->dataloop_size = 0;
        put_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
        put_pkt->source_win_handle = win_ptr->handle;
Xin Zhao's avatar
Xin Zhao committed
110
        put_pkt->immed_len = 0;
111
        put_pkt->origin_rank = rank;
112

113 114 115 116 117 118
        /* FIXME: For contig and very short operations, use a streamlined op */
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;

119 120 121 122
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

123 124 125 126 127
        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
128
            new_ptr->is_dt = 1;
129 130 131 132
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
133
            new_ptr->is_dt = 1;
134
        }
135

Xin Zhao's avatar
Xin Zhao committed
136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
        /* If both origin and target are basic datatype, try to
           copy origin data to packet header as much as possible. */
        if (!new_ptr->is_dt) {
            size_t len;
            MPI_Aint origin_type_size;

            MPID_Datatype_get_size_macro(new_ptr->origin_datatype, origin_type_size);
            /* length of origin data */
            MPIU_Assign_trunc(len, new_ptr->origin_count * origin_type_size, size_t);
            /* length of origin data that can fit into immed area in pkt header */
            MPIU_Assign_trunc(put_pkt->immed_len,
                              MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size),
                              size_t);

            if (put_pkt->immed_len > 0) {
                void *src = new_ptr->origin_addr, *dest = put_pkt->data;
                /* copy data from origin buffer to immed area in packet header */
                mpi_errno = immed_copy(src, dest, put_pkt->immed_len);
                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
155 156 157 158 159

                /* If all data is in pkt header, mark this op as a candidate
                   for piggybacking LOCK. */
                if (put_pkt->immed_len == len)
                    new_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
160 161 162
            }
        }

163 164 165 166 167 168 169 170 171 172
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

        win_ptr->posted_ops_cnt++;
        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
            win_ptr->posted_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
173 174 175 176 177 178 179 180 181 182 183 184 185 186

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
187 188 189
    }

  fn_exit:
190
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_PUT);
191 192 193 194 195 196 197 198 199 200 201 202 203 204 205
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}



#undef FUNCNAME
#define FUNCNAME MPIDI_Get
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * MPIDI_Get - post a GET from target_rank's window into the origin buffer.
 *
 * The call fails with an MPI_ERR_RMA_SYNC-class error when the window's
 * access state is MPIDI_RMA_NONE.  It returns MPI_SUCCESS without doing any
 * work when target_rank is MPI_PROC_NULL or the origin data size is zero.
 *
 * The operation is handed to MPIDI_CH3I_Shm_get_op() when the target is the
 * calling rank, the window flavor is MPI_WIN_FLAVOR_SHARED, or shm_allocated
 * is TRUE and origin and target VCs report the same node_id.  Otherwise an
 * RMA op is obtained from the window, filled in, enqueued, and progress is
 * driven as configured by the MPIR_CVAR_CH3_RMA_* thresholds.
 *
 * Returns MPI_SUCCESS or the error code produced by a failing helper.
 */
int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
              origin_datatype, int target_rank, MPI_Aint target_disp,
              int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int dt_contig ATTRIBUTE((unused)), rank;
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    int made_progress = 0;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_GET);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_GET);

    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* Check whether the target is local and shared memory is allocated
         * on the window; if so, the operation is performed directly on the
         * shared memory region. */

        /* FIXME: We decide whether to perform SHM operations by checking
         * whether origin and target are on the same node.  However, in
         * ch3:sock, even if origin and target are on the same node, they are
         * not within the same SHM region.  ch3:sock is filtered out here by
         * checking the shm_allocated flag first, which is only set to TRUE
         * when the SHM region is allocated in nemesis.  In the future we need
         * a way to check whether origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* If the get is a local operation, do it here.  The VC pointers are only
     * dereferenced when the lookup above ran: the first two disjuncts
     * short-circuit exactly the cases in which the lookup was skipped. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
    }
    else {
        MPIDI_RMA_Op_t *new_ptr = NULL;
        MPIDI_CH3_Pkt_get_t *get_pkt = NULL;

        /* queue it up */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);

        get_pkt = &(new_ptr->pkt.get);
        MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
        /* target address = target's window base + displacement scaled by the
         * target's displacement unit */
        get_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
            win_ptr->disp_units[target_rank] * target_disp;
        get_pkt->count = target_count;
        get_pkt->datatype = target_datatype;
        get_pkt->dataloop_size = 0;
        get_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
        get_pkt->source_win_handle = win_ptr->handle;
        get_pkt->origin_rank = rank;

        /* FIXME: For contig and very short operations, use a streamlined op */
        new_ptr->origin_addr = origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
            new_ptr->is_dt = 1;
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
            new_ptr->is_dt = 1;
        }

        /* A get between predefined datatypes is marked as a candidate for
         * piggybacking LOCK. */
        if (!new_ptr->is_dt) {
            new_ptr->piggyback_lock_candidate = 1;
        }

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);

        /* Poke the progress engine once the number of posted operations
         * reaches the configured threshold; a negative threshold disables
         * poking. */
        win_ptr->posted_ops_cnt++;
        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
            win_ptr->posted_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }

        /* While the active request count is at or above its (non-negative)
         * threshold, clean up ops on the window and poke progress. */
        while (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
               win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            int local_completed = 0, remote_completed = 0;
            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                       &remote_completed);
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_GET);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}



#undef FUNCNAME
#define FUNCNAME MPIDI_Accumulate
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
346
int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
347 348
                     origin_datatype, int target_rank, MPI_Aint target_disp,
                     int target_count, MPI_Datatype target_datatype, MPI_Op op, MPID_Win * win_ptr)
349
{
350
    int mpi_errno = MPI_SUCCESS;
351
    MPIDI_msg_sz_t data_sz;
352
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
353
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
354
    MPID_Datatype *dtp;
355
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
356
    int made_progress = 0;
357
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_ACCUMULATE);
358

359 360
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_ACCUMULATE);

361 362 363
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

364 365 366 367
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

368 369
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

370
    if (data_sz == 0) {
371
        goto fn_exit;
372
    }
373

374
    rank = win_ptr->comm_ptr->rank;
375 376 377

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
378
        /* check if target is local and shared memory is allocated on window,
379
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
380 381

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
382 383 384 385 386
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
387 388 389 390
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

391
    /* Do =! rank first (most likely branch?) */
Xin Zhao's avatar
Xin Zhao committed
392
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
393 394 395 396 397 398
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count, target_datatype,
                                          op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
399
    }
400
    else {
401
        MPIDI_RMA_Op_t *new_ptr = NULL;
402
        MPIDI_CH3_Pkt_accum_t *accum_pkt = NULL;
403

404
        /* queue it up */
405 406
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
407

408 409 410 411 412 413 414 415 416 417 418
        accum_pkt = &(new_ptr->pkt.accum);

        MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
        accum_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
            win_ptr->disp_units[target_rank] * target_disp;
        accum_pkt->count = target_count;
        accum_pkt->datatype = target_datatype;
        accum_pkt->dataloop_size = 0;
        accum_pkt->op = op;
        accum_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
        accum_pkt->source_win_handle = win_ptr->handle;
Xin Zhao's avatar
Xin Zhao committed
419
        accum_pkt->immed_len = 0;
420
        accum_pkt->origin_rank = rank;
421

422 423 424 425 426
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;

427 428 429 430
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

431 432 433 434 435
        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
436
            new_ptr->is_dt = 1;
437 438 439 440
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
441
            new_ptr->is_dt = 1;
442
        }
443

Xin Zhao's avatar
Xin Zhao committed
444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462
        /* If both origin and target are basic datatype, try to
           copy origin data to packet header as much as possible. */
        if (!new_ptr->is_dt) {
            size_t len;
            MPI_Aint origin_type_size;

            MPID_Datatype_get_size_macro(new_ptr->origin_datatype, origin_type_size);
            /* length of origin data */
            MPIU_Assign_trunc(len, new_ptr->origin_count * origin_type_size, size_t);
            /* length of origin data that can fit into immed areas in packet header */
            MPIU_Assign_trunc(accum_pkt->immed_len,
                              MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size),
                              size_t);

            if (accum_pkt->immed_len > 0) {
                void *src = new_ptr->origin_addr, *dest = accum_pkt->data;
                /* copy data from origin buffer to immed area in packet header */
                mpi_errno = immed_copy(src, dest, accum_pkt->immed_len);
                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
463 464 465 466 467

                /* If all data is in pkt header, mark this op as
                   a candidate for piggybacking LOCK. */
                if (accum_pkt->immed_len == len)
                    new_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
468 469 470
            }
        }

471 472 473 474 475 476 477 478 479 480
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

        win_ptr->posted_ops_cnt++;
        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
            win_ptr->posted_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
481 482 483 484 485 486 487 488 489 490 491 492 493 494

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
495 496
    }

497
  fn_exit:
498 499 500 501 502 503 504 505 506 507 508
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_ACCUMULATE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524
#define FUNCNAME MPIDI_Get_accumulate
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
                         MPI_Datatype origin_datatype, void *result_addr, int result_count,
                         MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                         int target_count, MPI_Datatype target_datatype, MPI_Op op,
                         MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int rank;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
525
    int made_progress = 0;
526 527 528 529
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_GET_ACCUMULATE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_GET_ACCUMULATE);

530 531 532
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do =! rank first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
                                              result_addr, result_count, result_datatype,
                                              target_rank, target_disp, target_count,
                                              target_datatype, op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
    }
    else {
        MPIDI_RMA_Op_t *new_ptr = NULL;

        /* Append the operation to the window's RMA ops queue */
574 575
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
576 577 578

        /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */

579 580 581 582 583 584 585 586 587 588 589
        if (op == MPI_NO_OP) {
            /* Convert GAcc to a Get */
            MPIDI_CH3_Pkt_get_t *get_pkt = &(new_ptr->pkt.get);
            MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
            get_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
                win_ptr->disp_units[target_rank] * target_disp;
            get_pkt->count = target_count;
            get_pkt->datatype = target_datatype;
            get_pkt->dataloop_size = 0;
            get_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
            get_pkt->source_win_handle = win_ptr->handle;
590
            get_pkt->origin_rank = rank;
591 592 593 594 595

            new_ptr->origin_addr = result_addr;
            new_ptr->origin_count = result_count;
            new_ptr->origin_datatype = result_datatype;
            new_ptr->target_rank = target_rank;
596 597 598 599 600 601 602 603 604 605 606

            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
                MPID_Datatype_get_ptr(result_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
                MPID_Datatype_get_ptr(target_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
607 608 609 610

            if (!new_ptr->is_dt) {
                new_ptr->piggyback_lock_candidate = 1;
            }
611 612 613
        }

        else {
Xin Zhao's avatar
Xin Zhao committed
614 615 616
            MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt = &(new_ptr->pkt.get_accum);
            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM);
            get_accum_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
617
                win_ptr->disp_units[target_rank] * target_disp;
Xin Zhao's avatar
Xin Zhao committed
618 619 620 621 622 623
            get_accum_pkt->count = target_count;
            get_accum_pkt->datatype = target_datatype;
            get_accum_pkt->dataloop_size = 0;
            get_accum_pkt->op = op;
            get_accum_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
            get_accum_pkt->source_win_handle = win_ptr->handle;
Xin Zhao's avatar
Xin Zhao committed
624
            get_accum_pkt->immed_len = 0;
625
            get_accum_pkt->origin_rank = rank;
626 627 628 629 630 631 632 633

            new_ptr->origin_addr = (void *) origin_addr;
            new_ptr->origin_count = origin_count;
            new_ptr->origin_datatype = origin_datatype;
            new_ptr->result_addr = result_addr;
            new_ptr->result_count = result_count;
            new_ptr->result_datatype = result_datatype;
            new_ptr->target_rank = target_rank;
634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651

            /* if source or target datatypes are derived, increment their
             * reference counts */
            if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
                MPID_Datatype_get_ptr(origin_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
                MPID_Datatype_get_ptr(result_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
                MPID_Datatype_get_ptr(target_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
Xin Zhao's avatar
Xin Zhao committed
652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671

            /* If all buffers are basic datatype, try to copy origin data to
               packet header as much as possible. */
            if (!new_ptr->is_dt) {
                size_t len;
                MPI_Aint origin_type_size;

                MPID_Datatype_get_size_macro(new_ptr->origin_datatype, origin_type_size);
                /* length of origin data */
                MPIU_Assign_trunc(len, new_ptr->origin_count * origin_type_size, size_t);
                /* length of origin data that can fit into immed area in packet header */
                MPIU_Assign_trunc(get_accum_pkt->immed_len,
                                  MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size),
                                  size_t);

                if (get_accum_pkt->immed_len > 0) {
                    void *src = new_ptr->origin_addr, *dest = get_accum_pkt->data;
                    /* copy data from origin buffer to immed area in packet header */
                    mpi_errno = immed_copy(src, dest, get_accum_pkt->immed_len);
                    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
672 673 674 675 676

                    /* If all data is in pkt header, mark this op as a candidate
                       for piggybacking LOCK. */
                    if (get_accum_pkt->immed_len == len)
                        new_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
677 678
                }
            }
679
        }
680

681 682 683 684
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

685 686 687 688 689 690 691 692 693 694
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

        win_ptr->posted_ops_cnt++;
        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
            win_ptr->posted_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
695 696 697 698 699 700 701 702 703 704 705 706 707 708

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
709 710 711 712 713 714 715 716 717 718 719 720 721 722 723
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_GET_ACCUMULATE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_Compare_and_swap
724 725
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
726 727 728
int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
                           void *result_addr, MPI_Datatype datatype, int target_rank,
                           MPI_Aint target_disp, MPID_Win * win_ptr)
729
{
730 731 732
    int mpi_errno = MPI_SUCCESS;
    int rank;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
733
    int made_progress = 0;
734 735

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_COMPARE_AND_SWAP);
736

737 738
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_COMPARE_AND_SWAP);

739 740 741
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

742 743 744 745 746
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;
747

748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777
    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* The datatype must be predefined, and one of: C integer, Fortran integer,
     * Logical, Multi-language types, or Byte.  This is checked above the ADI,
     * so there's no need to check it again here. */

    /* FIXME: For shared memory windows, we should provide an implementation
     * that uses a processor atomic operation. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_cas_op(origin_addr, compare_addr, result_addr,
                                          datatype, target_rank, target_disp, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
    }
    else {
        MPIDI_RMA_Op_t *new_ptr = NULL;
778
        MPIDI_CH3_Pkt_cas_t *cas_pkt = NULL;
779 780

        /* Append this operation to the RMA ops queue */
781 782
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
783

784 785 786 787 788 789 790
        cas_pkt = &(new_ptr->pkt.cas);
        MPIDI_Pkt_init(cas_pkt, MPIDI_CH3_PKT_CAS);
        cas_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
            win_ptr->disp_units[target_rank] * target_disp;
        cas_pkt->datatype = datatype;
        cas_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
        cas_pkt->source_win_handle = win_ptr->handle;
791
        cas_pkt->origin_rank = rank;
792

793 794 795 796 797 798 799
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = 1;
        new_ptr->origin_datatype = datatype;
        new_ptr->result_addr = result_addr;
        new_ptr->result_datatype = datatype;
        new_ptr->compare_addr = (void *) compare_addr;
        new_ptr->compare_datatype = datatype;
800
        new_ptr->target_rank = target_rank;
801
        new_ptr->piggyback_lock_candidate = 1; /* CAS is always able to piggyback LOCK */
802 803 804 805

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
806 807 808 809 810 811 812 813 814 815 816

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

        win_ptr->posted_ops_cnt++;
        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
            win_ptr->posted_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
817 818 819 820 821 822 823 824 825 826 827 828 829 830

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
831 832 833 834 835 836 837 838 839
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_COMPARE_AND_SWAP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
840 841 842 843
}


#undef FUNCNAME
844
#define FUNCNAME MPIDI_Fetch_and_op
845 846
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
847 848 849
int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
                       MPI_Datatype datatype, int target_rank,
                       MPI_Aint target_disp, MPI_Op op, MPID_Win * win_ptr)
850 851
{
    int mpi_errno = MPI_SUCCESS;
852 853
    int rank;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
854
    int made_progress = 0;
855 856

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_FETCH_AND_OP);
857

858 859
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_FETCH_AND_OP);

860 861 862
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }
883

884 885
    /* The datatype and op must be predefined.  This is checked above the ADI,
     * so there's no need to check it again here. */
886

887 888 889 890 891 892 893 894 895 896 897
    /* FIXME: For shared memory windows, we should provide an implementation
     * that uses a processor atomic operation. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_fop_op(origin_addr, result_addr, datatype,
                                          target_rank, target_disp, op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
    }
    else {
        MPIDI_RMA_Op_t *new_ptr = NULL;
898

899
        /* Append this operation to the RMA ops queue */
900 901
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
902

903 904 905 906 907 908 909 910 911 912 913
        if (op == MPI_NO_OP) {
            /* Convert FOP to a Get */
            MPIDI_CH3_Pkt_get_t *get_pkt = &(new_ptr->pkt.get);
            MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
            get_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
                win_ptr->disp_units[target_rank] * target_disp;
            get_pkt->count = 1;
            get_pkt->datatype = datatype;
            get_pkt->dataloop_size = 0;
            get_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
            get_pkt->source_win_handle = win_ptr->handle;
914
            get_pkt->origin_rank = rank;
915

916 917 918 919
            new_ptr->origin_addr = result_addr;
            new_ptr->origin_count = 1;
            new_ptr->origin_datatype = datatype;
            new_ptr->target_rank = target_rank;
920
            new_ptr->piggyback_lock_candidate = 1;
921 922 923
        }
        else {
            MPIDI_CH3_Pkt_fop_t *fop_pkt = &(new_ptr->pkt.fop);
Xin Zhao's avatar
Xin Zhao committed
924 925 926
            size_t len;
            MPI_Aint origin_type_size;

927 928 929 930 931 932 933
            MPIDI_Pkt_init(fop_pkt, MPIDI_CH3_PKT_FOP);
            fop_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
                win_ptr->disp_units[target_rank] * target_disp;
            fop_pkt->datatype = datatype;
            fop_pkt->op = op;
            fop_pkt->source_win_handle = win_ptr->handle;
            fop_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
Xin Zhao's avatar
Xin Zhao committed
934
            fop_pkt->immed_len = 0;
935
            fop_pkt->origin_rank = rank;
936 937 938 939 940 941 942

            new_ptr->origin_addr = (void *) origin_addr;
            new_ptr->origin_count = 1;
            new_ptr->origin_datatype = datatype;
            new_ptr->result_addr = result_addr;
            new_ptr->result_datatype = datatype;
            new_ptr->target_rank = target_rank;
943
            new_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
944 945 946 947 948 949 950 951 952 953 954 955 956 957 958

            MPID_Datatype_get_size_macro(new_ptr->origin_datatype, origin_type_size);
            /* length of origin data */
            MPIU_Assign_trunc(len, new_ptr->origin_count * origin_type_size, size_t);
            /* length of origin data that can fit into immed area in pkt header */
            MPIU_Assign_trunc(fop_pkt->immed_len,
                              MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size),
                              size_t);

            if (fop_pkt->immed_len > 0) {
                void *src = new_ptr->origin_addr, *dest = fop_pkt->data;
                /* copy data from origin buffer to immed area in packet header */
                mpi_errno = immed_copy(src, dest, fop_pkt->immed_len);
                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
            }
959
        }
960 961 962 963

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
964 965 966 967 968 969 970 971 972 973 974

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

        win_ptr->posted_ops_cnt++;
        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
            win_ptr->posted_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
975 976 977 978 979 980 981 982 983 984 985 986 987 988

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
989 990 991 992
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_FETCH_AND_OP);
993
    return mpi_errno;
994 995 996 997
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
998
}