/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidrma.h"

/* Timer performance variable defined elsewhere; the operation routines in
 * this file start and stop it around the code that fills in a queued op. */
MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_rmaqueue_set);

/* Tag values for passive-target traffic.
 * NOTE(review): not referenced in the portion of the file reviewed here --
 * confirm they are still used before relying on them. */
#define MPIDI_PASSIVE_TARGET_DONE_TAG  348297
#define MPIDI_PASSIVE_TARGET_RMA_TAG 563924

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS
      category    : CH3
      type        : int
      default     : 100
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of number of posted operations
          when starting poking progress in operation routines.
          When the value is negative, runtime never pokes progress
          engine in operation routines; when the value is zero,
          runtime always pokes progress engine in operation
          routines; when the value is larger than zero, runtime
          starts to poke progress engine when number of posted
          operations reaches that value.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

38 39 40 41
#undef FUNCNAME
#define FUNCNAME MPIDI_Put
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
42
int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
43 44
              origin_datatype, int target_rank, MPI_Aint target_disp,
              int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr)
45 46
{
    int mpi_errno = MPI_SUCCESS;
47
    int dt_contig ATTRIBUTE((unused)), rank;
48
    MPID_Datatype *dtp;
Pavan Balaji's avatar
Pavan Balaji committed
49
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
50
    MPIDI_msg_sz_t data_sz;
51
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
52
    int made_progress = 0;
53
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_PUT);
54

55 56
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_PUT);

57 58 59
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

60 61 62 63
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

64 65
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

66
    if (data_sz == 0) {
67
        goto fn_exit;
68 69
    }

70
    rank = win_ptr->comm_ptr->rank;
71 72 73

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
74
        /* check if target is local and shared memory is allocated on window,
75
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
76 77

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
78 79 80 81 82
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
83 84 85 86
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

87
    /* If the put is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
88
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
89
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
90 91
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
92 93
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
94
    }
95
    else {
96
        MPIDI_RMA_Op_t *new_ptr = NULL;
97
        MPIDI_CH3_Pkt_put_t *put_pkt = NULL;
98

99
        /* queue it up */
100 101
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
102

Xin Zhao's avatar
Xin Zhao committed
103 104
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

105 106 107 108 109 110 111 112 113
        put_pkt = &(new_ptr->pkt.put);
        MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
        put_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
            win_ptr->disp_units[target_rank] * target_disp;
        put_pkt->count = target_count;
        put_pkt->datatype = target_datatype;
        put_pkt->dataloop_size = 0;
        put_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
        put_pkt->source_win_handle = win_ptr->handle;
Xin Zhao's avatar
Xin Zhao committed
114
        put_pkt->immed_len = 0;
115
        put_pkt->origin_rank = rank;
116

117 118 119 120 121 122
        /* FIXME: For contig and very short operations, use a streamlined op */
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;

Xin Zhao's avatar
Xin Zhao committed
123 124
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

125 126 127 128
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

Xin Zhao's avatar
Xin Zhao committed
129 130
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

131 132 133 134 135
        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
136
            new_ptr->is_dt = 1;
137 138 139 140
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
141
            new_ptr->is_dt = 1;
142
        }
143

Xin Zhao's avatar
Xin Zhao committed
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
        /* If both origin and target are basic datatype, try to
           copy origin data to packet header as much as possible. */
        if (!new_ptr->is_dt) {
            size_t len;
            MPI_Aint origin_type_size;

            MPID_Datatype_get_size_macro(new_ptr->origin_datatype, origin_type_size);
            /* length of origin data */
            MPIU_Assign_trunc(len, new_ptr->origin_count * origin_type_size, size_t);
            /* length of origin data that can fit into immed area in pkt header */
            MPIU_Assign_trunc(put_pkt->immed_len,
                              MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size),
                              size_t);

            if (put_pkt->immed_len > 0) {
                void *src = new_ptr->origin_addr, *dest = put_pkt->data;
                /* copy data from origin buffer to immed area in packet header */
                mpi_errno = immed_copy(src, dest, put_pkt->immed_len);
                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
163 164 165 166 167

                /* If all data is in pkt header, mark this op as a candidate
                   for piggybacking LOCK. */
                if (put_pkt->immed_len == len)
                    new_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
168 169 170
            }
        }

Xin Zhao's avatar
Xin Zhao committed
171 172
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

173 174 175 176 177 178 179 180 181 182
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

        win_ptr->posted_ops_cnt++;
        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
            win_ptr->posted_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
183 184 185 186 187 188 189 190 191 192 193 194 195 196

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
197 198 199
    }

  fn_exit:
200
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_PUT);
201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}



#undef FUNCNAME
#define FUNCNAME MPIDI_Get
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * MPIDI_Get - post a one-sided get on a window.
 *
 * Parameters:
 *   origin_addr, origin_count, origin_datatype - description of the local
 *       destination buffer
 *   target_rank     - rank of the target in the window's communicator
 *   target_disp     - displacement at the target, scaled by the target's
 *                     disp_unit when the remote address is computed
 *   target_count, target_datatype - layout of the data at the target
 *   win_ptr         - window object
 *
 * Behavior:
 *   - Reports MPI_ERR_RMA_SYNC ("**rmasync") if the window's access state is
 *     MPIDI_RMA_NONE.
 *   - Returns MPI_SUCCESS without doing anything when target_rank is
 *     MPI_PROC_NULL or the origin data size is zero.
 *   - If the target is this process, the window flavor is
 *     MPI_WIN_FLAVOR_SHARED, or shared memory is allocated on the window and
 *     origin and target VCs report the same node_id, the get is performed
 *     through MPIDI_CH3I_Shm_get_op().
 *   - Otherwise an MPIDI_RMA_Op_t is obtained, filled in as a GET packet and
 *     enqueued on the window, after which progress is made / poked according
 *     to MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS and
 *     MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD.
 *
 * Returns MPI_SUCCESS or the error code produced by the failing step.
 */
int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
              origin_datatype, int target_rank, MPI_Aint target_disp,
              int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int dt_contig ATTRIBUTE((unused)), rank;
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    int made_progress = 0;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_GET);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_GET);

    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    /* nothing to transfer */
    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they are
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* If the get is a local operation, do it here.  The VC pointers are only
     * dereferenced when the first two tests fail and shm_allocated is TRUE,
     * which is exactly the case in which they were looked up above. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
        if (mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }
    }
    else {
        MPIDI_RMA_Op_t *new_ptr = NULL;
        MPIDI_CH3_Pkt_get_t *get_pkt = NULL;

        /* queue it up */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
        if (mpi_errno != MPI_SUCCESS) {
            MPIU_ERR_POP(mpi_errno);
        }

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /* fill in the GET packet header carried by the op */
        get_pkt = &(new_ptr->pkt.get);
        MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
        get_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
            win_ptr->disp_units[target_rank] * target_disp;
        get_pkt->count = target_count;
        get_pkt->datatype = target_datatype;
        get_pkt->dataloop_size = 0;
        get_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
        get_pkt->source_win_handle = win_ptr->handle;
        get_pkt->origin_rank = rank;

        /* FIXME: For contig and very short operations, use a streamlined op */
        new_ptr->origin_addr = origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        /* NOTE(review): the op is enqueued before is_dt and
         * piggyback_lock_candidate are filled in below, whereas
         * MPIDI_Get_accumulate enqueues only after the op is fully set up --
         * confirm the enqueue does not depend on those fields. */
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
            new_ptr->is_dt = 1;
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
            new_ptr->is_dt = 1;
        }

        /* a get with only predefined datatypes is marked as a candidate for
         * piggybacking LOCK */
        if (!new_ptr->is_dt) {
            new_ptr->piggyback_lock_candidate = 1;
        }

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS) {
            MPIU_ERR_POP(mpi_errno);
        }

        /* poke the progress engine once enough operations have been posted
         * (a negative CVAR value disables poking) */
        win_ptr->posted_ops_cnt++;
        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
            win_ptr->posted_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }

        /* while the number of active requests is at or above the threshold,
         * keep cleaning up completed ops and poking progress */
        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_GET);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}



#undef FUNCNAME
#define FUNCNAME MPIDI_Accumulate
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
364
int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
365 366
                     origin_datatype, int target_rank, MPI_Aint target_disp,
                     int target_count, MPI_Datatype target_datatype, MPI_Op op, MPID_Win * win_ptr)
367
{
368
    int mpi_errno = MPI_SUCCESS;
369
    MPIDI_msg_sz_t data_sz;
370
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
371
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
372
    MPID_Datatype *dtp;
373
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
374
    int made_progress = 0;
375
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_ACCUMULATE);
376

377 378
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_ACCUMULATE);

379 380 381
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

382 383 384 385
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

386 387
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

388
    if (data_sz == 0) {
389
        goto fn_exit;
390
    }
391

392
    rank = win_ptr->comm_ptr->rank;
393 394 395

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
396
        /* check if target is local and shared memory is allocated on window,
397
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
398 399

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
400 401 402 403 404
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
405 406 407 408
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

409
    /* Do =! rank first (most likely branch?) */
Xin Zhao's avatar
Xin Zhao committed
410
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
411 412 413 414 415 416
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count, target_datatype,
                                          op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
417
    }
418
    else {
419
        MPIDI_RMA_Op_t *new_ptr = NULL;
420
        MPIDI_CH3_Pkt_accum_t *accum_pkt = NULL;
421

422
        /* queue it up */
423 424
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
425

Xin Zhao's avatar
Xin Zhao committed
426 427
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

428 429 430 431 432 433 434 435 436 437 438
        accum_pkt = &(new_ptr->pkt.accum);

        MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
        accum_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
            win_ptr->disp_units[target_rank] * target_disp;
        accum_pkt->count = target_count;
        accum_pkt->datatype = target_datatype;
        accum_pkt->dataloop_size = 0;
        accum_pkt->op = op;
        accum_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
        accum_pkt->source_win_handle = win_ptr->handle;
Xin Zhao's avatar
Xin Zhao committed
439
        accum_pkt->immed_len = 0;
440
        accum_pkt->origin_rank = rank;
441

442 443 444 445 446
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;

Xin Zhao's avatar
Xin Zhao committed
447 448
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

449 450 451 452
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

Xin Zhao's avatar
Xin Zhao committed
453 454
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

455 456 457 458 459
        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
460
            new_ptr->is_dt = 1;
461 462 463 464
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
465
            new_ptr->is_dt = 1;
466
        }
467

Xin Zhao's avatar
Xin Zhao committed
468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486
        /* If both origin and target are basic datatype, try to
           copy origin data to packet header as much as possible. */
        if (!new_ptr->is_dt) {
            size_t len;
            MPI_Aint origin_type_size;

            MPID_Datatype_get_size_macro(new_ptr->origin_datatype, origin_type_size);
            /* length of origin data */
            MPIU_Assign_trunc(len, new_ptr->origin_count * origin_type_size, size_t);
            /* length of origin data that can fit into immed areas in packet header */
            MPIU_Assign_trunc(accum_pkt->immed_len,
                              MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size),
                              size_t);

            if (accum_pkt->immed_len > 0) {
                void *src = new_ptr->origin_addr, *dest = accum_pkt->data;
                /* copy data from origin buffer to immed area in packet header */
                mpi_errno = immed_copy(src, dest, accum_pkt->immed_len);
                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
487 488 489 490 491

                /* If all data is in pkt header, mark this op as
                   a candidate for piggybacking LOCK. */
                if (accum_pkt->immed_len == len)
                    new_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
492 493 494
            }
        }

Xin Zhao's avatar
Xin Zhao committed
495 496
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

497 498 499 500 501 502 503 504 505 506
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

        win_ptr->posted_ops_cnt++;
        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
            win_ptr->posted_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
507 508 509 510 511 512 513 514 515 516 517 518 519 520

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
521 522
    }

523
  fn_exit:
524 525 526 527 528 529 530 531 532 533 534
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_ACCUMULATE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550
#define FUNCNAME MPIDI_Get_accumulate
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
                         MPI_Datatype origin_datatype, void *result_addr, int result_count,
                         MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                         int target_count, MPI_Datatype target_datatype, MPI_Op op,
                         MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int rank;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
551
    int made_progress = 0;
552 553 554 555
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_GET_ACCUMULATE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_GET_ACCUMULATE);

556 557 558
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do =! rank first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
                                              result_addr, result_count, result_datatype,
                                              target_rank, target_disp, target_count,
                                              target_datatype, op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
    }
    else {
        MPIDI_RMA_Op_t *new_ptr = NULL;

        /* Append the operation to the window's RMA ops queue */
600 601
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
602 603 604

        /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */

Xin Zhao's avatar
Xin Zhao committed
605 606
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

607 608 609 610 611 612 613 614 615 616 617
        if (op == MPI_NO_OP) {
            /* Convert GAcc to a Get */
            MPIDI_CH3_Pkt_get_t *get_pkt = &(new_ptr->pkt.get);
            MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
            get_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
                win_ptr->disp_units[target_rank] * target_disp;
            get_pkt->count = target_count;
            get_pkt->datatype = target_datatype;
            get_pkt->dataloop_size = 0;
            get_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
            get_pkt->source_win_handle = win_ptr->handle;
618
            get_pkt->origin_rank = rank;
619 620 621 622 623

            new_ptr->origin_addr = result_addr;
            new_ptr->origin_count = result_count;
            new_ptr->origin_datatype = result_datatype;
            new_ptr->target_rank = target_rank;
624 625 626 627 628 629 630 631 632 633 634

            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
                MPID_Datatype_get_ptr(result_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
                MPID_Datatype_get_ptr(target_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
635 636 637 638

            if (!new_ptr->is_dt) {
                new_ptr->piggyback_lock_candidate = 1;
            }
639 640 641
        }

        else {
Xin Zhao's avatar
Xin Zhao committed
642 643 644
            MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt = &(new_ptr->pkt.get_accum);
            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM);
            get_accum_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
645
                win_ptr->disp_units[target_rank] * target_disp;
Xin Zhao's avatar
Xin Zhao committed
646 647 648 649 650 651
            get_accum_pkt->count = target_count;
            get_accum_pkt->datatype = target_datatype;
            get_accum_pkt->dataloop_size = 0;
            get_accum_pkt->op = op;
            get_accum_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
            get_accum_pkt->source_win_handle = win_ptr->handle;
Xin Zhao's avatar
Xin Zhao committed
652
            get_accum_pkt->immed_len = 0;
653
            get_accum_pkt->origin_rank = rank;
654 655 656 657 658 659 660 661

            new_ptr->origin_addr = (void *) origin_addr;
            new_ptr->origin_count = origin_count;
            new_ptr->origin_datatype = origin_datatype;
            new_ptr->result_addr = result_addr;
            new_ptr->result_count = result_count;
            new_ptr->result_datatype = result_datatype;
            new_ptr->target_rank = target_rank;
662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679

            /* if source or target datatypes are derived, increment their
             * reference counts */
            if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
                MPID_Datatype_get_ptr(origin_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
                MPID_Datatype_get_ptr(result_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
                MPID_Datatype_get_ptr(target_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
Xin Zhao's avatar
Xin Zhao committed
680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699

            /* If all buffers are basic datatype, try to copy origin data to
               packet header as much as possible. */
            if (!new_ptr->is_dt) {
                size_t len;
                MPI_Aint origin_type_size;

                MPID_Datatype_get_size_macro(new_ptr->origin_datatype, origin_type_size);
                /* length of origin data */
                MPIU_Assign_trunc(len, new_ptr->origin_count * origin_type_size, size_t);
                /* length of origin data that can fit into immed area in packet header */
                MPIU_Assign_trunc(get_accum_pkt->immed_len,
                                  MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size),
                                  size_t);

                if (get_accum_pkt->immed_len > 0) {
                    void *src = new_ptr->origin_addr, *dest = get_accum_pkt->data;
                    /* copy data from origin buffer to immed area in packet header */
                    mpi_errno = immed_copy(src, dest, get_accum_pkt->immed_len);
                    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
700 701 702 703 704

                    /* If all data is in pkt header, mark this op as a candidate
                       for piggybacking LOCK. */
                    if (get_accum_pkt->immed_len == len)
                        new_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
705 706
                }
            }
707
        }
708

Xin Zhao's avatar
Xin Zhao committed
709 710
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

711 712 713 714
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

715 716 717 718 719 720 721 722 723 724
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

        win_ptr->posted_ops_cnt++;
        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
            win_ptr->posted_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
725 726 727 728 729 730 731 732 733 734 735 736 737 738

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
739 740 741 742 743 744 745 746 747 748 749 750 751 752 753
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_GET_ACCUMULATE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_Compare_and_swap
754 755
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
756 757 758
int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
                           void *result_addr, MPI_Datatype datatype, int target_rank,
                           MPI_Aint target_disp, MPID_Win * win_ptr)
759
{
760 761 762
    int mpi_errno = MPI_SUCCESS;
    int rank;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
763
    int made_progress = 0;
764 765

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_COMPARE_AND_SWAP);
766

767 768
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_COMPARE_AND_SWAP);

769 770 771
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

772 773 774 775 776
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;
777

778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807
    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* The datatype must be predefined, and one of: C integer, Fortran integer,
     * Logical, Multi-language types, or Byte.  This is checked above the ADI,
     * so there's no need to check it again here. */

    /* FIXME: For shared memory windows, we should provide an implementation
     * that uses a processor atomic operation. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_cas_op(origin_addr, compare_addr, result_addr,
                                          datatype, target_rank, target_disp, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
    }
    else {
        MPIDI_RMA_Op_t *new_ptr = NULL;
808
        MPIDI_CH3_Pkt_cas_t *cas_pkt = NULL;
809 810

        /* Append this operation to the RMA ops queue */
811 812
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
813

Xin Zhao's avatar
Xin Zhao committed
814 815
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

816 817 818 819 820 821 822
        cas_pkt = &(new_ptr->pkt.cas);
        MPIDI_Pkt_init(cas_pkt, MPIDI_CH3_PKT_CAS);
        cas_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
            win_ptr->disp_units[target_rank] * target_disp;
        cas_pkt->datatype = datatype;
        cas_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
        cas_pkt->source_win_handle = win_ptr->handle;
823
        cas_pkt->origin_rank = rank;
824

825 826 827 828 829 830 831
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = 1;
        new_ptr->origin_datatype = datatype;
        new_ptr->result_addr = result_addr;
        new_ptr->result_datatype = datatype;
        new_ptr->compare_addr = (void *) compare_addr;
        new_ptr->compare_datatype = datatype;
832
        new_ptr->target_rank = target_rank;
833
        new_ptr->piggyback_lock_candidate = 1; /* CAS is always able to piggyback LOCK */
834

Xin Zhao's avatar
Xin Zhao committed
835 836
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

837 838 839
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
840 841 842 843 844 845 846 847 848 849 850

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

        win_ptr->posted_ops_cnt++;
        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
            win_ptr->posted_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
851 852 853 854 855 856 857 858 859 860 861 862 863 864

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
865 866 867 868 869 870 871 872 873
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_COMPARE_AND_SWAP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
874 875 876 877
}


#undef FUNCNAME
878
#define FUNCNAME MPIDI_Fetch_and_op
879 880
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
881 882 883
int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
                       MPI_Datatype datatype, int target_rank,
                       MPI_Aint target_disp, MPI_Op op, MPID_Win * win_ptr)
884 885
{
    int mpi_errno = MPI_SUCCESS;
886 887
    int rank;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
888
    int made_progress = 0;
889 890

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_FETCH_AND_OP);
891

892 893
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_FETCH_AND_OP);

894 895 896
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }
917

918 919
    /* The datatype and op must be predefined.  This is checked above the ADI,
     * so there's no need to check it again here. */
920

921 922 923 924 925 926 927 928 929 930 931
    /* FIXME: For shared memory windows, we should provide an implementation
     * that uses a processor atomic operation. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_fop_op(origin_addr, result_addr, datatype,
                                          target_rank, target_disp, op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
    }
    else {
        MPIDI_RMA_Op_t *new_ptr = NULL;
932

933
        /* Append this operation to the RMA ops queue */
934 935
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
936

Xin Zhao's avatar
Xin Zhao committed
937 938
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

939 940 941 942 943 944 945 946 947 948 949
        if (op == MPI_NO_OP) {
            /* Convert FOP to a Get */
            MPIDI_CH3_Pkt_get_t *get_pkt = &(new_ptr->pkt.get);
            MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
            get_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
                win_ptr->disp_units[target_rank] * target_disp;
            get_pkt->count = 1;
            get_pkt->datatype = datatype;
            get_pkt->dataloop_size = 0;
            get_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
            get_pkt->source_win_handle = win_ptr->handle;
950
            get_pkt->origin_rank = rank;
951

952 953 954 955
            new_ptr->origin_addr = result_addr;
            new_ptr->origin_count = 1;
            new_ptr->origin_datatype = datatype;
            new_ptr->target_rank = target_rank;
956
            new_ptr->piggyback_lock_candidate = 1;
957 958 959
        }
        else {
            MPIDI_CH3_Pkt_fop_t *fop_pkt = &(new_ptr->pkt.fop);
Xin Zhao's avatar
Xin Zhao committed
960 961 962
            size_t len;
            MPI_Aint origin_type_size;

963 964 965 966 967 968 969
            MPIDI_Pkt_init(fop_pkt, MPIDI_CH3_PKT_FOP);
            fop_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
                win_ptr->disp_units[target_rank] * target_disp;
            fop_pkt->datatype = datatype;
            fop_pkt->op = op;
            fop_pkt->source_win_handle = win_ptr->handle;
            fop_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
Xin Zhao's avatar
Xin Zhao committed
970
            fop_pkt->immed_len = 0;
971
            fop_pkt->origin_rank = rank;
972 973 974 975 976 977 978

            new_ptr->origin_addr = (void *) origin_addr;
            new_ptr->origin_count = 1;
            new_ptr->origin_datatype = datatype;
            new_ptr->result_addr = result_addr;
            new_ptr->result_datatype = datatype;
            new_ptr->target_rank = target_rank;
979
            new_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
980 981 982 983 984 985 986 987 988 989 990 991 992 993 994

            MPID_Datatype_get_size_macro(new_ptr->origin_datatype, origin_type_size);
            /* length of origin data */
            MPIU_Assign_trunc(len, new_ptr->origin_count * origin_type_size, size_t);
            /* length of origin data that can fit into immed area in pkt header */
            MPIU_Assign_trunc(fop_pkt->immed_len,
                              MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size),
                              size_t);

            if (fop_pkt->immed_len > 0) {
                void *src = new_ptr->origin_addr, *dest = fop_pkt->data;
                /* copy data from origin buffer to immed area in packet header */
                mpi_errno = immed_copy(src, dest, fop_pkt->immed_len);
                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
            }
995
        }
996

Xin Zhao's avatar
Xin Zhao committed
997 998
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

999 1000 1001
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

        win_ptr->posted_ops_cnt++;
        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
            win_ptr->posted_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
1027 1028 1029 1030
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_FETCH_AND_OP);
1031
    return mpi_errno;
1032 1033 1034 1035
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
1036
}