ch3u_rma_ops.c 47.6 KB
Newer Older
1
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
2 3 4 5 6 7 8
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidrma.h"

Xin Zhao's avatar
Xin Zhao committed
9 10
MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_rmaqueue_set);

11 12 13
#define MPIDI_PASSIVE_TARGET_DONE_TAG  348297
#define MPIDI_PASSIVE_TARGET_RMA_TAG 563924

14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS
      category    : CH3
      type        : int
      default     : 100
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of number of posted operations
          when starting poking progress in operation routines.
          When the value is negative, runtime never pokes progress
          engine in operation routines; when the value is zero,
          runtime always pokes progress engine in operation
          routines; when the value is larger than zero, runtime
          starts to poke progress engine when number of posted
          operations reaches that value.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

38
#undef FUNCNAME
39
#define FUNCNAME MPIDI_CH3I_Put
40 41
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
42 43 44 45
int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
46 47
{
    int mpi_errno = MPI_SUCCESS;
48
    int dt_contig ATTRIBUTE((unused)), rank;
49
    MPID_Datatype *dtp;
Pavan Balaji's avatar
Pavan Balaji committed
50
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
51
    MPIDI_msg_sz_t data_sz;
52
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
53
    int made_progress = 0;
54
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_PUT);
55

56
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_PUT);
57

58 59 60
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

61 62 63 64
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

65 66
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

67
    if (data_sz == 0) {
68
        goto fn_exit;
69 70
    }

71
    rank = win_ptr->comm_ptr->rank;
72 73 74

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
75
        /* check if target is local and shared memory is allocated on window,
76
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
77 78

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
79 80 81 82 83
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
84 85 86 87
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

88
    /* If the put is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
89
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
90
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
91 92
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
93 94
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
95 96 97 98 99 100

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
101
    }
102
    else {
103
        MPIDI_RMA_Op_t *new_ptr = NULL;
104
        MPIDI_CH3_Pkt_put_t *put_pkt = NULL;
105

106
        /* queue it up */
107 108
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
109

Xin Zhao's avatar
Xin Zhao committed
110 111
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

112 113 114 115 116 117 118 119 120
        put_pkt = &(new_ptr->pkt.put);
        MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
        put_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
            win_ptr->disp_units[target_rank] * target_disp;
        put_pkt->count = target_count;
        put_pkt->datatype = target_datatype;
        put_pkt->dataloop_size = 0;
        put_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
        put_pkt->source_win_handle = win_ptr->handle;
Xin Zhao's avatar
Xin Zhao committed
121
        put_pkt->immed_len = 0;
122
        put_pkt->origin_rank = rank;
123
        put_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
124

125 126 127 128 129
        /* FIXME: For contig and very short operations, use a streamlined op */
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;
130 131 132 133 134 135
        new_ptr->ureq = NULL; /* reset user request */

        /* Remember user request */
        if (ureq) {
            new_ptr->ureq = ureq;
        }
136

Xin Zhao's avatar
Xin Zhao committed
137 138
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

139 140 141 142
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

Xin Zhao's avatar
Xin Zhao committed
143 144
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

145 146 147 148 149
        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
150
            new_ptr->is_dt = 1;
151 152 153 154
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
155
            new_ptr->is_dt = 1;
156
        }
157

Xin Zhao's avatar
Xin Zhao committed
158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
        /* If both origin and target are basic datatype, try to
           copy origin data to packet header as much as possible. */
        if (!new_ptr->is_dt) {
            size_t len;
            MPI_Aint origin_type_size;

            MPID_Datatype_get_size_macro(new_ptr->origin_datatype, origin_type_size);
            /* length of origin data */
            MPIU_Assign_trunc(len, new_ptr->origin_count * origin_type_size, size_t);
            /* length of origin data that can fit into immed area in pkt header */
            MPIU_Assign_trunc(put_pkt->immed_len,
                              MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size),
                              size_t);

            if (put_pkt->immed_len > 0) {
                void *src = new_ptr->origin_addr, *dest = put_pkt->data;
                /* copy data from origin buffer to immed area in packet header */
                mpi_errno = immed_copy(src, dest, put_pkt->immed_len);
                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
177 178 179 180 181

                /* If all data is in pkt header, mark this op as a candidate
                   for piggybacking LOCK. */
                if (put_pkt->immed_len == len)
                    new_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
182 183 184
            }
        }

Xin Zhao's avatar
Xin Zhao committed
185 186
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

187 188 189 190
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
191
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
192 193 194 195
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
196 197 198 199 200 201 202 203 204 205 206 207 208 209

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
210 211 212
    }

  fn_exit:
213
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PUT);
214 215 216 217 218 219 220 221 222
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

#undef FUNCNAME
223
#define FUNCNAME MPIDI_CH3I_Get
224 225
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
226 227 228 229
int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
230 231 232
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
233
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
234
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
235
    MPID_Datatype *dtp;
236
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
237
    int made_progress = 0;
238
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET);
239

240
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET);
241

242 243 244
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

245 246 247 248
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

249
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);
250

251
    if (data_sz == 0) {
252
        goto fn_exit;
253 254
    }

255
    rank = win_ptr->comm_ptr->rank;
Xin Zhao's avatar
Xin Zhao committed
256

257 258
    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
259
        /* check if target is local and shared memory is allocated on window,
260
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
261 262

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
263 264 265 266 267
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
268 269 270
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }
271

272
    /* If the get is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
273
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
274
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
275 276
        mpi_errno = MPIDI_CH3I_Shm_get_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
277 278
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
279 280 281 282 283 284

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
285
    }
286
    else {
287
        MPIDI_RMA_Op_t *new_ptr = NULL;
288
        MPIDI_CH3_Pkt_get_t *get_pkt = NULL;
289

290
        /* queue it up */
291 292
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
293

Xin Zhao's avatar
Xin Zhao committed
294 295
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

296 297 298 299 300 301 302 303 304
        get_pkt = &(new_ptr->pkt.get);
        MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
        get_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
            win_ptr->disp_units[target_rank] * target_disp;
        get_pkt->count = target_count;
        get_pkt->datatype = target_datatype;
        get_pkt->dataloop_size = 0;
        get_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
        get_pkt->source_win_handle = win_ptr->handle;
305
        get_pkt->origin_rank = rank;
306
        get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
307

308 309 310 311 312
        /* FIXME: For contig and very short operations, use a streamlined op */
        new_ptr->origin_addr = origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;
313 314 315 316 317 318
        new_ptr->ureq = NULL; /* reset user request */

        /* Remember user request */
        if (ureq) {
            new_ptr->ureq = ureq;
        }
319

Xin Zhao's avatar
Xin Zhao committed
320 321
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

322 323 324 325
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

Xin Zhao's avatar
Xin Zhao committed
326 327
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

328 329 330 331 332
        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
333
            new_ptr->is_dt = 1;
334 335 336 337
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
338
            new_ptr->is_dt = 1;
339
        }
340

341 342 343 344
        if (!new_ptr->is_dt) {
            new_ptr->piggyback_lock_candidate = 1;
        }

Xin Zhao's avatar
Xin Zhao committed
345 346
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

347 348 349 350
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
351
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
352 353 354 355
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
356 357 358 359 360 361 362 363 364 365 366 367 368 369

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
370 371 372
    }

  fn_exit:
373
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_GET);
374 375 376 377 378 379 380 381 382 383
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
384
#define FUNCNAME MPIDI_CH3I_Accumulate
385 386
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
387 388 389 390
int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                          origin_datatype, int target_rank, MPI_Aint target_disp,
                          int target_count, MPI_Datatype target_datatype, MPI_Op op,
                          MPID_Win * win_ptr, MPID_Request * ureq)
391
{
392
    int mpi_errno = MPI_SUCCESS;
393
    MPIDI_msg_sz_t data_sz;
394
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
395
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
396
    MPID_Datatype *dtp;
397
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
398
    int made_progress = 0;
399
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
400

401
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
402

403 404 405
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

406 407 408 409
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

410 411
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

412
    if (data_sz == 0) {
413
        goto fn_exit;
414
    }
415

416
    rank = win_ptr->comm_ptr->rank;
417 418 419

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
420
        /* check if target is local and shared memory is allocated on window,
421
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
422 423

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
424 425 426 427 428
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
429 430 431 432
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

433
    /* Do =! rank first (most likely branch?) */
Xin Zhao's avatar
Xin Zhao committed
434
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
435 436 437 438 439 440
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count, target_datatype,
                                          op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
441 442 443 444 445 446

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
447
    }
448
    else {
449
        MPIDI_RMA_Op_t *new_ptr = NULL;
450
        MPIDI_CH3_Pkt_accum_t *accum_pkt = NULL;
451

452
        /* queue it up */
453 454
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
455

Xin Zhao's avatar
Xin Zhao committed
456 457
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

458 459 460 461 462 463 464 465 466 467 468
        accum_pkt = &(new_ptr->pkt.accum);

        MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
        accum_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
            win_ptr->disp_units[target_rank] * target_disp;
        accum_pkt->count = target_count;
        accum_pkt->datatype = target_datatype;
        accum_pkt->dataloop_size = 0;
        accum_pkt->op = op;
        accum_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
        accum_pkt->source_win_handle = win_ptr->handle;
Xin Zhao's avatar
Xin Zhao committed
469
        accum_pkt->immed_len = 0;
470
        accum_pkt->origin_rank = rank;
471
        accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
472

473 474 475 476
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;
477 478 479 480 481 482
        new_ptr->ureq = NULL; /* reset user request */

        /* Remember user request */
        if (ureq) {
            new_ptr->ureq = ureq;
        }
483

Xin Zhao's avatar
Xin Zhao committed
484 485
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

486 487 488 489
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

Xin Zhao's avatar
Xin Zhao committed
490 491
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

492 493 494 495 496
        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
497
            new_ptr->is_dt = 1;
498 499 500 501
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
502
            new_ptr->is_dt = 1;
503
        }
504

Xin Zhao's avatar
Xin Zhao committed
505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523
        /* If both origin and target are basic datatype, try to
           copy origin data to packet header as much as possible. */
        if (!new_ptr->is_dt) {
            size_t len;
            MPI_Aint origin_type_size;

            MPID_Datatype_get_size_macro(new_ptr->origin_datatype, origin_type_size);
            /* length of origin data */
            MPIU_Assign_trunc(len, new_ptr->origin_count * origin_type_size, size_t);
            /* length of origin data that can fit into immed areas in packet header */
            MPIU_Assign_trunc(accum_pkt->immed_len,
                              MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size),
                              size_t);

            if (accum_pkt->immed_len > 0) {
                void *src = new_ptr->origin_addr, *dest = accum_pkt->data;
                /* copy data from origin buffer to immed area in packet header */
                mpi_errno = immed_copy(src, dest, accum_pkt->immed_len);
                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
524 525 526 527 528

                /* If all data is in pkt header, mark this op as
                   a candidate for piggybacking LOCK. */
                if (accum_pkt->immed_len == len)
                    new_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
529 530 531
            }
        }

Xin Zhao's avatar
Xin Zhao committed
532 533
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

534 535 536 537
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
538
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
539 540 541 542
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
543 544 545 546 547 548 549 550 551 552 553 554 555 556

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
557 558
    }

559
  fn_exit:
560
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
561 562 563 564 565 566 567 568 569 570
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
571
#define FUNCNAME MPIDI_CH3I_Get_accumulate
572 573
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
574 575 576 577 578
int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                              MPI_Datatype origin_datatype, void *result_addr, int result_count,
                              MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                              int target_count, MPI_Datatype target_datatype, MPI_Op op,
                              MPID_Win * win_ptr, MPID_Request * ureq)
579 580 581 582 583 584 585 586
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int rank;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
587
    int made_progress = 0;
588
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
589

590
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
591

592 593 594
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do =! rank first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
                                              result_addr, result_count, result_datatype,
                                              target_rank, target_disp, target_count,
                                              target_datatype, op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
631 632 633 634 635 636

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
637 638 639 640 641
    }
    else {
        MPIDI_RMA_Op_t *new_ptr = NULL;

        /* Append the operation to the window's RMA ops queue */
642 643
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
644 645 646

        /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */

Xin Zhao's avatar
Xin Zhao committed
647 648
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

649 650 651 652 653 654 655 656 657 658 659
        if (op == MPI_NO_OP) {
            /* Convert GAcc to a Get */
            MPIDI_CH3_Pkt_get_t *get_pkt = &(new_ptr->pkt.get);
            MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
            get_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
                win_ptr->disp_units[target_rank] * target_disp;
            get_pkt->count = target_count;
            get_pkt->datatype = target_datatype;
            get_pkt->dataloop_size = 0;
            get_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
            get_pkt->source_win_handle = win_ptr->handle;
660
            get_pkt->origin_rank = rank;
661
            get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
662 663 664 665 666

            new_ptr->origin_addr = result_addr;
            new_ptr->origin_count = result_count;
            new_ptr->origin_datatype = result_datatype;
            new_ptr->target_rank = target_rank;
667 668 669 670 671 672 673 674 675 676 677

            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
                MPID_Datatype_get_ptr(result_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
                MPID_Datatype_get_ptr(target_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
678 679 680 681

            if (!new_ptr->is_dt) {
                new_ptr->piggyback_lock_candidate = 1;
            }
682 683 684
        }

        else {
Xin Zhao's avatar
Xin Zhao committed
685 686 687
            MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt = &(new_ptr->pkt.get_accum);
            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM);
            get_accum_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
688
                win_ptr->disp_units[target_rank] * target_disp;
Xin Zhao's avatar
Xin Zhao committed
689 690 691 692 693 694
            get_accum_pkt->count = target_count;
            get_accum_pkt->datatype = target_datatype;
            get_accum_pkt->dataloop_size = 0;
            get_accum_pkt->op = op;
            get_accum_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
            get_accum_pkt->source_win_handle = win_ptr->handle;
Xin Zhao's avatar
Xin Zhao committed
695
            get_accum_pkt->immed_len = 0;
696
            get_accum_pkt->origin_rank = rank;
697
            get_accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
698 699 700 701 702 703 704 705

            new_ptr->origin_addr = (void *) origin_addr;
            new_ptr->origin_count = origin_count;
            new_ptr->origin_datatype = origin_datatype;
            new_ptr->result_addr = result_addr;
            new_ptr->result_count = result_count;
            new_ptr->result_datatype = result_datatype;
            new_ptr->target_rank = target_rank;
706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723

            /* if source or target datatypes are derived, increment their
             * reference counts */
            if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
                MPID_Datatype_get_ptr(origin_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
                MPID_Datatype_get_ptr(result_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
                MPID_Datatype_get_ptr(target_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
Xin Zhao's avatar
Xin Zhao committed
724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743

            /* If all buffers are basic datatype, try to copy origin data to
               packet header as much as possible. */
            if (!new_ptr->is_dt) {
                size_t len;
                MPI_Aint origin_type_size;

                MPID_Datatype_get_size_macro(new_ptr->origin_datatype, origin_type_size);
                /* length of origin data */
                MPIU_Assign_trunc(len, new_ptr->origin_count * origin_type_size, size_t);
                /* length of origin data that can fit into immed area in packet header */
                MPIU_Assign_trunc(get_accum_pkt->immed_len,
                                  MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size),
                                  size_t);

                if (get_accum_pkt->immed_len > 0) {
                    void *src = new_ptr->origin_addr, *dest = get_accum_pkt->data;
                    /* copy data from origin buffer to immed area in packet header */
                    mpi_errno = immed_copy(src, dest, get_accum_pkt->immed_len);
                    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
744 745 746 747 748

                    /* If all data is in pkt header, mark this op as a candidate
                       for piggybacking LOCK. */
                    if (get_accum_pkt->immed_len == len)
                        new_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
749 750
                }
            }
751
        }
752

753 754 755 756 757 758 759
        new_ptr->ureq = NULL; /* reset user request */

        /* Remember user request */
        if (ureq) {
            new_ptr->ureq = ureq;
        }

Xin Zhao's avatar
Xin Zhao committed
760 761
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

762 763 764 765
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

766 767 768 769
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
770
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
771 772 773 774
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
775 776 777 778 779 780 781 782 783 784 785 786 787 788

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
789 790
    }

791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902
  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_Put
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
              origin_datatype, int target_rank, MPI_Aint target_disp,
              int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_PUT);
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_PUT);

    mpi_errno = MPIDI_CH3I_Put(origin_addr, origin_count, origin_datatype,
                               target_rank, target_disp, target_count, target_datatype,
                               win_ptr, NULL);

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_PUT);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

#undef FUNCNAME
#define FUNCNAME MPIDI_Get
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
              origin_datatype, int target_rank, MPI_Aint target_disp,
              int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_GET);
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_GET);

    mpi_errno = MPIDI_CH3I_Get(origin_addr, origin_count, origin_datatype,
                               target_rank, target_disp, target_count, target_datatype,
                               win_ptr, NULL);

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_GET);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

#undef FUNCNAME
#define FUNCNAME MPIDI_Accumulate
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                     origin_datatype, int target_rank, MPI_Aint target_disp,
                     int target_count, MPI_Datatype target_datatype, MPI_Op op, MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_ACCUMULATE);
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_ACCUMULATE);

    mpi_errno = MPIDI_CH3I_Accumulate(origin_addr, origin_count, origin_datatype,
                                      target_rank, target_disp, target_count, target_datatype,
                                      op, win_ptr, NULL);

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_ACCUMULATE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

#undef FUNCNAME
#define FUNCNAME MPIDI_Get_accumulate
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
                         MPI_Datatype origin_datatype, void *result_addr, int result_count,
                         MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                         int target_count, MPI_Datatype target_datatype, MPI_Op op,
                         MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_GET_ACCUMULATE);
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_GET_ACCUMULATE);

    mpi_errno = MPIDI_CH3I_Get_accumulate(origin_addr, origin_count, origin_datatype,
                                          result_addr, result_count, result_datatype,
                                          target_rank, target_disp, target_count,
                                          target_datatype, op, win_ptr, NULL);

903 904 905 906 907 908 909 910 911 912 913 914 915
  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_GET_ACCUMULATE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_Compare_and_swap
916 917
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
918 919 920
int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
                           void *result_addr, MPI_Datatype datatype, int target_rank,
                           MPI_Aint target_disp, MPID_Win * win_ptr)
921
{
922 923 924
    int mpi_errno = MPI_SUCCESS;
    int rank;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
925
    int made_progress = 0;
926 927

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_COMPARE_AND_SWAP);
928

929 930
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_COMPARE_AND_SWAP);

931 932 933
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

934 935 936 937 938
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;
939

940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969
    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* The datatype must be predefined, and one of: C integer, Fortran integer,
     * Logical, Multi-language types, or Byte.  This is checked above the ADI,
     * so there's no need to check it again here. */

    /* FIXME: For shared memory windows, we should provide an implementation
     * that uses a processor atomic operation. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_cas_op(origin_addr, compare_addr, result_addr,
                                          datatype, target_rank, target_disp, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
    }
    else {
        MPIDI_RMA_Op_t *new_ptr = NULL;
970
        MPIDI_CH3_Pkt_cas_t *cas_pkt = NULL;
971 972

        /* Append this operation to the RMA ops queue */
973 974
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
975

Xin Zhao's avatar
Xin Zhao committed
976 977
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

978 979 980 981 982 983 984
        cas_pkt = &(new_ptr->pkt.cas);
        MPIDI_Pkt_init(cas_pkt, MPIDI_CH3_PKT_CAS);
        cas_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
            win_ptr->disp_units[target_rank] * target_disp;
        cas_pkt->datatype = datatype;
        cas_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
        cas_pkt->source_win_handle = win_ptr->handle;
985
        cas_pkt->origin_rank = rank;
986
        cas_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
987

988 989 990 991 992 993 994
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = 1;
        new_ptr->origin_datatype = datatype;
        new_ptr->result_addr = result_addr;
        new_ptr->result_datatype = datatype;
        new_ptr->compare_addr = (void *) compare_addr;
        new_ptr->compare_datatype = datatype;
995
        new_ptr->target_rank = target_rank;
996
        new_ptr->piggyback_lock_candidate = 1; /* CAS is always able to piggyback LOCK */
997
        new_ptr->ureq = NULL; /* reset user request */
998

Xin Zhao's avatar
Xin Zhao committed
999 1000
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

1001 1002 1003
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
1004 1005 1006 1007 1008

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
1009
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
1010 1011 1012 1013
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
1028 1029 1030 1031 1032 1033 1034 1035 1036
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_COMPARE_AND_SWAP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
1037 1038 1039 1040
}


#undef FUNCNAME
1041
#define FUNCNAME MPIDI_Fetch_and_op
1042 1043
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
1044 1045 1046
int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
                       MPI_Datatype datatype, int target_rank,
                       MPI_Aint target_disp, MPI_Op op, MPID_Win * win_ptr)
1047 1048
{
    int mpi_errno = MPI_SUCCESS;
1049 1050
    int rank;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
1051
    int made_progress = 0;
1052 1053

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_FETCH_AND_OP);
1054

1055 1056
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_FETCH_AND_OP);

1057 1058 1059
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }
1080

1081 1082
    /* The datatype and op must be predefined.  This is checked above the ADI,
     * so there's no need to check it again here. */
1083

1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094
    /* FIXME: For shared memory windows, we should provide an implementation
     * that uses a processor atomic operation. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_fop_op(origin_addr, result_addr, datatype,
                                          target_rank, target_disp, op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
    }
    else {
        MPIDI_RMA_Op_t *new_ptr = NULL;
1095

1096
        /* Append this operation to the RMA ops queue */
1097 1098
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
1099

Xin Zhao's avatar
Xin Zhao committed
1100 1101
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112
        if (op == MPI_NO_OP) {
            /* Convert FOP to a Get */
            MPIDI_CH3_Pkt_get_t *get_pkt = &(new_ptr->pkt.get);
            MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
            get_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
                win_ptr->disp_units[target_rank] * target_disp;
            get_pkt->count = 1;
            get_pkt->datatype = datatype;
            get_pkt->dataloop_size = 0;
            get_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
            get_pkt->source_win_handle = win_ptr->handle;