ch3u_rma_ops.c 28.5 KB
Newer Older
1
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
2 3 4 5 6 7 8
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidrma.h"

9
static int enableShortACC = 1;
10

11 12 13 14 15 16 17
#define MPIDI_PASSIVE_TARGET_DONE_TAG  348297
#define MPIDI_PASSIVE_TARGET_RMA_TAG 563924

#undef FUNCNAME
#define FUNCNAME MPIDI_Put
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
18
int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
19 20
              origin_datatype, int target_rank, MPI_Aint target_disp,
              int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr)
21 22
{
    int mpi_errno = MPI_SUCCESS;
23
    int dt_contig ATTRIBUTE((unused)), rank;
24
    MPID_Datatype *dtp;
Pavan Balaji's avatar
Pavan Balaji committed
25
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
26
    MPIDI_msg_sz_t data_sz;
27
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
28
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_PUT);
29

30 31
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_PUT);

32 33 34 35
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

36
    if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) {
James Dinan's avatar
James Dinan committed
37 38 39 40 41 42
        win_ptr->epoch_state = MPIDI_EPOCH_FENCE;
    }

    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

43 44
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

45
    if (data_sz == 0) {
46
        goto fn_exit;
47 48
    }

49
    rank = win_ptr->comm_ptr->rank;
50 51 52

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
53
        /* check if target is local and shared memory is allocated on window,
54
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
55 56

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
57 58 59 60 61
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
62 63 64 65
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

66
    /* If the put is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
67
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
68
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
69 70
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
71 72
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
73
    }
74
    else {
75
        MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
76
        MPIDI_RMA_Ops_list_t *ops_list_tail = MPIDI_CH3I_RMA_Get_ops_list_tail(win_ptr, target_rank);
77
        MPIDI_RMA_Op_t *new_ptr = NULL;
78
        MPIDI_CH3_Pkt_put_t *put_pkt = NULL;
79

80
        /* queue it up */
81
        mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(win_ptr, ops_list, ops_list_tail, &new_ptr);
82 83 84
        if (mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }
85

86 87 88 89 90 91 92 93 94 95
        put_pkt = &(new_ptr->pkt.put);
        MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
        put_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
            win_ptr->disp_units[target_rank] * target_disp;
        put_pkt->count = target_count;
        put_pkt->datatype = target_datatype;
        put_pkt->dataloop_size = 0;
        put_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
        put_pkt->source_win_handle = win_ptr->handle;

96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
        /* FIXME: For contig and very short operations, use a streamlined op */
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
112 113 114
    }

  fn_exit:
115
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_PUT);
116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}



#undef FUNCNAME
#define FUNCNAME MPIDI_Get
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
131 132
              origin_datatype, int target_rank, MPI_Aint target_disp,
              int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr)
133 134 135
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
136
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
137
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
138
    MPID_Datatype *dtp;
139
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
140
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_GET);
141

142 143
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_GET);

144 145 146 147
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

148
    if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) {
James Dinan's avatar
James Dinan committed
149 150 151 152 153 154
        win_ptr->epoch_state = MPIDI_EPOCH_FENCE;
    }

    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

155
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);
156

157
    if (data_sz == 0) {
158
        goto fn_exit;
159 160
    }

161
    rank = win_ptr->comm_ptr->rank;
Xin Zhao's avatar
Xin Zhao committed
162

163 164
    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
165
        /* check if target is local and shared memory is allocated on window,
166
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
167 168

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
169 170 171 172 173
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
174 175 176
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }
177

178
    /* If the get is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
179
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
180
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
181 182
        mpi_errno = MPIDI_CH3I_Shm_get_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
183 184
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
185
    }
186
    else {
187
        MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
188
        MPIDI_RMA_Ops_list_t *ops_list_tail = MPIDI_CH3I_RMA_Get_ops_list_tail(win_ptr, target_rank);
189
        MPIDI_RMA_Op_t *new_ptr = NULL;
190
        MPIDI_CH3_Pkt_get_t *get_pkt = NULL;
191

192
        /* queue it up */
193
        mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(win_ptr, ops_list, ops_list_tail, &new_ptr);
194 195 196 197
        if (mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }

198 199 200 201 202 203 204 205 206 207
        get_pkt = &(new_ptr->pkt.get);
        MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
        get_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
            win_ptr->disp_units[target_rank] * target_disp;
        get_pkt->count = target_count;
        get_pkt->datatype = target_datatype;
        get_pkt->dataloop_size = 0;
        get_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
        get_pkt->source_win_handle = win_ptr->handle;

208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
        /* FIXME: For contig and very short operations, use a streamlined op */
        new_ptr->origin_addr = origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_GET);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}



#undef FUNCNAME
#define FUNCNAME MPIDI_Accumulate
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
242
int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
243 244
                     origin_datatype, int target_rank, MPI_Aint target_disp,
                     int target_count, MPI_Datatype target_datatype, MPI_Op op, MPID_Win * win_ptr)
245
{
246
    int mpi_errno = MPI_SUCCESS;
247
    MPIDI_msg_sz_t data_sz;
248
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
249
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
250
    MPID_Datatype *dtp;
251
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
252
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_ACCUMULATE);
253

254 255
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_ACCUMULATE);

256 257 258 259
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

260
    if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) {
James Dinan's avatar
James Dinan committed
261 262 263 264 265 266
        win_ptr->epoch_state = MPIDI_EPOCH_FENCE;
    }

    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

267 268
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

269
    if (data_sz == 0) {
270
        goto fn_exit;
271
    }
272

273
    rank = win_ptr->comm_ptr->rank;
274 275 276

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
277
        /* check if target is local and shared memory is allocated on window,
278
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
279 280

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
281 282 283 284 285
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
286 287 288 289
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

290
    /* Do =! rank first (most likely branch?) */
Xin Zhao's avatar
Xin Zhao committed
291
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
292 293 294 295 296 297
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count, target_datatype,
                                          op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
298
    }
299
    else {
300
        MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
301
        MPIDI_RMA_Ops_list_t *ops_list_tail = MPIDI_CH3I_RMA_Get_ops_list_tail(win_ptr, target_rank);
302
        MPIDI_RMA_Op_t *new_ptr = NULL;
303
        MPIDI_CH3_Pkt_accum_t *accum_pkt = NULL;
304

305
        /* queue it up */
306
        mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(win_ptr, ops_list, ops_list_tail, &new_ptr);
307 308 309
        if (mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }
310

311 312
        /* If predefined and contiguous, use a simplified element */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
313
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && enableShortACC) {
314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337
            MPI_Aint origin_type_size;
            size_t len;

            MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
            MPIU_Assign_trunc(len, origin_count * origin_type_size, size_t);
            if (MPIR_CVAR_CH3_RMA_ACC_IMMED && len <= MPIDI_RMA_IMMED_INTS * sizeof(int)) {
                MPIDI_CH3_Pkt_accum_immed_t *accumi_pkt;

                accumi_pkt = &(new_ptr->pkt.accum_immed);
                MPIDI_Pkt_init(accumi_pkt, MPIDI_CH3_PKT_ACCUM_IMMED);
                accumi_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
                    win_ptr->disp_units[target_rank] * target_disp;
                accumi_pkt->count = target_count;
                accumi_pkt->datatype = target_datatype;
                accumi_pkt->op = op;
                accumi_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
                accumi_pkt->source_win_handle = win_ptr->handle;

                new_ptr->origin_addr = (void *) origin_addr;
                new_ptr->origin_count = origin_count;
                new_ptr->origin_datatype = origin_datatype;
                new_ptr->target_rank = target_rank;
                goto fn_exit;
            }
338 339
        }

340 341 342 343 344 345 346 347 348 349 350 351
        accum_pkt = &(new_ptr->pkt.accum);

        MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
        accum_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
            win_ptr->disp_units[target_rank] * target_disp;
        accum_pkt->count = target_count;
        accum_pkt->datatype = target_datatype;
        accum_pkt->dataloop_size = 0;
        accum_pkt->op = op;
        accum_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
        accum_pkt->source_win_handle = win_ptr->handle;

352 353 354 355 356 357 358 359 360 361 362 363 364 365 366
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
367 368
    }

369
  fn_exit:
370 371 372 373 374 375 376 377 378 379 380
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_ACCUMULATE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446
#define FUNCNAME MPIDI_Get_accumulate
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
                         MPI_Datatype origin_datatype, void *result_addr, int result_count,
                         MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                         int target_count, MPI_Datatype target_datatype, MPI_Op op,
                         MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int rank;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_GET_ACCUMULATE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_GET_ACCUMULATE);

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) {
        win_ptr->epoch_state = MPIDI_EPOCH_FENCE;
    }

    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do =! rank first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
                                              result_addr, result_count, result_datatype,
                                              target_rank, target_disp, target_count,
                                              target_datatype, op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
    }
    else {
        MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
447
        MPIDI_RMA_Ops_list_t *ops_list_tail = MPIDI_CH3I_RMA_Get_ops_list_tail(win_ptr, target_rank);
448 449 450
        MPIDI_RMA_Op_t *new_ptr = NULL;

        /* Append the operation to the window's RMA ops queue */
451
        mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(win_ptr, ops_list, ops_list_tail, &new_ptr);
452 453 454 455 456 457
        if (mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }

        /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */

458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495
        if (op == MPI_NO_OP) {
            /* Convert GAcc to a Get */
            MPIDI_CH3_Pkt_get_t *get_pkt = &(new_ptr->pkt.get);
            MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
            get_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
                win_ptr->disp_units[target_rank] * target_disp;
            get_pkt->count = target_count;
            get_pkt->datatype = target_datatype;
            get_pkt->dataloop_size = 0;
            get_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
            get_pkt->source_win_handle = win_ptr->handle;

            new_ptr->origin_addr = result_addr;
            new_ptr->origin_count = result_count;
            new_ptr->origin_datatype = result_datatype;
            new_ptr->target_rank = target_rank;
        }

        else {
            MPIDI_CH3_Pkt_accum_t *accum_pkt = &(new_ptr->pkt.accum);
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_GET_ACCUM);
            accum_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
                win_ptr->disp_units[target_rank] * target_disp;
            accum_pkt->count = target_count;
            accum_pkt->datatype = target_datatype;
            accum_pkt->dataloop_size = 0;
            accum_pkt->op = op;
            accum_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
            accum_pkt->source_win_handle = win_ptr->handle;

            new_ptr->origin_addr = (void *) origin_addr;
            new_ptr->origin_count = origin_count;
            new_ptr->origin_datatype = origin_datatype;
            new_ptr->result_addr = result_addr;
            new_ptr->result_count = result_count;
            new_ptr->result_datatype = result_datatype;
            new_ptr->target_rank = target_rank;
        }
496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (op != MPI_NO_OP && !MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
            MPID_Datatype_get_ptr(result_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_GET_ACCUMULATE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_Compare_and_swap
526 527
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
528 529 530
int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
                           void *result_addr, MPI_Datatype datatype, int target_rank,
                           MPI_Aint target_disp, MPID_Win * win_ptr)
531
{
532 533 534 535 536 537 538 539 540 541 542 543 544 545
    int mpi_errno = MPI_SUCCESS;
    int rank;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_COMPARE_AND_SWAP);
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_COMPARE_AND_SWAP);

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) {
        win_ptr->epoch_state = MPIDI_EPOCH_FENCE;
    }
546

547 548
    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
549

550
    rank = win_ptr->comm_ptr->rank;
551

552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581
    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* The datatype must be predefined, and one of: C integer, Fortran integer,
     * Logical, Multi-language types, or Byte.  This is checked above the ADI,
     * so there's no need to check it again here. */

    /* FIXME: For shared memory windows, we should provide an implementation
     * that uses a processor atomic operation. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_cas_op(origin_addr, compare_addr, result_addr,
                                          datatype, target_rank, target_disp, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
    }
    else {
        MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
582
        MPIDI_RMA_Ops_list_t *ops_list_tail = MPIDI_CH3I_RMA_Get_ops_list_tail(win_ptr, target_rank);
583
        MPIDI_RMA_Op_t *new_ptr = NULL;
584
        MPIDI_CH3_Pkt_cas_t *cas_pkt = NULL;
585 586

        /* Append this operation to the RMA ops queue */
587
        mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(win_ptr, ops_list, ops_list_tail, &new_ptr);
588 589 590 591
        if (mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }

592 593 594 595 596 597 598 599
        cas_pkt = &(new_ptr->pkt.cas);
        MPIDI_Pkt_init(cas_pkt, MPIDI_CH3_PKT_CAS);
        cas_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
            win_ptr->disp_units[target_rank] * target_disp;
        cas_pkt->datatype = datatype;
        cas_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
        cas_pkt->source_win_handle = win_ptr->handle;

600 601 602 603 604 605 606
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = 1;
        new_ptr->origin_datatype = datatype;
        new_ptr->result_addr = result_addr;
        new_ptr->result_datatype = datatype;
        new_ptr->compare_addr = (void *) compare_addr;
        new_ptr->compare_datatype = datatype;
607
        new_ptr->target_rank = target_rank;
608 609 610 611 612 613 614 615 616
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_COMPARE_AND_SWAP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
617 618 619 620
}


#undef FUNCNAME
621
#define FUNCNAME MPIDI_Fetch_and_op
622 623
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
624 625 626
int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
                       MPI_Datatype datatype, int target_rank,
                       MPI_Aint target_disp, MPI_Op op, MPID_Win * win_ptr)
627 628
{
    int mpi_errno = MPI_SUCCESS;
629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661
    int rank;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_FETCH_AND_OP);
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_FETCH_AND_OP);

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) {
        win_ptr->epoch_state = MPIDI_EPOCH_FENCE;
    }

    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }
662

663 664
    /* The datatype and op must be predefined.  This is checked above the ADI,
     * so there's no need to check it again here. */
665

666 667 668 669 670 671 672 673 674 675 676
    /* FIXME: For shared memory windows, we should provide an implementation
     * that uses a processor atomic operation. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_fop_op(origin_addr, result_addr, datatype,
                                          target_rank, target_disp, op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
    }
    else {
        MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
677
        MPIDI_RMA_Ops_list_t *ops_list_tail = MPIDI_CH3I_RMA_Get_ops_list_tail(win_ptr, target_rank);
678
        MPIDI_RMA_Op_t *new_ptr = NULL;
679
        MPIDI_CH3_Pkt_fop_t *fop_pkt = NULL;
680

681
        /* Append this operation to the RMA ops queue */
682
        mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(win_ptr, ops_list, ops_list_tail, &new_ptr);
683 684 685 686
        if (mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }

687 688 689 690 691 692 693 694 695
        fop_pkt = &(new_ptr->pkt.fop);
        MPIDI_Pkt_init(fop_pkt, MPIDI_CH3_PKT_FOP);
        fop_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
            win_ptr->disp_units[target_rank] * target_disp;
        fop_pkt->datatype = datatype;
        fop_pkt->op = op;
        fop_pkt->source_win_handle = win_ptr->handle;
        fop_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];

696 697 698 699 700
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = 1;
        new_ptr->origin_datatype = datatype;
        new_ptr->result_addr = result_addr;
        new_ptr->result_datatype = datatype;
701
        new_ptr->target_rank = target_rank;
702 703 704 705
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_FETCH_AND_OP);
706
    return mpi_errno;
707 708 709 710
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
711
}