/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidrma.h"

/* Timer performance variable that the operation routines in this file start
 * and stop around the code that fills in a queued RMA operation.  It is
 * declared here through the EXTERN form of the declaration macro. */
MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_rmaqueue_set);

/* Tag values for passive-target traffic.
 * NOTE(review): neither macro is referenced in the visible part of this
 * file -- confirm they are still used before relying on them. */
#define MPIDI_PASSIVE_TARGET_DONE_TAG  348297
#define MPIDI_PASSIVE_TARGET_RMA_TAG 563924

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS
      category    : CH3
      type        : int
      default     : 100
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of number of posted operations
          when starting poking progress in operation routines.
          When the value is negative, runtime never pokes progress
          engine in operation routines; when the value is zero,
          runtime always pokes progress engine in operation
          routines; when the value is larger than zero, runtime
          starts to poke progress engine when number of posted
          operations reaches that value.

    - name        : MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE
      category    : CH3
      type        : int
      default     : 65536
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of data size of a RMA operation
          which can be piggybacked with a LOCK message. It is
          always a positive value and should not be smaller
          than MPIDI_RMA_IMMED_BYTES.
          If user sets it as a small value, for middle and large
          data size, we will lose performance because of always
          waiting for round-trip of LOCK synchronization; if
          user sets it as a large value, we need to consume
          more memory on target side to buffer this lock request
          when lock is not satisfied.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

#undef FUNCNAME
58
#define FUNCNAME MPIDI_CH3I_Put
59 60
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
61 62 63 64
int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
65 66
{
    int mpi_errno = MPI_SUCCESS;
67
    int dt_contig ATTRIBUTE((unused)), rank;
68
    MPID_Datatype *dtp;
Pavan Balaji's avatar
Pavan Balaji committed
69
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
70
    MPIDI_msg_sz_t data_sz;
71
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
72
    int made_progress = 0;
73
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_PUT);
74

75
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_PUT);
76

77 78 79
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

80 81 82 83
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

84 85
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

86
    if (data_sz == 0) {
87
        goto fn_exit;
88 89
    }

90
    rank = win_ptr->comm_ptr->rank;
91 92 93

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
94
        /* check if target is local and shared memory is allocated on window,
95
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
96 97

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
98 99 100 101 102
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
103 104 105 106
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

107
    /* If the put is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
108
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
109
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
110 111
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
112 113
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
114 115 116 117 118 119

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
120
    }
121
    else {
122
        MPIDI_RMA_Op_t *new_ptr = NULL;
123
        MPIDI_CH3_Pkt_put_t *put_pkt = NULL;
124 125 126
        MPI_Aint origin_type_size;
        size_t immed_len, len;
        int use_immed_pkt = FALSE;
127
        int is_origin_contig, is_target_contig;
128

129
        /* queue it up */
130
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
131 132
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
133

Xin Zhao's avatar
Xin Zhao committed
134 135
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

136
        /******************** Setting operation struct areas ***********************/
137

138 139 140 141 142
        /* FIXME: For contig and very short operations, use a streamlined op */
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;
143 144 145 146 147

        /* Remember user request */
        if (ureq) {
            new_ptr->ureq = ureq;
        }
148 149 150 151 152 153

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
154
            new_ptr->is_dt = 1;
155 156 157 158
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
159
            new_ptr->is_dt = 1;
160
        }
161

162 163 164
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

165 166 167 168
        MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
        MPIU_Assign_trunc(len, origin_count * origin_type_size, size_t);

        /* Judge if we can use IMMED data packet */
169
        if (!new_ptr->is_dt && is_origin_contig && is_target_contig) {
170
            MPIU_Assign_trunc(immed_len,
171
                              (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size,
172 173 174 175 176
                              size_t);
            if (len <= immed_len)
                use_immed_pkt = TRUE;
        }

177
        /* Judge if this operation is an piggyback candidate */
Xin Zhao's avatar
Xin Zhao committed
178
        if (!new_ptr->is_dt) {
179 180 181
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
182
            if (len <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
183
                new_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
184 185
        }

186 187 188
        /************** Setting packet struct areas in operation ****************/

        put_pkt = &(new_ptr->pkt.put);
189 190 191 192 193 194 195 196

        if (use_immed_pkt) {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT_IMMED);
        }
        else {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
        }

197 198
        put_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
199 200
        put_pkt->count = target_count;
        put_pkt->datatype = target_datatype;
201
        put_pkt->info.metadata.dataloop_size = 0;
202
        put_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
203 204
        put_pkt->source_win_handle = win_ptr->handle;
        put_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
205
        if (use_immed_pkt) {
206
            void *src = (void *) origin_addr, *dest = (void *) (put_pkt->info.data);
207
            mpi_errno = immed_copy(src, dest, len);
208 209
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
210
        }
211

Xin Zhao's avatar
Xin Zhao committed
212 213
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

214 215 216 217
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

218
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
219 220
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
221 222

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
223
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
224 225 226 227
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
228 229 230 231 232 233 234 235 236 237 238 239 240 241

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
242 243 244
    }

  fn_exit:
245
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PUT);
246 247 248 249 250 251 252 253 254
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

#undef FUNCNAME
255
#define FUNCNAME MPIDI_CH3I_Get
256 257
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
258 259 260 261
int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
262 263 264
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
265
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
266
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
267
    MPID_Datatype *dtp;
268
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
269
    int made_progress = 0;
270
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET);
271

272
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET);
273

274 275 276
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

277 278 279 280
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

281
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);
282

283
    if (data_sz == 0) {
284
        goto fn_exit;
285 286
    }

287
    rank = win_ptr->comm_ptr->rank;
Xin Zhao's avatar
Xin Zhao committed
288

289 290
    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
291
        /* check if target is local and shared memory is allocated on window,
292
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
293 294

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
295 296 297 298 299
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
300 301 302
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }
303

304
    /* If the get is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
305
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
306
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
307 308
        mpi_errno = MPIDI_CH3I_Shm_get_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
309 310
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
311 312 313 314 315 316

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
317
    }
318
    else {
319
        MPIDI_RMA_Op_t *new_ptr = NULL;
320
        MPIDI_CH3_Pkt_get_t *get_pkt = NULL;
321 322 323
        MPI_Aint target_type_size;
        size_t immed_len, len;
        int use_immed_resp_pkt = FALSE;
324
        int is_origin_contig, is_target_contig;
325

326
        /* queue it up */
327
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
328 329
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
330

Xin Zhao's avatar
Xin Zhao committed
331 332
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

333
        /******************** Setting operation struct areas ***********************/
334

335 336 337 338 339
        /* FIXME: For contig and very short operations, use a streamlined op */
        new_ptr->origin_addr = origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;
340 341 342 343 344

        /* Remember user request */
        if (ureq) {
            new_ptr->ureq = ureq;
        }
345 346 347 348 349 350

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
351
            new_ptr->is_dt = 1;
352 353 354 355
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
356
            new_ptr->is_dt = 1;
357
        }
358

359 360 361
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

362 363 364 365
        MPID_Datatype_get_size_macro(target_datatype, target_type_size);
        MPIU_Assign_trunc(len, target_count * target_type_size, size_t);

        /* Judge if we can use IMMED data response packet */
366
        if (!new_ptr->is_dt && is_origin_contig && is_target_contig) {
367
            MPIU_Assign_trunc(immed_len,
368
                              (MPIDI_RMA_IMMED_BYTES / target_type_size) * target_type_size,
369 370 371 372 373
                              size_t);
            if (len <= immed_len)
                use_immed_resp_pkt = TRUE;
        }

374
        /* Judge if this operation is an piggyback candidate. */
375
        if (!new_ptr->is_dt) {
376 377 378
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
379 380 381
            new_ptr->piggyback_lock_candidate = 1;
        }

382 383 384 385
        /************** Setting packet struct areas in operation ****************/

        get_pkt = &(new_ptr->pkt.get);
        MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
386 387
        get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
388 389
        get_pkt->count = target_count;
        get_pkt->datatype = target_datatype;
390
        get_pkt->info.metadata.dataloop_size = 0;
391
        get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
392
        get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
393 394
        if (use_immed_resp_pkt)
            get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
395

Xin Zhao's avatar
Xin Zhao committed
396 397
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

398 399 400 401
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

402
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
403 404
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
405 406

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
407
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
408 409 410 411
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
412 413 414 415 416 417 418 419 420 421 422 423 424 425

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
426 427 428
    }

  fn_exit:
429
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_GET);
430 431 432 433 434 435 436 437 438 439
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
440
#define FUNCNAME MPIDI_CH3I_Accumulate
441 442
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
443 444 445 446
int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                          origin_datatype, int target_rank, MPI_Aint target_disp,
                          int target_count, MPI_Datatype target_datatype, MPI_Op op,
                          MPID_Win * win_ptr, MPID_Request * ureq)
447
{
448
    int mpi_errno = MPI_SUCCESS;
449
    MPIDI_msg_sz_t data_sz;
450
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
451
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
452
    MPID_Datatype *dtp;
453
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
454
    int made_progress = 0;
455
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
456

457
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
458

459 460 461
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

462 463 464 465
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

466 467
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

468
    if (data_sz == 0) {
469
        goto fn_exit;
470
    }
471

472
    rank = win_ptr->comm_ptr->rank;
473 474 475

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
476
        /* check if target is local and shared memory is allocated on window,
477
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
478 479

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
480 481 482 483 484
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
485 486 487 488
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

489
    /* Do =! rank first (most likely branch?) */
Xin Zhao's avatar
Xin Zhao committed
490
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
491 492 493 494 495 496
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count, target_datatype,
                                          op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
497 498 499 500 501 502

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
503
    }
504
    else {
505
        MPIDI_RMA_Op_t *new_ptr = NULL;
506
        MPIDI_CH3_Pkt_accum_t *accum_pkt = NULL;
507 508 509
        MPI_Aint origin_type_size;
        size_t immed_len, len;
        int use_immed_pkt = FALSE;
510
        int is_origin_contig, is_target_contig;
Xin Zhao's avatar
Xin Zhao committed
511 512 513 514
        MPI_Aint stream_elem_count, stream_unit_count;
        MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
        MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL;
        int i;
515

516
        /* queue it up */
517
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
518 519
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
520

Xin Zhao's avatar
Xin Zhao committed
521 522
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

523
        /******************** Setting operation struct areas ***********************/
524

525 526 527 528
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;
529 530 531 532 533

        /* Remember user request */
        if (ureq) {
            new_ptr->ureq = ureq;
        }
534 535 536 537

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
538
            MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
539
            new_ptr->is_dt = 1;
540 541
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
542
            MPID_Datatype_get_ptr(target_datatype, target_dtp);
543
            new_ptr->is_dt = 1;
544
        }
545

546 547 548
        MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
        MPIU_Assign_trunc(len, origin_count * origin_type_size, size_t);

Xin Zhao's avatar
Xin Zhao committed
549 550 551 552 553 554 555
        /* Get size and count for predefined datatype elements */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            predefined_dtp_size = origin_type_size;
            predefined_dtp_count = origin_count;
            MPID_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
        }
        else {
556 557
            MPIU_Assert(origin_dtp->basic_type != MPI_DATATYPE_NULL);
            MPID_Datatype_get_size_macro(origin_dtp->basic_type, predefined_dtp_size);
Xin Zhao's avatar
Xin Zhao committed
558
            predefined_dtp_count = len / predefined_dtp_size;
559
            MPID_Datatype_get_extent_macro(origin_dtp->basic_type, predefined_dtp_extent);
Xin Zhao's avatar
Xin Zhao committed
560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578
        }
        MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                    predefined_dtp_extent > 0);

        /* Calculate number of predefined elements in each stream unit, and
         * total number of stream units. */
        stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
        stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
        MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);

        for (i = 0; i < stream_unit_count; i++) {
            if (origin_dtp != NULL) {
                MPID_Datatype_add_ref(origin_dtp);
            }
            if (target_dtp != NULL) {
                MPID_Datatype_add_ref(target_dtp);
            }
        }

579 580 581
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

582
        /* Judge if we can use IMMED data packet */
583
        if (!new_ptr->is_dt && is_origin_contig && is_target_contig) {
584
            MPIU_Assign_trunc(immed_len,
585
                              (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size,
586 587 588 589 590
                              size_t);
            if (len <= immed_len)
                use_immed_pkt = TRUE;
        }

591
        /* Judge if this operation is an piggyback candidate. */
Xin Zhao's avatar
Xin Zhao committed
592
        if (!new_ptr->is_dt) {
593 594 595
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
596
            if (len <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
597
                new_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
598 599
        }

600 601 602 603
        /************** Setting packet struct areas in operation ****************/

        accum_pkt = &(new_ptr->pkt.accum);

604 605 606 607 608 609 610
        if (use_immed_pkt) {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE_IMMED);
        }
        else {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
        }

611 612
        accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
613 614
        accum_pkt->count = target_count;
        accum_pkt->datatype = target_datatype;
615
        accum_pkt->info.metadata.dataloop_size = 0;
616
        accum_pkt->op = op;
617
        accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
618
        accum_pkt->source_win_handle = win_ptr->handle;
619
        accum_pkt->info.metadata.stream_offset = 0;
620
        accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
621
        if (use_immed_pkt) {
622
            void *src = (void *) origin_addr, *dest = (void *) (accum_pkt->info.data);
623
            mpi_errno = immed_copy(src, dest, len);
624 625
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
626
        }
627

Xin Zhao's avatar
Xin Zhao committed
628 629
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

630 631 632 633
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

634
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
635 636
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
637 638

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
639
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
640 641 642 643
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
644 645 646 647 648 649 650 651 652 653 654 655 656 657

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
658 659
    }

660
  fn_exit:
661
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
662 663 664 665 666 667 668 669 670 671
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
672
#define FUNCNAME MPIDI_CH3I_Get_accumulate
673 674
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
675 676 677 678 679
int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                              MPI_Datatype origin_datatype, void *result_addr, int result_count,
                              MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                              int target_count, MPI_Datatype target_datatype, MPI_Op op,
                              MPID_Win * win_ptr, MPID_Request * ureq)
680 681 682 683 684 685 686 687
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int rank;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
688
    int made_progress = 0;
689
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
690

691
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
692

693 694 695
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they are
         * not within the same SHM region. Here we filter out ch3:sock by checking the shm_allocated flag
         * first, which is only set to TRUE when the SHM region is allocated in nemesis.
         * In the future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do != rank first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
                                              result_addr, result_count, result_datatype,
                                              target_rank, target_disp, target_count,
                                              target_datatype, op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
732 733 734 735 736 737

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
738 739 740 741 742
    }
    else {
        MPIDI_RMA_Op_t *new_ptr = NULL;

        /* Append the operation to the window's RMA ops queue */
743
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
744 745
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
746 747 748

        /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */

Xin Zhao's avatar
Xin Zhao committed
749 750
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

751 752
        if (op == MPI_NO_OP) {
            /* Convert GAcc to a Get */
753
            MPIDI_CH3_Pkt_get_t *get_pkt;
754 755 756
            MPI_Aint target_type_size;
            size_t len, immed_len;
            int use_immed_resp_pkt = FALSE;
757
            int is_result_contig, is_target_contig;
758 759

            /******************** Setting operation struct areas ***********************/
760 761 762 763 764

            new_ptr->origin_addr = result_addr;
            new_ptr->origin_count = result_count;
            new_ptr->origin_datatype = result_datatype;
            new_ptr->target_rank = target_rank;
765 766 767 768 769

            /* Remember user request */
            if (ureq) {
                new_ptr->ureq = ureq;
            }
770 771 772 773 774 775 776 777 778 779 780

            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
                MPID_Datatype_get_ptr(result_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
                MPID_Datatype_get_ptr(target_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
781

782 783 784
            MPID_Datatype_get_size_macro(target_datatype, target_type_size);
            MPIU_Assign_trunc(len, target_count * target_type_size, size_t);

785 786 787
            MPID_Datatype_is_contig(result_datatype, &is_result_contig);
            MPID_Datatype_is_contig(target_datatype, &is_target_contig);

788
            /* Judge if we can use IMMED data response packet */
789
            if (!new_ptr->is_dt && is_result_contig && is_target_contig) {
790
                MPIU_Assign_trunc(immed_len,
791
                                  (MPIDI_RMA_IMMED_BYTES / target_type_size) * target_type_size,
792 793 794 795 796
                                  size_t);
                if (len <= immed_len)
                    use_immed_resp_pkt = TRUE;
            }

797
            /* Judge if this operation is a piggyback candidate */
798
            if (!new_ptr->is_dt) {
799 800 801
                /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
                 * for both origin and target data. We should extend this optimization to derived
                 * datatypes as well. */
802 803
                new_ptr->piggyback_lock_candidate = 1;
            }
804 805 806 807 808

            /************** Setting packet struct areas in operation ****************/

            get_pkt = &(new_ptr->pkt.get);
            MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
809 810
            get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
                win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
811 812
            get_pkt->count = target_count;
            get_pkt->datatype = target_datatype;
813
            get_pkt->info.metadata.dataloop_size = 0;
814
            get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
815
            get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
816 817
            if (use_immed_resp_pkt == TRUE)
                get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
818 819 820
        }

        else {
821
            MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt;
822 823 824
            MPI_Aint origin_type_size;
            size_t immed_len, orig_len;
            int use_immed_pkt = FALSE;
825
            int is_origin_contig, is_target_contig, is_result_contig;
Xin Zhao's avatar
Xin Zhao committed
826 827 828 829
            MPI_Aint stream_elem_count, stream_unit_count;
            MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
            MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL, *result_dtp = NULL;
            int i;
830 831

            /******************** Setting operation struct areas ***********************/
832 833 834 835 836 837 838 839

            new_ptr->origin_addr = (void *) origin_addr;
            new_ptr->origin_count = origin_count;
            new_ptr->origin_datatype = origin_datatype;
            new_ptr->result_addr = result_addr;
            new_ptr->result_count = result_count;
            new_ptr->result_datatype = result_datatype;
            new_ptr->target_rank = target_rank;
840 841 842 843 844

            /* Remember user request */
            if (ureq) {
                new_ptr->ureq = ureq;
            }
845 846 847 848

            /* if source or target datatypes are derived, increment their
             * reference counts */
            if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
849
                MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
850 851 852
                new_ptr->is_dt = 1;
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
853
                MPID_Datatype_get_ptr(result_datatype, target_dtp);
854 855 856
                new_ptr->is_dt = 1;
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
857
                MPID_Datatype_get_ptr(target_datatype, result_dtp);
858 859
                new_ptr->is_dt = 1;
            }
Xin Zhao's avatar
Xin Zhao committed
860

861 862 863
            MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
            MPIU_Assign_trunc(orig_len, origin_count * origin_type_size, size_t);

Xin Zhao's avatar
Xin Zhao committed
864 865 866 867 868 869 870
            /* Get size and count for predefined datatype elements */
            if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
                predefined_dtp_size = origin_type_size;
                predefined_dtp_count = origin_count;
                MPID_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
            }
            else {
871 872
                MPIU_Assert(origin_dtp->basic_type != MPI_DATATYPE_NULL);
                MPID_Datatype_get_size_macro(origin_dtp->basic_type, predefined_dtp_size);
Xin Zhao's avatar
Xin Zhao committed
873
                predefined_dtp_count = orig_len / predefined_dtp_size;
874
                MPID_Datatype_get_extent_macro(origin_dtp->basic_type, predefined_dtp_extent);
Xin Zhao's avatar
Xin Zhao committed
875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896
            }
            MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                        predefined_dtp_extent > 0);

            /* Calculate number of predefined elements in each stream unit, and
             * total number of stream units. */
            stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
            stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
            MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);

            for (i = 0; i < stream_unit_count; i++) {
                if (origin_dtp != NULL) {
                    MPID_Datatype_add_ref(origin_dtp);
                }
                if (target_dtp != NULL) {
                    MPID_Datatype_add_ref(target_dtp);
                }
                if (result_dtp != NULL) {
                    MPID_Datatype_add_ref(result_dtp);
                }
            }

897 898 899 900
            MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
            MPID_Datatype_is_contig(target_datatype, &is_target_contig);
            MPID_Datatype_is_contig(result_datatype, &is_result_contig);

901
            /* Judge if we can use IMMED data packet */
902
            if (!new_ptr->is_dt && is_origin_contig && is_target_contig && is_result_contig) {
903
                MPIU_Assign_trunc(immed_len,
904
                                  (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size,
905 906 907 908 909
                                  size_t);
                if (orig_len <= immed_len)
                    use_immed_pkt = TRUE;
            }

910
            /* Judge if this operation is a piggyback candidate */
Xin Zhao's avatar
Xin Zhao committed
911
            if (!new_ptr->is_dt) {
912 913 914
                /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
                 * for origin, target and result data. We should extend this optimization to derived
                 * datatypes as well. */
915
                if (orig_len <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
916
                    new_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
917
            }
918

919
            /************** Setting packet struct areas in operation ****************/
920

921
            get_accum_pkt = &(new_ptr->pkt.get_accum);
922 923 924 925 926 927 928 929

            if (use_immed_pkt) {
                MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM_IMMED);
            }
            else {
                MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM);
            }