ch3u_rma_ops.c 54.5 KB
Newer Older
1
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
2 3 4 5 6 7 8
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidrma.h"

Xin Zhao's avatar
Xin Zhao committed
9 10
MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_rmaqueue_set);

11 12 13
#define MPIDI_PASSIVE_TARGET_DONE_TAG  348297
#define MPIDI_PASSIVE_TARGET_RMA_TAG 563924

14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS
      category    : CH3
      type        : int
      default     : 100
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of number of posted operations
          when starting poking progress in operation routines.
          When the value is negative, runtime never pokes progress
          engine in operation routines; when the value is zero,
          runtime always pokes progress engine in operation
          routines; when the value is larger than zero, runtime
          starts to poke progress engine when number of posted
          operations reaches that value.

35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
    - name        : MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE
      category    : CH3
      type        : int
      default     : 65536
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of data size of a RMA operation
          which can be piggybacked with a LOCK message. It is
          always a positive value and should not be smaller
          than MPIDI_RMA_IMMED_BYTES.
          If user sets it as a small value, for middle and large
          data size, we will lose performance because of always
          waiting for round-trip of LOCK synchronization; if
          user sets it as a large value, we need to consume
          more memory on target side to buffer this lock request
          when lock is not satisfied.

54 55 56
=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

57
#undef FUNCNAME
58
#define FUNCNAME MPIDI_CH3I_Put
59 60
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
61 62 63 64
int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
65 66
{
    int mpi_errno = MPI_SUCCESS;
67
    int dt_contig ATTRIBUTE((unused)), rank;
68
    MPID_Datatype *dtp;
Pavan Balaji's avatar
Pavan Balaji committed
69
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
70
    MPIDI_msg_sz_t data_sz;
71
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
72
    int made_progress = 0;
73
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_PUT);
74

75
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_PUT);
76

77 78 79
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

80 81 82 83
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

84 85
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

86
    if (data_sz == 0) {
87
        goto fn_exit;
88 89
    }

90
    rank = win_ptr->comm_ptr->rank;
91 92 93

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
94
        /* check if target is local and shared memory is allocated on window,
95
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
96 97

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
98 99 100 101 102
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
103 104 105 106
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

107
    /* If the put is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
108
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
109
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
110 111
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
112 113
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
114 115 116 117 118 119

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
120
    }
121
    else {
122
        MPIDI_RMA_Op_t *new_ptr = NULL;
123
        MPIDI_CH3_Pkt_put_t *put_pkt = NULL;
124 125 126
        MPI_Aint origin_type_size;
        size_t immed_len, len;
        int use_immed_pkt = FALSE;
127
        int is_origin_contig, is_target_contig;
128

129
        /* queue it up */
130
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
131 132
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
133

Xin Zhao's avatar
Xin Zhao committed
134 135
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

136
        /******************** Setting operation struct areas ***********************/
137

138 139 140 141 142
        /* FIXME: For contig and very short operations, use a streamlined op */
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;
143 144 145 146 147

        /* Remember user request */
        if (ureq) {
            new_ptr->ureq = ureq;
        }
148 149 150 151 152 153

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
154
            new_ptr->is_dt = 1;
155 156 157 158
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
159
            new_ptr->is_dt = 1;
160
        }
161

162 163 164
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

165 166 167 168
        MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
        MPIU_Assign_trunc(len, origin_count * origin_type_size, size_t);

        /* Judge if we can use IMMED data packet */
169
        if (!new_ptr->is_dt && is_origin_contig && is_target_contig) {
170
            MPIU_Assign_trunc(immed_len,
171
                              (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size,
172 173 174 175 176
                              size_t);
            if (len <= immed_len)
                use_immed_pkt = TRUE;
        }

177
        /* Judge if this operation is an piggyback candidate */
Xin Zhao's avatar
Xin Zhao committed
178
        if (!new_ptr->is_dt) {
179 180 181
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
182
            if (len <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
183
                new_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
184 185
        }

186 187 188
        /************** Setting packet struct areas in operation ****************/

        put_pkt = &(new_ptr->pkt.put);
189 190 191 192 193 194 195 196

        if (use_immed_pkt) {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT_IMMED);
        }
        else {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
        }

197 198
        put_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
199 200
        put_pkt->count = target_count;
        put_pkt->datatype = target_datatype;
201
        put_pkt->info.dataloop_size = 0;
202
        put_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
203 204
        put_pkt->source_win_handle = win_ptr->handle;
        put_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
205
        if (use_immed_pkt) {
206
            void *src = (void *) origin_addr, *dest = (void *) (put_pkt->info.data);
207
            mpi_errno = immed_copy(src, dest, len);
208 209
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
210
        }
211

Xin Zhao's avatar
Xin Zhao committed
212 213
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

214 215 216 217
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

218
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
219 220
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
221 222

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
223
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
224 225 226 227
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
228 229 230 231 232 233 234 235 236 237 238 239 240 241

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
242 243 244
    }

  fn_exit:
245
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PUT);
246 247 248 249 250 251 252 253 254
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

#undef FUNCNAME
255
#define FUNCNAME MPIDI_CH3I_Get
256 257
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
258 259 260 261
int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
262 263 264
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
265
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
266
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
267
    MPID_Datatype *dtp;
268
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
269
    int made_progress = 0;
270
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET);
271

272
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET);
273

274 275 276
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

277 278 279 280
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

281
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);
282

283
    if (data_sz == 0) {
284
        goto fn_exit;
285 286
    }

287
    rank = win_ptr->comm_ptr->rank;
Xin Zhao's avatar
Xin Zhao committed
288

289 290
    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
291
        /* check if target is local and shared memory is allocated on window,
292
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
293 294

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
295 296 297 298 299
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
300 301 302
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }
303

304
    /* If the get is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
305
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
306
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
307 308
        mpi_errno = MPIDI_CH3I_Shm_get_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
309 310
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
311 312 313 314 315 316

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
317
    }
318
    else {
319
        MPIDI_RMA_Op_t *new_ptr = NULL;
320
        MPIDI_CH3_Pkt_get_t *get_pkt = NULL;
321 322 323
        MPI_Aint target_type_size;
        size_t immed_len, len;
        int use_immed_resp_pkt = FALSE;
324
        int is_origin_contig, is_target_contig;
325

326
        /* queue it up */
327
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
328 329
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
330

Xin Zhao's avatar
Xin Zhao committed
331 332
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

333
        /******************** Setting operation struct areas ***********************/
334

335 336 337 338 339
        /* FIXME: For contig and very short operations, use a streamlined op */
        new_ptr->origin_addr = origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;
340 341 342 343 344

        /* Remember user request */
        if (ureq) {
            new_ptr->ureq = ureq;
        }
345 346 347 348 349 350

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
351
            new_ptr->is_dt = 1;
352 353 354 355
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
356
            new_ptr->is_dt = 1;
357
        }
358

359 360 361
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

362 363 364 365
        MPID_Datatype_get_size_macro(target_datatype, target_type_size);
        MPIU_Assign_trunc(len, target_count * target_type_size, size_t);

        /* Judge if we can use IMMED data response packet */
366
        if (!new_ptr->is_dt && is_origin_contig && is_target_contig) {
367
            MPIU_Assign_trunc(immed_len,
368
                              (MPIDI_RMA_IMMED_BYTES / target_type_size) * target_type_size,
369 370 371 372 373
                              size_t);
            if (len <= immed_len)
                use_immed_resp_pkt = TRUE;
        }

374
        /* Judge if this operation is an piggyback candidate. */
375
        if (!new_ptr->is_dt) {
376 377 378
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
379 380 381
            new_ptr->piggyback_lock_candidate = 1;
        }

382 383 384 385
        /************** Setting packet struct areas in operation ****************/

        get_pkt = &(new_ptr->pkt.get);
        MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
386 387
        get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
388 389
        get_pkt->count = target_count;
        get_pkt->datatype = target_datatype;
390
        get_pkt->info.dataloop_size = 0;
391
        get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
392
        get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
393 394
        if (use_immed_resp_pkt)
            get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
395

Xin Zhao's avatar
Xin Zhao committed
396 397
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

398 399 400 401
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

402
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
403 404
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
405 406

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
407
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
408 409 410 411
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
412 413 414 415 416 417 418 419 420 421 422 423 424 425

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
426 427 428
    }

  fn_exit:
429
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_GET);
430 431 432 433 434 435 436 437 438 439
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
440
#define FUNCNAME MPIDI_CH3I_Accumulate
441 442
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
443 444 445 446
int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                          origin_datatype, int target_rank, MPI_Aint target_disp,
                          int target_count, MPI_Datatype target_datatype, MPI_Op op,
                          MPID_Win * win_ptr, MPID_Request * ureq)
447
{
448
    int mpi_errno = MPI_SUCCESS;
449
    MPIDI_msg_sz_t data_sz;
450
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
451
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
452
    MPID_Datatype *dtp;
453
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
454
    int made_progress = 0;
455
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
456

457
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
458

459 460 461
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

462 463 464 465
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

466 467
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

468
    if (data_sz == 0) {
469
        goto fn_exit;
470
    }
471

472
    rank = win_ptr->comm_ptr->rank;
473 474 475

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
476
        /* check if target is local and shared memory is allocated on window,
477
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
478 479

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
480 481 482 483 484
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
485 486 487 488
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

489
    /* Do =! rank first (most likely branch?) */
Xin Zhao's avatar
Xin Zhao committed
490
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
491 492 493 494 495 496
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count, target_datatype,
                                          op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
497 498 499 500 501 502

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
503
    }
504
    else {
505
        MPIDI_RMA_Op_t *new_ptr = NULL;
506
        MPIDI_CH3_Pkt_accum_t *accum_pkt = NULL;
507 508 509
        MPI_Aint origin_type_size;
        size_t immed_len, len;
        int use_immed_pkt = FALSE;
510
        int is_origin_contig, is_target_contig;
511

512
        /* queue it up */
513
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
514 515
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
516

Xin Zhao's avatar
Xin Zhao committed
517 518
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

519
        /******************** Setting operation struct areas ***********************/
520

521 522 523 524
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;
525 526 527 528 529

        /* Remember user request */
        if (ureq) {
            new_ptr->ureq = ureq;
        }
530 531 532 533 534 535

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
536
            new_ptr->is_dt = 1;
537 538 539 540
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
541
            new_ptr->is_dt = 1;
542
        }
543

544 545 546
        MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
        MPIU_Assign_trunc(len, origin_count * origin_type_size, size_t);

547 548 549
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

550
        /* Judge if we can use IMMED data packet */
551
        if (!new_ptr->is_dt && is_origin_contig && is_target_contig) {
552
            MPIU_Assign_trunc(immed_len,
553
                              (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size,
554 555 556 557 558
                              size_t);
            if (len <= immed_len)
                use_immed_pkt = TRUE;
        }

559
        /* Judge if this operation is an piggyback candidate. */
Xin Zhao's avatar
Xin Zhao committed
560
        if (!new_ptr->is_dt) {
561 562 563
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
564
            if (len <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
565
                new_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
566 567
        }

568 569 570 571
        /************** Setting packet struct areas in operation ****************/

        accum_pkt = &(new_ptr->pkt.accum);

572 573 574 575 576 577 578
        if (use_immed_pkt) {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE_IMMED);
        }
        else {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
        }

579 580
        accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
581 582
        accum_pkt->count = target_count;
        accum_pkt->datatype = target_datatype;
583
        accum_pkt->info.dataloop_size = 0;
584
        accum_pkt->op = op;
585
        accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
586 587
        accum_pkt->source_win_handle = win_ptr->handle;
        accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
588
        if (use_immed_pkt) {
589
            void *src = (void *) origin_addr, *dest = (void *) (accum_pkt->info.data);
590
            mpi_errno = immed_copy(src, dest, len);
591 592
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
593
        }
594

Xin Zhao's avatar
Xin Zhao committed
595 596
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

597 598 599 600
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

601
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
602 603
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
604 605

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
606
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
607 608 609 610
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
611 612 613 614 615 616 617 618 619 620 621 622 623 624

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
625 626
    }

627
  fn_exit:
628
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
629 630 631 632 633 634 635 636 637 638
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
639
#define FUNCNAME MPIDI_CH3I_Get_accumulate
640 641
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
642 643 644 645 646
int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                              MPI_Datatype origin_datatype, void *result_addr, int result_count,
                              MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                              int target_count, MPI_Datatype target_datatype, MPI_Op op,
                              MPID_Win * win_ptr, MPID_Request * ureq)
647 648 649 650 651 652 653 654
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int rank;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
655
    int made_progress = 0;
656
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
657

658
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
659

660 661 662
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they are
         * not within the same SHM region. Here we filter out ch3:sock by checking the shm_allocated flag
         * first, which is only set to TRUE when the SHM region is allocated in nemesis.
         * In the future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do != rank first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
                                              result_addr, result_count, result_datatype,
                                              target_rank, target_disp, target_count,
                                              target_datatype, op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
699 700 701 702 703 704

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
705 706 707 708 709
    }
    else {
        MPIDI_RMA_Op_t *new_ptr = NULL;

        /* Append the operation to the window's RMA ops queue */
710
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
711 712
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
713 714 715

        /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */

Xin Zhao's avatar
Xin Zhao committed
716 717
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

718 719
        if (op == MPI_NO_OP) {
            /* Convert GAcc to a Get */
720
            MPIDI_CH3_Pkt_get_t *get_pkt;
721 722 723
            MPI_Aint target_type_size;
            size_t len, immed_len;
            int use_immed_resp_pkt = FALSE;
724
            int is_result_contig, is_target_contig;
725 726

            /******************** Setting operation struct areas ***********************/
727 728 729 730 731

            new_ptr->origin_addr = result_addr;
            new_ptr->origin_count = result_count;
            new_ptr->origin_datatype = result_datatype;
            new_ptr->target_rank = target_rank;
732 733 734 735 736

            /* Remember user request */
            if (ureq) {
                new_ptr->ureq = ureq;
            }
737 738 739 740 741 742 743 744 745 746 747

            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
                MPID_Datatype_get_ptr(result_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
                MPID_Datatype_get_ptr(target_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
748

749 750 751
            MPID_Datatype_get_size_macro(target_datatype, target_type_size);
            MPIU_Assign_trunc(len, target_count * target_type_size, size_t);

752 753 754
            MPID_Datatype_is_contig(result_datatype, &is_result_contig);
            MPID_Datatype_is_contig(target_datatype, &is_target_contig);

755
            /* Judge if we can use IMMED data response packet */
756
            if (!new_ptr->is_dt && is_result_contig && is_target_contig) {
757
                MPIU_Assign_trunc(immed_len,
758
                                  (MPIDI_RMA_IMMED_BYTES / target_type_size) * target_type_size,
759 760 761 762 763
                                  size_t);
                if (len <= immed_len)
                    use_immed_resp_pkt = TRUE;
            }

764
            /* Judge if this operation is a piggyback candidate */
765
            if (!new_ptr->is_dt) {
766 767 768
                /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
                 * for both origin and target data. We should extend this optimization to derived
                 * datatypes as well. */
769 770
                new_ptr->piggyback_lock_candidate = 1;
            }
771 772 773 774 775

            /************** Setting packet struct areas in operation ****************/

            get_pkt = &(new_ptr->pkt.get);
            MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
776 777
            get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
                win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
778 779
            get_pkt->count = target_count;
            get_pkt->datatype = target_datatype;
780
            get_pkt->info.dataloop_size = 0;
781
            get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
782
            get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
783 784
            if (use_immed_resp_pkt == TRUE)
                get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
785 786 787
        }

        else {
788
            MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt;
789 790 791
            MPI_Aint origin_type_size;
            size_t immed_len, orig_len;
            int use_immed_pkt = FALSE;
792
            int is_origin_contig, is_target_contig, is_result_contig;
793 794

            /******************** Setting operation struct areas ***********************/
795 796 797 798 799 800 801 802

            new_ptr->origin_addr = (void *) origin_addr;
            new_ptr->origin_count = origin_count;
            new_ptr->origin_datatype = origin_datatype;
            new_ptr->result_addr = result_addr;
            new_ptr->result_count = result_count;
            new_ptr->result_datatype = result_datatype;
            new_ptr->target_rank = target_rank;
803 804 805 806 807

            /* Remember user request */
            if (ureq) {
                new_ptr->ureq = ureq;
            }
808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825

            /* if origin, result or target datatypes are derived, increment
             * their reference counts */
            if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
                MPID_Datatype_get_ptr(origin_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
                MPID_Datatype_get_ptr(result_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
                MPID_Datatype_get_ptr(target_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
                new_ptr->is_dt = 1;
            }
Xin Zhao's avatar
Xin Zhao committed
826

827 828 829
            MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
            MPIU_Assign_trunc(orig_len, origin_count * origin_type_size, size_t);

830 831 832 833
            MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
            MPID_Datatype_is_contig(target_datatype, &is_target_contig);
            MPID_Datatype_is_contig(result_datatype, &is_result_contig);

834
            /* Judge if we can use IMMED data packet */
835
            if (!new_ptr->is_dt && is_origin_contig && is_target_contig && is_result_contig) {
836
                MPIU_Assign_trunc(immed_len,
837
                                  (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size,
838 839 840 841 842
                                  size_t);
                if (orig_len <= immed_len)
                    use_immed_pkt = TRUE;
            }

843
            /* Judge if this operation is a piggyback candidate */
Xin Zhao's avatar
Xin Zhao committed
844
            if (!new_ptr->is_dt) {
845 846 847
                /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
                 * for origin, target and result data. We should extend this optimization to derived
                 * datatypes as well. */
848
                if (orig_len <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
849
                    new_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
850
            }
851

852
            /************** Setting packet struct areas in operation ****************/
853

854
            get_accum_pkt = &(new_ptr->pkt.get_accum);
855 856 857 858 859 860 861 862

            if (use_immed_pkt) {
                MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM_IMMED);
            }
            else {
                MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM);
            }

863 864
            get_accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
                win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
865 866
            get_accum_pkt->count = target_count;
            get_accum_pkt->datatype = target_datatype;
867
            get_accum_pkt->info.dataloop_size = 0;
868
            get_accum_pkt->op = op;
869
            get_accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
870
            get_accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
871
            if (use_immed_pkt) {
872
                void *src = (void *) origin_addr, *dest = (void *) (get_accum_pkt->info.data);
873
                mpi_errno = immed_copy(src, dest, orig_len);
874 875
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
876
            }
877 878
        }

Xin Zhao's avatar
Xin Zhao committed
879 880
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

881 882 883 884
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

885
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
886 887
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
888 889

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
890
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
891 892 893 894
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
895 896 897 898 899 900 901 902 903 904 905 906 907 908

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
909 910
    }

911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945