/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidrma.h"

/* Timer PVAR (presumably defined in another translation unit, hence the
 * DECL_EXTERN form).  The operation routines below start and stop it around
 * the code that fills in a queued RMA operation structure. */
MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_rmaqueue_set);

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS
      category    : CH3
      type        : int
      default     : 100
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of number of posted operations
          when starting poking progress in operation routines.
          When the value is negative, runtime never pokes progress
          engine in operation routines; when the value is zero,
          runtime always pokes progress engine in operation
          routines; when the value is larger than zero, runtime
          starts to poke progress engine when number of posted
          operations reaches that value.

    - name        : MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE
      category    : CH3
      type        : int
      default     : 65536
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of data size of a RMA operation
          which can be piggybacked with a LOCK message. It is
          always a positive value and should not be smaller
          than MPIDI_RMA_IMMED_BYTES.
          If user sets it as a small value, for middle and large
          data size, we will lose performance because of always
          waiting for round-trip of LOCK synchronization; if
          user sets it as a large value, we need to consume
          more memory on target side to buffer this lock request
          when lock is not satisfied.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

54
#undef FUNCNAME
55
#define FUNCNAME MPIDI_CH3I_Put
56 57
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
58 59 60 61
int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
62 63
{
    int mpi_errno = MPI_SUCCESS;
64
    int dt_contig ATTRIBUTE((unused)), rank;
65
    MPID_Datatype *dtp;
Pavan Balaji's avatar
Pavan Balaji committed
66
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
67
    MPIDI_msg_sz_t data_sz;
68
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
69
    int made_progress = 0;
70
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_PUT);
71

72
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_PUT);
73

74 75 76
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

77 78 79 80
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

81 82
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

83
    if (data_sz == 0) {
84
        goto fn_exit;
85 86
    }

87
    rank = win_ptr->comm_ptr->rank;
88 89 90

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
91
        /* check if target is local and shared memory is allocated on window,
92
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
93 94

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
95 96 97 98 99
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
100 101 102 103
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

104
    /* If the put is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
105
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
106
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
107 108
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
109 110
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
111 112 113 114 115 116

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
117
    }
118
    else {
119
        MPIDI_RMA_Op_t *op_ptr = NULL;
120
        MPIDI_CH3_Pkt_put_t *put_pkt = NULL;
121 122 123
        MPI_Aint origin_type_size;
        size_t immed_len, len;
        int use_immed_pkt = FALSE;
124
        int is_origin_contig, is_target_contig;
125

126
        /* queue it up */
127
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
128 129
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
130

Xin Zhao's avatar
Xin Zhao committed
131 132
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

133
        /******************** Setting operation struct areas ***********************/
134

135
        /* FIXME: For contig and very short operations, use a streamlined op */
136 137 138 139
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
140 141 142

        /* Remember user request */
        if (ureq) {
143
            op_ptr->ureq = ureq;
144
        }
145 146 147 148 149 150

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
151
            op_ptr->is_dt = 1;
152 153 154 155
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
156
            op_ptr->is_dt = 1;
157
        }
158

159 160 161
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

162 163 164 165
        MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
        MPIU_Assign_trunc(len, origin_count * origin_type_size, size_t);

        /* Judge if we can use IMMED data packet */
166
        if (!op_ptr->is_dt && is_origin_contig && is_target_contig) {
167
            MPIU_Assign_trunc(immed_len,
168
                              (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size,
169 170 171 172 173
                              size_t);
            if (len <= immed_len)
                use_immed_pkt = TRUE;
        }

174
        /* Judge if this operation is an piggyback candidate */
175
        if (!op_ptr->is_dt) {
176 177 178
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
179
            if (len <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
180
                op_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
181 182
        }

183 184
        /************** Setting packet struct areas in operation ****************/

185
        put_pkt = &(op_ptr->pkt.put);
186 187 188 189 190 191 192 193

        if (use_immed_pkt) {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT_IMMED);
        }
        else {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
        }

194 195
        put_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
196 197
        put_pkt->count = target_count;
        put_pkt->datatype = target_datatype;
198
        put_pkt->info.dataloop_size = 0;
199
        put_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
200 201
        put_pkt->source_win_handle = win_ptr->handle;
        put_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
202
        if (use_immed_pkt) {
203
            void *src = (void *) origin_addr, *dest = (void *) (put_pkt->info.data);
204
            mpi_errno = immed_copy(src, dest, len);
205 206
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
207
        }
208

Xin Zhao's avatar
Xin Zhao committed
209 210
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

211
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
212 213 214
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

215
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
216 217
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
218 219

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
220
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
221 222 223 224
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
225 226 227 228 229 230 231 232 233 234 235 236 237 238

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
239 240 241
    }

  fn_exit:
242
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PUT);
243 244 245 246 247 248 249 250 251
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

#undef FUNCNAME
252
#define FUNCNAME MPIDI_CH3I_Get
253 254
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
255 256 257 258
int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
259 260 261
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
262
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
263
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
264
    MPID_Datatype *dtp;
265
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
266
    int made_progress = 0;
267
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET);
268

269
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET);
270

271 272 273
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

274 275 276 277
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

278
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);
279

280
    if (data_sz == 0) {
281
        goto fn_exit;
282 283
    }

284
    rank = win_ptr->comm_ptr->rank;
Xin Zhao's avatar
Xin Zhao committed
285

286 287
    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
288
        /* check if target is local and shared memory is allocated on window,
289
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
290 291

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
292 293 294 295 296
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
297 298 299
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }
300

301
    /* If the get is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
302
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
303
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
304 305
        mpi_errno = MPIDI_CH3I_Shm_get_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
306 307
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
308 309 310 311 312 313

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
314
    }
315
    else {
316
        MPIDI_RMA_Op_t *op_ptr = NULL;
317
        MPIDI_CH3_Pkt_get_t *get_pkt = NULL;
318 319 320
        MPI_Aint target_type_size;
        size_t immed_len, len;
        int use_immed_resp_pkt = FALSE;
321
        int is_origin_contig, is_target_contig;
322

323
        /* queue it up */
324
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
325 326
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
327

Xin Zhao's avatar
Xin Zhao committed
328 329
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

330
        /******************** Setting operation struct areas ***********************/
331

332
        /* FIXME: For contig and very short operations, use a streamlined op */
333 334 335 336
        op_ptr->origin_addr = origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
337 338 339

        /* Remember user request */
        if (ureq) {
340
            op_ptr->ureq = ureq;
341
        }
342 343 344 345 346 347

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
348
            op_ptr->is_dt = 1;
349 350 351 352
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
353
            op_ptr->is_dt = 1;
354
        }
355

356 357 358
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

359 360 361 362
        MPID_Datatype_get_size_macro(target_datatype, target_type_size);
        MPIU_Assign_trunc(len, target_count * target_type_size, size_t);

        /* Judge if we can use IMMED data response packet */
363
        if (!op_ptr->is_dt && is_origin_contig && is_target_contig) {
364
            MPIU_Assign_trunc(immed_len,
365
                              (MPIDI_RMA_IMMED_BYTES / target_type_size) * target_type_size,
366 367 368 369 370
                              size_t);
            if (len <= immed_len)
                use_immed_resp_pkt = TRUE;
        }

371
        /* Judge if this operation is an piggyback candidate. */
372
        if (!op_ptr->is_dt) {
373 374 375
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
376
            op_ptr->piggyback_lock_candidate = 1;
377 378
        }

379 380
        /************** Setting packet struct areas in operation ****************/

381
        get_pkt = &(op_ptr->pkt.get);
382
        MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
383 384
        get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
385 386
        get_pkt->count = target_count;
        get_pkt->datatype = target_datatype;
387
        get_pkt->info.dataloop_size = 0;
388
        get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
389
        get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
390 391
        if (use_immed_resp_pkt)
            get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
392

Xin Zhao's avatar
Xin Zhao committed
393 394
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

395
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
396 397 398
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

399
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
400 401
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
402 403

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
404
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
405 406 407 408
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
409 410 411 412 413 414 415 416 417 418 419 420 421 422

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
423 424 425
    }

  fn_exit:
426
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_GET);
427 428 429 430 431 432 433 434 435 436
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
437
#define FUNCNAME MPIDI_CH3I_Accumulate
438 439
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
440 441 442 443
int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                          origin_datatype, int target_rank, MPI_Aint target_disp,
                          int target_count, MPI_Datatype target_datatype, MPI_Op op,
                          MPID_Win * win_ptr, MPID_Request * ureq)
444
{
445
    int mpi_errno = MPI_SUCCESS;
446
    MPIDI_msg_sz_t data_sz;
447
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
448
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
449
    MPID_Datatype *dtp;
450
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
451
    int made_progress = 0;
452
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
453

454
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
455

456 457 458
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

459 460 461 462
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

463 464
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

465
    if (data_sz == 0) {
466
        goto fn_exit;
467
    }
468

469
    rank = win_ptr->comm_ptr->rank;
470 471 472

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
473
        /* check if target is local and shared memory is allocated on window,
474
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
475 476

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
477 478 479 480 481
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
482 483 484 485
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

486
    /* Do =! rank first (most likely branch?) */
Xin Zhao's avatar
Xin Zhao committed
487
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
488 489 490 491 492 493
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count, target_datatype,
                                          op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
494 495 496 497 498 499

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
500
    }
501
    else {
502
        MPIDI_RMA_Op_t *op_ptr = NULL;
503
        MPIDI_CH3_Pkt_accum_t *accum_pkt = NULL;
504 505 506
        MPI_Aint origin_type_size;
        size_t immed_len, len;
        int use_immed_pkt = FALSE;
507
        int is_origin_contig, is_target_contig;
Xin Zhao's avatar
Xin Zhao committed
508 509 510 511
        MPI_Aint stream_elem_count, stream_unit_count;
        MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
        MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL;
        int i;
512

513
        /* queue it up */
514
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
515 516
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
517

Xin Zhao's avatar
Xin Zhao committed
518 519
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

520
        /******************** Setting operation struct areas ***********************/
521

522 523 524 525
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
526 527 528

        /* Remember user request */
        if (ureq) {
529
            op_ptr->ureq = ureq;
530
        }
531 532 533 534

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
535
            MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
536
            op_ptr->is_dt = 1;
537 538
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
539
            MPID_Datatype_get_ptr(target_datatype, target_dtp);
540
            op_ptr->is_dt = 1;
541
        }
542

543 544 545
        MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
        MPIU_Assign_trunc(len, origin_count * origin_type_size, size_t);

Xin Zhao's avatar
Xin Zhao committed
546 547 548 549 550 551 552
        /* Get size and count for predefined datatype elements */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            predefined_dtp_size = origin_type_size;
            predefined_dtp_count = origin_count;
            MPID_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
        }
        else {
553 554
            MPIU_Assert(origin_dtp->basic_type != MPI_DATATYPE_NULL);
            MPID_Datatype_get_size_macro(origin_dtp->basic_type, predefined_dtp_size);
Xin Zhao's avatar
Xin Zhao committed
555
            predefined_dtp_count = len / predefined_dtp_size;
556
            MPID_Datatype_get_extent_macro(origin_dtp->basic_type, predefined_dtp_extent);
Xin Zhao's avatar
Xin Zhao committed
557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575
        }
        MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                    predefined_dtp_extent > 0);

        /* Calculate number of predefined elements in each stream unit, and
         * total number of stream units. */
        stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
        stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
        MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);

        for (i = 0; i < stream_unit_count; i++) {
            if (origin_dtp != NULL) {
                MPID_Datatype_add_ref(origin_dtp);
            }
            if (target_dtp != NULL) {
                MPID_Datatype_add_ref(target_dtp);
            }
        }

576 577 578
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

579
        /* Judge if we can use IMMED data packet */
580
        if (!op_ptr->is_dt && is_origin_contig && is_target_contig) {
581
            MPIU_Assign_trunc(immed_len,
582
                              (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size,
583 584 585 586 587
                              size_t);
            if (len <= immed_len)
                use_immed_pkt = TRUE;
        }

588
        /* Judge if this operation is an piggyback candidate. */
589
        if (!op_ptr->is_dt) {
590 591 592
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
593
            if (len <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
594
                op_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
595 596
        }

597 598
        /************** Setting packet struct areas in operation ****************/

599
        accum_pkt = &(op_ptr->pkt.accum);
600

601 602 603 604 605 606 607
        if (use_immed_pkt) {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE_IMMED);
        }
        else {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
        }

608 609
        accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
610 611
        accum_pkt->count = target_count;
        accum_pkt->datatype = target_datatype;
612
        accum_pkt->info.dataloop_size = 0;
613
        accum_pkt->op = op;
614
        accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
615 616
        accum_pkt->source_win_handle = win_ptr->handle;
        accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
617
        if (use_immed_pkt) {
618
            void *src = (void *) origin_addr, *dest = (void *) (accum_pkt->info.data);
619
            mpi_errno = immed_copy(src, dest, len);
620 621
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
622
        }
623

Xin Zhao's avatar
Xin Zhao committed
624 625
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

626
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
627 628 629
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

630
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
631 632
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
633 634

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
635
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
636 637 638 639
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
640 641 642 643 644 645 646 647 648 649 650 651 652 653

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
654 655
    }

656
  fn_exit:
657
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
658 659 660 661 662 663 664 665 666 667
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
668
#define FUNCNAME MPIDI_CH3I_Get_accumulate
669 670
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
671 672 673 674 675
int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                              MPI_Datatype origin_datatype, void *result_addr, int result_count,
                              MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                              int target_count, MPI_Datatype target_datatype, MPI_Op op,
                              MPID_Win * win_ptr, MPID_Request * ureq)
676 677 678 679 680 681 682 683
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int rank;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
684
    int made_progress = 0;
685
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
686

687
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
688

689 690 691
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they are
         * not within the same SHM region. Here we filter out ch3:sock by checking the shm_allocated flag
         * first, which is only set to TRUE when an SHM region is allocated in nemesis.
         * In the future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do != rank first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
                                              result_addr, result_count, result_datatype,
                                              target_rank, target_disp, target_count,
                                              target_datatype, op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
728 729 730 731 732 733

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
734 735
    }
    else {
736
        MPIDI_RMA_Op_t *op_ptr = NULL;
737 738

        /* Append the operation to the window's RMA ops queue */
739
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
740 741
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
742 743 744

        /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */

Xin Zhao's avatar
Xin Zhao committed
745 746
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

747 748
        if (op == MPI_NO_OP) {
            /* Convert GAcc to a Get */
749
            MPIDI_CH3_Pkt_get_t *get_pkt;
750 751 752
            MPI_Aint target_type_size;
            size_t len, immed_len;
            int use_immed_resp_pkt = FALSE;
753
            int is_result_contig, is_target_contig;
754 755

            /******************** Setting operation struct areas ***********************/
756

757 758 759 760
            op_ptr->origin_addr = result_addr;
            op_ptr->origin_count = result_count;
            op_ptr->origin_datatype = result_datatype;
            op_ptr->target_rank = target_rank;
761 762 763

            /* Remember user request */
            if (ureq) {
764
                op_ptr->ureq = ureq;
765
            }
766 767 768 769

            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
                MPID_Datatype_get_ptr(result_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
770
                op_ptr->is_dt = 1;
771 772 773 774
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
                MPID_Datatype_get_ptr(target_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
775
                op_ptr->is_dt = 1;
776
            }
777

778 779 780
            MPID_Datatype_get_size_macro(target_datatype, target_type_size);
            MPIU_Assign_trunc(len, target_count * target_type_size, size_t);

781 782 783
            MPID_Datatype_is_contig(result_datatype, &is_result_contig);
            MPID_Datatype_is_contig(target_datatype, &is_target_contig);

784
            /* Judge if we can use IMMED data response packet */
785
            if (!op_ptr->is_dt && is_result_contig && is_target_contig) {
786
                MPIU_Assign_trunc(immed_len,
787
                                  (MPIDI_RMA_IMMED_BYTES / target_type_size) * target_type_size,
788 789 790 791 792
                                  size_t);
                if (len <= immed_len)
                    use_immed_resp_pkt = TRUE;
            }

793
            /* Judge if this operation is a piggyback candidate */
794
            if (!op_ptr->is_dt) {
795 796 797
                /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
                 * for both origin and target data. We should extend this optimization to derived
                 * datatypes as well. */
798
                op_ptr->piggyback_lock_candidate = 1;
799
            }
800 801 802

            /************** Setting packet struct areas in operation ****************/

803
            get_pkt = &(op_ptr->pkt.get);
804
            MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
805 806
            get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
                win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
807 808
            get_pkt->count = target_count;
            get_pkt->datatype = target_datatype;
809
            get_pkt->info.dataloop_size = 0;
810
            get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
811
            get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
812 813
            if (use_immed_resp_pkt == TRUE)
                get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
814 815 816
        }

        else {
817
            MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt;
818 819 820
            MPI_Aint origin_type_size;
            size_t immed_len, orig_len;
            int use_immed_pkt = FALSE;
821
            int is_origin_contig, is_target_contig, is_result_contig;
Xin Zhao's avatar
Xin Zhao committed
822 823 824 825
            MPI_Aint stream_elem_count, stream_unit_count;
            MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
            MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL, *result_dtp = NULL;
            int i;
826 827

            /******************** Setting operation struct areas ***********************/
828

829 830 831 832 833 834 835
            op_ptr->origin_addr = (void *) origin_addr;
            op_ptr->origin_count = origin_count;
            op_ptr->origin_datatype = origin_datatype;
            op_ptr->result_addr = result_addr;
            op_ptr->result_count = result_count;
            op_ptr->result_datatype = result_datatype;
            op_ptr->target_rank = target_rank;
836 837 838

            /* Remember user request */
            if (ureq) {
839
                op_ptr->ureq = ureq;
840
            }
841 842 843 844

            /* if source or target datatypes are derived, increment their
             * reference counts */
            if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
845
                MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
846
                op_ptr->is_dt = 1;
847 848
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
849
                MPID_Datatype_get_ptr(result_datatype, target_dtp);
850
                op_ptr->is_dt = 1;
851 852
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
853
                MPID_Datatype_get_ptr(target_datatype, result_dtp);
854
                op_ptr->is_dt = 1;
855
            }
Xin Zhao's avatar
Xin Zhao committed
856

857 858 859
            MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
            MPIU_Assign_trunc(orig_len, origin_count * origin_type_size, size_t);

Xin Zhao's avatar
Xin Zhao committed
860 861 862 863 864 865 866
            /* Get size and count for predefined datatype elements */
            if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
                predefined_dtp_size = origin_type_size;
                predefined_dtp_count = origin_count;
                MPID_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
            }
            else {
867 868
                MPIU_Assert(origin_dtp->basic_type != MPI_DATATYPE_NULL);
                MPID_Datatype_get_size_macro(origin_dtp->basic_type, predefined_dtp_size);
Xin Zhao's avatar
Xin Zhao committed
869
                predefined_dtp_count = orig_len / predefined_dtp_size;
870
                MPID_Datatype_get_extent_macro(origin_dtp->basic_type, predefined_dtp_extent);
Xin Zhao's avatar
Xin Zhao committed
871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886