/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidrma.h"

/* Extern declaration of the RMA performance-variable timer rma_rmaqueue_set.
 * The operation routines in this file start and stop it around the code that
 * fills in an operation's struct and packet fields. */
MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_rmaqueue_set);

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS
      category    : CH3
      type        : int
      default     : 100
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of number of posted operations
          when starting poking progress in operation routines.
          When the value is negative, runtime never pokes progress
          engine in operation routines; when the value is zero,
          runtime always pokes progress engine in operation
          routines; when the value is larger than zero, runtime
          starts to poke progress engine when number of posted
          operations reaches that value.

    - name        : MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE
      category    : CH3
      type        : int
      default     : 65536
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of data size of a RMA operation
          which can be piggybacked with a LOCK message. It is
          always a positive value and should not be smaller
          than MPIDI_RMA_IMMED_BYTES.
          If user sets it as a small value, for middle and large
          data size, we will lose performance because of always
          waiting for round-trip of LOCK synchronization; if
          user sets it as a large value, we need to consume
          more memory on target side to buffer this lock request
          when lock is not satisfied.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

54
#undef FUNCNAME
55
#define FUNCNAME MPIDI_CH3I_Put
56 57
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
58 59 60 61
int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
62 63
{
    int mpi_errno = MPI_SUCCESS;
64
    int dt_contig ATTRIBUTE((unused)), rank;
65
    MPID_Datatype *dtp;
Pavan Balaji's avatar
Pavan Balaji committed
66
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
67
    MPIDI_msg_sz_t data_sz;
68
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
69
    int made_progress = 0;
70
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_PUT);
71

72
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_PUT);
73

74 75 76
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

77 78 79 80
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

81 82
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

83
    if (data_sz == 0) {
84
        goto fn_exit;
85 86
    }

87
    rank = win_ptr->comm_ptr->rank;
88 89 90

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
91
        /* check if target is local and shared memory is allocated on window,
92
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
93 94

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
95 96 97 98 99
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
100 101 102 103
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

104
    /* If the put is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
105
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
106
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
107 108
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
109 110
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
111 112 113

        if (ureq) {
            /* Complete user request and release the ch3 ref */
114
            MPIDI_CH3U_Request_complete(ureq);
115
        }
116
    }
117
    else {
118
        MPIDI_RMA_Op_t *op_ptr = NULL;
119
        MPIDI_CH3_Pkt_put_t *put_pkt = NULL;
120
        int use_immed_pkt = FALSE;
121
        int is_origin_contig, is_target_contig;
122

123
        /* queue it up */
124
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
125 126
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
127

Xin Zhao's avatar
Xin Zhao committed
128 129
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

130
        /******************** Setting operation struct areas ***********************/
131

132
        /* FIXME: For contig and very short operations, use a streamlined op */
133 134 135 136
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
137 138

        /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
139
        op_ptr->ureq = ureq;
140 141 142 143 144 145 146 147 148 149 150

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
151

152 153 154
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

155
        /* Judge if we can use IMMED data packet */
156 157
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig) {
Xin Zhao's avatar
Xin Zhao committed
158
            if (data_sz <= MPIDI_RMA_IMMED_BYTES)
159 160 161
                use_immed_pkt = TRUE;
        }

162
        /* Judge if this operation is an piggyback candidate */
163 164
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
165 166 167
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
Xin Zhao's avatar
Xin Zhao committed
168
            if (data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
169
                op_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
170 171
        }

172 173
        /************** Setting packet struct areas in operation ****************/

174
        put_pkt = &(op_ptr->pkt.put);
175 176 177 178 179 180 181 182

        if (use_immed_pkt) {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT_IMMED);
        }
        else {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
        }

183 184
        put_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
185 186
        put_pkt->count = target_count;
        put_pkt->datatype = target_datatype;
187
        put_pkt->info.dataloop_size = 0;
188
        put_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
189 190
        put_pkt->source_win_handle = win_ptr->handle;
        put_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
191
        if (use_immed_pkt) {
192
            void *src = (void *) origin_addr, *dest = (void *) (put_pkt->info.data);
Xin Zhao's avatar
Xin Zhao committed
193
            mpi_errno = immed_copy(src, dest, data_sz);
194 195
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
196
        }
197

Xin Zhao's avatar
Xin Zhao committed
198 199
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

200
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
201 202 203
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

204
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
205 206
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
207 208

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
209
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
210 211 212 213
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
214 215 216 217 218 219 220 221 222 223 224 225 226 227

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
228 229 230
    }

  fn_exit:
231
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PUT);
232 233 234 235 236 237 238 239 240
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

#undef FUNCNAME
241
#define FUNCNAME MPIDI_CH3I_Get
242 243
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
244 245 246 247
int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
248 249 250
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
251
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
252
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
253
    MPID_Datatype *dtp;
254
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
255
    int made_progress = 0;
256
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET);
257

258
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET);
259

260 261 262
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

263 264 265 266
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

267
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);
268

269
    if (data_sz == 0) {
270
        goto fn_exit;
271 272
    }

273
    rank = win_ptr->comm_ptr->rank;
Xin Zhao's avatar
Xin Zhao committed
274

275 276
    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
277
        /* check if target is local and shared memory is allocated on window,
278
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
279 280

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
281 282 283 284 285
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
286 287 288
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }
289

290
    /* If the get is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
291
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
292
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
293 294
        mpi_errno = MPIDI_CH3I_Shm_get_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
295 296
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
297 298 299

        if (ureq) {
            /* Complete user request and release the ch3 ref */
300
            MPIDI_CH3U_Request_complete(ureq);
301
        }
302
    }
303
    else {
304
        MPIDI_RMA_Op_t *op_ptr = NULL;
305
        MPIDI_CH3_Pkt_get_t *get_pkt = NULL;
306
        MPI_Aint target_type_size;
307
        size_t len;
308
        int use_immed_resp_pkt = FALSE;
309
        int is_origin_contig, is_target_contig;
310

311
        /* queue it up */
312
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
313 314
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
315

Xin Zhao's avatar
Xin Zhao committed
316 317
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

318
        /******************** Setting operation struct areas ***********************/
319

320
        /* FIXME: For contig and very short operations, use a streamlined op */
321 322 323 324
        op_ptr->origin_addr = origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
325 326

        /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
327
        op_ptr->ureq = ureq;
328 329 330 331 332 333 334 335 336 337 338

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
339

340 341 342
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

343 344 345 346
        MPID_Datatype_get_size_macro(target_datatype, target_type_size);
        MPIU_Assign_trunc(len, target_count * target_type_size, size_t);

        /* Judge if we can use IMMED data response packet */
347 348
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig) {
349
            if (len <= MPIDI_RMA_IMMED_BYTES)
350 351 352
                use_immed_resp_pkt = TRUE;
        }

353
        /* Judge if this operation is an piggyback candidate. */
354 355
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
356 357 358
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
359
            op_ptr->piggyback_lock_candidate = 1;
360 361
        }

362 363
        /************** Setting packet struct areas in operation ****************/

364
        get_pkt = &(op_ptr->pkt.get);
365
        MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
366 367
        get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
368 369
        get_pkt->count = target_count;
        get_pkt->datatype = target_datatype;
370
        get_pkt->info.dataloop_size = 0;
371
        get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
372
        get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
373 374
        if (use_immed_resp_pkt)
            get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
375

Xin Zhao's avatar
Xin Zhao committed
376 377
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

378
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
379 380 381
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

382
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
383 384
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
385 386

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
387
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
388 389 390 391
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
392 393 394 395 396 397 398 399 400 401 402 403 404 405

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
406 407 408
    }

  fn_exit:
409
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_GET);
410 411 412 413 414 415 416 417 418 419
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
420
#define FUNCNAME MPIDI_CH3I_Accumulate
421 422
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
423 424 425 426
int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                          origin_datatype, int target_rank, MPI_Aint target_disp,
                          int target_count, MPI_Datatype target_datatype, MPI_Op op,
                          MPID_Win * win_ptr, MPID_Request * ureq)
427
{
428
    int mpi_errno = MPI_SUCCESS;
429
    MPIDI_msg_sz_t data_sz;
430
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
431
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
432
    MPID_Datatype *dtp;
433
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
434
    int made_progress = 0;
435
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
436

437
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
438

439 440 441
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

442 443 444 445
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

446 447
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

448
    if (data_sz == 0) {
449
        goto fn_exit;
450
    }
451

452
    rank = win_ptr->comm_ptr->rank;
453 454 455

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
456
        /* check if target is local and shared memory is allocated on window,
457
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
458 459

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
460 461 462 463 464
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
465 466 467 468
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

469
    /* Do =! rank first (most likely branch?) */
Xin Zhao's avatar
Xin Zhao committed
470
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
471 472 473 474 475 476
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count, target_datatype,
                                          op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
477 478 479

        if (ureq) {
            /* Complete user request and release the ch3 ref */
480
            MPIDI_CH3U_Request_complete(ureq);
481
        }
482
    }
483
    else {
484
        MPIDI_RMA_Op_t *op_ptr = NULL;
485
        MPIDI_CH3_Pkt_accum_t *accum_pkt = NULL;
486
        int use_immed_pkt = FALSE;
487
        int is_origin_contig, is_target_contig;
Xin Zhao's avatar
Xin Zhao committed
488 489 490 491
        MPI_Aint stream_elem_count, stream_unit_count;
        MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
        MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL;
        int i;
492

493
        /* queue it up */
494
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
495 496
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
497

Xin Zhao's avatar
Xin Zhao committed
498 499
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

500
        /******************** Setting operation struct areas ***********************/
501

502 503 504 505
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
506 507

        /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
508
        op_ptr->ureq = ureq;
509 510 511 512

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
513
            MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
514 515
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
516
            MPID_Datatype_get_ptr(target_datatype, target_dtp);
517
        }
518

Xin Zhao's avatar
Xin Zhao committed
519 520
        /* Get size and count for predefined datatype elements */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
521
            MPID_Datatype_get_size_macro(origin_datatype, predefined_dtp_size);
Xin Zhao's avatar
Xin Zhao committed
522 523 524 525
            predefined_dtp_count = origin_count;
            MPID_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
        }
        else {
526 527
            MPIU_Assert(origin_dtp->basic_type != MPI_DATATYPE_NULL);
            MPID_Datatype_get_size_macro(origin_dtp->basic_type, predefined_dtp_size);
Xin Zhao's avatar
Xin Zhao committed
528
            predefined_dtp_count = data_sz / predefined_dtp_size;
529
            MPID_Datatype_get_extent_macro(origin_dtp->basic_type, predefined_dtp_extent);
Xin Zhao's avatar
Xin Zhao committed
530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548
        }
        MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                    predefined_dtp_extent > 0);

        /* Calculate number of predefined elements in each stream unit, and
         * total number of stream units. */
        stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
        stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
        MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);

        for (i = 0; i < stream_unit_count; i++) {
            if (origin_dtp != NULL) {
                MPID_Datatype_add_ref(origin_dtp);
            }
            if (target_dtp != NULL) {
                MPID_Datatype_add_ref(target_dtp);
            }
        }

549 550 551
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

552
        /* Judge if we can use IMMED data packet */
553 554
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig) {
Xin Zhao's avatar
Xin Zhao committed
555
            if (data_sz <= MPIDI_RMA_IMMED_BYTES)
556 557 558
                use_immed_pkt = TRUE;
        }

559
        /* Judge if this operation is an piggyback candidate. */
560 561
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
562 563 564
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
Xin Zhao's avatar
Xin Zhao committed
565
            if (data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
566
                op_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
567 568
        }

569 570
        /************** Setting packet struct areas in operation ****************/

571
        accum_pkt = &(op_ptr->pkt.accum);
572

573 574 575 576 577 578 579
        if (use_immed_pkt) {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE_IMMED);
        }
        else {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
        }

580 581
        accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
582 583
        accum_pkt->count = target_count;
        accum_pkt->datatype = target_datatype;
584
        accum_pkt->info.dataloop_size = 0;
585
        accum_pkt->op = op;
586
        accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
587 588
        accum_pkt->source_win_handle = win_ptr->handle;
        accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
589
        if (use_immed_pkt) {
590
            void *src = (void *) origin_addr, *dest = (void *) (accum_pkt->info.data);
Xin Zhao's avatar
Xin Zhao committed
591
            mpi_errno = immed_copy(src, dest, data_sz);
592 593
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
594
        }
595

Xin Zhao's avatar
Xin Zhao committed
596 597
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

598
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
599 600 601
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

602
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
603 604
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
605 606

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
607
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
608 609 610 611
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
612 613 614 615 616 617 618 619 620 621 622 623 624 625

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
626 627
    }

628
  fn_exit:
629
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
630 631 632 633 634 635 636 637 638 639
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
640
#define FUNCNAME MPIDI_CH3I_Get_accumulate
641 642
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
643 644 645 646 647
int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                              MPI_Datatype origin_datatype, void *result_addr, int result_count,
                              MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                              int target_count, MPI_Datatype target_datatype, MPI_Op op,
                              MPID_Win * win_ptr, MPID_Request * ureq)
648 649 650 651 652 653 654 655
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int rank;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
656
    int made_progress = 0;
657
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
658

659
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
660

661 662 663
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they are
         * not within the same SHM region. Here we filter out ch3:sock by checking the shm_allocated flag
         * first, which is only set to TRUE when the SHM region is allocated in nemesis.
         * In the future we need to figure out a way to check if origin and target are in the same
         * "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do != rank first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
                                              result_addr, result_count, result_datatype,
                                              target_rank, target_disp, target_count,
                                              target_datatype, op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
700 701 702

        if (ureq) {
            /* Complete user request and release the ch3 ref */
703
            MPIDI_CH3U_Request_complete(ureq);
704
        }
705 706
    }
    else {
707
        MPIDI_RMA_Op_t *op_ptr = NULL;
708 709

        /* Append the operation to the window's RMA ops queue */
710
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
711 712
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
713 714 715

        /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */

Xin Zhao's avatar
Xin Zhao committed
716 717
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

718 719
        if (op == MPI_NO_OP) {
            /* Convert GAcc to a Get */
720
            MPIDI_CH3_Pkt_get_t *get_pkt;
721
            int use_immed_resp_pkt = FALSE;
722
            int is_result_contig, is_target_contig;
723 724

            /******************** Setting operation struct areas ***********************/
725

726 727 728 729
            op_ptr->origin_addr = result_addr;
            op_ptr->origin_count = result_count;
            op_ptr->origin_datatype = result_datatype;
            op_ptr->target_rank = target_rank;
730 731

            /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
732
            op_ptr->ureq = ureq;
733 734 735 736 737 738 739 740 741

            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
                MPID_Datatype_get_ptr(result_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
                MPID_Datatype_get_ptr(target_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
            }
742

743 744 745
            MPID_Datatype_is_contig(result_datatype, &is_result_contig);
            MPID_Datatype_is_contig(target_datatype, &is_target_contig);

746
            /* Judge if we can use IMMED data response packet */
747 748 749
            if (MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
                MPIR_DATATYPE_IS_PREDEFINED(target_datatype) &&
                is_result_contig && is_target_contig) {
Xin Zhao's avatar
Xin Zhao committed
750
                if (data_sz <= MPIDI_RMA_IMMED_BYTES)
751 752 753
                    use_immed_resp_pkt = TRUE;
            }

754
            /* Judge if this operation is a piggyback candidate */
755 756
            if (MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
                MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
757 758 759
                /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
                 * for both origin and target data. We should extend this optimization to derived
                 * datatypes as well. */
760
                op_ptr->piggyback_lock_candidate = 1;
761
            }
762 763 764

            /************** Setting packet struct areas in operation ****************/

765
            get_pkt = &(op_ptr->pkt.get);
766
            MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
767 768
            get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
                win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
769 770
            get_pkt->count = target_count;
            get_pkt->datatype = target_datatype;
771
            get_pkt->info.dataloop_size = 0;
772
            get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
773
            get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
774 775
            if (use_immed_resp_pkt == TRUE)
                get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
776 777 778
        }

        else {
779
            MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt;
780
            MPI_Aint origin_type_size;
781
            size_t orig_len;
782
            int use_immed_pkt = FALSE;
783
            int is_origin_contig, is_target_contig, is_result_contig;
Xin Zhao's avatar
Xin Zhao committed
784 785 786 787
            MPI_Aint stream_elem_count, stream_unit_count;
            MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
            MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL, *result_dtp = NULL;
            int i;
788 789

            /******************** Setting operation struct areas ***********************/
790

791 792 793 794 795 796 797
            op_ptr->origin_addr = (void *) origin_addr;
            op_ptr->origin_count = origin_count;
            op_ptr->origin_datatype = origin_datatype;
            op_ptr->result_addr = result_addr;
            op_ptr->result_count = result_count;
            op_ptr->result_datatype = result_datatype;
            op_ptr->target_rank = target_rank;
798 799

            /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
800
            op_ptr->ureq = ureq;
801 802 803 804

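            /* If the origin, result or target datatypes are derived, look up
             * their datatype objects here; their reference counts are
             * incremented once per stream unit further below.
             * NOTE(review): result_datatype is stored in target_dtp and
             * target_datatype in result_dtp -- the names look swapped. The
             * pointers are only add-ref'ed in the code that follows directly,
             * so verify any later use before relying on them. */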
            if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
805
                MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
806 807
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
808
                MPID_Datatype_get_ptr(result_datatype, target_dtp);
809 810
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
811
                MPID_Datatype_get_ptr(target_datatype, result_dtp);
812
            }
Xin Zhao's avatar
Xin Zhao committed
813

814 815 816
            MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
            MPIU_Assign_trunc(orig_len, origin_count * origin_type_size, size_t);

Xin Zhao's avatar
Xin Zhao committed
817 818 819 820 821 822 823
            /* Get size and count for predefined datatype elements */
            if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
                predefined_dtp_size = origin_type_size;
                predefined_dtp_count = origin_count;
                MPID_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
            }
            else {
824 825
                MPIU_Assert(origin_dtp->basic_type != MPI_DATATYPE_NULL);
                MPID_Datatype_get_size_macro(origin_dtp->basic_type, predefined_dtp_size);
Xin Zhao's avatar
Xin Zhao committed
826
                predefined_dtp_count = orig_len / predefined_dtp_size;
827
                MPID_Datatype_get_extent_macro(origin_dtp->basic_type, predefined_dtp_extent);
Xin Zhao's avatar
Xin Zhao committed
828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849
            }
            MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                        predefined_dtp_extent > 0);

            /* Calculate number of predefined elements in each stream unit, and
             * total number of stream units. */
            stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
            stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
            MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);

            for (i = 0; i < stream_unit_count; i++) {
                if (origin_dtp != NULL) {
                    MPID_Datatype_add_ref(origin_dtp);
                }
                if (target_dtp != NULL) {
                    MPID_Datatype_add_ref(target_dtp);
                }
                if (result_dtp != NULL) {
                    MPID_Datatype_add_ref(result_dtp);
                }
            }

850 851 852 853
            MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
            MPID_Datatype_is_contig(target_datatype, &is_target_contig);
            MPID_Datatype_is_contig(result_datatype, &is_result_contig);

854
            /* Judge if we can use IMMED data packet */
855 856 857 858
            if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
                MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
                MPIR_DATATYPE_IS_PREDEFINED(target_datatype) &&
                is_origin_contig && is_target_contig && is_result_contig) {
859
                if (orig_len <= MPIDI_RMA_IMMED_BYTES)
860 861 862
                    use_immed_pkt = TRUE;
            }

863
            /* Judge if this operation is a piggyback candidate */
864 865 866
            if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
                MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
                MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
867 868 869
                /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
                 * for origin, target and result data. We should extend this optimization to derived
                 * datatypes as well. */
870
                if (orig_len <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
871
                    op_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
872
            }
873

874
            /************** Setting packet struct areas in operation ****************/
875

876
            get_accum_pkt = &(op_ptr->pkt.get_accum);