/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidrma.h"

/* Extern declaration of the RMA performance-variable timer "rma_rmaqueue_set".
 * The operation routines in this file start and stop it around the code that
 * fills in a queued operation and its packet header. */
MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_rmaqueue_set);

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS
      category    : CH3
      type        : int
      default     : 100
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of number of posted operations
          when starting poking progress in operation routines.
          When the value is negative, runtime never pokes progress
          engine in operation routines; when the value is zero,
          runtime always pokes progress engine in operation
          routines; when the value is larger than zero, runtime
          starts to poke progress engine when number of posted
          operations reaches that value.

    - name        : MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE
      category    : CH3
      type        : int
      default     : 65536
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of data size of a RMA operation
          which can be piggybacked with a LOCK message. It is
          always a positive value and should not be smaller
          than MPIDI_RMA_IMMED_BYTES.
          If user sets it as a small value, for middle and large
          data size, we will lose performance because of always
          waiting for round-trip of LOCK synchronization; if
          user sets it as a large value, we need to consume
          more memory on target side to buffer this lock request
          when lock is not satisfied.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

#undef FUNCNAME
55
#define FUNCNAME MPIDI_CH3I_Put
56 57
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
58 59 60 61
int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
62 63
{
    int mpi_errno = MPI_SUCCESS;
64
    int dt_contig ATTRIBUTE((unused)), rank;
65
    MPID_Datatype *dtp;
Pavan Balaji's avatar
Pavan Balaji committed
66
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
67
    MPIDI_msg_sz_t data_sz;
68
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
69
    int made_progress = 0;
70
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_PUT);
71

72
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_PUT);
73

74 75 76
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

77 78 79 80
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

81 82
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

83
    if (data_sz == 0) {
84
        goto fn_exit;
85 86
    }

87
    rank = win_ptr->comm_ptr->rank;
88 89 90

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
91
        /* check if target is local and shared memory is allocated on window,
92
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
93 94

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
95 96 97 98 99
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
100 101 102 103
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

104
    /* If the put is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
105
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
106
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
107 108
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
109 110
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
111 112 113 114 115 116

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
117
    }
118
    else {
119
        MPIDI_RMA_Op_t *op_ptr = NULL;
120
        MPIDI_CH3_Pkt_put_t *put_pkt = NULL;
121 122 123
        MPI_Aint origin_type_size;
        size_t immed_len, len;
        int use_immed_pkt = FALSE;
124
        int is_origin_contig, is_target_contig;
125

126
        /* queue it up */
127
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
128 129
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
130

Xin Zhao's avatar
Xin Zhao committed
131 132
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

133
        /******************** Setting operation struct areas ***********************/
134

135
        /* FIXME: For contig and very short operations, use a streamlined op */
136 137 138 139
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
140 141

        /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
142
        op_ptr->ureq = ureq;
143 144 145 146 147 148

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
149
            op_ptr->is_dt = 1;
150 151 152 153
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
154
            op_ptr->is_dt = 1;
155
        }
156

157 158 159
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

160 161 162 163
        MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
        MPIU_Assign_trunc(len, origin_count * origin_type_size, size_t);

        /* Judge if we can use IMMED data packet */
164
        if (!op_ptr->is_dt && is_origin_contig && is_target_contig) {
165
            MPIU_Assign_trunc(immed_len,
166
                              (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size,
167 168 169 170 171
                              size_t);
            if (len <= immed_len)
                use_immed_pkt = TRUE;
        }

172
        /* Judge if this operation is an piggyback candidate */
173
        if (!op_ptr->is_dt) {
174 175 176
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
177
            if (len <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
178
                op_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
179 180
        }

181 182
        /************** Setting packet struct areas in operation ****************/

183
        put_pkt = &(op_ptr->pkt.put);
184 185 186 187 188 189 190 191

        if (use_immed_pkt) {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT_IMMED);
        }
        else {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
        }

192 193
        put_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
194 195
        put_pkt->count = target_count;
        put_pkt->datatype = target_datatype;
196
        put_pkt->info.dataloop_size = 0;
197
        put_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
198 199
        put_pkt->source_win_handle = win_ptr->handle;
        put_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
200
        if (use_immed_pkt) {
201
            void *src = (void *) origin_addr, *dest = (void *) (put_pkt->info.data);
202
            mpi_errno = immed_copy(src, dest, len);
203 204
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
205
        }
206

Xin Zhao's avatar
Xin Zhao committed
207 208
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

209
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
210 211 212
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

213
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
214 215
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
216 217

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
218
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
219 220 221 222
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
223 224 225 226 227 228 229 230 231 232 233 234 235 236

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
237 238 239
    }

  fn_exit:
240
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PUT);
241 242 243 244 245 246 247 248 249
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

#undef FUNCNAME
250
#define FUNCNAME MPIDI_CH3I_Get
251 252
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
253 254 255 256
int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
257 258 259
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
260
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
261
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
262
    MPID_Datatype *dtp;
263
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
264
    int made_progress = 0;
265
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET);
266

267
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET);
268

269 270 271
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

272 273 274 275
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

276
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);
277

278
    if (data_sz == 0) {
279
        goto fn_exit;
280 281
    }

282
    rank = win_ptr->comm_ptr->rank;
Xin Zhao's avatar
Xin Zhao committed
283

284 285
    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
286
        /* check if target is local and shared memory is allocated on window,
287
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
288 289

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
290 291 292 293 294
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
295 296 297
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }
298

299
    /* If the get is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
300
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
301
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
302 303
        mpi_errno = MPIDI_CH3I_Shm_get_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
304 305
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
306 307 308 309 310 311

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
312
    }
313
    else {
314
        MPIDI_RMA_Op_t *op_ptr = NULL;
315
        MPIDI_CH3_Pkt_get_t *get_pkt = NULL;
316 317 318
        MPI_Aint target_type_size;
        size_t immed_len, len;
        int use_immed_resp_pkt = FALSE;
319
        int is_origin_contig, is_target_contig;
320

321
        /* queue it up */
322
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
323 324
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
325

Xin Zhao's avatar
Xin Zhao committed
326 327
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

328
        /******************** Setting operation struct areas ***********************/
329

330
        /* FIXME: For contig and very short operations, use a streamlined op */
331 332 333 334
        op_ptr->origin_addr = origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
335 336

        /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
337
        op_ptr->ureq = ureq;
338 339 340 341 342 343

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
344
            op_ptr->is_dt = 1;
345 346 347 348
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
349
            op_ptr->is_dt = 1;
350
        }
351

352 353 354
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

355 356 357 358
        MPID_Datatype_get_size_macro(target_datatype, target_type_size);
        MPIU_Assign_trunc(len, target_count * target_type_size, size_t);

        /* Judge if we can use IMMED data response packet */
359
        if (!op_ptr->is_dt && is_origin_contig && is_target_contig) {
360
            MPIU_Assign_trunc(immed_len,
361
                              (MPIDI_RMA_IMMED_BYTES / target_type_size) * target_type_size,
362 363 364 365 366
                              size_t);
            if (len <= immed_len)
                use_immed_resp_pkt = TRUE;
        }

367
        /* Judge if this operation is an piggyback candidate. */
368
        if (!op_ptr->is_dt) {
369 370 371
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
372
            op_ptr->piggyback_lock_candidate = 1;
373 374
        }

375 376
        /************** Setting packet struct areas in operation ****************/

377
        get_pkt = &(op_ptr->pkt.get);
378
        MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
379 380
        get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
381 382
        get_pkt->count = target_count;
        get_pkt->datatype = target_datatype;
383
        get_pkt->info.dataloop_size = 0;
384
        get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
385
        get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
386 387
        if (use_immed_resp_pkt)
            get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
388

Xin Zhao's avatar
Xin Zhao committed
389 390
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

391
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
392 393 394
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

395
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
396 397
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
398 399

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
400
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
401 402 403 404
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
405 406 407 408 409 410 411 412 413 414 415 416 417 418

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
419 420 421
    }

  fn_exit:
422
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_GET);
423 424 425 426 427 428 429 430 431 432
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
433
#define FUNCNAME MPIDI_CH3I_Accumulate
434 435
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
436 437 438 439
int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                          origin_datatype, int target_rank, MPI_Aint target_disp,
                          int target_count, MPI_Datatype target_datatype, MPI_Op op,
                          MPID_Win * win_ptr, MPID_Request * ureq)
440
{
441
    int mpi_errno = MPI_SUCCESS;
442
    MPIDI_msg_sz_t data_sz;
443
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
444
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
445
    MPID_Datatype *dtp;
446
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
447
    int made_progress = 0;
448
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
449

450
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
451

452 453 454
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

455 456 457 458
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

459 460
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

461
    if (data_sz == 0) {
462
        goto fn_exit;
463
    }
464

465
    rank = win_ptr->comm_ptr->rank;
466 467 468

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
469
        /* check if target is local and shared memory is allocated on window,
470
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
471 472

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
473 474 475 476 477
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
478 479 480 481
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

482
    /* Do =! rank first (most likely branch?) */
Xin Zhao's avatar
Xin Zhao committed
483
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
484 485 486 487 488 489
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count, target_datatype,
                                          op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
490 491 492 493 494 495

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
496
    }
497
    else {
498
        MPIDI_RMA_Op_t *op_ptr = NULL;
499
        MPIDI_CH3_Pkt_accum_t *accum_pkt = NULL;
500 501 502
        MPI_Aint origin_type_size;
        size_t immed_len, len;
        int use_immed_pkt = FALSE;
503
        int is_origin_contig, is_target_contig;
Xin Zhao's avatar
Xin Zhao committed
504 505 506 507
        MPI_Aint stream_elem_count, stream_unit_count;
        MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
        MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL;
        int i;
508

509
        /* queue it up */
510
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
511 512
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
513

Xin Zhao's avatar
Xin Zhao committed
514 515
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

516
        /******************** Setting operation struct areas ***********************/
517

518 519 520 521
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
522 523

        /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
524
        op_ptr->ureq = ureq;
525 526 527 528

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
529
            MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
530
            op_ptr->is_dt = 1;
531 532
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
533
            MPID_Datatype_get_ptr(target_datatype, target_dtp);
534
            op_ptr->is_dt = 1;
535
        }
536

537 538 539
        MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
        MPIU_Assign_trunc(len, origin_count * origin_type_size, size_t);

Xin Zhao's avatar
Xin Zhao committed
540 541 542 543 544 545 546
        /* Get size and count for predefined datatype elements */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            predefined_dtp_size = origin_type_size;
            predefined_dtp_count = origin_count;
            MPID_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
        }
        else {
547 548
            MPIU_Assert(origin_dtp->basic_type != MPI_DATATYPE_NULL);
            MPID_Datatype_get_size_macro(origin_dtp->basic_type, predefined_dtp_size);
Xin Zhao's avatar
Xin Zhao committed
549
            predefined_dtp_count = len / predefined_dtp_size;
550
            MPID_Datatype_get_extent_macro(origin_dtp->basic_type, predefined_dtp_extent);
Xin Zhao's avatar
Xin Zhao committed
551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569
        }
        MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                    predefined_dtp_extent > 0);

        /* Calculate number of predefined elements in each stream unit, and
         * total number of stream units. */
        stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
        stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
        MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);

        for (i = 0; i < stream_unit_count; i++) {
            if (origin_dtp != NULL) {
                MPID_Datatype_add_ref(origin_dtp);
            }
            if (target_dtp != NULL) {
                MPID_Datatype_add_ref(target_dtp);
            }
        }

570 571 572
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

573
        /* Judge if we can use IMMED data packet */
574
        if (!op_ptr->is_dt && is_origin_contig && is_target_contig) {
575
            MPIU_Assign_trunc(immed_len,
576
                              (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size,
577 578 579 580 581
                              size_t);
            if (len <= immed_len)
                use_immed_pkt = TRUE;
        }

582
        /* Judge if this operation is an piggyback candidate. */
583
        if (!op_ptr->is_dt) {
584 585 586
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
587
            if (len <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
588
                op_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
589 590
        }

591 592
        /************** Setting packet struct areas in operation ****************/

593
        accum_pkt = &(op_ptr->pkt.accum);
594

595 596 597 598 599 600 601
        if (use_immed_pkt) {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE_IMMED);
        }
        else {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
        }

602 603
        accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
604 605
        accum_pkt->count = target_count;
        accum_pkt->datatype = target_datatype;
606
        accum_pkt->info.dataloop_size = 0;
607
        accum_pkt->op = op;
608
        accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
609 610
        accum_pkt->source_win_handle = win_ptr->handle;
        accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
611
        if (use_immed_pkt) {
612
            void *src = (void *) origin_addr, *dest = (void *) (accum_pkt->info.data);
613
            mpi_errno = immed_copy(src, dest, len);
614 615
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
616
        }
617

Xin Zhao's avatar
Xin Zhao committed
618 619
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

620
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
621 622 623
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

624
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
625 626
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
627 628

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
629
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
630 631 632 633
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
634 635 636 637 638 639 640 641 642 643 644 645 646 647

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
648 649
    }

650
  fn_exit:
651
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
652 653 654 655 656 657 658 659 660 661
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
662
#define FUNCNAME MPIDI_CH3I_Get_accumulate
663 664
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
665 666 667 668 669
int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                              MPI_Datatype origin_datatype, void *result_addr, int result_count,
                              MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                              int target_count, MPI_Datatype target_datatype, MPI_Op op,
                              MPID_Win * win_ptr, MPID_Request * ureq)
670 671 672 673 674 675 676 677
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int rank;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
678
    int made_progress = 0;
679
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
680

681
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
682

683 684 685
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they are
         * not within the same SHM region. Here we filter out ch3:sock by checking the shm_allocated flag
         * first, which is only set to TRUE when the SHM region is allocated in nemesis.
         * In the future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do != rank first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
                                              result_addr, result_count, result_datatype,
                                              target_rank, target_disp, target_count,
                                              target_datatype, op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
722 723 724 725 726 727

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            MPID_Request_set_completed(ureq);
            MPID_Request_release(ureq);
        }
728 729
    }
    else {
730
        MPIDI_RMA_Op_t *op_ptr = NULL;
731 732

        /* Append the operation to the window's RMA ops queue */
733
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
734 735
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
736 737 738

        /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */

Xin Zhao's avatar
Xin Zhao committed
739 740
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

741 742
        if (op == MPI_NO_OP) {
            /* Convert GAcc to a Get */
743
            MPIDI_CH3_Pkt_get_t *get_pkt;
744 745 746
            MPI_Aint target_type_size;
            size_t len, immed_len;
            int use_immed_resp_pkt = FALSE;
747
            int is_result_contig, is_target_contig;
748 749

            /******************** Setting operation struct areas ***********************/
750

751 752 753 754
            op_ptr->origin_addr = result_addr;
            op_ptr->origin_count = result_count;
            op_ptr->origin_datatype = result_datatype;
            op_ptr->target_rank = target_rank;
755 756

            /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
757
            op_ptr->ureq = ureq;
758 759 760 761

            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
                MPID_Datatype_get_ptr(result_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
762
                op_ptr->is_dt = 1;
763 764 765 766
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
                MPID_Datatype_get_ptr(target_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
767
                op_ptr->is_dt = 1;
768
            }
769

770 771 772
            MPID_Datatype_get_size_macro(target_datatype, target_type_size);
            MPIU_Assign_trunc(len, target_count * target_type_size, size_t);

773 774 775
            MPID_Datatype_is_contig(result_datatype, &is_result_contig);
            MPID_Datatype_is_contig(target_datatype, &is_target_contig);

776
            /* Judge if we can use IMMED data response packet */
777
            if (!op_ptr->is_dt && is_result_contig && is_target_contig) {
778
                MPIU_Assign_trunc(immed_len,
779
                                  (MPIDI_RMA_IMMED_BYTES / target_type_size) * target_type_size,
780 781 782 783 784
                                  size_t);
                if (len <= immed_len)
                    use_immed_resp_pkt = TRUE;
            }

785
            /* Judge if this operation is a piggyback candidate */
786
            if (!op_ptr->is_dt) {
787 788 789
                /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
                 * for both origin and target data. We should extend this optimization to derived
                 * datatypes as well. */
790
                op_ptr->piggyback_lock_candidate = 1;
791
            }
792 793 794

            /************** Setting packet struct areas in operation ****************/

795
            get_pkt = &(op_ptr->pkt.get);
796
            MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
797 798
            get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
                win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
799 800
            get_pkt->count = target_count;
            get_pkt->datatype = target_datatype;
801
            get_pkt->info.dataloop_size = 0;
802
            get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
803
            get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
804 805
            if (use_immed_resp_pkt == TRUE)
                get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
806 807 808
        }

        else {
809
            MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt;
810 811 812
            MPI_Aint origin_type_size;
            size_t immed_len, orig_len;
            int use_immed_pkt = FALSE;
813
            int is_origin_contig, is_target_contig, is_result_contig;
Xin Zhao's avatar
Xin Zhao committed
814 815 816 817
            MPI_Aint stream_elem_count, stream_unit_count;
            MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
            MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL, *result_dtp = NULL;
            int i;
818 819

            /******************** Setting operation struct areas ***********************/
820

821 822 823 824 825 826 827
            op_ptr->origin_addr = (void *) origin_addr;
            op_ptr->origin_count = origin_count;
            op_ptr->origin_datatype = origin_datatype;
            op_ptr->result_addr = result_addr;
            op_ptr->result_count = result_count;
            op_ptr->result_datatype = result_datatype;
            op_ptr->target_rank = target_rank;
828 829

            /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
830
            op_ptr->ureq = ureq;
831 832 833 834

            /* if source or target datatypes are derived, increment their
             * reference counts */
            if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
835
                MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
836
                op_ptr->is_dt = 1;
837 838
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
839
                MPID_Datatype_get_ptr(result_datatype, target_dtp);
840
                op_ptr->is_dt = 1;
841 842
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
843
                MPID_Datatype_get_ptr(target_datatype, result_dtp);
844
                op_ptr->is_dt = 1;
845
            }
Xin Zhao's avatar
Xin Zhao committed
846

847 848 849
            MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
            MPIU_Assign_trunc(orig_len, origin_count * origin_type_size, size_t);

Xin Zhao's avatar
Xin Zhao committed
850 851 852 853 854 855 856
            /* Get size and count for predefined datatype elements */
            if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
                predefined_dtp_size = origin_type_size;
                predefined_dtp_count = origin_count;
                MPID_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
            }
            else {
857 858
                MPIU_Assert(origin_dtp->basic_type != MPI_DATATYPE_NULL);
                MPID_Datatype_get_size_macro(origin_dtp->basic_type, predefined_dtp_size);
Xin Zhao's avatar
Xin Zhao committed
859
                predefined_dtp_count = orig_len / predefined_dtp_size;
860
                MPID_Datatype_get_extent_macro(origin_dtp->basic_type, predefined_dtp_extent);
Xin Zhao's avatar
Xin Zhao committed
861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882
            }
            MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                        predefined_dtp_extent > 0);

            /* Calculate number of predefined elements in each stream unit, and
             * total number of stream units. */
            stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
            stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
            MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);

            for (i = 0; i < stream_unit_count; i++) {
                if (origin_dtp != NULL) {
                    MPID_Datatype_add_ref(origin_dtp);
                }
                if (target_dtp != NULL) {
                    MPID_Datatype_add_ref(target_dtp);
                }
                if (result_dtp != NULL) {
                    MPID_Datatype_add_ref(result_dtp);
                }
            }

883 884 885 886
            MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
            MPID_Datatype_is_contig(target_datatype, &is_target_contig);
            MPID_Datatype_is_contig(result_datatype, &is_result_contig);

887
            /* Judge if we can use IMMED data packet */