/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidrma.h"

/* Timer PVAR (category RMA) declared extern here; the operation routines
 * in this file start and stop it around the code that fills in a queued
 * RMA operation and its packet header. */
MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_rmaqueue_set);

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS
      category    : CH3
      type        : int
      default     : 100
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of number of posted operations
          when starting poking progress in operation routines.
          When the value is negative, runtime never pokes progress
          engine in operation routines; when the value is zero,
          runtime always pokes progress engine in operation
          routines; when the value is larger than zero, runtime
          starts to poke progress engine when number of posted
          operations reaches that value.

    - name        : MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE
      category    : CH3
      type        : int
      default     : 65536
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of data size of a RMA operation
          which can be piggybacked with a LOCK message. It is
          always a positive value and should not be smaller
          than MPIDI_RMA_IMMED_BYTES.
          If user sets it as a small value, for middle and large
          data size, we will lose performance because of always
          waiting for round-trip of LOCK synchronization; if
          user sets it as a large value, we need to consume
          more memory on target side to buffer this lock request
          when lock is not satisfied.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

54
#undef FUNCNAME
55
#define FUNCNAME MPIDI_CH3I_Put
56 57
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
58 59 60 61
int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
62 63
{
    int mpi_errno = MPI_SUCCESS;
64
    int dt_contig ATTRIBUTE((unused)), rank;
65
    MPID_Datatype *dtp;
Pavan Balaji's avatar
Pavan Balaji committed
66
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
67
    MPIDI_msg_sz_t data_sz;
68
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
69
    int made_progress = 0;
70
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_PUT);
71

72
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_PUT);
73

74 75 76
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

77 78 79 80
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

81 82
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

83
    if (data_sz == 0) {
84
        goto fn_exit;
85 86
    }

87
    rank = win_ptr->comm_ptr->rank;
88 89 90

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
91
        /* check if target is local and shared memory is allocated on window,
92
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
93 94

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
95 96 97 98 99
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
100 101 102 103
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

104
    /* If the put is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
105
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
106
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
107 108
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
109 110
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
111 112 113

        if (ureq) {
            /* Complete user request and release the ch3 ref */
114
            MPIDI_CH3U_Request_complete(ureq);
115
        }
116
    }
117
    else {
118
        MPIDI_RMA_Op_t *op_ptr = NULL;
119
        MPIDI_CH3_Pkt_put_t *put_pkt = NULL;
120
        MPI_Aint origin_type_size;
121
        size_t len;
122
        int use_immed_pkt = FALSE;
123
        int is_origin_contig, is_target_contig;
124

125
        /* queue it up */
126
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
127 128
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
129

Xin Zhao's avatar
Xin Zhao committed
130 131
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

132
        /******************** Setting operation struct areas ***********************/
133

134
        /* FIXME: For contig and very short operations, use a streamlined op */
135 136 137 138
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
139 140

        /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
141
        op_ptr->ureq = ureq;
142 143 144 145 146 147 148 149 150 151 152

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
153

154 155 156
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

157 158 159 160
        MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
        MPIU_Assign_trunc(len, origin_count * origin_type_size, size_t);

        /* Judge if we can use IMMED data packet */
161 162
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig) {
163
            if (len <= MPIDI_RMA_IMMED_BYTES)
164 165 166
                use_immed_pkt = TRUE;
        }

167
        /* Judge if this operation is an piggyback candidate */
168 169
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
170 171 172
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
173
            if (len <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
174
                op_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
175 176
        }

177 178
        /************** Setting packet struct areas in operation ****************/

179
        put_pkt = &(op_ptr->pkt.put);
180 181 182 183 184 185 186 187

        if (use_immed_pkt) {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT_IMMED);
        }
        else {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
        }

188 189
        put_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
190 191
        put_pkt->count = target_count;
        put_pkt->datatype = target_datatype;
192
        put_pkt->info.dataloop_size = 0;
193
        put_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
194 195
        put_pkt->source_win_handle = win_ptr->handle;
        put_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
196
        if (use_immed_pkt) {
197
            void *src = (void *) origin_addr, *dest = (void *) (put_pkt->info.data);
198
            mpi_errno = immed_copy(src, dest, len);
199 200
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
201
        }
202

Xin Zhao's avatar
Xin Zhao committed
203 204
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

205
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
206 207 208
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

209
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
210 211
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
212 213

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
214
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
215 216 217 218
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
219 220 221 222 223 224 225 226 227 228 229 230 231 232

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
233 234 235
    }

  fn_exit:
236
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PUT);
237 238 239 240 241 242 243 244 245
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

#undef FUNCNAME
246
#define FUNCNAME MPIDI_CH3I_Get
247 248
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
249 250 251 252
int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
253 254 255
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
256
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
257
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
258
    MPID_Datatype *dtp;
259
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
260
    int made_progress = 0;
261
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET);
262

263
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET);
264

265 266 267
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

268 269 270 271
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

272
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);
273

274
    if (data_sz == 0) {
275
        goto fn_exit;
276 277
    }

278
    rank = win_ptr->comm_ptr->rank;
Xin Zhao's avatar
Xin Zhao committed
279

280 281
    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
282
        /* check if target is local and shared memory is allocated on window,
283
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
284 285

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
286 287 288 289 290
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
291 292 293
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }
294

295
    /* If the get is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
296
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
297
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
298 299
        mpi_errno = MPIDI_CH3I_Shm_get_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
300 301
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
302 303 304

        if (ureq) {
            /* Complete user request and release the ch3 ref */
305
            MPIDI_CH3U_Request_complete(ureq);
306
        }
307
    }
308
    else {
309
        MPIDI_RMA_Op_t *op_ptr = NULL;
310
        MPIDI_CH3_Pkt_get_t *get_pkt = NULL;
311
        MPI_Aint target_type_size;
312
        size_t len;
313
        int use_immed_resp_pkt = FALSE;
314
        int is_origin_contig, is_target_contig;
315

316
        /* queue it up */
317
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
318 319
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
320

Xin Zhao's avatar
Xin Zhao committed
321 322
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

323
        /******************** Setting operation struct areas ***********************/
324

325
        /* FIXME: For contig and very short operations, use a streamlined op */
326 327 328 329
        op_ptr->origin_addr = origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
330 331

        /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
332
        op_ptr->ureq = ureq;
333 334 335 336 337 338 339 340 341 342 343

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
344

345 346 347
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

348 349 350 351
        MPID_Datatype_get_size_macro(target_datatype, target_type_size);
        MPIU_Assign_trunc(len, target_count * target_type_size, size_t);

        /* Judge if we can use IMMED data response packet */
352 353
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig) {
354
            if (len <= MPIDI_RMA_IMMED_BYTES)
355 356 357
                use_immed_resp_pkt = TRUE;
        }

358
        /* Judge if this operation is an piggyback candidate. */
359 360
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
361 362 363
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
364
            op_ptr->piggyback_lock_candidate = 1;
365 366
        }

367 368
        /************** Setting packet struct areas in operation ****************/

369
        get_pkt = &(op_ptr->pkt.get);
370
        MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
371 372
        get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
373 374
        get_pkt->count = target_count;
        get_pkt->datatype = target_datatype;
375
        get_pkt->info.dataloop_size = 0;
376
        get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
377
        get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
378 379
        if (use_immed_resp_pkt)
            get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
380

Xin Zhao's avatar
Xin Zhao committed
381 382
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

383
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
384 385 386
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

387
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
388 389
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
390 391

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
392
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
393 394 395 396
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
397 398 399 400 401 402 403 404 405 406 407 408 409 410

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
411 412 413
    }

  fn_exit:
414
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_GET);
415 416 417 418 419 420 421 422 423 424
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
425
#define FUNCNAME MPIDI_CH3I_Accumulate
426 427
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
428 429 430 431
int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                          origin_datatype, int target_rank, MPI_Aint target_disp,
                          int target_count, MPI_Datatype target_datatype, MPI_Op op,
                          MPID_Win * win_ptr, MPID_Request * ureq)
432
{
433
    int mpi_errno = MPI_SUCCESS;
434
    MPIDI_msg_sz_t data_sz;
435
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
436
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
437
    MPID_Datatype *dtp;
438
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
439
    int made_progress = 0;
440
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
441

442
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
443

444 445 446
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

447 448 449 450
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

451 452
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

453
    if (data_sz == 0) {
454
        goto fn_exit;
455
    }
456

457
    rank = win_ptr->comm_ptr->rank;
458 459 460

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
461
        /* check if target is local and shared memory is allocated on window,
462
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
463 464

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
465 466 467 468 469
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
470 471 472 473
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

474
    /* Do =! rank first (most likely branch?) */
Xin Zhao's avatar
Xin Zhao committed
475
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
476 477 478 479 480 481
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count, target_datatype,
                                          op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
482 483 484

        if (ureq) {
            /* Complete user request and release the ch3 ref */
485
            MPIDI_CH3U_Request_complete(ureq);
486
        }
487
    }
488
    else {
489
        MPIDI_RMA_Op_t *op_ptr = NULL;
490
        MPIDI_CH3_Pkt_accum_t *accum_pkt = NULL;
491
        MPI_Aint origin_type_size;
492
        size_t len;
493
        int use_immed_pkt = FALSE;
494
        int is_origin_contig, is_target_contig;
Xin Zhao's avatar
Xin Zhao committed
495 496 497 498
        MPI_Aint stream_elem_count, stream_unit_count;
        MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
        MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL;
        int i;
499

500
        /* queue it up */
501
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
502 503
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
504

Xin Zhao's avatar
Xin Zhao committed
505 506
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

507
        /******************** Setting operation struct areas ***********************/
508

509 510 511 512
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
513 514

        /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
515
        op_ptr->ureq = ureq;
516 517 518 519

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
520
            MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
521 522
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
523
            MPID_Datatype_get_ptr(target_datatype, target_dtp);
524
        }
525

526 527 528
        MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
        MPIU_Assign_trunc(len, origin_count * origin_type_size, size_t);

Xin Zhao's avatar
Xin Zhao committed
529 530 531 532 533 534 535
        /* Get size and count for predefined datatype elements */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            predefined_dtp_size = origin_type_size;
            predefined_dtp_count = origin_count;
            MPID_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
        }
        else {
536 537
            MPIU_Assert(origin_dtp->basic_type != MPI_DATATYPE_NULL);
            MPID_Datatype_get_size_macro(origin_dtp->basic_type, predefined_dtp_size);
Xin Zhao's avatar
Xin Zhao committed
538
            predefined_dtp_count = len / predefined_dtp_size;
539
            MPID_Datatype_get_extent_macro(origin_dtp->basic_type, predefined_dtp_extent);
Xin Zhao's avatar
Xin Zhao committed
540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558
        }
        MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                    predefined_dtp_extent > 0);

        /* Calculate number of predefined elements in each stream unit, and
         * total number of stream units. */
        stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
        stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
        MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);

        for (i = 0; i < stream_unit_count; i++) {
            if (origin_dtp != NULL) {
                MPID_Datatype_add_ref(origin_dtp);
            }
            if (target_dtp != NULL) {
                MPID_Datatype_add_ref(target_dtp);
            }
        }

559 560 561
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

562
        /* Judge if we can use IMMED data packet */
563 564
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig) {
565
            if (len <= MPIDI_RMA_IMMED_BYTES)
566 567 568
                use_immed_pkt = TRUE;
        }

569
        /* Judge if this operation is an piggyback candidate. */
570 571
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
572 573 574
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
575
            if (len <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
576
                op_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
577 578
        }

579 580
        /************** Setting packet struct areas in operation ****************/

581
        accum_pkt = &(op_ptr->pkt.accum);
582

583 584 585 586 587 588 589
        if (use_immed_pkt) {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE_IMMED);
        }
        else {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
        }

590 591
        accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
592 593
        accum_pkt->count = target_count;
        accum_pkt->datatype = target_datatype;
594
        accum_pkt->info.dataloop_size = 0;
595
        accum_pkt->op = op;
596
        accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
597 598
        accum_pkt->source_win_handle = win_ptr->handle;
        accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
599
        if (use_immed_pkt) {
600
            void *src = (void *) origin_addr, *dest = (void *) (accum_pkt->info.data);
601
            mpi_errno = immed_copy(src, dest, len);
602 603
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
604
        }
605

Xin Zhao's avatar
Xin Zhao committed
606 607
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

608
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
609 610 611
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

612
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
613 614
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
615 616

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
617
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
618 619 620 621
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
622 623 624 625 626 627 628 629 630 631 632 633 634 635

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
636 637
    }

638
  fn_exit:
639
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
640 641 642 643 644 645 646 647 648 649
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
650
#define FUNCNAME MPIDI_CH3I_Get_accumulate
651 652
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
653 654 655 656 657
int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                              MPI_Datatype origin_datatype, void *result_addr, int result_count,
                              MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                              int target_count, MPI_Datatype target_datatype, MPI_Op op,
                              MPID_Win * win_ptr, MPID_Request * ureq)
658 659 660 661 662 663 664 665
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int rank;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
666
    int made_progress = 0;
667
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
668

669
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
670

671 672 673
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they are
         * not within the same SHM region. Here we filter out ch3:sock by checking the shm_allocated flag first,
         * which is only set to TRUE when the SHM region is allocated in nemesis.
         * In the future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do != rank first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
                                              result_addr, result_count, result_datatype,
                                              target_rank, target_disp, target_count,
                                              target_datatype, op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
710 711 712

        if (ureq) {
            /* Complete user request and release the ch3 ref */
713
            MPIDI_CH3U_Request_complete(ureq);
714
        }
715 716
    }
    else {
717
        MPIDI_RMA_Op_t *op_ptr = NULL;
718 719

        /* Append the operation to the window's RMA ops queue */
720
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
721 722
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
723 724 725

        /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */

Xin Zhao's avatar
Xin Zhao committed
726 727
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

728 729
        if (op == MPI_NO_OP) {
            /* Convert GAcc to a Get */
730
            MPIDI_CH3_Pkt_get_t *get_pkt;
731
            MPI_Aint target_type_size;
732
            size_t len;
733
            int use_immed_resp_pkt = FALSE;
734
            int is_result_contig, is_target_contig;
735 736

            /******************** Setting operation struct areas ***********************/
737

738 739 740 741
            op_ptr->origin_addr = result_addr;
            op_ptr->origin_count = result_count;
            op_ptr->origin_datatype = result_datatype;
            op_ptr->target_rank = target_rank;
742 743

            /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
744
            op_ptr->ureq = ureq;
745 746 747 748 749 750 751 752 753

            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
                MPID_Datatype_get_ptr(result_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
                MPID_Datatype_get_ptr(target_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
            }
754

755 756 757
            MPID_Datatype_get_size_macro(target_datatype, target_type_size);
            MPIU_Assign_trunc(len, target_count * target_type_size, size_t);

758 759 760
            MPID_Datatype_is_contig(result_datatype, &is_result_contig);
            MPID_Datatype_is_contig(target_datatype, &is_target_contig);

761
            /* Judge if we can use IMMED data response packet */
762 763 764
            if (MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
                MPIR_DATATYPE_IS_PREDEFINED(target_datatype) &&
                is_result_contig && is_target_contig) {
765
                if (len <= MPIDI_RMA_IMMED_BYTES)
766 767 768
                    use_immed_resp_pkt = TRUE;
            }

769
            /* Judge if this operation is a piggyback candidate */
770 771
            if (MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
                MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
772 773 774
                /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
                 * for both origin and target data. We should extend this optimization to derived
                 * datatypes as well. */
775
                op_ptr->piggyback_lock_candidate = 1;
776
            }
777 778 779

            /************** Setting packet struct areas in operation ****************/

780
            get_pkt = &(op_ptr->pkt.get);
781
            MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
782 783
            get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
                win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
784 785
            get_pkt->count = target_count;
            get_pkt->datatype = target_datatype;
786
            get_pkt->info.dataloop_size = 0;
787
            get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
788
            get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
789 790
            if (use_immed_resp_pkt == TRUE)
                get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
791 792 793
        }

        else {
794
            MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt;
795
            MPI_Aint origin_type_size;
796
            size_t orig_len;
797
            int use_immed_pkt = FALSE;
798
            int is_origin_contig, is_target_contig, is_result_contig;
Xin Zhao's avatar
Xin Zhao committed
799 800 801 802
            MPI_Aint stream_elem_count, stream_unit_count;
            MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
            MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL, *result_dtp = NULL;
            int i;
803 804

            /******************** Setting operation struct areas ***********************/
805

806 807 808 809 810 811 812
            op_ptr->origin_addr = (void *) origin_addr;
            op_ptr->origin_count = origin_count;
            op_ptr->origin_datatype = origin_datatype;
            op_ptr->result_addr = result_addr;
            op_ptr->result_count = result_count;
            op_ptr->result_datatype = result_datatype;
            op_ptr->target_rank = target_rank;
813 814

            /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
815
            op_ptr->ureq = ureq;
816 817 818 819

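            /* If the origin, result or target datatypes are derived, look up
             * their datatype objects here; their reference counts are
             * incremented once per stream unit in the loop further below.
             * NOTE(review): result_datatype is stored in target_dtp and
             * target_datatype in result_dtp -- the names look swapped. This
             * does not affect the reference counting, but confirm against any
             * later use of these pointers. */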
            if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
820
                MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
821 822
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
823
                MPID_Datatype_get_ptr(result_datatype, target_dtp);
824 825
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
826
                MPID_Datatype_get_ptr(target_datatype, result_dtp);
827
            }
Xin Zhao's avatar
Xin Zhao committed
828

829 830 831
            MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
            MPIU_Assign_trunc(orig_len, origin_count * origin_type_size, size_t);

Xin Zhao's avatar
Xin Zhao committed
832 833 834 835 836 837 838
            /* Get size and count for predefined datatype elements */
            if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
                predefined_dtp_size = origin_type_size;
                predefined_dtp_count = origin_count;
                MPID_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
            }
            else {
839 840
                MPIU_Assert(origin_dtp->basic_type != MPI_DATATYPE_NULL);
                MPID_Datatype_get_size_macro(origin_dtp->basic_type, predefined_dtp_size);
Xin Zhao's avatar
Xin Zhao committed
841
                predefined_dtp_count = orig_len / predefined_dtp_size;
842
                MPID_Datatype_get_extent_macro(origin_dtp->basic_type, predefined_dtp_extent);
Xin Zhao's avatar
Xin Zhao committed
843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864
            }
            MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                        predefined_dtp_extent > 0);

            /* Calculate number of predefined elements in each stream unit, and
             * total number of stream units. */
            stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
            stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
            MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);

            for (i = 0; i < stream_unit_count; i++) {
                if (origin_dtp != NULL) {
                    MPID_Datatype_add_ref(origin_dtp);
                }
                if (target_dtp != NULL) {
                    MPID_Datatype_add_ref(target_dtp);
                }
                if (result_dtp != NULL) {
                    MPID_Datatype_add_ref(result_dtp);
                }
            }

865 866 867 868
            MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
            MPID_Datatype_is_contig(target_datatype, &is_target_contig);
            MPID_Datatype_is_contig(result_datatype, &is_result_contig);

869
            /* Judge if we can use IMMED data packet */
870 871 872 873
            if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
                MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
                MPIR_DATATYPE_IS_PREDEFINED(target_datatype) &&
                is_origin_contig && is_target_contig && is_result_contig) {
874
                if (orig_len <= MPIDI_RMA_IMMED_BYTES)
875 876 877
                    use_immed_pkt = TRUE;
            }

878
            /* Judge if this operation is a piggyback candidate */
879 880 881
            if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
                MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
                MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {