/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidrma.h"

/* Timer performance variable used in this file to time the "fill in the
 * operation and packet structs" section of each RMA operation routine
 * (see the MPIR_T_PVAR_TIMER_START/END pairs below).
 * NOTE(review): presumably defined in another translation unit, given the
 * DECL_EXTERN form -- confirm. */
MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_rmaqueue_set);

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS
      category    : CH3
      type        : int
      default     : 100
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of number of posted operations
          when starting poking progress in operation routines.
          When the value is negative, runtime never pokes progress
          engine in operation routines; when the value is zero,
          runtime always pokes progress engine in operation
          routines; when the value is larger than zero, runtime
          starts to poke progress engine when number of posted
          operations reaches that value.

    - name        : MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE
      category    : CH3
      type        : int
      default     : 65536
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of data size of a RMA operation
          which can be piggybacked with a LOCK message. It is
          always a positive value and should not be smaller
          than MPIDI_RMA_IMMED_BYTES.
          If user sets it as a small value, for middle and large
          data size, we will lose performance because of always
          waiting for round-trip of LOCK synchronization; if
          user sets it as a large value, we need to consume
          more memory on target side to buffer this lock request
          when lock is not satisfied.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

#undef FUNCNAME
55
#define FUNCNAME MPIDI_CH3I_Put
56 57
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
58 59 60 61
int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
62 63
{
    int mpi_errno = MPI_SUCCESS;
64
    int dt_contig ATTRIBUTE((unused)), rank;
65
    MPID_Datatype *dtp;
Pavan Balaji's avatar
Pavan Balaji committed
66
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
67
    MPIDI_msg_sz_t data_sz;
68
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
69
    int made_progress = 0;
70
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_PUT);
71

72
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_PUT);
73

74 75 76
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

77 78 79 80
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

81 82
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

83
    if (data_sz == 0) {
84
        goto fn_exit;
85 86
    }

87
    rank = win_ptr->comm_ptr->rank;
88 89 90

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
91
        /* check if target is local and shared memory is allocated on window,
92
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
93 94

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
95 96 97 98 99
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
100 101 102 103
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

104
    /* If the put is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
105
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
106
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
107 108
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
109 110
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
111 112 113

        if (ureq) {
            /* Complete user request and release the ch3 ref */
114
            MPIDI_CH3U_Request_complete(ureq);
115
        }
116
    }
117
    else {
118
        MPIDI_RMA_Op_t *op_ptr = NULL;
119
        MPIDI_CH3_Pkt_put_t *put_pkt = NULL;
120
        int use_immed_pkt = FALSE;
121
        int is_origin_contig, is_target_contig;
122

123
        /* queue it up */
124
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
125 126
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
127

Xin Zhao's avatar
Xin Zhao committed
128 129
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

130
        /******************** Setting operation struct areas ***********************/
131

132
        /* FIXME: For contig and very short operations, use a streamlined op */
133 134 135 136
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
137 138

        /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
139
        op_ptr->ureq = ureq;
140 141 142 143 144 145 146 147 148 149 150

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
151

152 153 154
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

155
        /* Judge if we can use IMMED data packet */
156 157
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig) {
Xin Zhao's avatar
Xin Zhao committed
158
            if (data_sz <= MPIDI_RMA_IMMED_BYTES)
159 160 161
                use_immed_pkt = TRUE;
        }

162
        /* Judge if this operation is an piggyback candidate */
163 164
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
165 166 167
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
Xin Zhao's avatar
Xin Zhao committed
168
            if (data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
169
                op_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
170 171
        }

172 173
        /************** Setting packet struct areas in operation ****************/

174
        put_pkt = &(op_ptr->pkt.put);
175 176 177 178 179 180 181 182

        if (use_immed_pkt) {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT_IMMED);
        }
        else {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
        }

183 184
        put_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
185 186
        put_pkt->count = target_count;
        put_pkt->datatype = target_datatype;
187
        put_pkt->info.dataloop_size = 0;
188
        put_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
189 190
        put_pkt->source_win_handle = win_ptr->handle;
        put_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
191
        if (use_immed_pkt) {
192
            void *src = (void *) origin_addr, *dest = (void *) (put_pkt->info.data);
Xin Zhao's avatar
Xin Zhao committed
193
            mpi_errno = immed_copy(src, dest, data_sz);
194 195
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
196
        }
197

Xin Zhao's avatar
Xin Zhao committed
198 199
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

200
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
201 202 203
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

204
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
205 206
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
207 208

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
209
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
210 211 212 213
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
214 215 216 217 218 219 220 221 222 223 224 225 226 227

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
228 229 230
    }

  fn_exit:
231
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PUT);
232 233 234 235 236 237 238 239 240
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

#undef FUNCNAME
241
#define FUNCNAME MPIDI_CH3I_Get
242 243
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
244 245 246 247
int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
248 249
{
    int mpi_errno = MPI_SUCCESS;
250
    MPIDI_msg_sz_t orig_data_sz, target_data_sz;
251
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
252
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
253
    MPID_Datatype *dtp;
254
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
255
    int made_progress = 0;
256
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET);
257

258
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET);
259

260 261 262
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

263 264 265 266
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

267 268
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, orig_data_sz, dtp,
                            dt_true_lb);
269

270
    if (orig_data_sz == 0) {
271
        goto fn_exit;
272 273
    }

274
    rank = win_ptr->comm_ptr->rank;
Xin Zhao's avatar
Xin Zhao committed
275

276 277
    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
278
        /* check if target is local and shared memory is allocated on window,
279
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
280 281

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
282 283 284 285 286
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
287 288 289
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }
290

291
    /* If the get is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
292
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
293
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
294 295
        mpi_errno = MPIDI_CH3I_Shm_get_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
296 297
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
298 299 300

        if (ureq) {
            /* Complete user request and release the ch3 ref */
301
            MPIDI_CH3U_Request_complete(ureq);
302
        }
303
    }
304
    else {
305
        MPIDI_RMA_Op_t *op_ptr = NULL;
306
        MPIDI_CH3_Pkt_get_t *get_pkt = NULL;
307 308
        MPI_Aint target_type_size;
        int use_immed_resp_pkt = FALSE;
309
        int is_origin_contig, is_target_contig;
310

311
        /* queue it up */
312
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
313 314
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
315

Xin Zhao's avatar
Xin Zhao committed
316 317
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

318
        /******************** Setting operation struct areas ***********************/
319

320
        /* FIXME: For contig and very short operations, use a streamlined op */
321 322 323 324
        op_ptr->origin_addr = origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
325 326

        /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
327
        op_ptr->ureq = ureq;
328 329 330 331 332 333 334 335 336 337 338

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
339

340 341 342
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

343
        MPID_Datatype_get_size_macro(target_datatype, target_type_size);
344
        MPIU_Assign_trunc(target_data_sz, target_count * target_type_size, MPIDI_msg_sz_t);
345 346

        /* Judge if we can use IMMED data response packet */
347 348
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig) {
349
            if (target_data_sz <= MPIDI_RMA_IMMED_BYTES)
350 351 352
                use_immed_resp_pkt = TRUE;
        }

353
        /* Judge if this operation is an piggyback candidate. */
354 355
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
356 357 358
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
359
            op_ptr->piggyback_lock_candidate = 1;
360 361
        }

362 363
        /************** Setting packet struct areas in operation ****************/

364
        get_pkt = &(op_ptr->pkt.get);
365
        MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
366 367
        get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
368 369
        get_pkt->count = target_count;
        get_pkt->datatype = target_datatype;
370
        get_pkt->info.dataloop_size = 0;
371
        get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
372
        get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
373 374
        if (use_immed_resp_pkt)
            get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
375

Xin Zhao's avatar
Xin Zhao committed
376 377
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

378
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
379 380 381
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

382
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
383 384
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
385 386

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
387
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
388 389 390 391
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
392 393 394 395 396 397 398 399 400 401 402 403 404 405

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
406 407 408
    }

  fn_exit:
409
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_GET);
410 411 412 413 414 415 416 417 418 419
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
420
#define FUNCNAME MPIDI_CH3I_Accumulate
421 422
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
423 424 425 426
int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                          origin_datatype, int target_rank, MPI_Aint target_disp,
                          int target_count, MPI_Datatype target_datatype, MPI_Op op,
                          MPID_Win * win_ptr, MPID_Request * ureq)
427
{
428
    int mpi_errno = MPI_SUCCESS;
429
    MPIDI_msg_sz_t data_sz;
430
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
431
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
432
    MPID_Datatype *dtp;
433
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
434
    int made_progress = 0;
435
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
436

437
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
438

439 440 441
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

442 443 444 445
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

446 447
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

448
    if (data_sz == 0) {
449
        goto fn_exit;
450
    }
451

452
    rank = win_ptr->comm_ptr->rank;
453 454 455

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
456
        /* check if target is local and shared memory is allocated on window,
457
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
458 459

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
460 461 462 463 464
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
465 466 467 468
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

469
    /* Do =! rank first (most likely branch?) */
Xin Zhao's avatar
Xin Zhao committed
470
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
471 472 473 474 475 476
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count, target_datatype,
                                          op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
477 478 479

        if (ureq) {
            /* Complete user request and release the ch3 ref */
480
            MPIDI_CH3U_Request_complete(ureq);
481
        }
482
    }
483
    else {
484
        MPIDI_RMA_Op_t *op_ptr = NULL;
485
        MPIDI_CH3_Pkt_accum_t *accum_pkt = NULL;
486
        int use_immed_pkt = FALSE;
487
        int is_origin_contig, is_target_contig;
Xin Zhao's avatar
Xin Zhao committed
488 489 490 491
        MPI_Aint stream_elem_count, stream_unit_count;
        MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
        MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL;
        int i;
492

493
        /* queue it up */
494
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
495 496
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
497

Xin Zhao's avatar
Xin Zhao committed
498 499
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

500
        /******************** Setting operation struct areas ***********************/
501

502 503 504 505
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
506 507

        /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
508
        op_ptr->ureq = ureq;
509 510 511 512

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
513
            MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
514 515
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
516
            MPID_Datatype_get_ptr(target_datatype, target_dtp);
517
        }
518

Xin Zhao's avatar
Xin Zhao committed
519 520
        /* Get size and count for predefined datatype elements */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
521
            MPID_Datatype_get_size_macro(origin_datatype, predefined_dtp_size);
Xin Zhao's avatar
Xin Zhao committed
522 523 524 525
            predefined_dtp_count = origin_count;
            MPID_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
        }
        else {
526 527
            MPIU_Assert(origin_dtp->basic_type != MPI_DATATYPE_NULL);
            MPID_Datatype_get_size_macro(origin_dtp->basic_type, predefined_dtp_size);
Xin Zhao's avatar
Xin Zhao committed
528
            predefined_dtp_count = data_sz / predefined_dtp_size;
529
            MPID_Datatype_get_extent_macro(origin_dtp->basic_type, predefined_dtp_extent);
Xin Zhao's avatar
Xin Zhao committed
530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548
        }
        MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                    predefined_dtp_extent > 0);

        /* Calculate number of predefined elements in each stream unit, and
         * total number of stream units. */
        stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
        stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
        MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);

        for (i = 0; i < stream_unit_count; i++) {
            if (origin_dtp != NULL) {
                MPID_Datatype_add_ref(origin_dtp);
            }
            if (target_dtp != NULL) {
                MPID_Datatype_add_ref(target_dtp);
            }
        }

549 550 551
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

552
        /* Judge if we can use IMMED data packet */
553 554
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig) {
Xin Zhao's avatar
Xin Zhao committed
555
            if (data_sz <= MPIDI_RMA_IMMED_BYTES)
556 557 558
                use_immed_pkt = TRUE;
        }

559
        /* Judge if this operation is an piggyback candidate. */
560 561
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
562 563 564
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
Xin Zhao's avatar
Xin Zhao committed
565
            if (data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
566
                op_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
567 568
        }

569 570
        /************** Setting packet struct areas in operation ****************/

571
        accum_pkt = &(op_ptr->pkt.accum);
572

573 574 575 576 577 578 579
        if (use_immed_pkt) {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE_IMMED);
        }
        else {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
        }

580 581
        accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
582 583
        accum_pkt->count = target_count;
        accum_pkt->datatype = target_datatype;
584
        accum_pkt->info.dataloop_size = 0;
585
        accum_pkt->op = op;
586
        accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
587 588
        accum_pkt->source_win_handle = win_ptr->handle;
        accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
589
        if (use_immed_pkt) {
590
            void *src = (void *) origin_addr, *dest = (void *) (accum_pkt->info.data);
Xin Zhao's avatar
Xin Zhao committed
591
            mpi_errno = immed_copy(src, dest, data_sz);
592 593
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
594
        }
595

Xin Zhao's avatar
Xin Zhao committed
596 597
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

598
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
599 600 601
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

602
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
603 604
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
605 606

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
607
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
608 609 610 611
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
612 613 614 615 616 617 618 619 620 621 622 623 624 625

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
626 627
    }

628
  fn_exit:
629
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
630 631 632 633 634 635 636 637 638 639
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
640
#define FUNCNAME MPIDI_CH3I_Get_accumulate
641 642
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
643 644 645 646 647
int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                              MPI_Datatype origin_datatype, void *result_addr, int result_count,
                              MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                              int target_count, MPI_Datatype target_datatype, MPI_Op op,
                              MPID_Win * win_ptr, MPID_Request * ureq)
648 649
{
    int mpi_errno = MPI_SUCCESS;
650
    MPIDI_msg_sz_t orig_data_sz, target_data_sz;
651 652 653 654 655
    int rank;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
656
    int made_progress = 0;
657
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
658

659
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
660

661 662 663
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

664 665 666 667
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

668 669
    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, target_data_sz, dtp,
                            dt_true_lb);
670

671
    if (target_data_sz == 0) {
672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they are
         * not within the same SHM region. Here we filter out ch3:sock by checking the shm_allocated flag
         * first, which is only set to TRUE when the SHM region is allocated in nemesis.
         * In the future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do != rank first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
                                              result_addr, result_count, result_datatype,
                                              target_rank, target_disp, target_count,
                                              target_datatype, op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
701 702 703

        if (ureq) {
            /* Complete user request and release the ch3 ref */
704
            MPIDI_CH3U_Request_complete(ureq);
705
        }
706 707
    }
    else {
708
        MPIDI_RMA_Op_t *op_ptr = NULL;
709 710

        /* Append the operation to the window's RMA ops queue */
711
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
712 713
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
714 715 716

        /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */

Xin Zhao's avatar
Xin Zhao committed
717 718
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

719 720
        if (op == MPI_NO_OP) {
            /* Convert GAcc to a Get */
721
            MPIDI_CH3_Pkt_get_t *get_pkt;
722
            int use_immed_resp_pkt = FALSE;
723
            int is_result_contig, is_target_contig;
724 725

            /******************** Setting operation struct areas ***********************/
726

727 728 729 730
            op_ptr->origin_addr = result_addr;
            op_ptr->origin_count = result_count;
            op_ptr->origin_datatype = result_datatype;
            op_ptr->target_rank = target_rank;
731 732

            /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
733
            op_ptr->ureq = ureq;
734 735 736 737 738 739 740 741 742

            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
                MPID_Datatype_get_ptr(result_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
                MPID_Datatype_get_ptr(target_datatype, dtp);
                MPID_Datatype_add_ref(dtp);
            }
743

744 745 746
            MPID_Datatype_is_contig(result_datatype, &is_result_contig);
            MPID_Datatype_is_contig(target_datatype, &is_target_contig);

747
            /* Judge if we can use IMMED data response packet */
748 749 750
            if (MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
                MPIR_DATATYPE_IS_PREDEFINED(target_datatype) &&
                is_result_contig && is_target_contig) {
751
                if (target_data_sz <= MPIDI_RMA_IMMED_BYTES)
752 753 754
                    use_immed_resp_pkt = TRUE;
            }

755
            /* Judge if this operation is a piggyback candidate */
756 757
            if (MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
                MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
758 759 760
                /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
                 * for both origin and target data. We should extend this optimization to derived
                 * datatypes as well. */
761
                op_ptr->piggyback_lock_candidate = 1;
762
            }
763 764 765

            /************** Setting packet struct areas in operation ****************/

766
            get_pkt = &(op_ptr->pkt.get);
767
            MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
768 769
            get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
                win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
770 771
            get_pkt->count = target_count;
            get_pkt->datatype = target_datatype;
772
            get_pkt->info.dataloop_size = 0;
773
            get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
774
            get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
775 776
            if (use_immed_resp_pkt == TRUE)
                get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
777 778 779
        }

        else {
780
            MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt;
781 782
            MPI_Aint origin_type_size;
            int use_immed_pkt = FALSE;
783
            int is_origin_contig, is_target_contig, is_result_contig;
Xin Zhao's avatar
Xin Zhao committed
784 785 786 787
            MPI_Aint stream_elem_count, stream_unit_count;
            MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
            MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL, *result_dtp = NULL;
            int i;
788 789

            /******************** Setting operation struct areas ***********************/
790

791 792 793 794 795 796 797
            op_ptr->origin_addr = (void *) origin_addr;
            op_ptr->origin_count = origin_count;
            op_ptr->origin_datatype = origin_datatype;
            op_ptr->result_addr = result_addr;
            op_ptr->result_count = result_count;
            op_ptr->result_datatype = result_datatype;
            op_ptr->target_rank = target_rank;
798 799

            /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
800
            op_ptr->ureq = ureq;
801 802 803 804

            /* If the origin, result or target datatype is derived, look up its
             * datatype object; the reference counts are incremented below, once
             * per stream unit. */
            if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
805
                MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
806 807
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
808
                MPID_Datatype_get_ptr(result_datatype, result_dtp);
809 810
            }
            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
811
                MPID_Datatype_get_ptr(target_datatype, target_dtp);
812
            }
Xin Zhao's avatar
Xin Zhao committed
813

814
            MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
815
            MPIU_Assign_trunc(orig_data_sz, origin_count * origin_type_size, MPIDI_msg_sz_t);
816

Xin Zhao's avatar
Xin Zhao committed
817 818 819 820 821 822 823
            /* Get size and count for predefined datatype elements */
            if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
                predefined_dtp_size = origin_type_size;
                predefined_dtp_count = origin_count;
                MPID_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
            }
            else {
824 825
                MPIU_Assert(origin_dtp->basic_type != MPI_DATATYPE_NULL);
                MPID_Datatype_get_size_macro(origin_dtp->basic_type, predefined_dtp_size);
826
                predefined_dtp_count = orig_data_sz / predefined_dtp_size;
827
                MPID_Datatype_get_extent_macro(origin_dtp->basic_type, predefined_dtp_extent);
Xin Zhao's avatar
Xin Zhao committed
828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849
            }
            MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                        predefined_dtp_extent > 0);

            /* Calculate number of predefined elements in each stream unit, and
             * total number of stream units. */
            stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
            stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
            MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);

            for (i = 0; i < stream_unit_count; i++) {
                if (origin_dtp != NULL) {
                    MPID_Datatype_add_ref(origin_dtp);
                }
                if (target_dtp != NULL) {
                    MPID_Datatype_add_ref(target_dtp);
                }
                if (result_dtp != NULL) {
                    MPID_Datatype_add_ref(result_dtp);
                }
            }

850 851 852 853
            MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
            MPID_Datatype_is_contig(target_datatype, &is_target_contig);
            MPID_Datatype_is_contig(result_datatype, &is_result_contig);

854
            /* Judge if we can use IMMED data packet */
855 856 857 858
            if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
                MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
                MPIR_DATATYPE_IS_PREDEFINED(target_datatype) &&
                is_origin_contig && is_target_contig && is_result_contig) {
859
                if (orig_data_sz <= MPIDI_RMA_IMMED_BYTES)
860 861 862
                    use_immed_pkt = TRUE;
            }

863
            /* Judge if this operation is a piggyback candidate */
864 865 866
            if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
                MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
                MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
867 868 869
                /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
                 * for origin, target and result data. We should extend this optimization to derived
                 * datatypes as well. */
870
                if (orig_data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)