/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidrma.h"

MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_rmaqueue_set);

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS
      category    : CH3
      type        : int
      default     : 100
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of number of posted operations
          when starting poking progress in operation routines.
          When the value is negative, runtime never pokes progress
          engine in operation routines; when the value is zero,
          runtime always pokes progress engine in operation
          routines; when the value is larger than zero, runtime
          starts to poke progress engine when number of posted
          operations reaches that value.

    - name        : MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE
      category    : CH3
      type        : int
      default     : 65536
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of data size of a RMA operation
          which can be piggybacked with a LOCK message. It is
          always a positive value and should not be smaller
          than MPIDI_RMA_IMMED_BYTES.
          If user sets it as a small value, for middle and large
          data size, we will lose performance because of always
          waiting for round-trip of LOCK synchronization; if
          user sets it as a large value, we need to consume
          more memory on target side to buffer this lock request
          when lock is not satisfied.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

54
#undef FUNCNAME
55
#define FUNCNAME MPIDI_CH3I_Put
56 57
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
58 59 60 61
int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
62 63
{
    int mpi_errno = MPI_SUCCESS;
64
    int dt_contig ATTRIBUTE((unused)), rank;
65
    MPID_Datatype *dtp;
Pavan Balaji's avatar
Pavan Balaji committed
66
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
67
    MPIDI_msg_sz_t data_sz;
68
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
69
    int made_progress = 0;
70
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_PUT);
71

72
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_PUT);
73

74 75 76
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

77 78 79 80
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

81 82
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

83
    if (data_sz == 0) {
84
        goto fn_exit;
85 86
    }

87
    rank = win_ptr->comm_ptr->rank;
88 89 90

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
91
        /* check if target is local and shared memory is allocated on window,
92
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
93 94

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
95 96 97 98 99
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
100 101 102 103
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

104
    /* If the put is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
105
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
106
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
107 108
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
109 110
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
111 112 113

        if (ureq) {
            /* Complete user request and release the ch3 ref */
114
            MPIDI_CH3U_Request_complete(ureq);
115
        }
116
    }
117
    else {
118
        MPIDI_RMA_Op_t *op_ptr = NULL;
119
        MPIDI_CH3_Pkt_put_t *put_pkt = NULL;
120
        int use_immed_pkt = FALSE;
121
        int is_origin_contig, is_target_contig;
122

123
        /* queue it up */
124
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
125 126
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
127

Xin Zhao's avatar
Xin Zhao committed
128 129
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

130
        /******************** Setting operation struct areas ***********************/
131

132
        /* FIXME: For contig and very short operations, use a streamlined op */
133 134 135 136
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
137 138

        /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
139
        op_ptr->ureq = ureq;
140 141 142 143 144 145 146 147 148 149 150

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
151

152 153 154
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

155
        /* Judge if we can use IMMED data packet */
156 157
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig) {
Xin Zhao's avatar
Xin Zhao committed
158
            if (data_sz <= MPIDI_RMA_IMMED_BYTES)
159 160 161
                use_immed_pkt = TRUE;
        }

162
        /* Judge if this operation is an piggyback candidate */
163 164
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
165 166 167
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
Xin Zhao's avatar
Xin Zhao committed
168
            if (data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
169
                op_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
170 171
        }

172 173
        /************** Setting packet struct areas in operation ****************/

174
        put_pkt = &(op_ptr->pkt.put);
175 176 177 178 179 180 181 182

        if (use_immed_pkt) {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT_IMMED);
        }
        else {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
        }

183 184
        put_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
185 186
        put_pkt->count = target_count;
        put_pkt->datatype = target_datatype;
187
        put_pkt->info.dataloop_size = 0;
188
        put_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
189 190
        put_pkt->source_win_handle = win_ptr->handle;
        put_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
191
        if (use_immed_pkt) {
192
            void *src = (void *) origin_addr, *dest = (void *) (put_pkt->info.data);
Xin Zhao's avatar
Xin Zhao committed
193
            mpi_errno = immed_copy(src, dest, data_sz);
194 195
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
196
        }
197

Xin Zhao's avatar
Xin Zhao committed
198 199
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

200
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
201 202 203
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

204
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
205 206
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
207 208

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
209
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
210 211 212 213
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
214 215 216 217 218 219 220 221 222 223 224 225 226 227

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
228 229 230
    }

  fn_exit:
231
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PUT);
232 233 234 235 236 237 238 239 240
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

#undef FUNCNAME
241
#define FUNCNAME MPIDI_CH3I_Get
242 243
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
244 245 246 247
int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
248 249
{
    int mpi_errno = MPI_SUCCESS;
250
    MPIDI_msg_sz_t orig_data_sz, target_data_sz;
251
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
252
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
253
    MPID_Datatype *dtp;
254
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
255
    int made_progress = 0;
256
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET);
257

258
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET);
259

260 261 262
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

263 264 265 266
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

267 268
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, orig_data_sz, dtp,
                            dt_true_lb);
269

270
    if (orig_data_sz == 0) {
271
        goto fn_exit;
272 273
    }

274
    rank = win_ptr->comm_ptr->rank;
Xin Zhao's avatar
Xin Zhao committed
275

276 277
    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
278
        /* check if target is local and shared memory is allocated on window,
279
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
280 281

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
282 283 284 285 286
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
287 288 289
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }
290

291
    /* If the get is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
292
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
293
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
294 295
        mpi_errno = MPIDI_CH3I_Shm_get_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
296 297
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
298 299 300

        if (ureq) {
            /* Complete user request and release the ch3 ref */
301
            MPIDI_CH3U_Request_complete(ureq);
302
        }
303
    }
304
    else {
305
        MPIDI_RMA_Op_t *op_ptr = NULL;
306
        MPIDI_CH3_Pkt_get_t *get_pkt = NULL;
307 308
        MPI_Aint target_type_size;
        int use_immed_resp_pkt = FALSE;
309
        int is_origin_contig, is_target_contig;
310

311
        /* queue it up */
312
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
313 314
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
315

Xin Zhao's avatar
Xin Zhao committed
316 317
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

318
        /******************** Setting operation struct areas ***********************/
319

320
        /* FIXME: For contig and very short operations, use a streamlined op */
321 322 323 324
        op_ptr->origin_addr = origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
325 326

        /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
327
        op_ptr->ureq = ureq;
328 329 330 331 332 333 334 335 336 337 338

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
339

340 341 342
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

343
        MPID_Datatype_get_size_macro(target_datatype, target_type_size);
344
        MPIU_Assign_trunc(target_data_sz, target_count * target_type_size, MPIDI_msg_sz_t);
345 346

        /* Judge if we can use IMMED data response packet */
347 348
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig) {
349
            if (target_data_sz <= MPIDI_RMA_IMMED_BYTES)
350 351 352
                use_immed_resp_pkt = TRUE;
        }

353
        /* Judge if this operation is an piggyback candidate. */
354 355
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
356 357 358
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
359
            op_ptr->piggyback_lock_candidate = 1;
360 361
        }

362 363
        /************** Setting packet struct areas in operation ****************/

364
        get_pkt = &(op_ptr->pkt.get);
365
        MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
366 367
        get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
368 369
        get_pkt->count = target_count;
        get_pkt->datatype = target_datatype;
370
        get_pkt->info.dataloop_size = 0;
371
        get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
372
        get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
373 374
        if (use_immed_resp_pkt)
            get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
375

Xin Zhao's avatar
Xin Zhao committed
376 377
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

378
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
379 380 381
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

382
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
383 384
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
385 386

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
387
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
388 389 390 391
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
392 393 394 395 396 397 398 399 400 401 402 403 404 405

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
406 407 408
    }

  fn_exit:
409
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_GET);
410 411 412 413 414 415 416 417 418 419
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
420
#define FUNCNAME MPIDI_CH3I_Accumulate
421 422
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
423 424 425 426
int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                          origin_datatype, int target_rank, MPI_Aint target_disp,
                          int target_count, MPI_Datatype target_datatype, MPI_Op op,
                          MPID_Win * win_ptr, MPID_Request * ureq)
427
{
428
    int mpi_errno = MPI_SUCCESS;
429
    MPIDI_msg_sz_t data_sz;
430
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
431
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
432
    MPID_Datatype *dtp;
433
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
434
    int made_progress = 0;
435
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
436

437
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
438

439 440 441
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

442 443 444 445
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

446 447
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

448
    if (data_sz == 0) {
449
        goto fn_exit;
450
    }
451

452
    rank = win_ptr->comm_ptr->rank;
453 454 455

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
456
        /* check if target is local and shared memory is allocated on window,
457
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
458 459

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
460 461 462 463 464
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
465 466 467 468
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

469
    /* Do =! rank first (most likely branch?) */
Xin Zhao's avatar
Xin Zhao committed
470
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
471 472 473 474 475 476
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count, target_datatype,
                                          op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
477 478 479

        if (ureq) {
            /* Complete user request and release the ch3 ref */
480
            MPIDI_CH3U_Request_complete(ureq);
481
        }
482
    }
483
    else {
484
        MPIDI_RMA_Op_t *op_ptr = NULL;
485
        MPIDI_CH3_Pkt_accum_t *accum_pkt = NULL;
486
        int use_immed_pkt = FALSE;
487
        int is_origin_contig, is_target_contig;
Xin Zhao's avatar
Xin Zhao committed
488 489 490 491
        MPI_Aint stream_elem_count, stream_unit_count;
        MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
        MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL;
        int i;
492

493
        /* queue it up */
494
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
495 496
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
497

Xin Zhao's avatar
Xin Zhao committed
498 499
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

500
        /******************** Setting operation struct areas ***********************/
501

502 503 504 505
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
506 507

        /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
508
        op_ptr->ureq = ureq;
509 510 511 512

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
513
            MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
514 515
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
516
            MPID_Datatype_get_ptr(target_datatype, target_dtp);
517
        }
518

Xin Zhao's avatar
Xin Zhao committed
519 520
        /* Get size and count for predefined datatype elements */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
521
            MPID_Datatype_get_size_macro(origin_datatype, predefined_dtp_size);
Xin Zhao's avatar
Xin Zhao committed
522 523 524 525
            predefined_dtp_count = origin_count;
            MPID_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
        }
        else {
526 527
            MPIU_Assert(origin_dtp->basic_type != MPI_DATATYPE_NULL);
            MPID_Datatype_get_size_macro(origin_dtp->basic_type, predefined_dtp_size);
Xin Zhao's avatar
Xin Zhao committed
528
            predefined_dtp_count = data_sz / predefined_dtp_size;
529
            MPID_Datatype_get_extent_macro(origin_dtp->basic_type, predefined_dtp_extent);
Xin Zhao's avatar
Xin Zhao committed
530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548
        }
        MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                    predefined_dtp_extent > 0);

        /* Calculate number of predefined elements in each stream unit, and
         * total number of stream units. */
        stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
        stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
        MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);

        for (i = 0; i < stream_unit_count; i++) {
            if (origin_dtp != NULL) {
                MPID_Datatype_add_ref(origin_dtp);
            }
            if (target_dtp != NULL) {
                MPID_Datatype_add_ref(target_dtp);
            }
        }

549 550 551
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

552
        /* Judge if we can use IMMED data packet */
553 554
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig) {
Xin Zhao's avatar
Xin Zhao committed
555
            if (data_sz <= MPIDI_RMA_IMMED_BYTES)
556 557 558
                use_immed_pkt = TRUE;
        }

559
        /* Judge if this operation is an piggyback candidate. */
560 561
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
562 563 564
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
Xin Zhao's avatar
Xin Zhao committed
565
            if (data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
566
                op_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
567 568
        }

569 570
        /************** Setting packet struct areas in operation ****************/

571
        accum_pkt = &(op_ptr->pkt.accum);
572

573 574 575 576 577 578 579
        if (use_immed_pkt) {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE_IMMED);
        }
        else {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
        }

580 581
        accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
582 583
        accum_pkt->count = target_count;
        accum_pkt->datatype = target_datatype;
584
        accum_pkt->info.dataloop_size = 0;
585
        accum_pkt->op = op;
586
        accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
587 588
        accum_pkt->source_win_handle = win_ptr->handle;
        accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
589
        if (use_immed_pkt) {
590
            void *src = (void *) origin_addr, *dest = (void *) (accum_pkt->info.data);
Xin Zhao's avatar
Xin Zhao committed
591
            mpi_errno = immed_copy(src, dest, data_sz);
592 593
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
594
        }
595

Xin Zhao's avatar
Xin Zhao committed
596 597
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

598
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
599 600 601
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

602
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
603 604
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
605 606

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
607
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
608 609 610 611
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
612 613 614 615 616 617 618 619 620 621 622 623 624 625

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
626 627
    }

628
  fn_exit:
629
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
630 631 632 633 634 635 636 637 638 639
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
640
#define FUNCNAME MPIDI_CH3I_Get_accumulate
641 642
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
643 644 645 646 647
int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                              MPI_Datatype origin_datatype, void *result_addr, int result_count,
                              MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                              int target_count, MPI_Datatype target_datatype, MPI_Op op,
                              MPID_Win * win_ptr, MPID_Request * ureq)
648 649
{
    int mpi_errno = MPI_SUCCESS;
650
    MPIDI_msg_sz_t orig_data_sz, target_data_sz;
651 652 653 654 655
    int rank;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
656
    int made_progress = 0;
657
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
658

659
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
660

661 662 663
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

664 665 666 667
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

668 669
    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, target_data_sz, dtp,
                            dt_true_lb);
670

671
    if (target_data_sz == 0) {
672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they are
         * not within the same SHM region. Here we filter out ch3:sock by checking the shm_allocated flag
         * first, which is only set to TRUE when an SHM region is allocated in nemesis.
         * In the future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do the (target_rank != rank) case first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
                                              result_addr, result_count, result_datatype,
                                              target_rank, target_disp, target_count,
                                              target_datatype, op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
701 702 703

        if (ureq) {
            /* Complete user request and release the ch3 ref */
704
            MPIDI_CH3U_Request_complete(ureq);
705
        }
706 707
    }
    else {
708
        MPIDI_RMA_Op_t *op_ptr = NULL;
709 710 711 712 713 714 715
        MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt;
        MPI_Aint origin_type_size;
        int use_immed_pkt = FALSE, i;
        int is_origin_contig, is_target_contig, is_result_contig;
        MPI_Aint stream_elem_count, stream_unit_count;
        MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
        MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL, *result_dtp = NULL;
716 717

        /* Append the operation to the window's RMA ops queue */
718
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
719 720
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
721 722 723

        /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */

Xin Zhao's avatar
Xin Zhao committed
724 725
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

726
        /******************** Setting operation struct areas ***********************/
727

728 729 730 731 732 733 734
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->result_addr = result_addr;
        op_ptr->result_count = result_count;
        op_ptr->result_datatype = result_datatype;
        op_ptr->target_rank = target_rank;
735

736 737
        /* Remember user request */
        op_ptr->ureq = ureq;
738

739 740 741 742 743 744 745 746 747 748 749
        /* If the origin, result or target datatypes are derived, look up their
         * datatype objects; their reference counts are incremented below, once
         * per stream unit. */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
            MPID_Datatype_get_ptr(result_datatype, result_dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, target_dtp);
        }
750

751 752
        MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
        MPIU_Assign_trunc(orig_data_sz, origin_count * origin_type_size, MPIDI_msg_sz_t);
753

754 755 756 757 758
        /* Get size and count for predefined datatype elements */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            predefined_dtp_size = origin_type_size;
            predefined_dtp_count = origin_count;
            MPID_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
759 760
        }
        else {
761 762 763 764 765 766 767
            MPIU_Assert(origin_dtp->basic_type != MPI_DATATYPE_NULL);
            MPID_Datatype_get_size_macro(origin_dtp->basic_type, predefined_dtp_size);
            predefined_dtp_count = orig_data_sz / predefined_dtp_size;
            MPID_Datatype_get_extent_macro(origin_dtp->basic_type, predefined_dtp_extent);
        }
        MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                    predefined_dtp_extent > 0);
Xin Zhao's avatar
Xin Zhao committed
768

769 770 771 772 773
        /* Calculate number of predefined elements in each stream unit, and
         * total number of stream units. */
        stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
        stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
        MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);
774

775 776 777
        for (i = 0; i < stream_unit_count; i++) {
            if (origin_dtp != NULL) {
                MPID_Datatype_add_ref(origin_dtp);
Xin Zhao's avatar
Xin Zhao committed
778
            }
779 780
            if (target_dtp != NULL) {
                MPID_Datatype_add_ref(target_dtp);
Xin Zhao's avatar
Xin Zhao committed
781
            }
782 783
            if (result_dtp != NULL) {
                MPID_Datatype_add_ref(result_dtp);
Xin Zhao's avatar
Xin Zhao committed
784
            }
785
        }
Xin Zhao's avatar
Xin Zhao committed
786

787 788 789
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);
        MPID_Datatype_is_contig(result_datatype, &is_result_contig);
790

791 792 793 794 795 796 797 798
        /* Judge if we can use IMMED data packet */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) &&
            is_origin_contig && is_target_contig && is_result_contig) {
            if (orig_data_sz <= MPIDI_RMA_IMMED_BYTES)
                use_immed_pkt = TRUE;
        }
799

800 801 802 803 804 805 806 807 808 809
        /* Judge if this operation is a piggyback candidate */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for origin, target and result data. We should extend this optimization to derived
             * datatypes as well. */
            if (orig_data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
                op_ptr->piggyback_lock_candidate = 1;
        }
810

811
        /************** Setting packet struct areas in operation ****************/
812

813
        get_accum_pkt = &(op_ptr->pkt.get_accum);
814

815 816 817 818 819 820
        if (use_immed_pkt) {
            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM_IMMED);
        }
        else {
            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM);
        }
821

822 823 824 825 826 827 828 829 830 831 832 833 834
        get_accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        get_accum_pkt->count = target_count;
        get_accum_pkt->datatype = target_datatype;
        get_accum_pkt->info.dataloop_size = 0;
        get_accum_pkt->op = op;
        get_accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        get_accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
        if (use_immed_pkt) {
            void *src = (void *) origin_addr, *dest = (void *) (get_accum_pkt->info.data);
            mpi_errno = immed_copy(src, dest, orig_data_sz);
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
835 836
        }

Xin Zhao's avatar
Xin Zhao committed
837 838
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

839
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
840 841 842
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

843
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
844 845
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
846 847

        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
848
            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
849 850 851 852
            mpi_errno = poke_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
853 854 855 856 857 858 859 860 861 862 863 864 865 866

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
867 868
    }

869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953