/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidrma.h"

Xin Zhao's avatar
Xin Zhao committed
9 10
MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_rmaqueue_set);

11 12 13 14
/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE
      category    : CH3
      type        : int
      default     : 65536
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of data size of an RMA operation
          which can be piggybacked with a LOCK message. It is
          always a positive value and should not be smaller
          than MPIDI_RMA_IMMED_BYTES.
          If user sets it as a small value, for middle and large
          data size, we will lose performance because of always
          waiting for round-trip of LOCK synchronization; if
          user sets it as a large value, we need to consume
          more memory on target side to buffer this lock request
          when lock is not satisfied.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

37
#undef FUNCNAME
38
#define FUNCNAME MPIDI_CH3I_Put
39 40
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
41 42 43 44
int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
45 46
{
    int mpi_errno = MPI_SUCCESS;
47
    int dt_contig ATTRIBUTE((unused)), rank;
48
    MPID_Datatype *dtp;
Pavan Balaji's avatar
Pavan Balaji committed
49
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
50
    MPIDI_msg_sz_t data_sz;
51
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
52
    int made_progress = 0;
53
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_PUT);
54

55
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_PUT);
56

57 58 59
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

60 61 62 63
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

64 65
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

66
    if (data_sz == 0) {
67
        goto fn_exit;
68 69
    }

70
    rank = win_ptr->comm_ptr->rank;
71 72 73

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
74
        /* check if target is local and shared memory is allocated on window,
75
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
76 77

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
78 79 80 81 82
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
83 84 85 86
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

87
    /* If the put is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
88
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
89
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
90 91
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
92 93
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
94 95 96

        if (ureq) {
            /* Complete user request and release the ch3 ref */
97
            MPIDI_CH3U_Request_complete(ureq);
98
        }
99
    }
100
    else {
101
        MPIDI_RMA_Op_t *op_ptr = NULL;
102
        MPIDI_CH3_Pkt_put_t *put_pkt = NULL;
103
        int use_immed_pkt = FALSE;
104
        int is_origin_contig, is_target_contig;
105

106
        /* queue it up */
107
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
108 109
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
110

Xin Zhao's avatar
Xin Zhao committed
111 112
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

113
        /******************** Setting operation struct areas ***********************/
114

115
        /* FIXME: For contig and very short operations, use a streamlined op */
116 117 118 119
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
120 121

        /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
122
        op_ptr->ureq = ureq;
123 124 125 126 127 128 129 130 131 132 133

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
134

135 136 137
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

138
        /* Judge if we can use IMMED data packet */
139 140
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig) {
Xin Zhao's avatar
Xin Zhao committed
141
            if (data_sz <= MPIDI_RMA_IMMED_BYTES)
142 143 144
                use_immed_pkt = TRUE;
        }

145
        /* Judge if this operation is an piggyback candidate */
146 147
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
148 149 150
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
Xin Zhao's avatar
Xin Zhao committed
151
            if (data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
152
                op_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
153 154
        }

155 156
        /************** Setting packet struct areas in operation ****************/

157
        put_pkt = &(op_ptr->pkt.put);
158 159 160 161 162 163 164 165

        if (use_immed_pkt) {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT_IMMED);
        }
        else {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
        }

166 167
        put_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
168 169
        put_pkt->count = target_count;
        put_pkt->datatype = target_datatype;
170
        put_pkt->info.dataloop_size = 0;
171
        put_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
172 173
        put_pkt->source_win_handle = win_ptr->handle;
        put_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
174
        if (use_immed_pkt) {
175
            void *src = (void *) origin_addr, *dest = (void *) (put_pkt->info.data);
Xin Zhao's avatar
Xin Zhao committed
176
            mpi_errno = immed_copy(src, dest, data_sz);
177 178
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
179
        }
180

Xin Zhao's avatar
Xin Zhao committed
181 182
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

183
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
184 185 186
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

187
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
188 189
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
190

191 192 193 194 195 196 197 198 199 200 201 202 203
        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
204 205 206
    }

  fn_exit:
207
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PUT);
208 209 210 211 212 213 214 215 216
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

#undef FUNCNAME
217
#define FUNCNAME MPIDI_CH3I_Get
218 219
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
220 221 222 223
int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
224 225
{
    int mpi_errno = MPI_SUCCESS;
226
    MPIDI_msg_sz_t orig_data_sz, target_data_sz;
227
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
228
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
229
    MPID_Datatype *dtp;
230
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
231
    int made_progress = 0;
232
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET);
233

234
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET);
235

236 237 238
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

239 240 241 242
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

243 244
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, orig_data_sz, dtp,
                            dt_true_lb);
245

246
    if (orig_data_sz == 0) {
247
        goto fn_exit;
248 249
    }

250
    rank = win_ptr->comm_ptr->rank;
Xin Zhao's avatar
Xin Zhao committed
251

252 253
    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
254
        /* check if target is local and shared memory is allocated on window,
255
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
256 257

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
258 259 260 261 262
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
263 264 265
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }
266

267
    /* If the get is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
268
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
269
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
270 271
        mpi_errno = MPIDI_CH3I_Shm_get_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
272 273
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
274 275 276

        if (ureq) {
            /* Complete user request and release the ch3 ref */
277
            MPIDI_CH3U_Request_complete(ureq);
278
        }
279
    }
280
    else {
281
        MPIDI_RMA_Op_t *op_ptr = NULL;
282
        MPIDI_CH3_Pkt_get_t *get_pkt = NULL;
283 284
        MPI_Aint target_type_size;
        int use_immed_resp_pkt = FALSE;
285
        int is_origin_contig, is_target_contig;
286

287
        /* queue it up */
288
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
289 290
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
291

Xin Zhao's avatar
Xin Zhao committed
292 293
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

294
        /******************** Setting operation struct areas ***********************/
295

296
        /* FIXME: For contig and very short operations, use a streamlined op */
297 298 299 300
        op_ptr->origin_addr = origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
301 302

        /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
303
        op_ptr->ureq = ureq;
304 305 306 307 308 309 310 311 312 313 314

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
315

316 317 318
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

319
        MPID_Datatype_get_size_macro(target_datatype, target_type_size);
320
        MPIU_Assign_trunc(target_data_sz, target_count * target_type_size, MPIDI_msg_sz_t);
321 322

        /* Judge if we can use IMMED data response packet */
323 324
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig) {
325
            if (target_data_sz <= MPIDI_RMA_IMMED_BYTES)
326 327 328
                use_immed_resp_pkt = TRUE;
        }

329
        /* Judge if this operation is an piggyback candidate. */
330 331
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
332 333 334
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
335
            op_ptr->piggyback_lock_candidate = 1;
336 337
        }

338 339
        /************** Setting packet struct areas in operation ****************/

340
        get_pkt = &(op_ptr->pkt.get);
341
        MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
342 343
        get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
344 345
        get_pkt->count = target_count;
        get_pkt->datatype = target_datatype;
346
        get_pkt->info.dataloop_size = 0;
347
        get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
348
        get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
349 350
        if (use_immed_resp_pkt)
            get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
351

Xin Zhao's avatar
Xin Zhao committed
352 353
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

354
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
355 356 357
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

358
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
359 360
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
361

362 363 364 365 366 367 368 369 370 371 372 373 374
        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
375 376 377
    }

  fn_exit:
378
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_GET);
379 380 381 382 383 384 385 386 387 388
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
389
#define FUNCNAME MPIDI_CH3I_Accumulate
390 391
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
392 393 394 395
int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                          origin_datatype, int target_rank, MPI_Aint target_disp,
                          int target_count, MPI_Datatype target_datatype, MPI_Op op,
                          MPID_Win * win_ptr, MPID_Request * ureq)
396
{
397
    int mpi_errno = MPI_SUCCESS;
398
    MPIDI_msg_sz_t data_sz;
399
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
400
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
401
    MPID_Datatype *dtp;
402
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
403
    int made_progress = 0;
404
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
405

406
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
407

408 409 410
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

411 412 413 414
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

415 416
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

417
    if (data_sz == 0) {
418
        goto fn_exit;
419
    }
420

421
    rank = win_ptr->comm_ptr->rank;
422 423 424

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
425
        /* check if target is local and shared memory is allocated on window,
426
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
427 428

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
429 430 431 432 433
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
434 435 436 437
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

438
    /* Do =! rank first (most likely branch?) */
Xin Zhao's avatar
Xin Zhao committed
439
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
440 441 442 443 444 445
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count, target_datatype,
                                          op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
446 447 448

        if (ureq) {
            /* Complete user request and release the ch3 ref */
449
            MPIDI_CH3U_Request_complete(ureq);
450
        }
451
    }
452
    else {
453
        MPIDI_RMA_Op_t *op_ptr = NULL;
454
        MPIDI_CH3_Pkt_accum_t *accum_pkt = NULL;
455
        int use_immed_pkt = FALSE;
456
        int is_origin_contig, is_target_contig;
Xin Zhao's avatar
Xin Zhao committed
457 458 459 460
        MPI_Aint stream_elem_count, stream_unit_count;
        MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
        MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL;
        int i;
461

462
        /* queue it up */
463
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
464 465
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
466

Xin Zhao's avatar
Xin Zhao committed
467 468
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

469
        /******************** Setting operation struct areas ***********************/
470

471 472 473 474
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;
475 476

        /* Remember user request */
Xin Zhao's avatar
Xin Zhao committed
477
        op_ptr->ureq = ureq;
478 479 480 481

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
482
            MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
483 484
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
485
            MPID_Datatype_get_ptr(target_datatype, target_dtp);
486
        }
487

Xin Zhao's avatar
Xin Zhao committed
488 489
        /* Get size and count for predefined datatype elements */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
Xin Zhao's avatar
Xin Zhao committed
490
            MPID_Datatype_get_size_macro(origin_datatype, predefined_dtp_size);
Xin Zhao's avatar
Xin Zhao committed
491 492 493 494
            predefined_dtp_count = origin_count;
            MPID_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
        }
        else {
495 496
            MPIU_Assert(origin_dtp->basic_type != MPI_DATATYPE_NULL);
            MPID_Datatype_get_size_macro(origin_dtp->basic_type, predefined_dtp_size);
Xin Zhao's avatar
Xin Zhao committed
497
            predefined_dtp_count = data_sz / predefined_dtp_size;
498
            MPID_Datatype_get_extent_macro(origin_dtp->basic_type, predefined_dtp_extent);
Xin Zhao's avatar
Xin Zhao committed
499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517
        }
        MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                    predefined_dtp_extent > 0);

        /* Calculate number of predefined elements in each stream unit, and
         * total number of stream units. */
        stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
        stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
        MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);

        for (i = 0; i < stream_unit_count; i++) {
            if (origin_dtp != NULL) {
                MPID_Datatype_add_ref(origin_dtp);
            }
            if (target_dtp != NULL) {
                MPID_Datatype_add_ref(target_dtp);
            }
        }

518 519 520
        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

521
        /* Judge if we can use IMMED data packet */
522 523
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig) {
Xin Zhao's avatar
Xin Zhao committed
524
            if (data_sz <= MPIDI_RMA_IMMED_BYTES)
525 526 527
                use_immed_pkt = TRUE;
        }

528
        /* Judge if this operation is an piggyback candidate. */
529 530
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
531 532 533
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
Xin Zhao's avatar
Xin Zhao committed
534
            if (data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
535
                op_ptr->piggyback_lock_candidate = 1;
Xin Zhao's avatar
Xin Zhao committed
536 537
        }

538 539
        /************** Setting packet struct areas in operation ****************/

540
        accum_pkt = &(op_ptr->pkt.accum);
541

542 543 544 545 546 547 548
        if (use_immed_pkt) {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE_IMMED);
        }
        else {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
        }

549 550
        accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
551 552
        accum_pkt->count = target_count;
        accum_pkt->datatype = target_datatype;
553
        accum_pkt->info.dataloop_size = 0;
554
        accum_pkt->op = op;
555
        accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
556 557
        accum_pkt->source_win_handle = win_ptr->handle;
        accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
558
        if (use_immed_pkt) {
559
            void *src = (void *) origin_addr, *dest = (void *) (accum_pkt->info.data);
Xin Zhao's avatar
Xin Zhao committed
560
            mpi_errno = immed_copy(src, dest, data_sz);
561 562
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
563
        }
564

Xin Zhao's avatar
Xin Zhao committed
565 566
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

567
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
568 569 570
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

571
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
572 573
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
574

575 576 577 578 579 580 581 582 583 584 585 586 587
        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
588 589
    }

590
  fn_exit:
591
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
592 593 594 595 596 597 598 599 600 601
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
602
#define FUNCNAME MPIDI_CH3I_Get_accumulate
603 604
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
605 606 607 608 609
int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                              MPI_Datatype origin_datatype, void *result_addr, int result_count,
                              MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                              int target_count, MPI_Datatype target_datatype, MPI_Op op,
                              MPID_Win * win_ptr, MPID_Request * ureq)
610 611
{
    int mpi_errno = MPI_SUCCESS;
612
    MPIDI_msg_sz_t orig_data_sz, target_data_sz;
613 614 615 616 617
    int rank;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
618
    int made_progress = 0;
619
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
620

621
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
622

623 624 625
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

626 627 628 629
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

630 631
    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, target_data_sz, dtp,
                            dt_true_lb);
632

633
    if (target_data_sz == 0) {
634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they are
         * not within the same SHM region. Here we filter out ch3:sock by checking the shm_allocated flag
         * first, which is only set to TRUE when the SHM region is allocated in nemesis.
         * In the future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do != rank first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
                                              result_addr, result_count, result_datatype,
                                              target_rank, target_disp, target_count,
                                              target_datatype, op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
663 664 665

        if (ureq) {
            /* Complete user request and release the ch3 ref */
666
            MPIDI_CH3U_Request_complete(ureq);
667
        }
668 669
    }
    else {
670
        MPIDI_RMA_Op_t *op_ptr = NULL;
671 672
        MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt;
        MPI_Aint origin_type_size;
673
        MPI_Aint target_type_size;
674 675 676 677 678
        int use_immed_pkt = FALSE, i;
        int is_origin_contig, is_target_contig, is_result_contig;
        MPI_Aint stream_elem_count, stream_unit_count;
        MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
        MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL, *result_dtp = NULL;
679 680 681 682 683
        int is_empty_origin = FALSE;

        /* Judge if origin buffer is empty */
        if (op == MPI_NO_OP)
            is_empty_origin = TRUE;
684 685

        /* Append the operation to the window's RMA ops queue */
686
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
687 688
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
689 690 691

        /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */

Xin Zhao's avatar
Xin Zhao committed
692 693
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

694
        /******************** Setting operation struct areas ***********************/
695

696 697 698 699 700 701 702
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->result_addr = result_addr;
        op_ptr->result_count = result_count;
        op_ptr->result_datatype = result_datatype;
        op_ptr->target_rank = target_rank;
703

704 705
        /* Remember user request */
        op_ptr->ureq = ureq;
706

707 708
        /* If the origin, result or target datatypes are derived, look up their
         * datatype objects here; their reference counts are incremented once
         * per stream unit further below. */
709
        if (is_empty_origin == FALSE && !MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
710 711 712 713 714 715 716 717
            MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
            MPID_Datatype_get_ptr(result_datatype, result_dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, target_dtp);
        }
718

719 720 721 722 723 724 725 726 727 728
        if (is_empty_origin == FALSE) {
            MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
            MPIU_Assign_trunc(orig_data_sz, origin_count * origin_type_size, MPIDI_msg_sz_t);
        }
        else {
            /* If origin buffer is empty, set origin data size to 0 */
            orig_data_sz = 0;
        }

        MPID_Datatype_get_size_macro(target_datatype, target_type_size);
729

730
        /* Get size and count for predefined datatype elements */
731 732 733 734
        if (MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            predefined_dtp_size = target_type_size;
            predefined_dtp_count = target_count;
            MPID_Datatype_get_extent_macro(target_datatype, predefined_dtp_extent);
735 736
        }
        else {
737 738 739 740
            MPIU_Assert(target_dtp->basic_type != MPI_DATATYPE_NULL);
            MPID_Datatype_get_size_macro(target_dtp->basic_type, predefined_dtp_size);
            predefined_dtp_count = target_data_sz / predefined_dtp_size;
            MPID_Datatype_get_extent_macro(target_dtp->basic_type, predefined_dtp_extent);
741 742 743
        }
        MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                    predefined_dtp_extent > 0);
Xin Zhao's avatar
Xin Zhao committed
744

745 746 747 748 749
        /* Calculate number of predefined elements in each stream unit, and
         * total number of stream units. */
        stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
        stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
        MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);
750

751 752 753
        for (i = 0; i < stream_unit_count; i++) {
            if (origin_dtp != NULL) {
                MPID_Datatype_add_ref(origin_dtp);
Xin Zhao's avatar
Xin Zhao committed
754
            }
755 756
            if (target_dtp != NULL) {
                MPID_Datatype_add_ref(target_dtp);
Xin Zhao's avatar
Xin Zhao committed
757
            }
758 759
            if (result_dtp != NULL) {
                MPID_Datatype_add_ref(result_dtp);
Xin Zhao's avatar
Xin Zhao committed
760
            }
761
        }
Xin Zhao's avatar
Xin Zhao committed
762

763 764 765 766 767 768 769
        if (is_empty_origin == FALSE) {
            MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        }
        else {
            /* If origin buffer is empty, mark origin data as contig data */
            is_origin_contig = 1;
        }
770 771
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);
        MPID_Datatype_is_contig(result_datatype, &is_result_contig);
772

773
        /* Judge if we can use IMMED data packet */
774
        if ((is_empty_origin == TRUE || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) &&
775 776 777
            MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) &&
            is_origin_contig && is_target_contig && is_result_contig) {
778
            if (target_data_sz <= MPIDI_RMA_IMMED_BYTES)
779 780
                use_immed_pkt = TRUE;
        }
781

782
        /* Judge if this operation is a piggyback candidate */
783
        if ((is_empty_origin == TRUE || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) &&
784 785 786 787 788 789 790 791
            MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for origin, target and result data. We should extend this optimization to derived
             * datatypes as well. */
            if (orig_data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
                op_ptr->piggyback_lock_candidate = 1;
        }
792

793
        /************** Setting packet struct areas in operation ****************/
794

795
        get_accum_pkt = &(op_ptr->pkt.get_accum);
796

797 798 799 800 801 802
        if (use_immed_pkt) {
            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM_IMMED);
        }
        else {
            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM);
        }
803

804 805 806 807 808 809 810 811 812 813 814 815 816
        get_accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        get_accum_pkt->count = target_count;
        get_accum_pkt->datatype = target_datatype;
        get_accum_pkt->info.dataloop_size = 0;
        get_accum_pkt->op = op;
        get_accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        get_accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
        if (use_immed_pkt) {
            void *src = (void *) origin_addr, *dest = (void *) (get_accum_pkt->info.data);
            mpi_errno = immed_copy(src, dest, orig_data_sz);
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
817 818
        }

Xin Zhao's avatar
Xin Zhao committed
819 820
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

821
        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
822 823 824
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);

825
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
826 827
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
828

829 830 831 832 833 834 835 836 837 838 839 840 841
        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                int local_completed = 0, remote_completed = 0;
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
842 843
    }

844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934