/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidrma.h"

/* When nonzero, MPIDI_Accumulate queues the simplified
 * MPIDI_RMA_ACC_CONTIG element for operations whose origin and target
 * datatypes are both predefined. */
static int enableShortACC = 1;

/* Tag values for passive-target RMA traffic.
 * NOTE(review): neither tag is referenced in this part of the file;
 * presumably they are used by the passive-target synchronization code --
 * confirm before changing. */
#define MPIDI_PASSIVE_TARGET_DONE_TAG  348297
#define MPIDI_PASSIVE_TARGET_RMA_TAG 563924

/*
 * TODO:
 * Explore use of alternate allocation mechanisms for the RMA queue elements
 * (because profiling has shown that queue element allocation/deallocation
 * can take a significant amount of time in the RMA operations).
 *    1: Current approach (uses perm memory malloc/free)
 *    2: Preallocate and maintain list (use perm memory malloc, but
 *       free onto window; use first; free on window free)
 *    3: Preallocate and maintain list (use separate memory, but free to
 *       thread/process; free in Finalize handler.  Option to use for
 *       single-threaded to avoid thread overheads)
 * Possible interface:
 *    int MPIDI_RMAListAlloc(MPIDI_RMA_Op_t **a, MPID_Win *win)
 *    int MPIDI_RMAListFree(MPIDI_RMA_Op_t *a, MPID_Win *win)
 *    The return value is an error code (e.g., allocation failure).
 */

#undef FUNCNAME
#define FUNCNAME MPIDI_Win_free
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
35
int MPIDI_Win_free(MPID_Win ** win_ptr)
36
{
37
    int mpi_errno = MPI_SUCCESS;
38
    int in_use;
39
    MPID_Comm *comm_ptr;
40
    int errflag = FALSE;
41
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FREE);
42

43
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FREE);
James Dinan's avatar
James Dinan committed
44 45 46 47

    MPIU_ERR_CHKANDJUMP((*win_ptr)->epoch_state != MPIDI_EPOCH_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

48 49 50 51 52 53 54
    if (!(*win_ptr)->shm_allocated) {
        /* when SHM is allocated, we already did a global barrier in
           MPIDI_CH3_SHM_Win_free, so we do not need to do it again here. */
        mpi_errno = MPIR_Barrier_impl((*win_ptr)->comm_ptr, &errflag);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
    }
55

56
    comm_ptr = (*win_ptr)->comm_ptr;
57
    mpi_errno = MPIR_Comm_free_impl(comm_ptr);
58 59
    if (mpi_errno)
        MPIU_ERR_POP(mpi_errno);
60

61
    MPIU_Free((*win_ptr)->targets);
62
    MPIU_Free((*win_ptr)->base_addrs);
63
    MPIU_Free((*win_ptr)->sizes);
64 65
    MPIU_Free((*win_ptr)->disp_units);
    MPIU_Free((*win_ptr)->all_win_handles);
66

67
    /* Free the attached buffer for windows created with MPI_Win_allocate() */
68 69
    if ((*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE ||
        (*win_ptr)->create_flavor == MPI_WIN_FLAVOR_SHARED) {
70
        if ((*win_ptr)->shm_allocated == FALSE && (*win_ptr)->size > 0) {
71
            MPIU_Free((*win_ptr)->base);
72
        }
73 74
    }

75 76 77
    MPIU_Object_release_ref(*win_ptr, &in_use);
    /* MPI windows don't have reference count semantics, so this should always be true */
    MPIU_Assert(!in_use);
78
    MPIU_Handle_obj_free(&MPID_Win_mem, *win_ptr);
79

80
  fn_exit:
81 82
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FREE);
    return mpi_errno;
83

84
  fn_fail:
85 86
    goto fn_exit;
}
87 88


89 90 91 92
#undef FUNCNAME
#define FUNCNAME MPIDI_Win_shared_query
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
93
int MPIDI_Win_shared_query(MPID_Win * win_ptr, int target_rank, MPI_Aint * size,
94 95 96 97 98 99 100
                           int *disp_unit, void *baseptr)
{
    int mpi_errno = MPI_SUCCESS;

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_SHARED_QUERY);
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_SHARED_QUERY);

101 102 103
    *(void **) baseptr = win_ptr->base;
    *size = win_ptr->size;
    *disp_unit = win_ptr->disp_unit;
104

105
  fn_exit:
106 107 108
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_SHARED_QUERY);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
109
  fn_fail:
110 111 112 113 114
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


115 116 117 118
#undef FUNCNAME
#define FUNCNAME MPIDI_Put
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
119
int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
120 121
              origin_datatype, int target_rank, MPI_Aint target_disp,
              int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr)
122 123
{
    int mpi_errno = MPI_SUCCESS;
124
    int dt_contig ATTRIBUTE((unused)), rank;
125
    MPID_Datatype *dtp;
Pavan Balaji's avatar
Pavan Balaji committed
126
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
127
    MPIDI_msg_sz_t data_sz;
128
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
129
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_PUT);
130

131 132
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_PUT);

133 134 135 136
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

137
    if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) {
James Dinan's avatar
James Dinan committed
138 139 140 141 142 143
        win_ptr->epoch_state = MPIDI_EPOCH_FENCE;
    }

    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

144 145
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

146
    if (data_sz == 0) {
147
        goto fn_exit;
148 149
    }

150
    rank = win_ptr->comm_ptr->rank;
151 152 153

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
154
        /* check if target is local and shared memory is allocated on window,
155
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
156 157

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
158 159 160 161 162
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
163 164 165 166
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

167
    /* If the put is a local operation, do it here */
Xin Zhao's avatar
Xin Zhao committed
168
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
169
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
170 171
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
172 173
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
174
    }
175
    else {
176
        MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
177
        MPIDI_RMA_Op_t *new_ptr = NULL;
178

179
        /* queue it up */
180
        mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(ops_list, &new_ptr);
181 182 183
        if (mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }
184

185 186
        /* FIXME: For contig and very short operations, use a streamlined op */
        new_ptr->type = MPIDI_RMA_PUT;
187 188 189
        /* Cast away const'ness for the origin address, as the
         * MPIDI_RMA_Op_t structure is used for both PUT and GET like
         * operations */
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;
        new_ptr->target_disp = target_disp;
        new_ptr->target_count = target_count;
        new_ptr->target_datatype = target_datatype;

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
208 209 210
    }

  fn_exit:
211
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_PUT);
212 213 214 215 216 217 218 219 220 221 222 223 224 225 226
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}



#undef FUNCNAME
#define FUNCNAME MPIDI_Get
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * MPIDI_Get - start a get of target window data into an origin buffer.
 *
 * The call is a no-op for MPI_PROC_NULL targets and for zero-byte origin
 * buffers.  It fails with MPI_ERR_RMA_SYNC when no epoch is open (a
 * previously issued fence implicitly opens a fence epoch).
 *
 * When the target is the calling process, the window has the shared flavor,
 * or the window has an allocated SHM region and origin and target VCs report
 * the same node_id, the data is moved immediately through
 * MPIDI_CH3I_Shm_get_op.  Otherwise an MPIDI_RMA_GET element is appended to
 * the operation list for the target, and a reference is added to each
 * non-predefined datatype the element names.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
              origin_datatype, int target_rank, MPI_Aint target_disp,
              int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int dt_contig ATTRIBUTE((unused)), rank;
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_GET);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_GET);

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    /* A fence was issued earlier: this operation opens the fence epoch. */
    if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) {
        win_ptr->epoch_state = MPIDI_EPOCH_FENCE;
    }

    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they are
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* If the get is a local operation, do it here.  The VC pointers are only
     * dereferenced when shm_allocated is TRUE and the first two tests fail,
     * which is exactly the case in which they were looked up above. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
    }
    else {
        MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
        MPIDI_RMA_Op_t *new_ptr = NULL;

        /* queue it up */
        mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(ops_list, &new_ptr);
        if (mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }

        /* FIXME: For contig and very short operations, use a streamlined op */
        new_ptr->type = MPIDI_RMA_GET;
        new_ptr->origin_addr = origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;
        new_ptr->target_disp = target_disp;
        new_ptr->target_count = target_count;
        new_ptr->target_datatype = target_datatype;

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_GET);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}



#undef FUNCNAME
#define FUNCNAME MPIDI_Accumulate
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
330
int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
331 332
                     origin_datatype, int target_rank, MPI_Aint target_disp,
                     int target_count, MPI_Datatype target_datatype, MPI_Op op, MPID_Win * win_ptr)
333
{
334
    int mpi_errno = MPI_SUCCESS;
335
    MPIDI_msg_sz_t data_sz;
336
    int dt_contig ATTRIBUTE((unused)), rank;
Pavan Balaji's avatar
Pavan Balaji committed
337
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
338
    MPID_Datatype *dtp;
339
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
340
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_ACCUMULATE);
341

342 343
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_ACCUMULATE);

344 345 346 347
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

348
    if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) {
James Dinan's avatar
James Dinan committed
349 350 351 352 353 354
        win_ptr->epoch_state = MPIDI_EPOCH_FENCE;
    }

    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

355 356
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

357
    if (data_sz == 0) {
358
        goto fn_exit;
359
    }
360

361
    rank = win_ptr->comm_ptr->rank;
362 363 364

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
Xin Zhao's avatar
Xin Zhao committed
365
        /* check if target is local and shared memory is allocated on window,
366
         * if so, we directly perform this operation on shared memory region. */
Xin Zhao's avatar
Xin Zhao committed
367 368

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
369 370 371 372 373
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
Xin Zhao's avatar
Xin Zhao committed
374 375 376 377
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

378
    /* Do =! rank first (most likely branch?) */
Xin Zhao's avatar
Xin Zhao committed
379
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
380 381 382 383 384 385
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count, target_datatype,
                                          op, win_ptr);
        if (mpi_errno)
            MPIU_ERR_POP(mpi_errno);
386
    }
387
    else {
388
        MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
389
        MPIDI_RMA_Op_t *new_ptr = NULL;
390

391
        /* queue it up */
392
        mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(ops_list, &new_ptr);
393 394 395
        if (mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }
396

397 398
        /* If predefined and contiguous, use a simplified element */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
399
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && enableShortACC) {
400 401
            new_ptr->type = MPIDI_RMA_ACC_CONTIG;
            /* Only the information needed for the contig/predefined acc */
402 403
            /* Cast away const'ness for origin_address as
             * MPIDI_RMA_Op_t contain both PUT and GET like ops */
404 405 406 407 408 409 410 411 412 413 414 415
            new_ptr->origin_addr = (void *) origin_addr;
            new_ptr->origin_count = origin_count;
            new_ptr->origin_datatype = origin_datatype;
            new_ptr->target_rank = target_rank;
            new_ptr->target_disp = target_disp;
            new_ptr->target_count = target_count;
            new_ptr->target_datatype = target_datatype;
            new_ptr->op = op;
            goto fn_exit;
        }

        new_ptr->type = MPIDI_RMA_ACCUMULATE;
416 417
        /* Cast away const'ness for origin_address as MPIDI_RMA_Op_t
         * contain both PUT and GET like ops */
418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;
        new_ptr->target_disp = target_disp;
        new_ptr->target_count = target_count;
        new_ptr->target_datatype = target_datatype;
        new_ptr->op = op;

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
437 438
    }

439
  fn_exit:
440 441 442 443 444 445 446 447 448 449 450 451 452 453
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_ACCUMULATE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_Alloc_mem
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
454
void *MPIDI_Alloc_mem(size_t size, MPID_Info * info_ptr)
455 456 457 458 459 460 461
{
    void *ap;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_ALLOC_MEM);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_ALLOC_MEM);

    ap = MPIU_Malloc(size);
462

463 464 465 466 467 468 469 470 471
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_ALLOC_MEM);
    return ap;
}


#undef FUNCNAME
#define FUNCNAME MPIDI_Free_mem
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
472
int MPIDI_Free_mem(void *ptr)
473 474 475 476 477 478 479
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_FREE_MEM);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_FREE_MEM);

    MPIU_Free(ptr);
480

481 482 483
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_FREE_MEM);
    return mpi_errno;
}