ch3u_rma_sync.c 66.1 KB
Newer Older
1
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
2
3
4
5
6
7
8
9
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidimpl.h"
#include "mpidrma.h"

10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
/* Notes for memory barriers in RMA synchronizations

   When SHM is allocated for RMA window, we need to add memory barriers at proper
   places in RMA synchronization routines to guarantee the ordering of read/write
   operations, so that any operations after synchronization calls will see the
   correct data.

   There are four kinds of operations involved in the following explanation:

   1. Local loads/stores: any operations happening outside RMA epoch and accessing
      each process's own window memory.

   2. SHM operations: any operations happening inside RMA epoch. They may access
      any processes' window memory, which include direct loads/stores, and
      RMA operations that are internally implemented as direct loads/stores in
      MPI implementation.

   3. PROC_SYNC: synchronizations among processes by sending/receiving messages.

   4. MEM_SYNC: a full memory barrier. It ensures the ordering of read/write
      operations on each process.

   (1) FENCE synchronization

              RANK 0                           RANK 1

       (local loads/stores)             (local loads/stores)

           WIN_FENCE {                    WIN_FENCE {
               MEM_SYNC                       MEM_SYNC
               PROC_SYNC -------------------- PROC_SYNC
               MEM_SYNC                       MEM_SYNC
           }                              }

        (SHM operations)                  (SHM operations)

           WIN_FENCE {                     WIN_FENCE {
               MEM_SYNC                        MEM_SYNC
               PROC_SYNC --------------------- PROC_SYNC
               MEM_SYNC                        MEM_SYNC
           }                               }

      (local loads/stores)              (local loads/stores)

       We need MEM_SYNC before and after PROC_SYNC for both starting WIN_FENCE
       and ending WIN_FENCE, to ensure the ordering between local loads/stores
       and PROC_SYNC in starting WIN_FENCE (and vice versa in ending WIN_FENCE),
       and the ordering between PROC_SYNC and SHM operations in starting WIN_FENCE
       (and vice versa for ending WIN_FENCE).

       In starting WIN_FENCE, the MEM_SYNC before PROC_SYNC essentially exposes
       previous local loads/stores to other processes; after PROC_SYNC, each
       process knows that everyone else already exposed their local loads/stores;
       the MEM_SYNC after PROC_SYNC ensures that my following SHM operations will
       happen after PROC_SYNC and will see the latest data on other processes.

       In ending WIN_FENCE, the MEM_SYNC before PROC_SYNC essentially exposes
       previous SHM operations to other processes; after PROC_SYNC, each process
       knows everyone else already exposed their SHM operations; the MEM_SYNC
       after PROC_SYNC ensures that my following local loads/stores will happen
       after PROC_SYNC and will see the latest data in my memory region.

   (2) POST-START-COMPLETE-WAIT synchronization

              RANK 0                           RANK 1

                                          (local loads/stores)

           WIN_START {                      WIN_POST {
                                                MEM_SYNC
               PROC_SYNC ---------------------- PROC_SYNC
               MEM_SYNC
           }                                }

         (SHM operations)

           WIN_COMPLETE {                  WIN_WAIT/TEST {
               MEM_SYNC
               PROC_SYNC --------------------- PROC_SYNC
                                               MEM_SYNC
           }                               }

                                          (local loads/stores)

       We need MEM_SYNC before PROC_SYNC for WIN_POST and WIN_COMPLETE, and
       MEM_SYNC after PROC_SYNC in WIN_START and WIN_WAIT/TEST, to ensure the
       ordering between local loads/stores and PROC_SYNC in WIN_POST (and
       vice versa in WIN_WAIT/TEST), and the ordering between PROC_SYNC and SHM
       operations in WIN_START (and vice versa in WIN_COMPLETE).

       In WIN_POST, the MEM_SYNC before PROC_SYNC essentially exposes previous
       local loads/stores to the group of origin processes; after PROC_SYNC, the
       origin processes know that all target processes have already exposed their
       local loads/stores; in WIN_START, the MEM_SYNC after PROC_SYNC ensures that
       following SHM operations will happen after PROC_SYNC and will see the
       latest data on target processes.

       In WIN_COMPLETE, the MEM_SYNC before PROC_SYNC essentially exposes previous
       SHM operations to the group of target processes; after PROC_SYNC, the target
       processes know that all origin processes have already exposed their SHM
       operations; in WIN_WAIT/TEST, the MEM_SYNC after PROC_SYNC ensures that
       following local loads/stores will happen after PROC_SYNC and will see the
       latest data in my memory region.

   (3) Passive target synchronization

              RANK 0                          RANK 1

                                        WIN_LOCK(target=1) {
                                            PROC_SYNC (lock granted)
                                            MEM_SYNC
                                        }

                                        (SHM operations)

                                        WIN_UNLOCK(target=1) {
                                            MEM_SYNC
                                            PROC_SYNC (lock released)
                                        }

         PROC_SYNC -------------------- PROC_SYNC

         WIN_LOCK (target=1) {
             PROC_SYNC (lock granted)
             MEM_SYNC
         }

         (SHM operations)

         WIN_UNLOCK (target=1) {
             MEM_SYNC
             PROC_SYNC (lock released)
         }

         PROC_SYNC -------------------- PROC_SYNC

                                        WIN_LOCK(target=1) {
                                            PROC_SYNC (lock granted)
                                            MEM_SYNC
                                        }

                                        (SHM operations)

                                        WIN_UNLOCK(target=1) {
                                            MEM_SYNC
                                            PROC_SYNC (lock released)
                                        }

         We need MEM_SYNC after PROC_SYNC in WIN_LOCK, and MEM_SYNC before
         PROC_SYNC in WIN_UNLOCK, to ensure the ordering between SHM operations
         and PROC_SYNC and vice versa.

         In WIN_LOCK, the MEM_SYNC after PROC_SYNC guarantees two things:
         (a) it guarantees that following SHM operations will happen after
         lock is granted; (b) it guarantees that following SHM operations
         will happen after any PROC_SYNC with target before WIN_LOCK is called,
         which means those SHM operations will see the latest data on target
         process.

         In WIN_UNLOCK, the MEM_SYNC before PROC_SYNC also guarantees two
         things: (a) it guarantees that SHM operations will happen before
         lock is released; (b) it guarantees that SHM operations will happen
         before any PROC_SYNC with target after WIN_UNLOCK is returned, which
         means following SHM operations on that target will see the latest data.

         WIN_LOCK_ALL/UNLOCK_ALL behave the same as WIN_LOCK/UNLOCK.

              RANK 0                          RANK 1

         WIN_LOCK_ALL

         (SHM operations)

         WIN_FLUSH(target=1) {
             MEM_SYNC
         }

         PROC_SYNC ------------------------PROC_SYNC

                                           WIN_LOCK(target=1) {
                                               PROC_SYNC (lock granted)
                                               MEM_SYNC
                                           }

                                           (SHM operations)

                                           WIN_UNLOCK(target=1) {
                                               MEM_SYNC
                                               PROC_SYNC (lock released)
                                           }

         WIN_UNLOCK_ALL

         We need MEM_SYNC in WIN_FLUSH to ensure the ordering between SHM
         operations and PROC_SYNC.

         The MEM_SYNC in WIN_FLUSH guarantees that all SHM operations before
         this WIN_FLUSH will happen before any PROC_SYNC with target after
         this WIN_FLUSH, which means SHM operations on target process after
         PROC_SYNC with origin will see the latest data.
*/

212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_CH3_RMA_SCALABLE_FENCE_PROCESS_NUM
      category    : CH3
      type        : int
      default     : 1024
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
          Specify the threshold of switching the algorithm used in
          FENCE from the basic algorithm to the scalable algorithm.
          The value can be negative, zero or positive.
          When the number of processes is larger than or equal to
          this value, FENCE will use a scalable algorithm which does
          not use an O(P) data structure; when the number of processes
          is smaller than the value, FENCE will use a basic but fast
          algorithm which requires an O(P) data structure.

233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
    - name        : MPIR_CVAR_CH3_RMA_DELAY_ISSUING_FOR_PIGGYBACKING
      category    : CH3
      type        : int
      default     : 0
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
        Specify if delay issuing of RMA operations for piggybacking
        LOCK/UNLOCK/FLUSH is enabled. It can be either 0 or 1. When
        it is set to 1, the issuing of LOCK message is delayed until
        origin process sees the first RMA operation and piggybacks
        LOCK with that operation, and the origin process always keeps
        the current last operation until the ending synchronization
        call in order to piggyback UNLOCK/FLUSH with that operation.
        When it is set to 0, in WIN_LOCK/UNLOCK case, the LOCK message
        is sent out as early as possible, in WIN_LOCK_ALL/UNLOCK_ALL
        case, the origin process still tries to piggyback LOCK message
        with the first operation; for UNLOCK/FLUSH message, the origin
        process no longer keeps the current last operation but only
        piggybacks UNLOCK/FLUSH if there is an operation available in
        the ending synchronization call.

256
257
258
=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

Xin Zhao's avatar
Xin Zhao committed
259
260
261
262
263
264
265
266
/* Timer performance variables (PVARs) of the RMA category that are declared
 * here.  Each of the five names below is registered with a human-readable
 * description in MPIDI_CH3_RMA_Init_sync_pvars(). */
MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_lockqueue_alloc);
MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_winlock_getlocallock);
MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_wincreate_allgather);

MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmaqueue_alloc);
MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmaqueue_set);

void MPIDI_CH3_RMA_Init_sync_pvars(void)
267
{
Xin Zhao's avatar
Xin Zhao committed
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
    /* rma_lockqueue_alloc */
    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
                                      MPI_DOUBLE,
                                      rma_lockqueue_alloc,
                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
                                      MPI_T_BIND_NO_OBJECT,
                                      MPIR_T_PVAR_FLAG_READONLY,
                                      "RMA", "Allocate Lock Queue element (in seconds)");

    /* rma_winlock_getlocallock */
    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
                                      MPI_DOUBLE,
                                      rma_winlock_getlocallock,
                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
                                      MPI_T_BIND_NO_OBJECT,
                                      MPIR_T_PVAR_FLAG_READONLY,
                                      "RMA", "WIN_LOCK:Get local lock (in seconds)");

    /* rma_wincreate_allgather */
    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
                                      MPI_DOUBLE,
                                      rma_wincreate_allgather,
                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
                                      MPI_T_BIND_NO_OBJECT,
                                      MPIR_T_PVAR_FLAG_READONLY,
                                      "RMA", "WIN_CREATE:Allgather (in seconds)");

    /* rma_rmaqueue_alloc */
    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
                                      MPI_DOUBLE,
                                      rma_rmaqueue_alloc,
                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
                                      MPI_T_BIND_NO_OBJECT,
                                      MPIR_T_PVAR_FLAG_READONLY,
                                      "RMA", "Allocate RMA Queue element (in seconds)");

    /* rma_rmaqueue_set */
    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
                                      MPI_DOUBLE,
                                      rma_rmaqueue_set,
                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
                                      MPI_T_BIND_NO_OBJECT,
                                      MPIR_T_PVAR_FLAG_READONLY,
                                      "RMA", "Set fields in RMA Queue element (in seconds)");
312
}
313

314
315
/* These are used to use a common routine to complete lists of RMA
   operations with a single routine, while collecting data that
   distinguishes between different synchronization modes.  This is not
   thread-safe; the best choice for thread-safety is to eliminate this
   ability to discriminate between the different types of RMA synchronization.
*/
320

321
322
323
/*
 * These routines provide a default implementation of the MPI RMA operations
 * in terms of the low-level, two-sided channel operations.  A channel
 * may override these functions, on a per-window basis, by overriding
 * the MPID functions in the RMAFns section of MPID_Win object.
 */

328
329
/* Integer tag constant (100).  NOTE(review): presumably the message tag used
 * for the WIN_POST synchronization messages -- its users are not visible in
 * this part of the file; confirm before changing the value. */
#define SYNC_POST_TAG 100

330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
/*
 * flush_local_all - complete the outstanding RMA operations of every target
 * currently attached to a window.
 *
 * For each target found in win_ptr->slots[]:
 *   - if sync.upgrade_flush_local is set, sync_flag is raised to at least
 *     MPIDI_RMA_SYNC_FLUSH and the routine waits until the target reports
 *     remote completion;
 *   - otherwise sync_flag is raised to at least MPIDI_RMA_SYNC_FLUSH_LOCAL
 *     and the routine waits until the target reports local completion.
 *
 * win_ptr: window whose targets are flushed.
 *
 * Returns mpi_errno: MPI_SUCCESS unless one of the called routines reported
 * an error.
 */
#undef FUNCNAME
#define FUNCNAME flush_local_all
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static inline int flush_local_all(MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int slot_idx;
    int made_progress = 0;
    int local_completed = 0, remote_completed = 0;
    int want_remote = 0, want_local = 0;        /* targets that must complete */
    int have_remote = 0, have_local = 0;        /* targets seen complete in one sweep */
    MPIDI_RMA_Target_t *target = NULL;
    MPIDI_STATE_DECL(MPID_STATE_FLUSH_LOCAL_ALL);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_FLUSH_LOCAL_ALL);

    /* Pass 1: raise each target's sync_flag (never lower it) and count how
     * many targets need remote vs. local completion. */
    for (slot_idx = 0; slot_idx < win_ptr->num_slots; slot_idx++) {
        for (target = win_ptr->slots[slot_idx].target_list_head;
             target != NULL; target = target->next) {
            if (!target->sync.upgrade_flush_local) {
                if (target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH_LOCAL) {
                    target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH_LOCAL;
                }
                want_local++;
            }
            else {
                if (target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
                    target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
                }
                want_remote++;
            }
        }
    }

    /* Push out everything that is queued on the window. */
    mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
    if (mpi_errno != MPI_SUCCESS) {
        MPIU_ERR_POP(mpi_errno);
    }

    /* Pass 2: sweep all targets repeatedly; leave once every target has
     * reached the kind of completion it was counted for, otherwise poke the
     * progress engine and sweep again. */
    for (;;) {
        have_local = 0;
        have_remote = 0;

        for (slot_idx = 0; slot_idx < win_ptr->num_slots; slot_idx++) {
            for (target = win_ptr->slots[slot_idx].target_list_head;
                 target != NULL; target = target->next) {
                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, target);
                if (mpi_errno != MPI_SUCCESS) {
                    MPIU_ERR_POP(mpi_errno);
                }

                MPIDI_CH3I_RMA_ops_completion(win_ptr, target, local_completed,
                                              remote_completed);

                if (target->sync.upgrade_flush_local) {
                    have_remote += remote_completed ? 1 : 0;
                }
                else {
                    have_local += local_completed ? 1 : 0;
                }
            }
        }

        if (have_remote >= want_remote && have_local >= want_local) {
            break;
        }

        mpi_errno = wait_progress_engine();
        if (mpi_errno != MPI_SUCCESS) {
            MPIU_ERR_POP(mpi_errno);
        }
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_FLUSH_LOCAL_ALL);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

/*
 * flush_all - wait for remote completion of the outstanding RMA operations
 * of every target currently attached to a window.
 *
 * Every target's sync_flag is raised to at least MPIDI_RMA_SYNC_FLUSH, all
 * queued operations are issued, and the routine then alternates between
 * cleaning up operations on the window and driving the progress engine until
 * MPIDI_CH3I_RMA_Cleanup_ops_win() reports remote completion.
 *
 * win_ptr: window whose targets are flushed.
 *
 * Returns mpi_errno: MPI_SUCCESS unless one of the called routines reported
 * an error.
 */
#undef FUNCNAME
#define FUNCNAME flush_all
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static inline int flush_all(MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int slot_idx;
    int made_progress = 0;
    int local_completed = 0, remote_completed = 0;
    MPIDI_RMA_Target_t *target = NULL;
    MPIDI_STATE_DECL(MPID_STATE_FLUSH_ALL);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_FLUSH_ALL);

    /* Request (at least) a FLUSH on every target; a stronger flag that is
     * already set is left untouched. */
    for (slot_idx = 0; slot_idx < win_ptr->num_slots; slot_idx++) {
        for (target = win_ptr->slots[slot_idx].target_list_head;
             target != NULL; target = target->next) {
            if (target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
                target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
            }
        }
    }

    /* Push out everything that is queued on the window. */
    mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
    if (mpi_errno != MPI_SUCCESS) {
        MPIU_ERR_POP(mpi_errno);
    }

    /* Clean up, and keep driving the progress engine until the whole window
     * is remotely complete. */
    for (;;) {
        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed, &remote_completed);
        if (mpi_errno != MPI_SUCCESS) {
            MPIU_ERR_POP(mpi_errno);
        }

        if (remote_completed) {
            break;
        }

        mpi_errno = wait_progress_engine();
        if (mpi_errno != MPI_SUCCESS) {
            MPIU_ERR_POP(mpi_errno);
        }
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_FLUSH_ALL);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
469

470
471
472
473
474
/********************************************************************************/
/* Active Target synchronization (including WIN_FENCE, WIN_POST, WIN_START,     */
/* WIN_COMPLETE, WIN_WAIT, WIN_TEST)                                            */
/********************************************************************************/

475
476
477
478
#undef FUNCNAME
#define FUNCNAME MPIDI_Win_fence
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
479
int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
480
{
481
    int i;
482
    MPIDI_RMA_Target_t *curr_target = NULL;
Wesley Bland's avatar
Wesley Bland committed
483
    mpir_errflag_t errflag = MPIR_ERR_NONE;
484
485
486
    int comm_size = win_ptr->comm_ptr->local_size;
    int scalable_fence_enabled = 0;
    int *rma_target_marks = NULL;
487
    int mpi_errno = MPI_SUCCESS;
488
    MPIU_CHKLMEM_DECL(1);
489
490
491
492
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FENCE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FENCE);

493
494
495
496
    MPIU_ERR_CHKANDJUMP((win_ptr->states.access_state != MPIDI_RMA_NONE &&
                         win_ptr->states.access_state != MPIDI_RMA_FENCE_ISSUED &&
                         win_ptr->states.access_state != MPIDI_RMA_FENCE_GRANTED) ||
                        win_ptr->states.exposure_state != MPIDI_RMA_NONE,
James Dinan's avatar
James Dinan committed
497
498
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

499
500
501
502
503
    /* Judge if we should switch to scalable FENCE algorithm */
    if (comm_size >= MPIR_CVAR_CH3_RMA_SCALABLE_FENCE_PROCESS_NUM) {
        scalable_fence_enabled = 1;
    }

504
505
506
507
508
    /* Ensure ordering of load/store operations. */
    if (win_ptr->shm_allocated == TRUE) {
        OPA_read_write_barrier();
    }

509
510
    if (assert & MPI_MODE_NOPRECEDE) {
        if (assert & MPI_MODE_NOSUCCEED) {
511
            goto finish_fence;
512
        }
513
514
        else {
            /* It is possible that there is a IBARRIER in MPI_WIN_FENCE with
515
516
             * MODE_NOPRECEDE not being completed, we let the progress engine
             * to delete its request when it is completed. */
517
518
519
520
521
522
            if (win_ptr->fence_sync_req != MPI_REQUEST_NULL) {
                MPID_Request *req_ptr;
                MPID_Request_get_ptr(win_ptr->fence_sync_req, req_ptr);
                MPID_Request_release(req_ptr);
                win_ptr->fence_sync_req = MPI_REQUEST_NULL;
                win_ptr->states.access_state = MPIDI_RMA_NONE;
523
524
                MPIDI_CH3I_num_active_issued_win--;
                MPIU_Assert(MPIDI_CH3I_num_active_issued_win >= 0);
525
            }
526

527
528
            if (win_ptr->shm_allocated == TRUE) {
                MPID_Comm *node_comm_ptr = win_ptr->comm_ptr->node_comm;
529

530
                mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
531
532
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
533
                MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
534
535
            }

536
            mpi_errno = MPIR_Ibarrier_impl(win_ptr->comm_ptr, &(win_ptr->fence_sync_req));
537
538
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
539

540
            /* Set window access state properly. */
541
            win_ptr->states.access_state = MPIDI_RMA_FENCE_ISSUED;
542
            MPIDI_CH3I_num_active_issued_win++;
543

544
            goto finish_fence;
545
        }
546
    }
547

548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
    /* Perform basic algorithm by calling reduce-scatter */
    if (!scalable_fence_enabled) {
        /* If the IBARRIER is not completed, do not need to wait for
         * it since we are going to call reduce-scatter */
        if (win_ptr->fence_sync_req != MPI_REQUEST_NULL) {
            MPID_Request *req_ptr;
            MPID_Request_get_ptr(win_ptr->fence_sync_req, req_ptr);
            MPID_Request_release(req_ptr);
            win_ptr->fence_sync_req = MPI_REQUEST_NULL;
            MPIDI_CH3I_num_active_issued_win--;
            MPIU_Assert(MPIDI_CH3I_num_active_issued_win >= 0);

            win_ptr->states.access_state = MPIDI_RMA_NONE;
        }
        MPIU_CHKLMEM_MALLOC(rma_target_marks, int *, comm_size * sizeof(int),
                            mpi_errno, "rma_target_marks");
        for (i = 0; i < comm_size; i++)
            rma_target_marks[i] = 0;

        for (i = 0; i < win_ptr->num_slots; i++) {
            curr_target = win_ptr->slots[i].target_list_head;
            while (curr_target != NULL) {
                rma_target_marks[curr_target->target_rank] = 1;
                curr_target = curr_target->next;
            }
        }

        win_ptr->at_completion_counter += comm_size;

        mpi_errno = MPIR_Reduce_scatter_block_impl(MPI_IN_PLACE, rma_target_marks, 1,
                                                   MPI_INT, MPI_SUM, win_ptr->comm_ptr, &errflag);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);

        MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
583

584
585
586
587
588
589
590
591
592
593
594
        win_ptr->at_completion_counter -= comm_size;
        win_ptr->at_completion_counter += rma_target_marks[0];
        MPIU_Assert(win_ptr->at_completion_counter >= 0);

        win_ptr->states.access_state = MPIDI_RMA_FENCE_GRANTED;
    }

    if (!scalable_fence_enabled) {
        for (i = 0; i < win_ptr->num_slots; i++) {
            curr_target = win_ptr->slots[i].target_list_head;
            while (curr_target != NULL) {
595
596
597
                /* flag is set in order to decrement complete counter on target */
                curr_target->win_complete_flag = 1;

598
599
600
                curr_target = curr_target->next;
            }
        }
601
602
603
604

        mpi_errno = flush_local_all(win_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
605
606
    }
    else {
607
        mpi_errno = flush_all(win_ptr);
608
609
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
610
    }
611

612
613
    /* Cleanup all targets on window. */
    mpi_errno = MPIDI_CH3I_RMA_Cleanup_targets_win(win_ptr);
614
615
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);
James Dinan's avatar
James Dinan committed
616

617
618
619
620
621
    if (scalable_fence_enabled) {
        mpi_errno = MPIR_Barrier_impl(win_ptr->comm_ptr, &errflag);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
        MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
622

623
624
625
626
627
628
629
        /* Set window access state properly. */
        if (assert & MPI_MODE_NOSUCCEED) {
            win_ptr->states.access_state = MPIDI_RMA_NONE;
        }
        else {
            win_ptr->states.access_state = MPIDI_RMA_FENCE_GRANTED;
        }
630
631
    }
    else {
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
        /* Waiting for all operations targeting at me to be finished. */
        while (win_ptr->at_completion_counter) {
            mpi_errno = wait_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }

        if (assert & MPI_MODE_NOSUCCEED) {
            win_ptr->states.access_state = MPIDI_RMA_NONE;
        }
        else {
            /* Prepare for the next possible epoch */
            mpi_errno = MPIR_Ibarrier_impl(win_ptr->comm_ptr, &(win_ptr->fence_sync_req));
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
            MPIDI_CH3I_num_active_issued_win++;
            win_ptr->states.access_state = MPIDI_RMA_FENCE_ISSUED;

            if (win_ptr->shm_allocated == TRUE) {
                MPID_Comm *node_comm_ptr = win_ptr->comm_ptr->node_comm;
                mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
            }
        }
658
659
    }

660
  finish_fence:
661
662
663
    /* Make sure that all targets are freed. */
    MPIU_Assert(win_ptr->non_empty_slots == 0);

664
665
    MPIU_Assert(win_ptr->active_req_cnt == 0);

666
667
668
669
670
    /* Ensure ordering of load/store operations. */
    if (win_ptr->shm_allocated == TRUE) {
        OPA_read_write_barrier();
    }

671
  fn_exit:
672
    MPIU_CHKLMEM_FREEALL();
673
674
675
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FENCE);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
676
  fn_fail:
677
678
679
680
681
682
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
683
#define FUNCNAME MPIDI_Win_post
684
685
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
686
int MPIDI_Win_post(MPID_Group * post_grp_ptr, int assert, MPID_Win * win_ptr)
687
{
688
    int *post_ranks_in_win_grp;
689
    int mpi_errno = MPI_SUCCESS;
690
    MPIU_CHKLMEM_DECL(3);
691
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_POST);
692

693
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_POST);
694

695
    /* Note that here we cannot distinguish if this exposure epoch is overlapped
696
697
698
     * with an exposure epoch of FENCE (which is not allowed), since FENCE may be
     * ended up with not unsetting the window state. We can only detect if this
     * exposure epoch is overlapped with another exposure epoch of PSCW. */
699
    MPIU_ERR_CHKANDJUMP(win_ptr->states.exposure_state != MPIDI_RMA_NONE,
700
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
701

702
703
704
    /* Ensure ordering of load/store operations. */
    if (win_ptr->shm_allocated == TRUE) {
        OPA_read_write_barrier();
705
706
    }

707
    /* Set window exposure state properly. */
708
709
710
711
    win_ptr->states.exposure_state = MPIDI_RMA_PSCW_EXPO;

    win_ptr->at_completion_counter += post_grp_ptr->size;

712
713
714
    if ((assert & MPI_MODE_NOCHECK) == 0) {
        MPI_Request *req;
        MPI_Status *status;
715
716
        int i, post_grp_size, dst, rank;
        MPID_Comm *win_comm_ptr;
717

718
719
        /* NOCHECK not specified. We need to notify the source
         * processes that Post has been called. */
720

721
        post_grp_size = post_grp_ptr->size;
722
723
        win_comm_ptr = win_ptr->comm_ptr;
        rank = win_ptr->comm_ptr->rank;
724

725
726
727
        MPIU_CHKLMEM_MALLOC(post_ranks_in_win_grp, int *,
                            post_grp_size * sizeof(int), mpi_errno, "post_ranks_in_win_grp");
        mpi_errno = fill_ranks_in_win_grp(win_ptr, post_grp_ptr, post_ranks_in_win_grp);
728
729
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
730
731
732
733
734

        MPIU_CHKLMEM_MALLOC(req, MPI_Request *, post_grp_size * sizeof(MPI_Request),
                            mpi_errno, "req");
        MPIU_CHKLMEM_MALLOC(status, MPI_Status *, post_grp_size * sizeof(MPI_Status),
                            mpi_errno, "status");
735

736
737
        /* Send a 0-byte message to the source processes */
        for (i = 0; i < post_grp_size; i++) {
738
            dst = post_ranks_in_win_grp[i];
739

740
741
742
743
            if (dst != rank) {
                MPID_Request *req_ptr;
                mpi_errno = MPID_Isend(&i, 0, MPI_INT, dst, SYNC_POST_TAG, win_comm_ptr,
                                       MPID_CONTEXT_INTRA_PT2PT, &req_ptr);
744
745
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
746
747
748
749
750
                req[i] = req_ptr->handle;
            }
            else {
                req[i] = MPI_REQUEST_NULL;
            }
751
        }
752

753
754
755
756
757
758
759
760
761
762
763
764
        mpi_errno = MPIR_Waitall_impl(post_grp_size, req, status);
        if (mpi_errno && mpi_errno != MPI_ERR_IN_STATUS)
            MPIU_ERR_POP(mpi_errno);

        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno == MPI_ERR_IN_STATUS) {
            for (i = 0; i < post_grp_size; i++) {
                if (status[i].MPI_ERROR != MPI_SUCCESS) {
                    mpi_errno = status[i].MPI_ERROR;
                    MPIU_ERR_POP(mpi_errno);
                }
            }
765
        }
766
        /* --END ERROR HANDLING-- */
767
    }
768

769
  fn_exit:
770
771
    MPIU_CHKLMEM_FREEALL();
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_POST);
772
773
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
774
  fn_fail:
775
776
777
778
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

779

780
781
782
783
784
#undef FUNCNAME
#define FUNCNAME MPIDI_Win_start
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * MPIDI_Win_start - begin a PSCW access epoch on win_ptr towards the
 * processes in group_ptr.
 *
 * The translated ranks of the start group are stored in
 * win_ptr->start_ranks_in_win_grp (released later by MPIDI_Win_complete).
 * Unless MPI_MODE_NOCHECK is given in 'assert', one zero-byte receive with
 * SYNC_POST_TAG is posted per remote group member.  When the window uses
 * shared memory, receives from members whose VC has the same node_id as
 * the caller are waited for here; all other receive handles are kept in
 * win_ptr->start_req.
 *
 * Returns MPI_SUCCESS, or an MPI error code (MPI_ERR_RMA_SYNC if a PSCW or
 * passive-target access epoch is already active).  On failure, memory that
 * this call attached to the window is released and the corresponding
 * window fields are reset so that no dangling pointers remain.
 */
int MPIDI_Win_start(MPID_Group * group_ptr, int assert, MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    /* Track which window-owned buffers were allocated by THIS call, so the
     * failure path only resets fields it is responsible for (the state
     * check below can fail while another epoch still owns these fields). */
    int ranks_allocated = 0;
    int reqs_allocated = 0;
    MPIU_CHKLMEM_DECL(2);
    MPIU_CHKPMEM_DECL(2);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_START);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_START);

    /* An access epoch opened by FENCE cannot be told apart here (FENCE may
     * leave the window state untouched), so only an overlap with another
     * PSCW or passive-target access epoch is detected. */
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_NONE &&
                        win_ptr->states.access_state != MPIDI_RMA_FENCE_ISSUED &&
                        win_ptr->states.access_state != MPIDI_RMA_FENCE_GRANTED,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    win_ptr->start_grp_size = group_ptr->size;

    MPIU_CHKPMEM_MALLOC(win_ptr->start_ranks_in_win_grp, int *,
                        win_ptr->start_grp_size * sizeof(int),
                        mpi_errno, "win_ptr->start_ranks_in_win_grp");
    ranks_allocated = 1;

    mpi_errno = fill_ranks_in_win_grp(win_ptr, group_ptr, win_ptr->start_ranks_in_win_grp);
    if (mpi_errno)
        MPIU_ERR_POP(mpi_errno);

    if ((assert & MPI_MODE_NOCHECK) == 0) {
        int i, intra_cnt;
        MPI_Request *intra_start_req = NULL;
        MPI_Status *intra_start_status = NULL;
        MPID_Comm *comm_ptr = win_ptr->comm_ptr;
        int rank = comm_ptr->rank;

        /* post IRECVs */
        MPIU_CHKPMEM_MALLOC(win_ptr->start_req, MPI_Request *,
                            win_ptr->start_grp_size * sizeof(MPI_Request),
                            mpi_errno, "win_ptr->start_req");
        reqs_allocated = 1;

        /* Scratch arrays for receives that are completed inside this call;
         * sized by the node communicator. */
        if (win_ptr->shm_allocated == TRUE) {
            int node_comm_size = comm_ptr->node_comm->local_size;
            MPIU_CHKLMEM_MALLOC(intra_start_req, MPI_Request *,
                                node_comm_size * sizeof(MPI_Request), mpi_errno, "intra_start_req");
            MPIU_CHKLMEM_MALLOC(intra_start_status, MPI_Status *,
                                node_comm_size * sizeof(MPI_Status),
                                mpi_errno, "intra_start_status");
        }

        intra_cnt = 0;
        for (i = 0; i < win_ptr->start_grp_size; i++) {
            MPID_Request *req_ptr;
            MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
            int src = win_ptr->start_ranks_in_win_grp[i];

            if (src != rank) {
                MPIDI_Comm_get_vc(comm_ptr, rank, &orig_vc);
                MPIDI_Comm_get_vc(comm_ptr, src, &target_vc);

                mpi_errno = MPID_Irecv(NULL, 0, MPI_INT, src, SYNC_POST_TAG,
                                       comm_ptr, MPID_CONTEXT_INTRA_PT2PT, &req_ptr);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);

                if (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id) {
                    /* Same-node target: completed below, so the per-window
                     * slot is left as a null request. */
                    intra_start_req[intra_cnt++] = req_ptr->handle;
                    win_ptr->start_req[i] = MPI_REQUEST_NULL;
                }
                else {
                    win_ptr->start_req[i] = req_ptr->handle;
                }
            }
            else {
                /* Nothing is received from the calling rank itself. */
                win_ptr->start_req[i] = MPI_REQUEST_NULL;
            }
        }

        /* for targets on SHM, waiting until their IRECVs to be finished */
        if (intra_cnt) {
            mpi_errno = MPIR_Waitall_impl(intra_cnt, intra_start_req, intra_start_status);
            if (mpi_errno && mpi_errno != MPI_ERR_IN_STATUS)
                MPIU_ERR_POP(mpi_errno);
            /* --BEGIN ERROR HANDLING-- */
            if (mpi_errno == MPI_ERR_IN_STATUS) {
                for (i = 0; i < intra_cnt; i++) {
                    if (intra_start_status[i].MPI_ERROR != MPI_SUCCESS) {
                        mpi_errno = intra_start_status[i].MPI_ERROR;
                        MPIU_ERR_POP(mpi_errno);
                    }
                }
            }
            /* --END ERROR HANDLING-- */
        }
    }

    /* Set window access state properly. */
    win_ptr->states.access_state = MPIDI_RMA_PSCW_ISSUED;
    MPIDI_CH3I_num_active_issued_win++;

    MPIU_Assert(win_ptr->active_req_cnt == 0);

    /* Ensure ordering of load/store operations. */
    if (win_ptr->shm_allocated == TRUE) {
        OPA_read_write_barrier();
    }

  fn_exit:
    MPIU_CHKLMEM_FREEALL();
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_START);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIU_CHKPMEM_REAP();
    /* The reaped buffers were reachable through the window object; clear
     * those references so later code never sees freed memory. */
    if (ranks_allocated) {
        win_ptr->start_ranks_in_win_grp = NULL;
        win_ptr->start_grp_size = 0;
    }
    if (reqs_allocated) {
        win_ptr->start_req = NULL;
    }
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


904

905
#undef FUNCNAME
906
#define FUNCNAME MPIDI_Win_complete
907
908
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
909
int MPIDI_Win_complete(MPID_Win * win_ptr)
910
{
911
    int mpi_errno = MPI_SUCCESS;
912
913
914
    int i, dst, rank = win_ptr->comm_ptr->rank;
    MPID_Comm *win_comm_ptr = win_ptr->comm_ptr;
    MPIDI_RMA_Target_t *curr_target;
915
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_COMPLETE);
916

917
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_COMPLETE);
918

919
920
921
    /* Access epochs on the same window must be disjoint. */
    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_PSCW_ISSUED &&
                        win_ptr->states.access_state != MPIDI_RMA_PSCW_GRANTED,
James Dinan's avatar
James Dinan committed
922
923
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

924
925
926
927
928
    /* Ensure ordering of load/store operations. */
    if (win_ptr->shm_allocated == TRUE) {
        OPA_read_write_barrier();
    }

929
930
931
932
933
934
    if (win_ptr->states.access_state == MPIDI_RMA_PSCW_ISSUED) {
        while (win_ptr->states.access_state != MPIDI_RMA_PSCW_GRANTED) {
            mpi_errno = wait_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
935
936
    }

937
938
939
940
941
942
    for (i = 0; i < win_ptr->start_grp_size; i++) {
        dst = win_ptr->start_ranks_in_win_grp[i];
        if (dst == rank) {
            win_ptr->at_completion_counter--;
            MPIU_Assert(win_ptr->at_completion_counter >= 0);
            continue;
943
        }
944

945
        if (win_comm_ptr->local_size <= win_ptr->num_slots)
946
            curr_target = win_ptr->slots[dst].target_list_head;
947
        else {
948
            curr_target = win_ptr->slots[dst % win_ptr->num_slots].target_list_head;
949
950
951
            while (curr_target != NULL && curr_target->target_rank != dst)
                curr_target = curr_target->next;
        }
952

953
954
        if (curr_target != NULL) {
            curr_target->win_complete_flag = 1;
955
956
        }
        else {
957
            /* FIXME: do we need to wait for remote completion? */
958
            mpi_errno = send_decr_at_cnt_msg(dst, win_ptr, MPIDI_CH3_PKT_FLAG_NONE);
959
960
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
961
        }
962
963
    }

964
    mpi_errno = flush_local_all(win_ptr);
965
966
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);
967
968
969

    /* Cleanup all targets on this window. */
    mpi_errno = MPIDI_CH3I_RMA_Cleanup_targets_win(win_ptr);
970
971
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);
972

973
  finish_complete:
974
    /* Set window access state properly. */
975
    win_ptr->states.access_state = MPIDI_RMA_NONE;
976

977
978
979
980
    /* free start group stored in window */
    MPIU_Free(win_ptr->start_ranks_in_win_grp);
    win_ptr->start_ranks_in_win_grp = NULL;
    MPIU_Assert(win_ptr->start_req == NULL);
981

982
983
984
    /* Make sure that all targets are freed. */
    MPIU_Assert(win_ptr->non_empty_slots == 0);

985
    MPIU_Assert(win_ptr->active_req_cnt == 0);
986

987
988
  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_COMPLETE);
989
    return mpi_errno;
990
991
992
993
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
994
}
995

996
997


998
#undef FUNCNAME
999
#define FUNCNAME MPIDI_Win_wait
1000
#undef FCNAME
For faster browsing, not all history is shown. View entire blame