/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidimpl.h"
#include "mpidrma.h"

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD
      category    : CH3
      type        : int
      default     : 2097152
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
         Threshold of number of active requests to trigger
         blocking waiting in operation routines. When the
         value is negative, we never blockingly wait in
         operation routines. When the value is zero, we always
         trigger blocking waiting in operation routines to
         wait until no. of active requests becomes zero. When the
         value is positive, we do blocking waiting in operation
         routines to wait until no. of active requests being
         reduced to this value.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

/* Forward declarations of the file-local progress helpers defined below. */

/* State-switching helpers (window-wide and per-target). */
static inline int check_and_switch_window_state(MPID_Win * win_ptr, int *made_progress);
static inline int check_and_switch_target_state(MPID_Win * win_ptr, MPIDI_RMA_Target_t * target,
                                                int *made_progress);

/* Operation-issuing helpers (whole window and single target). */
static inline int issue_ops_win(MPID_Win * win_ptr, int *made_progress);
static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * target,
                                   int *made_progress);

/* check if we can switch window-wide state: FENCE_ISSUED, PSCW_ISSUED, LOCK_ALL_ISSUED */
43
#undef FUNCNAME
44
#define FUNCNAME check_and_switch_window_state
45 46
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
47
static inline int check_and_switch_window_state(MPID_Win * win_ptr, int *made_progress)
48
{
49
    MPID_Request *fence_req_ptr = NULL;
50
    int i, mpi_errno = MPI_SUCCESS;
51
    MPIDI_STATE_DECL(MPID_STATE_CHECK_AND_SWITCH_WINDOW_STATE);
52

53
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_CHECK_AND_SWITCH_WINDOW_STATE);
54 55 56

    (*made_progress) = 0;

57 58
    switch (win_ptr->states.access_state) {
    case MPIDI_RMA_FENCE_ISSUED:
59 60 61 62 63 64
        MPID_Request_get_ptr(win_ptr->fence_sync_req, fence_req_ptr);
        if (MPID_Request_is_complete(fence_req_ptr)) {
            win_ptr->states.access_state = MPIDI_RMA_FENCE_GRANTED;
            MPID_Request_release(fence_req_ptr);
            win_ptr->fence_sync_req = MPI_REQUEST_NULL;

65 66
            MPIDI_CH3I_num_active_issued_win--;
            MPIU_Assert(MPIDI_CH3I_num_active_issued_win >= 0);
67

68 69
            (*made_progress) = 1;
        }
70 71 72
        break;

    case MPIDI_RMA_PSCW_ISSUED:
73 74
        if (win_ptr->start_req == NULL) {
            /* for MPI_MODE_NOCHECK and all targets on SHM,
75
             * we do not create PSCW requests on window. */
76 77
            win_ptr->states.access_state = MPIDI_RMA_PSCW_GRANTED;

78 79
            MPIDI_CH3I_num_active_issued_win--;
            MPIU_Assert(MPIDI_CH3I_num_active_issued_win >= 0);
80

81 82 83 84 85 86 87 88 89 90 91 92 93
            (*made_progress) = 1;
        }
        else {
            for (i = 0; i < win_ptr->start_grp_size; i++) {
                MPID_Request *start_req_ptr = NULL;
                if (win_ptr->start_req[i] == MPI_REQUEST_NULL)
                    continue;
                MPID_Request_get_ptr(win_ptr->start_req[i], start_req_ptr);
                if (MPID_Request_is_complete(start_req_ptr)) {
                    MPID_Request_release(start_req_ptr);
                    win_ptr->start_req[i] = MPI_REQUEST_NULL;
                }
                else {
94
                    break;
95 96 97
                }
            }

98 99
            if (i == win_ptr->start_grp_size) {
                win_ptr->states.access_state = MPIDI_RMA_PSCW_GRANTED;
100

101 102
                MPIDI_CH3I_num_active_issued_win--;
                MPIU_Assert(MPIDI_CH3I_num_active_issued_win >= 0);
103

104 105 106 107 108
                (*made_progress) = 1;

                MPIU_Free(win_ptr->start_req);
                win_ptr->start_req = NULL;
            }
109
        }
110 111 112
        break;

    case MPIDI_RMA_LOCK_ALL_ISSUED:
113 114 115 116
        if (win_ptr->outstanding_locks == 0) {
            win_ptr->states.access_state = MPIDI_RMA_LOCK_ALL_GRANTED;
            (*made_progress) = 1;
        }
117 118 119 120
        break;

    default:
        break;
121
    }   /* end of switch */
122 123

  fn_exit:
124
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_CHECK_AND_SWITCH_WINDOW_STATE);
125 126 127 128 129 130 131 132 133
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
134
#define FUNCNAME check_and_switch_target_state
135 136
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
137 138
static inline int check_and_switch_target_state(MPID_Win * win_ptr, MPIDI_RMA_Target_t * target,
                                                int *made_progress)
139 140 141 142 143 144
{
    int rank = win_ptr->comm_ptr->rank;
    int mpi_errno = MPI_SUCCESS;

    (*made_progress) = 0;

145
    if (target == NULL)
146 147
        goto fn_exit;

148
    /* This check should only be performed when window-wide sync is finished, or
149
     * current sync is per-target sync. */
150 151 152 153 154 155 156 157 158 159 160 161
    if (win_ptr->states.access_state == MPIDI_RMA_NONE ||
        win_ptr->states.access_state == MPIDI_RMA_FENCE_ISSUED ||
        win_ptr->states.access_state == MPIDI_RMA_PSCW_ISSUED ||
        win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_ISSUED) {
        goto fn_exit;
    }

    switch (target->access_state) {
    case MPIDI_RMA_LOCK_CALLED:
        if (target->sync.sync_flag == MPIDI_RMA_SYNC_NONE ||
            target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH_LOCAL ||
            target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
162 163
            if (target->pending_op_list_head == NULL ||
                !target->pending_op_list_head->piggyback_lock_candidate) {
164 165 166 167
                /* issue lock request */
                target->access_state = MPIDI_RMA_LOCK_ISSUED;
                if (target->target_rank == rank) {
                    mpi_errno = acquire_local_lock(win_ptr, target->lock_type);
168 169
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);
170 171
                }
                else {
172 173 174
                    mpi_errno = send_lock_msg(target->target_rank, target->lock_type, win_ptr);
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);
175 176 177 178 179 180
                }

                (*made_progress) = 1;
            }
        }
        else if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
181
            if (target->pending_op_list_head == NULL) {
182
                /* No RMA operation has ever been posted to this target,
183 184
                 * finish issuing, no need to acquire the lock. Cleanup
                 * function will clean it up. */
185 186 187 188 189 190 191 192 193 194 195 196
                target->access_state = MPIDI_RMA_LOCK_GRANTED;

                target->sync.outstanding_acks--;
                MPIU_Assert(target->sync.outstanding_acks >= 0);

                /* We are done with ending synchronization, unset target's sync_flag. */
                target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;

                (*made_progress) = 1;
            }
            else {
                /* if we reach WIN_UNLOCK and there is still operation existing
197 198
                 * in pending list, this operation must be the only operation
                 * and it is prepared to piggyback LOCK and UNLOCK. */
199 200
                MPIU_Assert(target->pending_op_list_head->next == NULL);
                MPIU_Assert(target->pending_op_list_head->piggyback_lock_candidate);
201 202 203 204 205 206 207
            }
        }
        break;

    case MPIDI_RMA_LOCK_GRANTED:
    case MPIDI_RMA_NONE:
        if (target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
208
            if (target->pending_op_list_head == NULL) {
209 210 211 212 213 214 215
                if (target->target_rank == rank) {
                    target->sync.outstanding_acks--;
                    MPIU_Assert(target->sync.outstanding_acks >= 0);
                }
                else {
                    if (target->put_acc_issued) {
                        mpi_errno = send_flush_msg(target->target_rank, win_ptr);
216 217
                        if (mpi_errno != MPI_SUCCESS)
                            MPIU_ERR_POP(mpi_errno);
218 219
                    }
                    else {
220
                        /* We did not issue PUT/ACC since the last
221 222
                         * synchronization call, therefore here we
                         * don't need ACK back */
223 224
                        target->sync.outstanding_acks--;
                        MPIU_Assert(target->sync.outstanding_acks >= 0);
225 226
                    }
                }
227 228 229 230 231

                /* We are done with ending synchronization, unset target's sync_flag. */
                target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;

                (*made_progress) = 1;
232
            }
233 234
        }
        else if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
235
            if (target->pending_op_list_head == NULL) {
236
                if (target->target_rank == rank) {
237
                    target->sync.outstanding_acks--;
238
                    MPIU_Assert(target->sync.outstanding_acks >= 0);
239

240
                    mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
241 242
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);
243
                }
244 245 246 247
                else {
                    MPIDI_CH3_Pkt_flags_t flag = MPIDI_CH3_PKT_FLAG_NONE;
                    if (!target->put_acc_issued) {
                        /* We did not issue PUT/ACC since the last
248 249
                         * synchronization call, therefore here we
                         * don't need ACK back */
250 251 252 253 254 255
                        target->sync.outstanding_acks--;
                        MPIU_Assert(target->sync.outstanding_acks >= 0);

                        flag = MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_NO_ACK;
                    }
                    mpi_errno = send_unlock_msg(target->target_rank, win_ptr, flag);
256 257
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);
258 259 260 261 262
                }

                /* We are done with ending synchronization, unset target's sync_flag. */
                target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;

263 264 265
                (*made_progress) = 1;
            }
        }
266
        break;
267

268 269
    default:
        break;
270
    }   /* end of switch */
271

272
  fn_exit:
273
    return mpi_errno;
274
  fn_fail:
275 276
    goto fn_exit;
}
277 278


279 280 281 282
#undef FUNCNAME
#define FUNCNAME issue_ops_target
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
283
static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * target,
284 285 286 287 288
                                   int *made_progress)
{
    MPIDI_RMA_Op_t *curr_op = NULL;
    MPIDI_CH3_Pkt_flags_t flags;
    int first_op = 1, mpi_errno = MPI_SUCCESS;
289

290
    (*made_progress) = 0;
291

292 293
    if (win_ptr->non_empty_slots == 0 || target == NULL)
        goto fn_exit;
294

295 296 297 298 299 300
    /* Exit if window-wide sync is not finished */
    if (win_ptr->states.access_state == MPIDI_RMA_NONE ||
        win_ptr->states.access_state == MPIDI_RMA_FENCE_ISSUED ||
        win_ptr->states.access_state == MPIDI_RMA_PSCW_ISSUED ||
        win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_ISSUED)
        goto fn_exit;
301

302 303 304 305 306
    /* Exit if per-target sync is not finished */
    if (win_ptr->states.access_state == MPIDI_RMA_PER_TARGET ||
        win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
        if (target->access_state == MPIDI_RMA_LOCK_ISSUED)
            goto fn_exit;
307 308 309 310 311 312
    }

    /* Issue out operations in the list. */
    curr_op = target->next_op_to_issue;
    while (curr_op != NULL) {

313 314
        if (target->access_state == MPIDI_RMA_LOCK_ISSUED) {
            /* It is possible that the previous OP+LOCK changes
315
             * lock state to LOCK_ISSUED. */
316 317
            break;
        }
318 319

        if (curr_op->next == NULL &&
320
            target->sync.sync_flag == MPIDI_RMA_SYNC_NONE && curr_op->ureq == NULL) {
321
            /* Skip the last OP if sync_flag is NONE since we
322 323 324 325 326
             * want to leave it to the ending synchronization
             * so that we can piggyback LOCK / FLUSH.
             * However, if it is a request-based RMA, do not
             * skip it (otherwise a wait call before unlock
             * will be blocked). */
327
            break;
328 329
        }

330 331
        flags = MPIDI_CH3_PKT_FLAG_NONE;

332 333 334 335
        if (first_op) {
            /* piggyback on first OP. */
            if (target->access_state == MPIDI_RMA_LOCK_CALLED) {
                MPIU_Assert(curr_op->piggyback_lock_candidate);
336 337 338 339 340 341
                if (target->lock_type == MPI_LOCK_SHARED)
                    flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED;
                else {
                    MPIU_Assert(target->lock_type == MPI_LOCK_EXCLUSIVE);
                    flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE;
                }
342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362
                target->access_state = MPIDI_RMA_LOCK_ISSUED;
            }
            first_op = 0;
        }

        if (curr_op->next == NULL) {
            /* piggyback on last OP. */
            if (target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
                flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH;
                if (target->win_complete_flag)
                    flags |= MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER;
            }
            else if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
                flags |= MPIDI_CH3_PKT_FLAG_RMA_UNLOCK;
            }
        }

        mpi_errno = issue_rma_op(curr_op, win_ptr, target, flags);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);

363 364
        (*made_progress) = 1;

365
        if (curr_op->pkt.type == MPIDI_CH3_PKT_PUT ||
366 367 368
            curr_op->pkt.type == MPIDI_CH3_PKT_PUT_IMMED ||
            curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE ||
            curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE_IMMED) {
369
            target->put_acc_issued = 1; /* set PUT_ACC_FLAG when sending
370
                                         * PUT/ACC operation. */
371 372
        }

373 374 375 376 377 378 379 380 381 382
        if ((curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE ||
             curr_op->pkt.type == MPIDI_CH3_PKT_GET_ACCUM) && curr_op->issued_stream_count > 0) {
            /* For ACC-like operations, if not all stream units
             * are issued out, we stick to the current operation,
             * otherwise we move on to the next operation. */
            target->next_op_to_issue = curr_op;
        }
        else
            target->next_op_to_issue = curr_op->next;

383 384 385 386 387 388 389
        if (target->next_op_to_issue == NULL) {
            if (flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH || flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK) {
                /* We are done with ending sync, unset target's sync_flag. */
                target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;
            }
        }

390 391
        if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED ||
            flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE) {
392
            /* If this operation is piggybacked with LOCK,
393 394 395
             * do not move it out of pending list, and do
             * not complete the user request, because we
             * may need to re-transmit it. */
396 397 398
            break;
        }

399 400 401 402 403
        if (curr_op->ureq != NULL) {
            mpi_errno = set_user_req_after_issuing_op(curr_op);
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
404

405 406
        if (curr_op->reqs_size == 0) {
            MPIU_Assert(curr_op->reqs == NULL);
407
            /* Sending is completed immediately. */
408
            MPIDI_CH3I_RMA_Ops_free_elem(win_ptr, &(target->pending_op_list_head),
409 410 411
                                         &(target->pending_op_list_tail), curr_op);
        }
        else {
412 413 414
            MPI_Datatype target_datatype;
            int is_derived = FALSE;

415
            /* Sending is not completed immediately. */
416

417
            MPIDI_CH3I_RMA_Ops_unlink(&(target->pending_op_list_head),
418
                                      &(target->pending_op_list_tail), curr_op);
419 420 421 422 423 424 425 426 427 428 429 430 431

            MPIDI_CH3_PKT_RMA_GET_TARGET_DATATYPE(curr_op->pkt, target_datatype, mpi_errno);

            if ((target_datatype != MPI_DATATYPE_NULL &&
                 !MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) ||
                (curr_op->origin_datatype != MPI_DATATYPE_NULL &&
                 !MPIR_DATATYPE_IS_PREDEFINED(curr_op->origin_datatype)) ||
                (curr_op->result_datatype != MPI_DATATYPE_NULL &&
                 !MPIR_DATATYPE_IS_PREDEFINED(curr_op->result_datatype))) {
                is_derived = TRUE;
            }

            if (is_derived) {
432 433
                MPIDI_CH3I_RMA_Ops_append(&(target->issued_dt_op_list_head),
                                          &(target->issued_dt_op_list_tail), curr_op);
434 435
            }
            else if (curr_op->pkt.type == MPIDI_CH3_PKT_PUT ||
436 437 438
                     curr_op->pkt.type == MPIDI_CH3_PKT_PUT_IMMED ||
                     curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE ||
                     curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE_IMMED) {
439 440
                MPIDI_CH3I_RMA_Ops_append(&(target->issued_write_op_list_head),
                                          &(target->issued_write_op_list_tail), curr_op);
441 442
            }
            else {
443 444
                MPIDI_CH3I_RMA_Ops_append(&(target->issued_read_op_list_head),
                                          &(target->issued_read_op_list_tail), curr_op);
445 446 447 448 449
            }
        }

        curr_op = target->next_op_to_issue;

450
    }   /* end of while loop */
451 452 453 454 455 456 457 458 459 460 461

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME issue_ops_win
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
462
static inline int issue_ops_win(MPID_Win * win_ptr, int *made_progress)
463 464
{
    int mpi_errno = MPI_SUCCESS;
465
    int start_slot, end_slot, i, idx;
466 467 468 469 470 471 472
    MPIDI_RMA_Target_t *target = NULL;

    (*made_progress) = 0;

    if (win_ptr->non_empty_slots == 0)
        goto fn_exit;

473 474 475 476 477 478 479 480
    /* Exit if window-wide sync is not finished */
    if (win_ptr->states.access_state == MPIDI_RMA_NONE ||
        win_ptr->states.access_state == MPIDI_RMA_FENCE_ISSUED ||
        win_ptr->states.access_state == MPIDI_RMA_PSCW_ISSUED ||
        win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_ISSUED)
        goto fn_exit;

    /* FIXME: we should optimize the issuing pattern here. */
481 482 483 484

    start_slot = win_ptr->comm_ptr->rank % win_ptr->num_slots;
    end_slot = start_slot + win_ptr->num_slots;
    for (i = start_slot; i < end_slot; i++) {
485 486 487 488
        if (i < win_ptr->num_slots)
            idx = i;
        else
            idx = i - win_ptr->num_slots;
489

490
        target = win_ptr->slots[idx].target_list_head;
491
        while (target != NULL) {
492 493 494
            int temp_progress = 0;

            /* check target state */
495
            mpi_errno = check_and_switch_target_state(win_ptr, target, &temp_progress);
496 497 498 499
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
            if (temp_progress)
                (*made_progress) = 1;
500

501 502
            /* issue operations to this target */
            mpi_errno = issue_ops_target(win_ptr, target, &temp_progress);
503 504 505 506
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
            if (temp_progress)
                (*made_progress) = 1;
507 508 509 510 511

            target = target->next;
        }
    }

512
  fn_exit:
513
    return mpi_errno;
514
  fn_fail:
515 516 517 518
    goto fn_exit;
}


519 520 521 522 523 524 525 526
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Free_ops_before_completion
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/* Free the already-issued read and write operations of one target before
 * they have completed, releasing the requests they hold.
 *
 * The routine makes one round of nonblocking progress on the window, picks
 * the first target that has entries on its issued read or write list (and,
 * under PER_TARGET / LOCK_ALL_CALLED, whose lock is already granted), marks
 * it so that later flush_local calls are upgraded to flush, and then frees
 * every operation on both lists.
 *
 *   win_ptr - window whose operations are to be freed
 *
 * Returns MPI_SUCCESS, an MPI_ERR_OTHER ("**rmanoop") error when the window
 * has no non-empty slot, or the error propagated from the progress call. */
int MPIDI_CH3I_RMA_Free_ops_before_completion(MPID_Win * win_ptr)
{
    MPIDI_RMA_Op_t *curr_op = NULL;
    MPIDI_RMA_Target_t *curr_target = NULL;
    MPIDI_RMA_Op_t **op_list_head = NULL, **op_list_tail = NULL;
    int i, pass, made_progress = 0;
    int mpi_errno = MPI_SUCCESS;

    /* If we are in a free_ops_before_completion, the window must be holding
     * up resources.  If it isn't, we are in the wrong window and
     * incorrectly entered this function. */
    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER, "**rmanoop");

    /* make nonblocking progress once */
    mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);

    /* Nothing can be freed while window-wide synchronization is pending. */
    if (win_ptr->states.access_state == MPIDI_RMA_FENCE_ISSUED ||
        win_ptr->states.access_state == MPIDI_RMA_PSCW_ISSUED ||
        win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_ISSUED)
        goto fn_exit;

    /* find targets that have operations */
    for (i = 0; i < win_ptr->num_slots; i++) {
        if (win_ptr->slots[i].target_list_head != NULL) {
            curr_target = win_ptr->slots[i].target_list_head;
            while (curr_target != NULL) {
                if (curr_target->issued_read_op_list_head != NULL ||
                    curr_target->issued_write_op_list_head != NULL) {
                    if (win_ptr->states.access_state == MPIDI_RMA_PER_TARGET ||
                        win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
                        /* per-target sync: only take a target whose lock
                         * has been granted */
                        if (curr_target->access_state == MPIDI_RMA_LOCK_GRANTED)
                            break;
                    }
                    else {
                        break;
                    }
                }
                curr_target = curr_target->next;
            }
            if (curr_target != NULL)
                break;
        }
    }

    if (curr_target == NULL)
        goto fn_exit;

    /* After we do this, all following Win_flush_local
     * must do a Win_flush instead. */
    curr_target->sync.upgrade_flush_local = 1;

    /* Free all ops in the issued read list (pass 0) and then in the issued
     * write list (pass 1) since we do not need to maintain them anymore.
     * Both the head AND the tail pointer must be switched to the list being
     * drained so that MPL_LL_DELETE updates the right list. */
    for (pass = 0; pass < 2; pass++) {
        if (pass == 0) {
            op_list_head = &curr_target->issued_read_op_list_head;
            op_list_tail = &curr_target->issued_read_op_list_tail;
        }
        else {
            op_list_head = &curr_target->issued_write_op_list_head;
            op_list_tail = &curr_target->issued_write_op_list_tail;
        }

        while ((curr_op = *op_list_head) != NULL) {
            if (curr_op->reqs_size > 0) {
                MPIU_Assert(curr_op->reqs != NULL);
                for (i = 0; i < curr_op->reqs_size; i++) {
                    if (curr_op->reqs[i] != NULL) {
                        MPID_Request_release(curr_op->reqs[i]);
                        curr_op->reqs[i] = NULL;
                        win_ptr->active_req_cnt--;
                    }
                }

                /* free req array in this op */
                MPIU_Free(curr_op->reqs);
                curr_op->reqs = NULL;
                curr_op->reqs_size = 0;
            }
            MPL_LL_DELETE(*op_list_head, *op_list_tail, curr_op);
            MPIDI_CH3I_Win_op_free(win_ptr, curr_op);
        }
    }

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Cleanup_ops_aggressive
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/* Pick the first target that still has pending operations, issue them all
 * and wait until they are locally complete.
 *
 *   win_ptr - window to clean up
 *
 * Returns MPI_SUCCESS, an MPI_ERR_OTHER ("**rmanoop") error when the window
 * has no non-empty slot, or the error propagated from a called routine. */
int MPIDI_CH3I_RMA_Cleanup_ops_aggressive(MPID_Win * win_ptr)
{
    int i, local_completed = 0, remote_completed ATTRIBUTE((unused)) = 0;
    int mpi_errno = MPI_SUCCESS;
    MPIDI_RMA_Target_t *curr_target = NULL;
    MPIDI_RMA_Target_t *candidate = NULL;
    int made_progress = 0;

    /* If we are in an aggressive cleanup, the window must be holding
     * up resources.  If it isn't, we are in the wrong window and
     * incorrectly entered this function. */
    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER, "**rmanoop");

    /* find the first target that has something to issue */
    for (i = 0; i < win_ptr->num_slots && curr_target == NULL; i++) {
        for (candidate = win_ptr->slots[i].target_list_head; candidate != NULL;
             candidate = candidate->next) {
            if (candidate->pending_op_list_head != NULL) {
                curr_target = candidate;
                break;
            }
        }
    }

    if (curr_target == NULL)
        goto fn_exit;

    /* Request at least a local flush on the chosen target. */
    if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH_LOCAL)
        curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH_LOCAL;

    /* Issue out all operations. */
    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, curr_target->target_rank,
                                                    &made_progress);
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);

    /* Wait for local completion. */
    for (;;) {
        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, curr_target);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);

        MPIDI_CH3I_RMA_ops_completion(win_ptr, curr_target, local_completed, remote_completed);
        if (local_completed)
            break;

        mpi_errno = wait_progress_engine();
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
    }

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Cleanup_target_aggressive
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/* Free up a target element: repeatedly flush the first target of the first
 * non-empty slot, wait for its remote completion, dequeue and free it, and
 * then try to allocate a target, until the allocation succeeds.
 *
 *   win_ptr - window to clean up
 *   target  - output; set to NULL on entry and to the allocated target on
 *             success
 *
 * Returns MPI_SUCCESS, an MPI_ERR_OTHER ("**rmanotarget") error when the
 * window has no non-empty slot, or the error propagated from a called
 * routine. */
int MPIDI_CH3I_RMA_Cleanup_target_aggressive(MPID_Win * win_ptr, MPIDI_RMA_Target_t ** target)
{
    int i, local_completed ATTRIBUTE((unused)) = 0, remote_completed = 0;
    int made_progress = 0;
    MPIDI_RMA_Target_t *curr_target = NULL;
    int mpi_errno = MPI_SUCCESS;

    *target = NULL;

    /* If we are in an aggressive cleanup, the window must be holding
     * up resources.  If it isn't, we are in the wrong window and
     * incorrectly entered this function. */
    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER, "**rmanotarget");

    if (win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
        /* switch to window-wide protocol: send a shared lock request to
         * every process on a different node that has no target element. */
        MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;

        MPIDI_Comm_get_vc(win_ptr->comm_ptr, win_ptr->comm_ptr->rank, &orig_vc);
        for (i = 0; i < win_ptr->comm_ptr->local_size; i++) {
            if (i == win_ptr->comm_ptr->rank)
                continue;

            MPIDI_Comm_get_vc(win_ptr->comm_ptr, i, &target_vc);
            if (orig_vc->node_id == target_vc->node_id)
                continue;

            mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, i, &curr_target);
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
            if (curr_target != NULL)
                continue;

            win_ptr->outstanding_locks++;
            mpi_errno = send_lock_msg(i, MPI_LOCK_SHARED, win_ptr);
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
        win_ptr->states.access_state = MPIDI_RMA_LOCK_ALL_ISSUED;
    }

    do {
        /* find a non-empty slot and set the FLUSH flag on the first
         * target */
        /* TODO: we should think about better strategies on selecting the target */
        i = 0;
        while (i < win_ptr->num_slots && win_ptr->slots[i].target_list_head == NULL)
            i++;
        curr_target = win_ptr->slots[i].target_list_head;

        if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
            curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
            curr_target->sync.outstanding_acks++;
        }

        /* Issue out all operations. */
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, curr_target->target_rank,
                                                        &made_progress);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);

        /* Wait for remote completion. */
        for (;;) {
            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, curr_target);
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);

            MPIDI_CH3I_RMA_ops_completion(win_ptr, curr_target, local_completed, remote_completed);
            if (remote_completed)
                break;

            mpi_errno = wait_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }

        /* Cleanup the target. */
        mpi_errno = MPIDI_CH3I_Win_target_dequeue_and_free(win_ptr, curr_target);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);

        /* check if we got a target */
        *target = MPIDI_CH3I_Win_target_alloc(win_ptr);

    } while (*target == NULL);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Make_progress_target
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/* Make one round of nonblocking progress for a single target: switch the
 * window state, look up the target, switch the target state and issue the
 * target's operations.
 *
 *   win_ptr       - window to make progress on
 *   target_rank   - rank used to look up the target element
 *   made_progress - output; 1 if any of the steps reported progress
 *
 * Returns MPI_SUCCESS or the first error reported by one of the steps. */
int MPIDI_CH3I_RMA_Make_progress_target(MPID_Win * win_ptr, int target_rank, int *made_progress)
{
    int step_progress = 0;
    MPIDI_RMA_Target_t *tgt = NULL;
    int mpi_errno = MPI_SUCCESS;

    *made_progress = 0;

    /* Step 1: window-wide state. */
    mpi_errno = check_and_switch_window_state(win_ptr, &step_progress);
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);
    *made_progress |= (step_progress != 0);

    /* Step 2: locate the target element for this rank. */
    mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, target_rank, &tgt);
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);

    /* Step 3: per-target state. */
    mpi_errno = check_and_switch_target_state(win_ptr, tgt, &step_progress);
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);
    *made_progress |= (step_progress != 0);

    /* Step 4: issue operations to this target. */
    mpi_errno = issue_ops_target(win_ptr, tgt, &step_progress);
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);
    *made_progress |= (step_progress != 0);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Make_progress_win
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/* Make one round of nonblocking progress on a whole window: switch the
 * window state, then issue operations on all of its targets.
 *
 *   win_ptr       - window to make progress on
 *   made_progress - output; 1 if either step reported progress
 *
 * Returns MPI_SUCCESS or the first error reported by one of the steps. */
int MPIDI_CH3I_RMA_Make_progress_win(MPID_Win * win_ptr, int *made_progress)
{
    int step_progress = 0;
    int mpi_errno = MPI_SUCCESS;

    *made_progress = 0;

    /* Step 1: window-wide state. */
    mpi_errno = check_and_switch_window_state(win_ptr, &step_progress);
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);
    *made_progress |= (step_progress != 0);

    /* Step 2: issue operations on window. */
    mpi_errno = issue_ops_win(win_ptr, &step_progress);
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);
    *made_progress |= (step_progress != 0);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Make_progress_global
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3I_RMA_Make_progress_global(int *made_progress)
{
    MPIDI_RMA_Win_list_t *win_elem = MPIDI_RMA_Win_list;
    int mpi_errno = MPI_SUCCESS;

    (*made_progress) = 0;

    for (win_elem = MPIDI_RMA_Win_list; win_elem; win_elem = win_elem->next) {
867
        int temp_progress = 0;
868

869 870 871 872
        if (win_elem->win_ptr->states.access_state == MPIDI_RMA_NONE ||
            win_elem->win_ptr->states.access_state == MPIDI_RMA_FENCE_GRANTED ||
            win_elem->win_ptr->states.access_state == MPIDI_RMA_PSCW_GRANTED)
            continue;
873

874
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_elem->win_ptr, &temp_progress);
875 876 877 878
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
        if (temp_progress)
            (*made_progress) = 1;
879 880 881 882 883 884 885
    }

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}