/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidimpl.h"
#include "mpidrma.h"

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD
      category    : CH3
      type        : int
      default     : 2097152
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
         Threshold of number of active requests to trigger
         blocking waiting in operation routines. When the
         value is negative, we never blockingly wait in
         operation routines. When the value is zero, we always
         trigger blocking waiting in operation routines to
         wait until no. of active requests becomes zero. When the
         value is positive, we do blocking waiting in operation
         routines to wait until no. of active requests being
         reduced to this value.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

/* Forward declarations of the file-local progress helpers defined below.
 * Each helper reports through (*made_progress) whether it changed any state
 * or issued anything, and returns an MPI error code. */

/* State checks. */
static inline int check_window_state(MPID_Win * win_ptr, int *made_progress);
static inline int check_target_state(MPID_Win * win_ptr, MPIDI_RMA_Target_t * target,
                                     int *made_progress);

/* Operation issuing. */
static inline int issue_ops_win(MPID_Win * win_ptr, int *made_progress);
static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * target,
                                   int *made_progress);

42
/* check if we can switch window-wide state: FENCE_ISSUED, PSCW_ISSUED, LOCK_ALL_ISSUED */
43 44 45 46
#undef FUNCNAME
#define FUNCNAME check_window_state
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
47
static inline int check_window_state(MPID_Win * win_ptr, int *made_progress)
48
{
49
    MPID_Request *fence_req_ptr = NULL;
50 51 52 53 54 55 56
    int i, mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_CHECK_WINDOW_STATE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_CHECK_WINDOW_STATE);

    (*made_progress) = 0;

57 58
    switch (win_ptr->states.access_state) {
    case MPIDI_RMA_FENCE_ISSUED:
59 60 61 62 63 64
        MPID_Request_get_ptr(win_ptr->fence_sync_req, fence_req_ptr);
        if (MPID_Request_is_complete(fence_req_ptr)) {
            win_ptr->states.access_state = MPIDI_RMA_FENCE_GRANTED;
            MPID_Request_release(fence_req_ptr);
            win_ptr->fence_sync_req = MPI_REQUEST_NULL;

65 66 67
            num_active_issued_win--;
            MPIU_Assert(num_active_issued_win >= 0);

68 69
            (*made_progress) = 1;
        }
70 71 72
        break;

    case MPIDI_RMA_PSCW_ISSUED:
73 74
        if (win_ptr->start_req == NULL) {
            /* for MPI_MODE_NOCHECK and all targets on SHM,
75
             * we do not create PSCW requests on window. */
76 77
            win_ptr->states.access_state = MPIDI_RMA_PSCW_GRANTED;

78 79 80
            num_active_issued_win--;
            MPIU_Assert(num_active_issued_win >= 0);

81 82 83 84 85 86 87 88 89 90 91 92 93
            (*made_progress) = 1;
        }
        else {
            for (i = 0; i < win_ptr->start_grp_size; i++) {
                MPID_Request *start_req_ptr = NULL;
                if (win_ptr->start_req[i] == MPI_REQUEST_NULL)
                    continue;
                MPID_Request_get_ptr(win_ptr->start_req[i], start_req_ptr);
                if (MPID_Request_is_complete(start_req_ptr)) {
                    MPID_Request_release(start_req_ptr);
                    win_ptr->start_req[i] = MPI_REQUEST_NULL;
                }
                else {
94
                    break;
95 96 97
                }
            }

98 99
            if (i == win_ptr->start_grp_size) {
                win_ptr->states.access_state = MPIDI_RMA_PSCW_GRANTED;
100

101 102
                num_active_issued_win--;
                MPIU_Assert(num_active_issued_win >= 0);
103

104 105 106 107 108
                (*made_progress) = 1;

                MPIU_Free(win_ptr->start_req);
                win_ptr->start_req = NULL;
            }
109
        }
110 111 112
        break;

    case MPIDI_RMA_LOCK_ALL_ISSUED:
113 114 115 116
        if (win_ptr->outstanding_locks == 0) {
            win_ptr->states.access_state = MPIDI_RMA_LOCK_ALL_GRANTED;
            (*made_progress) = 1;
        }
117 118 119 120
        break;

    default:
        break;
121
    }   /* end of switch */
122 123 124 125 126 127 128 129 130 131 132 133

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_CHECK_WINDOW_STATE);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
134
#define FUNCNAME check_target_state
135 136
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
137
static inline int check_target_state(MPID_Win * win_ptr, MPIDI_RMA_Target_t * target,
138
                                     int *made_progress)
139 140 141 142 143 144
{
    int rank = win_ptr->comm_ptr->rank;
    int mpi_errno = MPI_SUCCESS;

    (*made_progress) = 0;

145
    if (target == NULL)
146 147
        goto fn_exit;

148
    /* This check should only be performed when window-wide sync is finished, or
149
     * current sync is per-target sync. */
150 151 152 153 154 155 156 157 158 159 160 161
    if (win_ptr->states.access_state == MPIDI_RMA_NONE ||
        win_ptr->states.access_state == MPIDI_RMA_FENCE_ISSUED ||
        win_ptr->states.access_state == MPIDI_RMA_PSCW_ISSUED ||
        win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_ISSUED) {
        goto fn_exit;
    }

    switch (target->access_state) {
    case MPIDI_RMA_LOCK_CALLED:
        if (target->sync.sync_flag == MPIDI_RMA_SYNC_NONE ||
            target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH_LOCAL ||
            target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
162 163
            if (target->pending_op_list_head == NULL ||
                !target->pending_op_list_head->piggyback_lock_candidate) {
164 165 166 167
                /* issue lock request */
                target->access_state = MPIDI_RMA_LOCK_ISSUED;
                if (target->target_rank == rank) {
                    mpi_errno = acquire_local_lock(win_ptr, target->lock_type);
168 169
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);
170 171
                }
                else {
172 173 174
                    mpi_errno = send_lock_msg(target->target_rank, target->lock_type, win_ptr);
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);
175 176 177 178 179 180
                }

                (*made_progress) = 1;
            }
        }
        else if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
181
            if (target->pending_op_list_head == NULL) {
182
                /* No RMA operation has ever been posted to this target,
183 184
                 * finish issuing, no need to acquire the lock. Cleanup
                 * function will clean it up. */
185 186 187 188 189 190 191 192 193 194 195 196
                target->access_state = MPIDI_RMA_LOCK_GRANTED;

                target->sync.outstanding_acks--;
                MPIU_Assert(target->sync.outstanding_acks >= 0);

                /* We are done with ending synchronization, unset target's sync_flag. */
                target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;

                (*made_progress) = 1;
            }
            else {
                /* if we reach WIN_UNLOCK and there is still operation existing
197 198
                 * in pending list, this operation must be the only operation
                 * and it is prepared to piggyback LOCK and UNLOCK. */
199 200
                MPIU_Assert(target->pending_op_list_head->next == NULL);
                MPIU_Assert(target->pending_op_list_head->piggyback_lock_candidate);
201 202 203 204 205 206 207
            }
        }
        break;

    case MPIDI_RMA_LOCK_GRANTED:
    case MPIDI_RMA_NONE:
        if (target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
208
            if (target->pending_op_list_head == NULL) {
209 210 211 212 213 214 215
                if (target->target_rank == rank) {
                    target->sync.outstanding_acks--;
                    MPIU_Assert(target->sync.outstanding_acks >= 0);
                }
                else {
                    if (target->put_acc_issued) {
                        mpi_errno = send_flush_msg(target->target_rank, win_ptr);
216 217
                        if (mpi_errno != MPI_SUCCESS)
                            MPIU_ERR_POP(mpi_errno);
218 219
                    }
                    else {
220
                        /* We did not issue PUT/ACC since the last
221 222
                         * synchronization call, therefore here we
                         * don't need ACK back */
223 224
                        target->sync.outstanding_acks--;
                        MPIU_Assert(target->sync.outstanding_acks >= 0);
225 226
                    }
                }
227 228 229 230 231

                /* We are done with ending synchronization, unset target's sync_flag. */
                target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;

                (*made_progress) = 1;
232
            }
233 234
        }
        else if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
235
            if (target->pending_op_list_head == NULL) {
236
                if (target->target_rank == rank) {
237
                    target->sync.outstanding_acks--;
238
                    MPIU_Assert(target->sync.outstanding_acks >= 0);
239

240
                    mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
241 242
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);
243
                }
244 245 246 247
                else {
                    MPIDI_CH3_Pkt_flags_t flag = MPIDI_CH3_PKT_FLAG_NONE;
                    if (!target->put_acc_issued) {
                        /* We did not issue PUT/ACC since the last
248 249
                         * synchronization call, therefore here we
                         * don't need ACK back */
250 251 252 253 254 255
                        target->sync.outstanding_acks--;
                        MPIU_Assert(target->sync.outstanding_acks >= 0);

                        flag = MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_NO_ACK;
                    }
                    mpi_errno = send_unlock_msg(target->target_rank, win_ptr, flag);
256 257
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);
258 259 260 261 262
                }

                /* We are done with ending synchronization, unset target's sync_flag. */
                target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;

263 264 265
                (*made_progress) = 1;
            }
        }
266
        break;
267

268 269
    default:
        break;
270
    }   /* end of switch */
271

272
  fn_exit:
273
    return mpi_errno;
274
  fn_fail:
275 276
    goto fn_exit;
}
277 278


279 280 281 282
#undef FUNCNAME
#define FUNCNAME issue_ops_target
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
283
static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * target,
284 285 286 287 288
                                   int *made_progress)
{
    MPIDI_RMA_Op_t *curr_op = NULL;
    MPIDI_CH3_Pkt_flags_t flags;
    int first_op = 1, mpi_errno = MPI_SUCCESS;
289

290
    (*made_progress) = 0;
291

292 293
    if (win_ptr->non_empty_slots == 0 || target == NULL)
        goto fn_exit;
294

295 296 297 298 299 300
    /* Exit if window-wide sync is not finished */
    if (win_ptr->states.access_state == MPIDI_RMA_NONE ||
        win_ptr->states.access_state == MPIDI_RMA_FENCE_ISSUED ||
        win_ptr->states.access_state == MPIDI_RMA_PSCW_ISSUED ||
        win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_ISSUED)
        goto fn_exit;
301

302 303 304 305 306
    /* Exit if per-target sync is not finished */
    if (win_ptr->states.access_state == MPIDI_RMA_PER_TARGET ||
        win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
        if (target->access_state == MPIDI_RMA_LOCK_ISSUED)
            goto fn_exit;
307 308 309 310 311 312
    }

    /* Issue out operations in the list. */
    curr_op = target->next_op_to_issue;
    while (curr_op != NULL) {

313 314
        if (target->access_state == MPIDI_RMA_LOCK_ISSUED) {
            /* It is possible that the previous OP+LOCK changes
315
             * lock state to LOCK_ISSUED. */
316 317
            break;
        }
318 319

        if (curr_op->next == NULL &&
320
            target->sync.sync_flag == MPIDI_RMA_SYNC_NONE && curr_op->ureq == NULL) {
321
            /* Skip the last OP if sync_flag is NONE since we
322 323 324 325 326
             * want to leave it to the ending synchronization
             * so that we can piggyback LOCK / FLUSH.
             * However, if it is a request-based RMA, do not
             * skip it (otherwise a wait call before unlock
             * will be blocked). */
327
            break;
328 329
        }

330 331
        flags = MPIDI_CH3_PKT_FLAG_NONE;

332 333 334 335
        if (first_op) {
            /* piggyback on first OP. */
            if (target->access_state == MPIDI_RMA_LOCK_CALLED) {
                MPIU_Assert(curr_op->piggyback_lock_candidate);
336 337 338 339 340 341
                if (target->lock_type == MPI_LOCK_SHARED)
                    flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED;
                else {
                    MPIU_Assert(target->lock_type == MPI_LOCK_EXCLUSIVE);
                    flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE;
                }
342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362
                target->access_state = MPIDI_RMA_LOCK_ISSUED;
            }
            first_op = 0;
        }

        if (curr_op->next == NULL) {
            /* piggyback on last OP. */
            if (target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
                flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH;
                if (target->win_complete_flag)
                    flags |= MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER;
            }
            else if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
                flags |= MPIDI_CH3_PKT_FLAG_RMA_UNLOCK;
            }
        }

        mpi_errno = issue_rma_op(curr_op, win_ptr, target, flags);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);

363 364
        (*made_progress) = 1;

365
        if (curr_op->pkt.type == MPIDI_CH3_PKT_PUT ||
366 367 368
            curr_op->pkt.type == MPIDI_CH3_PKT_PUT_IMMED ||
            curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE ||
            curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE_IMMED) {
369
            target->put_acc_issued = 1; /* set PUT_ACC_FLAG when sending
370
                                         * PUT/ACC operation. */
371 372
        }

373 374 375 376 377 378 379 380 381 382
        if ((curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE ||
             curr_op->pkt.type == MPIDI_CH3_PKT_GET_ACCUM) && curr_op->issued_stream_count > 0) {
            /* For ACC-like operations, if not all stream units
             * are issued out, we stick to the current operation,
             * otherwise we move on to the next operation. */
            target->next_op_to_issue = curr_op;
        }
        else
            target->next_op_to_issue = curr_op->next;

383 384 385 386 387 388 389
        if (target->next_op_to_issue == NULL) {
            if (flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH || flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK) {
                /* We are done with ending sync, unset target's sync_flag. */
                target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;
            }
        }

390 391
        if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED ||
            flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE) {
392
            /* If this operation is piggybacked with LOCK,
393 394 395
             * do not move it out of pending list, and do
             * not complete the user request, because we
             * may need to re-transmit it. */
396 397 398
            break;
        }

399 400 401 402 403
        if (curr_op->ureq != NULL) {
            mpi_errno = set_user_req_after_issuing_op(curr_op);
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
404

405 406
        if (curr_op->reqs_size == 0) {
            MPIU_Assert(curr_op->reqs == NULL);
407
            /* Sending is completed immediately. */
408
            MPIDI_CH3I_RMA_Ops_free_elem(win_ptr, &(target->pending_op_list_head),
409 410 411
                                         &(target->pending_op_list_tail), curr_op);
        }
        else {
412 413 414
            MPI_Datatype target_datatype;
            int is_derived = FALSE;

415
            /* Sending is not completed immediately. */
416

417
            MPIDI_CH3I_RMA_Ops_unlink(&(target->pending_op_list_head),
418
                                      &(target->pending_op_list_tail), curr_op);
419 420 421 422 423 424 425 426 427 428 429 430 431

            MPIDI_CH3_PKT_RMA_GET_TARGET_DATATYPE(curr_op->pkt, target_datatype, mpi_errno);

            if ((target_datatype != MPI_DATATYPE_NULL &&
                 !MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) ||
                (curr_op->origin_datatype != MPI_DATATYPE_NULL &&
                 !MPIR_DATATYPE_IS_PREDEFINED(curr_op->origin_datatype)) ||
                (curr_op->result_datatype != MPI_DATATYPE_NULL &&
                 !MPIR_DATATYPE_IS_PREDEFINED(curr_op->result_datatype))) {
                is_derived = TRUE;
            }

            if (is_derived) {
432 433
                MPIDI_CH3I_RMA_Ops_append(&(target->issued_dt_op_list_head),
                                          &(target->issued_dt_op_list_tail), curr_op);
434 435
            }
            else if (curr_op->pkt.type == MPIDI_CH3_PKT_PUT ||
436 437 438
                     curr_op->pkt.type == MPIDI_CH3_PKT_PUT_IMMED ||
                     curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE ||
                     curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE_IMMED) {
439 440
                MPIDI_CH3I_RMA_Ops_append(&(target->issued_write_op_list_head),
                                          &(target->issued_write_op_list_tail), curr_op);
441 442
            }
            else {
443 444
                MPIDI_CH3I_RMA_Ops_append(&(target->issued_read_op_list_head),
                                          &(target->issued_read_op_list_tail), curr_op);
445 446 447 448 449
            }
        }

        curr_op = target->next_op_to_issue;

450
    }   /* end of while loop */
451 452 453 454 455 456 457 458 459 460 461

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME issue_ops_win
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
462
static inline int issue_ops_win(MPID_Win * win_ptr, int *made_progress)
463 464
{
    int mpi_errno = MPI_SUCCESS;
465
    int start_slot, end_slot, i, idx;
466 467 468 469 470 471 472
    MPIDI_RMA_Target_t *target = NULL;

    (*made_progress) = 0;

    if (win_ptr->non_empty_slots == 0)
        goto fn_exit;

473 474 475 476 477 478 479 480
    /* Exit if window-wide sync is not finished */
    if (win_ptr->states.access_state == MPIDI_RMA_NONE ||
        win_ptr->states.access_state == MPIDI_RMA_FENCE_ISSUED ||
        win_ptr->states.access_state == MPIDI_RMA_PSCW_ISSUED ||
        win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_ISSUED)
        goto fn_exit;

    /* FIXME: we should optimize the issuing pattern here. */
481 482 483 484

    start_slot = win_ptr->comm_ptr->rank % win_ptr->num_slots;
    end_slot = start_slot + win_ptr->num_slots;
    for (i = start_slot; i < end_slot; i++) {
485 486 487 488
        if (i < win_ptr->num_slots)
            idx = i;
        else
            idx = i - win_ptr->num_slots;
489

490
        target = win_ptr->slots[idx].target_list_head;
491
        while (target != NULL) {
492 493 494 495
            int temp_progress = 0;

            /* check target state */
            mpi_errno = check_target_state(win_ptr, target, &temp_progress);
496 497 498 499
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
            if (temp_progress)
                (*made_progress) = 1;
500

501 502
            /* issue operations to this target */
            mpi_errno = issue_ops_target(win_ptr, target, &temp_progress);
503 504 505 506
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
            if (temp_progress)
                (*made_progress) = 1;
507 508 509 510 511

            target = target->next;
        }
    }

512
  fn_exit:
513
    return mpi_errno;
514
  fn_fail:
515 516 517 518
    goto fn_exit;
}


519 520 521 522 523 524 525 526
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Free_ops_before_completion
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3I_RMA_Free_ops_before_completion(MPID_Win * win_ptr)
{
    MPIDI_RMA_Op_t *curr_op = NULL;
    MPIDI_RMA_Target_t *curr_target = NULL;
527
    MPIDI_RMA_Op_t **op_list_head = NULL, **op_list_tail = NULL;
528 529 530 531
    int read_flag = 0;
    int i, made_progress = 0;
    int mpi_errno = MPI_SUCCESS;

532 533 534
    /* If we are in an free_ops_before_completion, the window must be holding
     * up resources.  If it isn't, we are in the wrong window and
     * incorrectly entered this function. */
535
    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER, "**rmanoop");
536 537

    /* make nonblocking progress once */
538
    mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
539 540
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);
541

542
    if (win_ptr->states.access_state == MPIDI_RMA_FENCE_ISSUED ||
543 544
        win_ptr->states.access_state == MPIDI_RMA_PSCW_ISSUED ||
        win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_ISSUED)
545 546 547 548
        goto fn_exit;

    /* find targets that have operations */
    for (i = 0; i < win_ptr->num_slots; i++) {
549 550
        if (win_ptr->slots[i].target_list_head != NULL) {
            curr_target = win_ptr->slots[i].target_list_head;
551
            while (curr_target != NULL) {
552 553
                if (curr_target->issued_read_op_list_head != NULL ||
                    curr_target->issued_write_op_list_head != NULL) {
554 555 556 557 558 559 560 561 562
                    if (win_ptr->states.access_state == MPIDI_RMA_PER_TARGET ||
                        win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
                        if (curr_target->access_state == MPIDI_RMA_LOCK_GRANTED)
                            break;
                    }
                    else {
                        break;
                    }
                }
563
                curr_target = curr_target->next;
564
            }
565 566
            if (curr_target != NULL)
                break;
567 568
        }
    }
569

570 571
    if (curr_target == NULL)
        goto fn_exit;
572

573
    /* After we do this, all following Win_flush_local
574
     * must do a Win_flush instead. */
575
    curr_target->sync.upgrade_flush_local = 1;
576

577 578 579
    if (curr_target->issued_read_op_list_head != NULL) {
        op_list_head = &curr_target->issued_read_op_list_head;
        op_list_tail = &curr_target->issued_read_op_list_tail;
580 581 582
        read_flag = 1;
    }
    else {
583 584
        op_list_head = &curr_target->issued_write_op_list_head;
        op_list_tail = &curr_target->issued_write_op_list_tail;
585 586 587
    }

    /* free all ops in the list since we do not need to maintain them anymore */
588
    for (curr_op = *op_list_head; curr_op != NULL;) {
589 590 591 592 593 594 595 596 597 598 599 600 601 602 603
        if (curr_op->reqs_size > 0) {
            MPIU_Assert(curr_op->reqs != NULL);
            for (i = 0; i < curr_op->reqs_size; i++) {
                if (curr_op->reqs[i] != NULL) {
                    MPID_Request_release(curr_op->reqs[i]);
                    curr_op->reqs[i] = NULL;
                    win_ptr->active_req_cnt--;
                }
            }

            /* free req array in this op */
            MPIU_Free(curr_op->reqs);
            curr_op->reqs = NULL;
            curr_op->reqs_size = 0;
        }
604
        MPL_LL_DELETE(*op_list_head, *op_list_tail, curr_op);
605
        MPIDI_CH3I_Win_op_free(win_ptr, curr_op);
606

607
        if (*op_list_head == NULL) {
608
            if (read_flag == 1) {
609 610
                op_list_head = &curr_target->issued_write_op_list_head;
                op_list_head = &curr_target->issued_write_op_list_tail;
611 612 613
                read_flag = 0;
            }
        }
614
        curr_op = *op_list_head;
615
    }
616

617
  fn_exit:
618
    return mpi_errno;
619
  fn_fail:
620 621 622 623
    goto fn_exit;
}


624 625 626 627 628 629 630 631 632 633 634 635 636 637
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Cleanup_ops_aggressive
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3I_RMA_Cleanup_ops_aggressive(MPID_Win * win_ptr)
{
    int i, local_completed = 0, remote_completed = 0;
    int mpi_errno = MPI_SUCCESS;
    MPIDI_RMA_Target_t *curr_target = NULL;
    int made_progress = 0;

    /* If we are in an aggressive cleanup, the window must be holding
     * up resources.  If it isn't, we are in the wrong window and
     * incorrectly entered this function. */
638
    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER, "**rmanoop");
639 640 641

    /* find the first target that has something to issue */
    for (i = 0; i < win_ptr->num_slots; i++) {
642 643 644
        if (win_ptr->slots[i].target_list_head != NULL) {
            curr_target = win_ptr->slots[i].target_list_head;
            while (curr_target != NULL && curr_target->pending_op_list_head == NULL)
645
                curr_target = curr_target->next;
646 647
            if (curr_target != NULL)
                break;
648 649 650
        }
    }

651 652
    if (curr_target == NULL)
        goto fn_exit;
653 654 655 656 657 658 659 660 661 662 663 664 665

    if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH_LOCAL)
        curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH_LOCAL;

    /* Issue out all operations. */
    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, curr_target->target_rank,
                                                    &made_progress);
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);

    /* Wait for local completion. */
    do {
        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, curr_target,
666
                                                      &local_completed, &remote_completed);
667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
        if (!local_completed) {
            mpi_errno = wait_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
    } while (!local_completed);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Cleanup_target_aggressive
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/* Free up a target element by force-completing existing targets, and hand a
 * freshly allocated target element back through (*target).
 *
 * (*target) is set to NULL on entry.  The call fails with "**rmanotarget"
 * if the window has no non-empty slot.
 *
 * If the window state is LOCK_ALL_CALLED, the window first switches to the
 * window-wide protocol: for every other rank that lives on a different node
 * and has no target element yet, outstanding_locks is incremented and a
 * shared lock message is sent; the window state then becomes
 * LOCK_ALL_ISSUED.
 *
 * Then, repeatedly: take the first target of the first non-empty slot,
 * raise its sync flag to FLUSH (counting one more outstanding ACK), issue
 * its operations, wait until remote completion is reported, clean the
 * target up, and try MPIDI_CH3I_Win_target_alloc.  The loop ends when the
 * allocation returns a non-NULL element.
 *
 * Returns MPI_SUCCESS or the error of the failing helper. */
int MPIDI_CH3I_RMA_Cleanup_target_aggressive(MPID_Win * win_ptr, MPIDI_RMA_Target_t ** target)
{
    int i, local_completed = 0, remote_completed = 0;
    int made_progress = 0;
    MPIDI_RMA_Target_t *curr_target = NULL;
    int mpi_errno = MPI_SUCCESS;

    (*target) = NULL;

    /* If we are in an aggressive cleanup, the window must be holding
     * up resources.  If it isn't, we are in the wrong window and
     * incorrectly entered this function. */
    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER, "**rmanotarget");

    if (win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
        /* switch to window-wide protocol */
        MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, win_ptr->comm_ptr->rank, &orig_vc);
        for (i = 0; i < win_ptr->comm_ptr->local_size; i++) {
            if (i == win_ptr->comm_ptr->rank)
                continue;
            MPIDI_Comm_get_vc(win_ptr->comm_ptr, i, &target_vc);
            if (orig_vc->node_id != target_vc->node_id) {
                mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, i, &curr_target);
                if (mpi_errno)
                    MPIU_ERR_POP(mpi_errno);
                if (curr_target == NULL) {
                    /* no target element for this rank: lock it explicitly */
                    win_ptr->outstanding_locks++;
                    mpi_errno = send_lock_msg(i, MPI_LOCK_SHARED, win_ptr);
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);
                }
            }
        }
        win_ptr->states.access_state = MPIDI_RMA_LOCK_ALL_ISSUED;
    }

    do {
        /* find a non-empty slot and set the FLUSH flag on the first
         * target */
        /* TODO: we should think about better strategies on selecting the target */
        for (i = 0; i < win_ptr->num_slots; i++)
            if (win_ptr->slots[i].target_list_head != NULL)
                break;

        /* The entry check only covers the first iteration.  If no slot
         * holds a target any more, slots[i] would be read past the end of
         * the slot array, so report the condition instead. */
        MPIU_ERR_CHKANDJUMP(i == win_ptr->num_slots, mpi_errno, MPI_ERR_OTHER, "**rmanotarget");

        curr_target = win_ptr->slots[i].target_list_head;
        if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
            curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
            curr_target->sync.outstanding_acks++;
        }

        /* Issue out all operations. */
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, curr_target->target_rank,
                                                        &made_progress);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);

        /* Wait for remote completion. */
        do {
            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, curr_target,
                                                          &local_completed, &remote_completed);
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
            if (!remote_completed) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        } while (!remote_completed);

        /* Cleanup the target. */
        mpi_errno = MPIDI_CH3I_RMA_Cleanup_single_target(win_ptr, curr_target);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);

        /* check if we got a target */
        (*target) = MPIDI_CH3I_Win_target_alloc(win_ptr);

    } while ((*target) == NULL);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}


773 774 775 776 777 778
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Make_progress_target
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3I_RMA_Make_progress_target(MPID_Win * win_ptr, int target_rank, int *made_progress)
{
779 780
    int temp_progress = 0;
    MPIDI_RMA_Target_t *target = NULL;
781 782 783 784
    int mpi_errno = MPI_SUCCESS;

    (*made_progress) = 0;

785 786
    /* check window state */
    mpi_errno = check_window_state(win_ptr, &temp_progress);
787 788 789 790
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);
    if (temp_progress)
        (*made_progress) = 1;
791

792 793
    /* find target element */
    mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, target_rank, &target);
794 795
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);
796

797 798
    /* check target state */
    mpi_errno = check_target_state(win_ptr, target, &temp_progress);
799 800 801 802
    if (mpi_errno)
        MPIU_ERR_POP(mpi_errno);
    if (temp_progress)
        (*made_progress) = 1;
803

804 805
    /* issue operations to this target */
    mpi_errno = issue_ops_target(win_ptr, target, &temp_progress);
806 807 808 809
    if (mpi_errno)
        MPIU_ERR_POP(mpi_errno);
    if (temp_progress)
        (*made_progress) = 1;
810 811 812 813 814 815 816 817 818 819 820 821 822 823

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Make_progress_win
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/* Make one round of nonblocking progress on a whole window: check the
 * window state, then check and issue operations for all of its targets.
 *
 * (*made_progress) is set to 1 if either step reported progress,
 * 0 otherwise.  Returns MPI_SUCCESS or the error of the failing step. */
int MPIDI_CH3I_RMA_Make_progress_win(MPID_Win * win_ptr, int *made_progress)
{
    int mpi_errno = MPI_SUCCESS;
    int step_progress = 0;

    (*made_progress) = 0;

    /* Step 1: window state. */
    mpi_errno = check_window_state(win_ptr, &step_progress);
    if (mpi_errno != MPI_SUCCESS) {
        MPIU_ERR_POP(mpi_errno);
    }
    (*made_progress) |= (step_progress != 0);

    /* Step 2: issue operations on window. */
    mpi_errno = issue_ops_win(win_ptr, &step_progress);
    if (mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }
    (*made_progress) |= (step_progress != 0);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Make_progress_global
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3I_RMA_Make_progress_global(int *made_progress)
{
    MPIDI_RMA_Win_list_t *win_elem = MPIDI_RMA_Win_list;
    int mpi_errno = MPI_SUCCESS;

    (*made_progress) = 0;

    for (win_elem = MPIDI_RMA_Win_list; win_elem; win_elem = win_elem->next) {
862
        int temp_progress = 0;
863

864 865 866 867
        if (win_elem->win_ptr->states.access_state == MPIDI_RMA_NONE ||
            win_elem->win_ptr->states.access_state == MPIDI_RMA_FENCE_GRANTED ||
            win_elem->win_ptr->states.access_state == MPIDI_RMA_PSCW_GRANTED)
            continue;
868

869
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_elem->win_ptr, &temp_progress);
870 871 872 873
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
        if (temp_progress)
            (*made_progress) = 1;
874 875 876 877 878 879 880
    }

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}