/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidimpl.h"
#include "mpidrma.h"

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD
      category    : CH3
      type        : int
      default     : 2097152
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
         Threshold on the number of active requests that triggers
         a blocking wait in operation routines. When the value is
         negative, operation routines never block. When the value
         is zero, operation routines always block until the number
         of active requests becomes zero. When the value is
         positive, operation routines block until the number of
         active requests is reduced to this value.

    - name        : MPIR_CVAR_CH3_RMA_POKE_PROGRESS_REQ_THRESHOLD
      category    : CH3
      type        : int
      default     : 128
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
        Threshold at which the RMA implementation attempts to complete requests
        while completing RMA operations and while using the lazy synchronization
        approach.  Change this value if programs fail because they run out of
        requests or other internal resources.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

/* Forward declarations of the file-local helpers defined below. */
static inline int check_and_switch_target_state(MPID_Win * win_ptr, MPIDI_RMA_Target_t * target,
                                                int *is_able_to_issue, int *made_progress);
static inline int check_and_switch_window_state(MPID_Win * win_ptr, int *is_able_to_issue,
                                                int *made_progress);
static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * target,
                                   int *made_progress);

/* check if we can switch window-wide state: FENCE_ISSUED, PSCW_ISSUED, LOCK_ALL_ISSUED */
56
#undef FUNCNAME
57
#define FUNCNAME check_and_switch_window_state
58 59
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
60 61
static inline int check_and_switch_window_state(MPID_Win * win_ptr, int *is_able_to_issue,
                                                int *made_progress)
62
{
63
    MPID_Request *fence_req_ptr = NULL;
64
    int i, mpi_errno = MPI_SUCCESS;
65
    MPIDI_STATE_DECL(MPID_STATE_CHECK_AND_SWITCH_WINDOW_STATE);
66

67
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_CHECK_AND_SWITCH_WINDOW_STATE);
68 69

    (*made_progress) = 0;
70
    (*is_able_to_issue) = 0;
71

72 73
    switch (win_ptr->states.access_state) {
    case MPIDI_RMA_FENCE_ISSUED:
74 75 76
        MPID_Request_get_ptr(win_ptr->fence_sync_req, fence_req_ptr);
        if (MPID_Request_is_complete(fence_req_ptr)) {
            win_ptr->states.access_state = MPIDI_RMA_FENCE_GRANTED;
77 78
            (*is_able_to_issue) = 1;

79 80 81
            MPID_Request_release(fence_req_ptr);
            win_ptr->fence_sync_req = MPI_REQUEST_NULL;

82 83
            MPIDI_CH3I_num_active_issued_win--;
            MPIU_Assert(MPIDI_CH3I_num_active_issued_win >= 0);
84

85 86
            (*made_progress) = 1;
        }
87 88 89
        break;

    case MPIDI_RMA_PSCW_ISSUED:
90 91
        if (win_ptr->start_req == NULL) {
            /* for MPI_MODE_NOCHECK and all targets on SHM,
92
             * we do not create PSCW requests on window. */
93
            win_ptr->states.access_state = MPIDI_RMA_PSCW_GRANTED;
94
            (*is_able_to_issue) = 1;
95

96 97
            MPIDI_CH3I_num_active_issued_win--;
            MPIU_Assert(MPIDI_CH3I_num_active_issued_win >= 0);
98

99 100 101 102 103 104 105 106 107 108 109 110 111
            (*made_progress) = 1;
        }
        else {
            for (i = 0; i < win_ptr->start_grp_size; i++) {
                MPID_Request *start_req_ptr = NULL;
                if (win_ptr->start_req[i] == MPI_REQUEST_NULL)
                    continue;
                MPID_Request_get_ptr(win_ptr->start_req[i], start_req_ptr);
                if (MPID_Request_is_complete(start_req_ptr)) {
                    MPID_Request_release(start_req_ptr);
                    win_ptr->start_req[i] = MPI_REQUEST_NULL;
                }
                else {
112
                    break;
113 114 115
                }
            }

116 117
            if (i == win_ptr->start_grp_size) {
                win_ptr->states.access_state = MPIDI_RMA_PSCW_GRANTED;
118
                (*is_able_to_issue) = 1;
119

120 121
                MPIDI_CH3I_num_active_issued_win--;
                MPIU_Assert(MPIDI_CH3I_num_active_issued_win >= 0);
122

123 124 125 126 127
                (*made_progress) = 1;

                MPIU_Free(win_ptr->start_req);
                win_ptr->start_req = NULL;
            }
128
        }
129 130 131
        break;

    case MPIDI_RMA_LOCK_ALL_ISSUED:
132 133
        if (win_ptr->outstanding_locks == 0) {
            win_ptr->states.access_state = MPIDI_RMA_LOCK_ALL_GRANTED;
134 135
            (*is_able_to_issue) = 1;

136 137
            (*made_progress) = 1;
        }
138 139
        break;

140 141 142 143 144 145 146 147
    case MPIDI_RMA_PER_TARGET:
    case MPIDI_RMA_LOCK_ALL_CALLED:
    case MPIDI_RMA_FENCE_GRANTED:
    case MPIDI_RMA_PSCW_GRANTED:
    case MPIDI_RMA_LOCK_ALL_GRANTED:
        (*is_able_to_issue) = 1;
        break;

148 149
    default:
        break;
150
    }   /* end of switch */
151 152

  fn_exit:
153
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_CHECK_AND_SWITCH_WINDOW_STATE);
154 155 156 157 158 159 160 161 162
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
163
#define FUNCNAME check_and_switch_target_state
164 165
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
166
static inline int check_and_switch_target_state(MPID_Win * win_ptr, MPIDI_RMA_Target_t * target,
167
                                                int *is_able_to_issue, int *made_progress)
168 169 170 171 172
{
    int rank = win_ptr->comm_ptr->rank;
    int mpi_errno = MPI_SUCCESS;

    (*made_progress) = 0;
173
    (*is_able_to_issue) = 0;
174

175
    if (target == NULL)
176 177
        goto fn_exit;

178 179 180 181 182
    switch (target->access_state) {
    case MPIDI_RMA_LOCK_CALLED:
        if (target->sync.sync_flag == MPIDI_RMA_SYNC_NONE ||
            target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH_LOCAL ||
            target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
183 184
            if (target->pending_op_list_head == NULL ||
                !target->pending_op_list_head->piggyback_lock_candidate) {
185 186 187 188
                /* issue lock request */
                target->access_state = MPIDI_RMA_LOCK_ISSUED;
                if (target->target_rank == rank) {
                    mpi_errno = acquire_local_lock(win_ptr, target->lock_type);
189 190
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);
191 192
                }
                else {
193 194 195
                    mpi_errno = send_lock_msg(target->target_rank, target->lock_type, win_ptr);
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);
196 197 198 199 200 201
                }

                (*made_progress) = 1;
            }
        }
        else if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
202
            if (target->pending_op_list_head == NULL) {
203
                /* No RMA operation has ever been posted to this target,
204 205
                 * finish issuing, no need to acquire the lock. Cleanup
                 * function will clean it up. */
206 207 208 209 210 211 212 213 214
                target->access_state = MPIDI_RMA_LOCK_GRANTED;

                /* We are done with ending synchronization, unset target's sync_flag. */
                target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;

                (*made_progress) = 1;
            }
            else {
                /* if we reach WIN_UNLOCK and there is still operation existing
215 216
                 * in pending list, this operation must be the only operation
                 * and it is prepared to piggyback LOCK and UNLOCK. */
217
                MPIU_Assert(MPIR_CVAR_CH3_RMA_DELAY_ISSUING_FOR_PIGGYBACKING);
218 219
                MPIU_Assert(target->pending_op_list_head->next == NULL);
                MPIU_Assert(target->pending_op_list_head->piggyback_lock_candidate);
220 221 222 223 224 225
            }
        }
        break;

    case MPIDI_RMA_LOCK_GRANTED:
    case MPIDI_RMA_NONE:
226 227 228 229 230 231 232
        if (target->win_complete_flag) {
            if (target->pending_op_list_head == NULL) {
                mpi_errno = send_decr_at_cnt_msg(target->target_rank, win_ptr);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        }
233
        if (target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
234
            if (target->pending_op_list_head == NULL) {
235
                if (target->target_rank != rank) {
236
                    if (target->put_acc_issued) {
237 238 239

                        target->sync.outstanding_acks++;

240
                        mpi_errno = send_flush_msg(target->target_rank, win_ptr);
241 242
                        if (mpi_errno != MPI_SUCCESS)
                            MPIU_ERR_POP(mpi_errno);
243 244
                    }
                }
245 246 247 248 249

                /* We are done with ending synchronization, unset target's sync_flag. */
                target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;

                (*made_progress) = 1;
250
            }
251 252
        }
        else if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
253
            if (target->pending_op_list_head == NULL) {
254 255
                if (target->target_rank == rank) {
                    mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
256 257
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);
258
                }
259 260 261 262
                else {
                    MPIDI_CH3_Pkt_flags_t flag = MPIDI_CH3_PKT_FLAG_NONE;
                    if (!target->put_acc_issued) {
                        /* We did not issue PUT/ACC since the last
263 264
                         * synchronization call, therefore here we
                         * don't need ACK back */
265 266 267

                        flag = MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_NO_ACK;
                    }
268 269 270
                    else {
                        target->sync.outstanding_acks++;
                    }
271
                    mpi_errno = send_unlock_msg(target->target_rank, win_ptr, flag);
272 273
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);
274 275 276 277 278
                }

                /* We are done with ending synchronization, unset target's sync_flag. */
                target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;

279 280 281
                (*made_progress) = 1;
            }
        }
282
        break;
283

284 285
    default:
        break;
286
    }   /* end of switch */
287

288
    if (target->access_state != MPIDI_RMA_LOCK_ISSUED) {
289 290 291
        (*is_able_to_issue) = 1;
    }

292
  fn_exit:
293
    return mpi_errno;
294
  fn_fail:
295 296
    goto fn_exit;
}
297 298


299 300 301 302
#undef FUNCNAME
#define FUNCNAME issue_ops_target
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
303
static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * target,
304 305 306 307
                                   int *made_progress)
{
    MPIDI_RMA_Op_t *curr_op = NULL;
    MPIDI_CH3_Pkt_flags_t flags;
308
    int is_able_to_issue = 0;
309
    int first_op = 1, mpi_errno = MPI_SUCCESS;
310

311
    (*made_progress) = 0;
312

313 314 315 316 317 318 319
    /* check and try to switch target state */
    mpi_errno = check_and_switch_target_state(win_ptr, target, &is_able_to_issue, made_progress);
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);
    if (!is_able_to_issue)
        goto fn_exit;

320
    if (win_ptr->non_empty_slots == 0 || target == NULL || target->pending_op_list_head == NULL)
321
        goto fn_exit;
322

323 324 325 326
    /* Issue out operations in the list. */
    curr_op = target->next_op_to_issue;
    while (curr_op != NULL) {

327 328
        if (target->access_state == MPIDI_RMA_LOCK_ISSUED) {
            /* It is possible that the previous OP+LOCK changes
329
             * lock state to LOCK_ISSUED. */
330 331
            break;
        }
332

333
        if (MPIR_CVAR_CH3_RMA_DELAY_ISSUING_FOR_PIGGYBACKING && curr_op->next == NULL &&
334
            target->sync.sync_flag == MPIDI_RMA_SYNC_NONE && curr_op->ureq == NULL) {
335 336
            /* If DELAY_ISSUING_FOR_PIGGYBACKING is turned on,
             * skip the last OP if sync_flag is NONE since we
337
             * want to leave it to the ending synchronization
338
             * so that we can piggyback UNLOCK / FLUSH.
339 340 341
             * However, if it is a request-based RMA, do not
             * skip it (otherwise a wait call before unlock
             * will be blocked). */
342
            break;
343 344
        }

345 346
        flags = MPIDI_CH3_PKT_FLAG_NONE;

347 348 349 350
        if (first_op) {
            /* piggyback on first OP. */
            if (target->access_state == MPIDI_RMA_LOCK_CALLED) {
                MPIU_Assert(curr_op->piggyback_lock_candidate);
351 352 353 354 355 356
                if (target->lock_type == MPI_LOCK_SHARED)
                    flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED;
                else {
                    MPIU_Assert(target->lock_type == MPI_LOCK_EXCLUSIVE);
                    flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE;
                }
357 358 359 360 361
                target->access_state = MPIDI_RMA_LOCK_ISSUED;
            }
            first_op = 0;
        }

362 363 364 365
        /* piggyback FLUSH on every OP if ordered flush is not guaranteed. */
        if (!MPIDI_CH3U_Win_pkt_orderings.am_flush_ordered)
            flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH;

366 367 368 369 370 371 372
        if (curr_op->next == NULL) {
            /* piggyback on last OP. */
            if (target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
                flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH;
            }
            else if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
                flags |= MPIDI_CH3_PKT_FLAG_RMA_UNLOCK;
373 374 375 376

                /* if piggyback UNLOCK then unset FLUSH (set for every
                 * operation on out-of-order network). */
                flags &= ~MPIDI_CH3_PKT_FLAG_RMA_FLUSH;
377
            }
378 379
            if (target->win_complete_flag)
                flags |= MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER;
380 381
        }

382 383 384 385 386 387
        /* only increase ack counter when FLUSH or UNLOCK flag is set,
         * but without LOCK piggyback. */
        if (((flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
             || (flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK)))
            target->sync.outstanding_acks++;

388 389 390 391
        mpi_errno = issue_rma_op(curr_op, win_ptr, target, flags);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);

392 393
        (*made_progress) = 1;

394
        if (curr_op->pkt.type == MPIDI_CH3_PKT_PUT ||
395 396 397
            curr_op->pkt.type == MPIDI_CH3_PKT_PUT_IMMED ||
            curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE ||
            curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE_IMMED) {
398
            target->put_acc_issued = 1; /* set PUT_ACC_FLAG when sending
399
                                         * PUT/ACC operation. */
400 401
        }

402
        if ((curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE ||
403 404
             curr_op->pkt.type == MPIDI_CH3_PKT_GET_ACCUM) &&
            curr_op->issued_stream_count != ALL_STREAM_UNITS_ISSUED) {
405 406 407 408 409 410 411 412
            /* For ACC-like operations, if not all stream units
             * are issued out, we stick to the current operation,
             * otherwise we move on to the next operation. */
            target->next_op_to_issue = curr_op;
        }
        else
            target->next_op_to_issue = curr_op->next;

413 414 415 416 417 418 419
        if (target->next_op_to_issue == NULL) {
            if (flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH || flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK) {
                /* We are done with ending sync, unset target's sync_flag. */
                target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;
            }
        }

420 421
        if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED ||
            flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE) {
422
            /* If this operation is piggybacked with LOCK,
423 424 425
             * do not move it out of pending list, and do
             * not complete the user request, because we
             * may need to re-transmit it. */
426 427 428
            break;
        }

429 430 431 432 433
        if (curr_op->ureq != NULL) {
            mpi_errno = set_user_req_after_issuing_op(curr_op);
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
434

435
        if (curr_op->reqs_size == 0) {
436
            MPIU_Assert(curr_op->single_req == NULL && curr_op->multi_reqs == NULL);
437
            /* Sending is completed immediately. */
438
            MPIDI_CH3I_RMA_Ops_free_elem(win_ptr, &(target->pending_op_list_head),
439 440 441
                                         &(target->pending_op_list_tail), curr_op);
        }
        else {
442 443 444
            MPI_Datatype target_datatype;
            int is_derived = FALSE;

445
            /* Sending is not completed immediately. */
446

447
            MPIDI_CH3I_RMA_Ops_unlink(&(target->pending_op_list_head),
448
                                      &(target->pending_op_list_tail), curr_op);
449 450 451 452 453 454 455 456 457 458 459 460 461

            MPIDI_CH3_PKT_RMA_GET_TARGET_DATATYPE(curr_op->pkt, target_datatype, mpi_errno);

            if ((target_datatype != MPI_DATATYPE_NULL &&
                 !MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) ||
                (curr_op->origin_datatype != MPI_DATATYPE_NULL &&
                 !MPIR_DATATYPE_IS_PREDEFINED(curr_op->origin_datatype)) ||
                (curr_op->result_datatype != MPI_DATATYPE_NULL &&
                 !MPIR_DATATYPE_IS_PREDEFINED(curr_op->result_datatype))) {
                is_derived = TRUE;
            }

            if (is_derived) {
462 463
                MPIDI_CH3I_RMA_Ops_append(&(target->issued_dt_op_list_head),
                                          &(target->issued_dt_op_list_tail), curr_op);
464 465
            }
            else if (curr_op->pkt.type == MPIDI_CH3_PKT_PUT ||
466 467 468
                     curr_op->pkt.type == MPIDI_CH3_PKT_PUT_IMMED ||
                     curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE ||
                     curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE_IMMED) {
469 470
                MPIDI_CH3I_RMA_Ops_append(&(target->issued_write_op_list_head),
                                          &(target->issued_write_op_list_tail), curr_op);
471 472
            }
            else {
473 474
                MPIDI_CH3I_RMA_Ops_append(&(target->issued_read_op_list_head),
                                          &(target->issued_read_op_list_tail), curr_op);
475
            }
476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491


            /* Poke the progress engine when next_op_to_issue is not the current OP, in
             * order to make sure the issuing function is re-entrant safe. */
            if (target->next_op_to_issue != curr_op &&
                win_ptr->active_req_cnt > MPIR_CVAR_CH3_RMA_POKE_PROGRESS_REQ_THRESHOLD) {
                int local_completed, remote_completed;
                mpi_errno = poke_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);

                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
                                                           &remote_completed);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
492 493 494 495
        }

        curr_op = target->next_op_to_issue;

496
    }   /* end of while loop */
497 498 499 500 501 502 503 504

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}


505 506 507 508 509 510 511 512
/*
 * MPIDI_CH3I_RMA_Free_ops_before_completion - release the requests and free
 * the operation elements on the issued read and write lists of one target
 * of the window.
 *
 *   win_ptr - window to reclaim operation elements from. It is an error
 *             ("**rmanoop") to call this with win_ptr->non_empty_slots == 0.
 *
 * The function first makes one nonblocking progress pass on the window and
 * returns without freeing anything while the window is still in
 * FENCE_ISSUED, PSCW_ISSUED or LOCK_ALL_ISSUED state. The chosen target is
 * marked with sync.upgrade_flush_local = 1.
 *
 * Returns mpi_errno.
 */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Free_ops_before_completion
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3I_RMA_Free_ops_before_completion(MPID_Win * win_ptr)
{
    MPIDI_RMA_Op_t *curr_op = NULL;
    MPIDI_RMA_Target_t *curr_target = NULL;
    MPIDI_RMA_Op_t **op_list_head = NULL, **op_list_tail = NULL;
    int read_flag = 0;
    int i, made_progress = 0;
    int mpi_errno = MPI_SUCCESS;

    /* If we are in free_ops_before_completion, the window must be holding
     * up resources.  If it isn't, we are in the wrong window and
     * incorrectly entered this function. */
    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER, "**rmanoop");

    /* make nonblocking progress once */
    mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);

    if (win_ptr->states.access_state == MPIDI_RMA_FENCE_ISSUED ||
        win_ptr->states.access_state == MPIDI_RMA_PSCW_ISSUED ||
        win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_ISSUED)
        goto fn_exit;

    /* find targets that have operations */
    for (i = 0; i < win_ptr->num_slots; i++) {
        if (win_ptr->slots[i].target_list_head != NULL) {
            curr_target = win_ptr->slots[i].target_list_head;
            while (curr_target != NULL) {
                if (curr_target->issued_read_op_list_head != NULL ||
                    curr_target->issued_write_op_list_head != NULL) {
                    /* In the per-target states the target is only picked
                     * when its lock is granted; in the other window
                     * states any target with issued ops is picked. */
                    if (win_ptr->states.access_state == MPIDI_RMA_PER_TARGET ||
                        win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
                        if (curr_target->access_state == MPIDI_RMA_LOCK_GRANTED)
                            break;
                    }
                    else {
                        break;
                    }
                }
                curr_target = curr_target->next;
            }
            if (curr_target != NULL)
                break;
        }
    }

    if (curr_target == NULL)
        goto fn_exit;

    /* After we do this, all following Win_flush_local
     * must do a Win_flush instead. */
    curr_target->sync.upgrade_flush_local = 1;

    /* Start with the issued read list; the write list is handled once
     * the read list is empty. */
    op_list_head = &curr_target->issued_read_op_list_head;
    op_list_tail = &curr_target->issued_read_op_list_tail;
    read_flag = 1;

    curr_op = *op_list_head;

    /* free all ops in the list since we do not need to maintain them anymore */
    while (1) {
        if (curr_op != NULL) {
            if (curr_op->reqs_size == 1) {
                MPIU_Assert(curr_op->single_req != NULL);
                MPID_Request_release(curr_op->single_req);
                curr_op->single_req = NULL;
                win_ptr->active_req_cnt--;
                curr_op->reqs_size = 0;
            }
            else if (curr_op->reqs_size > 1) {
                MPIU_Assert(curr_op->multi_reqs != NULL);
                for (i = 0; i < curr_op->reqs_size; i++) {
                    if (curr_op->multi_reqs[i] != NULL) {
                        MPID_Request_release(curr_op->multi_reqs[i]);
                        curr_op->multi_reqs[i] = NULL;
                        win_ptr->active_req_cnt--;
                    }
                }

                /* free req array in this op */
                MPIU_Free(curr_op->multi_reqs);
                curr_op->multi_reqs = NULL;
                curr_op->reqs_size = 0;
            }
            MPIDI_CH3I_RMA_Ops_free_elem(win_ptr, op_list_head, op_list_tail, curr_op);
        }
        else {
            if (read_flag == 1) {
                /* read list is exhausted, switch to the write list. */
                op_list_head = &curr_target->issued_write_op_list_head;
                op_list_tail = &curr_target->issued_write_op_list_tail;
                read_flag = 0;
            }
            else {
                /* we reach the tail of write_op_list, break out. */
                break;
            }
        }
        curr_op = *op_list_head;
    }

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}


/*
 * MPIDI_CH3I_RMA_Cleanup_ops_aggressive - pick the first target of the
 * window that has pending operations, issue them, and block until they are
 * locally complete.
 *
 *   win_ptr - window to clean up. It is an error ("**rmanoop") to call this
 *             with win_ptr->non_empty_slots == 0.
 *
 * The target's sync_flag is raised to at least MPIDI_RMA_SYNC_FLUSH_LOCAL
 * before issuing. If no target has pending operations the function returns
 * without doing anything.
 *
 * Returns mpi_errno.
 */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Cleanup_ops_aggressive
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3I_RMA_Cleanup_ops_aggressive(MPID_Win * win_ptr)
{
    int i, local_completed = 0, remote_completed ATTRIBUTE((unused)) = 0;
    int mpi_errno = MPI_SUCCESS;
    MPIDI_RMA_Target_t *curr_target = NULL;
    int made_progress = 0;

    /* If we are in an aggressive cleanup, the window must be holding
     * up resources.  If it isn't, we are in the wrong window and
     * incorrectly entered this function. */
    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER, "**rmanoop");

    /* find the first target that has something to issue */
    for (i = 0; i < win_ptr->num_slots; i++) {
        if (win_ptr->slots[i].target_list_head != NULL) {
            curr_target = win_ptr->slots[i].target_list_head;
            while (curr_target != NULL && curr_target->pending_op_list_head == NULL)
                curr_target = curr_target->next;
            if (curr_target != NULL)
                break;
        }
    }

    if (curr_target == NULL)
        goto fn_exit;

    if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH_LOCAL)
        curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH_LOCAL;

    /* Issue out all operations. */
    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, curr_target->target_rank,
                                                    &made_progress);
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);

    /* Wait for local completion. */
    do {
        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, curr_target);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);

        MPIDI_CH3I_RMA_ops_completion(win_ptr, curr_target, local_completed, remote_completed);

        if (!local_completed) {
            mpi_errno = wait_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
        }
    } while (!local_completed);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}


/*
 * MPIDI_CH3I_RMA_Cleanup_target_aggressive - free target elements of the
 * window, one at a time, until a new target element can be allocated.
 *
 *   win_ptr - window to clean up. It is an error ("**rmanotarget") to call
 *             this with win_ptr->non_empty_slots == 0.
 *   target  - (out) set to NULL on entry and to the newly allocated target
 *             element on success.
 *
 * For each pass: the first target of the first non-empty slot gets its
 * sync_flag raised to at least MPIDI_RMA_SYNC_FLUSH, its operations are
 * issued, the function blocks until they are remotely complete, and the
 * target is dequeued and freed. If the window is in LOCK_ALL_CALLED state it
 * is first switched to LOCK_ALL_ISSUED, sending a shared lock request to
 * every rank on another node that has no target element yet.
 *
 * Returns mpi_errno.
 */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Cleanup_target_aggressive
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3I_RMA_Cleanup_target_aggressive(MPID_Win * win_ptr, MPIDI_RMA_Target_t ** target)
{
    int i, local_completed ATTRIBUTE((unused)) = 0, remote_completed = 0;
    int made_progress = 0;
    MPIDI_RMA_Target_t *curr_target = NULL;
    int mpi_errno = MPI_SUCCESS;

    (*target) = NULL;

    /* If we are in an aggressive cleanup, the window must be holding
     * up resources.  If it isn't, we are in the wrong window and
     * incorrectly entered this function. */
    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER, "**rmanotarget");

    if (win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
        /* switch to window-wide protocol */
        MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, win_ptr->comm_ptr->rank, &orig_vc);
        for (i = 0; i < win_ptr->comm_ptr->local_size; i++) {
            if (i == win_ptr->comm_ptr->rank)
                continue;
            MPIDI_Comm_get_vc(win_ptr->comm_ptr, i, &target_vc);
            if (orig_vc->node_id != target_vc->node_id) {
                mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, i, &curr_target);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
                if (curr_target == NULL) {
                    win_ptr->outstanding_locks++;
                    mpi_errno = send_lock_msg(i, MPI_LOCK_SHARED, win_ptr);
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);
                }
            }
        }
        win_ptr->states.access_state = MPIDI_RMA_LOCK_ALL_ISSUED;
    }

    do {
        /* find a non-empty slot and set the FLUSH flag on the first
         * target */
        /* TODO: we should think about better strategies on selecting the target */
        for (i = 0; i < win_ptr->num_slots; i++) {
            if (win_ptr->slots[i].target_list_head != NULL)
                break;
        }

        /* The entry check only covers the first pass.  If every slot has
         * been emptied by earlier passes, fail instead of reading
         * slots[num_slots] and dereferencing whatever is found there. */
        MPIU_ERR_CHKANDJUMP(i == win_ptr->num_slots, mpi_errno, MPI_ERR_OTHER, "**rmanotarget");

        curr_target = win_ptr->slots[i].target_list_head;
        if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
            curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
        }

        /* Issue out all operations. */
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, curr_target->target_rank,
                                                        &made_progress);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);

        /* Wait for remote completion. */
        do {
            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, curr_target);
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);

            MPIDI_CH3I_RMA_ops_completion(win_ptr, curr_target, local_completed, remote_completed);

            if (!remote_completed) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        } while (!remote_completed);

        /* Cleanup the target. */
        mpi_errno = MPIDI_CH3I_Win_target_dequeue_and_free(win_ptr, curr_target);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);

        /* check if we got a target */
        (*target) = MPIDI_CH3I_Win_target_alloc(win_ptr);

    } while ((*target) == NULL);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Make_progress_target
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3I_RMA_Make_progress_target(MPID_Win * win_ptr, int target_rank, int *made_progress)
{
775
    int temp_progress = 0;
776
    int is_able_to_issue = 0;
777
    MPIDI_RMA_Target_t *target = NULL;
778 779 780 781
    int mpi_errno = MPI_SUCCESS;

    (*made_progress) = 0;

782
    /* check window state */
783
    mpi_errno = check_and_switch_window_state(win_ptr, &is_able_to_issue, &temp_progress);
784 785 786 787
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);
    if (temp_progress)
        (*made_progress) = 1;
788 789
    if (!is_able_to_issue)
        goto fn_exit;
790

791 792
    /* find target element */
    mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, target_rank, &target);
793 794
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);
795

796 797
    /* issue operations to this target */
    mpi_errno = issue_ops_target(win_ptr, target, &temp_progress);
798 799 800 801
    if (mpi_errno)
        MPIU_ERR_POP(mpi_errno);
    if (temp_progress)
        (*made_progress) = 1;
802 803 804 805 806 807 808 809 810 811 812 813 814 815 816

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Make_progress_win
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * Try to make RMA progress on every target currently queued on a window.
 *
 * The window state is checked (and possibly switched) first.  If the window
 * is not able to issue, or it has no non-empty slots, nothing is issued.
 * Otherwise every slot is visited exactly once, starting at slot
 * (rank of this process in the window's communicator) % num_slots and
 * wrapping around, and issue_ops_target() is called on each target in the
 * slot's list.
 *
 *   win_ptr       - window to work on
 *   made_progress - (out) reset to 0 on entry, then passed directly to
 *                   check_and_switch_window_state(); additionally set to 1
 *                   when issuing to any target reports progress
 *
 * Returns mpi_errno: MPI_SUCCESS when every step succeeded, otherwise the
 * error code left in mpi_errno by the failing step (error paths go through
 * MPIU_ERR_POP, which is expected to transfer control to fn_fail).
 */
int MPIDI_CH3I_RMA_Make_progress_win(MPID_Win * win_ptr, int *made_progress)
{
    int mpi_errno = MPI_SUCCESS;
    int start_slot, end_slot, i, idx;
    int is_able_to_issue = 0;
    MPIDI_RMA_Target_t *target = NULL;

    (*made_progress) = 0;

    /* check and try to switch window state */
    mpi_errno = check_and_switch_window_state(win_ptr, &is_able_to_issue, made_progress);
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);
    if (!is_able_to_issue)
        goto fn_exit;

    /* no slot holds a target, so there is nothing to issue */
    if (win_ptr->non_empty_slots == 0)
        goto fn_exit;

    /* FIXME: we should optimize the issuing pattern here. */

    /* Walk all slots once; i runs over [start_slot, start_slot + num_slots)
     * and idx is i wrapped back into [0, num_slots). */
    start_slot = win_ptr->comm_ptr->rank % win_ptr->num_slots;
    end_slot = start_slot + win_ptr->num_slots;
    for (i = start_slot; i < end_slot; i++) {
        if (i < win_ptr->num_slots)
            idx = i;
        else
            idx = i - win_ptr->num_slots;

        target = win_ptr->slots[idx].target_list_head;
        while (target != NULL) {
            int temp_progress = 0;

            /* issue operations to this target */
            mpi_errno = issue_ops_target(win_ptr, target, &temp_progress);
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
            if (temp_progress)
                (*made_progress) = 1;

            target = target->next;
        }
    }

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Make_progress_global
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3I_RMA_Make_progress_global(int *made_progress)
{
    MPIDI_RMA_Win_list_t *win_elem = MPIDI_RMA_Win_list;
    int mpi_errno = MPI_SUCCESS;

    (*made_progress) = 0;

876 877 878
    if (MPIDI_CH3I_num_active_issued_win == 0 && MPIDI_CH3I_num_passive_win == 0)
        goto fn_exit;

879
    for (win_elem = MPIDI_RMA_Win_list; win_elem; win_elem = win_elem->next) {
880
        int temp_progress = 0;
881

882 883 884 885
        if (win_elem->win_ptr->states.access_state == MPIDI_RMA_NONE ||
            win_elem->win_ptr->states.access_state == MPIDI_RMA_FENCE_GRANTED ||
            win_elem->win_ptr->states.access_state == MPIDI_RMA_PSCW_GRANTED)
            continue;
886

887
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_elem->win_ptr, &temp_progress);
888 889 890 891
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);
        if (temp_progress)
            (*made_progress) = 1;
892 893 894 895 896 897 898
    }

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}