/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidimpl.h"
#include "mpidrma.h"

static int create_derived_datatype(MPID_Request * rreq, MPID_Datatype ** dtp);

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3U_Handle_recv_req
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3U_Handle_recv_req(MPIDI_VC_t * vc, MPID_Request * rreq, 
			       int * complete)
{
    static int in_routine ATTRIBUTE((unused)) = FALSE;
    int mpi_errno = MPI_SUCCESS;
    int (*reqFn)(MPIDI_VC_t *, MPID_Request *, int *);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_HANDLE_RECV_REQ);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_HANDLE_RECV_REQ);

    MPIU_Assert(in_routine == FALSE);
    in_routine = TRUE;

    reqFn = rreq->dev.OnDataAvail;
    if (!reqFn) {
	MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_RECV);
	MPIDI_CH3U_Request_complete(rreq);
	*complete = TRUE;
    }
    else {
        mpi_errno = reqFn( vc, rreq, complete );
    }

    in_routine = FALSE;
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_HANDLE_RECV_REQ);
    return mpi_errno;
}
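
/* Illustration only (hypothetical channel-side code, not part of this
 * file): a channel's progress engine typically drives the dispatcher above
 * once the data described by the request's IOV has been received.
 */
#if 0
{
    int complete, mpi_errno;
    mpi_errno = MPIDI_CH3U_Handle_recv_req(vc, rreq, &complete);
    if (mpi_errno != MPI_SUCCESS) { /* handle the error */ }
    if (!complete) {
        /* the request reloaded its IOV; post a receive for the next chunk */
    }
}
#endif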

/* ----------------------------------------------------------------------- */
/* Here are the functions that implement the actions that are taken when
 * data is available for a receive request (or for other completion
 * operations).  These include "receive" requests that are part of the RMA
 * implementation.
 *
 * The convention for the names of routines that are called when data is
 * available is
 *    MPIDI_CH3_ReqHandler_<type>( MPIDI_VC_t *, MPID_Request *, int * )
 * as in 
 *    MPIDI_CH3_ReqHandler_...
 *
 * ToDo: 
 *    We need a way for each of these functions to describe what they are,
 *    so that given a pointer to one of these functions, we can retrieve
 *    a description of the routine.  We may want to use a static string 
 *    and require the user to maintain thread-safety, at least while
 *    accessing the string.
 */
/* ----------------------------------------------------------------------- */
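
/* Illustration only (hypothetical handler, not part of the CH3
 * implementation): the minimal shape of a request handler following the
 * naming convention described above.  Real handlers follow below.
 */
#if 0
static int MPIDI_CH3_ReqHandler_Example( MPIDI_VC_t *vc ATTRIBUTE((unused)),
                                         MPID_Request *rreq,
                                         int *complete )
{
    /* consume the data that just arrived for rreq ... */

    /* ... then mark the transfer as finished and tell the progress engine */
    MPIDI_CH3U_Request_complete(rreq);  /* decrements the completion counter */
    *complete = TRUE;
    return MPI_SUCCESS;
}
#endif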
int MPIDI_CH3_ReqHandler_RecvComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
                                       MPID_Request *rreq,
                                       int *complete )
{
    /* mark data transfer as complete and decrement CC */
    MPIDI_CH3U_Request_complete(rreq);
    *complete = TRUE;
    return MPI_SUCCESS;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_PutRecvComplete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_PutRecvComplete( MPIDI_VC_t *vc,
                                          MPID_Request *rreq,
                                          int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Win *win_ptr;
    MPI_Win source_win_handle = rreq->dev.source_win_handle;
    MPIDI_CH3_Pkt_flags_t flags = rreq->dev.flags;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTRECVCOMPLETE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTRECVCOMPLETE);

    /* NOTE: It is possible that this request is already completed before
       entering this handler; this happens when this handler is invoked
       recursively on the same request.
       Consider this case: the request is queued in the SHM queue with a
       ref count of 2: one reference for completing the request and one for
       dequeueing it.  The first invocation of this handler completes the
       request and decrements the ref count to 1; the request is still in
       the queue.  Within that invocation we may call the handler on the
       same request a second time (for example, while making progress on
       the SHM queue), and the second invocation would also try to complete
       the request, leading to incorrect execution.
       Here we check whether the request is already completed to avoid
       processing the same request twice. */
    if (MPID_Request_is_complete(rreq)) {
        *complete = FALSE;
        goto fn_exit;
    }

    MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);

    /* mark data transfer as complete and decrement CC */
    MPIDI_CH3U_Request_complete(rreq);

    /* NOTE: finish_op_on_target() must be called after we complete this request,
       because inside finish_op_on_target() we may call this request handler
       on the same request again (in release_lock()). Marking this request as
       completed will prevent us from processing the same request twice. */
    mpi_errno = finish_op_on_target(win_ptr, vc, MPIDI_CH3_PKT_PUT,
                                    flags, source_win_handle);
    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

    *complete = TRUE;

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTRECVCOMPLETE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
 fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_AccumRecvComplete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_AccumRecvComplete( MPIDI_VC_t *vc,
                                            MPID_Request *rreq,
                                            int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPI_Aint true_lb, true_extent;
    MPID_Win *win_ptr;
    MPI_Win source_win_handle = rreq->dev.source_win_handle;
    MPIDI_CH3_Pkt_flags_t flags = rreq->dev.flags;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMRECVCOMPLETE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMRECVCOMPLETE);

    /* NOTE: It is possible that this request is already completed before
       entering this handler; this happens when this handler is invoked
       recursively on the same request.
       Consider this case: the request is queued in the SHM queue with a
       ref count of 2: one reference for completing the request and one for
       dequeueing it.  The first invocation of this handler completes the
       request and decrements the ref count to 1; the request is still in
       the queue.  Within that invocation we may call the handler on the
       same request a second time (for example, while making progress on
       the SHM queue), and the second invocation would also try to complete
       the request, leading to incorrect execution.
       Here we check whether the request is already completed to avoid
       processing the same request twice. */
    if (MPID_Request_is_complete(rreq)) {
        *complete = FALSE;
        goto fn_exit;
    }

    MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);

    MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RESP);

    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
    /* accumulate data from tmp_buf into user_buf */
    mpi_errno = do_accumulate_op(rreq->dev.final_user_buf, rreq->dev.real_user_buf,
                                 rreq->dev.user_count, rreq->dev.datatype, rreq->dev.op);
    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
    if (mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    /* free the temporary buffer */
    MPIR_Type_get_true_extent_impl(rreq->dev.datatype, &true_lb, &true_extent);
    MPIU_Free((char *) rreq->dev.final_user_buf + true_lb);

    /* mark data transfer as complete and decrement CC */
    MPIDI_CH3U_Request_complete(rreq);

    /* NOTE: finish_op_on_target() must be called after we complete this request,
       because inside finish_op_on_target() we may call this request handler
       on the same request again (in release_lock()). Marking this request as
       completed will prevent us from processing the same request twice. */
    mpi_errno = finish_op_on_target(win_ptr, vc, MPIDI_CH3_PKT_ACCUMULATE,
                                    flags, source_win_handle);
    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

    *complete = TRUE;

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMRECVCOMPLETE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
 fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_GaccumRecvComplete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_GaccumRecvComplete( MPIDI_VC_t *vc,
                                             MPID_Request *rreq,
                                             int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Win *win_ptr;
    MPI_Aint type_size;
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_get_accum_resp_t *get_accum_resp_pkt = &upkt.get_accum_resp;
    MPID_Request *resp_req;
    MPID_IOV iov[MPID_IOV_LIMIT];
    MPI_Aint true_lb, true_extent;
    size_t len;
    int iovcnt;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMRECVCOMPLETE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMRECVCOMPLETE);

    MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);

    MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP);
    get_accum_resp_pkt->request_handle = rreq->dev.resp_request_handle;
    get_accum_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
    get_accum_resp_pkt->source_win_handle = rreq->dev.source_win_handle;
    get_accum_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK)
        get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
    if ((rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
        (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
        get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
    get_accum_resp_pkt->immed_len = 0;

    MPID_Datatype_get_size_macro(rreq->dev.datatype, type_size);

    /* Copy data into a temporary buffer */
    resp_req = MPID_Request_create();
    MPIU_ERR_CHKANDJUMP(resp_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
    MPIU_Object_set_ref(resp_req, 1);

    MPIU_CHKPMEM_MALLOC(resp_req->dev.user_buf, void *, rreq->dev.user_count * type_size,
                        mpi_errno, "GACC resp. buffer");

    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);

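    /* Fetch the current target data before accumulating: a contiguous copy
       suffices for a basic datatype, while a derived datatype is packed
       through a segment. */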
    if (MPIR_DATATYPE_IS_PREDEFINED(rreq->dev.datatype)) {
        MPIU_Memcpy(resp_req->dev.user_buf, rreq->dev.real_user_buf,
                    rreq->dev.user_count * type_size);
    } else {
        MPID_Segment *seg = MPID_Segment_alloc();
        MPI_Aint last = type_size * rreq->dev.user_count;

        MPIU_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment");
        MPID_Segment_init(rreq->dev.real_user_buf, rreq->dev.user_count, rreq->dev.datatype, seg, 0);
        MPID_Segment_pack(seg, 0, &last, resp_req->dev.user_buf);
        MPID_Segment_free(seg);
    }

    /* accumulate data from tmp_buf into user_buf */
    mpi_errno = do_accumulate_op(rreq->dev.final_user_buf, rreq->dev.real_user_buf,
                                 rreq->dev.user_count, rreq->dev.datatype, rreq->dev.op);
    if (mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);

    resp_req->dev.OnFinal = MPIDI_CH3_ReqHandler_GaccumSendComplete;
    resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumSendComplete;
    resp_req->dev.target_win_handle = rreq->dev.target_win_handle;
    resp_req->dev.flags = rreq->dev.flags;

    /* here we increment the Active Target counter to guarantee that
       GET-like operations are completed when the counter reaches zero. */
    win_ptr->at_completion_counter++;

    /* length of target data */
    MPIU_Assign_trunc(len, rreq->dev.user_count * type_size, size_t);

    /* both the origin and target buffers hold basic datatypes;
       fill the IMMED data area in the response packet header. */
    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
        /* Try to copy target data into packet header. */
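        /* (MPIDI_RMA_IMMED_BYTES / type_size) * type_size rounds the
           immediate-data capacity down to a whole number of elements. */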
        MPIU_Assign_trunc(get_accum_resp_pkt->immed_len,
                          MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / type_size) * type_size),
                          int);

        if (get_accum_resp_pkt->immed_len > 0) {
            void *src = resp_req->dev.user_buf;
            void *dest = (void*) get_accum_resp_pkt->data;
            /* copy data from origin buffer to immed area in packet header */
            mpi_errno = immed_copy(src, dest, (size_t)get_accum_resp_pkt->immed_len);
            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
        }
    }

    if (len == (size_t)get_accum_resp_pkt->immed_len) {
        /* All response data fits in the packet header; issue just the header. */
        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
        iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
        iovcnt = 1;
    }
    else {
        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
        iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *)resp_req->dev.user_buf + get_accum_resp_pkt->immed_len);
        iov[1].MPID_IOV_LEN = rreq->dev.user_count * type_size - get_accum_resp_pkt->immed_len;
        iovcnt = 2;
    }

    MPIU_THREAD_CS_ENTER(CH3COMM,vc);
    mpi_errno = MPIDI_CH3_iSendv(vc, resp_req, iov, iovcnt);
    MPIU_THREAD_CS_EXIT(CH3COMM,vc);

    MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");

    /* Mark get portion as handled */
    rreq->dev.resp_request_handle = MPI_REQUEST_NULL;

    MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_GET_ACCUM_RESP);

    /* free the temporary buffer */
    MPIR_Type_get_true_extent_impl(rreq->dev.datatype, &true_lb, &true_extent);
    MPIU_Free((char *) rreq->dev.final_user_buf + true_lb);
    
    /* mark data transfer as complete and decrement CC */
    MPIDI_CH3U_Request_complete(rreq);
    *complete = TRUE;
 fn_exit:
    MPIU_CHKPMEM_COMMIT();
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMRECVCOMPLETE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
 fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
						   MPID_Request *rreq, 
						   int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Datatype *new_dtp = NULL;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTDERIVEDDTRECVCOMPLETE);
    
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTDERIVEDDTRECVCOMPLETE);
                
    /* create derived datatype */
    mpi_errno = create_derived_datatype(rreq, &new_dtp);
    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
    
    /* update request to get the data */
    MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_PUT_RESP);
    rreq->dev.datatype = new_dtp->handle;
    rreq->dev.recv_data_sz = new_dtp->size * rreq->dev.user_count; 
    
    rreq->dev.datatype_ptr = new_dtp;
    /* this will cause the datatype to be freed when the
       request is freed. free dtype_info here. */
    MPIU_Free(rreq->dev.dtype_info);
    
    rreq->dev.segment_ptr = MPID_Segment_alloc( );
    MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");

    MPID_Segment_init(rreq->dev.user_buf,
		      rreq->dev.user_count,
		      rreq->dev.datatype,
		      rreq->dev.segment_ptr, 0);
    rreq->dev.segment_first = 0;
    rreq->dev.segment_size = rreq->dev.recv_data_sz;
    
    mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
    if (mpi_errno != MPI_SUCCESS) {
	MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,
			    "**ch3|loadrecviov");
    }
    if (!rreq->dev.OnDataAvail) 
	rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_PutRecvComplete;
    
    *complete = FALSE;
 fn_fail:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTDERIVEDDTRECVCOMPLETE);
    return mpi_errno;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
						     MPID_Request *rreq, 
						     int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Datatype *new_dtp = NULL;
    MPI_Aint true_lb, true_extent, extent;
    void *tmp_buf;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMDERIVEDDTRECVCOMPLETE);
    
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMDERIVEDDTRECVCOMPLETE);
    
    /* create derived datatype */
    mpi_errno = create_derived_datatype(rreq, &new_dtp);
    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
    
    /* update new request to get the data */
    MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_ACCUM_RESP);
    
    /* first need to allocate tmp_buf to recv the data into */
    
    MPIR_Type_get_true_extent_impl(new_dtp->handle, &true_lb, &true_extent);
    MPID_Datatype_get_extent_macro(new_dtp->handle, extent); 
    
    tmp_buf = MPIU_Malloc(rreq->dev.user_count * 
			  (MPIR_MAX(extent,true_extent)));  
    if (!tmp_buf) {
	MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %d",
		    rreq->dev.user_count * MPIR_MAX(extent,true_extent));
    }
    
    /* adjust for potential negative lower bound in datatype */
    tmp_buf = (void *)((char*)tmp_buf - true_lb);
    
    rreq->dev.user_buf = tmp_buf;
    rreq->dev.final_user_buf = rreq->dev.user_buf;
    rreq->dev.datatype = new_dtp->handle;
    rreq->dev.recv_data_sz = new_dtp->size *
	rreq->dev.user_count; 
    rreq->dev.datatype_ptr = new_dtp;
    /* this will cause the datatype to be freed when the
       request is freed. free dtype_info here. */
    MPIU_Free(rreq->dev.dtype_info);
    
    rreq->dev.segment_ptr = MPID_Segment_alloc( );
    MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");

    MPID_Segment_init(rreq->dev.user_buf,
		      rreq->dev.user_count,
		      rreq->dev.datatype,
		      rreq->dev.segment_ptr, 0);
    rreq->dev.segment_first = 0;
    rreq->dev.segment_size = rreq->dev.recv_data_sz;
    
    mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
    if (mpi_errno != MPI_SUCCESS) {
	MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,
			    "**ch3|loadrecviov");
    }
    if (!rreq->dev.OnDataAvail)
	rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_AccumRecvComplete;
    
    *complete = FALSE;
 fn_fail:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMDERIVEDDTRECVCOMPLETE);
    return mpi_errno;
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
                                                      MPID_Request *rreq,
                                                      int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Datatype *new_dtp = NULL;
    MPI_Aint true_lb, true_extent, extent;
    void *tmp_buf;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMDERIVEDDTRECVCOMPLETE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMDERIVEDDTRECVCOMPLETE);

    /* create derived datatype */
    mpi_errno = create_derived_datatype(rreq, &new_dtp);
    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

    /* update new request to get the data */
    MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_GET_ACCUM_RESP);

    /* first need to allocate tmp_buf to recv the data into */

    MPIR_Type_get_true_extent_impl(new_dtp->handle, &true_lb, &true_extent);
    MPID_Datatype_get_extent_macro(new_dtp->handle, extent);

    tmp_buf = MPIU_Malloc(rreq->dev.user_count *
			  (MPIR_MAX(extent,true_extent)));
    if (!tmp_buf) {
	MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %d",
		    rreq->dev.user_count * MPIR_MAX(extent,true_extent));
    }

    /* adjust for potential negative lower bound in datatype */
    tmp_buf = (void *)((char*)tmp_buf - true_lb);

    rreq->dev.user_buf = tmp_buf;
    rreq->dev.final_user_buf = rreq->dev.user_buf;
    rreq->dev.datatype = new_dtp->handle;
    rreq->dev.recv_data_sz = new_dtp->size *
	rreq->dev.user_count;
    rreq->dev.datatype_ptr = new_dtp;
    /* this will cause the datatype to be freed when the
       request is freed. free dtype_info here. */
    MPIU_Free(rreq->dev.dtype_info);

    rreq->dev.segment_ptr = MPID_Segment_alloc( );
    MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");

    MPID_Segment_init(rreq->dev.user_buf,
		      rreq->dev.user_count,
		      rreq->dev.datatype,
		      rreq->dev.segment_ptr, 0);
    rreq->dev.segment_first = 0;
    rreq->dev.segment_size = rreq->dev.recv_data_sz;

    mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
    if (mpi_errno != MPI_SUCCESS) {
	MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,
			    "**ch3|loadrecviov");
    }
    if (!rreq->dev.OnDataAvail)
	rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumRecvComplete;

    *complete = FALSE;
 fn_fail:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMDERIVEDDTRECVCOMPLETE);
    return mpi_errno;
}



#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete( MPIDI_VC_t *vc,
						   MPID_Request *rreq, 
						   int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Datatype *new_dtp = NULL;
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_get_resp_t * get_resp_pkt = &upkt.get_resp;
    MPID_Request * sreq;
    MPID_Win *win_ptr;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_GETDERIVEDDTRECVCOMPLETE);
    
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_GETDERIVEDDTRECVCOMPLETE);
                
    MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);

    /* create derived datatype */
    mpi_errno = create_derived_datatype(rreq, &new_dtp);
    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
    MPIU_Free(rreq->dev.dtype_info);
    
    /* create request for sending data */
    sreq = MPID_Request_create();
    MPIU_ERR_CHKANDJUMP(sreq == NULL, mpi_errno,MPI_ERR_OTHER,"**nomemreq");
    
    sreq->kind = MPID_REQUEST_SEND;
    MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_GET_RESP);
    sreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GetSendComplete;
    sreq->dev.OnFinal     = MPIDI_CH3_ReqHandler_GetSendComplete;
    sreq->dev.user_buf = rreq->dev.user_buf;
    sreq->dev.user_count = rreq->dev.user_count;
    sreq->dev.datatype = new_dtp->handle;
    sreq->dev.datatype_ptr = new_dtp;
    sreq->dev.target_win_handle = rreq->dev.target_win_handle;
    sreq->dev.source_win_handle = rreq->dev.source_win_handle;
    sreq->dev.flags = rreq->dev.flags;
    
    MPIDI_Pkt_init(get_resp_pkt, MPIDI_CH3_PKT_GET_RESP);
    get_resp_pkt->request_handle = rreq->dev.request_handle;    
    get_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
    get_resp_pkt->source_win_handle = rreq->dev.source_win_handle;
    get_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK)
        get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
    if ((rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
        (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
        get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
    get_resp_pkt->immed_len = 0;
    
    sreq->dev.segment_ptr = MPID_Segment_alloc( );
    MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");

    MPID_Segment_init(sreq->dev.user_buf,
		      sreq->dev.user_count,
		      sreq->dev.datatype,
		      sreq->dev.segment_ptr, 0);
    sreq->dev.segment_first = 0;
    sreq->dev.segment_size = new_dtp->size * sreq->dev.user_count;

    /* Because this is in a packet handler, it is already within a critical section */	
    /* MPIU_THREAD_CS_ENTER(CH3COMM,vc); */
    mpi_errno = vc->sendNoncontig_fn(vc, sreq, get_resp_pkt, sizeof(*get_resp_pkt));
    /* MPIU_THREAD_CS_EXIT(CH3COMM,vc); */
    /* --BEGIN ERROR HANDLING-- */
    if (mpi_errno != MPI_SUCCESS)
    {
        MPID_Request_release(sreq);
        sreq = NULL;
        MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|rmamsg");
    }
    /* --END ERROR HANDLING-- */
    
    /* mark receive data transfer as complete and decrement CC in receive 
       request */
    MPIDI_CH3U_Request_complete(rreq);
    *complete = TRUE;
    
 fn_fail:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_GETDERIVEDDTRECVCOMPLETE);
    return mpi_errno;
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_UnpackUEBufComplete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_UnpackUEBufComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)), 
					      MPID_Request *rreq, 
					      int *complete )
{
    int recv_pending;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKUEBUFCOMPLETE);
    
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKUEBUFCOMPLETE);
    
    MPIDI_Request_decr_pending(rreq);
    MPIDI_Request_check_pending(rreq, &recv_pending);
    if (!recv_pending)
    { 
	if (rreq->dev.recv_data_sz > 0)
	{
	    MPIDI_CH3U_Request_unpack_uebuf(rreq);
	    MPIU_Free(rreq->dev.tmpbuf);
	}
    }
    else
    {
	/* The receive has not been posted yet.  MPID_{Recv/Irecv}() 
	   is responsible for unpacking the buffer. */
    }
    
    /* mark data transfer as complete and decrement CC */
    MPIDI_CH3U_Request_complete(rreq);
    *complete = TRUE;
    
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKUEBUFCOMPLETE);
    return MPI_SUCCESS;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_UnpackSRBufComplete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_UnpackSRBufComplete( MPIDI_VC_t *vc, 
					      MPID_Request *rreq, 
					      int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKSRBUFCOMPLETE);
    
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKSRBUFCOMPLETE);

    MPIDI_CH3U_Request_unpack_srbuf(rreq);

    if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_PUT_RESP)
    {
	mpi_errno = MPIDI_CH3_ReqHandler_PutRecvComplete(
	    vc, rreq, complete );
    }
    else if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RESP)
    {
	mpi_errno = MPIDI_CH3_ReqHandler_AccumRecvComplete(
	    vc, rreq, complete );
    }
    else if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_GET_ACCUM_RESP)
    {
	mpi_errno = MPIDI_CH3_ReqHandler_GaccumRecvComplete(
	    vc, rreq, complete );
    }
    else {
	/* mark data transfer as complete and decrement CC */
	MPIDI_CH3U_Request_complete(rreq);
	*complete = TRUE;
    }

    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKSRBUFCOMPLETE);
    return mpi_errno;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_UnpackSRBufReloadIOV
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_UnpackSRBufReloadIOV( MPIDI_VC_t *vc ATTRIBUTE((unused)), 
					      MPID_Request *rreq, 
					      int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKSRBUFRELOADIOV);
    
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKSRBUFRELOADIOV);

    MPIDI_CH3U_Request_unpack_srbuf(rreq);
    mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
    if (mpi_errno != MPI_SUCCESS) {
	MPIU_ERR_SETFATALANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|loadrecviov" );
    }
    *complete = FALSE;
 fn_fail:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKSRBUFRELOADIOV);
    return mpi_errno;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_ReloadIOV
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_ReloadIOV( MPIDI_VC_t *vc ATTRIBUTE((unused)), 
				    MPID_Request *rreq, int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_RELOADIOV);
    
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_RELOADIOV);

    mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
    if (mpi_errno != MPI_SUCCESS) {
	MPIU_ERR_SETFATALANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|loadrecviov");
    }
    *complete = FALSE;
 fn_fail:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_RELOADIOV);
    return mpi_errno;
}

/* ----------------------------------------------------------------------- */
/* ----------------------------------------------------------------------- */

#undef FUNCNAME
#define FUNCNAME create_derived_datatype
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static int create_derived_datatype(MPID_Request *req, MPID_Datatype **dtp)
{
    MPIDI_RMA_dtype_info *dtype_info;
    MPID_Datatype *new_dtp;
    int mpi_errno=MPI_SUCCESS;
    MPI_Aint ptrdiff;
    MPIDI_STATE_DECL(MPID_STATE_CREATE_DERIVED_DATATYPE);
    
    MPIDI_FUNC_ENTER(MPID_STATE_CREATE_DERIVED_DATATYPE);

    dtype_info = req->dev.dtype_info;

    /* allocate new datatype object and handle */
    new_dtp = (MPID_Datatype *) MPIU_Handle_obj_alloc(&MPID_Datatype_mem);
    if (!new_dtp) {
	MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %s",
			     "MPID_Datatype_mem" );
    }

    *dtp = new_dtp;
            
    /* Note: handle is filled in by MPIU_Handle_obj_alloc() */
    MPIU_Object_set_ref(new_dtp, 1);
    new_dtp->is_permanent = 0;
    new_dtp->is_committed = 1;
    new_dtp->attributes   = 0;
    new_dtp->cache_id     = 0;
    new_dtp->name[0]      = 0;
    new_dtp->is_contig = dtype_info->is_contig;
    new_dtp->max_contig_blocks = dtype_info->max_contig_blocks; 
    new_dtp->size = dtype_info->size;
    new_dtp->extent = dtype_info->extent;
    new_dtp->dataloop_size = dtype_info->dataloop_size;
    new_dtp->dataloop_depth = dtype_info->dataloop_depth; 
    new_dtp->eltype = dtype_info->eltype;
    /* set dataloop pointer */
    new_dtp->dataloop = req->dev.dataloop;
    
    new_dtp->ub = dtype_info->ub;
    new_dtp->lb = dtype_info->lb;
    new_dtp->true_ub = dtype_info->true_ub;
    new_dtp->true_lb = dtype_info->true_lb;
    new_dtp->has_sticky_ub = dtype_info->has_sticky_ub;
    new_dtp->has_sticky_lb = dtype_info->has_sticky_lb;
    /* update pointers in dataloop */
    ptrdiff = (MPI_Aint)((char *) (new_dtp->dataloop) - (char *)
                         (dtype_info->dataloop));
    
    /* FIXME: Temp to avoid SEGV when memory tracing */
    new_dtp->hetero_dloop = 0;

    MPID_Dataloop_update(new_dtp->dataloop, ptrdiff);

    new_dtp->contents = NULL;

 fn_fail:
    MPIDI_FUNC_EXIT(MPID_STATE_CREATE_DERIVED_DATATYPE);

    return mpi_errno;
}
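
/* Illustration only (hypothetical types, not part of this file): why
 * MPID_Dataloop_update() is passed ptrdiff above.  The dataloop arrives
 * as a flat byte copy, so any pointers embedded in it still hold
 * addresses valid where the dataloop was originally built; shifting each
 * one by (new base - old base) rebases it into the local copy.
 */
#if 0
struct loop_node { struct loop_node *child; };
static void rebase(struct loop_node *n, MPI_Aint ptrdiff)
{
    /* every embedded pointer moves by the same displacement */
    n->child = (struct loop_node *) ((char *) n->child + ptrdiff);
}
#endif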


static inline int perform_put_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
{
    MPIDI_CH3_Pkt_put_t *put_pkt = &((lock_entry->pkt).put);
    MPIDI_VC_t *vc = NULL;
    int mpi_errno = MPI_SUCCESS;

    if (lock_entry->data == NULL) {
        /* all data fits in packet header */
        mpi_errno = MPIR_Localcopy(put_pkt->data, put_pkt->count, put_pkt->datatype,
                                   put_pkt->addr, put_pkt->count, put_pkt->datatype);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
    }
    else {
        mpi_errno = MPIR_Localcopy(lock_entry->data, put_pkt->count, put_pkt->datatype,
                                   put_pkt->addr, put_pkt->count, put_pkt->datatype);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
    }

    /* get vc object */
    MPIDI_Comm_get_vc(win_ptr->comm_ptr, put_pkt->origin_rank, &vc);

    /* do final action */
    mpi_errno = finish_op_on_target(win_ptr, vc, MPIDI_CH3_PKT_PUT,
                                    put_pkt->flags, put_pkt->source_win_handle);
    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}

static inline int perform_get_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
{
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_get_resp_t *get_resp_pkt = &upkt.get_resp;
    MPIDI_CH3_Pkt_get_t *get_pkt = &((lock_entry->pkt).get);
    MPID_Request *sreq = NULL;
    MPIDI_VC_t *vc = NULL;
    MPI_Aint type_size;
    size_t len;
    int iovcnt;
    MPID_IOV iov[MPID_IOV_LIMIT];
    int mpi_errno = MPI_SUCCESS;

    sreq = MPID_Request_create();
    if (sreq == NULL) {
        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomemreq");
    }
    MPIU_Object_set_ref(sreq, 1);

    MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_GET_RESP);
    sreq->kind = MPID_REQUEST_SEND;
    sreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GetSendComplete;
    sreq->dev.OnFinal = MPIDI_CH3_ReqHandler_GetSendComplete;

    sreq->dev.target_win_handle = win_ptr->handle;
    sreq->dev.flags = get_pkt->flags;

    /* here we increment the Active Target counter to guarantee that
       GET-like operations are completed when the counter reaches zero. */
    win_ptr->at_completion_counter++;

    MPIDI_Pkt_init(get_resp_pkt, MPIDI_CH3_PKT_GET_RESP);
    get_resp_pkt->request_handle = get_pkt->request_handle;
    get_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
    if (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK)
        get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
    if ((get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
        (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
        get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
    get_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
    get_resp_pkt->source_win_handle = get_pkt->source_win_handle;
    get_resp_pkt->immed_len = 0;

    /* length of target data */
    MPID_Datatype_get_size_macro(get_pkt->datatype, type_size);
    MPIU_Assign_trunc(len, get_pkt->count * type_size, size_t);

    /* both the origin and target buffers hold basic datatypes;
       fill the IMMED data area in the response packet header. */
    if (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
        /* Try to copy target data into packet header. */
        MPIU_Assign_trunc(get_resp_pkt->immed_len,
                          MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / type_size) * type_size),
                          int);

        if (get_resp_pkt->immed_len > 0) {
            void *src = get_pkt->addr;
            void *dest = (void*) get_resp_pkt->data;
            /* copy data from origin buffer to immed area in packet header */
            mpi_errno = immed_copy(src, dest, (size_t)get_resp_pkt->immed_len);
            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
        }
    }

    if (len == (size_t)get_resp_pkt->immed_len) {
        /* All response data fits in the packet header; issue just the header. */
        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_resp_pkt;
        iov[0].MPID_IOV_LEN = sizeof(*get_resp_pkt);
        iovcnt = 1;
    }
    else {
        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_resp_pkt;
        iov[0].MPID_IOV_LEN = sizeof(*get_resp_pkt);
        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *)get_pkt->addr + get_resp_pkt->immed_len);
        iov[1].MPID_IOV_LEN = get_pkt->count * type_size - get_resp_pkt->immed_len;
        iovcnt = 2;
    }

    /* get vc object */
    MPIDI_Comm_get_vc(win_ptr->comm_ptr, get_pkt->origin_rank, &vc);

    mpi_errno = MPIDI_CH3_iSendv(vc, sreq, iov, iovcnt);
    if (mpi_errno != MPI_SUCCESS) {
        MPID_Request_release(sreq);
	MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
    }

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}


static inline int perform_acc_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
{
    MPIDI_CH3_Pkt_accum_t *acc_pkt = &((lock_entry->pkt).accum);
    MPIDI_VC_t *vc = NULL;
    int mpi_errno = MPI_SUCCESS;

    MPIU_Assert(lock_entry->all_data_recved == 1);

    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);

    if (lock_entry->data == NULL) {
        /* All data fits in packet header */
        mpi_errno = do_accumulate_op(acc_pkt->data, acc_pkt->addr,
                                     acc_pkt->count, acc_pkt->datatype, acc_pkt->op);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
    }
    else {
        mpi_errno = do_accumulate_op(lock_entry->data, acc_pkt->addr,
                                     acc_pkt->count, acc_pkt->datatype, acc_pkt->op);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
    }

    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);

    /* get vc object */
    MPIDI_Comm_get_vc(win_ptr->comm_ptr, acc_pkt->origin_rank, &vc);

    mpi_errno = finish_op_on_target(win_ptr, vc, MPIDI_CH3_PKT_ACCUMULATE,
                                    acc_pkt->flags, acc_pkt->source_win_handle);
    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}


static inline int perform_get_acc_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
{
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_get_accum_resp_t *get_accum_resp_pkt = &upkt.get_accum_resp;
    MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt = &((lock_entry->pkt).get_accum);
    MPID_Request *sreq = NULL;
    MPIDI_VC_t *vc = NULL;
    MPI_Aint type_size;
    size_t len;
    int iovcnt;
    MPID_IOV iov[MPID_IOV_LIMIT];
    int mpi_errno = MPI_SUCCESS;

    sreq = MPID_Request_create();
    if (sreq == NULL) {
        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomemreq");
    }
    MPIU_Object_set_ref(sreq, 1);

    MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_GET_ACCUM_RESP);
    sreq->kind = MPID_REQUEST_SEND;
    sreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumSendComplete;
    sreq->dev.OnFinal = MPIDI_CH3_ReqHandler_GaccumSendComplete;

    sreq->dev.target_win_handle = win_ptr->handle;
    sreq->dev.flags = get_accum_pkt->flags;

    /* Copy data into a temporary buffer */
    MPID_Datatype_get_size_macro(get_accum_pkt->datatype, type_size);
    sreq->dev.user_buf = (void *)MPIU_Malloc(get_accum_pkt->count * type_size);
    if (sreq->dev.user_buf == NULL) {
        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
    }

    /* Perform ACCUMULATE OP */
    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);

    if (MPIR_DATATYPE_IS_PREDEFINED(get_accum_pkt->datatype)) {
        MPIU_Memcpy(sreq->dev.user_buf, get_accum_pkt->addr,
                    get_accum_pkt->count * type_size);
    } else {
        MPID_Segment *seg = MPID_Segment_alloc();
        MPI_Aint last = type_size * get_accum_pkt->count;

        MPIU_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment");
        MPID_Segment_init(get_accum_pkt->addr, get_accum_pkt->count,
                          get_accum_pkt->datatype, seg, 0);
        MPID_Segment_pack(seg, 0, &last, sreq->dev.user_buf);
        MPID_Segment_free(seg);
    }

    if (lock_entry->data == NULL) {
        /* All data fits in packet header */
        mpi_errno = do_accumulate_op(get_accum_pkt->data, get_accum_pkt->addr,
                                     get_accum_pkt->count, get_accum_pkt->datatype, get_accum_pkt->op);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
    }
    else {
        mpi_errno = do_accumulate_op(lock_entry->data, get_accum_pkt->addr,
                                     get_accum_pkt->count, get_accum_pkt->datatype, get_accum_pkt->op);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
    }

    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);

    /* here we increment the Active Target counter to guarantee that
       GET-like operations are completed when the counter reaches zero. */
    win_ptr->at_completion_counter++;

    MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP);
    get_accum_resp_pkt->request_handle = get_accum_pkt->request_handle;
    get_accum_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
    if (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK)
        get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
    if ((get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
        (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
        get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
    get_accum_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
    get_accum_resp_pkt->source_win_handle = get_accum_pkt->source_win_handle;
    get_accum_resp_pkt->immed_len = 0;

    /* length of target data */
    MPIU_Assign_trunc(len, get_accum_pkt->count * type_size, size_t);

    /* both the origin and target buffers hold basic datatypes;
       fill the IMMED data area in the response packet header. */
    if (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
        /* Try to copy target data into packet header. */
        MPIU_Assign_trunc(get_accum_resp_pkt->immed_len,
                          MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / type_size) * type_size),
                          int);

        if (get_accum_resp_pkt->immed_len > 0) {
            void *src = sreq->dev.user_buf;
            void *dest = (void*) get_accum_resp_pkt->data;
            /* copy data from origin buffer to immed area in packet header */
            mpi_errno = immed_copy(src, dest, (size_t)get_accum_resp_pkt->immed_len);
            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
        }
    }

    if (len == (size_t)get_accum_resp_pkt->immed_len) {
        /* All response data fits in the packet header; issue just the header. */
        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
        iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
        iovcnt = 1;
    }
    else {
        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
        iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *)sreq->dev.user_buf + get_accum_resp_pkt->immed_len);
        iov[1].MPID_IOV_LEN = get_accum_pkt->count * type_size - get_accum_resp_pkt->immed_len;
        iovcnt = 2;
    }

    /* get vc object */
    MPIDI_Comm_get_vc(win_ptr->comm_ptr, get_accum_pkt->origin_rank, &vc);

    mpi_errno = MPIDI_CH3_iSendv(vc, sreq, iov, iovcnt);
    if (mpi_errno != MPI_SUCCESS) {
        MPID_Request_release(sreq);
	MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
    }

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}


static inline int perform_fop_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
{
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_fop_resp_t *fop_resp_pkt = &upkt.fop_resp;
    MPIDI_CH3_Pkt_fop_t *fop_pkt = &((lock_entry->pkt).fop);
    MPID_Request *resp_req = NULL;
    MPIDI_VC_t *vc = NULL;
    int mpi_errno = MPI_SUCCESS;

    /* FIXME: this function is the same as PktHandler_FOP(); the two
       should be refactored to share code. */

    /* get vc object */
    MPIDI_Comm_get_vc(win_ptr->comm_ptr, fop_pkt->origin_rank, &vc);

    MPIDI_Pkt_init(fop_resp_pkt, MPIDI_CH3_PKT_FOP_RESP);
    fop_resp_pkt->request_handle = fop_pkt->request_handle;
    fop_resp_pkt->source_win_handle = fop_pkt->source_win_handle;
    fop_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
    fop_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
    if (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK)
        fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
    if ((fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
        (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
        fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
    fop_resp_pkt->immed_len = fop_pkt->immed_len;

    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);

    /* copy data to resp pkt header */
    void *src = fop_pkt->addr, *dest = fop_resp_pkt->data;
    mpi_errno = immed_copy(src, dest, (size_t)fop_resp_pkt->immed_len);
    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

    /* Apply the op; the old value was already copied into the response
       packet above, giving fetch-and-op semantics. */
    if (fop_pkt->op != MPI_NO_OP) {
        MPI_User_function *uop = MPIR_OP_HDL_TO_FN(fop_pkt->op);
        int one = 1;
        (*uop)(fop_pkt->data, fop_pkt->addr, &one, &(fop_pkt->datatype));
    }

    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);

    /* send back the original data */
    MPIU_THREAD_CS_ENTER(CH3COMM,vc);
    mpi_errno = MPIDI_CH3_iStartMsg(vc, fop_resp_pkt, sizeof(*fop_resp_pkt), &resp_req);
    MPIU_THREAD_CS_EXIT(CH3COMM,vc);
    MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");

    if (resp_req != NULL) {
        if (!MPID_Request_is_complete(resp_req)) {
            /* the send is not yet complete; set the proper OnDataAvail
               handler (it is initialized to NULL by the lower layer) */
            resp_req->dev.target_win_handle = fop_pkt->target_win_handle;
            resp_req->dev.flags = fop_pkt->flags;
            resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_FOPSendComplete;

            /* here we increment the Active Target counter to guarantee that
               GET-like operations are completed when the counter reaches zero. */
            win_ptr->at_completion_counter++;

            MPID_Request_release(resp_req);
            goto fn_exit;
        }
        else {
            MPID_Request_release(resp_req);
        }
    }

    /* do final action */
    mpi_errno = finish_op_on_target(win_ptr, vc, MPIDI_CH3_PKT_FOP,
                                    fop_pkt->flags, fop_pkt->source_win_handle);
    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}


static inline int perform_cas_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
{
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_cas_resp_t *cas_resp_pkt = &upkt.cas_resp;
    MPIDI_CH3_Pkt_cas_t *cas_pkt = &((lock_entry->pkt).cas);
    MPID_Request *send_req = NULL;
    MPIDI_VC_t *vc = NULL;
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
    MPI_Aint len;
    int mpi_errno = MPI_SUCCESS;

    /* get vc object */
    MPIDI_Comm_get_vc(win_ptr->comm_ptr, cas_pkt->origin_rank, &vc);

    MPIDI_Pkt_init(cas_resp_pkt, MPIDI_CH3_PKT_CAS_RESP);
    cas_resp_pkt->request_handle = cas_pkt->request_handle;
    cas_resp_pkt->source_win_handle = cas_pkt->source_win_handle;
    cas_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
    cas_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
    if (cas_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK)
        cas_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
    if ((cas_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
        (cas_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
        cas_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;

    /* Copy old value into the response packet */
    MPID_Datatype_get_size_macro(cas_pkt->datatype, len);
    MPIU_Assert(len <= sizeof(MPIDI_CH3_CAS_Immed_u));

    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);

    MPIU_Memcpy((void *) &cas_resp_pkt->data, cas_pkt->addr, len);

    /* Compare and replace if equal */
    if (MPIR_Compare_equal(&cas_pkt->compare_data, cas_pkt->addr, cas_pkt->datatype)) {
        MPIU_Memcpy(cas_pkt->addr, &cas_pkt->origin_data, len);
    }

    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);

    /* Send the response packet */
    MPIU_THREAD_CS_ENTER(CH3COMM, vc);
    mpi_errno = MPIDI_CH3_iStartMsg(vc, cas_resp_pkt, sizeof(*cas_resp_pkt), &send_req);
    MPIU_THREAD_CS_EXIT(CH3COMM, vc);

    MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");

    if (send_req != NULL) {
        if (!MPID_Request_is_complete(send_req)) {
            /* the send is not yet complete; set the proper OnDataAvail
               handler (it is initialized to NULL by the lower layer) */
            send_req->dev.target_win_handle = cas_pkt->target_win_handle;
            send_req->dev.flags = cas_pkt->flags;
            send_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_CASSendComplete;

            /* here we increment the Active Target counter to guarantee that
               GET-like operations are completed when the counter reaches zero. */
            win_ptr->at_completion_counter++;

            MPID_Request_release(send_req);
            goto fn_exit;
        }
        else
            MPID_Request_release(send_req);
    }

    /* do final action */
    mpi_errno = finish_op_on_target(win_ptr, vc, MPIDI_CH3_PKT_CAS,
                                    cas_pkt->flags, cas_pkt->source_win_handle);
    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}


static inline int perform_op_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
{
    int mpi_errno = MPI_SUCCESS;
1292
1293
1294
1295
1296
1297
1298

    if (lock_entry->pkt.type == MPIDI_CH3_PKT_LOCK) {

        /* single LOCK request */

        MPIDI_CH3_Pkt_lock_t *lock_pkt = &(lock_entry->pkt.lock);
        if (lock_pkt->origin_rank == win_ptr->comm_ptr->rank) {
            mpi_errno = handle_lock_ack(win_ptr, lock_pkt->origin_rank,
                                              MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        }
        else {
            MPIDI_VC_t *vc = NULL;
            MPIDI_Comm_get_vc_set_active(win_ptr->comm_ptr,
                                         lock_pkt->origin_rank, &vc);
            mpi_errno = MPIDI_CH3I_Send_lock_ack_pkt(vc, win_ptr,
                                                     MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED,
                                              lock_pkt->source_win_handle);
            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
        }
    }
    else {
        /* LOCK+OP packet */
        switch(lock_entry->pkt.type) {
        case (MPIDI_CH3_PKT_PUT):
            mpi_errno = perform_put_in_lock_queue(win_ptr, lock_entry);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            break;
        case (MPIDI_CH3_PKT_GET):
            mpi_errno = perform_get_in_lock_queue(win_ptr, lock_entry);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            break;
        case (MPIDI_CH3_PKT_ACCUMULATE):
            mpi_errno = perform_acc_in_lock_queue(win_ptr, lock_entry);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            break;
        case (MPIDI_CH3_PKT_GET_ACCUM):
            mpi_errno = perform_get_acc_in_lock_queue(win_ptr, lock_entry);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            break;
        case (MPIDI_CH3_PKT_FOP):
            mpi_errno = perform_fop_in_lock_queue(win_ptr, lock_entry);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            break;
        case (MPIDI_CH3_PKT_CAS):
            mpi_errno = perform_cas_in_lock_queue(win_ptr, lock_entry);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            break;
        default:
            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER,
                                 "**invalidpkt", "**invalidpkt %d", lock_entry->pkt.type);
        }
    }

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}


static int entered_flag = 0;
static int entered_count = 0;
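
/* Illustration only (simplified pseudo-version, not compiled): the
 * reentrancy guard used by MPIDI_CH3I_Release_lock() below.  A nested
 * call just bumps entered_count and returns; the outermost call keeps
 * draining the lock queue until no re-entry happened during its pass.
 */
#if 0
static void release_lock_pattern(void)
{
    int seen;
    if (entered_flag != 0) { entered_count++; return; } /* nested: defer */
    entered_flag = 1;
    seen = entered_count;
    do {
        if (seen != entered_count) seen++;
        /* ... drain the lock queue; this may re-enter the function ... */
    } while (seen != entered_count); /* repeat if re-entered meanwhile */
    entered_flag = 0;
}
#endif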

/* Release the current lock on the window and grant the next lock in the
   queue if any */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Release_lock
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3I_Release_lock(MPID_Win *win_ptr)
{
    MPIDI_RMA_Lock_entry_t *lock_entry, *lock_entry_next;
    int requested_lock, mpi_errno = MPI_SUCCESS, temp_entered_count;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_RELEASE_LOCK);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_RELEASE_LOCK);

    if (win_ptr->current_lock_type == MPI_LOCK_SHARED) {
        /* decr ref cnt */
        /* FIXME: MT: Must be done atomically */
        win_ptr->shared_lock_ref_cnt--;
    }

    /* If shared lock ref count is 0 (which is also true if the lock is an
       exclusive lock), release the lock. */
    if (win_ptr->shared_lock_ref_cnt == 0) {

	/* This function needs to be reentrant even in the single-threaded case
           because when going through the lock queue, pkt_handler() in
           perform_op_in_lock_queue() may again call release_lock(). To handle
           this possibility, we use an entered_flag.
           If the flag is not 0, we simply increment the entered_count and return.
           The loop through the lock queue is repeated if the entered_count has
           changed while we are in the loop.
	 */
	if (entered_flag != 0) {
	    entered_count++; /* Count how many times we re-enter */
	    goto fn_exit;
	}

        entered_flag = 1;  /* Mark that we are now entering release_lock() */
        temp_entered_count = entered_count;

	do {
	    if (temp_entered_count != entered_count) temp_entered_count++;

	    /* FIXME: MT: The setting of the lock type must be done atomically */
	    win_ptr->current_lock_type = MPID_LOCK_NONE;

	    /* If there is a lock queue, try to satisfy as many lock requests as 
	       possible. If the first one is a shared lock, grant it and grant all 
	       other shared locks. If the first one is an exclusive lock, grant 
	       only that one. */

	    /* FIXME: MT: All queue accesses need to be made atomic */
            lock_entry = (MPIDI_RMA_Lock_entry_t *) win_ptr->lock_queue;
            while (lock_entry) {
                lock_entry_next = lock_entry->next;

                if (lock_entry->all_data_recved) {
                MPIDI_CH3_PKT_RMA_GET_LOCK_TYPE(lock_entry->pkt, requested_lock, mpi_errno);
                if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, requested_lock) == 1) {
                    /* dequeue entry from lock queue */
                    MPL_LL_DELETE(win_ptr->lock_queue, win_ptr->lock_queue_tail, lock_entry);

                    /* perform this OP */
                    mpi_errno = perform_op_in_lock_queue(win_ptr, lock_entry);
                    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

                    /* free this entry */
                    mpi_errno = MPIDI_CH3I_Win_lock_entry_free(win_ptr, lock_entry);
                    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

                    /* if the granted lock is exclusive,
                       no need to continue */
                    if (requested_lock == MPI_LOCK_EXCLUSIVE)
                        break;
                }