/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidimpl.h"
#include "mpidrma.h"

static int create_derived_datatype(MPID_Request * rreq, MPID_Datatype ** dtp);

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3U_Handle_recv_req
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3U_Handle_recv_req(MPIDI_VC_t * vc, MPID_Request * rreq, 
			       int * complete)
{
    static int in_routine ATTRIBUTE((unused)) = FALSE;
    int mpi_errno = MPI_SUCCESS;
    int (*reqFn)(MPIDI_VC_t *, MPID_Request *, int *);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_HANDLE_RECV_REQ);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_HANDLE_RECV_REQ);

    MPIU_Assert(in_routine == FALSE);
    in_routine = TRUE;

    reqFn = rreq->dev.OnDataAvail;
    if (!reqFn) {
	MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_RECV);
	MPIDI_CH3U_Request_complete(rreq);
	*complete = TRUE;
    }
    else {
        mpi_errno = reqFn( vc, rreq, complete );
    }

    in_routine = FALSE;
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_HANDLE_RECV_REQ);
    return mpi_errno;
}

/* ----------------------------------------------------------------------- */
/* Here are the functions that implement the actions that are taken when
 * data is available for a receive request (or other completion operations).
 * These include "receive" requests that are part of the RMA implementation.
 *
 * The convention for the names of routines that are called when data is
 * available is
 *    MPIDI_CH3_ReqHandler_<type>( MPIDI_VC_t *, MPID_Request *, int * )
 * as in 
 *    MPIDI_CH3_ReqHandler_...
 *
 * ToDo: 
 *    We need a way for each of these functions to describe what they are,
 *    so that given a pointer to one of these functions, we can retrieve
 *    a description of the routine.  We may want to use a static string 
 *    and require the user to maintain thread-safety, at least while
 *    accessing the string.
 */
/* ----------------------------------------------------------------------- */
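
/* Illustrative usage sketch (added; not part of the original source): a
 * channel's progress engine typically calls the dispatcher above once the
 * data described by the request's current IOV has been received, e.g.:
 *
 *     int complete = FALSE;
 *     mpi_errno = MPIDI_CH3U_Handle_recv_req(vc, rreq, &complete);
 *     if (!complete) {
 *         // the request's IOV has been reloaded; post the next receive
 *     }
 *
 * The exact call site is channel-specific; this is only a sketch.
 */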
int MPIDI_CH3_ReqHandler_RecvComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
				       MPID_Request *rreq,
				       int *complete )
{
    /* mark data transfer as complete and decrement CC */
    MPIDI_CH3U_Request_complete(rreq);
    *complete = TRUE;
    return MPI_SUCCESS;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_PutRecvComplete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_PutRecvComplete( MPIDI_VC_t *vc,
                                          MPID_Request *rreq,
                                          int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Win *win_ptr;
    MPI_Win source_win_handle = rreq->dev.source_win_handle;
    MPIDI_CH3_Pkt_flags_t flags = rreq->dev.flags;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTRECVCOMPLETE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTRECVCOMPLETE);

    /* NOTE: It is possible that this request is already completed before
       entering this handler.  This happens when this request handler is
       invoked recursively on the same request.
       Consider this case: the request is queued in the SHM queue with a ref
       count of 2: one reference for completing the request and one for
       dequeueing it from the queue.  The first invocation of this handler
       completes the request and decrements the ref count to 1, but the
       request is still in the queue.  Within this handler we may invoke the
       handler on the same request a second time (for example, while making
       progress on the SHM queue), and that second invocation would also try
       to complete the request, which leads to incorrect execution.
       Here we check whether the request is already completed to avoid
       processing the same request twice. */
    if (MPID_Request_is_complete(rreq)) {
        *complete = FALSE;
        goto fn_exit;
    }

    MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);

    /* mark data transfer as complete and decrement CC */
    MPIDI_CH3U_Request_complete(rreq);

    /* NOTE: finish_op_on_target() must be called after we complete this request,
       because inside finish_op_on_target() we may call this request handler
       on the same request again (in release_lock()). Marking this request as
       completed will prevent us from processing the same request twice. */
    mpi_errno = finish_op_on_target(win_ptr, vc, MPIDI_CH3_PKT_PUT,
                                    flags, source_win_handle);
    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

    *complete = TRUE;

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTRECVCOMPLETE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
 fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_AccumRecvComplete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_AccumRecvComplete( MPIDI_VC_t *vc,
                                            MPID_Request *rreq,
                                            int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPI_Aint true_lb, true_extent;
    MPID_Win *win_ptr;
    MPI_Win source_win_handle = rreq->dev.source_win_handle;
    MPIDI_CH3_Pkt_flags_t flags = rreq->dev.flags;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMRECVCOMPLETE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMRECVCOMPLETE);

    /* NOTE: It is possible that this request is already completed before
       entering this handler.  This happens when this request handler is
       invoked recursively on the same request.
       Consider this case: the request is queued in the SHM queue with a ref
       count of 2: one reference for completing the request and one for
       dequeueing it from the queue.  The first invocation of this handler
       completes the request and decrements the ref count to 1, but the
       request is still in the queue.  Within this handler we may invoke the
       handler on the same request a second time (for example, while making
       progress on the SHM queue), and that second invocation would also try
       to complete the request, which leads to incorrect execution.
       Here we check whether the request is already completed to avoid
       processing the same request twice. */
    if (MPID_Request_is_complete(rreq)) {
        *complete = FALSE;
        goto fn_exit;
    }

    MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);

    MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RESP);

    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
    /* accumulate data from tmp_buf into user_buf */
    mpi_errno = do_accumulate_op(rreq->dev.final_user_buf, rreq->dev.real_user_buf,
                                 rreq->dev.user_count, rreq->dev.datatype, rreq->dev.op);
    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
    if (mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    /* free the temporary buffer */
    MPIR_Type_get_true_extent_impl(rreq->dev.datatype, &true_lb, &true_extent);
    MPIU_Free((char *) rreq->dev.final_user_buf + true_lb);

    /* mark data transfer as complete and decrement CC */
    MPIDI_CH3U_Request_complete(rreq);

    /* NOTE: finish_op_on_target() must be called after we complete this request,
       because inside finish_op_on_target() we may call this request handler
       on the same request again (in release_lock()). Marking this request as
       completed will prevent us from processing the same request twice. */
    mpi_errno = finish_op_on_target(win_ptr, vc, MPIDI_CH3_PKT_ACCUMULATE,
                                    flags, source_win_handle);
    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

    *complete = TRUE;

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMRECVCOMPLETE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
 fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_GaccumRecvComplete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_GaccumRecvComplete( MPIDI_VC_t *vc,
                                             MPID_Request *rreq,
                                             int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Win *win_ptr;
    MPI_Aint type_size;
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_get_accum_resp_t *get_accum_resp_pkt = &upkt.get_accum_resp;
    MPID_Request *resp_req;
    MPID_IOV iov[MPID_IOV_LIMIT];
    MPI_Aint true_lb, true_extent;
    size_t len;
    int iovcnt;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMRECVCOMPLETE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMRECVCOMPLETE);

    MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);

    MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP);
    get_accum_resp_pkt->request_handle = rreq->dev.resp_request_handle;
    get_accum_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
    get_accum_resp_pkt->source_win_handle = rreq->dev.source_win_handle;
    get_accum_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK)
        get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
        get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK)
        get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_ACK;
    get_accum_resp_pkt->immed_len = 0;

    MPID_Datatype_get_size_macro(rreq->dev.datatype, type_size);

    /* Copy data into a temporary buffer */
    resp_req = MPID_Request_create();
    MPIU_ERR_CHKANDJUMP(resp_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
    MPIU_Object_set_ref(resp_req, 1);

    MPIU_CHKPMEM_MALLOC(resp_req->dev.user_buf, void *, rreq->dev.user_count * type_size,
                        mpi_errno, "GACC resp. buffer");

    if (MPIR_DATATYPE_IS_PREDEFINED(rreq->dev.datatype)) {
        MPIU_Memcpy(resp_req->dev.user_buf, rreq->dev.real_user_buf,
                    rreq->dev.user_count * type_size);
    } else {
        MPID_Segment *seg = MPID_Segment_alloc();
        MPI_Aint last = type_size * rreq->dev.user_count;

        MPIU_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment");
        MPID_Segment_init(rreq->dev.real_user_buf, rreq->dev.user_count, rreq->dev.datatype, seg, 0);
        MPID_Segment_pack(seg, 0, &last, resp_req->dev.user_buf);
        MPID_Segment_free(seg);
    }

    resp_req->dev.OnFinal = MPIDI_CH3_ReqHandler_GaccumSendComplete;
    resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumSendComplete;
    resp_req->dev.target_win_handle = rreq->dev.target_win_handle;
    resp_req->dev.flags = rreq->dev.flags;

    /* here we increment the Active Target counter to guarantee that GET-like
       operations are completed when the counter reaches zero. */
    win_ptr->at_completion_counter++;

    /* length of target data */
    MPIU_Assign_trunc(len, rreq->dev.user_count * type_size, size_t);
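    /* Illustrative note (added): the IMMED area carries only whole elements.
       For example, if MPIDI_RMA_IMMED_BYTES were 64 and type_size were 12,
       at most (64/12)*12 = 60 bytes would be packed into the header below;
       any remaining bytes are sent via the trailing IOV.  The actual value
       of MPIDI_RMA_IMMED_BYTES is configuration dependent. */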

    /* both origin buffer and target buffer are basic datatype,
       fill IMMED data area in response packet header. */
    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
        /* Try to copy target data into packet header. */
        MPIU_Assign_trunc(get_accum_resp_pkt->immed_len,
                          MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / type_size) * type_size),
                          int);

        if (get_accum_resp_pkt->immed_len > 0) {
            void *src = resp_req->dev.user_buf;
            void *dest = (void*) get_accum_resp_pkt->data;
            /* copy data from origin buffer to immed area in packet header */
            mpi_errno = immed_copy(src, dest, (size_t)get_accum_resp_pkt->immed_len);
            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
        }
    }

    if (len == (size_t)get_accum_resp_pkt->immed_len) {
        /* All origin data is in packet header, issue the header. */
        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
        iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
        iovcnt = 1;
    }
    else {
        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
        iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *)resp_req->dev.user_buf + get_accum_resp_pkt->immed_len);
        iov[1].MPID_IOV_LEN = rreq->dev.user_count * type_size - get_accum_resp_pkt->immed_len;
        iovcnt = 2;
    }

    MPIU_THREAD_CS_ENTER(CH3COMM,vc);
    mpi_errno = MPIDI_CH3_iSendv(vc, resp_req, iov, iovcnt);
    MPIU_THREAD_CS_EXIT(CH3COMM,vc);

    MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");

    /* Mark get portion as handled */
    rreq->dev.resp_request_handle = MPI_REQUEST_NULL;

    MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_GET_ACCUM_RESP);

    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
    /* accumulate data from tmp_buf into user_buf */
    mpi_errno = do_accumulate_op(rreq->dev.final_user_buf, rreq->dev.real_user_buf,
                                 rreq->dev.user_count, rreq->dev.datatype, rreq->dev.op);
    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
    if (mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    /* free the temporary buffer */
    MPIR_Type_get_true_extent_impl(rreq->dev.datatype, &true_lb, &true_extent);
    MPIU_Free((char *) rreq->dev.final_user_buf + true_lb);

    /* mark data transfer as complete and decrement CC */
    MPIDI_CH3U_Request_complete(rreq);
    *complete = TRUE;

 fn_exit:
    MPIU_CHKPMEM_COMMIT();
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMRECVCOMPLETE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
 fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
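
/* The *DerivedDTRecvComplete handlers below run once the dtype_info and
 * dataloop describing a derived-datatype RMA operation have been received:
 * they rebuild the datatype locally (create_derived_datatype), point the
 * request at the target (or a temporary) buffer, reload the receive IOV,
 * and chain OnDataAvail to the matching *RecvComplete handler for the data
 * that follows.  (Added summary comment; behavior is unchanged.) */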

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
						   MPID_Request *rreq, 
						   int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Datatype *new_dtp = NULL;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTDERIVEDDTRECVCOMPLETE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTDERIVEDDTRECVCOMPLETE);

    /* create derived datatype */
    create_derived_datatype(rreq, &new_dtp);
    
    /* update request to get the data */
    MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_PUT_RESP);
    rreq->dev.datatype = new_dtp->handle;
    rreq->dev.recv_data_sz = new_dtp->size * rreq->dev.user_count; 
    
    rreq->dev.datatype_ptr = new_dtp;
    /* this will cause the datatype to be freed when the
       request is freed. free dtype_info here. */
    MPIU_Free(rreq->dev.dtype_info);
    
    rreq->dev.segment_ptr = MPID_Segment_alloc( );
    MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");

    MPID_Segment_init(rreq->dev.user_buf,
		      rreq->dev.user_count,
		      rreq->dev.datatype,
		      rreq->dev.segment_ptr, 0);
    rreq->dev.segment_first = 0;
    rreq->dev.segment_size = rreq->dev.recv_data_sz;
    
    mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
    if (mpi_errno != MPI_SUCCESS) {
	MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,
			    "**ch3|loadrecviov");
    }
    if (!rreq->dev.OnDataAvail) 
	rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_PutRecvComplete;

    *complete = FALSE;
 fn_fail:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTDERIVEDDTRECVCOMPLETE);
    return mpi_errno;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
						     MPID_Request *rreq, 
						     int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Datatype *new_dtp = NULL;
    MPI_Aint true_lb, true_extent, extent;
    void *tmp_buf;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMDERIVEDDTRECVCOMPLETE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMDERIVEDDTRECVCOMPLETE);

    /* create derived datatype */
    create_derived_datatype(rreq, &new_dtp);
    
    /* update new request to get the data */
    MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_ACCUM_RESP);
    
    /* first need to allocate tmp_buf to recv the data into */
    
    MPIR_Type_get_true_extent_impl(new_dtp->handle, &true_lb, &true_extent);
    MPID_Datatype_get_extent_macro(new_dtp->handle, extent); 
    
    tmp_buf = MPIU_Malloc(rreq->dev.user_count * 
			  (MPIR_MAX(extent,true_extent)));  
    if (!tmp_buf) {
	MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %d",
		    rreq->dev.user_count * MPIR_MAX(extent,true_extent));
    }
    
    /* adjust for potential negative lower bound in datatype */
    tmp_buf = (void *)((char*)tmp_buf - true_lb);
    
    rreq->dev.user_buf = tmp_buf;
    rreq->dev.final_user_buf = rreq->dev.user_buf;
    rreq->dev.datatype = new_dtp->handle;
    rreq->dev.recv_data_sz = new_dtp->size *
	rreq->dev.user_count; 
    rreq->dev.datatype_ptr = new_dtp;
    /* this will cause the datatype to be freed when the
       request is freed. free dtype_info here. */
    MPIU_Free(rreq->dev.dtype_info);
    
    rreq->dev.segment_ptr = MPID_Segment_alloc( );
    MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");

    MPID_Segment_init(rreq->dev.user_buf,
		      rreq->dev.user_count,
		      rreq->dev.datatype,
		      rreq->dev.segment_ptr, 0);
    rreq->dev.segment_first = 0;
    rreq->dev.segment_size = rreq->dev.recv_data_sz;
    
    mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
    if (mpi_errno != MPI_SUCCESS) {
	MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,
			    "**ch3|loadrecviov");
    }
    if (!rreq->dev.OnDataAvail)
	rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_AccumRecvComplete;

    *complete = FALSE;
 fn_fail:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMDERIVEDDTRECVCOMPLETE);
    return mpi_errno;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
                                                      MPID_Request *rreq,
                                                      int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Datatype *new_dtp = NULL;
    MPI_Aint true_lb, true_extent, extent;
    void *tmp_buf;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMDERIVEDDTRECVCOMPLETE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMDERIVEDDTRECVCOMPLETE);

    /* create derived datatype */
    create_derived_datatype(rreq, &new_dtp);

    /* update new request to get the data */
    MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_GET_ACCUM_RESP);

    /* first need to allocate tmp_buf to recv the data into */

    MPIR_Type_get_true_extent_impl(new_dtp->handle, &true_lb, &true_extent);
    MPID_Datatype_get_extent_macro(new_dtp->handle, extent);

    tmp_buf = MPIU_Malloc(rreq->dev.user_count *
			  (MPIR_MAX(extent,true_extent)));
    if (!tmp_buf) {
	MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %d",
		    rreq->dev.user_count * MPIR_MAX(extent,true_extent));
    }

    /* adjust for potential negative lower bound in datatype */
    tmp_buf = (void *)((char*)tmp_buf - true_lb);

    rreq->dev.user_buf = tmp_buf;
    rreq->dev.final_user_buf = rreq->dev.user_buf;
    rreq->dev.datatype = new_dtp->handle;
    rreq->dev.recv_data_sz = new_dtp->size *
	rreq->dev.user_count;
    rreq->dev.datatype_ptr = new_dtp;
    /* this will cause the datatype to be freed when the
       request is freed. free dtype_info here. */
    MPIU_Free(rreq->dev.dtype_info);

    rreq->dev.segment_ptr = MPID_Segment_alloc( );
    MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");

    MPID_Segment_init(rreq->dev.user_buf,
		      rreq->dev.user_count,
		      rreq->dev.datatype,
		      rreq->dev.segment_ptr, 0);
    rreq->dev.segment_first = 0;
    rreq->dev.segment_size = rreq->dev.recv_data_sz;

    mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
    if (mpi_errno != MPI_SUCCESS) {
	MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,
			    "**ch3|loadrecviov");
    }
    if (!rreq->dev.OnDataAvail)
	rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumRecvComplete;

    *complete = FALSE;
 fn_fail:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMDERIVEDDTRECVCOMPLETE);
    return mpi_errno;
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete( MPIDI_VC_t *vc,
						   MPID_Request *rreq, 
						   int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Datatype *new_dtp = NULL;
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_get_resp_t * get_resp_pkt = &upkt.get_resp;
    MPID_Request * sreq;
    MPID_Win *win_ptr;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_GETDERIVEDDTRECVCOMPLETE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_GETDERIVEDDTRECVCOMPLETE);

    MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);

    /* create derived datatype */
    create_derived_datatype(rreq, &new_dtp);
    MPIU_Free(rreq->dev.dtype_info);
    
    /* create request for sending data */
    sreq = MPID_Request_create();
    MPIU_ERR_CHKANDJUMP(sreq == NULL, mpi_errno,MPI_ERR_OTHER,"**nomemreq");

    sreq->kind = MPID_REQUEST_SEND;
    MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_GET_RESP);
    sreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GetSendComplete;
    sreq->dev.OnFinal     = MPIDI_CH3_ReqHandler_GetSendComplete;
    sreq->dev.user_buf = rreq->dev.user_buf;
    sreq->dev.user_count = rreq->dev.user_count;
    sreq->dev.datatype = new_dtp->handle;
    sreq->dev.datatype_ptr = new_dtp;
    sreq->dev.target_win_handle = rreq->dev.target_win_handle;
    sreq->dev.source_win_handle = rreq->dev.source_win_handle;
    sreq->dev.flags = rreq->dev.flags;

    MPIDI_Pkt_init(get_resp_pkt, MPIDI_CH3_PKT_GET_RESP);
    get_resp_pkt->request_handle = rreq->dev.request_handle;
    get_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
    get_resp_pkt->source_win_handle = rreq->dev.source_win_handle;
    get_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK)
        get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
        get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK)
        get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_ACK;
    get_resp_pkt->immed_len = 0;

    sreq->dev.segment_ptr = MPID_Segment_alloc( );
    MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");

    MPID_Segment_init(sreq->dev.user_buf,
		      sreq->dev.user_count,
		      sreq->dev.datatype,
		      sreq->dev.segment_ptr, 0);
    sreq->dev.segment_first = 0;
    sreq->dev.segment_size = new_dtp->size * sreq->dev.user_count;

    /* Because this is in a packet handler, it is already within a critical section */
    /* MPIU_THREAD_CS_ENTER(CH3COMM,vc); */
    mpi_errno = vc->sendNoncontig_fn(vc, sreq, get_resp_pkt, sizeof(*get_resp_pkt));
    /* MPIU_THREAD_CS_EXIT(CH3COMM,vc); */

    /* --BEGIN ERROR HANDLING-- */
    if (mpi_errno != MPI_SUCCESS)
    {
        MPID_Request_release(sreq);
        sreq = NULL;
        MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|rmamsg");
    }
    /* --END ERROR HANDLING-- */

    /* mark receive data transfer as complete and decrement CC in receive 
       request */
    MPIDI_CH3U_Request_complete(rreq);
    *complete = TRUE;
    
 fn_fail:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_GETDERIVEDDTRECVCOMPLETE);
    return mpi_errno;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_UnpackUEBufComplete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_UnpackUEBufComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
					      MPID_Request *rreq, 
					      int *complete )
{
    int recv_pending;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKUEBUFCOMPLETE);
    
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKUEBUFCOMPLETE);
    
    MPIDI_Request_decr_pending(rreq);
    MPIDI_Request_check_pending(rreq, &recv_pending);

    if (!recv_pending)
    { 
	if (rreq->dev.recv_data_sz > 0)
	{
	    MPIDI_CH3U_Request_unpack_uebuf(rreq);
	    MPIU_Free(rreq->dev.tmpbuf);
	}
    }
    else
    {
	/* The receive has not been posted yet.  MPID_{Recv/Irecv}() 
	   is responsible for unpacking the buffer. */
    }
    
    /* mark data transfer as complete and decrement CC */
    MPIDI_CH3U_Request_complete(rreq);
    *complete = TRUE;
    
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKUEBUFCOMPLETE);
    return MPI_SUCCESS;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_UnpackSRBufComplete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_UnpackSRBufComplete( MPIDI_VC_t *vc, 
					      MPID_Request *rreq, 
					      int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKSRBUFCOMPLETE);
    
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKSRBUFCOMPLETE);

    MPIDI_CH3U_Request_unpack_srbuf(rreq);

    if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_PUT_RESP)
    {
	mpi_errno = MPIDI_CH3_ReqHandler_PutRecvComplete(
	    vc, rreq, complete );
    }
    else if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RESP)
    {
	mpi_errno = MPIDI_CH3_ReqHandler_AccumRecvComplete(
	    vc, rreq, complete );
    }
    else if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_GET_ACCUM_RESP)
    {
	mpi_errno = MPIDI_CH3_ReqHandler_GaccumRecvComplete(
	    vc, rreq, complete );
    }
    else {
	/* mark data transfer as complete and decrement CC */
	MPIDI_CH3U_Request_complete(rreq);
	*complete = TRUE;
    }

    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKSRBUFCOMPLETE);
    return mpi_errno;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_UnpackSRBufReloadIOV
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_UnpackSRBufReloadIOV( MPIDI_VC_t *vc ATTRIBUTE((unused)),
					      MPID_Request *rreq, 
					      int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKSRBUFRELOADIOV);
    
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKSRBUFRELOADIOV);

    MPIDI_CH3U_Request_unpack_srbuf(rreq);
    mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
    if (mpi_errno != MPI_SUCCESS) {
	MPIU_ERR_SETFATALANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|loadrecviov" );
    }
    *complete = FALSE;
 fn_fail:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKSRBUFRELOADIOV);
    return mpi_errno;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_ReloadIOV
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_ReloadIOV( MPIDI_VC_t *vc ATTRIBUTE((unused)),
				    MPID_Request *rreq, int *complete )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_RELOADIOV);
    
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_RELOADIOV);

    mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
    if (mpi_errno != MPI_SUCCESS) {
	MPIU_ERR_SETFATALANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|loadrecviov");
    }
    *complete = FALSE;
 fn_fail:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_RELOADIOV);
    return mpi_errno;
}

/* ----------------------------------------------------------------------- */
/* ----------------------------------------------------------------------- */

#undef FUNCNAME
#define FUNCNAME create_derived_datatype
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static int create_derived_datatype(MPID_Request *req, MPID_Datatype **dtp)
{
    MPIDI_RMA_dtype_info *dtype_info;
    MPID_Datatype *new_dtp;
    int mpi_errno=MPI_SUCCESS;
    MPI_Aint ptrdiff;
    MPIDI_STATE_DECL(MPID_STATE_CREATE_DERIVED_DATATYPE);
    
    MPIDI_FUNC_ENTER(MPID_STATE_CREATE_DERIVED_DATATYPE);

    dtype_info = req->dev.dtype_info;

    /* allocate new datatype object and handle */
    new_dtp = (MPID_Datatype *) MPIU_Handle_obj_alloc(&MPID_Datatype_mem);
    if (!new_dtp) {
	MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %s",
			     "MPID_Datatype_mem" );
    }

    *dtp = new_dtp;
            
    /* Note: handle is filled in by MPIU_Handle_obj_alloc() */
    MPIU_Object_set_ref(new_dtp, 1);
    new_dtp->is_permanent = 0;
    new_dtp->is_committed = 1;
    new_dtp->attributes   = 0;
    new_dtp->cache_id     = 0;
    new_dtp->name[0]      = 0;
    new_dtp->is_contig = dtype_info->is_contig;
    new_dtp->max_contig_blocks = dtype_info->max_contig_blocks;
    new_dtp->size = dtype_info->size;
    new_dtp->extent = dtype_info->extent;
    new_dtp->dataloop_size = dtype_info->dataloop_size;
    new_dtp->dataloop_depth = dtype_info->dataloop_depth; 
    new_dtp->eltype = dtype_info->eltype;
    /* set dataloop pointer */
    new_dtp->dataloop = req->dev.dataloop;
    
    new_dtp->ub = dtype_info->ub;
    new_dtp->lb = dtype_info->lb;
    new_dtp->true_ub = dtype_info->true_ub;
    new_dtp->true_lb = dtype_info->true_lb;
    new_dtp->has_sticky_ub = dtype_info->has_sticky_ub;
    new_dtp->has_sticky_lb = dtype_info->has_sticky_lb;
    /* update pointers in dataloop */
    ptrdiff = (MPI_Aint)((char *) (new_dtp->dataloop) - (char *)
                         (dtype_info->dataloop));
    
    /* FIXME: Temp to avoid SEGV when memory tracing */
    new_dtp->hetero_dloop = 0;

    MPID_Dataloop_update(new_dtp->dataloop, ptrdiff);

    new_dtp->contents = NULL;

 fn_fail:
    MPIDI_FUNC_EXIT(MPID_STATE_CREATE_DERIVED_DATATYPE);

    return mpi_errno;
}
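
/* ----------------------------------------------------------------------- */
/* The perform_*_in_lock_queue() helpers below replay an RMA operation that
 * was queued while the target window lock was unavailable.  They are invoked
 * from MPIDI_CH3I_Release_lock() and from the piggyback-lock request handler
 * once MPIDI_CH3I_Try_acquire_win_lock() succeeds.  (Added summary comment.) */
/* ----------------------------------------------------------------------- */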

static inline int perform_put_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
{
    MPIDI_CH3_Pkt_put_t *put_pkt = &((lock_entry->pkt).put);
    MPIDI_VC_t *vc = NULL;
    int mpi_errno = MPI_SUCCESS;

    if (lock_entry->data == NULL) {
        /* all data fits in packet header */
        mpi_errno = MPIR_Localcopy(put_pkt->data, put_pkt->count, put_pkt->datatype,
                                   put_pkt->addr, put_pkt->count, put_pkt->datatype);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
    }
    else {
        mpi_errno = MPIR_Localcopy(lock_entry->data, put_pkt->count, put_pkt->datatype,
                                   put_pkt->addr, put_pkt->count, put_pkt->datatype);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
    }

    /* get vc object */
    MPIDI_Comm_get_vc(win_ptr->comm_ptr, put_pkt->origin_rank, &vc);

    /* do final action */
    mpi_errno = finish_op_on_target(win_ptr, vc, MPIDI_CH3_PKT_PUT,
                                    put_pkt->flags, put_pkt->source_win_handle);
    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}

static inline int perform_get_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
{
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_get_resp_t *get_resp_pkt = &upkt.get_resp;
    MPIDI_CH3_Pkt_get_t *get_pkt = &((lock_entry->pkt).get);
    MPID_Request *sreq = NULL;
    MPIDI_VC_t *vc = NULL;
    MPI_Aint type_size;
    size_t len;
    int iovcnt;
    MPID_IOV iov[MPID_IOV_LIMIT];
    int mpi_errno = MPI_SUCCESS;

    sreq = MPID_Request_create();
    if (sreq == NULL) {
        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomemreq");
    }
    MPIU_Object_set_ref(sreq, 1);

    MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_GET_RESP);
    sreq->kind = MPID_REQUEST_SEND;
    sreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GetSendComplete;
    sreq->dev.OnFinal = MPIDI_CH3_ReqHandler_GetSendComplete;

    sreq->dev.target_win_handle = win_ptr->handle;
    sreq->dev.flags = get_pkt->flags;

    /* here we increment the Active Target counter to guarantee that GET-like
       operations are completed when the counter reaches zero. */
    win_ptr->at_completion_counter++;

    MPIDI_Pkt_init(get_resp_pkt, MPIDI_CH3_PKT_GET_RESP);
    get_resp_pkt->request_handle = get_pkt->request_handle;
    get_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
    if (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK)
        get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
    if (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
        get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
    if (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK)
        get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_ACK;
    get_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
    get_resp_pkt->source_win_handle = get_pkt->source_win_handle;
    get_resp_pkt->immed_len = 0;

    /* length of target data */
    MPID_Datatype_get_size_macro(get_pkt->datatype, type_size);
    MPIU_Assign_trunc(len, get_pkt->count * type_size, size_t);

    /* both origin buffer and target buffer are basic datatype,
       fill IMMED data area in response packet header. */
    if (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
        /* Try to copy target data into packet header. */
        MPIU_Assign_trunc(get_resp_pkt->immed_len,
                          MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / type_size) * type_size),
                          int);

        if (get_resp_pkt->immed_len > 0) {
            void *src = get_pkt->addr;
            void *dest = (void*) get_resp_pkt->data;
            /* copy data from origin buffer to immed area in packet header */
            mpi_errno = immed_copy(src, dest, (size_t)get_resp_pkt->immed_len);
            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
        }
    }

    if (len == (size_t)get_resp_pkt->immed_len) {
        /* All origin data is in packet header, issue the header. */
        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_resp_pkt;
        iov[0].MPID_IOV_LEN = sizeof(*get_resp_pkt);
        iovcnt = 1;
    }
    else {
        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_resp_pkt;
        iov[0].MPID_IOV_LEN = sizeof(*get_resp_pkt);
        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *)get_pkt->addr + get_resp_pkt->immed_len);
        iov[1].MPID_IOV_LEN = get_pkt->count * type_size - get_resp_pkt->immed_len;
        iovcnt = 2;
    }

    /* get vc object */
    MPIDI_Comm_get_vc(win_ptr->comm_ptr, get_pkt->origin_rank, &vc);

    mpi_errno = MPIDI_CH3_iSendv(vc, sreq, iov, iovcnt);
    if (mpi_errno != MPI_SUCCESS) {
        MPID_Request_release(sreq);
	MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
    }

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}


static inline int perform_acc_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
{
    MPIDI_CH3_Pkt_accum_t *acc_pkt = &((lock_entry->pkt).accum);
    MPIDI_VC_t *vc = NULL;
    int mpi_errno = MPI_SUCCESS;

    MPIU_Assert(lock_entry->all_data_recved == 1);

    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);

    if (lock_entry->data == NULL) {
        /* All data fits in packet header */
        mpi_errno = do_accumulate_op(acc_pkt->data, acc_pkt->addr,
                                     acc_pkt->count, acc_pkt->datatype, acc_pkt->op);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
    }
    else {
        mpi_errno = do_accumulate_op(lock_entry->data, acc_pkt->addr,
                                     acc_pkt->count, acc_pkt->datatype, acc_pkt->op);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
    }

    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);

    /* get vc object */
    MPIDI_Comm_get_vc(win_ptr->comm_ptr, acc_pkt->origin_rank, &vc);

    mpi_errno = finish_op_on_target(win_ptr, vc, MPIDI_CH3_PKT_ACCUMULATE,
                                    acc_pkt->flags, acc_pkt->source_win_handle);
    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}


static inline int perform_get_acc_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
{
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_get_accum_resp_t *get_accum_resp_pkt = &upkt.get_accum_resp;
    MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt = &((lock_entry->pkt).get_accum);
    MPID_Request *sreq = NULL;
    MPIDI_VC_t *vc = NULL;
    MPI_Aint type_size;
    size_t len;
    int iovcnt;
    MPID_IOV iov[MPID_IOV_LIMIT];
    int mpi_errno = MPI_SUCCESS;

    sreq = MPID_Request_create();
    if (sreq == NULL) {
        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomemreq");
    }
    MPIU_Object_set_ref(sreq, 1);

    MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_GET_ACCUM_RESP);
    sreq->kind = MPID_REQUEST_SEND;
    sreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumSendComplete;
    sreq->dev.OnFinal = MPIDI_CH3_ReqHandler_GaccumSendComplete;

    sreq->dev.target_win_handle = win_ptr->handle;
    sreq->dev.flags = get_accum_pkt->flags;

    /* Copy data into a temporary buffer */
    MPID_Datatype_get_size_macro(get_accum_pkt->datatype, type_size);
    sreq->dev.user_buf = (void *)MPIU_Malloc(get_accum_pkt->count * type_size);

    if (MPIR_DATATYPE_IS_PREDEFINED(get_accum_pkt->datatype)) {
        MPIU_Memcpy(sreq->dev.user_buf, get_accum_pkt->addr,
                    get_accum_pkt->count * type_size);
    } else {
        MPID_Segment *seg = MPID_Segment_alloc();
        MPI_Aint last = type_size * get_accum_pkt->count;

        MPIU_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment");
        MPID_Segment_init(get_accum_pkt->addr, get_accum_pkt->count,
                          get_accum_pkt->datatype, seg, 0);
        MPID_Segment_pack(seg, 0, &last, sreq->dev.user_buf);
        MPID_Segment_free(seg);
    }

    /* here we increment the Active Target counter to guarantee that GET-like
       operations are completed when the counter reaches zero. */
    win_ptr->at_completion_counter++;

    MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP);
    get_accum_resp_pkt->request_handle = get_accum_pkt->request_handle;
    get_accum_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
    if (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK)
        get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
    if (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
        get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
    if (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK)
        get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_ACK;
    get_accum_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
    get_accum_resp_pkt->source_win_handle = get_accum_pkt->source_win_handle;
    get_accum_resp_pkt->immed_len = 0;

    /* length of target data */
    MPIU_Assign_trunc(len, get_accum_pkt->count * type_size, size_t);

    /* both origin buffer and target buffer are basic datatype,
       fill IMMED data area in response packet header. */
    if (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
        /* Try to copy target data into packet header. */
        MPIU_Assign_trunc(get_accum_resp_pkt->immed_len,
                          MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / type_size) * type_size),
                          int);

        if (get_accum_resp_pkt->immed_len > 0) {
            void *src = sreq->dev.user_buf;
            void *dest = (void*) get_accum_resp_pkt->data;
            /* copy data from origin buffer to immed area in packet header */
            mpi_errno = immed_copy(src, dest, (size_t)get_accum_resp_pkt->immed_len);
            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
        }
    }

    if (len == (size_t)get_accum_resp_pkt->immed_len) {
        /* All origin data is in packet header, issue the header. */
        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
        iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
        iovcnt = 1;
    }
    else {
        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
        iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *)sreq->dev.user_buf + get_accum_resp_pkt->immed_len);
        iov[1].MPID_IOV_LEN = get_accum_pkt->count * type_size - get_accum_resp_pkt->immed_len;
        iovcnt = 2;
    }

    /* get vc object */
    MPIDI_Comm_get_vc(win_ptr->comm_ptr, get_accum_pkt->origin_rank, &vc);

    mpi_errno = MPIDI_CH3_iSendv(vc, sreq, iov, iovcnt);
    if (mpi_errno != MPI_SUCCESS) {
        MPID_Request_release(sreq);
	MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
    }

    /* Perform ACCUMULATE OP */
    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);

    if (lock_entry->data == NULL) {
        /* All data fits in packet header */
        mpi_errno = do_accumulate_op(get_accum_pkt->data, get_accum_pkt->addr,
                                     get_accum_pkt->count, get_accum_pkt->datatype, get_accum_pkt->op);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
    }
    else {
        mpi_errno = do_accumulate_op(lock_entry->data, get_accum_pkt->addr,
                                     get_accum_pkt->count, get_accum_pkt->datatype, get_accum_pkt->op);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
    }

    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}


static inline int perform_fop_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
{
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_fop_resp_t *fop_resp_pkt = &upkt.fop_resp;
    MPIDI_CH3_Pkt_fop_t *fop_pkt = &((lock_entry->pkt).fop);
    MPID_Request *resp_req = NULL;
    MPIDI_VC_t *vc = NULL;
    int mpi_errno = MPI_SUCCESS;

    /* FIXME: this function is essentially the same as PktHandler_FOP();
       the two should be refactored to share code. */

    /* get vc object */
    MPIDI_Comm_get_vc(win_ptr->comm_ptr, fop_pkt->origin_rank, &vc);

    MPIDI_Pkt_init(fop_resp_pkt, MPIDI_CH3_PKT_FOP_RESP);
    fop_resp_pkt->request_handle = fop_pkt->request_handle;
    fop_resp_pkt->source_win_handle = fop_pkt->source_win_handle;
    fop_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
    fop_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
    if (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK)
        fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
    if (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
        fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
    if (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK)
        fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_ACK;
    fop_resp_pkt->immed_len = fop_pkt->immed_len;

    /* copy data to resp pkt header */
    void *src = fop_pkt->addr, *dest = fop_resp_pkt->data;
    mpi_errno = immed_copy(src, dest, (size_t)fop_resp_pkt->immed_len);
    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

    /* Apply the op */
    if (fop_pkt->op != MPI_NO_OP) {
        MPI_User_function *uop = MPIR_OP_HDL_TO_FN(fop_pkt->op);
        int one = 1;
        if (win_ptr->shm_allocated == TRUE)
            MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
        (*uop)(fop_pkt->data, fop_pkt->addr, &one, &(fop_pkt->datatype));
        if (win_ptr->shm_allocated == TRUE)
            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
    }

    /* send back the original data */
    MPIU_THREAD_CS_ENTER(CH3COMM,vc);
    mpi_errno = MPIDI_CH3_iStartMsg(vc, fop_resp_pkt, sizeof(*fop_resp_pkt), &resp_req);
    MPIU_THREAD_CS_EXIT(CH3COMM,vc);
    MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");

    if (resp_req != NULL) {
        if (!MPID_Request_is_complete(resp_req)) {
            /* sending process is not completed, set proper OnDataAvail
               (it is initialized to NULL by lower layer) */
            resp_req->dev.target_win_handle = fop_pkt->target_win_handle;
            resp_req->dev.flags = fop_pkt->flags;
            resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_FOPSendComplete;

            /* here we increment the Active Target counter to guarantee that
               GET-like operations are completed when the counter reaches zero. */
            win_ptr->at_completion_counter++;

            MPID_Request_release(resp_req);
            goto fn_exit;
        }
        else {
            MPID_Request_release(resp_req);
        }
    }

    /* do final action */
    mpi_errno = finish_op_on_target(win_ptr, vc, MPIDI_CH3_PKT_FOP,
                                    fop_pkt->flags, fop_pkt->source_win_handle);
    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}


static inline int perform_cas_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
{
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_cas_resp_t *cas_resp_pkt = &upkt.cas_resp;
    MPIDI_CH3_Pkt_cas_t *cas_pkt = &((lock_entry->pkt).cas);
    MPID_Request *send_req = NULL;
1217
    MPIDI_VC_t *vc = NULL;
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
    MPI_Aint len;
    int mpi_errno = MPI_SUCCESS;

    /* get vc object */
    MPIDI_Comm_get_vc(win_ptr->comm_ptr, cas_pkt->origin_rank, &vc);

    MPIDI_Pkt_init(cas_resp_pkt, MPIDI_CH3_PKT_CAS_RESP);
    cas_resp_pkt->request_handle = cas_pkt->request_handle;
    cas_resp_pkt->source_win_handle = cas_pkt->source_win_handle;
    cas_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
    cas_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
    if (cas_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK)
        cas_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
    if (cas_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
        cas_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
    if (cas_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK)
        cas_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_ACK;

    /* Copy old value into the response packet */
    MPID_Datatype_get_size_macro(cas_pkt->datatype, len);
    MPIU_Assert(len <= sizeof(MPIDI_CH3_CAS_Immed_u));

    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);

    MPIU_Memcpy((void *) &cas_resp_pkt->data, cas_pkt->addr, len);

    /* Compare and replace if equal */
    if (MPIR_Compare_equal(&cas_pkt->compare_data, cas_pkt->addr, cas_pkt->datatype)) {
        MPIU_Memcpy(cas_pkt->addr, &cas_pkt->origin_data, len);
    }

    if (win_ptr->shm_allocated == TRUE)
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);

    /* Send the response packet */
    MPIU_THREAD_CS_ENTER(CH3COMM, vc);
    mpi_errno = MPIDI_CH3_iStartMsg(vc, cas_resp_pkt, sizeof(*cas_resp_pkt), &send_req);
    MPIU_THREAD_CS_EXIT(CH3COMM, vc);

    MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");

    if (send_req != NULL) {
        if (!MPID_Request_is_complete(send_req)) {
            /* sending process is not completed, set proper OnDataAvail
               (it is initialized to NULL by lower layer) */
            send_req->dev.target_win_handle = cas_pkt->target_win_handle;
            send_req->dev.flags = cas_pkt->flags;
            send_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_CASSendComplete;

            /* here we increment the Active Target counter to guarantee that
               GET-like operations are completed when the counter reaches zero. */
            win_ptr->at_completion_counter++;

            MPID_Request_release(send_req);
            goto fn_exit;
        }
        else
            MPID_Request_release(send_req);
    }

    /* do final action */
    mpi_errno = finish_op_on_target(win_ptr, vc, MPIDI_CH3_PKT_CAS,
                                    cas_pkt->flags, cas_pkt->source_win_handle);
    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}


static inline int perform_op_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
{
    int mpi_errno = MPI_SUCCESS;

    if (lock_entry->pkt.type == MPIDI_CH3_PKT_LOCK) {

        /* single LOCK request */

        MPIDI_CH3_Pkt_lock_t *lock_pkt = &(lock_entry->pkt.lock);
        if (lock_pkt->origin_rank == win_ptr->comm_ptr->rank) {
            mpi_errno = set_lock_sync_counter(win_ptr, lock_pkt->origin_rank);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        }
        else {
            MPIDI_VC_t *vc = NULL;
            MPIDI_Comm_get_vc_set_active(win_ptr->comm_ptr,
                                         lock_pkt->origin_rank, &vc);
            mpi_errno = MPIDI_CH3I_Send_lock_ack_pkt(vc, win_ptr,
                                                     MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED,
                                                     lock_pkt->source_win_handle);
            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
        }
    }
    else {
        /* LOCK+OP packet */
        switch(lock_entry->pkt.type) {
        case (MPIDI_CH3_PKT_PUT):
            mpi_errno = perform_put_in_lock_queue(win_ptr, lock_entry);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            break;
        case (MPIDI_CH3_PKT_GET):
            mpi_errno = perform_get_in_lock_queue(win_ptr, lock_entry);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            break;
        case (MPIDI_CH3_PKT_ACCUMULATE):
            mpi_errno = perform_acc_in_lock_queue(win_ptr, lock_entry);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            break;
        case (MPIDI_CH3_PKT_GET_ACCUM):
            mpi_errno = perform_get_acc_in_lock_queue(win_ptr, lock_entry);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            break;
        case (MPIDI_CH3_PKT_FOP):
            mpi_errno = perform_fop_in_lock_queue(win_ptr, lock_entry);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            break;
        case (MPIDI_CH3_PKT_CAS):
            mpi_errno = perform_cas_in_lock_queue(win_ptr, lock_entry);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            break;
        default:
            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER,
                                 "**invalidpkt", "**invalidpkt %d", lock_entry->pkt.type);
        }
    }

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}


static int entered_flag = 0;
static int entered_count = 0;
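/* entered_flag marks that MPIDI_CH3I_Release_lock() is currently active so a
   recursive call can detect re-entry; entered_count counts those re-entries
   so the outermost invocation knows to rescan the lock queue (see the comment
   inside the function below). */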

/* Release the current lock on the window and grant the next lock in the
   queue if any */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Release_lock
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3I_Release_lock(MPID_Win *win_ptr)
{
    MPIDI_RMA_Lock_entry_t *lock_entry, *lock_entry_next;
    int requested_lock, mpi_errno = MPI_SUCCESS, temp_entered_count;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_RELEASE_LOCK);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_RELEASE_LOCK);

    if (win_ptr->current_lock_type == MPI_LOCK_SHARED) {
        /* decr ref cnt */
        /* FIXME: MT: Must be done atomically */
        win_ptr->shared_lock_ref_cnt--;
    }

    /* If shared lock ref count is 0 (which is also true if the lock is an
       exclusive lock), release the lock. */
    if (win_ptr->shared_lock_ref_cnt == 0) {

	/* This function needs to be reentrant even in the single-threaded case
           because when going through the lock queue, pkt_handler() in
           perform_op_in_lock_queue() may again call release_lock(). To handle
           this possibility, we use an entered_flag.
           If the flag is not 0, we simply increment the entered_count and return.
           The loop through the lock queue is repeated if the entered_count has
           changed while we are in the loop.
	 */
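	/* Illustrative call chain for the re-entrant case (added for clarity):
	 *   MPIDI_CH3I_Release_lock()
	 *     -> perform_op_in_lock_queue()      (a queued op, e.g. a PUT)
	 *       -> finish_op_on_target() / packet handler
	 *         -> MPIDI_CH3I_Release_lock()   (re-entered; only bumps entered_count)
	 */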
	if (entered_flag != 0) {
	    entered_count++; /* Count how many times we re-enter */
	    goto fn_exit;
	}

        entered_flag = 1;  /* Mark that we are now entering release_lock() */
        temp_entered_count = entered_count;

	do {
	    if (temp_entered_count != entered_count) temp_entered_count++;

	    /* FIXME: MT: The setting of the lock type must be done atomically */
	    win_ptr->current_lock_type = MPID_LOCK_NONE;

	    /* If there is a lock queue, try to satisfy as many lock requests as 
	       possible. If the first one is a shared lock, grant it and grant all 
	       other shared locks. If the first one is an exclusive lock, grant 
	       only that one. */

	    /* FIXME: MT: All queue accesses need to be made atomic */
            lock_entry = (MPIDI_RMA_Lock_entry_t *) win_ptr->lock_queue;
            while (lock_entry) {
                lock_entry_next = lock_entry->next;

                if (lock_entry->all_data_recved) {
                MPIDI_CH3_PKT_RMA_GET_LOCK_TYPE(lock_entry->pkt, requested_lock, mpi_errno);
                if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, requested_lock) == 1) {
                    /* perform this OP */

                    mpi_errno = perform_op_in_lock_queue(win_ptr, lock_entry);
                    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

                    /* dequeue entry from lock queue */
                    MPL_LL_DELETE(win_ptr->lock_queue, win_ptr->lock_queue_tail, lock_entry);

                    /* free this entry */
                    mpi_errno = MPIDI_CH3I_Win_lock_entry_free(win_ptr, lock_entry);
                    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

                    /* if the granted lock is exclusive,
                       no need to continue */
                    if (requested_lock == MPI_LOCK_EXCLUSIVE)
                        break;
                }
                }
                lock_entry = lock_entry_next;
	    }
	} while (temp_entered_count != entered_count);

	entered_count = entered_flag = 0;
    }

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_RELEASE_LOCK);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}



#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_ReqHandler_PiggybackLockOpRecvComplete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3_ReqHandler_PiggybackLockOpRecvComplete( MPIDI_VC_t *vc,
                                                      MPID_Request *rreq,
                                                      int *complete )
{
    int requested_lock;
    MPI_Win target_win_handle;
    MPID_Win *win_ptr = NULL;
    MPIDI_RMA_Lock_entry_t *lock_queue_entry = rreq->dev.lock_queue_entry;
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_PIGGYBACKLOCKOPRECVCOMPLETE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_PIGGYBACKLOCKOPRECVCOMPLETE);

    /* This handler is triggered when we received all data of a lock queue
       entry */

    MPIU_Assert(lock_queue_entry != NULL);

    /* Mark all data received in lock queue entry */
    lock_queue_entry->all_data_recved = 1;

    /* try to acquire the lock here */
    MPIDI_CH3_PKT_RMA_GET_LOCK_TYPE(lock_queue_entry->pkt, requested_lock, mpi_errno);
    MPIDI_CH3_PKT_RMA_GET_TARGET_WIN_HANDLE(lock_queue_entry->pkt, target_win_handle, mpi_errno);
    MPID_Win_get_ptr(target_win_handle, win_ptr);

    if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, requested_lock) == 1) {
        /* perform this OP */
        mpi_errno = perform_op_in_lock_queue(win_ptr, lock_queue_entry);
        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);

        /* dequeue entry from lock queue */
        MPL_LL_DELETE(win_ptr->lock_queue, win_ptr->lock_queue_tail, lock_queue_entry);