ch3u_rma_sync.c 148 KB
Newer Older
1
2
3
4
5
6
7
8
9
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidimpl.h"
#include "mpidrma.h"

10
11
12
13
14
15
16
#ifdef USE_MPIU_INSTR
/* Instrumentation variables for the RMA synchronization routines.  Each
   duration/counter declared here is initialized in MPIDI_CH3_RMA_InitInstr. */

/* MPI_Win_fence */
MPIU_INSTR_DURATION_DECL(winfence_clearlock);
MPIU_INSTR_DURATION_DECL(winfence_rs);
MPIU_INSTR_DURATION_DECL(winfence_issue);
MPIU_INSTR_DURATION_DECL(winfence_complete);
MPIU_INSTR_DURATION_DECL(winfence_wait);
MPIU_INSTR_DURATION_DECL(winfence_block);

/* Pending-request counters */
MPIU_INSTR_COUNTER_DECL(winfence_reqs);
MPIU_INSTR_COUNTER_DECL(winunlock_reqs);
MPIU_INSTR_COUNTER_DECL(wincomplete_reqs);

/* MPI_Win_post / start / complete */
MPIU_INSTR_DURATION_DECL(winpost_clearlock);
MPIU_INSTR_DURATION_DECL(winpost_sendsync);
MPIU_INSTR_DURATION_DECL(winstart_clearlock);
MPIU_INSTR_DURATION_DECL(wincomplete_issue);
MPIU_INSTR_DURATION_DECL(wincomplete_complete);
MPIU_INSTR_DURATION_DECL(wincomplete_recvsync);
MPIU_INSTR_DURATION_DECL(wincomplete_block);

/* MPI_Win_wait / lock / unlock */
MPIU_INSTR_DURATION_DECL(winwait_wait);
MPIU_INSTR_DURATION_DECL(winlock_getlocallock);
MPIU_INSTR_DURATION_DECL(winunlock_getlock);
MPIU_INSTR_DURATION_DECL(winunlock_issue);
MPIU_INSTR_DURATION_DECL(winunlock_complete);
MPIU_INSTR_DURATION_DECL(winunlock_block);

/* Lock queue and packet handlers */
MPIU_INSTR_DURATION_DECL(lockqueue_alloc);
MPIU_INSTR_DURATION_DECL(rmapkt_acc);
MPIU_INSTR_DURATION_DECL(rmapkt_acc_predef);
MPIU_INSTR_DURATION_DECL(rmapkt_acc_immed);
MPIU_INSTR_DURATION_DECL(rmapkt_acc_immed_op);
MPIU_INSTR_DURATION_DECL(rmapkt_cas);
MPIU_INSTR_DURATION_DECL(rmapkt_fop);

/* Declared extern here; not initialized in this section of the file */
MPIU_INSTR_DURATION_EXTERN_DECL(rmaqueue_alloc);
MPIU_INSTR_DURATION_EXTERN_DECL(rmaqueue_set);

void MPIDI_CH3_RMA_InitInstr(void);

void MPIDI_CH3_RMA_InitInstr(void)
{
    MPIU_INSTR_DURATION_INIT(lockqueue_alloc,0,"Allocate Lock Queue element");
    MPIU_INSTR_DURATION_INIT(winfence_clearlock,1,"WIN_FENCE:Clear prior lock");
    MPIU_INSTR_DURATION_INIT(winfence_rs,0,"WIN_FENCE:ReduceScatterBlock");
    MPIU_INSTR_DURATION_INIT(winfence_issue,2,"WIN_FENCE:Issue RMA ops");
    MPIU_INSTR_DURATION_INIT(winfence_complete,1,"WIN_FENCE:Complete RMA ops");
    MPIU_INSTR_DURATION_INIT(winfence_wait,1,"WIN_FENCE:Wait for ops from other processes");
    MPIU_INSTR_DURATION_INIT(winfence_block,0,"WIN_FENCE:Wait for any progress");
53
54
55
    MPIU_INSTR_COUNTER_INIT(winfence_reqs,"WIN_FENCE:Pending requests");
    MPIU_INSTR_COUNTER_INIT(winunlock_reqs,"WIN_UNLOCK:Pending requests");
    MPIU_INSTR_COUNTER_INIT(wincomplete_reqs,"WIN_COMPLETE:Pending requests");
56
57
58
59
60
61
    MPIU_INSTR_DURATION_INIT(winpost_clearlock,1,"WIN_POST:Clear prior lock");
    MPIU_INSTR_DURATION_INIT(winpost_sendsync,1,"WIN_POST:Senc sync messages");
    MPIU_INSTR_DURATION_INIT(winstart_clearlock,1,"WIN_START:Clear prior lock");
    MPIU_INSTR_DURATION_INIT(wincomplete_recvsync,1,"WIN_COMPLETE:Recv sync messages");
    MPIU_INSTR_DURATION_INIT(wincomplete_issue,2,"WIN_COMPLETE:Issue RMA ops");
    MPIU_INSTR_DURATION_INIT(wincomplete_complete,1,"WIN_COMPLETE:Complete RMA ops");
62
    MPIU_INSTR_DURATION_INIT(wincomplete_block,0,"WIN_COMPLETE:Wait for any progress");
63
64
65
66
67
    MPIU_INSTR_DURATION_INIT(winwait_wait,1,"WIN_WAIT:Wait for ops from other processes");
    MPIU_INSTR_DURATION_INIT(winlock_getlocallock,0,"WIN_LOCK:Get local lock");
    MPIU_INSTR_DURATION_INIT(winunlock_issue,2,"WIN_UNLOCK:Issue RMA ops");
    MPIU_INSTR_DURATION_INIT(winunlock_complete,1,"WIN_UNLOCK:Complete RMA ops");
    MPIU_INSTR_DURATION_INIT(winunlock_getlock,0,"WIN_UNLOCK:Acquire lock");
68
    MPIU_INSTR_DURATION_INIT(winunlock_block,0,"WIN_UNLOCK:Wait for any progress");
69
70
71
    MPIU_INSTR_DURATION_INIT(rmapkt_acc,0,"RMA:PKTHANDLER for Accumulate");
    MPIU_INSTR_DURATION_INIT(rmapkt_acc_predef,0,"RMA:PKTHANDLER for Accumulate: predef dtype");
    MPIU_INSTR_DURATION_INIT(rmapkt_acc_immed,0,"RMA:PKTHANDLER for Accum immed");
72
    MPIU_INSTR_DURATION_INIT(rmapkt_acc_immed_op,0,"RMA:PKTHANDLER for Accum immed operation");
73
    MPIU_INSTR_DURATION_INIT(rmapkt_cas,0,"RMA:PKTHANDLER for Compare-and-swap");
74
    MPIU_INSTR_DURATION_INIT(rmapkt_fop,0,"RMA:PKTHANDLER for Fetch-and-op");
75
}
76
77
78
79
80
81
82
83
84
85

/* These pointers let a single common routine complete lists of RMA
   operations while still attributing the collected timing data to the
   synchronization mode that invoked it.  The caller points them at its own
   duration variables immediately before calling the completion routine
   (MPIDI_Win_fence, for example, sets them to winfence_complete and
   winfence_block).  This is not thread-safe; the best choice for
   thread-safety is to eliminate this ability to discriminate between the
   different types of RMA synchronization.
*/
static MPIU_INSTR_Duration_count *list_complete;  /* outer */
static MPIU_INSTR_Duration_count *list_block;     /* Inner; while waiting */

86
87
#endif

88
89
90
/*
 * These routines provide a default implementation of the MPI RMA operations
 * in terms of the low-level, two-sided channel operations.  A channel
 * may override these functions, on a per-window basis, by overriding
 * the MPID functions in the RMAFns section of the MPID_Win object.
 */

95
96
/* Message tag constant.  NOTE(review): presumably the tag for the post/start
   synchronization messages; its uses are outside this section -- confirm. */
#define SYNC_POST_TAG 100

97
98
99
100
101
102
103
104
105
106
static int MPIDI_CH3I_Send_rma_msg(MPIDI_RMA_ops * rma_op, MPID_Win * win_ptr, 
				   MPI_Win source_win_handle, 
				   MPI_Win target_win_handle, 
				   MPIDI_RMA_dtype_info * dtype_info, 
				   void ** dataloop, MPID_Request ** request);
static int MPIDI_CH3I_Recv_rma_msg(MPIDI_RMA_ops * rma_op, MPID_Win * win_ptr, 
				   MPI_Win source_win_handle, 
				   MPI_Win target_win_handle, 
				   MPIDI_RMA_dtype_info * dtype_info, 
				   void ** dataloop, MPID_Request ** request); 
107
108
static int MPIDI_CH3I_Send_contig_acc_msg(MPIDI_RMA_ops *, MPID_Win *,
					  MPI_Win, MPI_Win, MPID_Request ** );
109
110
static int MPIDI_CH3I_Send_immed_rmw_msg(MPIDI_RMA_ops *, MPID_Win *,
                                         MPI_Win, MPI_Win, MPID_Request ** );
111
112
113
114
115
116
static int MPIDI_CH3I_Do_passive_target_rma(MPID_Win *, int *);
static int MPIDI_CH3I_Send_lock_put_or_acc(MPID_Win *);
static int MPIDI_CH3I_Send_lock_get(MPID_Win *);
static int MPIDI_CH3I_RMAListComplete(MPID_Win *);
static int MPIDI_CH3I_RMAListPartialComplete( MPID_Win *, MPIDI_RMA_ops *, 
					      int * );
117

118
119
static int create_datatype(const MPIDI_RMA_dtype_info *dtype_info,
                           const void *dataloop, MPI_Aint dataloop_sz,
120
121
                           const void *o_addr, int o_count,
			   MPI_Datatype o_datatype,
122
                           MPID_Datatype **combined_dtp);
123
124
125
126
127
128
129
130

#undef FUNCNAME
#define FUNCNAME MPIDI_Win_fence
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * MPIDI_Win_fence - default fence synchronization for a window.
 *
 * assert  - bitwise OR of MPI_MODE_* assertions; this routine examines
 *           MPI_MODE_NOPRECEDE and MPI_MODE_NOSUCCEED.
 * win_ptr - window being synchronized.
 *
 * Steps:
 *   1. If the window still holds a passive-target lock, poke the progress
 *      engine until the lock is released.
 *   2. With MPI_MODE_NOPRECEDE, or when fence_cnt is 0, only fence_cnt is
 *      updated and the routine returns.
 *   3. Otherwise every operation queued on win_ptr->rma_ops_list_head is
 *      issued, the outstanding requests are completed, and the routine waits
 *      until win_ptr->my_counter drops to zero.
 *
 * Returns MPI_SUCCESS or an MPI error code.  Temporary arrays are released
 * on every exit path through MPIU_CHKLMEM_FREEALL.
 */
int MPIDI_Win_fence(int assert, MPID_Win *win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int comm_size;
    int *rma_target_proc, *nops_to_proc, i, total_op_count, *curr_ops_cnt;
    MPIDI_RMA_ops *curr_ptr;
    MPID_Comm *comm_ptr;
    MPI_Win source_win_handle, target_win_handle;
    MPID_Progress_state progress_state;
    int errflag = FALSE;
    MPIU_CHKLMEM_DECL(3);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FENCE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FENCE);

    /* In case this process was previously the target of passive target rma
     * operations, we need to take care of the following...
     * Since we allow MPI_Win_unlock to return without a done ack from
     * the target in the case of multiple rma ops and exclusive lock,
     * we need to check whether there is a lock on the window, and if
     * there is a lock, poke the progress engine until the operations
     * have completed and the lock is released. */
    if (win_ptr->current_lock_type != MPID_LOCK_NONE)
    {
        MPIU_INSTR_DURATION_START(winfence_clearlock);
        MPID_Progress_start(&progress_state);
        while (win_ptr->current_lock_type != MPID_LOCK_NONE)
        {
            /* poke the progress engine */
            mpi_errno = MPID_Progress_wait(&progress_state);
            /* --BEGIN ERROR HANDLING-- */
            if (mpi_errno != MPI_SUCCESS) {
                MPID_Progress_end(&progress_state);
                MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
            }
            /* --END ERROR HANDLING-- */
            MPIU_INSTR_DURATION_INCR(winfence_clearlock,0,1);
        }
        MPID_Progress_end(&progress_state);
        MPIU_INSTR_DURATION_END(winfence_clearlock);
    }

    /* Note that the NOPRECEDE and NOSUCCEED must be specified by all processes
       in the window's group if any specify it */
    if (assert & MPI_MODE_NOPRECEDE)
    {
        win_ptr->fence_cnt = (assert & MPI_MODE_NOSUCCEED) ? 0 : 1;
        goto fn_exit;
    }

    if (win_ptr->fence_cnt == 0)
    {
        /* win_ptr->fence_cnt == 0 means either this is the very first
           call to fence or the preceding fence had the
           MPI_MODE_NOSUCCEED assert.

           If this fence has MPI_MODE_NOSUCCEED, do nothing and return.
           Otherwise just increment the fence count and return. */

        if (!(assert & MPI_MODE_NOSUCCEED)) win_ptr->fence_cnt = 1;
    }
    else
    {
        MPIDI_RMA_ops **prevNextPtr, *tmpptr;
        int nRequest = 0;
        int nRequestNew = 0;
        MPIU_INSTR_DURATION_START(winfence_rs);
        /* This is the second or later fence. Do all the preceding RMA ops. */
        comm_ptr = win_ptr->comm_ptr;
        /* First inform every process whether it is a target of RMA
           ops from this process */
        comm_size = comm_ptr->local_size;

        MPIU_CHKLMEM_MALLOC(rma_target_proc, int *, comm_size*sizeof(int),
                            mpi_errno, "rma_target_proc");
        for (i=0; i<comm_size; i++) rma_target_proc[i] = 0;

        /* keep track of no. of ops to each proc. Needed for knowing
           whether or not to decrement the completion counter. The
           completion counter is decremented only on the last
           operation. */
        MPIU_CHKLMEM_MALLOC(nops_to_proc, int *, comm_size*sizeof(int),
                            mpi_errno, "nops_to_proc");
        for (i=0; i<comm_size; i++) nops_to_proc[i] = 0;

        /* set rma_target_proc[i] to 1 if rank i is a target of RMA
           ops from this process */
        total_op_count = 0;
        curr_ptr = win_ptr->rma_ops_list_head;
        while (curr_ptr != NULL)
        {
            total_op_count++;
            rma_target_proc[curr_ptr->target_rank] = 1;
            nops_to_proc[curr_ptr->target_rank]++;
            curr_ptr = curr_ptr->next;
        }

        MPIU_CHKLMEM_MALLOC(curr_ops_cnt, int *, comm_size*sizeof(int),
                            mpi_errno, "curr_ops_cnt");
        for (i=0; i<comm_size; i++) curr_ops_cnt[i] = 0;
        /* do a reduce_scatter_block (with MPI_SUM) on rma_target_proc.
           As a result,
           each process knows how many other processes will be doing
           RMA ops on its window */

        /* first initialize the completion counter. */
        win_ptr->my_counter = comm_size;

        mpi_errno = MPIR_Reduce_scatter_block_impl(MPI_IN_PLACE, rma_target_proc, 1,
                                                   MPI_INT, MPI_SUM, comm_ptr, &errflag);
        MPIU_INSTR_DURATION_END(winfence_rs);
        /* result is stored in rma_target_proc[0] */
        if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* Set the completion counter */
        /* FIXME: MT: this needs to be done atomically because other
           procs have the address and could decrement it. */
        win_ptr->my_counter = win_ptr->my_counter - comm_size +
            rma_target_proc[0];

        MPIU_INSTR_DURATION_START(winfence_issue);
        MPIU_INSTR_DURATION_INCR(winfence_issue,0,total_op_count);
        MPIU_INSTR_DURATION_MAX(winfence_issue,1,total_op_count);
        MPIU_INSTR_COUNTER_RESET(winfence_reqs);
        i = 0;
        curr_ptr    = win_ptr->rma_ops_list_head;
        prevNextPtr = &win_ptr->rma_ops_list_head;
        while (curr_ptr != NULL)
        {
            /* The completion counter at the target is decremented only on
               the last RMA operation. We indicate the last operation by
               passing the source_win_handle only on the last operation.
               Otherwise, we pass NULL */
            if (curr_ops_cnt[curr_ptr->target_rank] ==
                nops_to_proc[curr_ptr->target_rank] - 1)
                source_win_handle = win_ptr->handle;
            else
                source_win_handle = MPI_WIN_NULL;

            target_win_handle = win_ptr->all_win_handles[curr_ptr->target_rank];

            curr_ptr->dataloop = 0;
            switch (curr_ptr->type)
            {
            case (MPIDI_RMA_PUT):
            case (MPIDI_RMA_ACCUMULATE):
                mpi_errno = MPIDI_CH3I_Send_rma_msg(curr_ptr, win_ptr,
                                        source_win_handle, target_win_handle,
                                        &curr_ptr->dtype_info,
                                        &curr_ptr->dataloop, &curr_ptr->request);
                if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
                break;
            case MPIDI_RMA_ACC_CONTIG:
                mpi_errno = MPIDI_CH3I_Send_contig_acc_msg(curr_ptr, win_ptr,
                                   source_win_handle, target_win_handle,
                                   &curr_ptr->request );
                if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
                break;
            case (MPIDI_RMA_GET):
                mpi_errno = MPIDI_CH3I_Recv_rma_msg(curr_ptr, win_ptr,
                                        source_win_handle, target_win_handle,
                                        &curr_ptr->dtype_info,
                                        &curr_ptr->dataloop, &curr_ptr->request);
                if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
                break;
            case (MPIDI_RMA_COMPARE_AND_SWAP):
            case (MPIDI_RMA_FETCH_AND_OP):
                mpi_errno = MPIDI_CH3I_Send_immed_rmw_msg(curr_ptr, win_ptr,
                                                          source_win_handle, target_win_handle,
                                                          &curr_ptr->request );
                if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
                break;

            default:
                MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winInvalidOp");
            }
            i++;
            curr_ops_cnt[curr_ptr->target_rank]++;
            /* If the request is null, we can remove it immediately */
            if (!curr_ptr->request) {
                if (curr_ptr->dataloop != NULL) {
                    MPIU_Free(curr_ptr->dataloop); /* allocated in send_rma_msg or
                                                      recv_rma_msg */
                }
                tmpptr       = curr_ptr->next;
                *prevNextPtr = tmpptr;
                MPIU_Free( curr_ptr );
                curr_ptr     = tmpptr;
            }
            else  {
                nRequest++;
                MPIU_INSTR_COUNTER_INCR(winfence_reqs,1);
                prevNextPtr = &curr_ptr->next;
                curr_ptr    = curr_ptr->next;
                /* The test on the difference is to reduce the number
                   of times the partial complete routine is called. Without
                   this, significant overhead is added once the
                   number of requests exceeds the threshold, since the
                   number that are completed in a call may be small. */
                if (nRequest > MPIR_PARAM_RMA_NREQUEST_THRESHOLD &&
                    nRequest - nRequestNew > MPIR_PARAM_RMA_NREQUEST_NEW_THRESHOLD) {
                    int nDone = 0;
                    /* The return value used to be dropped; propagate it so a
                       failure while completing requests is reported. */
                    mpi_errno = MPIDI_CH3I_RMAListPartialComplete( win_ptr, curr_ptr, &nDone );
                    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
                    /* if (nDone > 0) printf( "nDone = %d\n", nDone ); */
                    nRequest -= nDone;
                    nRequestNew = nRequest;
                }
            }
        }
        MPIU_INSTR_DURATION_END(winfence_issue);

        /* We replaced a loop over an array of requests with a list of the
           incomplete requests.  The reason to do
           that is for long lists - processing the entire list until
           all are done introduces a potentially n^2 time.  In
           testing with test/mpi/perf/manyrma.c , the number of iterations
           within the "while (total_op_count) was O(total_op_count).

           Another alternative is to create a more compressed list (storing
           only the necessary information, reducing the number of cache lines
           needed while looping through the requests.
        */
        if (total_op_count)
        {
            MPIU_INSTR_STMT(list_complete=MPIU_INSTR_GET_VAR(winfence_complete));
            MPIU_INSTR_STMT(list_block=MPIU_INSTR_GET_VAR(winfence_block));
            mpi_errno = MPIDI_CH3I_RMAListComplete(win_ptr);
            /* Check here: the progress loop below reassigns mpi_errno and
               would otherwise silently discard a completion failure. */
            if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        }

        win_ptr->rma_ops_list_head = NULL;
        win_ptr->rma_ops_list_tail = NULL;

        /* wait for all operations from other processes to finish */
        if (win_ptr->my_counter)
        {
            MPIU_INSTR_DURATION_START(winfence_wait);
            MPID_Progress_start(&progress_state);
            while (win_ptr->my_counter)
            {
                mpi_errno = MPID_Progress_wait(&progress_state);
                /* --BEGIN ERROR HANDLING-- */
                if (mpi_errno != MPI_SUCCESS) {
                    MPID_Progress_end(&progress_state);
                    MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
                }
                /* --END ERROR HANDLING-- */
                MPIU_INSTR_DURATION_INCR(winfence_wait,0,1);
            }
            MPID_Progress_end(&progress_state);
            MPIU_INSTR_DURATION_END(winfence_wait);
        }

        if (assert & MPI_MODE_NOSUCCEED)
        {
            win_ptr->fence_cnt = 0;
        }
    }

 fn_exit:
    MPIU_CHKLMEM_FREEALL();
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FENCE);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
 fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
/* create_datatype() creates a new struct datatype for the dtype_info
   and the dataloop of the target datatype together with the user data */
#undef FUNCNAME
#define FUNCNAME create_datatype
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * Build a three-member struct datatype, addressed by absolute displacement,
 * that covers in order:
 *   [0] the dtype_info record itself, as sizeof(*dtype_info) bytes,
 *   [1] the dataloop buffer, as dataloop_sz bytes,
 *   [2] the origin data: o_count elements of o_datatype at o_addr.
 *
 * On success *combined_dtp points at the new datatype object, with its
 * contents recorded and both the homogeneous and heterogeneous dataloops
 * created.  Returns MPI_SUCCESS or an MPI error code.
 */
static int create_datatype(const MPIDI_RMA_dtype_info *dtype_info,
                           const void *dataloop, MPI_Aint dataloop_sz,
                           const void *o_addr, int o_count, MPI_Datatype o_datatype,
                           MPID_Datatype **combined_dtp)
{
    int mpi_errno = MPI_SUCCESS;
    /* datatype_set_contents wants an array 'ints' which is the
       blocklens array with count prepended to it.  So blocklens
       points to the 2nd element of ints to avoid having to copy
       blocklens into ints later. */
    int ints[4];
    int *blocklens = &ints[1];
    MPI_Aint displaces[3];
    MPI_Datatype datatypes[3];
    const int count = 3;
    MPI_Datatype combined_datatype;
    MPIDI_STATE_DECL(MPID_STATE_CREATE_DATATYPE);

    MPIDI_FUNC_ENTER(MPID_STATE_CREATE_DATATYPE);

    /* create datatype */
    displaces[0] = MPIU_PtrToAint(dtype_info);
    /* block lengths are ints; the narrowing conversions are made explicit */
    blocklens[0] = (int) sizeof(*dtype_info);
    datatypes[0] = MPI_BYTE;

    displaces[1] = MPIU_PtrToAint(dataloop);
    blocklens[1] = (int) dataloop_sz;
    datatypes[1] = MPI_BYTE;

    displaces[2] = MPIU_PtrToAint(o_addr);
    blocklens[2] = o_count;
    datatypes[2] = o_datatype;

    mpi_errno = MPID_Type_struct(count,
                                 blocklens,
                                 displaces,
                                 datatypes,
                                 &combined_datatype);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    ints[0] = count;

    MPID_Datatype_get_ptr(combined_datatype, *combined_dtp);
    mpi_errno = MPID_Datatype_set_contents(*combined_dtp,
                                           MPI_COMBINER_STRUCT,
                                           count+1, /* ints (cnt,blklen) */
                                           count, /* aints (disps) */
                                           count, /* types */
                                           ints,
                                           displaces,
                                           datatypes);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    /* Commit datatype */

    MPID_Dataloop_create(combined_datatype,
                         &(*combined_dtp)->dataloop,
                         &(*combined_dtp)->dataloop_size,
                         &(*combined_dtp)->dataloop_depth,
                         MPID_DATALOOP_HOMOGENEOUS);

    /* create heterogeneous dataloop */
    MPID_Dataloop_create(combined_datatype,
                         &(*combined_dtp)->hetero_dloop,
                         &(*combined_dtp)->hetero_dloop_size,
                         &(*combined_dtp)->hetero_dloop_depth,
                         MPID_DATALOOP_HETEROGENEOUS);

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_CREATE_DATATYPE);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}

479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Send_rma_msg
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * Send the packet (and data) for one queued put or accumulate operation.
 *
 * rma_op            - the queued operation; type MPIDI_RMA_PUT selects a put
 *                     packet, any other type an accumulate packet.
 * win_ptr           - window the operation belongs to.
 * source_win_handle - stored in the packet's source_win_handle field.
 * target_win_handle - stored in the packet's target_win_handle field.
 * dtype_info        - filled in from the target datatype when that datatype
 *                     is not predefined; otherwise untouched.
 * dataloop          - when the target datatype is not predefined, *dataloop
 *                     receives a newly allocated copy of its dataloop.
 * request           - set to the request for the send, or NULL; always NULL
 *                     on failure.
 *
 * Three send paths exist:
 *   - predefined target and origin types: header + origin buffer as an iov;
 *   - predefined target, derived origin: header + segment over origin type;
 *   - derived target: header + a combined struct type covering dtype_info,
 *     the dataloop copy and the origin data.
 *
 * Returns MPI_SUCCESS or an MPI error code.  On failure the dataloop copy
 * made here (if any) is freed and *dataloop is reset to NULL.
 */
static int MPIDI_CH3I_Send_rma_msg(MPIDI_RMA_ops *rma_op, MPID_Win *win_ptr,
                                   MPI_Win source_win_handle,
                                   MPI_Win target_win_handle,
                                   MPIDI_RMA_dtype_info *dtype_info,
                                   void **dataloop, MPID_Request **request)
{
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_put_t *put_pkt = &upkt.put;
    MPIDI_CH3_Pkt_accum_t *accum_pkt = &upkt.accum;
    MPID_IOV iov[MPID_IOV_LIMIT];
    int mpi_errno=MPI_SUCCESS, predefined;
    int origin_dt_derived, target_dt_derived, origin_type_size, iovcnt;
    MPIDI_VC_t * vc;
    MPID_Comm *comm_ptr;
    MPID_Datatype *target_dtp=NULL, *origin_dtp=NULL;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SEND_RMA_MSG);
    MPIDI_STATE_DECL(MPID_STATE_MEMCPY);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SEND_RMA_MSG);

    *request = NULL;

    if (rma_op->type == MPIDI_RMA_PUT)
    {
        MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
        put_pkt->addr = (char *) win_ptr->base_addrs[rma_op->target_rank] +
            win_ptr->disp_units[rma_op->target_rank] * rma_op->target_disp;

        put_pkt->count = rma_op->target_count;
        put_pkt->datatype = rma_op->target_datatype;
        put_pkt->dataloop_size = 0;
        put_pkt->target_win_handle = target_win_handle;
        put_pkt->source_win_handle = source_win_handle;

        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) put_pkt;
        iov[0].MPID_IOV_LEN = sizeof(*put_pkt);
    }
    else
    {
        MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
        accum_pkt->addr = (char *) win_ptr->base_addrs[rma_op->target_rank] +
            win_ptr->disp_units[rma_op->target_rank] * rma_op->target_disp;
        accum_pkt->count = rma_op->target_count;
        accum_pkt->datatype = rma_op->target_datatype;
        accum_pkt->dataloop_size = 0;
        accum_pkt->op = rma_op->op;
        accum_pkt->target_win_handle = target_win_handle;
        accum_pkt->source_win_handle = source_win_handle;

        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) accum_pkt;
        iov[0].MPID_IOV_LEN = sizeof(*accum_pkt);
    }

    /*    printf("send pkt: type %d, addr %d, count %d, base %d\n", rma_pkt->type,
          rma_pkt->addr, rma_pkt->count, win_ptr->base_addrs[rma_op->target_rank]);
          fflush(stdout);
    */

    comm_ptr = win_ptr->comm_ptr;
    MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);

    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(rma_op->origin_datatype, predefined);
    if (!predefined)
    {
        origin_dt_derived = 1;
        MPID_Datatype_get_ptr(rma_op->origin_datatype, origin_dtp);
    }
    else
    {
        origin_dt_derived = 0;
    }

    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(rma_op->target_datatype, predefined);
    if (!predefined)
    {
        target_dt_derived = 1;
        MPID_Datatype_get_ptr(rma_op->target_datatype, target_dtp);
    }
    else
    {
        target_dt_derived = 0;
    }

    if (target_dt_derived)
    {
        /* derived datatype on target. fill derived datatype info */
        dtype_info->is_contig = target_dtp->is_contig;
        dtype_info->max_contig_blocks = target_dtp->max_contig_blocks;
        dtype_info->size = target_dtp->size;
        dtype_info->extent = target_dtp->extent;
        dtype_info->dataloop_size = target_dtp->dataloop_size;
        dtype_info->dataloop_depth = target_dtp->dataloop_depth;
        dtype_info->eltype = target_dtp->eltype;
        dtype_info->dataloop = target_dtp->dataloop;
        dtype_info->ub = target_dtp->ub;
        dtype_info->lb = target_dtp->lb;
        dtype_info->true_ub = target_dtp->true_ub;
        dtype_info->true_lb = target_dtp->true_lb;
        dtype_info->has_sticky_ub = target_dtp->has_sticky_ub;
        dtype_info->has_sticky_lb = target_dtp->has_sticky_lb;

        MPIU_CHKPMEM_MALLOC(*dataloop, void *, target_dtp->dataloop_size,
                            mpi_errno, "dataloop");

        MPIDI_FUNC_ENTER(MPID_STATE_MEMCPY);
        MPIU_Memcpy(*dataloop, target_dtp->dataloop, target_dtp->dataloop_size);
        MPIDI_FUNC_EXIT(MPID_STATE_MEMCPY);
        /* the dataloop can have undefined padding sections, so we need to let
         * valgrind know that it is OK to pass this data to writev later on */
        MPL_VG_MAKE_MEM_DEFINED(*dataloop, target_dtp->dataloop_size);

        if (rma_op->type == MPIDI_RMA_PUT)
        {
            put_pkt->dataloop_size = target_dtp->dataloop_size;
        }
        else
        {
            accum_pkt->dataloop_size = target_dtp->dataloop_size;
        }
    }

    MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);

    if (!target_dt_derived)
    {
        /* basic datatype on target */
        if (!origin_dt_derived)
        {
            /* basic datatype on origin */
            iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)rma_op->origin_addr;
            iov[1].MPID_IOV_LEN = rma_op->origin_count * origin_type_size;
            iovcnt = 2;
            MPIU_THREAD_CS_ENTER(CH3COMM,vc);
            mpi_errno = MPIDI_CH3_iStartMsgv(vc, iov, iovcnt, request);
            MPIU_THREAD_CS_EXIT(CH3COMM,vc);
            MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
        }
        else
        {
            /* derived datatype on origin */
            *request = MPID_Request_create();
            MPIU_ERR_CHKANDJUMP(*request == NULL,mpi_errno,MPI_ERR_OTHER,"**nomemreq");

            MPIU_Object_set_ref(*request, 2);
            (*request)->kind = MPID_REQUEST_SEND;

            (*request)->dev.segment_ptr = MPID_Segment_alloc( );
            MPIU_ERR_CHKANDJUMP1((*request)->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");

            (*request)->dev.datatype_ptr = origin_dtp;
            /* this will cause the datatype to be freed when the request
               is freed. */
            MPID_Segment_init(rma_op->origin_addr, rma_op->origin_count,
                              rma_op->origin_datatype,
                              (*request)->dev.segment_ptr, 0);
            (*request)->dev.segment_first = 0;
            (*request)->dev.segment_size = rma_op->origin_count * origin_type_size;

            (*request)->dev.OnFinal = 0;
            (*request)->dev.OnDataAvail = 0;

            MPIU_THREAD_CS_ENTER(CH3COMM,vc);
            mpi_errno = vc->sendNoncontig_fn(vc, *request, iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
            MPIU_THREAD_CS_EXIT(CH3COMM,vc);
            MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
        }
    }
    else
    {
        /* derived datatype on target */
        MPID_Datatype *combined_dtp = NULL;

        *request = MPID_Request_create();
        if (*request == NULL) {
            MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**nomemreq");
        }

        MPIU_Object_set_ref(*request, 2);
        (*request)->kind = MPID_REQUEST_SEND;

        (*request)->dev.segment_ptr = MPID_Segment_alloc( );
        MPIU_ERR_CHKANDJUMP1((*request)->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");

        /* create a new datatype containing the dtype_info, dataloop, and origin data */

        mpi_errno = create_datatype(dtype_info, *dataloop, target_dtp->dataloop_size, rma_op->origin_addr,
                                    rma_op->origin_count, rma_op->origin_datatype, &combined_dtp);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);

        (*request)->dev.datatype_ptr = combined_dtp;
        /* combined_datatype will be freed when request is freed */

        MPID_Segment_init(MPI_BOTTOM, 1, combined_dtp->handle,
                          (*request)->dev.segment_ptr, 0);
        (*request)->dev.segment_first = 0;
        (*request)->dev.segment_size = combined_dtp->size;

        (*request)->dev.OnFinal = 0;
        (*request)->dev.OnDataAvail = 0;

        MPIU_THREAD_CS_ENTER(CH3COMM,vc);
        mpi_errno = vc->sendNoncontig_fn(vc, *request, iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
        MPIU_THREAD_CS_EXIT(CH3COMM,vc);
        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");

        /* we're done with the datatypes */
        if (origin_dt_derived)
            MPID_Datatype_release(origin_dtp);
        MPID_Datatype_release(target_dtp);
    }

 fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SEND_RMA_MSG);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
 fn_fail:
    /* Reap unconditionally: the dataloop copy is allocated before the
       request is created, so tying the reap to a non-NULL request leaked
       the copy whenever request creation itself failed. */
    MPIU_CHKPMEM_REAP();
    if (target_dtp != NULL)
    {
        /* A copy is only made for a derived target type; it was just
           released above, so do not leave the caller a dangling pointer. */
        *dataloop = NULL;
    }
    if (*request)
    {
        if ((*request)->dev.datatype_ptr)
            MPID_Datatype_release((*request)->dev.datatype_ptr);
        MPIU_Object_set_ref(*request, 0);
        MPIDI_CH3_Request_destroy(*request);
    }
    *request = NULL;
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
/*
 * Use this for contiguous accumulate operations.
 *
 * Builds an accumulate packet for rma_op and starts sending it to
 * rma_op->target_rank over win_ptr->comm_ptr.
 *
 * Two wire formats are used:
 *   - If MPIR_PARAM_RMA_ACC_IMMED is set and the origin data
 *     (origin_count * size of origin_datatype) fits in
 *     MPIDI_RMA_IMMED_INTS ints, the data is copied into an
 *     MPIDI_CH3_PKT_ACCUM_IMMED packet and sent with MPIDI_CH3_iStartMsg.
 *   - Otherwise an MPIDI_CH3_PKT_ACCUMULATE header followed by the origin
 *     buffer is sent with MPIDI_CH3_iStartMsgv.
 *
 * rma_op             - the queued accumulate operation to issue
 * win_ptr            - window the operation targets
 * source_win_handle  - value stored in the packet's source_win_handle field
 * target_win_handle  - value stored in the packet's target_win_handle field
 * request            - out: set to NULL on entry, then passed to the send
 *                      call, which stores its request (if any) there
 *
 * Returns MPI_SUCCESS or an MPI error code.  On failure any request is
 * destroyed and *request is NULL.
 */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Send_contig_acc_msg
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static int MPIDI_CH3I_Send_contig_acc_msg(MPIDI_RMA_ops *rma_op,
                                          MPID_Win *win_ptr,
                                          MPI_Win source_win_handle,
                                          MPI_Win target_win_handle,
                                          MPID_Request **request)
{
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_accum_t *accum_pkt = &upkt.accum;
    MPID_IOV iov[MPID_IOV_LIMIT];
    int mpi_errno = MPI_SUCCESS;
    int origin_type_size, iovcnt;
    MPIDI_VC_t *vc;
    MPID_Comm *comm_ptr;
    int len;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SEND_CONTIG_ACC_MSG);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SEND_CONTIG_ACC_MSG);

    *request = NULL;

    MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
    /* FIXME: Make this size check efficient and match the packet type */
    len = rma_op->origin_count * origin_type_size;
    if (MPIR_PARAM_RMA_ACC_IMMED && len <= MPIDI_RMA_IMMED_INTS*sizeof(int)) {
        /* Small payload: carry the data inside the packet itself. */
        MPIDI_CH3_Pkt_accum_immed_t *accumi_pkt = &upkt.accum_immed;

        MPIDI_Pkt_init(accumi_pkt, MPIDI_CH3_PKT_ACCUM_IMMED);
        accumi_pkt->addr = (char *) win_ptr->base_addrs[rma_op->target_rank] +
            win_ptr->disp_units[rma_op->target_rank] * rma_op->target_disp;
        accumi_pkt->count = rma_op->target_count;
        accumi_pkt->datatype = rma_op->target_datatype;
        accumi_pkt->op = rma_op->op;
        accumi_pkt->target_win_handle = target_win_handle;
        accumi_pkt->source_win_handle = source_win_handle;

        /* Copy bytewise for every length.  The origin buffer has arbitrary
         * element type and alignment, so reading it through a casted
         * uint16_t/uint32_t/uint64_t pointer is not safe. */
        MPIU_Memcpy(accumi_pkt->data, (void *)rma_op->origin_addr, len);

        comm_ptr = win_ptr->comm_ptr;
        MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
        MPIU_THREAD_CS_ENTER(CH3COMM,vc);
        mpi_errno = MPIDI_CH3_iStartMsg(vc, accumi_pkt, sizeof(*accumi_pkt), request);
        MPIU_THREAD_CS_EXIT(CH3COMM,vc);
        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
        goto fn_exit;
    }

    /* General case: packet header followed by the origin buffer. */
    MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
    accum_pkt->addr = (char *) win_ptr->base_addrs[rma_op->target_rank] +
        win_ptr->disp_units[rma_op->target_rank] * rma_op->target_disp;
    accum_pkt->count = rma_op->target_count;
    accum_pkt->datatype = rma_op->target_datatype;
    accum_pkt->dataloop_size = 0;
    accum_pkt->op = rma_op->op;
    accum_pkt->target_win_handle = target_win_handle;
    accum_pkt->source_win_handle = source_win_handle;

    iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) accum_pkt;
    iov[0].MPID_IOV_LEN = sizeof(*accum_pkt);

    comm_ptr = win_ptr->comm_ptr;
    MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);

    /* basic datatype on target */
    /* basic datatype on origin */
    /* FIXME: This is still very heavyweight for a small message operation,
       such as a single word update */
    /* One possibility is to use iStartMsg with a buffer that is just large
       enough, though note that nemesis has an optimization for this */
    iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)rma_op->origin_addr;
    iov[1].MPID_IOV_LEN = rma_op->origin_count * origin_type_size;
    iovcnt = 2;
    MPIU_THREAD_CS_ENTER(CH3COMM,vc);
    mpi_errno = MPIDI_CH3_iStartMsgv(vc, iov, iovcnt, request);
    MPIU_THREAD_CS_EXIT(CH3COMM,vc);
    MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");

 fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SEND_CONTIG_ACC_MSG);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
 fn_fail:
    if (*request)
    {
        MPIU_Object_set_ref(*request, 0);
        MPIDI_CH3_Request_destroy(*request);
    }
    *request = NULL;
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

825

826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
/*
 * Initiate an immediate RMW accumulate operation
 * (MPIDI_RMA_COMPARE_AND_SWAP or MPIDI_RMA_FETCH_AND_OP).
 *
 * A response request is created to receive the result from the target; its
 * handle travels in the packet so the response can be matched.  The packet
 * carries the operand(s) inline and is sent with MPIDI_CH3_iStartMsg.
 *
 * rma_op             - the queued operation; rma_op->type selects CAS or FOP
 * win_ptr            - window the operation targets
 * source_win_handle  - if not MPI_WIN_NULL, the *_UNLOCK packet variant is
 *                      used (last operation, which also unlocks the window
 *                      at the target)
 * target_win_handle  - value stored in the packet's target_win_handle field
 * request            - out: the response request on success, NULL on failure
 *
 * Returns MPI_SUCCESS or an MPI error code.  Any other rma_op->type yields
 * MPI_ERR_OTHER.  On failure the response request is destroyed.
 */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Send_immed_rmw_msg
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static int MPIDI_CH3I_Send_immed_rmw_msg(MPIDI_RMA_ops *rma_op,
                                         MPID_Win *win_ptr,
                                         MPI_Win source_win_handle,
                                         MPI_Win target_win_handle,
                                         MPID_Request **request)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *rmw_req = NULL;   /* request for the outgoing packet */
    MPID_Request *resp_req = NULL;  /* request for the target's response */
    MPIDI_VC_t *vc;
    MPID_Comm *comm_ptr;
    int len;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SEND_IMMED_RMW_MSG);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SEND_IMMED_RMW_MSG);

    *request = NULL;

    /* Create a request for the RMW response.  Store the result buf, count,
       and datatype in it, and pass the request's handle in the RMW packet.
       When the response comes from the target, it will contain the request
       handle. */
    resp_req = MPID_Request_create();
    MPIU_ERR_CHKANDJUMP(resp_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
    *request = resp_req;

    /* Set refs on the request to 2: one for the response message, and one for
       the partial completion handler */
    MPIU_Object_set_ref(resp_req, 2);

    resp_req->dev.user_buf = rma_op->result_addr;
    resp_req->dev.user_count = rma_op->result_count;
    resp_req->dev.datatype = rma_op->result_datatype;
    resp_req->dev.target_win_handle = MPI_WIN_NULL;
    resp_req->dev.source_win_handle = source_win_handle;

    /* REQUIRE: All datatype arguments must be of the same, builtin
                type and counts must be 1. */
    MPID_Datatype_get_size_macro(rma_op->origin_datatype, len);

    if (rma_op->type == MPIDI_RMA_COMPARE_AND_SWAP) {
        MPIDI_CH3_Pkt_t upkt;
        MPIDI_CH3_Pkt_cas_t *cas_pkt = &upkt.cas;

        MPIU_Assert(len <= sizeof(MPIDI_CH3_CAS_Immed_u));

        /* If this is the last operation, it also unlocks the window
           at the target. */
        if (source_win_handle != MPI_WIN_NULL) {
            MPIDI_Pkt_init(cas_pkt, MPIDI_CH3_PKT_CAS_UNLOCK);
        } else {
            MPIDI_Pkt_init(cas_pkt, MPIDI_CH3_PKT_CAS);
        }

        cas_pkt->addr = (char *) win_ptr->base_addrs[rma_op->target_rank] +
            win_ptr->disp_units[rma_op->target_rank] * rma_op->target_disp;
        cas_pkt->datatype = rma_op->target_datatype;
        cas_pkt->target_win_handle = target_win_handle;
        cas_pkt->request_handle = resp_req->handle;

        MPIU_Memcpy( (void *) &cas_pkt->origin_data, rma_op->origin_addr, len );
        MPIU_Memcpy( (void *) &cas_pkt->compare_data, rma_op->compare_addr, len );

        comm_ptr = win_ptr->comm_ptr;
        MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
        MPIU_THREAD_CS_ENTER(CH3COMM,vc);
        mpi_errno = MPIDI_CH3_iStartMsg(vc, cas_pkt, sizeof(*cas_pkt), &rmw_req);
        MPIU_THREAD_CS_EXIT(CH3COMM,vc);
        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
    }
    else if (rma_op->type == MPIDI_RMA_FETCH_AND_OP) {
        MPIDI_CH3_Pkt_t upkt;
        MPIDI_CH3_Pkt_fop_t *fop_pkt = &upkt.fop;

        MPIU_Assert(len <= sizeof(MPIDI_CH3_FOP_Immed_u));

        /* If this is the last operation, it also unlocks the window
           at the target. */
        if (source_win_handle != MPI_WIN_NULL) {
            MPIDI_Pkt_init(fop_pkt, MPIDI_CH3_PKT_FOP_UNLOCK);
        } else {
            MPIDI_Pkt_init(fop_pkt, MPIDI_CH3_PKT_FOP);
        }

        fop_pkt->addr = (char *) win_ptr->base_addrs[rma_op->target_rank] +
            win_ptr->disp_units[rma_op->target_rank] * rma_op->target_disp;
        fop_pkt->datatype = rma_op->target_datatype;
        fop_pkt->target_win_handle = target_win_handle;
        fop_pkt->request_handle = resp_req->handle;
        fop_pkt->op = rma_op->op;

        /* MPIX_NO_OP has no origin operand to send. */
        if (rma_op->op != MPIX_NO_OP) {
            MPIU_Memcpy( (void *) &fop_pkt->origin_data, rma_op->origin_addr, len );
        }

        comm_ptr = win_ptr->comm_ptr;
        MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
        MPIU_THREAD_CS_ENTER(CH3COMM,vc);
        mpi_errno = MPIDI_CH3_iStartMsg(vc, fop_pkt, sizeof(*fop_pkt), &rmw_req);
        MPIU_THREAD_CS_EXIT(CH3COMM,vc);
        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
    }
    else {
        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
    }

    /* Release the request returned by iStartMsg.  Only the response request
       is handed back to the caller, so nobody else would drop this
       reference. */
    if (rmw_req != NULL)
    {
        MPID_Request_release(rmw_req);
    }

fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SEND_IMMED_RMW_MSG);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
fn_fail:
    if (*request)
    {
        MPIU_Object_set_ref(*request, 0);
        MPIDI_CH3_Request_destroy(*request);
    }
    *request = NULL;
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


953
954
955
956
957
958
959
960
961
962
963
964
965

/*
 * Issue a get operation: send an MPIDI_CH3_PKT_GET packet to
 * rma_op->target_rank and set up the request that will receive the reply.
 *
 * A response request is created holding the origin buffer, count and
 * datatype; its handle is placed in the get packet.  If the target datatype
 * is predefined only the packet is sent (MPIDI_CH3_iStartMsg).  Otherwise
 * *dtype_info is filled from the target datatype, a copy of its dataloop is
 * allocated into *dataloop, and packet + dtype_info + dataloop are sent
 * together (MPIDI_CH3_iStartMsgv).
 *
 * rma_op             - the queued get operation to issue
 * win_ptr            - window the operation targets
 * source_win_handle  - stored in the packet and in the response request
 * target_win_handle  - stored in the packet
 * dtype_info         - out: description of a derived target datatype
 * dataloop           - out: allocated copy of a derived target dataloop
 * request            - out: the response request on success, NULL on failure
 *
 * Returns MPI_SUCCESS or an MPI error code.  On failure the response request
 * is destroyed, matching the sibling Send_* routines.
 */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Recv_rma_msg
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static int MPIDI_CH3I_Recv_rma_msg(MPIDI_RMA_ops *rma_op, MPID_Win *win_ptr,
                                   MPI_Win source_win_handle,
                                   MPI_Win target_win_handle,
                                   MPIDI_RMA_dtype_info *dtype_info,
                                   void **dataloop, MPID_Request **request)
{
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_get_t *get_pkt = &upkt.get;
    int mpi_errno = MPI_SUCCESS, predefined;
    MPIDI_VC_t *vc;
    MPID_Comm *comm_ptr;
    MPID_Request *req = NULL;       /* request for the target's response */
    MPID_Request *send_req = NULL;  /* request for the outgoing packet */
    MPID_Datatype *dtp;
    MPID_IOV iov[MPID_IOV_LIMIT];
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_RECV_RMA_MSG);
    MPIDI_STATE_DECL(MPID_STATE_MEMCPY);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_RECV_RMA_MSG);

    *request = NULL;

    /* create a request, store the origin buf, cnt, datatype in it,
       and pass a handle to it in the get packet. When the get
       response comes from the target, it will contain the request
       handle. */
    req = MPID_Request_create();
    if (req == NULL) {
        MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**nomemreq");
    }

    *request = req;

    MPIU_Object_set_ref(req, 2);

    req->dev.user_buf = rma_op->origin_addr;
    req->dev.user_count = rma_op->origin_count;
    req->dev.datatype = rma_op->origin_datatype;
    req->dev.target_win_handle = MPI_WIN_NULL;
    req->dev.source_win_handle = source_win_handle;
    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(req->dev.datatype, predefined);
    if (!predefined)
    {
        MPID_Datatype_get_ptr(req->dev.datatype, dtp);
        req->dev.datatype_ptr = dtp;
        /* this will cause the datatype to be freed when the
           request is freed. */
    }

    MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
    get_pkt->addr = (char *) win_ptr->base_addrs[rma_op->target_rank] +
        win_ptr->disp_units[rma_op->target_rank] * rma_op->target_disp;
    get_pkt->count = rma_op->target_count;
    get_pkt->datatype = rma_op->target_datatype;
    get_pkt->request_handle = req->handle;
    get_pkt->target_win_handle = target_win_handle;
    get_pkt->source_win_handle = source_win_handle;

    comm_ptr = win_ptr->comm_ptr;
    MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);

    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(rma_op->target_datatype, predefined);
    if (predefined)
    {
        /* basic datatype on target. simply send the get_pkt. */
        MPIU_THREAD_CS_ENTER(CH3COMM,vc);
        mpi_errno = MPIDI_CH3_iStartMsg(vc, get_pkt, sizeof(*get_pkt), &send_req);
        MPIU_THREAD_CS_EXIT(CH3COMM,vc);
    }
    else
    {
        /* derived datatype on target. fill derived datatype info and
           send it along with get_pkt. */

        MPID_Datatype_get_ptr(rma_op->target_datatype, dtp);
        dtype_info->is_contig = dtp->is_contig;
        dtype_info->max_contig_blocks = dtp->max_contig_blocks;
        dtype_info->size = dtp->size;
        dtype_info->extent = dtp->extent;
        dtype_info->dataloop_size = dtp->dataloop_size;
        dtype_info->dataloop_depth = dtp->dataloop_depth;
        dtype_info->eltype = dtp->eltype;
        dtype_info->dataloop = dtp->dataloop;
        dtype_info->ub = dtp->ub;
        dtype_info->lb = dtp->lb;
        dtype_info->true_ub = dtp->true_ub;
        dtype_info->true_lb = dtp->true_lb;
        dtype_info->has_sticky_ub = dtp->has_sticky_ub;
        dtype_info->has_sticky_lb = dtp->has_sticky_lb;

        MPIU_CHKPMEM_MALLOC(*dataloop, void *, dtp->dataloop_size,
                            mpi_errno, "dataloop");

        MPIDI_FUNC_ENTER(MPID_STATE_MEMCPY);
        MPIU_Memcpy(*dataloop, dtp->dataloop, dtp->dataloop_size);
        MPIDI_FUNC_EXIT(MPID_STATE_MEMCPY);

        /* the dataloop can have undefined padding sections, so we need to let
         * valgrind know that it is OK to pass this data to writev later on */
        MPL_VG_MAKE_MEM_DEFINED(*dataloop, dtp->dataloop_size);

        get_pkt->dataloop_size = dtp->dataloop_size;

        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)get_pkt;
        iov[0].MPID_IOV_LEN = sizeof(*get_pkt);
        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)dtype_info;
        iov[1].MPID_IOV_LEN = sizeof(*dtype_info);
        iov[2].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)*dataloop;
        iov[2].MPID_IOV_LEN = dtp->dataloop_size;

        MPIU_THREAD_CS_ENTER(CH3COMM,vc);
        mpi_errno = MPIDI_CH3_iStartMsgv(vc, iov, 3, &send_req);
        MPIU_THREAD_CS_EXIT(CH3COMM,vc);

        /* release the target datatype */
        MPID_Datatype_release(dtp);
    }

    if (mpi_errno != MPI_SUCCESS) {
        MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|rmamsg");
    }

    /* release the request returned by iStartMsg or iStartMsgv */
    if (send_req != NULL)
    {
        MPID_Request_release(send_req);
    }

 fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_RECV_RMA_MSG);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
 fn_fail:
    MPIU_CHKPMEM_REAP();
    /* Do not leak the response request or hand a stale pointer back to the
       caller; same cleanup as the Send_* routines in this file. */
    if (*request)
    {
        MPIU_Object_set_ref(*request, 0);
        MPIDI_CH3_Request_destroy(*request);
    }
    *request = NULL;
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_Win_post
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
1105
int MPIDI_Win_post(MPID_Group *post_grp_ptr, int assert, MPID_Win *win_ptr)
1106
1107
{
    int mpi_errno=MPI_SUCCESS;
1108
    MPID_Group *win_grp_ptr;
1109
    int i, post_grp_size, *ranks_in_post_grp, *ranks_in_win_grp, dst, rank;
1110
    MPID_Comm *win_comm_ptr;
1111
    MPIU_CHKLMEM_DECL(4);
1112
1113
1114
1115
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_POST);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_POST);

1116
1117
1118
1119
1120
    /* Even though we would want to reset the fence counter to keep
     * the user from using the previous fence to mark the beginning of
     * a fence epoch if he switched from fence to lock-unlock
     * synchronization, we cannot do this because fence_cnt must be
     * updated collectively */
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132

    /* In case this process was previously the target of passive target rma
     * operations, we need to take care of the following...
     * Since we allow MPI_Win_unlock to return without a done ack from
     * the target in the case of multiple rma ops and exclusive lock,
     * we need to check whether there is a lock on the window, and if
     * there is a lock, poke the progress engine until the operations
     * have completed and the lock is therefore released. */
    if (win_ptr->current_lock_type != MPID_LOCK_NONE)
    {
	MPID_Progress_state progress_state;
	
1133
	MPIU_INSTR_DURATION_START(winpost_clearlock);
1134
1135
1136
1137
1138
1139
	/* poke the progress engine */
	MPID_Progress_start(&progress_state);
	while (win_ptr->current_lock_type != MPID_LOCK_NONE)
	{
	    mpi_errno = MPID_Progress_wait(&progress_state);
	    /* --BEGIN ERROR HANDLING-- */
1140
	    if (mpi_errno != MPI_SUCCESS) {
1141
		MPID_Progress_end(&progress_state);
1142
		MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
1143
1144
	    }
	    /* --END ERROR HANDLING-- */
1145
	    MPIU_INSTR_DURATION_INCR(winpost_clearlock,0,1);
1146
1147
	}
	MPID_Progress_end(&progress_state);
1148
	MPIU_INSTR_DURATION_END(winpost_clearlock);
1149
1150
    }
        
1151
    post_grp_size = post_grp_ptr->size;
1152
1153
1154
1155
1156
1157
        
    /* initialize the completion counter */
    win_ptr->my_counter = post_grp_size;
        
    if ((assert & MPI_MODE_NOCHECK) == 0)
    {
1158
1159
        MPI_Request *req;
        MPI_Status *status;
1160
1161

	MPIU_INSTR_DURATION_START(winpost_sendsync);
1162
 
1163
1164
1165
1166
	/* NOCHECK not specified. We need to notify the source
	   processes that Post has been called. */  
	
	/* We need to translate the ranks of the processes in
1167
	   post_group to ranks in win_ptr->comm_ptr, so that we
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
	   can do communication */
            
	MPIU_CHKLMEM_MALLOC(ranks_in_post_grp, int *, 
			    post_grp_size * sizeof(int),
			    mpi_errno, "ranks_in_post_grp");
	MPIU_CHKLMEM_MALLOC(ranks_in_win_grp, int *, 
			    post_grp_size * sizeof(int),
			    mpi_errno, "ranks_in_win_grp");
        
	for (i=0; i<post_grp_size; i++)
	{
	    ranks_in_post_grp[i] = i;
	}
        
1182
	win_comm_ptr = win_ptr->comm_ptr;
1183
1184
1185

        mpi_errno = MPIR_Comm_group_impl(win_comm_ptr, &win_grp_ptr);
	if (mpi_errno) MPIU_ERR_POP(mpi_errno);
1186
	
1187

1188
1189
1190
        mpi_errno = MPIR_Group_translate_ranks_impl(post_grp_ptr, post_grp_size, ranks_in_post_grp,
                                                    win_grp_ptr, ranks_in_win_grp);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
1191
	
1192
        rank = win_ptr->myrank;
1193
	
1194
1195
1196
	MPIU_CHKLMEM_MALLOC(req, MPI_Request *, post_grp_size * sizeof(MPI_Request), mpi_errno, "req");
        MPIU_CHKLMEM_MALLOC(status, MPI_Status *, post_grp_size*sizeof(MPI_Status), mpi_errno, "status");

1197
	/* Send a 0-byte message to the source processes */
1198
	MPIU_INSTR_DURATION_INCR(winpost_sendsync,0,post_grp_size);
1199
	for (i = 0; i < post_grp_size; i++) {
1200
	    dst = ranks_in_win_grp[i];
1201
1202
1203
1204
1205

	    /* FIXME: Short messages like this shouldn't normally need a 
	       request - this should consider using the ch3 call to send
	       a short message and return a request only if the message is
	       not delivered. */
1206
	    if (dst != rank) {
1207
1208
1209
1210
1211
1212
1213
1214
                MPID_Request *req_ptr;
		mpi_errno = MPID_Isend(&i, 0, MPI_INT, dst, SYNC_POST_TAG, win_comm_ptr,
                                       MPID_CONTEXT_INTRA_PT2PT, &req_ptr);
		if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                req[i] = req_ptr->handle;
	    } else {
                req[i] = MPI_REQUEST_NULL;
            }
1215
	}
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
        mpi_errno = MPIR_Waitall_impl(post_grp_size, req, status);
        if (mpi_errno && mpi_errno != MPI_ERR_IN_STATUS) MPIU_ERR_POP(mpi_errno);

        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno == MPI_ERR_IN_STATUS) {
            for (i = 0; i < post_grp_size; i++) {
                if (status[i].MPI_ERROR != MPI_SUCCESS) {
                    mpi_errno = status[i].MPI_ERROR;
                    MPIU_ERR_POP(mpi_errno);
                }
            }
        }
        /* --END ERROR HANDLING-- */
1229
1230
1231

        mpi_errno = MPIR_Group_free_impl(win_grp_ptr);
	if (mpi_errno) MPIU_ERR_POP(mpi_errno);
1232
	MPIU_INSTR_DURATION_END(winpost_sendsync);
1233
    }
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257

 fn_exit:
    MPIU_CHKLMEM_FREEALL();
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_POST);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
 fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}



/*
 * MPIDI_Win_start
 *
 * First waits (driving the progress engine) until no lock is held on the
 * window, then records the group and assertion in the window for later
 * use: win_ptr->start_group_ptr (a reference is added to the group) and
 * win_ptr->start_assert.  No communication is started here.
 *
 * group_ptr - group to record as the start group
 * assert    - assertion flags, stored unchanged
 * win_ptr   - the window
 *
 * Returns MPI_SUCCESS or an MPI error code from the progress engine.
 */
#undef FUNCNAME
#define FUNCNAME MPIDI_Win_start
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_Win_start(MPID_Group *group_ptr, int assert, MPID_Win *win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_START);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_START);

    /* Even though we would want to reset the fence counter to keep
     * the user from using the previous fence to mark the beginning of
     * a fence epoch if he switched from fence to lock-unlock
     * synchronization, we cannot do this because fence_cnt must be
     * updated collectively */

    /* In case this process was previously the target of passive target rma
     * operations, we need to take care of the following...
     * Since we allow MPI_Win_unlock to return without a done ack from
     * the target in the case of multiple rma ops and exclusive lock,
     * we need to check whether there is a lock on the window, and if
     * there is a lock, poke the progress engine until the operations
     * have completed and the lock is therefore released. */
    if (win_ptr->current_lock_type != MPID_LOCK_NONE)
    {
        MPID_Progress_state progress_state;

        MPIU_INSTR_DURATION_START(winstart_clearlock);
        /* poke the progress engine */
        MPID_Progress_start(&progress_state);
        while (win_ptr->current_lock_type != MPID_LOCK_NONE)
        {
            mpi_errno = MPID_Progress_wait(&progress_state);
            /* --BEGIN ERROR HANDLING-- */
            if (mpi_errno != MPI_SUCCESS) {
                MPID_Progress_end(&progress_state);
                MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
            }
            /* --END ERROR HANDLING-- */
            MPIU_INSTR_DURATION_INCR(winstart_clearlock,0,1);
        }
        MPID_Progress_end(&progress_state);
        MPIU_INSTR_DURATION_END(winstart_clearlock);
    }

    win_ptr->start_group_ptr = group_ptr;
    MPIR_Group_add_ref( group_ptr );
    win_ptr->start_assert = assert;

 fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_START);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
 fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}



#undef FUNCNAME
#define FUNCNAME MPIDI_Win_complete
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_Win_complete(MPID_Win *win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int comm_size, *nops_to_proc, src, new_total_op_count;
1312
    int i, j, dst, total_op_count, *curr_ops_cnt;
1313
    MPIDI_RMA_ops *curr_ptr, *tmpptr, **prevNextPtr;
1314
1315
    MPID_Comm *comm_ptr;
    MPI_Win source_win_handle, target_win_handle;
1316
    MPID_Group *win_grp_ptr;
1317
    int start_grp_size, *ranks_in_start_grp, *ranks_in_win_grp, rank;
1318
1319
    int nRequest = 0;
    int nRequestNew = 0;
1320
    MPIU_CHKLMEM_DECL(9);
1321
1322
1323
1324
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_COMPLETE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_COMPLETE);

1325
    comm_ptr = win_ptr->comm_ptr;
1326
1327
1328
    comm_size = comm_ptr->local_size;
        
    /* Translate the ranks of the processes in
1329
       start_group to ranks in win_ptr->comm_ptr */
1330
1331
    
    start_grp_size = win_ptr->start_group_ptr->size;
1332
1333

    MPIU_INSTR_DURATION_START(wincomplete_recvsync);
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
    MPIU_CHKLMEM_MALLOC(ranks_in_start_grp, int *, start_grp_size*sizeof(int), 
			mpi_errno, "ranks_in_start_grp");
        
    MPIU_CHKLMEM_MALLOC(ranks_in_win_grp, int *, start_grp_size*sizeof(int), 
			mpi_errno, "ranks_in_win_grp");
        
    for (i=0; i<start_grp_size; i++)
    {
	ranks_in_start_grp[i] = i;
    }
        
1345
1346
1347
    mpi_errno = MPIR_Comm_group_impl(comm_ptr, &win_grp_ptr);
    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }

1348
1349
1350
1351
1352
    mpi_errno = MPIR_Group_translate_ranks_impl(win_ptr->start_group_ptr, start_grp_size,
                                                ranks_in_start_grp,
                                                win_grp_ptr, ranks_in_win_grp);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

1353
    rank = win_ptr->myrank;
1354

1355
1356
1357
1358
1359
    /* If MPI_MODE_NOCHECK was not specified, we need to check if
       Win_post was called on the target processes. Wait for a 0-byte sync
       message from each target process */
    if ((win_ptr->start_assert & MPI_MODE_NOCHECK) == 0)
    {
1360
1361
        MPI_Request *req;
        MPI_Status *status;
1362

1363
1364
1365
        MPIU_CHKLMEM_MALLOC(req, MPI_Request *, start_grp_size*sizeof(MPI_Request), mpi_errno, "req");
        MPIU_CHKLMEM_MALLOC(status, MPI_Status *, start_grp_size*sizeof(MPI_Status), mpi_errno, "status");

1366
	MPIU_INSTR_DURATION_INCR(wincomplete_recvsync,0,start_grp_size);
1367
	for (i = 0; i < start_grp_size; i++) {
1368
1369
	    src = ranks_in_win_grp[i];
	    if (src != rank) {
1370
                MPID_Request *req_ptr;
1371
1372
1373
1374
		/* FIXME: This is a heavyweight way to process these sync 
		   messages - this should be handled with a special packet
		   type and callback function.
		*/
1375
1376
1377
1378
1379
1380
1381
1382
                mpi_errno = MPID_Irecv(NULL, 0, MPI_INT, src, SYNC_POST_TAG,
                                       comm_ptr, MPID_CONTEXT_INTRA_PT2PT, &req_ptr);
		if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                req[i] = req_ptr->handle;
	    } else {
                req[i] = MPI_REQUEST_NULL;
            }

1383
	}
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
        mpi_errno = MPIR_Waitall_impl(start_grp_size, req, status);
        if (mpi_errno && mpi_errno != MPI_ERR_IN_STATUS) MPIU_ERR_POP(mpi_errno);

        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno == MPI_ERR_IN_STATUS) {
            for (i = 0; i < start_grp_size; i++) {
                if (status[i].MPI_ERROR != MPI_SUCCESS) {
                    mpi_errno = status[i].MPI_ERROR;
                    MPIU_ERR_POP(mpi_errno);
                }
            }
        }
        /* --END ERROR HANDLING-- */
1397
    }
1398
1399
    MPIU_INSTR_DURATION_END(wincomplete_recvsync);

1400
1401
1402
1403
    /* keep track of no. of ops to each proc. Needed for knowing
       whether or not to decrement the completion counter. The
       completion counter is decremented only on the last
       operation. */
1404
1405
1406

    MPIU_INSTR_DURATION_START(wincomplete_issue);

1407
1408
1409
1410
1411
    MPIU_CHKLMEM_MALLOC(nops_to_proc, int *, comm_size*sizeof(int), 
			mpi_errno, "nops_to_proc");
    for (i=0; i<comm_size; i++) nops_to_proc[i] = 0;

    total_op_count = 0;
1412
    curr_ptr = win_ptr->rma_ops_list_head;
1413
1414
1415
1416
1417
1418
    while (curr_ptr != NULL)
    {
	nops_to_proc[curr_ptr->target_rank]++;
	total_op_count++;
	curr_ptr = curr_ptr->next;
    }
1419
1420
1421
1422

    MPIU_INSTR_DURATION_INCR(wincomplete_issue,0,total_op_count);
    MPIU_INSTR_DURATION_MAX(wincomplete_issue,1,total_op_count);

1423
1424
1425
1426
1427
1428
1429
1430
1431
    /* We allocate a few extra requests because if there are no RMA
       ops to a target process, we need to send a 0-byte message just
       to decrement the completion counter. */
        
    MPIU_CHKLMEM_MALLOC(curr_ops_cnt, int *, comm_size*sizeof(int),