/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpiimpl.h"

/* -- Begin Profiling Symbol Block for routine MPI_Alltoallv */
#if defined(HAVE_PRAGMA_WEAK)
#pragma weak MPI_Alltoallv = PMPI_Alltoallv
#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
#pragma _HP_SECONDARY_DEF PMPI_Alltoallv  MPI_Alltoallv
#elif defined(HAVE_PRAGMA_CRI_DUP)
#pragma _CRI duplicate MPI_Alltoallv as PMPI_Alltoallv
#endif
/* -- End Profiling Symbol Block */

/* Define MPICH_MPI_FROM_PMPI if weak symbols are not supported to build
   the MPI routines */
#ifndef MPICH_MPI_FROM_PMPI
#undef MPI_Alltoallv
#define MPI_Alltoallv PMPI_Alltoallv
/* This is the default implementation of alltoallv. The algorithm is:
   
   Algorithm: MPI_Alltoallv

   Since each process sends/receives different amounts of data to
   every other process, we don't know the total message size for all
   processes without additional communication. Therefore we simply use
   the "middle of the road" isend/irecv algorithm that works
   reasonably well in all cases.

   We post all irecvs and isends and then do a waitall. We scatter the
   order of sources and destinations among the processes, so that all
   processes don't try to send/recv to/from the same process at the
   same time. 

   *** Modification: We post only a small number of isends and irecvs 
   at a time and wait on them as suggested by Tony Ladd. ***

   For MPI_IN_PLACE we use a completely different algorithm.  We perform
   pair-wise exchanges among all processes using sendrecv_replace.  This
   conserves memory usage at the expense of time performance.

   Possible improvements: 

   End Algorithm: MPI_Alltoallv
*/
 
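/* A minimal user-level sketch of the throttled pattern described above,
   written against the public MPI API (illustrative only; bblock, reqs,
   rbuf, sbuf, rtype, stype, and tag here are placeholders, not MPICH
   internals):

       for (ii = 0; ii < comm_size; ii += bblock) {
           int nreq = 0;
           int ss = (comm_size - ii < bblock) ? comm_size - ii : bblock;
           for (i = 0; i < ss; i++) {
               int src = (rank + i + ii) % comm_size;
               MPI_Irecv(rbuf + rdispls[src], recvcounts[src], rtype,
                         src, tag, comm, &reqs[nreq++]);
           }
           for (i = 0; i < ss; i++) {
               int dst = (rank - i - ii + comm_size) % comm_size;
               MPI_Isend(sbuf + sdispls[dst], sendcounts[dst], stype,
                         dst, tag, comm, &reqs[nreq++]);
           }
           MPI_Waitall(nreq, reqs, MPI_STATUSES_IGNORE);
       }

   Each process starts its receive sweep at its own rank and its send sweep
   just below it, which spreads the traffic instead of having every rank
   target rank 0 first. */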
/* not declared static because a machine-specific function may call this one in some cases */
#undef FUNCNAME
#define FUNCNAME MPIR_Alltoallv_intra
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
int MPIR_Alltoallv_intra(const void *sendbuf, const int *sendcounts, const int *sdispls,
                         MPI_Datatype sendtype, void *recvbuf, const int *recvcounts,
                         const int *rdispls, MPI_Datatype recvtype, MPID_Comm *comm_ptr,
                         int *errflag)
{
    int        comm_size, i, j;
    MPI_Aint   send_extent, recv_extent;
    int        mpi_errno = MPI_SUCCESS;
    int mpi_errno_ret = MPI_SUCCESS;
    MPI_Status *starray;
    MPI_Status status;
    MPI_Request *reqarray;
    int dst, rank, req_cnt;
    MPI_Comm comm;
    int ii, ss, bblock;
    int type_size;

    MPIU_CHKLMEM_DECL(2);

    comm = comm_ptr->handle;
    comm_size = comm_ptr->local_size;
    rank = comm_ptr->rank;

    /* Get extent of recv type, but send type is only valid if (sendbuf!=MPI_IN_PLACE) */
    MPID_Datatype_get_extent_macro(recvtype, recv_extent);

    /* check if multiple threads are calling this collective function */
    MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );

    if (sendbuf == MPI_IN_PLACE) {
        /* We use pair-wise sendrecv_replace in order to conserve memory usage,
         * which is in keeping with the spirit of the MPI-2.2 Standard.  But
         * because of this approach all processes must agree on the global
         * schedule of sendrecv_replace operations to avoid deadlock.
         *
         * Note that this is not an especially efficient algorithm in terms of
         * time and there will be multiple repeated malloc/free's rather than
         * maintaining a single buffer across the whole loop.  Something like
         * MADRE is probably the best solution for the MPI_IN_PLACE scenario. */
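        /* Illustrative schedule for comm_size == 3: every process walks the
         * same (i,j) list (0,0) (0,1) (0,2) (1,1) (1,2) (2,2); only the two
         * ranks named by a pair exchange data at that step, which is what
         * makes the agreed global ordering deadlock-free. */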
        for (i = 0; i < comm_size; ++i) {
            /* start inner loop at i to avoid re-exchanging data */
            for (j = i; j < comm_size; ++j) {
                if (rank == i) {
                    /* also covers the (rank == i && rank == j) case */
                    mpi_errno = MPIC_Sendrecv_replace(((char *)recvbuf + rdispls[j]*recv_extent),
                                                         recvcounts[j], recvtype,
                                                         j, MPIR_ALLTOALLV_TAG,
                                                         j, MPIR_ALLTOALLV_TAG,
                                                         comm, &status, errflag);
                    if (mpi_errno) {
                        /* for communication errors, just record the error but continue */
                        *errflag = TRUE;
                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
                    }

                }
                else if (rank == j) {
                    /* same as above with i/j args reversed */
                    mpi_errno = MPIC_Sendrecv_replace(((char *)recvbuf + rdispls[i]*recv_extent),
                                                         recvcounts[i], recvtype,
                                                         i, MPIR_ALLTOALLV_TAG,
                                                         i, MPIR_ALLTOALLV_TAG,
                                                         comm, &status, errflag);
                    if (mpi_errno) {
                        /* for communication errors, just record the error but continue */
                        *errflag = TRUE;
                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
                    }
                }
            }
        }
    }
    else {
        bblock = MPIR_CVAR_ALLTOALL_THROTTLE;
        if (bblock == 0) bblock = comm_size;

        MPID_Datatype_get_extent_macro(sendtype, send_extent);

        MPIU_CHKLMEM_MALLOC(starray,  MPI_Status*,  2*bblock*sizeof(MPI_Status),  mpi_errno, "starray");
        MPIU_CHKLMEM_MALLOC(reqarray, MPI_Request*, 2*bblock*sizeof(MPI_Request), mpi_errno, "reqarray");

        /* post only bblock isends/irecvs at a time as suggested by Tony Ladd */
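        /* Example: comm_size = 10 with bblock = 4 proceeds in waves of
         * ss = 4, 4, 2, so at most 2*bblock requests (receives plus sends)
         * are outstanding at any one time. */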
        for (ii=0; ii<comm_size; ii+=bblock) {
            req_cnt = 0;
            ss = comm_size-ii < bblock ? comm_size-ii : bblock;

            /* do the communication -- post ss sends and receives: */
            for ( i=0; i<ss; i++ ) { 
                dst = (rank+i+ii) % comm_size;
                if (recvcounts[dst]) {
                    MPID_Datatype_get_size_macro(recvtype, type_size);
                    if (type_size) {
                        MPID_Ensure_Aint_fits_in_pointer(MPI_VOID_PTR_CAST_TO_MPI_AINT recvbuf +
                                                         rdispls[dst]*recv_extent);
                        mpi_errno = MPIC_Irecv((char *)recvbuf+rdispls[dst]*recv_extent,
                                                  recvcounts[dst], recvtype, dst,
                                                  MPIR_ALLTOALLV_TAG, comm,
                                                  &reqarray[req_cnt]);
                        if (mpi_errno) {
                            /* for communication errors, just record the error but continue */
                            *errflag = TRUE;
                            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
                            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
                        }
                        req_cnt++;
                    }
                }
            }

            for ( i=0; i<ss; i++ ) { 
                dst = (rank-i-ii+comm_size) % comm_size;
                if (sendcounts[dst]) {
                    MPID_Datatype_get_size_macro(sendtype, type_size);
                    if (type_size) {
                        MPID_Ensure_Aint_fits_in_pointer(MPI_VOID_PTR_CAST_TO_MPI_AINT sendbuf +
                                                         sdispls[dst]*send_extent);
                        mpi_errno = MPIC_Isend((char *)sendbuf+sdispls[dst]*send_extent,
                                                  sendcounts[dst], sendtype, dst,
                                                  MPIR_ALLTOALLV_TAG, comm,
                                                  &reqarray[req_cnt], errflag);
                        if (mpi_errno) {
                            /* for communication errors, just record the error but continue */
                            *errflag = TRUE;
                            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
                            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
                        }
                        req_cnt++;
                    }
                }
            }

            mpi_errno = MPIC_Waitall(req_cnt, reqarray, starray, errflag);
            if (mpi_errno && mpi_errno != MPI_ERR_IN_STATUS) MPIU_ERR_POP(mpi_errno);

            /* --BEGIN ERROR HANDLING-- */
            if (mpi_errno == MPI_ERR_IN_STATUS) {
                for (i=0; i<req_cnt; i++) {
                    if (starray[i].MPI_ERROR != MPI_SUCCESS) {
                        mpi_errno = starray[i].MPI_ERROR;
                        if (mpi_errno) {
                            /* for communication errors, just record the error but continue */
                            *errflag = TRUE;
                            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
                            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
                        }
                    }
                }
            }
            /* --END ERROR HANDLING-- */
        }
    }

fn_exit:
    /* check if multiple threads are calling this collective function */
    MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
    MPIU_CHKLMEM_FREEALL();

    if (mpi_errno_ret)
        mpi_errno = mpi_errno_ret;
    else if (*errflag)
        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    return mpi_errno;

fn_fail:
    goto fn_exit;
}

/* not declared static because a machine-specific function may call this one in some cases */
#undef FUNCNAME
#define FUNCNAME MPIR_Alltoallv_inter
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
int MPIR_Alltoallv_inter(const void *sendbuf, const int *sendcounts, const int *sdispls,
                         MPI_Datatype sendtype, void *recvbuf, const int *recvcounts,
                         const int *rdispls, MPI_Datatype recvtype, MPID_Comm *comm_ptr,
                         int *errflag)
{
/* Intercommunicator alltoallv. We use a pairwise exchange algorithm
   similar to the one used in intracommunicator alltoallv. Since the
   local and remote groups can be of different 
   sizes, we first compute the max of local_group_size,
   remote_group_size. At step i, 0 <= i < max_size, each process
   receives from src = (rank - i + max_size) % max_size if src <
   remote_size, and sends to dst = (rank + i) % max_size if dst <
   remote_size. 

   FIXME: change algorithm to match intracommunicator alltoallv

*/
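/* Worked example (illustrative): local_size = 3 and remote_size = 2 give
   max_size = 3.  Local rank 2 runs steps i = 0,1,2 with
   dst = (2+i) % 3 = 2,0,1 and src = (2-i+3) % 3 = 2,1,0; at step 0 both
   indices are >= remote_size, so they fall through to MPI_PROC_NULL below. */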
    int local_size, remote_size, max_size, i;
    MPI_Aint   send_extent, recv_extent;
    int        mpi_errno = MPI_SUCCESS;
    int mpi_errno_ret = MPI_SUCCESS;
    MPI_Status status;
    int src, dst, rank, sendcount, recvcount;
    char *sendaddr, *recvaddr;
    MPI_Comm comm;
    
    local_size = comm_ptr->local_size; 
    remote_size = comm_ptr->remote_size;
    comm = comm_ptr->handle;
    rank = comm_ptr->rank;
    
    /* Get extent of send and recv types */
    MPID_Datatype_get_extent_macro(sendtype, send_extent);
    MPID_Datatype_get_extent_macro(recvtype, recv_extent);
    
    /* check if multiple threads are calling this collective function */
    MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );

    /* Use pairwise exchange algorithm. */
    max_size = MPIR_MAX(local_size, remote_size);
    for (i=0; i<max_size; i++) {
        src = (rank - i + max_size) % max_size;
        dst = (rank + i) % max_size;
        if (src >= remote_size) {
            src = MPI_PROC_NULL;
            recvaddr = NULL;
            recvcount = 0;
        }
        else {
            MPID_Ensure_Aint_fits_in_pointer(MPI_VOID_PTR_CAST_TO_MPI_AINT recvbuf +
					     rdispls[src]*recv_extent);
            recvaddr = (char *)recvbuf + rdispls[src]*recv_extent;
            recvcount = recvcounts[src];
        }
        if (dst >= remote_size) {
            dst = MPI_PROC_NULL;
            sendaddr = NULL;
            sendcount = 0;
        }
        else {
            MPID_Ensure_Aint_fits_in_pointer(MPI_VOID_PTR_CAST_TO_MPI_AINT sendbuf +
					     sdispls[dst]*send_extent);
            sendaddr = (char *)sendbuf + sdispls[dst]*send_extent;
            sendcount = sendcounts[dst];
        }

        mpi_errno = MPIC_Sendrecv(sendaddr, sendcount, sendtype, dst,
                                     MPIR_ALLTOALLV_TAG, recvaddr, recvcount, 
                                     recvtype, src, MPIR_ALLTOALLV_TAG,
                                     comm, &status, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = TRUE;
            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }

 fn_exit:
    /* check if multiple threads are calling this collective function */
    MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
    if (mpi_errno_ret)
        mpi_errno = mpi_errno_ret;
    else if (*errflag)
        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**coll_fail");
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIR_Alltoallv
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
int MPIR_Alltoallv(const void *sendbuf, const int *sendcounts, const int *sdispls,
                   MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, const int *rdispls,
                   MPI_Datatype recvtype, MPID_Comm *comm_ptr, int *errflag)
{
    int mpi_errno = MPI_SUCCESS;
        
    if (comm_ptr->comm_kind == MPID_INTRACOMM) {
        /* intracommunicator */
        mpi_errno = MPIR_Alltoallv_intra(sendbuf, sendcounts, sdispls,
                                         sendtype, recvbuf, recvcounts,
                                         rdispls, recvtype, comm_ptr, errflag);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    } else {
        /* intercommunicator */
        mpi_errno = MPIR_Alltoallv_inter(sendbuf, sendcounts, sdispls,
                                         sendtype, recvbuf, recvcounts,
                                         rdispls, recvtype, comm_ptr, errflag);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIR_Alltoallv_impl
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
int MPIR_Alltoallv_impl(const void *sendbuf, const int *sendcounts, const int *sdispls,
                        MPI_Datatype sendtype, void *recvbuf, const int *recvcounts,
                        const int *rdispls, MPI_Datatype recvtype,
                        MPID_Comm *comm_ptr, int *errflag)
{
    int mpi_errno = MPI_SUCCESS;
        
    if (comm_ptr->coll_fns != NULL && comm_ptr->coll_fns->Alltoallv != NULL) {
	/* --BEGIN USEREXTENSION-- */
	mpi_errno = comm_ptr->coll_fns->Alltoallv(sendbuf, sendcounts, sdispls,
                                                 sendtype, recvbuf, recvcounts,
                                                 rdispls, recvtype, comm_ptr, errflag);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
	/* --END USEREXTENSION-- */
    } else {
        mpi_errno = MPIR_Alltoallv(sendbuf, sendcounts, sdispls,
                                   sendtype, recvbuf, recvcounts,
                                   rdispls, recvtype, comm_ptr, errflag);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}

#endif

#undef FUNCNAME
#define FUNCNAME MPI_Alltoallv
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
/*@
MPI_Alltoallv - Sends data from all to all processes; each process may 
   send a different amount of data and provide displacements for the input
   and output data.

Input Parameters:
+ sendbuf - starting address of send buffer (choice) 
. sendcounts - integer array (of length group size) specifying the number of
elements to send to each process
. sdispls - integer array (of length group size). Entry 
 'j'  specifies the displacement (relative to sendbuf) from
which to take the outgoing data destined for process  'j'  
. sendtype - data type of send buffer elements (handle) 
. recvcounts - integer array (of length group size) specifying the maximum
number of elements that can be received from each process
. rdispls - integer array (of length group size). Entry 
 'i'  specifies the displacement (relative to recvbuf) at
which to place the incoming data from process  'i'  
. recvtype - data type of receive buffer elements (handle) 
- comm - communicator (handle) 

Output Parameters:
. recvbuf - address of receive buffer (choice) 

.N ThreadSafe

.N Fortran

.N Errors
.N MPI_ERR_COMM
.N MPI_ERR_COUNT
.N MPI_ERR_TYPE
.N MPI_ERR_BUFFER
@*/
int MPI_Alltoallv(const void *sendbuf, const int *sendcounts,
                  const int *sdispls, MPI_Datatype sendtype, void *recvbuf,
                  const int *recvcounts, const int *rdispls, MPI_Datatype recvtype,
                  MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Comm *comm_ptr = NULL;
    int errflag = FALSE;
    MPID_MPI_STATE_DECL(MPID_STATE_MPI_ALLTOALLV);

    MPIR_ERRTEST_INITIALIZED_ORDIE();
    
    MPIU_THREAD_CS_ENTER(ALLFUNC,);
    MPID_MPI_COLL_FUNC_ENTER(MPID_STATE_MPI_ALLTOALLV);

    /* Validate parameters, especially handles needing to be converted */
#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
	    MPIR_ERRTEST_COMM(comm, mpi_errno);
	}
        MPID_END_ERROR_CHECKS;
    }
#   endif /* HAVE_ERROR_CHECKING */

    /* Convert MPI object handles to object pointers */
    MPID_Comm_get_ptr( comm, comm_ptr );

    /* Validate parameters and objects (post conversion) */
#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
	    MPID_Datatype *sendtype_ptr=NULL, *recvtype_ptr=NULL;
            int i, comm_size;
            int check_send = (comm_ptr->comm_kind == MPID_INTRACOMM && sendbuf != MPI_IN_PLACE);

            MPID_Comm_valid_ptr( comm_ptr, mpi_errno );
            if (mpi_errno != MPI_SUCCESS) goto fn_fail;

            if (comm_ptr->comm_kind == MPID_INTRACOMM) {
                comm_size = comm_ptr->local_size;

                if (sendbuf != MPI_IN_PLACE && sendtype == recvtype && sendcounts == recvcounts)
                    MPIR_ERRTEST_ALIAS_COLL(sendbuf, recvbuf, mpi_errno);
            } else
                comm_size = comm_ptr->remote_size;

            if (comm_ptr->comm_kind == MPID_INTERCOMM && sendbuf == MPI_IN_PLACE) {
                MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**sendbuf_inplace");
            }

            for (i=0; i<comm_size; i++) {
                if (check_send) {
                    MPIR_ERRTEST_COUNT(sendcounts[i], mpi_errno);
                    MPIR_ERRTEST_DATATYPE(sendtype, "sendtype", mpi_errno);
                }
                MPIR_ERRTEST_COUNT(recvcounts[i], mpi_errno);
                MPIR_ERRTEST_DATATYPE(recvtype, "recvtype", mpi_errno);
            }
            if (check_send && HANDLE_GET_KIND(sendtype) != HANDLE_KIND_BUILTIN) {
                MPID_Datatype_get_ptr(sendtype, sendtype_ptr);
                MPID_Datatype_valid_ptr( sendtype_ptr, mpi_errno );
                if (mpi_errno != MPI_SUCCESS) goto fn_fail;
                MPID_Datatype_committed_ptr( sendtype_ptr, mpi_errno );
                if (mpi_errno != MPI_SUCCESS) goto fn_fail;
            }
            if (HANDLE_GET_KIND(recvtype) != HANDLE_KIND_BUILTIN) {
                MPID_Datatype_get_ptr(recvtype, recvtype_ptr);
                MPID_Datatype_valid_ptr( recvtype_ptr, mpi_errno );
                if (mpi_errno != MPI_SUCCESS) goto fn_fail;
                MPID_Datatype_committed_ptr( recvtype_ptr, mpi_errno );
                if (mpi_errno != MPI_SUCCESS) goto fn_fail;
            }

            for (i=0; i<comm_size && check_send; i++) {
                if (sendcounts[i] > 0) {
                    MPIR_ERRTEST_USERBUFFER(sendbuf,sendcounts[i],sendtype,mpi_errno);
                }
            }
            for (i=0; i<comm_size; i++) {
                if (recvcounts[i] > 0) {
                    MPIR_ERRTEST_RECVBUF_INPLACE(recvbuf, recvcounts[i], mpi_errno);
                    MPIR_ERRTEST_USERBUFFER(recvbuf,recvcounts[i],recvtype,mpi_errno);
                    break;
                }
            }
        }
        MPID_END_ERROR_CHECKS;
    }
#   endif /* HAVE_ERROR_CHECKING */

    /* ... body of routine ...  */

    mpi_errno = MPIR_Alltoallv_impl(sendbuf, sendcounts, sdispls,
                                    sendtype, recvbuf, recvcounts,
                                    rdispls, recvtype, comm_ptr, &errflag);
    if (mpi_errno) goto fn_fail;

    /* ... end of body of routine ... */

  fn_exit:
    MPID_MPI_COLL_FUNC_EXIT(MPID_STATE_MPI_ALLTOALLV);
    MPIU_THREAD_CS_EXIT(ALLFUNC,);
    return mpi_errno;

  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
#   ifdef HAVE_ERROR_CHECKING
    {
	mpi_errno = MPIR_Err_create_code(
	    mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**mpi_alltoallv",
	    "**mpi_alltoallv %p %p %p %D %p %p %p %D %C", sendbuf, sendcounts, sdispls, sendtype,
	    recvbuf, recvcounts, rdispls, recvtype, comm);
    }
#   endif
    mpi_errno = MPIR_Err_return_comm( comm_ptr, FCNAME, mpi_errno );
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
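
/* Usage sketch (illustrative; not part of MPICH).  Each rank sends one int
   to every rank, with block j placed at displacement j in both buffers, so
   the call behaves like MPI_Alltoall with count 1:

       #include <mpi.h>
       #include <stdlib.h>

       int main(int argc, char **argv)
       {
           int size, rank, i;
           MPI_Init(&argc, &argv);
           MPI_Comm_size(MPI_COMM_WORLD, &size);
           MPI_Comm_rank(MPI_COMM_WORLD, &rank);

           int *sbuf  = malloc(size * sizeof(int));
           int *rbuf  = malloc(size * sizeof(int));
           int *cnts  = malloc(size * sizeof(int));
           int *disps = malloc(size * sizeof(int));
           for (i = 0; i < size; i++) {
               sbuf[i]  = rank * 100 + i;
               cnts[i]  = 1;
               disps[i] = i;
           }
           MPI_Alltoallv(sbuf, cnts, disps, MPI_INT,
                         rbuf, cnts, disps, MPI_INT, MPI_COMM_WORLD);

           free(sbuf); free(rbuf); free(cnts); free(disps);
           MPI_Finalize();
           return 0;
       }

   On return, rbuf[i] holds i*100 + rank, i.e. the block that rank i
   addressed to this process. */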