Commit e87c158f authored by Sameh Sharkawi, committed by Tsai-Yang (Alan) Jea

PAMID: MPI_Allreduce/MPI_Reduce coredump w/ DOUBLE_INT datatype



This commit includes multiple fixes:
 - Fixes for MPI_IN_PLACE checking. cudaPointerGetAttributes can report the
   MPI_IN_PLACE sentinel as a device buffer, which causes failures. We now
   check for MPI_IN_PLACE before passing the pointer to CUDA (see the sketch
   below).
 - Enabling PAMID geometries (in order to reach the PAMID collectives) when
   MP_CUDA_AWARE=yes. This allows CUDA buffers to be intercepted.
 - Disabling FCA when MP_CUDA_AWARE=yes, even if the user has enabled FCA.
 - Copying the user receive buffer into the temporary host receive buffer
   before the collective starts, which matters in MPI_IN_PLACE cases.

(ibm) D203255
Signed-off-by: Tsai-Yang (Alan) Jea <tjea@us.ibm.com>
parent eb0e7712
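
For orientation, here is a minimal sketch of the two patterns the hunks below
install: refusing to hand the MPI_IN_PLACE sentinel to CUDA, and staging a
device receive buffer on the host before the collective starts. This is not
the PAMID code itself; the helper names (is_device_buffer, stage_recv_to_host)
are invented for illustration, and the memoryType field matches the older CUDA
runtime targeted by this diff.

  #include <mpi.h>
  #include <stdbool.h>
  #include <stdlib.h>
  #include <string.h>
  #include <cuda_runtime.h>

  /* MPI_IN_PLACE is a sentinel value, not a real address, so it must never
   * be passed to cudaPointerGetAttributes. */
  static bool is_device_buffer(const void *ptr)
  {
    if (ptr == MPI_IN_PLACE)
      return false;
    struct cudaPointerAttributes attr;
    if (cudaPointerGetAttributes(&attr, ptr) != cudaSuccess)
      return false;                 /* unknown to CUDA: treat as host */
    return attr.memoryType == cudaMemoryTypeDevice;
  }

  /* Allocate the temporary host receive buffer.  With MPI_IN_PLACE the
   * receive buffer also carries the input data, so copy it down from the
   * device instead of just zeroing the staging buffer. */
  static void *stage_recv_to_host(const void *sendbuf, void *recvbuf, size_t bytes)
  {
    void *host = malloc(bytes);
    if (host == NULL)
      return NULL;
    if (sendbuf == MPI_IN_PLACE)
      cudaMemcpy(host, recvbuf, bytes, cudaMemcpyDeviceToHost);
    else
      memset(host, 0, bytes);
    return host;
  }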
@@ -81,6 +81,7 @@ enum
   MPID_COLL_OFF = 0,
   MPID_COLL_ON = 1,
   MPID_COLL_FCA = 2, /* Selecting these is fairly easy so special case */
+  MPID_COLL_CUDA = 3, /* This is used to enable PAMI geometry but sets default to MPICH */
 };
 /** \} */

@@ -378,7 +378,14 @@ MPIDO_Allgather(const void *sendbuf,
     if(is_recv_dev_buf)
     {
       rcbuf = MPIU_Malloc(rdt_extent * recvcount);
-      memset(rcbuf, 0, rdt_extent * recvcount);
+      if(sendbuf == MPI_IN_PLACE)
+      {
+        cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, rdt_extent * recvcount, cudaMemcpyDeviceToHost);
+        if (cudaSuccess != cudaerr)
+          fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+      }
+      else
+        memset(rcbuf, 0, rdt_extent * recvcount);
     }
     else
       rcbuf = recvbuf;

@@ -405,7 +405,14 @@ MPIDO_Allgatherv(const void *sendbuf,
       }
       rtotal_buf = (highest_displs+highest_recvcount)*rdt_extent;
       rcbuf = MPIU_Malloc(rtotal_buf);
-      memset(rcbuf, 0, rtotal_buf);
+      if(sendbuf == MPI_IN_PLACE)
+      {
+        cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, rtotal_buf, cudaMemcpyDeviceToHost);
+        if (cudaSuccess != cudaerr)
+          fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+      }
+      else
+        memset(rcbuf, 0, rtotal_buf);
     }
     else
       rcbuf = recvbuf;

@@ -138,7 +138,14 @@ int MPIDO_Allreduce(const void *sendbuf,
     if(is_recv_dev_buf)
     {
       rcbuf = MPIU_Malloc(dt_extent * count);
-      memset(rcbuf, 0, dt_extent * count);
+      if(sendbuf == MPI_IN_PLACE)
+      {
+        cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, dt_extent * count, cudaMemcpyDeviceToHost);
+        if (cudaSuccess != cudaerr)
+          fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+      }
+      else
+        memset(rcbuf, 0, dt_extent * count);
     }
     else
       rcbuf = recvbuf;

@@ -113,7 +113,14 @@ int MPIDO_Alltoall(const void *sendbuf,
     if(is_recv_dev_buf)
     {
       rcbuf = MPIU_Malloc(recvcount * rdt_extent);
-      memset(rcbuf, 0, recvcount * rdt_extent);
+      if(sendbuf == MPI_IN_PLACE)
+      {
+        cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, recvcount * rdt_extent, cudaMemcpyDeviceToHost);
+        if (cudaSuccess != cudaerr)
+          fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+      }
+      else
+        memset(rcbuf, 0, recvcount * rdt_extent);
     }
     else
       rcbuf = recvbuf;

@@ -213,7 +213,14 @@ int MPIDO_Gather(const void *sendbuf,
     if(is_recv_dev_buf)
     {
       rcbuf = MPIU_Malloc(rdt_extent * recvcount);
-      memset(rcbuf, 0, rdt_extent * recvcount);
+      if(sendbuf == MPI_IN_PLACE)
+      {
+        cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, rdt_extent * recvcount, cudaMemcpyDeviceToHost);
+        if (cudaSuccess != cudaerr)
+          fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+      }
+      else
+        memset(rcbuf, 0, rdt_extent * recvcount);
     }
     else
       rcbuf = recvbuf;

@@ -119,7 +119,14 @@ int MPIDO_Gatherv(const void *sendbuf,
       }
       rtotal_buf = (highest_displs+highest_recvcount)*rdt_extent;
       rcbuf = MPIU_Malloc(rtotal_buf);
-      memset(rcbuf, 0, rtotal_buf);
+      if(sendbuf == MPI_IN_PLACE)
+      {
+        cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, rtotal_buf, cudaMemcpyDeviceToHost);
+        if (cudaSuccess != cudaerr)
+          fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+      }
+      else
+        memset(rcbuf, 0, rtotal_buf);
     }
     else
       rcbuf = recvbuf;

@@ -73,7 +73,14 @@ int MPIDO_Reduce_scatter(const void *sendbuf,
     if(is_recv_dev_buf)
     {
       rcbuf = MPIU_Malloc(total_buf * dt_extent);
-      memset(rcbuf, 0, total_buf * dt_extent);
+      if(sendbuf == MPI_IN_PLACE)
+      {
+        cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, dt_extent * total_buf, cudaMemcpyDeviceToHost);
+        if (cudaSuccess != cudaerr)
+          fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+      }
+      else
+        memset(rcbuf, 0, total_buf * dt_extent);
     }
     else
       rcbuf = recvbuf;
@@ -131,7 +138,7 @@ int MPIDO_Reduce_scatter_block(const void *sendbuf,
       scbuf = MPIU_Malloc(dt_extent * recvcount * size);
       cudaError_t cudaerr = CudaMemcpy(scbuf, sendbuf, dt_extent * recvcount * size, cudaMemcpyDeviceToHost);
       if (cudaSuccess != cudaerr)
-        fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+        fprintf(stderr, "cudaMemcpy failed: %s recvbuf: %p scbuf: %p is_send_dev_buf: %d is_recv_dev_buf: %p sendbuf: %p\n", CudaGetErrorString(cudaerr), recvbuf, scbuf, is_send_dev_buf,is_recv_dev_buf, sendbuf );
     }
     else
       scbuf = sendbuf;
@@ -139,7 +146,14 @@ int MPIDO_Reduce_scatter_block(const void *sendbuf,
     if(is_recv_dev_buf)
     {
       rcbuf = MPIU_Malloc(dt_extent * recvcount * size);
-      memset(rcbuf, 0, dt_extent * recvcount * size);
+      if(sendbuf == MPI_IN_PLACE)
+      {
+        cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, dt_extent * recvcount * size, cudaMemcpyDeviceToHost);
+        if (cudaSuccess != cudaerr)
+          fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+      }
+      else
+        memset(rcbuf, 0, recvcount * size * dt_extent);
     }
     else
       rcbuf = recvbuf;
@@ -154,7 +168,7 @@ int MPIDO_Reduce_scatter_block(const void *sendbuf,
     {
       cudaError_t cudaerr = CudaMemcpy(recvbuf, rcbuf, dt_extent * recvcount * size, cudaMemcpyHostToDevice);
       if (cudaSuccess != cudaerr)
-        fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+        fprintf(stderr, "cudaMemcpy failed: %s recvbuf: %p rcbuf: %p is_send_dev_buf: %d is_recv_dev_buf: %p sendbuf: %p\n", CudaGetErrorString(cudaerr), recvbuf, rcbuf, is_send_dev_buf,is_recv_dev_buf, sendbuf );
       MPIU_Free(rcbuf);
     }
     return cuda_res;

@@ -138,7 +138,14 @@ int MPIDO_Reduce(const void *sendbuf,
     if(is_recv_dev_buf)
     {
       rcbuf = MPIU_Malloc(dt_extent * count);
-      memset(rcbuf, 0, dt_extent * count);
+      if(sendbuf == MPI_IN_PLACE)
+      {
+        cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, dt_extent * count, cudaMemcpyDeviceToHost);
+        if (cudaSuccess != cudaerr)
+          fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+      }
+      else
+        memset(rcbuf, 0, dt_extent * count);
     }
     else
       rcbuf = recvbuf;

@@ -156,7 +156,14 @@ int MPIDO_Doscan(const void *sendbuf, void *recvbuf,
     if(is_recv_dev_buf)
     {
       rcbuf = MPIU_Malloc(dt_extent * count);
-      memset(rcbuf, 0, dt_extent * count);
+      if(sendbuf == MPI_IN_PLACE)
+      {
+        cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, dt_extent * count, cudaMemcpyDeviceToHost);
+        if (cudaSuccess != cudaerr)
+          fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+      }
+      else
+        memset(rcbuf, 0, dt_extent * count);
     }
     else
       rcbuf = recvbuf;

@@ -161,7 +161,7 @@ int MPIDO_Scatter(const void *sendbuf,
     if(is_recv_dev_buf)
     {
       rcbuf = MPIU_Malloc(rdt_extent * recvcount);
-      memset(rcbuf, 0, rdt_extent * recvcount);
+      CudaMemcpy(rcbuf, recvbuf, rdt_extent * recvcount, cudaMemcpyDeviceToHost);
     }
     else
       rcbuf = recvbuf;

@@ -281,7 +281,7 @@ int MPIDO_Scatterv(const void *sendbuf,
     if(is_recv_dev_buf)
     {
       rcbuf = MPIU_Malloc(recvcount * rdt_extent);
-      memset(rcbuf, 0, recvcount * rdt_extent);
+      CudaMemcpy(rcbuf, recvbuf, recvcount * rdt_extent, cudaMemcpyDeviceToHost);
     }
     else
       rcbuf = recvbuf;

@@ -257,7 +257,7 @@ void MPIDI_Comm_coll_envvars(MPID_Comm *comm)
     comm->mpid.user_selected_type[i] = MPID_COLL_NOSELECTION;
     if(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_0 && comm->rank == 0)
       fprintf(stderr,"Setting up collective %d on comm %p\n", i, comm);
-    if((comm->mpid.coll_count[i][0] == 0) && (comm->mpid.coll_count[i][1] == 0))
+    if(((comm->mpid.coll_count[i][0] == 0) && (comm->mpid.coll_count[i][1] == 0)) || MPIDI_Process.optimized.collectives == MPID_COLL_CUDA)
     {
       comm->mpid.user_selected_type[i] = MPID_COLL_USE_MPICH;
       comm->mpid.user_selected[i] = 0;

@@ -641,8 +641,8 @@ void MPIDI_Init_collsel_extension()
     MPIDI_Process.optimized.auto_select_colls = MPID_AUTO_SELECT_COLLS_NONE;
 #ifndef __BGQ__
-  //If collective selection will be disabled, check on fca, if both not required, disable pami alltogether
-  if(MPIDI_Process.optimized.auto_select_colls == MPID_AUTO_SELECT_COLLS_NONE && MPIDI_Process.optimized.collectives != MPID_COLL_FCA)
+  //If collective selection will be disabled, check on fca and CUDA if both not required, disable pami alltogether
+  if(MPIDI_Process.optimized.auto_select_colls == MPID_AUTO_SELECT_COLLS_NONE && MPIDI_Process.optimized.collectives != MPID_COLL_FCA && MPIDI_Process.optimized.collectives != MPID_COLL_CUDA)
     MPIDI_Process.optimized.collectives = MPID_COLL_OFF;
 #endif
 }

@@ -1156,6 +1156,17 @@ MPIDI_Env_setup(int rank, int requested)
         fprintf(stderr, "Error loading libcudart\n");fflush(stderr);sleep(1);exit(1);
       }
     }
+    else if(MPIDI_Process.cuda_aware_support_on)
+    {
+      if(MPIDI_Process.optimized.collectives == MPID_COLL_FCA)
+        if(rank == 0)
+        {
+          fprintf(stderr, "Warning: FCA is not supported with CUDA Aware support\n");fflush(stderr);
+        }
+      MPIDI_Process.optimized.collectives = MPID_COLL_CUDA;
+      MPIDI_Process.optimized.select_colls = 0;
+    }
 #endif
   /* Exit if any deprecated environment variables were specified. */

@@ -1969,19 +1969,25 @@ inline bool MPIDI_enable_cuda()
 inline bool MPIDI_cuda_is_device_buf(const void* ptr)
 {
-  bool result = false;
+  bool result = false;
 #if CUDA_AWARE_SUPPORT
-  struct cudaPointerAttributes cuda_attr;
-  cudaError_t e= CudaPointerGetAttributes ( & cuda_attr, ptr);
-  if (e != cudaSuccess)
-    result = false;
-  else if (cuda_attr.memoryType == cudaMemoryTypeDevice)
-    result = true;
-  else
-    result = false;
+  if(MPIDI_Process.cuda_aware_support_on)
+  {
+    if(ptr != MPI_IN_PLACE)
+    {
+      struct cudaPointerAttributes cuda_attr;
+      cudaError_t e= CudaPointerGetAttributes ( & cuda_attr, ptr);
+      if (e != cudaSuccess)
+        result = false;
+      else if (cuda_attr.memoryType == cudaMemoryTypeDevice)
+        result = true;
+      else
+        result = false;
+    }
+  }
 #endif
-  return result;
+  return result;
 }