Commit d9c15cf3 authored by Sameh Sharkawi's avatar Sameh Sharkawi
Browse files

PAMID: Initial CUDA support



This is an initial limited implementation for CUDA support. This is not
performance optimized and only for testing.

(ibm) D202477
Signed-off-by: default avatarSu Huang <suhuang@us.ibm.com>
parent 3df58689
......@@ -38,6 +38,11 @@
#include "opa_primitives.h"
#if CUDA_AWARE_SUPPORT
#include <cuda_runtime_api.h>
#endif
#if (MPIU_HANDLE_ALLOCATION_METHOD == MPIU_HANDLE_ALLOCATION_THREAD_LOCAL) && defined(__BGQ__)
struct MPID_Request;
typedef struct
......@@ -105,6 +110,11 @@ typedef struct
#if QUEUE_BINARY_SEARCH_SUPPORT
unsigned queue_binary_search_support_on;
#endif
#if CUDA_AWARE_SUPPORT
unsigned cuda_aware_support_on;
#endif
unsigned verbose; /**< The current level of verbosity for end-of-job stats. */
unsigned statistics; /**< The current level of stats collection. */
unsigned rma_pending; /**< The max num outstanding requests during an RMA op */
......
......@@ -86,6 +86,7 @@ typedef int32_t MPID_Node_id_t;
#define PAMIX_IS_LOCAL_TASK_SHIFT (6)
#define MPIDI_SMP_DETECT_DEFAULT 1
#define TOKEN_FLOW_CONTROL 0
#define CUDA_AWARE_SUPPORT 0
/*
* Enable both the 'internal vs application' and the 'local vs remote'
......@@ -142,6 +143,7 @@ static const char _ibm_release_version_[] = "V1R2M0";
#define MPIDI_NO_ASSERT 1
#define TOKEN_FLOW_CONTROL 1
#define DYNAMIC_TASKING 1
#define CUDA_AWARE_SUPPORT 1
/* 'is local task' extension and limits */
#define PAMIX_IS_LOCAL_TASK
......
......@@ -266,6 +266,7 @@ pami_result_t MPIDI_Register_algorithms_ext(void *cookie,
size_t *num_algorithms);
int MPIDI_collsel_pami_tune_parse_params(int argc, char ** argv);
void MPIDI_collsel_pami_tune_cleanup();
inline bool MPIDI_cuda_is_device_buf(const void* ptr);
void MPIDI_Coll_Comm_create (MPID_Comm *comm);
void MPIDI_Coll_Comm_destroy(MPID_Comm *comm);
void MPIDI_Comm_coll_query (MPID_Comm *comm);
......
......@@ -117,6 +117,22 @@ int MPIDO_Allreduce(const void *sendbuf,
fprintf(stderr,"Using MPICH allreduce type %u.\n",
selected_type);
MPIDI_Update_last_algorithm(comm_ptr, "ALLREDUCE_MPICH");
#if CUDA_AWARE_SUPPORT
if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(sendbuf))
{
MPI_Aint dt_extent;
MPID_Datatype_get_extent_macro(dt, dt_extent);
char *buf = MPIU_Malloc(dt_extent * count);
cudaError_t cudaerr = cudaMemcpy(buf, sendbuf, dt_extent * count, cudaMemcpyDeviceToHost);
if (cudaSuccess != cudaerr) {
fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaerr));
}
int cuda_res = MPIR_Allreduce(buf, recvbuf, count, dt, op, comm_ptr, mpierrno);
MPIU_Free(buf);
return cuda_res;
}
else
#endif
return MPIR_Allreduce(sendbuf, recvbuf, count, dt, op, comm_ptr, mpierrno);
}
......
......@@ -117,6 +117,22 @@ int MPIDO_Reduce(const void *sendbuf,
{
if(unlikely(verbose))
fprintf(stderr,"Using MPICH reduce algorithm\n");
#if CUDA_AWARE_SUPPORT
if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(sendbuf))
{
MPI_Aint dt_extent;
MPID_Datatype_get_extent_macro(datatype, dt_extent);
char *buf = MPIU_Malloc(dt_extent * count);
cudaError_t cudaerr = cudaMemcpy(buf, sendbuf, dt_extent * count, cudaMemcpyDeviceToHost);
if (cudaSuccess != cudaerr) {
fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaerr));
}
int cuda_res = MPIR_Reduce(buf, recvbuf, count, datatype, op, root, comm_ptr, mpierrno);
MPIU_Free(buf);
return cuda_res;
}
else
#endif
return MPIR_Reduce(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, mpierrno);
}
......
......@@ -135,6 +135,26 @@ int MPIDO_Doscan(const void *sendbuf, void *recvbuf,
{
if(unlikely(verbose))
fprintf(stderr,"Using MPICH scan algorithm (exflag %d)\n",exflag);
#if CUDA_AWARE_SUPPORT
if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(sendbuf))
{
MPI_Aint dt_extent;
MPID_Datatype_get_extent_macro(datatype, dt_extent);
char *buf = MPIU_Malloc(dt_extent * count);
cudaError_t cudaerr = cudaMemcpy(buf, sendbuf, dt_extent * count, cudaMemcpyDeviceToHost);
if (cudaSuccess != cudaerr) {
fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaerr));
}
int cuda_res;
if(exflag)
cuda_res = MPIR_Exscan(buf, recvbuf, count, datatype, op, comm_ptr, mpierrno);
else
cuda_res = MPIR_Scan(buf, recvbuf, count, datatype, op, comm_ptr, mpierrno);
MPIU_Free(buf);
return cuda_res;
}
else
#endif
if(exflag)
return MPIR_Exscan(sendbuf, recvbuf, count, datatype, op, comm_ptr, mpierrno);
else
......
......@@ -57,10 +57,12 @@ void MPIDI_Buffer_copy(
MPID_Datatype * sdt_ptr;
MPID_Datatype * rdt_ptr;
MPI_Aint sdt_extent;
MPI_Aint rdt_extent;
*smpi_errno = MPI_SUCCESS;
*rmpi_errno = MPI_SUCCESS;
/* printf("bufcopy: src count=%d dt=%d\n", scount, sdt); */
/* printf("bufcopy: dst count=%d dt=%d\n", rcount, rdt); */
......@@ -83,11 +85,52 @@ void MPIDI_Buffer_copy(
if (sdt_contig && rdt_contig)
{
#if CUDA_AWARE_SUPPORT
if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rbuf))
{
cudaError_t cudaerr = cudaMemcpy(rbuf + rdt_true_lb, sbuf + sdt_true_lb, sdata_sz, cudaMemcpyHostToDevice);
}
else
#endif
memcpy((char*)rbuf + rdt_true_lb, (const char *)sbuf + sdt_true_lb, sdata_sz);
*rsz = sdata_sz;
}
else if (sdt_contig)
{
#if CUDA_AWARE_SUPPORT
// This will need to be done in two steps:
// 1 - Allocate a temp buffer which is the same size as user buffer and unpack in it.
// 2 - Copy unpacked data into user buffer from temp buffer.
if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rbuf))
{
MPID_Datatype_get_extent_macro(rdt, rdt_extent);
char *buf = MPIU_Malloc(rdt_extent * rcount);
memset(buf, 0, rdt_extent * rcount);
MPID_Segment seg;
DLOOP_Offset last;
MPID_Segment_init(buf, rcount, rdt, &seg, 0);
last = sdata_sz;
MPID_Segment_unpack(&seg, 0, &last, (char*)sbuf + sdt_true_lb);
/* --BEGIN ERROR HANDLING-- */
if (last != sdata_sz)
{
*rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0);
}
/* --END ERROR HANDLING-- */
*rsz = last;
cudaError_t cudaerr = cudaMemcpy(rbuf + rdt_true_lb, buf, rdt_extent * rcount, cudaMemcpyHostToDevice);
MPIU_Free(buf);
goto fn_exit;
}
#endif
MPID_Segment seg;
DLOOP_Offset last;
......
......@@ -118,6 +118,9 @@ MPIDI_Process_t MPIDI_Process = {
#endif
#ifdef QUEUE_BINARY_SEARCH_SUPPORT
.queue_binary_search_support_on = 0,
#endif
#if CUDA_AWARE_SUPPORT
.cuda_aware_support_on = 0,
#endif
.rma_pending = 1000,
.shmem_pt2pt = 1,
......
......@@ -1139,6 +1139,12 @@ MPIDI_Env_setup(int rank, int requested)
char* names[] = {"MP_S_USE_QUEUE_BINARY_SEARCH_SUPPORT", NULL};
ENV_Char(names, &MPIDI_Process.queue_binary_search_support_on);
#endif
#if CUDA_AWARE_SUPPORT
char* names[] = {"MP_CUDA_AWARE", NULL};
ENV_Char(names, &MPIDI_Process.cuda_aware_support_on);
#endif
/* Exit if any deprecated environment variables were specified. */
if (found_deprecated_env_var)
{
......
......@@ -1921,6 +1921,34 @@ void MPIDI_collsel_pami_tune_cleanup()
MPIDI_collsel_free_advisor_params(&MPIDI_Collsel_advisor_params);
}
/**********************************************************/
/* CUDA Utilities */
/**********************************************************/
inline bool MPIDI_cuda_is_device_buf(const void* ptr)
{
bool result = false;
#if CUDA_AWARE_SUPPORT
struct cudaPointerAttributes cuda_attr;
cudaError_t e= cudaPointerGetAttributes ( & cuda_attr, ptr);
if (e != cudaSuccess)
result = false;
else if (cuda_attr.memoryType == cudaMemoryTypeDevice)
result = true;
else
result = false;
#endif
return result;
}
/**********************************************************/
/* End CUDA Utilities */
/**********************************************************/
#if defined(MPID_USE_NODE_IDS)
#undef FUNCNAME
#define FUNCNAME MPID_Get_node_id
......
......@@ -195,7 +195,16 @@ MPIDI_RecvShortCB(pami_context_t context,
void* rcvbuf = rreq->mpid.userbuf;
if (sndlen > 0)
{
#if CUDA_AWARE_SUPPORT
if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rcvbuf))
{
cudaError_t cudaerr = cudaMemcpy(rcvbuf, sndbuf, (size_t)sndlen, cudaMemcpyHostToDevice);
}
else
#endif
memcpy(rcvbuf, sndbuf, sndlen);
}
TRACE_SET_R_VAL(source,(rreq->mpid.idx),rlen,sndlen);
TRACE_SET_R_BIT(source,(rreq->mpid.idx),fl.f.comp_in_HH);
TRACE_SET_R_VAL(source,(rreq->mpid.idx),bufadd,rreq->mpid.userbuf);
......
......@@ -175,7 +175,13 @@ MPIDI_Callback_process_userdefined_dt(pami_context_t context,
MPID_assert(rreq->mpid.uebuf == NULL);
MPID_assert(rreq->mpid.uebuflen == 0);
void* rcvbuf = rreq->mpid.userbuf + dt_true_lb;;
#if CUDA_AWARE_SUPPORT
if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rcvbuf))
{
cudaError_t cudaerr = cudaMemcpy(rcvbuf, sndbuf, (size_t)sndlen, cudaMemcpyHostToDevice);
}
else
#endif
memcpy(rcvbuf, sndbuf, sndlen);
MPIDI_Request_complete(rreq);
return;
......
......@@ -107,6 +107,16 @@ MPIDI_RendezvousTransfer(pami_context_t context,
pami_endpoint_t dest;
MPIDI_Context_endpoint(rreq, &dest);
#if CUDA_AWARE_SUPPORT
if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rcvbuf))
{
MPIDI_RendezvousTransfer_use_pami_get(context,dest,rcvbuf,rreq);
}
else
{
#endif
#ifdef USE_PAMI_RDMA
size_t rcvlen_out;
rc = PAMI_Memregion_create(context,
......@@ -153,6 +163,10 @@ MPIDI_RendezvousTransfer(pami_context_t context,
}
#endif
#if CUDA_AWARE_SUPPORT
}
#endif
return PAMI_SUCCESS;
}
......
......@@ -345,6 +345,26 @@ MPIDI_SendMsg_process_userdefined_dt(MPID_Request * sreq,
*/
else
{
char *buf = NULL;
#if CUDA_AWARE_SUPPORT
// This will need to be done in two steps:
// 1 - Allocate a temp buffer which is the same size as user buffer and copy in it.
// 2 - Pack data into ue buffer from temp buffer.
int on_device = MPIDI_cuda_is_device_buf(sreq->mpid.userbuf);
if(MPIDI_Process.cuda_aware_support_on && on_device)
{
MPI_Aint dt_extent;
MPID_Datatype_get_extent_macro(sreq->mpid.datatype, dt_extent);
buf = MPIU_Malloc(dt_extent * sreq->mpid.userbufcount);
cudaError_t cudaerr = cudaMemcpy(buf, sreq->mpid.userbuf, dt_extent * sreq->mpid.userbufcount, cudaMemcpyDeviceToHost);
if (cudaSuccess != cudaerr) {
fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaerr));
}
}
#endif
MPID_Segment segment;
if(data_sz != 0) {
......@@ -359,13 +379,23 @@ MPIDI_SendMsg_process_userdefined_dt(MPID_Request * sreq,
sreq->mpid.uebuf_malloc = mpiuMalloc;
DLOOP_Offset last = data_sz;
MPID_Segment_init(sreq->mpid.userbuf,
#if CUDA_AWARE_SUPPORT
if(!MPIDI_Process.cuda_aware_support_on || !on_device)
buf = sreq->mpid.userbuf;
#endif
MPID_assert(buf != NULL);
MPID_Segment_init(buf,
sreq->mpid.userbufcount,
sreq->mpid.datatype,
&segment,
0);
MPID_Segment_pack(&segment, 0, &last, sndbuf);
MPID_assert(last == data_sz);
#if CUDA_AWARE_SUPPORT
if(MPIDI_Process.cuda_aware_support_on && on_device)
MPIU_Free(buf);
#endif
} else {
sndbuf = NULL;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment