Commit d3098a76 authored by Swann Perarnau's avatar Swann Perarnau

Merge branch 'dma_cuda' into 'staging'

DMA Cuda

See merge request !140
parents daabb8fe f7021f4b
Pipeline #10763 passed with stages in 2 minutes and 58 seconds
@@ -3,6 +3,16 @@ AM_COLOR_TESTS = yes
AM_CFLAGS = -I$(top_srcdir)/include $(PTHREAD_CFLAGS) $(OPENMP_CFLAGS)
AM_LDFLAGS = ../src/libaml.la $(PTHREAD_LIBS) $(OPENMP_CFLAGS)
if HAVE_CUDA
# LIBS is used instead of AM_LDFLAGS on purpose:
# AM_LDFLAGS puts its flags before the libraries added through LDADD.
# Thus, when linking with libaml.la, the cuda libraries would come
# before it on the link line and its cuda symbols would not resolve.
LIBS += $(CUDA_LIBS)
AM_CFLAGS += $(CUDA_CFLAGS)
AM_LDFLAGS += $(CUDA_LIBS)
endif
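For illustration, the two settings yield link commands roughly like these (hypothetical and shortened; only the library order matters):

# With AM_LDFLAGS only: -lcudart precedes libaml.la, whose cuda
# symbols then stay unresolved.
cc -o test_cuda test_cuda.o -lcudart ../src/libaml.la
# With LIBS: -lcudart lands after libaml.la and resolves its symbols.
cc -o test_cuda test_cuda.o ../src/libaml.la -lcudart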
if HAVE_HWLOC
AM_CFLAGS += $(HWLOC_CFLAGS)
AM_LDFLAGS += $(HWLOC_LIBS)
......
@@ -160,7 +160,7 @@ if [[ "x$have_nvcc" = xyes ]]; then
[AC_MSG_ERROR([could not find cuda.h])])
AC_CHECK_HEADER([cuda_runtime.h],,
[AC_MSG_ERROR([could not find cuda_runtime.h])])
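# The library check probes cudaLaunchHostFunc (available since CUDA 10)
# instead of cudaMalloc: the cuda dma implementation relies on it.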
AC_CHECK_LIB(cudart, cudaMalloc,,
AC_CHECK_LIB(cudart, cudaLaunchHostFunc,,
AC_MSG_ERROR([could not find cudart library]))
LIBS=$saved_LIBS
CFLAGS=$saved_CFLAGS
......
include_HEADERS=aml.h
include_aml_areadir=$(includedir)/aml/area
include_aml_area_HEADERS = \
aml/area/linux.h \
aml/area/cuda.h
if HAVE_HWLOC
include_aml_area_HEADERS+= aml/area/hwloc.h
endif
include_aml_area_HEADERS = aml/area/linux.h
include_aml_layoutdir=$(includedir)/aml/layout
include_aml_layout_HEADERS = \
@@ -43,3 +37,12 @@ include_amlutils_HEADERS = \
aml/utils/version.h \
aml/utils/features.h
if HAVE_HWLOC
include_aml_area_HEADERS+= aml/area/hwloc.h
endif
if HAVE_CUDA
include_aml_area_HEADERS += aml/area/cuda.h
include_aml_layout_HEADERS += aml/layout/cuda.h
include_aml_dma_HEADERS += aml/dma/cuda.h
endif
@@ -8,6 +8,9 @@
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
#ifndef AML_AREA_CUDA_H
#define AML_AREA_CUDA_H
/**
* @defgroup aml_area_cuda "AML Cuda Areas"
* @brief Cuda Implementation of Areas.
@@ -226,3 +229,5 @@ aml_area_cuda_munmap(const struct aml_area_data *area_data,
/**
* @}
**/
#endif // AML_AREA_CUDA_H
/*****************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
****************************************************************************/
#ifndef AML_DMA_CUDA_H
#define AML_DMA_CUDA_H
/**
* @defgroup aml_dma_cuda "AML DMA Cuda"
* @brief Dma between devices and host.
*
* Cuda dma is an implementation of aml dma that transfers data
* between devices and between host and devices.
*
* @code
* #include <aml/dma/cuda.h>
* @endcode
* @see aml_dma
* @{
**/
#include <cuda.h>
#include <cuda_runtime.h>
//--- DMA Requests --------------------------------------------------------//
#define AML_DMA_CUDA_REQUEST_STATUS_NONE 0
#define AML_DMA_CUDA_REQUEST_STATUS_PENDING 1
#define AML_DMA_CUDA_REQUEST_STATUS_DONE 2
/** Cuda DMA request. Only a status flag is needed. **/
struct aml_dma_cuda_request {
int status;
};
/**
* AML dma cuda request creation operator.
* @return -AML_EINVAL if data, req, dest or src is NULL.
* @return -AML_ENOMEM if allocation failed.
* @return AML_SUCCESS on success.
**/
int aml_dma_cuda_request_create(struct aml_dma_data *data,
struct aml_dma_request **req,
struct aml_layout *dest,
struct aml_layout *src,
aml_dma_operator op,
void *op_arg);
/**
* AML dma cuda request wait operator.
* @return -AML_EINVAL if dma, req or *req is NULL, or if the request
* does not come from the dma used at request creation.
* @return AML_SUCCESS on success.
**/
int aml_dma_cuda_request_wait(struct aml_dma_data *dma,
struct aml_dma_request **req);
/** AML dma cuda request deletion operator **/
int aml_dma_cuda_request_destroy(struct aml_dma_data *dma,
struct aml_dma_request **req);
//--- DMA -----------------------------------------------------------------//
/**
* aml_dma_cuda data structure.
* An aml cuda dma contains a single execution stream. When waiting on
* a request, the whole stream is synchronized and all pending requests
* complete. Copy operators receive a pointer to this structure as
* their argument.
**/
struct aml_dma_cuda_data {
cudaStream_t stream;
enum cudaMemcpyKind kind;
};
/** Default dma ops used at dma creation **/
extern struct aml_dma_ops aml_dma_cuda_ops;
/** Dma on stream 0 to send data from host to device **/
extern struct aml_dma aml_dma_cuda_host_to_device;
/** Dma on stream 0 to send data from device to device **/
extern struct aml_dma aml_dma_cuda_device_to_device;
/** Dma on stream 0 to send data from device to host **/
extern struct aml_dma aml_dma_cuda_device_to_host;
/**
* Creation of a dma engine for cuda backend.
* @param dma: A pointer to set with a new allocated dma.
* @param kind: The kind of transfer performed: host to device,
* device to host, device to device, or host to host.
* @see struct aml_dma_cuda_data.
* @return -AML_EINVAL if dma can't be set.
* @return -AML_FAILURE if any cuda backend call failed.
* @return -AML_ENOMEM if allocation failed.
* @return AML_SUCCESS on success.
**/
int aml_dma_cuda_create(struct aml_dma **dma, const enum cudaMemcpyKind kind);
/** Destroy a created dma and set *dma to NULL **/
int aml_dma_cuda_destroy(struct aml_dma **dma);
//--- DMA copy operators --------------------------------------------------//
/** aml_dma_cuda copy operator for 1D to 1D layouts **/
int aml_dma_cuda_copy_1D(struct aml_layout *dst,
const struct aml_layout *src,
void *arg);
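Putting the pieces together, a minimal usage sketch built only from the functions declared above (assuming two 1D layouts host_layout and device_layout created beforehand, e.g. with aml_layout_dense_create() and aml_layout_cuda_create(); error handling reduced to asserts):

struct aml_dma *dma;
struct aml_dma_request *req;

// A dma engine with its own non-blocking stream for host-to-device copies.
assert(aml_dma_cuda_create(&dma, cudaMemcpyHostToDevice) == AML_SUCCESS);
// Enqueue an asynchronous flat copy on the dma stream.
assert(aml_dma_cuda_request_create(dma->data, &req, device_layout,
                                   host_layout, aml_dma_cuda_copy_1D,
                                   NULL) == AML_SUCCESS);
// Synchronize the stream; the stream callback marks the request done.
assert(aml_dma_cuda_request_wait(dma->data, &req) == AML_SUCCESS);
// Release the request, then the engine.
assert(aml_dma_cuda_request_destroy(dma->data, &req) == AML_SUCCESS);
assert(aml_dma_cuda_destroy(&dma) == AML_SUCCESS);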
/**
* @}
**/
#endif // AML_DMA_CUDA_H
@@ -8,8 +8,8 @@
* SPDX-License-Identifier: BSD-3-Clause
******************************************************************************/
#ifndef AML_AREA_LAYOUT_CUDA_H
#define AML_AREA_LAYOUT_CUDA_H
#ifndef AML_LAYOUT_CUDA_H
#define AML_LAYOUT_CUDA_H
/**
* @defgroup aml_layout_cuda "AML Layout Cuda"
@@ -34,6 +34,8 @@
struct aml_layout_cuda_data {
/** Pointer to data on device. **/
void *device_ptr;
/** device id where ptr is located **/
int device;
/** user expected layout order **/
int order;
/** layout num dims **/
@@ -56,6 +58,7 @@ struct aml_layout_cuda_data {
* Create a new layout on device pointer with embedded layout.
* @param[out] out: A pointer to receive the newly allocated layout.
* @param[in] device_ptr: The pointer on which the layout has to work.
* @param[in] device: The device id where the device_ptr is allocated.
* @param[in] element_size: The size of elements in this layout.
* @param[in] order: Order of dimensions in the layout.
* @param[in] ndims: The number of dimensions in the layout.
@@ -69,6 +72,7 @@ struct aml_layout_cuda_data {
**/
int aml_layout_cuda_create(struct aml_layout **out,
void *device_ptr,
int device,
const size_t element_size,
const int order,
const size_t ndims,
@@ -115,4 +119,4 @@ extern struct aml_layout_ops aml_layout_cuda_ops;
* @}
**/
#endif // AML_AREA_LAYOUT_CUDA_H
#endif // AML_LAYOUT_CUDA_H
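For reference, creating a layout with the new device argument looks as follows (a sketch assuming buf was allocated with cudaMalloc() on device 0; the dims, stride and pitch arguments keep their previous meaning):

size_t dims = 1 << 20, stride = 0, pitch = 1;
struct aml_layout *layout;
assert(aml_layout_cuda_create(&layout, buf, 0, sizeof(char),
                              AML_LAYOUT_ORDER_COLUMN_MAJOR, 1,
                              &dims, &stride, &pitch) == AML_SUCCESS);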
@@ -50,7 +50,6 @@ LIB_SOURCES = \
$(TILING_SOURCES) \
$(LAYOUT_SOURCES) \
$(UTILS_SOURCES) \
$(CUDA_AREA_SOURCES) \
aml.c
lib_LTLIBRARIES = libaml.la
@@ -67,10 +66,7 @@ AM_CPPFLAGS += $(CUDA_CFLAGS)
AM_LDFLAGS += $(CUDA_LIBS)
# Build .c sources using cuda runtime library.
libaml_la_SOURCES+=area/cuda.c
# Build files not requiring additional libraries
LAYOUT_SOURCES+=layout/cuda.c
libaml_la_SOURCES+=area/cuda.c layout/cuda.c dma/cuda.c
# Build .cu sources containing device code.
# CUDA_LO_FILES= # .lo files resulting from .cu files.
......
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
******************************************************************************/
#include "aml.h"
#include "aml/dma/cuda.h"
#include "aml/layout/cuda.h"
/**
* Callback enqueued on the dma stream after each request, updating the
* request status once the stream has completed the work before it.
**/
void aml_dma_cuda_callback(void *userData)
{
struct aml_dma_cuda_request *req;
req = (struct aml_dma_cuda_request *)userData;
req->status = AML_DMA_CUDA_REQUEST_STATUS_DONE;
}
int aml_dma_cuda_create(struct aml_dma **dma, const enum cudaMemcpyKind kind)
{
int err = AML_SUCCESS;
struct aml_dma *cuda_dma;
struct aml_dma_cuda_data *data;
cudaStream_t stream;
// Argument check
if (dma == NULL)
return -AML_EINVAL;
// Create a stream for this dma
if (cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) !=
cudaSuccess)
return -AML_FAILURE;
// Create dma
cuda_dma = AML_INNER_MALLOC(struct aml_dma, struct aml_dma_cuda_data);
if (cuda_dma == NULL) {
err = -AML_ENOMEM;
goto error_with_stream;
}
// Set dma fields
data = AML_INNER_MALLOC_GET_FIELD(cuda_dma, 2, struct aml_dma,
struct aml_dma_cuda_data);
data->stream = stream;
data->kind = kind;
cuda_dma->data = (struct aml_dma_data *)data;
cuda_dma->ops = &aml_dma_cuda_ops;
// Return success
*dma = cuda_dma;
return AML_SUCCESS;
error_with_stream:
cudaStreamDestroy(stream);
return err;
}
int aml_dma_cuda_destroy(struct aml_dma **dma)
{
struct aml_dma_cuda_data *data;
if (dma == NULL)
return -AML_EINVAL;
if (*dma == NULL)
return AML_SUCCESS;
data = (struct aml_dma_cuda_data *)((*dma)->data);
// Synchronize all requests.
cudaStreamSynchronize(data->stream);
// Cleanup
cudaStreamDestroy(data->stream);
free(*dma);
*dma = NULL;
return AML_SUCCESS;
}
int aml_dma_cuda_request_create(struct aml_dma_data *data,
struct aml_dma_request **req,
struct aml_layout *dest,
struct aml_layout *src,
aml_dma_operator op,
void *op_arg)
{
int err;
struct aml_dma_cuda_request *request;
struct aml_dma_cuda_data *dma_data;
(void)op_arg;
// Check input
if (data == NULL || req == NULL || dest == NULL || src == NULL)
return -AML_EINVAL;
dma_data = (struct aml_dma_cuda_data *)data;
// Set request
request = AML_INNER_MALLOC(struct aml_dma_cuda_request);
if (request == NULL)
return -AML_ENOMEM;
request->status = AML_DMA_CUDA_REQUEST_STATUS_PENDING;
// Submit request to cuda device
err = op(dest, src, (void *)(dma_data));
if (err != AML_SUCCESS) {
free(request);
return err;
}
// Also enqueue the callback to notify that the request is done.
if (cudaLaunchHostFunc(dma_data->stream, aml_dma_cuda_callback,
request) != cudaSuccess) {
free(request);
return -AML_FAILURE;
}
*req = (struct aml_dma_request *)request;
return AML_SUCCESS;
}
int aml_dma_cuda_request_wait(struct aml_dma_data *data,
struct aml_dma_request **req)
{
struct aml_dma_cuda_data *dma_data;
struct aml_dma_cuda_request *dma_req;
if (data == NULL || req == NULL || *req == NULL)
return -AML_EINVAL;
dma_data = (struct aml_dma_cuda_data *)(data);
dma_req = (struct aml_dma_cuda_request *)(*req);
// If already done, do nothing
if (dma_req->status == AML_DMA_CUDA_REQUEST_STATUS_DONE)
return AML_SUCCESS;
// Wait for the stream to finish and call its callback.
cudaStreamSynchronize(dma_data->stream);
// If status is not updated, either callback failed or
// the provided dma did not create the provided request.
if (dma_req->status != AML_DMA_CUDA_REQUEST_STATUS_DONE)
return -AML_EINVAL;
return AML_SUCCESS;
}
int aml_dma_cuda_request_destroy(struct aml_dma_data *data,
struct aml_dma_request **req)
{
struct aml_dma_cuda_data *dma_data;
struct aml_dma_cuda_request *dma_req;
if (data == NULL || req == NULL || *req == NULL)
return -AML_EINVAL;
dma_data = (struct aml_dma_cuda_data *)(data);
dma_req = (struct aml_dma_cuda_request *)(*req);
// If the request status is not done, wait for it to be done.
// This way, the stream callback will not update a deleted request.
if (dma_req->status != AML_DMA_CUDA_REQUEST_STATUS_DONE)
cudaStreamSynchronize(dma_data->stream);
// If status is not updated, either callback failed or
// the provided dma did not create the provided request.
if (dma_req->status != AML_DMA_CUDA_REQUEST_STATUS_DONE)
return -AML_EINVAL;
// Cleanup
free(dma_req);
*req = NULL;
return AML_SUCCESS;
}
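/**
 * 1D copy operator. Both layouts are assumed contiguous, so the
 * transfer is a single flat copy of n * element_size bytes.
 * Device-to-device transfers go through cudaMemcpyPeerAsync(), which
 * needs the source and destination device ids: this is why
 * struct aml_layout_cuda_data now carries a device field.
 **/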
int aml_dma_cuda_copy_1D(struct aml_layout *dst,
const struct aml_layout *src,
void *arg)
{
int err;
const void *src_ptr = aml_layout_rawptr(src);
void *dst_ptr = aml_layout_rawptr(dst);
struct aml_dma_cuda_data *dma_data = (struct aml_dma_cuda_data *)arg;
const struct aml_layout_cuda_data *cu_src =
(struct aml_layout_cuda_data *)(src->data);
struct aml_layout_cuda_data *cu_dst =
(struct aml_layout_cuda_data *)(dst->data);
size_t n = 0;
size_t size = 0;
err = aml_layout_dims(src, &n);
if (err != AML_SUCCESS)
return err;
size = aml_layout_element_size(src) * n;
if (dma_data->kind == cudaMemcpyHostToDevice ||
dma_data->kind == cudaMemcpyDeviceToHost) {
if (cudaMemcpyAsync(dst_ptr, src_ptr, size, dma_data->kind,
dma_data->stream) != cudaSuccess)
return -AML_FAILURE;
} else if (dma_data->kind == cudaMemcpyDeviceToDevice) {
if (cudaMemcpyPeerAsync(dst_ptr, cu_dst->device, src_ptr,
cu_src->device, size,
dma_data->stream) != cudaSuccess)
return -AML_FAILURE;
} else
memcpy(dst_ptr, src_ptr, size);
return AML_SUCCESS;
}
/** Default dma ops **/
struct aml_dma_ops aml_dma_cuda_ops = {
.create_request = aml_dma_cuda_request_create,
.destroy_request = aml_dma_cuda_request_destroy,
.wait_request = aml_dma_cuda_request_wait,
};
struct aml_dma_cuda_data aml_dma_cuda_data_host_to_device = {
.stream = 0,
.kind = cudaMemcpyHostToDevice,
};
struct aml_dma aml_dma_cuda_host_to_device = {
.ops = &aml_dma_cuda_ops,
.data = (struct aml_dma_data *)(&aml_dma_cuda_data_host_to_device),
};
struct aml_dma_cuda_data aml_dma_cuda_data_device_to_host = {
.stream = 0,
.kind = cudaMemcpyDeviceToHost,
};
struct aml_dma aml_dma_cuda_device_to_host = {
.ops = &aml_dma_cuda_ops,
.data = (struct aml_dma_data *)(&aml_dma_cuda_data_device_to_host),
};
struct aml_dma_cuda_data aml_dma_cuda_data_device_to_device = {
.stream = 0,
.kind = cudaMemcpyDeviceToDevice,
};
struct aml_dma aml_dma_cuda_device_to_device = {
.ops = &aml_dma_cuda_ops,
.data = (struct aml_dma_data *)(&aml_dma_cuda_data_device_to_device),
};
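The statically defined dmas above serve the common cases without any setup; a sketch (layouts as in the usage example of the dma header):

struct aml_dma_request *req;
// Reuse the predefined host-to-device dma on the default stream (0).
aml_dma_cuda_request_create(aml_dma_cuda_host_to_device.data, &req,
                            device_layout, host_layout,
                            aml_dma_cuda_copy_1D, NULL);
aml_dma_cuda_request_wait(aml_dma_cuda_host_to_device.data, &req);
aml_dma_cuda_request_destroy(aml_dma_cuda_host_to_device.data, &req);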
@@ -15,6 +15,7 @@
int aml_layout_cuda_create(struct aml_layout **out,
void *device_ptr,
int device,
const size_t element_size,
const int order,
const size_t ndims,
@@ -34,6 +35,7 @@ int aml_layout_cuda_create(struct aml_layout **out,
layout_data = AML_INNER_MALLOC_GET_FIELD(layout, 2, struct aml_layout,
struct aml_layout_cuda_data);
layout_data->device_ptr = device_ptr;
layout_data->device = device;
layout_data->order = order;
layout_data->ndims = ndims;
layout_data->dims = AML_INNER_MALLOC_GET_ARRAY(
@@ -145,9 +147,18 @@ size_t aml_layout_cuda_element_size(const struct aml_layout_data *data)
return cudata->cpitch[0];
}
void *aml_layout_cuda_rawptr(const struct aml_layout_data *data)
{
struct aml_layout_cuda_data *cudata;
cudata = (struct aml_layout_cuda_data *)data;
return cudata->device_ptr;
}
struct aml_layout_ops aml_layout_cuda_ops = {
.deref = aml_layout_cuda_deref,
.deref_native = aml_layout_cuda_deref_native,
.rawptr = aml_layout_cuda_rawptr,
.order = aml_layout_cuda_order,
.dims = aml_layout_cuda_dims,
.dims_native = aml_layout_cuda_dims_native,
......
@@ -56,7 +56,7 @@ LAYOUT_TESTS = \
TILING_TESTS = tiling/test_tiling
DMA_LINUX_TESTS = dma/test_dma_linux_seq \
DMA_TESTS = dma/test_dma_linux_seq \
dma/test_dma_linux_par
SCRATCH_TESTS = scratch/test_scratch_seq \
@@ -64,6 +64,7 @@ SCRATCH_TESTS = scratch/test_scratch_seq \
if HAVE_CUDA
AREA_TESTS += area/test_cuda
DMA_TESTS += dma/test_cuda
endif
if HAVE_HWLOC
@@ -76,7 +77,7 @@ UNIT_TESTS = $(UTILS_TESTS) \
$(LAYOUT_TESTS) \
$(BINDING_TESTS) \
$(AREA_TESTS) \
$(DMA_LINUX_TESTS) \
$(DMA_TESTS) \
$(SCRATCH_TESTS)
# all tests
......
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
******************************************************************************/
#include <cuda.h>
#include <cuda_runtime.h>
#include "aml.h"
#include "aml/dma/cuda.h"
#include "aml/layout/cuda.h"
#include "aml/layout/dense.h"
// Data
#define size (1 << 24) // 16 MiB
void *host_data;
void *device_data;
// Layout
const size_t ndims = 1;
const size_t dims = size;
const size_t element_size = 1;
const size_t stride = 0;
const size_t pitch = 1;
struct aml_layout *host_layout;
struct aml_layout *device_layout;
// Dma
struct aml_dma *host_device;
struct aml_dma *device_host;
void setup()
{
// Data
host_data = malloc(size);
assert(host_data != NULL);
assert(cudaMalloc(&device_data, size) == cudaSuccess);
// Layout
assert(aml_layout_dense_create(&host_layout, host_data,
AML_LAYOUT_ORDER_COLUMN_MAJOR,
element_size, ndims, &dims, &stride,
&pitch) == AML_SUCCESS);
assert(aml_layout_cuda_create(&device_layout, device_data, 0,
element_size,
AML_LAYOUT_ORDER_COLUMN_MAJOR, ndims,
&dims, &stride, &pitch) == AML_SUCCESS);
// Dma
assert(aml_dma_cuda_create(&host_device, cudaMemcpyHostToDevice) ==
AML_SUCCESS);
assert(aml_dma_cuda_create(&device_host, cudaMemcpyDeviceToHost) ==
AML_SUCCESS);
}
void teardown()
{
// Data
free(host_data);
cudaFree(device_data);
// Layout