argo / aml · Commits · d3098a76

Commit d3098a76 authored Jul 10, 2020 by Swann Perarnau

Merge branch 'dma_cuda' into 'staging'

DMA Cuda

See merge request !140

Parents: daabb8fe f7021f4b
Pipeline #10763 passed with stages in 2 minutes and 58 seconds
Changes: 11 · Pipelines: 1
benchmarks/Makefile.am

@@ -3,6 +3,16 @@ AM_COLOR_TESTS = yes
AM_CFLAGS = -I$(top_srcdir)/include $(PTHREAD_CFLAGS) $(OPENMP_CFLAGS)
AM_LDFLAGS = ../src/libaml.la $(PTHREAD_LIBS) $(OPENMP_CFLAGS)

+if HAVE_CUDA
+# LIBS is used instead of AM_LDFLAGS on purpose.
+# AM_LDFLAGS appends flags before the libraries added by LDADD.
+# Thus, when linking with libaml.la, linking with cuda is not done.
+LIBS += $(CUDA_CFLAGS)
+LIBS += $(CUDA_LIBS)
+AM_CFLAGS += $(CUDA_CFLAGS)
+AM_LDFLAGS += $(CUDA_LIBS)
+endif
+
if HAVE_HWLOC
AM_CFLAGS += $(HWLOC_CFLAGS)
AM_LDFLAGS += $(HWLOC_LIBS)
...
configure.ac

@@ -160,7 +160,7 @@ if [[ "x$have_nvcc" = xyes ]]; then
                  [AC_MSG_ERROR([could not find cuda.h])])
  AC_CHECK_HEADER([cuda_runtime.h],,
                  [AC_MSG_ERROR([could not find cuda_runtime.h])])
- AC_CHECK_LIB(cudart, cudaMalloc,,
+ AC_CHECK_LIB(cudart, cudaLaunchHostFunc,,
               AC_MSG_ERROR([could not find cudart library]))
  LIBS=$saved_LIBS
  CFLAGS=$saved_CFLAGS
...
include/Makefile.am

include_HEADERS = aml.h

include_aml_areadir = $(includedir)/aml/area
-include_aml_area_HEADERS = \
-	aml/area/linux.h \
-	aml/area/cuda.h
-if HAVE_HWLOC
-include_aml_area_HEADERS += aml/area/hwloc.h
-endif
+include_aml_area_HEADERS = aml/area/linux.h

include_aml_layoutdir = $(includedir)/aml/layout
include_aml_layout_HEADERS = \
...

@@ -43,3 +37,12 @@ include_amlutils_HEADERS = \
	aml/utils/version.h \
	aml/utils/features.h
+
+if HAVE_HWLOC
+include_aml_area_HEADERS += aml/area/hwloc.h
+endif
+
+if HAVE_CUDA
+include_aml_area_HEADERS += aml/area/cuda.h
+include_aml_layout_HEADERS += aml/layout/cuda.h
+include_aml_dma_HEADERS += aml/dma/cuda.h
+endif
include/aml/area/cuda.h

@@ -8,6 +8,9 @@
 * SPDX-License-Identifier: BSD-3-Clause
 *******************************************************************************/

+#ifndef AML_AREA_CUDA_H
+#define AML_AREA_CUDA_H
+
/**
 * @defgroup aml_area_cuda "AML Cuda Areas"
 * @brief Cuda Implementation of Areas.
...

@@ -226,3 +229,5 @@ aml_area_cuda_munmap(const struct aml_area_data *area_data,
/**
 * @}
 **/
+
+#endif // AML_AREA_CUDA_H
include/aml/dma/cuda.h (new file)

/*****************************************************************************
 * Copyright 2019 UChicago Argonne, LLC.
 * (c.f. AUTHORS, LICENSE)
 *
 * This file is part of the AML project.
 * For more info, see https://xgitlab.cels.anl.gov/argo/aml
 *
 * SPDX-License-Identifier: BSD-3-Clause
 ****************************************************************************/

#ifndef AML_DMA_CUDA_H
#define AML_DMA_CUDA_H

/**
 * @defgroup aml_dma_cuda "AML DMA Cuda"
 * @brief dma between devices and host.
 *
 * Cuda dma is an implementation of aml dma to transfer data
 * between devices and between host and devices.
 *
 * @code
 * #include <aml/dma/cuda.h>
 * @endcode
 * @see aml_dma
 * @{
 **/

#include <cuda.h>
#include <cuda_runtime.h>

//--- DMA Requests --------------------------------------------------------//

#define AML_DMA_CUDA_REQUEST_STATUS_NONE 0
#define AML_DMA_CUDA_REQUEST_STATUS_PENDING 1
#define AML_DMA_CUDA_REQUEST_STATUS_DONE 2

/** Cuda DMA request. Only a status flag is needed. **/
struct aml_dma_cuda_request {
	int status;
};

/**
 * AML dma cuda request creation operator.
 * @return -AML_EINVAL if data, req, *req, dest or src is NULL.
 * @return -AML_ENOMEM if allocation failed.
 * @return AML_SUCCESS on success.
 **/
int aml_dma_cuda_request_create(struct aml_dma_data *data,
                                struct aml_dma_request **req,
                                struct aml_layout *dest,
                                struct aml_layout *src,
                                aml_dma_operator op,
                                void *op_arg);

/**
 * AML dma cuda request wait operator.
 * @return -AML_EINVAL if dma, req or *req is NULL, or if the provided dma
 * did not create the provided request.
 * @return AML_SUCCESS on success.
 **/
int aml_dma_cuda_request_wait(struct aml_dma_data *dma,
                              struct aml_dma_request **req);

/** AML dma cuda request deletion operator **/
int aml_dma_cuda_request_destroy(struct aml_dma_data *dma,
                                 struct aml_dma_request **req);

//--- DMA -----------------------------------------------------------------//

/**
 * aml_dma data structure.
 * AML dma cuda contains a single execution stream. When waiting
 * a request, the whole request stream is synchronized and all
 * the requests are waited.
 **/
struct aml_dma_cuda_data {
	cudaStream_t stream;
	enum cudaMemcpyKind kind;
};

/** Default dma ops used at dma creation **/
extern struct aml_dma_ops aml_dma_cuda_ops;

/** Dma on stream 0 to send data from host to device **/
extern struct aml_dma aml_dma_cuda_host_to_device;

/** Dma on stream 0 to send data from device to device **/
extern struct aml_dma aml_dma_cuda_device_to_device;

/** Dma on stream 0 to send data from device to host **/
extern struct aml_dma aml_dma_cuda_device_to_host;

/**
 * Creation of a dma engine for cuda backend.
 * @param dma: A pointer to set with a new allocated dma.
 * @param kind: The kind of transfer performed: host to device,
 * device to host, device to device, or host to host.
 * @see struct aml_dma_cuda_data.
 * @return -AML_EINVAL if dma can't be set.
 * @return -AML_FAILURE if any cuda backend call failed.
 * @return -AML_ENOMEM if allocation failed.
 * @return AML_SUCCESS on success.
 **/
int aml_dma_cuda_create(struct aml_dma **dma, const enum cudaMemcpyKind kind);

/** Destroy a created dma and set it to NULL **/
int aml_dma_cuda_destroy(struct aml_dma **dma);

//--- DMA copy operators --------------------------------------------------//

/** aml_dma_cuda copy operator for 1D to 1D layouts **/
int aml_dma_cuda_copy_1D(struct aml_layout *dst,
                         const struct aml_layout *src,
                         void *arg);

/**
 * @}
 **/
#endif // AML_DMA_CUDA_H
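Note: to make the intended call sequence of this new header concrete, here is a minimal, hypothetical usage sketch. It calls the backend entry points declared above directly rather than going through the generic aml_dma front-end, the helper name push_to_device is illustrative, the two layouts are assumed to already describe buffers of equal size (see tests/dma/test_cuda.c below for how they are built), and error handling is reduced to asserts.

#include <assert.h>
#include <cuda_runtime.h>

#include "aml.h"
#include "aml/dma/cuda.h"

/* Hypothetical helper: push one 1-D layout from host to device and wait. */
void push_to_device(struct aml_layout *device_layout,
                    struct aml_layout *host_layout)
{
        struct aml_dma *dma;
        struct aml_dma_request *req;

        // One dma engine per transfer direction; its stream carries all requests.
        assert(aml_dma_cuda_create(&dma, cudaMemcpyHostToDevice) == AML_SUCCESS);

        // Enqueue the copy on the dma stream using the 1-D copy operator.
        assert(aml_dma_cuda_request_create(dma->data, &req, device_layout,
                                           host_layout, aml_dma_cuda_copy_1D,
                                           NULL) == AML_SUCCESS);

        // Synchronizes the stream; the host callback marks the request done.
        assert(aml_dma_cuda_request_wait(dma->data, &req) == AML_SUCCESS);
        assert(aml_dma_cuda_request_destroy(dma->data, &req) == AML_SUCCESS);

        aml_dma_cuda_destroy(&dma);
}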
include/aml/layout/cuda.h

@@ -8,8 +8,8 @@
 * SPDX-License-Identifier: BSD-3-Clause
 ******************************************************************************/

-#ifndef AML_AREA_LAYOUT_CUDA_H
-#define AML_AREA_LAYOUT_CUDA_H
+#ifndef AML_LAYOUT_CUDA_H
+#define AML_LAYOUT_CUDA_H

/**
 * @defgroup aml_layout_cuda "AML Layout Cuda"
...

@@ -34,6 +34,8 @@
struct aml_layout_cuda_data {
	/** Pointer to data on device. **/
	void *device_ptr;
+	/** device id where ptr is located **/
+	int device;
	/** user expected layout order **/
	int order;
	/** layout num dims **/
...

@@ -56,6 +58,7 @@ struct aml_layout_cuda_data {
 * Create a new layout on device pointer with embedded layout.
 * @param[out] out: A pointer to receive the newly allocated layout.
 * @param[in] device_ptr: The pointer on which the layout has to work.
+ * @param[in] device: The device id where the device_ptr is allocated.
 * @param[in] element_size: The size of elements in this layout.
 * @param[in] order: Order of dimensions in the layout.
 * @param[in] ndims: The number of dimensions in the layout.
...

@@ -69,6 +72,7 @@ struct aml_layout_cuda_data {
 **/
int aml_layout_cuda_create(struct aml_layout **out,
                           void *device_ptr,
+                          int device,
                           const size_t element_size,
                           const int order,
                           const size_t ndims,
...

@@ -115,4 +119,4 @@ extern struct aml_layout_ops aml_layout_cuda_ops;
 * @}
 **/
-#endif // AML_AREA_LAYOUT_CUDA_H
+#endif // AML_LAYOUT_CUDA_H
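For context, a minimal sketch of wrapping a raw device allocation with the new device-id argument, mirroring tests/dma/test_cuda.c below. The geometry, the helper name make_device_layout, and the choice of device 0 are illustrative assumptions; the trailing arguments after ndims follow the call shown in the test, since this hunk truncates the full prototype.

#include <assert.h>
#include <cuda_runtime.h>

#include "aml.h"
#include "aml/layout/cuda.h"

void make_device_layout(struct aml_layout **layout)
{
        // Illustrative 1-D geometry: 1 MiB of single-byte elements.
        const size_t ndims = 1;
        const size_t dims = 1 << 20;
        const size_t element_size = 1;
        const size_t stride = 0;
        const size_t pitch = 1;
        void *device_ptr;

        assert(cudaMalloc(&device_ptr, dims) == cudaSuccess);

        // The third argument is the new device id; 0 is the default device here.
        assert(aml_layout_cuda_create(layout, device_ptr, 0, element_size,
                                      AML_LAYOUT_ORDER_COLUMN_MAJOR, ndims,
                                      &dims, &stride, &pitch) == AML_SUCCESS);
}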
src/Makefile.am

@@ -50,7 +50,6 @@ LIB_SOURCES = \
	$(TILING_SOURCES) \
	$(LAYOUT_SOURCES) \
	$(UTILS_SOURCES) \
-	$(CUDA_AREA_SOURCES) \
	aml.c

lib_LTLIBRARIES = libaml.la
...

@@ -67,10 +66,7 @@ AM_CPPFLAGS += $(CUDA_CFLAGS)
AM_LDFLAGS += $(CUDA_LIBS)
-# Build .c sources using cuda runtime library.
-libaml_la_SOURCES += area/cuda.c
-# Build files not requiring additional libraries
-LAYOUT_SOURCES += layout/cuda.c
+libaml_la_SOURCES += area/cuda.c layout/cuda.c dma/cuda.c
# Build .cu sources containing device code.
# CUDA_LO_FILES= # .lo files result of .cu files.
...
src/dma/cuda.c (new file)

/*******************************************************************************
 * Copyright 2019 UChicago Argonne, LLC.
 * (c.f. AUTHORS, LICENSE)
 *
 * This file is part of the AML project.
 * For more info, see https://xgitlab.cels.anl.gov/argo/aml
 *
 * SPDX-License-Identifier: BSD-3-Clause
 ******************************************************************************/

#include "aml.h"
#include "aml/dma/cuda.h"
#include "aml/layout/cuda.h"

/**
 * Callback on dma stream to update a request status
 * when the stream is done with its work.
 **/
void aml_dma_cuda_callback(void *userData)
{
        struct aml_dma_cuda_request *req;

        req = (struct aml_dma_cuda_request *)userData;
        req->status = AML_DMA_CUDA_REQUEST_STATUS_DONE;
}

int aml_dma_cuda_create(struct aml_dma **dma, const enum cudaMemcpyKind kind)
{
        int err = AML_SUCCESS;
        struct aml_dma *cuda_dma;
        struct aml_dma_cuda_data *data;
        cudaStream_t stream;

        // Argument check
        if (dma == NULL)
                return -AML_EINVAL;

        // Create a stream for this dma
        if (cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) !=
            cudaSuccess)
                return -AML_FAILURE;

        // Create dma
        cuda_dma = AML_INNER_MALLOC(struct aml_dma, struct aml_dma_cuda_data);
        if (cuda_dma == NULL) {
                err = -AML_EINVAL;
                goto error_with_stream;
        }

        // Set dma fields
        data = AML_INNER_MALLOC_GET_FIELD(cuda_dma, 2, struct aml_dma,
                                          struct aml_dma_cuda_data);
        data->stream = stream;
        data->kind = kind;
        cuda_dma->data = (struct aml_dma_data *)data;
        cuda_dma->ops = &aml_dma_cuda_ops;

        // Return success
        *dma = cuda_dma;
        return AML_SUCCESS;

error_with_stream:
        cudaStreamDestroy(stream);
        return err;
}

int aml_dma_cuda_destroy(struct aml_dma **dma)
{
        struct aml_dma_cuda_data *data;

        if (dma == NULL)
                return -AML_EINVAL;
        if (*dma == NULL)
                return AML_SUCCESS;
        data = (struct aml_dma_cuda_data *)((*dma)->data);

        // Synchronize all requests.
        cudaStreamSynchronize(data->stream);

        // Cleanup
        cudaStreamDestroy(data->stream);
        free(*dma);
        *dma = NULL;
        return AML_SUCCESS;
}

int aml_dma_cuda_request_create(struct aml_dma_data *data,
                                struct aml_dma_request **req,
                                struct aml_layout *dest,
                                struct aml_layout *src,
                                aml_dma_operator op,
                                void *op_arg)
{
        int err;
        struct aml_dma_cuda_request *request;
        struct aml_dma_cuda_data *dma_data;

        (void)op_arg;

        // Check input
        if (data == NULL || req == NULL || dest == NULL || src == NULL)
                return -AML_EINVAL;
        dma_data = (struct aml_dma_cuda_data *)data;

        // Set request
        request = AML_INNER_MALLOC(struct aml_dma_cuda_request);
        if (request == NULL)
                return -AML_ENOMEM;
        request->status = AML_DMA_CUDA_REQUEST_STATUS_PENDING;

        // Submit request to cuda device
        err = op(dest, src, (void *)(dma_data));
        if (err != AML_SUCCESS) {
                free(request);
                return err;
        }

        // Also enqueue the callback to notify request is done.
        if (cudaLaunchHostFunc(dma_data->stream, aml_dma_cuda_callback,
                               request) != cudaSuccess) {
                free(request);
                return -AML_FAILURE;
        }

        *req = (struct aml_dma_request *)request;
        return AML_SUCCESS;
}

int aml_dma_cuda_request_wait(struct aml_dma_data *data,
                              struct aml_dma_request **req)
{
        struct aml_dma_cuda_data *dma_data;
        struct aml_dma_cuda_request *dma_req;

        if (req == NULL || *req == NULL)
                return -AML_EINVAL;
        dma_data = (struct aml_dma_cuda_data *)(data);
        dma_req = (struct aml_dma_cuda_request *)(*req);

        // If already done, do nothing
        if (dma_req->status == AML_DMA_CUDA_REQUEST_STATUS_DONE)
                return AML_SUCCESS;

        // Wait for the stream to finish and call its callback.
        cudaStreamSynchronize(dma_data->stream);

        // If status is not updated, either callback failed or
        // the provided dma did not create the provided request.
        if (dma_req->status != AML_DMA_CUDA_REQUEST_STATUS_DONE)
                return -AML_EINVAL;

        return AML_SUCCESS;
}

int aml_dma_cuda_request_destroy(struct aml_dma_data *data,
                                 struct aml_dma_request **req)
{
        struct aml_dma_cuda_data *dma_data;
        struct aml_dma_cuda_request *dma_req;

        if (req == NULL || *req == NULL)
                return -AML_EINVAL;
        dma_data = (struct aml_dma_cuda_data *)(data);
        dma_req = (struct aml_dma_cuda_request *)(*req);

        // If the request status is not done, wait for it to be done.
        // This way, the stream callback will not update a deleted request.
        if (dma_req->status != AML_DMA_CUDA_REQUEST_STATUS_DONE)
                cudaStreamSynchronize(dma_data->stream);

        // If status is not updated, either callback failed or
        // the provided dma did not create the provided request.
        if (dma_req->status != AML_DMA_CUDA_REQUEST_STATUS_DONE)
                return -AML_EINVAL;

        // Cleanup
        free(dma_req);
        *req = NULL;
        return AML_SUCCESS;
}

int aml_dma_cuda_copy_1D(struct aml_layout *dst,
                         const struct aml_layout *src,
                         void *arg)
{
        int err;
        const void *src_ptr = aml_layout_rawptr(src);
        void *dst_ptr = aml_layout_rawptr(dst);
        struct aml_dma_cuda_data *dma_data = (struct aml_dma_cuda_data *)arg;
        const struct aml_layout_cuda_data *cu_src =
                (struct aml_layout_cuda_data *)(src->data);
        struct aml_layout_cuda_data *cu_dst =
                (struct aml_layout_cuda_data *)(dst->data);
        size_t n = 0;
        size_t size = 0;

        err = aml_layout_dims(src, &n);
        if (err != AML_SUCCESS)
                return err;
        size = aml_layout_element_size(src) * n;

        if (dma_data->kind == cudaMemcpyHostToDevice ||
            dma_data->kind == cudaMemcpyDeviceToHost) {
                if (cudaMemcpyAsync(dst_ptr, src_ptr, size, dma_data->kind,
                                    dma_data->stream) != cudaSuccess)
                        return -AML_FAILURE;
        } else if (dma_data->kind == cudaMemcpyDeviceToDevice) {
                if (cudaMemcpyPeerAsync(dst_ptr, cu_dst->device, src_ptr,
                                        cu_src->device, size,
                                        dma_data->stream) != cudaSuccess)
                        return -AML_FAILURE;
        } else
                memcpy(dst_ptr, src_ptr, size);

        return AML_SUCCESS;
}

/** Default dma ops **/
struct aml_dma_ops aml_dma_cuda_ops = {
        .create_request = aml_dma_cuda_request_create,
        .destroy_request = aml_dma_cuda_request_destroy,
        .wait_request = aml_dma_cuda_request_wait,
};

struct aml_dma_cuda_data aml_dma_cuda_data_host_to_device = {
        .stream = 0,
        .kind = cudaMemcpyHostToDevice,
};

struct aml_dma aml_dma_cuda_host_to_device = {
        .ops = &aml_dma_cuda_ops,
        .data = (struct aml_dma_data *)(&aml_dma_cuda_data_host_to_device),
};

struct aml_dma_cuda_data aml_dma_cuda_data_device_to_host = {
        .stream = 0,
        .kind = cudaMemcpyDeviceToHost,
};

struct aml_dma aml_dma_cuda_device_to_host = {
        .ops = &aml_dma_cuda_ops,
        .data = (struct aml_dma_data *)(&aml_dma_cuda_data_device_to_host),
};

struct aml_dma_cuda_data aml_dma_cuda_data_device_to_device = {
        .stream = 0,
        .kind = cudaMemcpyDeviceToDevice,
};

struct aml_dma aml_dma_cuda_device_to_device = {
        .ops = &aml_dma_cuda_ops,
        .data = (struct aml_dma_data *)(&aml_dma_cuda_data_device_to_device),
};
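The request-completion tracking above rests on a standard CUDA runtime pattern: enqueue the asynchronous copy, then enqueue a host function on the same stream so it runs only after every preceding operation on that stream has finished. A stripped-down, stand-alone sketch of that pattern, independent of AML and with illustrative names:

#include <assert.h>
#include <cuda_runtime.h>

static void mark_done(void *userData)
{
        // Runs on a CUDA-managed host thread after prior stream work completes.
        *(volatile int *)userData = 1;
}

int async_copy_with_flag(void *dst_dev, const void *src_host, size_t size)
{
        cudaStream_t stream;
        volatile int done = 0;

        if (cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) !=
            cudaSuccess)
                return -1;

        // Enqueue the transfer, then the completion callback, on the same stream.
        assert(cudaMemcpyAsync(dst_dev, src_host, size, cudaMemcpyHostToDevice,
                               stream) == cudaSuccess);
        assert(cudaLaunchHostFunc(stream, mark_done, (void *)&done) ==
               cudaSuccess);

        // Waiting on the stream guarantees the callback has run.
        cudaStreamSynchronize(stream);
        assert(done == 1);

        cudaStreamDestroy(stream);
        return 0;
}

This is also why configure.ac now probes cudart for cudaLaunchHostFunc instead of cudaMalloc: the DMA backend depends on it.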
src/layout/cuda.c

@@ -15,6 +15,7 @@
int aml_layout_cuda_create(struct aml_layout **out,
                           void *device_ptr,
+                          int device,
                           const size_t element_size,
                           const int order,
                           const size_t ndims,
...

@@ -34,6 +35,7 @@ int aml_layout_cuda_create(struct aml_layout **out,
	layout_data = AML_INNER_MALLOC_GET_FIELD(layout, 2,
	                                         struct aml_layout,
	                                         struct aml_layout_cuda_data);
	layout_data->device_ptr = device_ptr;
+	layout_data->device = device;
	layout_data->order = order;
	layout_data->ndims = ndims;
	layout_data->dims = AML_INNER_MALLOC_GET_ARRAY(
...

@@ -145,9 +147,18 @@ size_t aml_layout_cuda_element_size(const struct aml_layout_data *data)
	return cudata->cpitch[0];
}

+void *aml_layout_cuda_rawptr(const struct aml_layout_data *data)
+{
+	struct aml_layout_cuda_data *cudata;
+
+	cudata = (struct aml_layout_cuda_data *)data;
+	return cudata->device_ptr;
+}
+
struct aml_layout_ops aml_layout_cuda_ops = {
	.deref = aml_layout_cuda_deref,
	.deref_native = aml_layout_cuda_deref_native,
+	.rawptr = aml_layout_cuda_rawptr,
	.order = aml_layout_cuda_order,
	.dims = aml_layout_cuda_dims,
	.dims_native = aml_layout_cuda_dims_native,
...
tests/Makefile.am

@@ -56,7 +56,7 @@ LAYOUT_TESTS = \
TILING_TESTS = tiling/test_tiling

-DMA_LINUX_TESTS = dma/test_dma_linux_seq \
+DMA_TESTS = dma/test_dma_linux_seq \
	dma/test_dma_linux_par

SCRATCH_TESTS = scratch/test_scratch_seq \
...

@@ -64,6 +64,7 @@ SCRATCH_TESTS = scratch/test_scratch_seq \
if HAVE_CUDA
AREA_TESTS += area/test_cuda
+DMA_TESTS += dma/test_cuda
endif

if HAVE_HWLOC
...

@@ -76,7 +77,7 @@ UNIT_TESTS = $(UTILS_TESTS) \
	$(LAYOUT_TESTS) \
	$(BINDING_TESTS) \
	$(AREA_TESTS) \
-	$(DMA_LINUX_TESTS) \
+	$(DMA_TESTS) \
	$(SCRATCH_TESTS)

# all tests
...
tests/dma/test_cuda.c (new file)

/*******************************************************************************
 * Copyright 2019 UChicago Argonne, LLC.
 * (c.f. AUTHORS, LICENSE)
 *
 * This file is part of the AML project.
 * For more info, see https://xgitlab.cels.anl.gov/argo/aml
 *
 * SPDX-License-Identifier: BSD-3-Clause
 ******************************************************************************/

#include <cuda.h>
#include <cuda_runtime.h>

#include "aml.h"
#include "aml/dma/cuda.h"
#include "aml/layout/cuda.h"
#include "aml/layout/dense.h"

// Data
#define size (1 << 24) // 16 MB
void *host_data;
void *device_data;

// Layout
const size_t ndims = 1;
const size_t dims = size;
const size_t element_size = 1;
const size_t stride = 0;
const size_t pitch = 1;
struct aml_layout *host_layout;
struct aml_layout *device_layout;

// Dma
struct aml_dma *host_device;
struct aml_dma *device_host;

void setup()
{
        // Data
        host_data = malloc(size);
        assert(host_data != NULL);
        assert(cudaMalloc(&device_data, size) == cudaSuccess);

        // Layout
        assert(aml_layout_dense_create(&host_layout, host_data,
                                       AML_LAYOUT_ORDER_COLUMN_MAJOR,
                                       element_size, ndims, &dims, &stride,
                                       &pitch) == AML_SUCCESS);
        assert(aml_layout_cuda_create(&device_layout, device_data, 0,
                                      element_size,
                                      AML_LAYOUT_ORDER_COLUMN_MAJOR, ndims,
                                      &dims, &stride, &pitch) == AML_SUCCESS);

        // Dma
        assert(aml_dma_cuda_create(&host_device, cudaMemcpyHostToDevice) ==
               AML_SUCCESS);
        assert(aml_dma_cuda_create(&device_host, cudaMemcpyDeviceToHost) ==
               AML_SUCCESS);
}

void teardown()
{
        // Data
        free(host_data);
        cudaFree(device_data);

        // Layout