Commit df3b0f85 authored by Nicolas Denoyelle's avatar Nicolas Denoyelle Committed by Swann Perarnau

### Cuda implementation of areas.

The new area allows allocating data on cuda devices.
Allocations can optionally map host memory on device memory.
See the cuda area documentation.

Includes a libtool helper to link cuda device object files
with the rest of the library.

An additional error code has been added to aml errors to handle busy cuda devices.
Also, all CI stages have been set not to run on branches whose names start with wip or WIP.
parent a97d2d96
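For context, a minimal usage sketch of the new interface, based on the declarations added in this commit (error handling abbreviated; device 0 assumed):

    #include "aml.h"
    #include "aml/area/cuda.h"

    int main(void)
    {
        struct aml_area *area;
        void *buf;

        /* Area bound to cuda device 0; managed memory attached to the host. */
        if (aml_area_cuda_create(&area, 0, AML_AREA_CUDA_ATTACH_HOST) != AML_SUCCESS)
            return 1;
        /* NULL ptr: allocate device-side memory only. */
        buf = aml_area_mmap(area, NULL, 4096);
        if (buf == NULL)
            aml_perror("aml_area_cuda");
        else
            aml_area_munmap(area, buf, 4096);
        aml_area_cuda_destroy(&area);
        return 0;
    }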
......@@ -9,6 +9,9 @@ stages:
repoquality:
stage: style
except:
- /^wip.*/
- /^WIP.*/
script:
- nix run -f "$ARGOPKGS" repoquality --command repoquality
tags:
......@@ -16,6 +19,9 @@ repoquality:
checkpatch:
stage: style
except:
- /^wip.*/
- /^WIP.*/
tags:
- integration
script:
......@@ -86,6 +92,9 @@ make:knl:
readthedocs:
stage: docs
except:
- /^wip.*/
- /^WIP.*/
when: on_success
only:
- master
......@@ -97,6 +106,9 @@ readthedocs:
dist:
stage: release
except:
- /^wip.*/
- /^WIP.*/
when: on_success
only:
- tags
......
......@@ -5,3 +5,4 @@ Valentin Reis <fre@freux.fr>
Nicolas Denoyelle <ndenoyelle@anl.gov>
Clement Foyer <cfoyer@cray.com>
Brice Videau <bvideau@anl.gov>
Aleksandr Danilin <danilin96@gmail.com>
......@@ -27,7 +27,8 @@ AM_PROG_CC_C_O
AC_PROG_CPP
AC_TYPE_SIZE_T
AC_TYPE_INTPTR_T
AC_PROG_AWK
AC_PROG_GREP
AM_PROG_AR
LT_INIT
......@@ -38,8 +39,9 @@ AC_SUBST([PACKAGE_VERSION_MAJOR],[VERSION_MAJOR])
AC_SUBST([PACKAGE_VERSION_MINOR],[VERSION_MINOR])
AC_SUBST([PACKAGE_VERSION_PATCH],[VERSION_PATCH])
# support for testing with valgrind
###################################
AC_ARG_ENABLE(valgrind,
[AS_HELP_STRING([--enable-valgrind],[Also run valgrind on checks (default is no).])],
[valgrind=true],[valgrind=false])
......@@ -53,6 +55,8 @@ fi
AM_CONDITIONAL([TEST_VALGRIND],[test "x$valgrind" = xtrue])
# support for compiling benchmarks
##################################
AC_ARG_ENABLE(benchmarks,
[AS_HELP_STRING([--enable-benchmarks],[Compile additional benchmarks (default is no).])],
[benchmarks=true],[benchmarks=false])
......@@ -66,17 +70,22 @@ AM_CONDITIONAL([ADD_BENCHMARKS],[test "x$benchmarks" = xtrue])
AC_CHECK_LIB(dl, dlopen)
# add pthread support.
######################
# doc in m4/ax_pthread.m4. Defines automake PTHREAD_CFLAGS and PTHREAD_LIBS
AX_PTHREAD([],[AC_MSG_ERROR([Cannot find how to compile with pthreads.])])
CC="$PTHREAD_CC"
# NUMA support
##############
AC_CHECK_HEADERS([numa.h],,[AC_MSG_ERROR([AML requires libnuma headers.])])
AC_CHECK_HEADERS([numaif.h],,[AC_MSG_ERROR([AML requires libnuma headers.])])
AC_CHECK_LIB(numa, mbind,,[AC_MSG_ERROR([AML requires libnuma.])])
# check doxygen + sphinx for documentation build
################################################
AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [no])
AC_CHECK_PROG([SPHINXBUILD], [sphinx-build], [sphinx-build], [no])
if [[ "x$DOXYGEN" != xno ]]; then
......@@ -93,6 +102,47 @@ else
fi
AM_CONDITIONAL([BUILD_DOCS],[ test "x$BUILD_DOCS" = xyes ])
# check nvidia compiler and libraries
#####################################
BUILD_CUDA=no
AC_DEFINE([HAVE_CUDA], [0], [Whether aml supports cuda library calls.])
AC_DEFINE([RUN_CUDA], [0], [Whether the machine on which aml is compiled can run cuda code.])
# Check compilation features
AC_CHECK_PROG([NVCC], [nvcc], [nvcc], [no])
AC_CHECK_LIB(cudart, cudaMalloc, [CUDART=yes], [CUDART=no])
AC_CHECK_HEADERS([cuda.h], [CUDA_H=yes], [CUDA_H=no])
AC_CHECK_HEADERS([cuda_runtime.h], [CUDA_RUNTIME_H=yes], [CUDA_RUNTIME_H=no])
if [[ "x$NVCC" != xno ]] && \
[[ "x$CUDART" = xyes ]] && \
[[ "x$CUDA_H" = xyes ]] && \
[[ "x$CUDA_RUNTIME_H" = xyes ]]
then
BUILD_CUDA=yes
AC_DEFINE([HAVE_CUDA], [1], [Whether aml supports cuda library calls.])
fi
AM_CONDITIONAL([BUILD_CUDA],[ test "x$BUILD_CUDA" = xyes ])
# Check runtime features
if [[ "x$BUILD_CUDA" = xyes ]]; then
LIBS="$LIBS -lcudart"
RUN_CUDA=no
AC_MSG_CHECKING([that cudart code runs without error])
AC_RUN_IFELSE(
[AC_LANG_PROGRAM([[
#include <cuda.h>
#include <cuda_runtime.h>]],
[int device; return cudaGetDevice(&device) == cudaSuccess ? 0 : 1;])],
[AC_DEFINE([RUN_CUDA], [1], [Whether the machine on which aml is compiled can run cuda code.])
RUN_CUDA=yes],[])
AC_MSG_RESULT($RUN_CUDA)
fi
AM_CONDITIONAL([RUN_CUDA],[ test "x$RUN_CUDA" = xyes ])
# Output
########
......@@ -104,6 +154,7 @@ AC_CONFIG_FILES([Makefile
tests/Makefile
doc/Makefile
benchmarks/Makefile
o2lo
aml.pc
include/aml/utils/version.h], [chmod +x o2lo])
AC_OUTPUT
......@@ -2,7 +2,8 @@ include_HEADERS=aml.h
include_aml_areadir=$(includedir)/aml/area
include_aml_area_HEADERS = \
aml/area/linux.h \
aml/area/cuda.h
include_aml_layoutdir=$(includedir)/aml/layout
include_aml_layout_HEADERS = \
......
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
/**
* @defgroup aml_area_cuda "AML Cuda Areas"
* @brief Cuda Implementation of Areas.
* #include <aml/area/cuda.h>
*
* Cuda implementation of AML areas.
* This building block relies on the cuda implementations of
* malloc/free to provide mmap/munmap on device memory.
* Additional documentation of the cuda runtime API can be found here:
* @see https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html
*
* AML cuda areas may be created to target the current or a specific cuda device.
* Allocations can be private to a single device or shared across devices.
* Finally, allocations can be backed by a host memory allocation.
*
* @{
**/
/**
* Structure containing aml area hooks for cuda implementation.
* For now, there is only a single implementation of the hooks,
* which chooses between the different cuda allocation functions.
**/
extern struct aml_area_ops aml_area_cuda_ops;
/**
* Default cuda area with private mapping in current device.
* Can be used out of the box with aml_area_*() functions.
**/
extern struct aml_area aml_area_cuda;
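/**
 * Minimal sketch (illustrative, not part of the original header): the
 * default area can also back a device mapping with an existing host
 * buffer, as described in aml_area_cuda_mmap().
 * @code
 * size_t size = 4096;
 * void *host = malloc(size);
 * // Non-NULL ptr: pin the host pages and map them on the device.
 * void *dev = aml_area_mmap(&aml_area_cuda, host, size);
 * @endcode
 **/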
/**
* Allocation flags to pass to cudaMallocManaged().
* @see cuda runtime API documentation / memory management.
**/
enum aml_area_cuda_flags {
AML_AREA_CUDA_ATTACH_GLOBAL,
AML_AREA_CUDA_ATTACH_HOST,
};
/** Implementation of aml_area_data for cuda areas. **/
struct aml_area_cuda_data {
/** allocation flags in cuda format **/
int flags;
/** The device id on which allocation is done. **/
int device;
};
/**
* \brief Cuda area creation.
*
@param[out] area: a pointer where the newly created area will be stored.
@param[in] device: A valid cuda device id, i.e. from 0 to num_devices-1.
* If device id is negative, then no cuda device will be selected when
* using aml_area_cuda_mmap().
* @param[in] flags: Allocation flags.
*
* @return AML_SUCCESS on success and area points to the new aml_area.
* @return -AML_FAILURE if cuda API failed to provide the number of devices.
* @return -AML_EINVAL if device id is greater than or equal to the number
* of devices.
* @return -AML_ENOMEM if space to carry area cannot be allocated.
*
* @see enum aml_area_cuda_flags.
**/
int aml_area_cuda_create(struct aml_area **area,
const int device,
const enum aml_area_cuda_flags flags);
/**
* \brief Cuda area destruction.
*
* Destroy (finalize and free resources) a struct aml_area created by
* aml_area_cuda_create().
*
@param[in, out] area: the area to destroy; set to NULL after this call.
**/
void aml_area_cuda_destroy(struct aml_area **area);
/**
* \brief Cuda implementation of mmap operation for aml area.
*
* This function is a wrapper around cuda allocation functions.
* It uses the area settings to select the device on which to allocate,
* to pick the allocation function, and to set its parameters.
* Allocations can be standalone on device, shared across multiple devices,
* and backed with cpu memory.
* Device selection is not thread safe and requires setting the global
* state of the cuda library. When selecting a device, the allocation may
* succeed while restoring the previous device may fail. To catch that
* error even when the return value is not NULL, set aml_errno to
* AML_SUCCESS prior to calling this function.
*
* @param[in] area_data: The structure containing cuda area settings.
* @param[in, out] ptr: If ptr is NULL, then call cudaMallocManaged() with
* area flags. Memory will be allocated only device side.
* If ptr is not NULL:
* * ptr must point to a valid memory area.
* Device side memory will be mapped on this host side memory.
* According to cuda runtime API documentation
* (cudaHostRegister()), host side memory pages will be locked or allocation
* will fail.
* @param[in] size: The size to allocate.
*
* @return A cuda pointer to allocated device memory on success, NULL on
failure. If a failure occurs, the aml_errno variable is set to one of the
following values:
* AML_ENOTSUP if one of the cuda calls failed with error
cudaErrorInsufficientDriver or cudaErrorNoDevice.
* AML_EINVAL if the target device id is not valid.
* AML_EBUSY if a specific device was requested and the call failed with error
cudaErrorDeviceAlreadyInUse, or if the region was already mapped on the device.
* * AML_ENOMEM if memory allocation failed with error
* cudaErrorMemoryAllocation.
* * AML_FAILURE if one of the cuda calls resulted in error
* cudaErrorInitializationError.
**/
void *aml_area_cuda_mmap(const struct aml_area_data *area_data,
void *ptr, size_t size);
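/**
 * A sketch of the aml_errno pattern described above (illustrative only;
 * `area` is assumed to be a cuda area created with a specific device id):
 * @code
 * size_t size = 1 << 20;
 * aml_errno = AML_SUCCESS;
 * void *p = aml_area_mmap(area, NULL, size);
 * if (p == NULL || aml_errno != AML_SUCCESS)
 *         aml_perror("aml_area_cuda_mmap");
 * @endcode
 **/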
/**
* \brief munmap hook for aml area.
*
Unmap memory mapped with aml_area_cuda_mmap().
* @param[in] area_data: Ignored
* @param[in, out] ptr: The virtual memory to unmap.
* @param[in] size: The size of virtual memory to unmap.
* @return -AML_EINVAL if cudaFree() returned cudaErrorInvalidValue.
* @return AML_SUCCESS otherwise.
**/
int
aml_area_cuda_munmap(const struct aml_area_data *area_data,
void *ptr, const size_t size);
/**
* @}
**/
......@@ -78,10 +78,16 @@ void aml_perror(const char *msg);
**/
#define AML_ENOTSUP 5
/**
* The invoked AML abstraction function has failed
* because the resource it works on was busy.
**/
#define AML_EBUSY 6
/**
* Max allowed value for errors.
**/
#define AML_ERROR_MAX 7
/**
* @}
......
#!/bin/sh
## ---------------------------------------------------------- ##
## $1: .lo file ##
## $2: .cu.o file object file with dynamic symbol relocation ##
## $3: .o file object file with static symbol relocation ##
## ---------------------------------------------------------- ##
AWK=@AWK@
GREP=@GREP@
LIBTOOL=@abs_top_builddir@/libtool
if test $# -lt 3; then
echo "Create a libtool .lo file provided the target.o file"
echo ""
echo "\t$0 <file.lo> <file.cu.o> <file.o>"
fi
LO_FILE=$1
PIC_FILE=$(echo $2 | awk -F '/' '{print $NF}')
O_FILE=$(echo $3 | awk -F '/' '{print $NF}')
LIB_PIC_FILE=.libs/$O_FILE
BASENAME=$(echo $2 | awk -F '/' '{for(i=1; i<NF; i++) {print $i "/"}}')
PROGRAM=$($GREP -m 1 PROGRAM= $LIBTOOL | cut -d = -f 2)
PACKAGE=$($GREP -m 1 PACKAGE= $LIBTOOL | cut -d = -f 2)
VERSION=$($GREP -m 1 VERSION= $LIBTOOL | cut -d = -f 2)
must_libtool_define() {
eval "var=\$$1"
if [ -z "$var" ]; then
echo "libtool script did not contain $1 variable used to build .lo file."
exit 1
fi
}
must_libtool_define "PROGRAM"
must_libtool_define "PACKAGE"
must_libtool_define "VERSION"
must_exist() {
if [ ! -f $1 ]; then
echo "File $1 is missing."
exit 1
fi
}
must_exist $PWD/$BASENAME$O_FILE
must_exist $PWD/$BASENAME$PIC_FILE
mv $PWD/$BASENAME$PIC_FILE $PWD/$BASENAME$LIB_PIC_FILE
echo "# Generated by $PROGRAM (GNU $PACKAGE) $VERSION" > ${LO_FILE}
echo "# Generated by $PROGRAM (GNU $PACKAGE) $VERSION" >> ${LO_FILE}
echo "# $LO_FILE - a libtool object file" >> ${LO_FILE}
echo "# Please DO NOT delete this file!" >> ${LO_FILE}
echo "# It is necessary for linking the library." >> ${LO_FILE}
echo "" >> ${LO_FILE}
echo "# Name of the PIC object." >> ${LO_FILE}
echo "pic_object=$LIB_PIC_FILE" >> ${LO_FILE}
echo "" >> ${LO_FILE}
echo "# Name of the non-PIC object" >> ${LO_FILE}
echo "non_pic_object=$O_FILE" >> ${LO_FILE}
SUFFIXES=.c .cu
AM_CPPFLAGS = -I$(top_srcdir)/include $(PTHREAD_CFLAGS)
AM_LDFLAGS = $(PTHREAD_LIBS)
noinst_LTLIBRARIES=
#############################################
# .C sources
AREA_SOURCES = \
area/area.c \
......@@ -38,7 +42,36 @@ LIB_SOURCES = \
$(TILING_SOURCES) \
$(LAYOUT_SOURCES) \
$(UTILS_SOURCES) \
$(CUDA_AREA_SOURCES) \
aml.c
lib_LTLIBRARIES = libaml.la
libaml_la_LDFLAGS=
libaml_la_SOURCES=$(LIB_SOURCES)
#############################################
# Cuda sources
if BUILD_CUDA
# Build .c sources using cuda runtime library.
libaml_la_SOURCES+=area/cuda.c
libaml_la_LDFLAGS+=-lcudart
# Build .cu sources containing device code.
#
# CUDA_LO_FILES= # .lo files result of .cu files.
# CUDA_FLAGS=--x=cu
#
# .cu.lo: .cu
# $(NVCC) $(CUDA_FLAGS) -I$(top_srcdir)/include -dc -o $<.cu.o $<
# $(NVCC) $(CUDA_FLAGS) -I$(top_srcdir)/include -c -o $<.o $<
# $(top_builddir)/o2lo $@ $<.cu.o $<.o
#
# libamlcuda.la: $(CUDA_LO_FILES)
# $(LIBTOOL) --tag=CC --mode=link $(NVCC) -dlink -o $@ $< -lcudart
#
# libaml_la_LIBADD+=libamlcuda.la
endif
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
******************************************************************************/
#include <cuda.h>
#include <cuda_runtime.h>
#include "aml.h"
#include "aml/area/cuda.h"
/*******************************************************************************
* Implementation
******************************************************************************/
static int aml_set_cuda_device(const int device, int *current_device)
{
	// Retrieve the device currently in use, if the caller asked for it.
	if (current_device != NULL) {
		switch (cudaGetDevice(current_device)) {
		case cudaErrorInsufficientDriver:
			return -AML_ENOTSUP;
		case cudaErrorNoDevice:
			return -AML_ENOTSUP;
		case cudaErrorInitializationError:
			return -AML_FAILURE;
		default:
			break;
		}
	}
	// Switch devices unless the target device is already current.
	if (current_device == NULL || device != *current_device) {
		switch (cudaSetDevice(device)) {
		case cudaErrorInvalidDevice:
			return -AML_EINVAL;
		case cudaErrorDeviceAlreadyInUse:
			return -AML_EBUSY;
		case cudaErrorInsufficientDriver:
			return -AML_ENOTSUP;
		case cudaErrorNoDevice:
			return -AML_ENOTSUP;
		case cudaErrorInitializationError:
			return -AML_FAILURE;
		default:
			return AML_SUCCESS;
		}
	}
	return AML_SUCCESS;
}
static inline int handle_malloc_error(const int cuda_error)
{
switch (cuda_error) {
case cudaErrorInvalidValue:
aml_errno = AML_EINVAL;
return 1;
case cudaErrorMemoryAllocation:
aml_errno = AML_ENOMEM;
return 1;
case cudaErrorNotSupported:
aml_errno = AML_ENOTSUP;
return 1;
case cudaErrorInsufficientDriver:
aml_errno = AML_ENOTSUP;
return 1;
case cudaErrorNoDevice:
aml_errno = AML_ENOTSUP;
return 1;
case cudaErrorInitializationError:
aml_errno = AML_FAILURE;
return 1;
case cudaErrorHostMemoryAlreadyRegistered:
aml_errno = AML_EBUSY;
return 1;
default:
return 0;
}
}
void *aml_area_cuda_mmap(const struct aml_area_data *area_data,
void *ptr, size_t size)
{
(void)ptr;
int aml_error;
int cuda_error;
int current_device;
void *ret;
struct aml_area_cuda_data *data =
(struct aml_area_cuda_data *)area_data;
// Set area target device.
if (data->device >= 0) {
aml_error = aml_set_cuda_device(data->device, &current_device);
if (aml_error != AML_SUCCESS) {
aml_errno = -aml_error;
return NULL;
}
}
// Actual allocation
if (ptr == NULL)
cuda_error = cudaMallocManaged(&ret, size, data->flags);
else {
// ptr points to already-allocated cpu memory; map it on device
// memory.
cuda_error =
cudaHostRegister(ptr, size,
cudaHostRegisterPortable);
if (handle_malloc_error(cuda_error))
return NULL;
cuda_error = cudaHostGetDevicePointer(&ret, ptr, 0);
}
// Attempt to restore the original device.
// If restoring fails, attempt to set aml_errno accordingly.
// However, it might be overwritten when handling the allocation
// error code.
if (data->device >= 0 && current_device != data->device) {
aml_error = aml_set_cuda_device(current_device, NULL);
aml_errno = aml_error != AML_SUCCESS ? -aml_error : aml_errno;
}
// Handle allocation error code.
if (handle_malloc_error(cuda_error))
return NULL;
return ret;
}
int aml_area_cuda_munmap(const struct aml_area_data *area_data,
void *ptr, const size_t size)
{
(void) (area_data);
(void) (size);
int cuda_error = cudaHostUnregister(ptr);
if (cuda_error == cudaErrorHostMemoryNotRegistered ||
cuda_error == cudaErrorInvalidValue){
cuda_error = cudaFree(ptr);
}
switch (cuda_error) {
case cudaErrorInvalidValue:
return -AML_EINVAL;
case cudaSuccess:
return AML_SUCCESS;
default:
printf("cudaError: %s\n", cudaGetErrorString(cuda_error));
return -AML_FAILURE;
}
}
/*******************************************************************************
* Areas Initialization
******************************************************************************/
int aml_area_cuda_create(struct aml_area **area,
const int device,
const enum aml_area_cuda_flags flags)
{
struct aml_area *ret;
struct aml_area_cuda_data *data;
int max_devices;
if (cudaGetDeviceCount(&max_devices) != cudaSuccess)
return -AML_FAILURE;
ret = AML_INNER_MALLOC_2(struct aml_area, struct aml_area_cuda_data);
if (ret == NULL)
return -AML_ENOMEM;
data = AML_INNER_MALLOC_NEXTPTR(ret, struct aml_area,
struct aml_area_cuda_data);
ret->ops = &aml_area_cuda_ops;
ret->data = (struct aml_area_data *)data;
switch (flags) {
case AML_AREA_CUDA_ATTACH_GLOBAL:
data->flags = cudaMemAttachGlobal;
break;
case AML_AREA_CUDA_ATTACH_HOST:
data->flags = cudaMemAttachHost;
break;
default:
data->flags = cudaMemAttachHost;
break;
}
data->device = device < 0 || device >= max_devices ? -1 : device;
*area = ret;
return AML_SUCCESS;
}
void aml_area_cuda_destroy(struct aml_area **area)
{
if (*area == NULL)
return;
free(*area);
*area = NULL;
}
/*******************************************************************************
* Areas declaration
******************************************************************************/
struct aml_area_cuda_data aml_area_cuda_data_default = {
.flags = cudaMemAttachHost,
.device = -1,
};
struct aml_area_ops aml_area_cuda_ops = {
.mmap = aml_area_cuda_mmap,
.munmap = aml_area_cuda_munmap
};
struct aml_area aml_area_cuda = {
.ops = &aml_area_cuda_ops,
.data = (struct aml_area_data *)(&aml_area_cuda_data_default)
};
......@@ -17,12 +17,13 @@ static const char * const aml_error_strings[] = {
[AML_ENOMEM] = "Not enough memory",
[AML_EINVAL] = "Invalid argument",
[AML_EDOM] = "Value out of bound",
[AML_EBUSY] = "Underlying resource is not available for operation",
[AML_ENOTSUP] = "Operation not supported",
};
const char *aml_strerror(const int err)
{
if (err < 0 || err < AML_ERROR_MAX)
if (err < 0 || err >= AML_ERROR_MAX)
return "Unknown error";
return aml_error_strings[err];
}
......
......@@ -29,6 +29,10 @@ DMA_LINUX_TESTS = dma/test_dma_linux_seq \
SCRATCH_TESTS = scratch/test_scratch_seq \
scratch/test_scratch_par
if RUN_CUDA
AREA_TESTS += area/test_cuda
endif
# unit tests
UNIT_TESTS = $(UTILS_TESTS) \
$(TILING_TESTS) \
......
......@@ -21,7 +21,7 @@ void test_map(const struct aml_area *area){
const size_t sizes[4] = {1, 32, 4096, 1<<20};
for(s = 0; s<sizeof(sizes)/sizeof(*sizes); s++){
ptr = aml_area_mmap(area, &ptr, sizes[s]);
ptr = aml_area_mmap(area, NULL, sizes[s]);
assert(ptr != NULL);
memset(ptr, 0, sizes[s]);
assert(aml_area_munmap(area, ptr, sizes[s]) == AML_SUCCESS);
......
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
#include "aml.h"
#include "config.h"
#include "aml/area/cuda.h"
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
const size_t sizes[4] = {1, 32, 4096, 1<<20};
void test_mmap(const struct aml_area *area, const size_t size)
{
int err;
void *host_data;
void *host_copy;
void *device_data;
int flags;
flags = ((struct aml_area_cuda_data *)area->data)->flags;
host_data = malloc(size);
assert(host_data);
memset(host_data, 1, size);
host_copy = malloc(size);
assert(host_copy);
memset(host_copy, 0, size);
// Test standalone GPU side allocation.
device_data = aml_area_mmap(area, NULL, size);
assert(device_data);
err = cudaMemcpy(device_data, host_data, size, cudaMemcpyHostToDevice);
assert(err == cudaSuccess);
err = cudaMemcpy(host_copy, device_data, size, cudaMemcpyDeviceToHost);
assert(err == cudaSuccess);
assert(!memcmp(host_data, host_copy, size));