Commit 001799b6 authored by Swann Perarnau's avatar Swann Perarnau

Merge branch 'area_cuda' into 'master'

Area cuda

See merge request !76
parents a97d2d96 df3b0f85
Pipeline #8326 passed with stages
in 10 minutes and 48 seconds
@@ -9,6 +9,9 @@ stages:
repoquality:
stage: style
except:
- /^wip.*/
- /^WIP.*/
script:
- nix run -f "$ARGOPKGS" repoquality --command repoquality
tags:
@@ -16,6 +19,9 @@ repoquality:
checkpatch:
stage: style
except:
- /^wip.*/
- /^WIP.*/
tags:
- integration
script:
@@ -86,6 +92,9 @@ make:knl:
readthedocs:
stage: docs
except:
- /^wip.*/
- /^WIP.*/
when: on_success
only:
- master
@@ -97,6 +106,9 @@ readthedocs:
dist:
stage: release
except:
- /^wip.*/
- /^WIP.*/
when: on_success
only:
- tags
......
@@ -5,3 +5,4 @@ Valentin Reis <fre@freux.fr>
Nicolas Denoyelle <ndenoyelle@anl.gov>
Clement Foyer <cfoyer@cray.com>
Brice Videau <bvideau@anl.gov>
Aleksandr Danilin <danilin96@gmail.com>
@@ -27,7 +27,8 @@ AM_PROG_CC_C_O
AC_PROG_CPP
AC_TYPE_SIZE_T
AC_TYPE_INTPTR_T
AC_PROG_AWK
AC_PROG_GREP
AM_PROG_AR
LT_INIT
@@ -38,8 +39,9 @@ AC_SUBST([PACKAGE_VERSION_MAJOR],[VERSION_MAJOR])
AC_SUBST([PACKAGE_VERSION_MINOR],[VERSION_MINOR])
AC_SUBST([PACKAGE_VERSION_PATCH],[VERSION_PATCH])
# support for testing with valgrind
###################################
AC_ARG_ENABLE(valgrind,
[AS_HELP_STRING([--enable-valgrind],[Also valgrind on checks (default is no).])],
[valgrind=true],[valgrind=false])
@@ -53,6 +55,8 @@ fi
AM_CONDITIONAL([TEST_VALGRIND],[test "x$valgrind" = xtrue])
# support for compiling benchmarks
##################################
AC_ARG_ENABLE(benchmarks,
[AS_HELP_STRING([--enable-benchmarks],[Compile additional benchmarks (default is no).])],
[benchmarks=true],[benchmarks=false])
@@ -66,17 +70,22 @@ AM_CONDITIONAL([ADD_BENCHMARKS],[test "x$benchmarks" = xtrue])
AC_CHECK_LIB(dl, dlopen)
# add pthread support.
######################
# doc in m4/ax_pthread.m4. Defines automake PTHREAD_CFLAGS and PTHREAD_LIBS
AX_PTHREAD([],[AC_MSG_ERROR([Cannot find how to compile with pthreads.])])
CC="$PTHREAD_CC"
# NUMA support
##############
AC_CHECK_HEADERS([numa.h],,[AC_MSG_ERROR([AML requires libnuma headers.])])
AC_CHECK_HEADERS([numaif.h],,[AC_MSG_ERROR([AML requires libnuma headers.])])
AC_CHECK_LIB(numa, mbind,,[AC_MSG_ERROR([AML requires libnuma.])])
# check doxygen + sphinx for documentation build
################################################
AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [no])
AC_CHECK_PROG([SPHINXBUILD], [sphinx-build], [sphinx-build], [no])
if [[ "x$DOXYGEN" != xno ]]; then
@@ -93,6 +102,47 @@ else
fi
AM_CONDITIONAL([BUILD_DOCS],[ test "x$BUILD_DOCS" = xyes ])
# check nvidia compiler and libraries
#####################################
BUILD_CUDA=no
AC_DEFINE([HAVE_CUDA], [0], [Whether aml supports cuda library calls.])
AC_DEFINE([RUN_CUDA], [0], [Whether the machine on which aml is compiled can run cuda code.])
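
# HAVE_CUDA tells whether the cuda toolchain is available to compile cuda
# code; RUN_CUDA additionally tells whether this machine can actually execute
# cuda code, and is used to enable the cuda tests.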
# Check compilation features
AC_CHECK_PROG([NVCC], [nvcc], [nvcc], [no])
AC_CHECK_LIB(cudart, cudaMalloc, [CUDART=yes], [CUDART=no])
AC_CHECK_HEADERS([cuda.h], [CUDA_H=yes], [CUDA_H=no])
AC_CHECK_HEADERS([cuda_runtime.h], [CUDA_RUNTIME_H=yes], [CUDA_RUNTIME_H=no])
if [[ "x$NVCC" != xno ]] && \
[[ "x$CUDART" = xyes ]] && \
[[ "x$CUDA_H" = xyes ]] && \
[[ "x$CUDA_RUNTIME_H" = xyes ]]
then
BUILD_CUDA=yes
AC_DEFINE([HAVE_CUDA], [1], [Whether aml supports cuda library calls.])
fi
AM_CONDITIONAL([BUILD_CUDA],[ test "x$BUILD_CUDA" = xyes ])
# Check runtime features
if [[ "x$BUILD_CUDA" = xyes ]]; then
LIBS="$LIBS -lcudart"
RUN_CUDA=no
AC_MSG_CHECKING([that cudart code runs without error])
AC_RUN_IFELSE(
[AC_LANG_PROGRAM([[
#include <cuda.h>
#include <cuda_runtime.h>]],
[int device; return cudaGetDevice(&device) == cudaSuccess ? 0 : 1;])],
[AC_DEFINE([RUN_CUDA], [1], [Whether the machine on which aml is compiled can run cuda code.])
RUN_CUDA=yes],[])
AC_MSG_RESULT($RUN_CUDA)
fi
AM_CONDITIONAL([RUN_CUDA],[ test "x$RUN_CUDA" = xyes ])
# Output
########
@@ -104,6 +154,7 @@ AC_CONFIG_FILES([Makefile
tests/Makefile
doc/Makefile
benchmarks/Makefile
o2lo
aml.pc
include/aml/utils/version.h])
include/aml/utils/version.h], [chmod +x o2lo])
AC_OUTPUT
@@ -2,7 +2,8 @@ include_HEADERS=aml.h
include_aml_areadir=$(includedir)/aml/area
include_aml_area_HEADERS = \
aml/area/linux.h
aml/area/linux.h \
aml/area/cuda.h
include_aml_layoutdir=$(includedir)/aml/layout
include_aml_layout_HEADERS = \
......
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
/**
* @defgroup aml_area_cuda "AML Cuda Areas"
* @brief Cuda Implementation of Areas.
* #include <aml/area/cuda.h>
*
 * Cuda implementation of AML areas.
 * This building block relies on the cuda implementation of
 * malloc/free to provide mmap/munmap on device memory.
 * Additional documentation of the cuda runtime API can be found here:
 * @see https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html
 *
 * AML cuda areas may be created to allocate memory on the current device or
 * on a specific cuda device. Allocations can be private to a single device
 * or shared across devices. Finally, allocations can be backed by host
 * memory.
*
* @{
**/
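
/**
 * A minimal usage sketch (illustrative only, assuming a usable cuda device):
 * allocate device memory through the default cuda area, then release it.
 *
 * @code
 * #include <aml.h>
 * #include <aml/area/cuda.h>
 *
 * void *buf = aml_area_mmap(&aml_area_cuda, NULL, 1 << 20);
 * if (buf == NULL) {
 *         aml_perror("aml_area_cuda");
 * } else {
 *         // ... access buf from cuda kernels or with cudaMemcpy() ...
 *         aml_area_munmap(&aml_area_cuda, buf, 1 << 20);
 * }
 * @endcode
 **/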
/**
 * Structure containing the aml area hooks for the cuda implementation.
 * For now there is only a single implementation of the hooks;
 * it chooses between the different cuda allocation functions.
**/
extern struct aml_area_ops aml_area_cuda_ops;
/**
 * Default cuda area, with private mapping on the current device.
 * Can be used out of the box with the aml_area_*() functions.
**/
extern struct aml_area aml_area_cuda;
/**
* Allocation flags to pass to cudaMallocManaged().
* @see cuda runtime API documentation / memory management.
**/
enum aml_area_cuda_flags {
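	/** Maps to cudaMemAttachGlobal: memory is accessible from any stream on any device. **/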
AML_AREA_CUDA_ATTACH_GLOBAL,
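	/** Maps to cudaMemAttachHost: memory is accessible from the host; device access requires explicit stream attachment. **/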
AML_AREA_CUDA_ATTACH_HOST,
};
/** Implementation of aml_area_data for cuda areas. **/
struct aml_area_cuda_data {
/** allocation flags in cuda format **/
int flags;
/** The device id on which allocation is done. **/
int device;
};
/**
* \brief Cuda area creation.
*
 * @param[out] area: A pointer where the newly created area will be stored.
 * @param[in] device: A valid cuda device id, i.e. from 0 to num_devices-1.
 * If the device id is negative, then no cuda device will be selected when
 * using aml_area_cuda_mmap().
* @param[in] flags: Allocation flags.
*
* @return AML_SUCCESS on success and area points to the new aml_area.
* @return -AML_FAILURE if cuda API failed to provide the number of devices.
* @return -AML_EINVAL if device id is greater than or equal to the number
* of devices.
* @return -AML_ENOMEM if space to carry area cannot be allocated.
*
* @see enum aml_area_cuda_flags.
**/
int aml_area_cuda_create(struct aml_area **area,
const int device,
const enum aml_area_cuda_flags flags);
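
/**
 * For instance, a short sketch (assuming at least one cuda device):
 *
 * @code
 * struct aml_area *area;
 *
 * if (aml_area_cuda_create(&area, 0, AML_AREA_CUDA_ATTACH_GLOBAL) == AML_SUCCESS) {
 *         void *buf = aml_area_mmap(area, NULL, 4096);
 *         // ... use buf on device 0, then clean up ...
 *         aml_area_munmap(area, buf, 4096);
 *         aml_area_cuda_destroy(&area);
 * }
 * @endcode
 **/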
/**
* \brief Cuda area destruction.
*
* Destroy (finalize and free resources) a struct aml_area created by
* aml_area_cuda_create().
*
 * @param[in, out] area: The area to destroy; it is set to NULL after this call.
**/
void aml_area_cuda_destroy(struct aml_area **area);
/**
* \brief Cuda implementation of mmap operation for aml area.
*
 * This function is a wrapper around the cuda allocation functions.
 * It uses the area settings to select the device on which to allocate,
 * to choose the allocation function, and to set its parameters.
 * Allocations can be standalone on a device, shared across multiple devices,
 * or backed with cpu memory.
 * Device selection is not thread safe, because it updates the global
 * state of the cuda library. When a device is selected, the allocation may
 * succeed even though restoring the previous device afterwards fails. To
 * catch that error when the return value is not NULL, set aml_errno to
 * AML_SUCCESS prior to calling this function.
*
 * @param[in] area_data: The structure containing the cuda area settings.
 * @param[in, out] ptr: If ptr is NULL, then cudaMallocManaged() is called
 * with the area flags, and memory is allocated device side only.
 * If ptr is not NULL:
 * * ptr must point to a valid host memory area;
 * device side memory will be mapped onto this host side memory.
 * According to the cuda runtime API documentation
 * (cudaHostRegister()), the host side memory pages will be locked, or the
 * allocation will fail.
 * @param[in] size: The size to allocate.
 *
 * @return A cuda pointer to the allocated device memory on success, NULL on
 * failure. If a failure occurs, the aml_errno variable is set to one of the
 * following values:
 * * AML_ENOTSUP if one of the cuda calls failed with error:
 * cudaErrorInsufficientDriver, cudaErrorNoDevice.
 * * AML_EINVAL if the target device id is not valid.
 * * AML_EBUSY if a specific device was requested and the call failed with
 * error cudaErrorDeviceAlreadyInUse, or if the region was already mapped on
 * the device.
 * * AML_ENOMEM if the memory allocation failed with error
 * cudaErrorMemoryAllocation.
 * * AML_FAILURE if one of the cuda calls resulted in error
 * cudaErrorInitializationError.
**/
void *aml_area_cuda_mmap(const struct aml_area_data *area_data,
void *ptr, size_t size);
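
/**
 * The aml_errno pattern described above, sketched (illustrative only):
 *
 * @code
 * aml_errno = AML_SUCCESS;
 * void *p = aml_area_cuda_mmap(area->data, NULL, size);
 * if (p == NULL || aml_errno != AML_SUCCESS) {
 *         // Handle failure; restoring the previous device may have
 *         // failed even though p is not NULL.
 * }
 * @endcode
 **/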
/**
* \brief munmap hook for aml area.
*
 * Unmap memory mapped with aml_area_cuda_mmap().
 * @param[in] area_data: Ignored.
* @param[in, out] ptr: The virtual memory to unmap.
* @param[in] size: The size of virtual memory to unmap.
* @return -AML_EINVAL if cudaFree() returned cudaErrorInvalidValue.
* @return AML_SUCCESS otherwise.
**/
int
aml_area_cuda_munmap(const struct aml_area_data *area_data,
void *ptr, const size_t size);
/**
* @}
**/
@@ -78,10 +78,16 @@ void aml_perror(const char *msg);
**/
#define AML_ENOTSUP 5
/**
 * Invoked AML abstraction function has failed
* because the resource it works on was busy.
**/
#define AML_EBUSY 6
/**
* Max allowed value for errors.
**/
#define AML_ERROR_MAX 6
#define AML_ERROR_MAX 7
/**
* @}
......
#!/bin/sh
## ---------------------------------------------------------- ##
## $1: .lo file ##
## $2: .cu.o file object file with dynamic symbol relocation ##
## $3: .o file object file with static symbol relocation ##
## ---------------------------------------------------------- ##
AWK=@AWK@
GREP=@GREP@
LIBTOOL=@abs_top_builddir@/libtool
if test $# -lt 3; then
	echo "Create a libtool .lo file provided the target .o files"
	echo ""
	echo "\t$0 <file.lo> <file.cu.o> <file.o>"
	exit 1
fi
LO_FILE=$1
PIC_FILE=$(echo $2 | $AWK -F '/' '{print $NF}')
O_FILE=$(echo $3 | $AWK -F '/' '{print $NF}')
LIB_PIC_FILE=.libs/$O_FILE
BASENAME=$(echo $2 | $AWK -F '/' '{for(i=1; i<NF; i++) printf "%s/", $i}')
PROGRAM=$($GREP -m 1 PROGRAM= $LIBTOOL | cut -d = -f 2)
PACKAGE=$($GREP -m 1 PACKAGE= $LIBTOOL | cut -d = -f 2)
VERSION=$($GREP -m 1 VERSION= $LIBTOOL | cut -d = -f 2)
must_libtool_define() {
eval "var=\$$1"
if [ -z "$var" ]; then
echo "libtool script did not contain $1 variable used to build .lo file."
		exit 1
fi
}
must_libtool_define "PROGRAM"
must_libtool_define "PACKAGE"
must_libtool_define "VERSION"
must_exist() {
if [ ! -f $1 ]; then
echo "File $1 is missing."
		exit 1
fi
}
must_exist $PWD/$BASENAME$O_FILE
must_exist $PWD/$BASENAME$PIC_FILE
mv $PWD/$BASENAME$PIC_FILE $PWD/$BASENAME$LIB_PIC_FILE
echo "# Generated by $PROGRAM (GNU $PACKAGE) $VERSION" > ${LO_FILE}
echo "# Generated by $PROGRAM (GNU $PACKAGE) $VERSION" >> ${LO_FILE}
echo "# $LO_FILE - a libtool object file" >> ${LO_FILE}
echo "# Please DO NOT delete this file!" >> ${LO_FILE}
echo "# It is necessary for linking the library." >> ${LO_FILE}
echo "" >> ${LO_FILE}
echo "# Name of the PIC object." >> ${LO_FILE}
echo "pic_object=$LIB_PIC_FILE" >> ${LO_FILE}
echo "" >> ${LO_FILE}
echo "# Name of the non-PIC object" >> ${LO_FILE}
echo "non_pic_object=$O_FILE" >> ${LO_FILE}
SUFFIXES=.c .cu
AM_CPPFLAGS = -I$(top_srcdir)/include $(PTHREAD_CFLAGS)
AM_LDFLAGS = $(PTHREAD_LIBS)
noinst_LTLIBRARIES=
#############################################
# .C sources
AREA_SOURCES = \
area/area.c \
@@ -38,7 +42,36 @@ LIB_SOURCES = \
$(TILING_SOURCES) \
$(LAYOUT_SOURCES) \
$(UTILS_SOURCES) \
$(CUDA_AREA_SOURCES) \
aml.c
lib_LTLIBRARIES = libaml.la
libaml_la_SOURCES = $(LIB_SOURCES)
libaml_la_LDFLAGS=
libaml_la_SOURCES=$(LIB_SOURCES)
#############################################
# Cuda sources
if BUILD_CUDA
# Build .c sources using cuda runtime library.
libaml_la_SOURCES+=area/cuda.c
libaml_la_LDFLAGS+=-lcudart
# Build .cu sources containing device code.
#
# CUDA_LO_FILES= # .lo files result of .cu files.
# CUDA_FLAGS=--x=cu
#
# .cu.lo: .cu
# $(NVCC) $(CUDA_FLAGS) -I$(top_srcdir)/include -dc -o $<.cu.o $<
# $(NVCC) $(CUDA_FLAGS) -I$(top_srcdir)/include -c -o $<.o $<
# $(top_builddir)/o2lo $@ $<.cu.o $<.o
#
# libamlcuda.la: $(CUDA_LO_FILES)
# $(LIBTOOL) --tag=CC --mode=link $(NVCC) -dlink -o $@ $< -lcudart
#
# libaml_la_LIBADD+=libamlcuda.la
endif
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
******************************************************************************/
#include <stdio.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include "aml.h"
#include "aml/area/cuda.h"
/*******************************************************************************
* Implementation
******************************************************************************/
static int aml_set_cuda_device(const int device, int *current_device)
{
	/* If requested, save the device that is current prior to the switch,
	 * so that the caller can restore it later. */
	if (current_device != NULL) {
		switch (cudaGetDevice(current_device)) {
		case cudaErrorInsufficientDriver:
		case cudaErrorNoDevice:
			return -AML_ENOTSUP;
		case cudaErrorInitializationError:
			return -AML_FAILURE;
		default:
			break;
		}
	}

	/* Switch devices, unless the target device is already current. */
	if (current_device == NULL || device != *current_device) {
		switch (cudaSetDevice(device)) {
		case cudaErrorInvalidDevice:
			return -AML_EINVAL;
		case cudaErrorDeviceAlreadyInUse:
			return -AML_EBUSY;
		case cudaErrorInsufficientDriver:
		case cudaErrorNoDevice:
			return -AML_ENOTSUP;
		case cudaErrorInitializationError:
			return -AML_FAILURE;
		default:
			break;
		}
	}
	return AML_SUCCESS;
}
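
/* Translate a cuda allocation error into aml_errno; returns 1 if an error
 * was recognized and aml_errno was set, 0 otherwise. */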
static inline int handle_malloc_error(const int cuda_error)
{
switch (cuda_error) {
case cudaErrorInvalidValue:
aml_errno = AML_EINVAL;
return 1;
case cudaErrorMemoryAllocation:
aml_errno = AML_ENOMEM;
return 1;
case cudaErrorNotSupported:
aml_errno = AML_ENOTSUP;
return 1;
case cudaErrorInsufficientDriver:
aml_errno = AML_ENOTSUP;
return 1;
case cudaErrorNoDevice:
aml_errno = AML_ENOTSUP;
return 1;
case cudaErrorInitializationError:
aml_errno = AML_FAILURE;
return 1;
case cudaErrorHostMemoryAlreadyRegistered:
aml_errno = AML_EBUSY;
return 1;
default:
return 0;
}
}
void *aml_area_cuda_mmap(const struct aml_area_data *area_data,
void *ptr, size_t size)
{
int aml_error;
int cuda_error;
int current_device;
void *ret;
struct aml_area_cuda_data *data =
(struct aml_area_cuda_data *)area_data;
// Set area target device.
if (data->device >= 0) {
aml_error = aml_set_cuda_device(data->device, &current_device);
if (aml_error != AML_SUCCESS) {
aml_errno = -aml_error;
return NULL;
}
}
// Actual allocation
if (ptr == NULL)
cuda_error = cudaMallocManaged(&ret, size, data->flags);
else {
		// ptr points to previously allocated cpu memory; map it
		// onto device memory.
cuda_error =
cudaHostRegister(ptr, size,
cudaHostRegisterPortable);
if (handle_malloc_error(cuda_error))
return NULL;
cuda_error = cudaHostGetDevicePointer(&ret, ptr, 0);
}
	// Attempt to restore the original device.
	// If that fails, attempt to set aml_errno; note that it may be
	// overwritten below when handling the allocation error code.
if (data->device >= 0 && current_device != data->device) {
aml_error = aml_set_cuda_device(current_device, NULL);
aml_errno = aml_error != AML_SUCCESS ? -aml_error : aml_errno;
}
// Handle allocation error code.
if (handle_malloc_error(cuda_error))
return NULL;
return ret;
}
int aml_area_cuda_munmap(const struct aml_area_data *area_data,
void *ptr, const size_t size)
{
(void) (area_data);
(void) (size);
int cuda_error = cudaHostUnregister(ptr);
if (cuda_error == cudaErrorHostMemoryNotRegistered ||
cuda_error == cudaErrorInvalidValue){
cuda_error = cudaFree(ptr);
}
switch (cuda_error) {
case cudaErrorInvalidValue:
return -AML_EINVAL;
case cudaSuccess:
return AML_SUCCESS;
default:
printf("cudaError: %s\n", cudaGetErrorString(cuda_error));
return -AML_FAILURE;
}
}
/*******************************************************************************
* Areas Initialization
******************************************************************************/
int aml_area_cuda_create(struct aml_area **area,
const int device,
const enum aml_area_cuda_flags flags)
{
struct aml_area *ret;
struct aml_area_cuda_data *data;
	int max_devices;

	if (cudaGetDeviceCount(&max_devices) != cudaSuccess)
		return -AML_FAILURE;
	if (device >= max_devices)
		return -AML_EINVAL;
ret = AML_INNER_MALLOC_2(struct aml_area, struct aml_area_cuda_data);
if (ret == NULL)
return -AML_ENOMEM;
data = AML_INNER_MALLOC_NEXTPTR(ret, struct aml_area,
struct aml_area_cuda_data);
ret->ops = &aml_area_cuda_ops;
ret->data = (struct aml_area_data *)data;
switch (flags) {
case AML_AREA_CUDA_ATTACH_GLOBAL:
data->flags = cudaMemAttachGlobal;
break;
case AML_AREA_CUDA_ATTACH_HOST:
data->flags = cudaMemAttachHost;
break;
default:
data->flags = cudaMemAttachHost;
break;
}
data->device = device < 0 || device >= max_devices ? -1 : device;
*area = ret;
return AML_SUCCESS;
}
void aml_area_cuda_destroy(struct aml_area **area)
{
if (*area == NULL)
return;
free(*area);
*area = NULL;
}
/*******************************************************************************
* Areas declaration
******************************************************************************/
struct aml_area_cuda_data aml_area_cuda_data_default = {
.flags = cudaMemAttachHost,
.device = -1,
};
struct aml_area_ops aml_area_cuda_ops = {
.mmap = aml_area_cuda_mmap,
.munmap = aml_area_cuda_munmap
};
struct aml_area aml_area_cuda = {
.ops = &aml_area_cuda_ops,
.data = (struct aml_area_data *)(&aml_area_cuda_data_default)
};
@@ -17,12 +17,13 @@ static const char * const aml_error_strings[] = {
[AML_ENOMEM] = "Not enough memory",
[AML_EINVAL] = "Invalid argument",
[AML_EDOM] = "Value out of bound",
[AML_EBUSY] = "Underlying resource is not available for operation",
[AML_ENOTSUP] = "Operation not supported",
};
const char *aml_strerror(const int err)
{
if (err < 0 || err < AML_ERROR_MAX)
if (err < 0 || err >= AML_ERROR_MAX)
return "Unknown error";
return aml_error_strings[err];
}
......
@@ -29,6 +29,10 @@ DMA_LINUX_TESTS = dma/test_dma_linux_seq \
SCRATCH_TESTS = scratch/test_scratch_seq \
scratch/test_scratch_par
if RUN_CUDA
AREA_TESTS += area/test_cuda
endif
# unit tests
UNIT_TESTS = $(UTILS_TESTS) \
$(TILING_TESTS) \
......
@@ -21,7 +21,7 @@ void test_map(const struct aml_area *area){
const size_t sizes[4] = {1, 32, 4096, 1<<20};
for(s = 0; s<sizeof(sizes)/sizeof(*sizes); s++){
ptr = aml_area_mmap(area, &ptr, sizes[s]);
ptr = aml_area_mmap(area, NULL, sizes[s]);
assert(ptr != NULL);
memset(ptr, 0, sizes[s]);
assert(aml_area_munmap(area, ptr, sizes[s]) == AML_SUCCESS);
......
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
#include "aml.h"
#include "config.h"
#include "aml/area/cuda.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
const size_t sizes[4] = {1, 32, 4096, 1<<20};
void test_mmap(const struct aml_area *area, const size_t size)
{
int err;
void *host_data;
void *host_copy;
void *device_data;
host_data = malloc(size);
assert(host_data);
memset(host_data, 1, size);
host_copy = malloc(size);
assert(host_copy);
memset(host_copy, 0, size);
// Test standalone GPU side allocation.
device_data = aml_area_mmap(area, NULL, size);
assert(device_data);
err = cudaMemcpy(device_data, host_data, size, cudaMemcpyHostToDevice);
assert(err == cudaSuccess);
err = cudaMemcpy(host_copy, device_data, size, cudaMemcpyDeviceToHost);
assert(err == cudaSuccess);
assert(!memcmp(host_data, host_copy, size));
assert(aml_area_munmap(area, device_data, size) == AML_SUCCESS);
// Test memory mapped allocation
memset(host_copy, 0, size);
device_data = aml_area_mmap(area, host_data, size);
assert(device_data);
err = cudaMemcpy(host_copy, device_data, size, cudaMemcpyDeviceToHost);
assert(err == cudaSuccess);
assert(!memcmp(host_data, host_copy, size));
assert(aml_area_munmap(area, device_data, size) == AML_SUCCESS);
free(host_data);
free(host_copy);
}