Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
argo
aml
Commits
001799b6
Commit
001799b6
authored
Aug 23, 2019
by
Swann Perarnau
Browse files
Merge branch 'area_cuda' into 'master'
Area cuda See merge request
!76
parents
a97d2d96
df3b0f85
Pipeline
#8326
passed with stages
in 10 minutes and 48 seconds
Changes
13
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
.gitlab-ci.yml
View file @
001799b6
...
...
@@ -9,6 +9,9 @@ stages:
repoquality
:
stage
:
style
except
:
-
/^wip.*/
-
/^WIP.*/
script
:
-
nix run -f "$ARGOPKGS" repoquality --command repoquality
tags
:
...
...
@@ -16,6 +19,9 @@ repoquality:
checkpatch
:
stage
:
style
except
:
-
/^wip.*/
-
/^WIP.*/
tags
:
-
integration
script
:
...
...
@@ -86,6 +92,9 @@ make:knl:
readthedocs
:
stage
:
docs
except
:
-
/^wip.*/
-
/^WIP.*/
when
:
on_success
only
:
-
master
...
...
@@ -97,6 +106,9 @@ readthedocs:
dist
:
stage
:
release
except
:
-
/^wip.*/
-
/^WIP.*/
when
:
on_success
only
:
-
tags
...
...
AUTHORS
View file @
001799b6
...
...
@@ -5,3 +5,4 @@ Valentin Reis <fre@freux.fr>
Nicolas Denoyelle <ndenoyelle@anl.gov>
Clement Foyer <cfoyer@cray.com>
Brice Videau <bvideau@anl.gov>
Aleksandr Danilin <danilin96@gmail.com>
configure.ac
View file @
001799b6
...
...
@@ -27,7 +27,8 @@ AM_PROG_CC_C_O
AC_PROG_CPP
AC_TYPE_SIZE_T
AC_TYPE_INTPTR_T
AC_PROG_AWK
AC_PROG_GREP
AM_PROG_AR
LT_INIT
...
...
@@ -38,8 +39,9 @@ AC_SUBST([PACKAGE_VERSION_MAJOR],[VERSION_MAJOR])
AC_SUBST([PACKAGE_VERSION_MINOR],[VERSION_MINOR])
AC_SUBST([PACKAGE_VERSION_PATCH],[VERSION_PATCH])
# support for testing with valgrind
###################################
AC_ARG_ENABLE(valgrind,
[AS_HELP_STRING([--enable-valgrind],[Also valgrind on checks (default is no).])],
[valgrind=true],[valgrind=false])
...
...
@@ -53,6 +55,8 @@ fi
AM_CONDITIONAL([TEST_VALGRIND],[test "x$valgrind" = xtrue])
# support for compiling benchmarks
##################################
AC_ARG_ENABLE(benchmarks,
[AS_HELP_STRING([--enable-benchmarks],[Compile additional benchmarks (default is no).])],
[benchmarks=true],[benchmarks=false])
...
...
@@ -66,17 +70,22 @@ AM_CONDITIONAL([ADD_BENCHMARKS],[test "x$benchmarks" = xtrue])
AC_CHECK_LIB(dl, dlopen)
# add pthread support.
######################
# doc in m4/ax_pthread.m4. Defines automake PTHREAD_CFLAGS and PTHREAD_LIBS
AX_PTHREAD([],[AC_MSG_ERROR([Cannot find how to compile with pthreads.])])
CC="$PTHREAD_CC"
# NUMA support
##############
AC_CHECK_HEADERS([numa.h],,[AC_MSG_ERROR([AML requires libnuma headers.])])
AC_CHECK_HEADERS([numaif.h],,[AC_MSG_ERROR([AML requires libnuma headers.])])
AC_CHECK_LIB(numa, mbind,,[AC_MSG_ERROR([AML requires libnuma.])])
# check doxygen + sphinx for documentation build
################################################
AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [no])
AC_CHECK_PROG([SPHINXBUILD], [sphinx-build], [sphinx-build], [no])
if [[ "x$DOXYGEN" != xno ]]; then
...
...
@@ -93,6 +102,47 @@ else
fi
AM_CONDITIONAL([BUILD_DOCS],[ test "x$BUILD_DOCS" = xyes ])
# check nvidia compiler and libraries
#####################################
BUILD_CUDA=no
AC_DEFINE([HAVE_CUDA], [0], [Whether aml support cuda library calls.])
AC_DEFINE([RUN_CUDA], [0], [Whether the machine on which aml is compiled can run cuda code.])
# Check compilation features
AC_CHECK_PROG([NVCC], [nvcc], [nvcc], [no])
AC_CHECK_LIB(cudart, cudaMalloc, [CUDART=yes], [CUDART=no])
AC_CHECK_HEADERS([cuda.h], [CUDA_H=yes], [CUDA_H=no])
AC_CHECK_HEADERS([cuda_runtime.h], [CUDA_RUNTIME_H=yes], [CUDA_RUNTIME_H=no])
if [[ "x$NVCC" != xno ]] && \
[[ "x$CUDART" = xyes ]] && \
[[ "x$CUDA_H" = xyes ]] && \
[[ "x$CUDA_RUNTIME_H" = xyes ]]
then
BUILD_CUDA=yes
AC_DEFINE([HAVE_CUDA], [1], [Whether aml support cuda library calls.])
fi
AM_CONDITIONAL([BUILD_CUDA],[ test "x$BUILD_CUDA" = xyes ])
# Check runtime features
if [[ "x$BUILD_CUDA" = xyes ]]; then
LIBS="$LIBS -lcudart"
RUN_CUDA=no
AC_MSG_CHECKING([that cudart code runs without error])
AC_RUN_IFELSE(
[AC_LANG_PROGRAM([[
#include <cuda.h>
#include <cuda_runtime.h>]],
[int device; return cudaGetDevice(&device) == cudaSuccess ? 0 : 1;])],
[AC_DEFINE([RUN_CUDA], [1], [Whether the machine on which aml is compiled can run cuda code.])
RUN_CUDA=yes],[])
AC_MSG_RESULT($RUN_CUDA)
fi
AM_CONDITIONAL([RUN_CUDA],[ test "x$RUN_CUDA" = xyes ])
# Output
########
...
...
@@ -104,6 +154,7 @@ AC_CONFIG_FILES([Makefile
tests/Makefile
doc/Makefile
benchmarks/Makefile
o2lo
aml.pc
include/aml/utils/version.h])
include/aml/utils/version.h]
, [chmod +x o2lo]
)
AC_OUTPUT
include/Makefile.am
View file @
001799b6
...
...
@@ -2,7 +2,8 @@ include_HEADERS=aml.h
include_aml_areadir
=
$(includedir)
/aml/area
include_aml_area_HEADERS
=
\
aml/area/linux.h
aml/area/linux.h
\
aml/area/cuda.h
include_aml_layoutdir
=
$(includedir)
/aml/layout
include_aml_layout_HEADERS
=
\
...
...
include/aml/area/cuda.h
0 → 100644
View file @
001799b6
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
/**
* @defgroup aml_area_cuda "AML Cuda Areas"
* @brief Cuda Implementation of Areas.
* #include <aml/area/cuda.h>
*
* Cuda implementation of AML areas.
* This building block relies on Cuda implementation of
* malloc/free to provide mmap/munmap on device memory.
* Additional documentation of cuda runtime API can be found here:
* @see https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html
*
* AML cuda areas may be created to allocate current or specific cuda devices.
* Also allocations can be private to a single device or shared across devices.
* Finally allocations can be backed by host memory allocation.
*
* @{
**/
/**
* Structure containing aml area hooks for cuda implementation.
* For now there is only a single implementation of the hooks.
* This implementation will choose between different cuda functions.
**/
extern
struct
aml_area_ops
aml_area_cuda_ops
;
/**
* Default cuda area with private mapping in current device.
* Can be used out of the box with aml_area_*() functions.
**/
extern
struct
aml_area
aml_area_cuda
;
/**
* Allocation flags to pass to cudaMallocManaged().
* @see cuda runtime API documentation / memory management.
**/
enum
aml_area_cuda_flags
{
AML_AREA_CUDA_ATTACH_GLOBAL
,
AML_AREA_CUDA_ATTACH_HOST
,
};
/** Implementation of aml_area_data for cuda areas. **/
struct
aml_area_cuda_data
{
/** allocation flags in cuda format **/
int
flags
;
/** The device id on which allocation is done. **/
int
device
;
};
/**
* \brief Cuda area creation.
*
* @param[out] area pointer to an uninitialized struct aml_area pointer to
* receive the new area.
* @param[in] device: A valid cuda device id, i.e from 0 to num_devices-1.
* If device id is negative, then no cuda device will be selected when
* using aml_area_cuda_mmap().
* @param[in] flags: Allocation flags.
*
* @return AML_SUCCESS on success and area points to the new aml_area.
* @return -AML_FAILURE if cuda API failed to provide the number of devices.
* @return -AML_EINVAL if device id is greater than or equal to the number
* of devices.
* @return -AML_ENOMEM if space to carry area cannot be allocated.
*
* @see enum aml_area_cuda_flags.
**/
int
aml_area_cuda_create
(
struct
aml_area
**
area
,
const
int
device
,
const
enum
aml_area_cuda_flags
flags
);
/**
* \brief Cuda area destruction.
*
* Destroy (finalize and free resources) a struct aml_area created by
* aml_area_cuda_create().
*
* @param[in, out] area is NULL after this call.
**/
void
aml_area_cuda_destroy
(
struct
aml_area
**
area
);
/**
* \brief Cuda implementation of mmap operation for aml area.
*
* This function is a wrapper on cuda alloc functions.
* It uses area settings to: select device on which to perform allocation,
* select allocation function and set its parameters.
* Allocations can be standalone on device, shared across multiple devices,
* and backed with cpu memory.
 * Device selection is not thread safe and requires setting the global
 * state of the cuda library. When selecting a device, the allocation may
 * succeed while restoring the previously selected device afterwards fails.
 * In that case, set aml_errno to AML_SUCCESS prior to calling this
 * function in order to catch the error even when the return value is not
 * NULL.
*
* @param[in] area_data: The structure containing cuda area settings.
* @param[in, out] ptr: If ptr is NULL, then call cudaMallocManaged() with
* area flags. Memory will be allocated only device side.
* If ptr is not NULL:
* * ptr must point to a valid memory area.
* Device side memory will be mapped on this host side memory.
* According to cuda runtime API documentation
* (cudaHostRegister()), host side memory pages will be locked or allocation
* will fail.
* @param[in] size: The size to allocate.
*
* @return A cuda pointer to allocated device memory on success, NULL on
 * failure. If a failure occurs, the aml_errno variable is set to one of the
 * following values:
 * * AML_ENOTSUP if one of the cuda calls failed with error:
 * cudaErrorInsufficientDriver, cudaErrorNoDevice.
 * * AML_EINVAL if target device id is not valid.
 * * AML_EBUSY if a specific device was requested and the call failed with error
* cudaErrorDeviceAlreadyInUse, or if region was already mapped on device.
* * AML_ENOMEM if memory allocation failed with error
* cudaErrorMemoryAllocation.
* * AML_FAILURE if one of the cuda calls resulted in error
* cudaErrorInitializationError.
**/
void
*
aml_area_cuda_mmap
(
const
struct
aml_area_data
*
area_data
,
void
*
ptr
,
size_t
size
);
/**
* \brief munmap hook for aml area.
*
* unmap memory mapped with aml_area_cuda_mmap().
* @param[in] area_data: Ignored
* @param[in, out] ptr: The virtual memory to unmap.
* @param[in] size: The size of virtual memory to unmap.
* @return -AML_EINVAL if cudaFree() returned cudaErrorInvalidValue.
* @return AML_SUCCESS otherwise.
**/
int
aml_area_cuda_munmap
(
const
struct
aml_area_data
*
area_data
,
void
*
ptr
,
const
size_t
size
);
/**
* @}
**/
include/aml/utils/error.h
View file @
001799b6
...
...
@@ -78,10 +78,16 @@ void aml_perror(const char *msg);
**/
#define AML_ENOTSUP 5
/**
 * Invoked AML abstraction function has failed
 * because the resource it works on was busy.
**/
#define AML_EBUSY 6
/**
* Max allowed value for errors.
**/
#define AML_ERROR_MAX
6
#define AML_ERROR_MAX
7
/**
* @}
...
...
o2lo.in
0 → 100644
View file @
001799b6
#!/bin/sh
## ---------------------------------------------------------- ##
## Build a libtool .lo file from nvcc-built object files.     ##
## $1: .lo file                                               ##
## $2: .cu.o file object file with dynamic symbol relocation  ##
## $3: .o file object file with static symbol relocation      ##
## ---------------------------------------------------------- ##

AWK=@AWK@
GREP=@GREP@
LIBTOOL=@abs_top_builddir@/libtool

if test $# -lt 3; then
	echo "Create a libtool .lo file provided the target.o file"
	echo ""
	echo "\t$0 <file.lo> <file.cu.o> <file.o>"
	# Fix: previously the script kept running without its arguments.
	exit 1
fi

LO_FILE=$1
# Use the configure-detected $AWK (previously hard-coded `awk`).
PIC_FILE=$(echo "$2" | $AWK -F '/' '{print $NF}')
O_FILE=$(echo "$3" | $AWK -F '/' '{print $NF}')
LIB_PIC_FILE=.libs/$O_FILE
# Directory prefix of $2, reconstructed with a trailing slash per component.
BASENAME=$(echo "$2" | $AWK -F '/' '{for(i=1; i<NF; i++) {print $i "/"}}')

# Pull libtool metadata out of the generated libtool script.
PROGRAM=$($GREP -m 1 PROGRAM= "$LIBTOOL" | cut -d = -f 2)
PACKAGE=$($GREP -m 1 PACKAGE= "$LIBTOOL" | cut -d = -f 2)
VERSION=$($GREP -m 1 VERSION= "$LIBTOOL" | cut -d = -f 2)

# Abort (with failure) when the libtool script did not define variable $1.
must_libtool_define()
{
	eval "var=\$$1"
	if [ -z "$var" ]; then
		echo "libtool script did not contain $1 variable used to build .lo file."
		# Fix: bare `exit` returned the last command's status (0);
		# report failure explicitly.
		exit 1
	fi
}

must_libtool_define "PROGRAM"
must_libtool_define "PACKAGE"
must_libtool_define "VERSION"

# Abort (with failure) when file $1 does not exist.
must_exist()
{
	if [ ! -f "$1" ]; then
		echo "File $1 is missing."
		exit 1
	fi
}

must_exist "$PWD/$BASENAME$O_FILE"
must_exist "$PWD/$BASENAME$PIC_FILE"

# Move the PIC object into .libs/ where libtool expects it.
mv "$PWD/$BASENAME$PIC_FILE" "$PWD/$BASENAME$LIB_PIC_FILE"

# Emit the .lo file.
# Fix: the "# Generated by" header was previously written twice.
echo "# Generated by $PROGRAM (GNU $PACKAGE) $VERSION" > ${LO_FILE}
echo "# $LO_FILE - a libtool object file" >> ${LO_FILE}
echo "# Please DO NOT delete this file!" >> ${LO_FILE}
echo "# It is necessary for linking the library." >> ${LO_FILE}
echo "" >> ${LO_FILE}
echo "# Name of the PIC object." >> ${LO_FILE}
echo "pic_object=$LIB_PIC_FILE" >> ${LO_FILE}
echo "" >> ${LO_FILE}
echo "# Name of the non-PIC object" >> ${LO_FILE}
echo "non_pic_object=$O_FILE" >> ${LO_FILE}
src/Makefile.am
View file @
001799b6
SUFFIXES
=
.c .cu
AM_CPPFLAGS
=
-I
$(top_srcdir)
/include
$(PTHREAD_CFLAGS)
AM_LDFLAGS
=
$(PTHREAD_LIBS)
noinst_LTLIBRARIES
=
#############################################
# .C sources
AREA_SOURCES
=
\
area/area.c
\
...
...
@@ -38,7 +42,36 @@ LIB_SOURCES = \
$(TILING_SOURCES)
\
$(LAYOUT_SOURCES)
\
$(UTILS_SOURCES)
\
$(CUDA_AREA_SOURCES)
\
aml.c
lib_LTLIBRARIES
=
libaml.la
libaml_la_SOURCES
=
$(LIB_SOURCES)
libaml_la_LDFLAGS
=
libaml_la_SOURCES
=
$(LIB_SOURCES)
#############################################
# Cuda sources
if
BUILD_CUDA
# Build .c sources using cuda runtime library.
libaml_la_SOURCES
+=
area/cuda.c
libaml_la_LDFLAGS
+=
-lcudart
# Build .cu sources containing device code.
#
# CUDA_LO_FILES= # .lo files result of .cu files.
# CUDA_FLAGS=--x=cu
#
# .cu.lo: .cu
# $(NVCC) $(CUDA_FLAGS) -I$(top_srcdir)/include -dc -o $<.cu.o $<
# $(NVCC) $(CUDA_FLAGS) -I$(top_srcdir)/include -c -o $<.o $<
# $(top_builddir)/o2lo $@ $<.cu.o $<.o
#
# libamlcuda.la: $(CUDA_LO_FILES)
# $(LIBTOOL) --tag=CC --mode=link $(NVCC) -dlink -o $@ $< -lcudart
#
# libaml_la_LIBADD+=libamlcuda.la
endif
src/area/cuda.c
0 → 100644
View file @
001799b6
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
******************************************************************************/
#include <cuda.h>
#include <cuda_runtime.h>
#include "aml.h"
#include "aml/area/cuda.h"
/*******************************************************************************
* Implementation
******************************************************************************/
/**
 * Select a cuda device, optionally saving the previously selected one.
 *
 * @param[in] device: the cuda device id to make current via cudaSetDevice().
 * @param[out] current_device: if not NULL, receives the device id that was
 * current before this call (filled by cudaGetDevice()). When the previously
 * current device already matches the requested one, no switch is performed.
 * @return AML_SUCCESS on success.
 * @return -AML_ENOTSUP on cudaErrorInsufficientDriver / cudaErrorNoDevice.
 * @return -AML_EINVAL on cudaErrorInvalidDevice.
 * @return -AML_EBUSY on cudaErrorDeviceAlreadyInUse.
 * @return -AML_FAILURE on cudaErrorInitializationError.
 **/
static int aml_set_cuda_device(const int device, int *current_device)
{
	if (current_device != NULL) {
		/* Fill *current_device BEFORE comparing against it: the
		 * previous code tested device != *current_device while
		 * *current_device was still uninitialized. */
		switch (cudaGetDevice(current_device)) {
		case cudaErrorInsufficientDriver:
		case cudaErrorNoDevice:
			return -AML_ENOTSUP;
		case cudaErrorInitializationError:
			return -AML_FAILURE;
		default:
			break;
		}
		/* Target device already selected: nothing to do. */
		if (device == *current_device)
			return AML_SUCCESS;
	}

	/* Must also run when current_device == NULL: that is the restore
	 * path used by aml_area_cuda_mmap(), which previously skipped
	 * cudaSetDevice() entirely and never restored the device. */
	switch (cudaSetDevice(device)) {
	case cudaErrorInvalidDevice:
		return -AML_EINVAL;
	case cudaErrorDeviceAlreadyInUse:
		return -AML_EBUSY;
	case cudaErrorInsufficientDriver:
	case cudaErrorNoDevice:
		return -AML_ENOTSUP;
	case cudaErrorInitializationError:
		return -AML_FAILURE;
	default:
		return AML_SUCCESS;
	}
}
/**
 * Map a cuda allocation error code onto aml_errno.
 *
 * @param[in] cuda_error: a cudaError_t value returned by an allocation or
 * registration call.
 * @return 1 when cuda_error is a recognized failure (aml_errno is set to
 * the matching AML error), 0 otherwise (aml_errno untouched).
 **/
static inline int handle_malloc_error(const int cuda_error)
{
	/* Group the cuda error codes by the aml error they translate to. */
	switch (cuda_error) {
	case cudaErrorInvalidValue:
		aml_errno = AML_EINVAL;
		break;
	case cudaErrorMemoryAllocation:
		aml_errno = AML_ENOMEM;
		break;
	case cudaErrorNotSupported:
	case cudaErrorInsufficientDriver:
	case cudaErrorNoDevice:
		aml_errno = AML_ENOTSUP;
		break;
	case cudaErrorInitializationError:
		aml_errno = AML_FAILURE;
		break;
	case cudaErrorHostMemoryAlreadyRegistered:
		aml_errno = AML_EBUSY;
		break;
	default:
		/* Not an error we recognize; leave aml_errno alone. */
		return 0;
	}
	return 1;
}
/**
 * mmap hook for cuda areas; full contract documented in aml/area/cuda.h.
 *
 * Selects the area target device (if any), performs the allocation
 * (cudaMallocManaged() when ptr == NULL, else cudaHostRegister() +
 * cudaHostGetDevicePointer() to map existing host memory), then restores
 * the previously selected device before reporting errors.
 *
 * @param[in] area_data: struct aml_area_cuda_data with device/flags.
 * @param[in, out] ptr: NULL for device-side managed allocation, else valid
 * host memory to register and map on the device.
 * @param[in] size: bytes to allocate/register.
 * @return device pointer on success, NULL on failure with aml_errno set.
 **/
void *aml_area_cuda_mmap(const struct aml_area_data *area_data,
			 void *ptr, size_t size)
{
	int aml_error;
	int cuda_error;
	int current_device;
	void *ret = NULL;
	struct aml_area_cuda_data *data =
		(struct aml_area_cuda_data *)area_data;

	// Set area target device, saving the currently selected one.
	if (data->device >= 0) {
		aml_error = aml_set_cuda_device(data->device, &current_device);
		if (aml_error != AML_SUCCESS) {
			aml_errno = -aml_error;
			return NULL;
		}
	}

	// Actual allocation
	if (ptr == NULL) {
		cuda_error = cudaMallocManaged(&ret, size, data->flags);
	} else {
		// ptr is allocated cpu memory. Then we have to map it on
		// device memory.
		cuda_error = cudaHostRegister(ptr, size,
					      cudaHostRegisterPortable);
		// Do NOT return early on failure here: the device selected
		// above must be restored first. (The previous code returned
		// early and leaked the device selection; the error code is
		// handled after the restore below.)
		if (cuda_error == cudaSuccess)
			cuda_error = cudaHostGetDevicePointer(&ret, ptr, 0);
	}

	// Attempt to restore to original device.
	// If it fails, attempt to set aml_errno.
	// However, it might be overwritten when handling allocation
	// error code.
	if (data->device >= 0 && current_device != data->device) {
		aml_error = aml_set_cuda_device(current_device, NULL);
		aml_errno = aml_error != AML_SUCCESS ? -aml_error : aml_errno;
	}

	// Handle allocation error code.
	if (handle_malloc_error(cuda_error))
		return NULL;

	return ret;
}
/**
 * munmap hook for cuda areas; contract documented in aml/area/cuda.h.
 *
 * First tries cudaHostUnregister() (host-mapped memory); when the pointer
 * was not registered host memory, falls back to cudaFree() (managed
 * allocation).
 *
 * @param[in] area_data: unused.
 * @param[in, out] ptr: the memory to unmap.
 * @param[in] size: unused.
 * @return AML_SUCCESS, -AML_EINVAL on cudaErrorInvalidValue, -AML_FAILURE
 * (with the cuda error printed) on any other cuda error.
 **/
int aml_area_cuda_munmap(const struct aml_area_data *area_data,
			 void *ptr, const size_t size)
{
	int err;

	(void)(area_data);
	(void)(size);

	err = cudaHostUnregister(ptr);

	// Not host-registered memory: assume it came from
	// cudaMallocManaged() and free it instead.
	if (err == cudaErrorHostMemoryNotRegistered ||
	    err == cudaErrorInvalidValue)
		err = cudaFree(ptr);

	if (err == cudaSuccess)
		return AML_SUCCESS;
	if (err == cudaErrorInvalidValue)
		return -AML_EINVAL;

	printf("cudaError: %s\n", cudaGetErrorString(err));
	return -AML_FAILURE;
}
/*******************************************************************************
* Areas Initialization
******************************************************************************/
/**
 * Cuda area creation; contract documented in aml/area/cuda.h.
 *
 * Allocates an aml_area plus its aml_area_cuda_data in one block,
 * translates the aml flags to cuda flags and records the target device.
 *
 * @param[out] area: receives the new area on success.
 * @param[in] device: cuda device id; stored as -1 when out of range.
 * @param[in] flags: see enum aml_area_cuda_flags.
 * @return AML_SUCCESS, -AML_FAILURE if the device count query fails,
 * -AML_ENOMEM on allocation failure.
 **/
int aml_area_cuda_create(struct aml_area **area,
			 const int device,
			 const enum aml_area_cuda_flags flags)
{
	int num_devices;
	struct aml_area *out;
	struct aml_area_cuda_data *out_data;

	// Need the device count to validate the requested device id.
	if (cudaGetDeviceCount(&num_devices) != cudaSuccess)
		return -AML_FAILURE;

	// Area header and its data share a single allocation.
	out = AML_INNER_MALLOC_2(struct aml_area, struct aml_area_cuda_data);
	if (out == NULL)
		return -AML_ENOMEM;
	out_data = AML_INNER_MALLOC_NEXTPTR(out,
					    struct aml_area,
					    struct aml_area_cuda_data);

	out->ops = &aml_area_cuda_ops;
	out->data = (struct aml_area_data *)out_data;

	// Translate aml flags to cuda flags; anything other than
	// ATTACH_GLOBAL (including unknown values) maps to cudaMemAttachHost.
	if (flags == AML_AREA_CUDA_ATTACH_GLOBAL)
		out_data->flags = cudaMemAttachGlobal;
	else
		out_data->flags = cudaMemAttachHost;

	// Out-of-range ids are stored as -1 (no device selection at mmap).
	// NOTE(review): the header documents -AML_EINVAL for
	// device >= num_devices; behavior kept as-is here — confirm intent.
	if (device < 0 || device >= num_devices)
		out_data->device = -1;
	else
		out_data->device = device;

	*area = out;
	return AML_SUCCESS;
}
void
aml_area_cuda_destroy
(
struct
aml_area
**
area
)
{