Commit bfc976af authored by Swann Perarnau's avatar Swann Perarnau

Merge branch 'staging' into 'master'

Sync-up master and staging

See merge request !99
parents 80431541 e6dc5dd3
Pipeline #8966 passed with stages
in 10 minutes and 7 seconds
# Pipeline-wide variables holding scheduler/account settings for the NMC and
# Theta sites.
# NOTE(review): none of the job scripts below reference these variables;
# presumably they are consumed by the site CI runners themselves -- confirm.
variables:
NMC_FE1_SLURM_PARAMETERS: "--nodes=1 --partition=ecp-p9-4v100"
ANL_THETA_PROJECT_SERVICE_USER: "cscstss"
ANL_THETA_SCHEDULER_PARAMETERS: "-A CSC250STPR19 -n 1 -t 20 -q debug-flat-quad"
# The pipeline has a single stage; every job below belongs to it.
stages:
- build
# Diagnostic job: dumps information about the "nmc" runner environment
# (kernel, available modules, PCI devices, hardware topology, environment
# variables and the gcc in use). It builds nothing.
nmc:batch:env:
stage: build
tags:
- nmc
# Only runs on the master and staging refs, and only when the ECP_MIRROR
# variable is set to "NMC".
only:
refs:
- master
- staging
variables:
- $ECP_MIRROR == "NMC"
script:
- uname -a
- module avail
- lspci
- module load hwloc numactl cuda
# Listed a second time, after the module load above.
- module avail
- lstopo
- env
- which gcc
- gcc --version
- compgen -c gcc
# Build and test job on the "nmc" runner using a plain ./configure, i.e.
# without passing any --with-cuda option.
make:nmc-cuda-check:
stage: build
tags:
- nmc
# Only runs on the master and staging refs, and only when the ECP_MIRROR
# variable is set to "NMC".
only:
refs:
- master
- staging
variables:
- $ECP_MIRROR == "NMC"
script:
- module load numactl hwloc cuda
# Print the cuda-related environment and the process status for debugging.
- env | grep -i cuda
- cat /proc/self/status
- ./autogen.sh
- ./configure
- make -j
- make check VERBOSE=1
# Keep the test-suite and configure logs only when the job fails.
artifacts:
when: on_failure
paths:
- tests/test-suite.log
- config.log
# Build and test job on the "nmc" runner that passes the cuda installation
# directory explicitly to configure through --with-cuda=$CUDA_HOME.
make:nmc-cuda-home:
stage: build
tags:
- nmc
# Only runs on the master and staging refs, and only when the ECP_MIRROR
# variable is set to "NMC".
only:
refs:
- master
- staging
variables:
- $ECP_MIRROR == "NMC"
script:
- module load numactl hwloc cuda
# Print the cuda-related environment for debugging.
- env | grep -i cuda
- ./autogen.sh
- ./configure --with-cuda=$CUDA_HOME
- make -j
- make check VERBOSE=1
# Keep the test-suite and configure logs only when the job fails.
artifacts:
when: on_failure
paths:
- tests/test-suite.log
- config.log
# Build and test job for Theta: the build commands are written to a helper
# script (aml-ci-script.sh) through a here-document, and that script is then
# launched with aprun.
make:theta-batch:
stage: build
tags:
- ecp-theta
- batch
# Only runs on the master and staging refs, and only when the ECP_MIRROR
# variable is set to "THETA".
only:
refs:
- master
- staging
variables:
- $ECP_MIRROR == "THETA"
script:
- |
cat > aml-ci-script.sh << EOF
#!/bin/bash
module list
./autogen.sh
./configure
make -j63
make check VERBOSE=1
EOF
- cat aml-ci-script.sh
- chmod +x aml-ci-script.sh
- aprun aml-ci-script.sh
# Keep the test logs and the configure log only when the job fails.
artifacts:
when: on_failure
paths:
- tests/*.log
- config.log
......@@ -16,7 +16,12 @@ repoquality:
- git ls-files *.c *.h | grep -v -e benchmarks > .repoquality
- nix run -f "$ARGOPKGS" repoquality --command repoquality
tags:
- integration
- nix
- kvm
artifacts:
when: on_failure
paths:
- .repoquality
checkpatch:
stage: style
......@@ -24,7 +29,8 @@ checkpatch:
- /^wip.*/
- /^WIP.*/
tags:
- integration
- nix
- kvm
script:
- git ls-files *.c *.h | grep -v -e benchmarks >> .checkpatch.conf
- nix run -f "$ARGOPKGS" checkpatch --command checkpatch.pl
......@@ -35,7 +41,8 @@ style:docs:
- /^wip.*/
- /^WIP.*/
tags:
- integration
- nix
- kvm
script:
- |
nix-shell "$ARGOPKGS" -A aml-dist --arg aml-src ./. --run bash << EOF
......@@ -50,6 +57,8 @@ style:docs:
- config.log
make:generic:
tags:
- knl
stage: build
except:
- /^wip.*/
......@@ -70,6 +79,8 @@ make:generic:
- tests/*.log
make:out-of-tree:
tags:
- knl
stage: build
except:
- /^wip.*/
......@@ -115,30 +126,15 @@ readthedocs:
stage: docs
when: on_success
only:
- staging
- master
- /v[0-9]+\.[0-9]+\.x/
tags:
- integration
- nix
- kvm
environment:
name: rtd/$CI_COMMIT_REF_NAME
url: https://argo-aml.readthedocs.io/en/$CI_COMMIT_REF_NAME
script:
- nix run nixpkgs.curl -c curl -X POST -d "branch=$CI_COMMIT_REF_NAME" -d "token=$READTHEDOCS_TOKEN" https://readthedocs.org/api/v2/webhook/argo-aml/83161/
dist:
stage: release
except:
- /^wip.*/
- /^WIP.*/
when: on_success
only:
- tags
tags:
- integration
script:
- nix-build "$ARGOPKGS" -A aml-dist --arg aml-src ./.
- nix-shell "$ARGOPKGS" -A aml-dist --arg aml-src ./. --run "./release.sh CREATE $CI_JOB_ID $CI_PROJECT_ID $RELEASE_TOKEN $CI_COMMIT_REF_NAME"
artifacts:
when: on_success
paths:
- result/*.tar.gz
- CHECKSUM
expire_in: 1000y
- nix run nixpkgs.curl -c curl -X POST -d "branches=$CI_COMMIT_REF_NAME" -d "token=$READTHEDOCS_TOKEN" https://readthedocs.org/api/v2/webhook/argo-aml/83161/
......@@ -6,3 +6,4 @@ Nicolas Denoyelle <ndenoyelle@anl.gov>
Clement Foyer <cfoyer@cray.com>
Brice Videau <bvideau@anl.gov>
Aleksandr Danilin <danilin96@gmail.com>
Kyle Shaver <kshaver@anl.gov>
......@@ -104,46 +104,77 @@ if [[ "x$docs" = xtrue ]]; then
fi
# Automake conditional enabling the documentation build when docs is "true".
# The comparison uses "=": the "==" operator is a bash extension that is not
# understood by every POSIX shell that may run configure.
AM_CONDITIONAL([BUILD_DOCS],[ test "x$docs" = xtrue ])
# check nvidia compiler and libraries
# check CUDA compiler and libraries
#####################################
BUILD_CUDA=no
AC_DEFINE([HAVE_CUDA], [0], [Whether aml support cuda library calls.])
AC_DEFINE([RUN_CUDA], [0], [Whether the machine on which aml is compiled can run cuda code.])
# Check compilation features
AC_CHECK_PROG([NVCC], [nvcc], [nvcc], [no])
AC_CHECK_LIB(cudart, cudaMalloc, [CUDART=yes], [CUDART=no])
AC_CHECK_HEADERS([cuda.h], [CUDA_H=yes], [CUDA_H=no])
AC_CHECK_HEADERS([cuda_runtime.h], [CUDA_RUNTIME_H=yes], [CUDA_RUNTIME_H=no])
if [[ "x$NVCC" != xno ]] && \
[[ "x$CUDART" = xyes ]] && \
[[ "x$CUDA_H" = xyes ]] && \
[[ "x$CUDA_RUNTIME_H" = xyes ]]
then
BUILD_CUDA=yes
AC_DEFINE([HAVE_CUDA], [1], [Whether aml support cuda library calls.])
have_cuda=0
AC_ARG_WITH([cuda],
[AS_HELP_STRING([--with-cuda@<:@=yes|no|DIR@:>@],
[support cuda inside the library (default is check)])],
[
if [[ "x$withval" = xno ]]; then
want_cuda="no"
elif [[ "x$withval" = xyes ]]; then
want_cuda="yes"
cuda_home_path="$CUDA_HOME"
else
want_cuda="yes"
cuda_home_path=$withval
fi
],
[
want_cuda="check"
cuda_home_path="$CUDA_HOME"
])
if [[ "x$want_cuda" != xno ]]; then
AC_MSG_NOTICE([starting checks for CUDA])
if [[ -n "$cuda_home_path" ]]; then
nvcc_search_dirs="$PATH$PATH_SEPARATOR$cuda_home_path/bin"
else
nvcc_search_dirs="$PATH"
fi
AC_PATH_PROG([NVCC], [nvcc], [], [$nvcc_search_dirs])
if [[ -n "$NVCC" ]]; then
have_nvcc="yes"
fi
else
AC_MSG_NOTICE([will not check for CUDA])
fi
AM_CONDITIONAL([BUILD_CUDA],[ test "x$BUILD_CUDA" = xyes ])
# Check runtime features
if [[ "x$BUILD_CUDA" = xyes ]]; then
LIBS="$LIBS -lcudart"
RUN_CUDA=no
AC_MSG_CHECKING([that cudart code runs without error])
AC_RUN_IFELSE(
[AC_LANG_PROGRAM([[
#include <cuda.h>
#include <cuda_runtime.h>]],
[int device; return cudaGetDevice(&device) == cudaSuccess ? 0 : 1;])],
[AC_DEFINE([RUN_CUDA], [1], [Whether the machine on which aml is compiled can run cuda code.])
RUN_CUDA=yes],[])
AC_MSG_RESULT($RUN_CUDA)
if [[ "x$have_nvcc" = xyes ]]; then
if [[ -n "$cuda_home_path" ]]; then
CUDA_CFLAGS="-I$cuda_home_path/include"
CUDA_LIBS="-L$cuda_home_path/lib64 -lcudart"
else
CUDA_CFLAGS="-I/usr/local/cuda/include"
CUDA_LIBS="-L/usr/local/cuda/lib64 -lcudart"
fi
saved_LIBS=$LIBS
saved_CFLAGS=$CFLAGS
LIBS="$LIBS $CUDA_LIBS"
CFLAGS="$CFLAGS $CUDA_CFLAGS"
AC_CHECK_HEADER([cuda.h],,
[AC_MSG_ERROR([could not find cuda.h])])
AC_CHECK_HEADER([cuda_runtime.h],,
[AC_MSG_ERROR([could not find cuda_runtime.h])])
AC_CHECK_LIB(cudart, cudaMalloc,,
AC_MSG_ERROR([could not find cudart library]))
LIBS=$saved_LIBS
CFLAGS=$saved_CFLAGS
have_cuda=1
fi
AM_CONDITIONAL([RUN_CUDA],[ test "x$RUN_CUDA" = xyes ])
# Export the result of the CUDA detection:
# - HAVE_CUDA as a config.h macro and as a substituted variable,
# - HAVE_CUDA as an automake conditional,
# - the compiler/linker flags and the nvcc path for the Makefiles.
AC_DEFINE_UNQUOTED([HAVE_CUDA], [$have_cuda], [Whether aml support cuda library calls.])
AC_SUBST([HAVE_CUDA],[$have_cuda])
# The comparison uses "=": the "==" operator is a bash extension that is not
# understood by every POSIX shell that may run configure, in which case the
# conditional would evaluate to false even when cuda was detected.
AM_CONDITIONAL([HAVE_CUDA], [ test "$have_cuda" = "1" ])
AC_SUBST(CUDA_CFLAGS)
AC_SUBST(CUDA_LIBS)
AC_SUBST(NVCC)
# Output
########
......@@ -156,7 +187,8 @@ AC_CONFIG_FILES([Makefile
tests/Makefile
doc/Makefile
benchmarks/Makefile
o2lo
aml.pc
include/aml/utils/version.h], [chmod +x o2lo])
include/aml/utils/version.h
include/aml/utils/features.h])
AC_CONFIG_FILES([o2lo], [chmod +x o2lo])
AC_OUTPUT
......@@ -34,5 +34,6 @@ include_amlutils_HEADERS = \
aml/utils/error.h \
aml/utils/inner-malloc.h \
aml/utils/vector.h \
aml/utils/version.h
aml/utils/version.h \
aml/utils/features.h
......@@ -38,6 +38,7 @@
#include "aml/utils/inner-malloc.h"
#include "aml/utils/vector.h"
#include "aml/utils/version.h"
#include "aml/utils/features.h"
////////////////////////////////////////////////////////////////////////////////
......
......@@ -32,6 +32,10 @@
* Structure of a dense layout.
**/
struct aml_layout_dense {
/** base pointer of the address range **/
void *ptr;
/** number of dimensions **/
size_t ndims;
/**
* dimensions, in element size, of the data structure,
* by order of appearance in memory.
......@@ -42,20 +46,11 @@ struct aml_layout_dense {
* Offset in number of elements.
**/
size_t *stride;
/**
* distances between two elements of the next dimension
* (or total dimension of the layout in this dimension).
**/
size_t *pitch;
/**
* cumulative distances between two elements in the same
* dimension (pitch[0] is the element size in bytes).
**/
size_t *cpitch;
/** base pointer of the address range **/
void *ptr;
/** number of dimensions **/
size_t ndims;
};
/**
......
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
#ifndef AML_FEATURES_H
#define AML_FEATURES_H
/**
* @defgroup aml_features "AML Features Detection API"
* @brief AML Features Detection API
*
* This module provides the compile time and runtime detection of backends
* used in AML.
* This feature is useful, for instance, when testing AML: when AML is compiled
* with cuda backend support but runs on a machine without cuda devices, we can
* disable cuda tests.
* @{
**/
/** Whether aml had libnuma at compile time. (always true) **/
#define AML_HAVE_BACKEND_LIBNUMA 1
/**
* Whether aml had cuda capabilities at compile time.
* The value is substituted by configure when this header is generated.
**/
#define AML_HAVE_BACKEND_CUDA @HAVE_CUDA@
/** Flag for checking runtime support for libnuma **/
#define AML_BACKEND_LIBNUMA (1UL<<1)
/** Flag for checking runtime support for cuda **/
#define AML_BACKEND_CUDA (1UL<<2)
/**
* Check if a set of backends can be used at runtime.
*
* Checking for cuda and libnuma support at runtime:
* ```
* if (aml_support_backends(AML_BACKEND_LIBNUMA | AML_BACKEND_CUDA))
* { ... }
* ```
* @param[in] backends: Bitwise OR of the AML_BACKEND_* flags to check.
* @return 1 if all the requested backends work, else 0.
**/
int aml_support_backends(const unsigned long backends);
/**
* @}
**/
#endif // AML_FEATURES_H
......@@ -34,7 +34,8 @@ TILING_SOURCES = \
UTILS_SOURCES = \
utils/bitmap.c \
utils/error.c \
utils/vector.c
utils/vector.c \
utils/features.c
LIB_SOURCES = \
$(AREA_SOURCES) \
......@@ -54,11 +55,13 @@ libaml_la_SOURCES=$(LIB_SOURCES)
#############################################
# Cuda sources
if BUILD_CUDA
if HAVE_CUDA
AM_CPPFLAGS += $(CUDA_CFLAGS)
AM_LDFLAGS += $(CUDA_LIBS)
# Build .c sources using cuda runtime library.
libaml_la_SOURCES+=area/cuda.c
libaml_la_LDFLAGS+=-lcudart
# Build .cu sources containing device code.
#
......
......@@ -217,15 +217,16 @@ int aml_area_linux_create(struct aml_area **area,
/* check if the nodemask is compatible with the nodeset */
if (nodemask != NULL) {
int aml_last = aml_bitmap_last(nodemask);
int allowed_last = numa_bitmask_weight(data->nodeset);
for (int i = 0; i < AML_BITMAP_MAX; i++) {
int ours, theirs;
while (!numa_bitmask_isbitset(data->nodeset, --allowed_last))
;
ours = aml_bitmap_isset(nodemask, i);
theirs = numa_bitmask_isbitset(data->nodeset, i);
if (aml_last > allowed_last) {
err = -AML_EDOM;
goto err_f_node;
if (ours && !theirs) {
err = -AML_EDOM;
goto err_f_node;
}
}
aml_bitmap_copy_to_ulong(nodemask,
data->nodeset->maskp,
......
......@@ -20,7 +20,7 @@ static int aml_layout_dense_alloc(struct aml_layout **ret,
layout = AML_INNER_MALLOC_EXTRA(struct aml_layout,
struct aml_layout_dense,
size_t, 4*ndims);
size_t, 3*ndims);
if (layout == NULL) {
*ret = NULL;
return -AML_ENOMEM;
......@@ -30,6 +30,10 @@ static int aml_layout_dense_alloc(struct aml_layout **ret,
struct aml_layout,
struct aml_layout_dense);
layout->data = (struct aml_layout_data *) data;
data->ptr = NULL;
data->ndims = ndims;
data->dims = AML_INNER_MALLOC_EXTRA_NEXTPTR(layout,
struct aml_layout,
struct aml_layout_dense,
......@@ -41,16 +45,10 @@ static int aml_layout_dense_alloc(struct aml_layout **ret,
for (size_t i = 0; i < ndims; i++)
data->stride[i] = 1;
data->pitch = AML_INNER_MALLOC_EXTRA_NEXTPTR(layout,
struct aml_layout,
struct aml_layout_dense,
size_t, ndims*2);
data->cpitch = AML_INNER_MALLOC_EXTRA_NEXTPTR(layout,
struct aml_layout,
struct aml_layout_dense,
size_t, ndims*3);
data->ptr = NULL;
data->ndims = ndims;
size_t, ndims*2);
*ret = layout;
return AML_SUCCESS;
}
......@@ -68,9 +66,7 @@ void aml_layout_dense_init_cpitch(struct aml_layout *layout,
data->ptr = ptr;
memcpy(data->dims, dims, ndims * sizeof(size_t));
memcpy(data->stride, stride, ndims * sizeof(size_t));
memcpy(data->cpitch, cpitch, (ndims + 1) * sizeof(size_t));
for (size_t i = 0; i < ndims; i++)
data->pitch[i] = cpitch[i+1]/cpitch[i];
memcpy(data->cpitch, cpitch, ndims * sizeof(size_t));
}
int aml_layout_dense_create(struct aml_layout **layout,
......@@ -98,6 +94,7 @@ int aml_layout_dense_create(struct aml_layout **layout,
data = (struct aml_layout_dense *)l->data;
data->ptr = ptr;
data->cpitch[0] = element_size;
size_t _pitch[ndims];
switch (AML_LAYOUT_ORDER(order)) {
......@@ -108,9 +105,9 @@ int aml_layout_dense_create(struct aml_layout **layout,
if (stride)
data->stride[i] = stride[ndims-i-1];
if (pitch)
data->pitch[i] = pitch[ndims-i-1];
_pitch[i] = pitch[ndims-i-1];
else
data->pitch[i] = dims[ndims-i-1];
_pitch[i] = dims[ndims-i-1];
}
break;
......@@ -120,17 +117,17 @@ int aml_layout_dense_create(struct aml_layout **layout,
if (stride)
memcpy(data->stride, stride, ndims * sizeof(size_t));
if (pitch)
memcpy(data->pitch, pitch, ndims * sizeof(size_t));
memcpy(_pitch, pitch, ndims * sizeof(size_t));
else
memcpy(data->pitch, dims, ndims * sizeof(size_t));
memcpy(_pitch, dims, ndims * sizeof(size_t));
break;
default:
free(l);
return -AML_EINVAL;
}
for (size_t i = 1; i <= ndims; i++)
data->cpitch[i] = data->cpitch[i-1]*data->pitch[i-1];
for (size_t i = 1; i < ndims; i++)
data->cpitch[i] = data->cpitch[i-1]*_pitch[i-1];
*layout = l;
return AML_SUCCESS;
......@@ -236,7 +233,7 @@ static void merge_dims(const size_t ndims,
}
dim_index++;
}
new_cpitch[new_dim_index + 1] = cpitch[dim_index + 1];
new_cpitch[new_dim_index + 1] = 0;
*new_ndims = new_dim_index + 1;
}
......@@ -256,6 +253,7 @@ static int reshape_dims(const struct aml_layout_dense *d,
size_t m_ndims;
size_t m_dims[d->ndims];
size_t m_stride[d->ndims];
/* for simplicity, the underlying algorithm needs one more slot */
size_t m_cpitch[d->ndims + 1];
/* First obtain a canonical representation of the layout
......@@ -306,6 +304,7 @@ int aml_layout_column_reshape(const struct aml_layout_data *data,
struct aml_layout *layout;
const struct aml_layout_dense *d;
size_t stride[ndims];
/* for simplicity, the underlying algorithm needs one more slot */
size_t cpitch[ndims + 1];
d = (const struct aml_layout_dense *)data;
......@@ -350,16 +349,12 @@ int aml_layout_column_slice(const struct aml_layout_data *data,
if (err)
return err;
size_t cpitch[d->ndims + 1];
size_t cpitch[d->ndims];
size_t new_strides[d->ndims];
cpitch[d->ndims] = d->cpitch[d->ndims];
for (size_t i = 0; i < d->ndims; i++) {
cpitch[i] = d->cpitch[i];
new_strides[i] = strides[i] * d->stride[i];
cpitch[d->ndims] -= cpitch[i] * offsets[i] * d->stride[i];
}
aml_layout_dense_init_cpitch(layout,
......@@ -433,6 +428,7 @@ int aml_layout_row_reshape(const struct aml_layout_data *data,
struct aml_layout *layout;
const struct aml_layout_dense *d;
size_t stride[ndims];
/* for simplicity, the underlying algorithm needs one more slot */
size_t cpitch[ndims + 1];
size_t n_dims[ndims];
int err;
......@@ -476,7 +472,7 @@ int aml_layout_row_slice(const struct aml_layout_data *data,
d = (const struct aml_layout_dense *)data;
size_t cpitch[d->ndims + 1];
size_t cpitch[d->ndims];
size_t n_offsets[d->ndims];
size_t n_dims[d->ndims];
size_t n_strides[d->ndims];
......@@ -491,11 +487,9 @@ int aml_layout_row_slice(const struct aml_layout_data *data,
n_strides[i] = strides[d->ndims - i - 1];
}
cpitch[d->ndims] = d->cpitch[d->ndims];
for (size_t i = 0; i < d->ndims; i++) {
cpitch[i] = d->cpitch[i];
n_strides[i] *= d->stride[i];
cpitch[d->ndims] -= cpitch[i] * n_offsets[i] * d->stride[i];
}
ptr = aml_layout_column_deref(data, n_offsets);
......@@ -524,18 +518,16 @@ int aml_layout_row_slice_native(const struct aml_layout_data *data,
d = (const struct aml_layout_dense *)data;
size_t cpitch[d->ndims + 1];
size_t cpitch[d->ndims];
size_t new_strides[d->ndims];
err = aml_layout_dense_alloc(&layout, d->ndims);
if (err)
return err;
cpitch[d->ndims] = d->cpitch[d->ndims];
for (size_t i = 0; i < d->ndims; i++) {
cpitch[i] = d->cpitch[i];
new_strides[i] = strides[i] * d->stride[i];
cpitch[d->ndims] -= cpitch[i] * offsets[i] * d->stride[i];
}
ptr = aml_layout_column_deref(data, offsets);
......
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
#include "aml/utils/features.h"
#include "config.h"
#if HAVE_CUDA == 1
#include <cuda.h>
#include <cuda_runtime.h>
#endif
/**
 * Runtime probe for cuda.
 * @return 1 when aml was built with cuda support and the cuda runtime
 * reports at least one device, else 0.
 **/
static int aml_support_cuda(void)
{
#if HAVE_CUDA == 0
	// Built without cuda: nothing to probe.
	return 0;
#else
	int ndevices = 0;
	const cudaError_t status = cudaGetDeviceCount(&ndevices);

	// Both a successful query and a positive device count are required.
	return (status == cudaSuccess && ndevices > 0) ? 1 : 0;
#endif
}
/**
 * Tell whether the requested backends are usable.
 * Only the cuda flag triggers a check; any other bit is accepted as is.
 * @param[in] backends: bitwise OR of AML_BACKEND_* flags.
 * @return 0 if cuda is requested but unavailable, else 1.
 **/
int aml_support_backends(const unsigned long backends)
{
	const int wants_cuda = (backends & AML_BACKEND_CUDA) != 0;

	// Nothing to verify when cuda is not part of the request.
	if (!wants_cuda)
		return 1;

	// Cuda needs both compile time support and a working runtime.
	if (AML_HAVE_BACKEND_CUDA && aml_support_cuda())
		return 1;

	return 0;
}
......@@ -2,6 +2,23 @@ AM_COLOR_TESTS = yes
AM_CFLAGS = -I$(top_srcdir)/include $(PTHREAD_CFLAGS)
AM_LDFLAGS = ../src/libaml.la $(PTHREAD_LIBS)
if HAVE_CUDA
# The cuda flags are appended to LIBS on purpose, in addition to AM_LDFLAGS:
# automake puts AM_LDFLAGS on the link line before the objects and LDADD,
# whereas LIBS comes after LDADD. Flags given only through AM_LDFLAGS would
# therefore precede libaml.la and the tests may end up not linked with cuda.
# NOTE(review): CUDA_CFLAGS holds include flags, which have no effect in
# LIBS; it is kept here as in the original and could probably be dropped.
LIBS += $(CUDA_CFLAGS)
LIBS += $(CUDA_LIBS)
AM_CFLAGS += $(CUDA_CFLAGS)
AM_LDFLAGS += $(CUDA_LIBS)
endif
# valgrind support
if TEST_VALGRIND
......@@ -18,7 +35,22 @@ AREA_TESTS = \
area/test_area \
area/test_linux
LAYOUT_TESTS = layout/test_layout
noinst_LTLIBRARIES = liblayout_test.la
liblayout_test_la_CPPFLAGS =
liblayout_test_la_LDFLAGS =
liblayout_test_la_SOURCES = \
layout/coords.c \
layout/dense.c \
layout/layout.c \
layout/reshape.c
liblayout_test_la_LIBADD = ../src/libaml.la
LDADD = liblayout_test.la
LAYOUT_TESTS = \
layout/test_coords \
layout/test_dense \
layout/test_reshape \
layout/test_pad
TILING_TESTS = tiling/test_tiling
......@@ -28,8 +60,7 @@ DMA_LINUX_TESTS = dma/test_dma_linux_seq \
SCRATCH_TESTS = scratch/test_scratch_seq \
scratch/test_scratch_par