...
 
......@@ -32,7 +32,8 @@ checkpatch:
- nix
- kvm
script:
- git ls-files *.c *.h | grep -v -e benchmarks >> .checkpatch.conf
- nix run -f "$ARGOPKGS" checkpatch --command checkpatch.pl --ignore TRAILING_SEMICOLON --ignore MULTISTATEMENT_MACRO_USE_DO_WHILE include/aml/utils/inner-malloc.h
- git ls-files *.c *.h | grep -v -e benchmarks | grep -v -e inner-malloc >> .checkpatch.conf
- nix run -f "$ARGOPKGS" checkpatch --command checkpatch.pl
style:docs:
......@@ -45,12 +46,12 @@ style:docs:
- kvm
script:
- |
nix-shell "$ARGOPKGS" -A aml-dist --arg aml-src ./. --run bash << EOF
./autogen.sh
mkdir build
./configure --prefix=`pwd`/build --enable-docs
make install-data
EOF
nix-shell "$ARGOPKGS" -A aml-dist --arg aml-src ./. --run bash <<< '
./autogen.sh
mkdir build
./configure --prefix=`pwd`/build --enable-docs
make install-data
'
artifacts:
when: on_failure
paths:
......@@ -58,7 +59,8 @@ style:docs:
make:generic:
tags:
- knl
- nix
- kvm
stage: build
except:
- /^wip.*/
......@@ -66,12 +68,15 @@ make:generic:
variables:
CFLAGS: "-std=c99 -pedantic -Wall -Wextra -Werror -Wno-unused-but-set-parameter"
script:
- ./autogen.sh
- mkdir build
- ./configure --prefix=`pwd`/build
- make
- make check
- make install
- |
nix-shell "$ARGOPKGS" -A aml --arg aml-src ./. --run bash <<< '
./autogen.sh
mkdir build
./configure --prefix=`pwd`/build
make
make check
make install
'
artifacts:
when: on_failure
paths:
......@@ -80,7 +85,8 @@ make:generic:
make:out-of-tree:
tags:
- knl
- nix
- kvm
stage: build
except:
- /^wip.*/
......@@ -88,13 +94,16 @@ make:out-of-tree:
variables:
CFLAGS: "-std=c99 -pedantic -Wall -Wextra -Werror -Wno-unused-but-set-parameter"
script:
- ./autogen.sh
- mkdir out
- cd out
- mkdir build
- ../configure --prefix=`pwd`/build
- make
- make check
- |
nix-shell "$ARGOPKGS" -A aml --arg aml-src ./. --run bash <<< '
./autogen.sh
mkdir out
cd out
mkdir build
../configure --prefix=`pwd`/build
make
make check
'
artifacts:
when: on_failure
paths:
......
include_HEADERS=aml.h
include_aml_areadir=$(includedir)/aml/area
include_aml_area_HEADERS = \
aml/area/linux.h \
aml/area/cuda.h
include_aml_area_HEADERS = aml/area/linux.h
include_aml_layoutdir=$(includedir)/aml/layout
include_aml_layout_HEADERS = \
......@@ -34,6 +32,12 @@ include_amlutils_HEADERS = \
aml/utils/error.h \
aml/utils/inner-malloc.h \
aml/utils/vector.h \
aml/utils/queue.h \
aml/utils/async.h \
aml/utils/version.h \
aml/utils/features.h
if HAVE_CUDA
include_aml_area_HEADERS += aml/area/cuda.h
include_aml_dma_HEADERS += aml/dma/cuda.h
endif
......@@ -37,6 +37,8 @@
#include "aml/utils/error.h"
#include "aml/utils/inner-malloc.h"
#include "aml/utils/vector.h"
#include "aml/utils/queue.h"
#include "aml/utils/async.h"
#include "aml/utils/version.h"
#include "aml/utils/features.h"
......@@ -730,13 +732,41 @@ struct aml_layout *aml_tiling_index_byid(const struct aml_tiling *tiling,
* @brief Management of low-level memory movements.
*
* AML DMA (inspired by Direct Memory Access engines) is an abstraction over the
* ability to move data between places. A DMA presents an interface that allows
* clients to create an asynchronous request to move data and to wait for this
* request to complete. Depending on the exact operation it is configured to do,
* the DMA might transform the data during the operation.
* ability to move data between areas and between layouts.
* @see aml_area
* @see aml_layout
* Through DMAs, AML exposes an interface allowing clients to instantiate
* asynchronous requests to copy data and to wait for these requests to complete.
* @see aml_dma_copy()
* @see aml_dma_async_copy()
* @see aml_dma_request
*
* The performance of a dma transfer depends strongly on the types of the
* source and destination layouts. For instance, a copy between contiguous
* address ranges can easily be vectorized, packed, etc.
* The optimization of such a copy is encapsulated by the aml_dma_operator
* abstraction: a function provided by the dma implementer and passed to
* aml_dma_copy() and aml_dma_async_copy() by the user.
* With the knowledge of the source and destination layouts, the user can
* select a good aml_dma_operator to make the copy faster.
* @see aml_dma_operator
*
* Implementations are mostly responsible for providing access to various types
* of execution engines for the data movement itself.
* Implementing a DMA requires implementing four abstractions:
* * struct aml_dma_request
* * struct aml_dma_data
* * struct aml_dma_ops
* * fn aml_dma_operator
* A DMA engine is an implementation of a struct aml_dma that is then
* passed to the aml_dma_*() functions of the high level interface.
* The struct contains an internal data handle (aml_dma_data) to manage the
* engine, and a set of functions (aml_dma_ops) required to implement the
* high level interface. aml_dma_ops creates an aml_dma_request
* to handle each transfer. A request manages the transfer performed by an
* aml_dma_operator. When a request is initiated, the user needs to select an
* efficient aml_dma_operator, typically one provided by the dma
* implementation.
* @see aml_dma_data
* @see aml_dma_ops
*
* @image html dma.png width=600
* @{
......@@ -745,52 +775,43 @@ struct aml_layout *aml_tiling_index_byid(const struct aml_tiling *tiling,
////////////////////////////////////////////////////////////////////////////////
/**
* Internal macros used for tracking DMA request types.
* Invalid request type. Used for marking inactive requests in the vector.
**/
#define AML_DMA_REQUEST_TYPE_INVALID -1
/**
* The request is in the format (dest layout, src layout)
**/
#define AML_DMA_REQUEST_TYPE_LAYOUT 0
/**
* aml_dma is mainly used to asynchronously move data.
* aml_dma_request is an opaque structure containing information
* about an ongoing data movement request in a dma operation.
* It is the handle output by aml_dma_async_copy(); use it to wait for
* dma completion and to check for errors.
* Implementers of struct aml_dma_ops will populate and destroy requests.
* @see aml_dma_ops
* @see aml_dma_async_copy()
**/
struct aml_dma_request;
/**
* Opaque handle implemented by each aml_dma implementation.
* Internal data, specific to a dma engine implementation.
* Should not be used by end-users.
**/
struct aml_dma_data;
/**
* Type of the function used to perform the DMA between two layouts.
* @param dst: destination layout
* @param src: source layout
* @param arg: extra argument needed by the operator
* Function called in a request to perform the copy between two areas and two
* layouts.
* @param[in] dst: destination layout
* @param[in] src: source layout
* @param[in] arg: extra argument needed by the operator.
* See operator implementers for more details.
* @param[out] out: Output of the operator.
* See operator implementers for more details.
**/
typedef int (*aml_dma_operator)(struct aml_layout *dst,
const struct aml_layout *src, void *arg);
/**
aml_dma_ops is a structure containing operations for a specific
* aml_dma implementation.
* These operations are detailed in the structure.
* They are specific in:
* - the type of aml_area source and destination,
* - the progress engine performing the operation,
* - the type of source and destination data structures.
*
* Each different combination of these three points may require a different
* set of dma operations.
**/
const struct aml_layout *src,
void *arg, void **out);
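/**
 * For illustration only (not part of the patch): a minimal sketch of a
 * custom operator matching this signature. It ignores the extra argument
 * and the output, and forwards to the generic layout-to-layout copy
 * declared in aml/dma/linux-seq.h by this patch.
 * @code
 * int my_layout_copy_op(struct aml_layout *dst,
 *                       const struct aml_layout *src,
 *                       void *arg, void **out)
 * {
 *         (void)arg;
 *         (void)out;
 *         return aml_copy_layout_generic(dst, src, NULL, NULL);
 * }
 * @endcode
 **/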
/**
* aml_dma_ops is a structure containing operations for a specific
* aml_dma implementation. These operations are detailed in the structure.
* They are specific in:
* - the type of aml_area source and destination,
* - the progress engine performing the operation (thread, hardware dma, ...).
* Each different combination of these two points may require a different
* set of dma operations.
**/
struct aml_dma_ops {
/**
* Initiate a data movement, from a source pointer to a destination
......@@ -832,11 +853,9 @@ struct aml_dma_ops {
};
/**
* aml_dma is an abstraction for (asynchronously) moving data
* from one area to another. The implementation of dma to use
* depends on the source and destination areas. The appropriate
* dma choice is delegated to the user.
* @see struct aml_area.
* This is the high level structure containing the implementation
* specific data and methods.
* @see struct aml_dma.
**/
struct aml_dma {
/** @see aml_dma_ops **/
......@@ -854,8 +873,10 @@ struct aml_dma {
* @param op_arg: optional argument to the operator
* @return 0 if successful; an error code otherwise.
**/
int aml_dma_copy_custom(struct aml_dma *dma, struct aml_layout *dest,
struct aml_layout *src, aml_dma_operator op, void *op_arg);
int aml_dma_copy(struct aml_dma *dma,
struct aml_layout *dest,
struct aml_layout *src,
aml_dma_operator op, void *op_arg);
/**
* Requests a data copy between two different buffers. This is an asynchronous
......@@ -869,14 +890,15 @@ int aml_dma_copy_custom(struct aml_dma *dma, struct aml_layout *dest,
* @param op_arg: optional argument to the operator
* @return 0 if successful; an error code otherwise.
**/
int aml_dma_async_copy_custom(struct aml_dma *dma, struct aml_dma_request **req,
int aml_dma_async_copy(struct aml_dma *dma,
struct aml_dma_request **req,
struct aml_layout *dest,
struct aml_layout *src,
aml_dma_operator op, void *op_arg);
#define aml_dma_copy(dma, d, s) aml_dma_copy_custom(dma, d, s, NULL, NULL)
#define aml_dma_async_copy(dma, r, d, s) \
aml_dma_async_copy_custom(dma, r, d, s, NULL, NULL)
#define aml_dma_copy_helper(dma, d, s) aml_dma_copy(dma, d, s, NULL, NULL)
#define aml_dma_async_copy_helper(dma, r, d, s) \
aml_dma_async_copy(dma, r, d, s, NULL, NULL)
/**
* Waits for an asynchronous DMA request to complete.
......@@ -894,16 +916,6 @@ int aml_dma_wait(struct aml_dma *dma, struct aml_dma_request **req);
**/
int aml_dma_cancel(struct aml_dma *dma, struct aml_dma_request **req);
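/**
 * Illustrative sketch of the high level interface (not part of the patch;
 * "dma", "dest" and "src" are assumed to have been created beforehand with
 * a dma engine and two layout implementations):
 * @code
 * // Synchronous copy; op and op_arg left NULL, as aml_dma_copy_helper() does.
 * aml_dma_copy(dma, dest, src, NULL, NULL);
 *
 * // Asynchronous copy: start the request, overlap other work, then wait.
 * struct aml_dma_request *req;
 * aml_dma_async_copy(dma, &req, dest, src, NULL, NULL);
 * // ... other work ...
 * aml_dma_wait(dma, &req);
 * @endcode
 **/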
/**
* Generic helper to copy from one layout to another.
* @param[out] dst: destination layout
* @param[in] src: source layout
* @param[in] arg: unused (should be NULL)
*/
int aml_copy_layout_generic(struct aml_layout *dst,
const struct aml_layout *src, void *arg);
////////////////////////////////////////////////////////////////////////////////
/**
......
......@@ -8,6 +8,9 @@
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
#ifndef AML_AREA_CUDA_NUMA_H
#define AML_AREA_CUDA_NUMA_H
/**
* @defgroup aml_area_cuda "AML Cuda Areas"
* @brief Cuda Implementation of Areas.
......@@ -226,3 +229,5 @@ aml_area_cuda_munmap(const struct aml_area_data *area_data,
/**
* @}
**/
#endif // AML_AREA_CUDA_NUMA_H
......@@ -21,6 +21,17 @@
* @{
**/
/**
* Internal macros used for tracking DMA request types.
* Invalid request type. Used for marking inactive requests in the vector.
**/
#define AML_DMA_REQUEST_TYPE_INVALID -1
/**
* The request is in the format (dest layout, src layout)
**/
#define AML_DMA_REQUEST_TYPE_LAYOUT 0
/**
* Default table of dma request operations for linux
* sequential dma.
......@@ -112,6 +123,18 @@ void aml_dma_linux_seq_destroy(struct aml_dma **dma);
int aml_dma_linux_seq_do_copy(struct aml_dma_linux_seq_data *dma,
struct aml_dma_request_linux_seq *req);
/**
* Generic helper to copy from one layout to another.
* @param[out] dst: destination layout
* @param[in] src: source layout
* @param[in] arg: unused (should be NULL)
* @param[out] out: A pointer where to store output of the function.
*/
int aml_copy_layout_generic(struct aml_layout *dst,
const struct aml_layout *src,
void *arg,
void **out);
/**
* @}
**/
......
/******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
******************************************************************************/
#ifndef AML_AREA_LAYOUT_CUDA_H
#define AML_AREA_LAYOUT_CUDA_H
/**
* @defgroup aml_layout_cuda "AML Layout Cuda"
* @brief Layout on device pointer.
*
* The Cuda layout is a wrapper around another layout.
* All operations are deferred to the embedded layout.
* The deref operation of the embedded layout is used to compute the offset
* into the device pointer and return the appropriate offset.
* Operations on this layout cannot be used on the device side.
* However, the layout pointer (if it is a device pointer) can be used
* on the device side.
*
* @code
* #include <aml/layout/cuda.h>
* @endcode
* @see aml_layout
* @{
**/
#include <aml/area/cuda.h>
/** aml_layout data structure **/
struct aml_layout_cuda_data {
/** Pointer to data on device. **/
void *device_ptr;
/** Metadata on how the pointer was allocated. **/
struct aml_area_cuda_data *data;
/** user expected layout order **/
int order;
/** layout num dims **/
size_t ndims;
/** layout dims stored in row major order **/
size_t *dims;
/**
* Offset between elements of the same dimension.
* Offset is in number of elements.
**/
size_t *stride;
/**
* cumulative distances between two elements in the same
* dimension (pitch[0] is the element size in bytes).
**/
size_t *cpitch;
};
/**
* Create a new layout on device pointer with embedded layout.
* @param[out] out: A pointer to receive the newly allocated layout.
* @param[in] device_ptr: The pointer on which the layout has to work.
* @param[in] element_size: The size of elements in this layout.
* @param[in] order: Order of dimensions in the layout.
* @param[in] ndims: The number of dimensions in the layout.
* @param[in] dims: The dimensions in the layout.
* @param[in] stride: The empty space, in number of elements, between
* consecutive elements of the same dimension.
* @param[in] pitch: The distance, in number of elements, between two
* consecutive elements of the next dimension.
* @param[in] data: The area data used to allocate device_ptr.
* data is not owned by the layout and must live longer than the layout.
* data is used as an additional source of information on pointer
* when performing operations.
* @return AML_SUCCESS or -AML_ENOMEM if the memory allocation for layout
* failed.
**/
int aml_layout_cuda_create(struct aml_layout **out,
void *device_ptr,
const size_t element_size,
const int order,
const size_t ndims,
const size_t *dims,
const size_t *stride,
const size_t *pitch,
struct aml_area_cuda_data *data);
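/**
 * Illustrative sketch (not part of the patch; "dev_ptr" and "cuda_data" are
 * assumed to come from an aml_area_cuda allocation, and the sizes are
 * example values):
 * @code
 * struct aml_layout *layout;
 * size_t dims[2] = {16, 32};
 * size_t stride[2] = {1, 1};
 * size_t pitch[2] = {16, 32};
 *
 * aml_layout_cuda_create(&layout, dev_ptr, sizeof(double),
 *                        AML_LAYOUT_ORDER_ROW_MAJOR, 2,
 *                        dims, stride, pitch, cuda_data);
 * // ... use the layout in dma operations ...
 * aml_layout_cuda_destroy(&layout);
 * @endcode
 **/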
/**
* Destroy a layout obtained with aml_layout_cuda_create().
* @param[in, out] layout: A pointer to the layout to destroy.
* On exit, the pointer content is set to NULL.
* @return AML_SUCCESS or -AML_EINVAL if layout or *layout is NULL.
**/
int aml_layout_cuda_destroy(struct aml_layout **layout);
/** Always returns the pointer to device_ptr, whatever the coordinates. **/
void *aml_layout_cuda_deref(const struct aml_layout_data *data,
const size_t *coords);
/** Always returns the pointer to device_ptr, whatever the coordinates. **/
void *aml_layout_cuda_deref_native(const struct aml_layout_data *data,
const size_t *coords);
/** Returns layout order **/
int aml_layout_cuda_order(const struct aml_layout_data *data);
/** Copies layout dims with user order. **/
int aml_layout_cuda_dims(const struct aml_layout_data *data, size_t *dims);
/** Copies layout dims in row major order. **/
int aml_layout_cuda_dims_native(const struct aml_layout_data *data,
size_t *dims);
/** Returns the number of dimensions in the layout. **/
size_t aml_layout_cuda_ndims(const struct aml_layout_data *data);
/** Returns the size of an element in the layout. **/
size_t aml_layout_cuda_element_size(const struct aml_layout_data *data);
/** Cuda layout operations **/
extern struct aml_layout_ops aml_layout_cuda_ops;
/**
* @}
**/
#endif // AML_AREA_LAYOUT_CUDA_H
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
#ifndef AML_ASYNC_H
#define AML_ASYNC_H
/**
* @defgroup aml_async "AML Asynchronous work utils"
* @brief AML Asynchronous work utils
*
* This module is used internally in the library to manage asynchronous work.
* In particular, it defines a task abstraction and a work queue backed by a
* thread pool, used by dma operations to speed up transfers.
* @{
**/
//----------------------------------------------------------------------------//
// User task abstraction (see tests/utils/test_async.c)
//----------------------------------------------------------------------------//
/** Input to an asynchronous task **/
struct aml_task_in;
/** Output from an asynchronous task **/
struct aml_task_out;
/** Task meta data **/
struct aml_task_data;
/** Function to be executed in a task**/
typedef struct aml_task_out *(*aml_task_work) (struct aml_task_in *);
/** Task abstraction **/
struct aml_task {
/** Input **/
struct aml_task_in *in;
/** Where to store output **/
struct aml_task_out *out;
/** Work to do **/
aml_task_work fn;
/** Metadata **/
struct aml_task_data *data;
};
//----------------------------------------------------------------------------//
// Implementer abstraction
//----------------------------------------------------------------------------//
/** Metadata of a thread pool **/
struct aml_sched_data;
/** Methods that thread pools must implement **/
struct aml_sched_ops {
/** Submit a task to the pool **/
int (*submit)(struct aml_sched_data *data, struct aml_task *task);
/** Wait for a specific task to be completed **/
int (*wait)(struct aml_sched_data *data, struct aml_task *task);
/** Pull the next executed task from the pool **/
struct aml_task *(*wait_any)(struct aml_sched_data *data);
};
/** Thread pool abstraction **/
struct aml_sched {
/** Metadata **/
struct aml_sched_data *data;
/** Methods **/
struct aml_sched_ops *ops;
};
//----------------------------------------------------------------------------//
// User interface
//----------------------------------------------------------------------------//
/** Submit a task to the pool **/
int aml_sched_submit_task(struct aml_sched *pool, struct aml_task *task);
/** Wait for a specific task to be completed **/
int aml_sched_wait_task(struct aml_sched *pool, struct aml_task *task);
/** Pull the next executed task from the pool **/
struct aml_task *aml_sched_wait_any(struct aml_sched *pool);
//----------------------------------------------------------------------------//
// Simple task scheduler with pthread worker.
//----------------------------------------------------------------------------//
/**
* Create an active pool of "nt" threads to asynchronously run tasks queued
* in a FIFO queue.
* If nt == 0, then progress is made from the caller thread on
* aml_sched_wait_task() and aml_sched_wait_any().
**/
struct aml_sched *aml_active_sched_create(const size_t nt);
/** Destroy an active thread pool and set it to NULL **/
void aml_active_sched_destroy(struct aml_sched **sched);
/** Get the number of tasks pushed to the scheduler and not yet pulled out. **/
int aml_active_sched_num_tasks(struct aml_sched *sched);
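/**
 * Illustrative sketch of typical use (not part of the patch; the work
 * function is an example, aml_task_in/aml_task_out are treated as opaque
 * here, and the task metadata is assumed to be managed by the scheduler):
 * @code
 * static struct aml_task_out *work_fn(struct aml_task_in *in)
 * {
 *         (void)in;
 *         // ... do some work ...
 *         return NULL;
 * }
 *
 * struct aml_sched *sched = aml_active_sched_create(4);
 * struct aml_task task = { .in = NULL, .out = NULL,
 *                          .fn = work_fn, .data = NULL };
 *
 * aml_sched_submit_task(sched, &task);
 * aml_sched_wait_task(sched, &task);
 * aml_active_sched_destroy(&sched);
 * @endcode
 **/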
/**
* @}
**/
#endif //AML_ASYNC_H
......@@ -24,59 +24,154 @@
* This code is all macros to handle the type specific logic we need.
**/
/** Returns the allocation size required to handle two objects side-by-side.
*
* Use an anonymous struct to ask the compiler what size an allocation should be
* so that the second object is properly aligned too.
*/
#define AML_SIZEOF_ALIGNED(a, b) \
(sizeof(struct { a __e1; b __e2; }))
//---------------------------------------------------------------------------//
// Inner utils
//---------------------------------------------------------------------------//
/** Returns the offset of the second object when allocated side-by-side.
*
* Use the same anonymous struct trick to figure out what offset the pointer is
* at.
*/
#define AML_OFFSETOF_ALIGNED(a, b) \
(offsetof(struct { a __e1; b __e2; }, __e2))
// Stringify macro
#define STRINGIFY(a) STRINGIFY_(a)
#define STRINGIFY_(a) #a
/** Allocate a pointer that can be used to contain two types.
*
// Concatenate two arguments into a macro name
#define CONCATENATE(arg1, arg2) CONCATENATE1(arg1, arg2)
#define CONCATENATE1(arg1, arg2) CONCATENATE2(arg1, arg2)
#define CONCATENATE2(arg1, arg2) arg1##arg2
// Expand to number of variadic arguments for up to 8 args.
// VA_NARG(a,b,c)
// PP_ARG_N(a,b,c,8,7,6,5,4,3,2,1,0)
// 3
#define VA_NARG(...) PP_ARG_N(__VA_ARGS__, 8, 7, 6, 5, 4, 3, 2, 1, 0)
#define PP_ARG_N(_1, _2, _3, _4, _5, _6, _7, _8, N, ...) N
// Arithmetic
#define PLUS_1_1 2
#define PLUS_1_2 3
#define PLUS_1_3 4
#define PLUS_1_4 5
#define PLUS_1_5 6
#define PLUS_1_6 7
#define PLUS_1_7 8
#define PLUS_1(N) CONCATENATE(PLUS_1_, N)
// Field name in struct: __f1 for N = 1
#define AML_FIELD(N) CONCATENATE(__f, N)
// struct fields declaration.
// one field: f1 __f1;
// two fields: f2 __f1; f1 __f2;
// three fields: f3 __f1; f2 __f2; f1 __f3;
// We want fx fields to appear in the order of types provided by users.
// We want __fx names to appear in the reverse order, such that if the user
// wants the second fields it can name it with __f2.
#define AML_DECL_1(N, f1, ...) f1 AML_FIELD(N);
#define AML_DECL_2(N, f2, ...) \
f2 AML_FIELD(N); AML_DECL_1(PLUS_1(N), __VA_ARGS__)
#define AML_DECL_3(N, f3, ...) \
f3 AML_FIELD(N); AML_DECL_2(PLUS_1(N), __VA_ARGS__)
#define AML_DECL_4(N, f4, ...) \
f4 AML_FIELD(N); AML_DECL_3(PLUS_1(N), __VA_ARGS__)
#define AML_DECL_5(N, f5, ...) \
f5 AML_FIELD(N); AML_DECL_4(PLUS_1(N), __VA_ARGS__)
#define AML_DECL_6(N, f6, ...) \
f6 AML_FIELD(N); AML_DECL_5(PLUS_1(N), __VA_ARGS__)
#define AML_DECL_7(N, f7, ...) \
f7 AML_FIELD(N); AML_DECL_6(PLUS_1(N), __VA_ARGS__)
#define AML_DECL_8(N, f8, ...) \
f8 AML_FIELD(N); AML_DECL_7(PLUS_1(N), __VA_ARGS__)
// Declare a structure with up to 8 fields.
// (Pick the adequate AML_DECL_ macro and call it.)
#define AML_STRUCT_DECL(...) \
struct { \
CONCATENATE(AML_DECL_, VA_NARG(__VA_ARGS__))(1, __VA_ARGS__, 0) \
}
/** Returns the size required for allocation of up to 8 types **/
#define AML_SIZEOF_ALIGNED(...) sizeof(AML_STRUCT_DECL(__VA_ARGS__))
/**
* Returns the size required for allocation of up to 7 types plus one array.
* @param n: The number of elements in the array.
* @param type: The type of array elements.
* @param ...: Up to 7 field types preceding the array allocation space.
**/
#define AML_INNER_MALLOC_2(a, b) calloc(1, AML_SIZEOF_ALIGNED(a, b))
#define AML_SIZEOF_ALIGNED_ARRAY(n, type, ...) \
(sizeof(AML_STRUCT_DECL(__VA_ARGS__, type)) + \
((n)-1) * sizeof(type))
/** Allocate a pointer that can be used to contain two types plus an extra area
* aligned on a third type.
*
/** Returns the offset of the nth type of a list of up to 8 types. **/
#define AML_OFFSETOF_ALIGNED(N, ...) \
offsetof(AML_STRUCT_DECL(__VA_ARGS__), AML_FIELD(N))
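/**
 * For example (illustrative): AML_STRUCT_DECL(int, double) expands to
 * struct { int __f1; double __f2; }, so AML_OFFSETOF_ALIGNED(2, int, double)
 * is the offset of the properly aligned double member in that struct.
 **/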
//---------------------------------------------------------------------------//
// User Macros
//---------------------------------------------------------------------------//
/**
* Allocate zeroed space for up to 8 fields laid out as in a struct.
* @param ...: types contained in the allocation. (Up to 8)
**/
#define AML_INNER_MALLOC_EXTRA(a, b, c, sz) \
calloc(1, AML_SIZEOF_ALIGNED(struct { a __f1; b __f2; }, c) + \
(sizeof(c)*(sz)))
#define AML_INNER_MALLOC(...) calloc(1, AML_SIZEOF_ALIGNED(__VA_ARGS__))
/** Allocate a pointer that can be used to contain two types plus an extra area
* aligned on a third type, and extra bytes after that.
*
/**
* Allocate zeroed space for up to 7 fields laid out as in a struct,
* followed by one array.
* @param n: Number of elements in the array.
* @param type: Type of array elements.
* @param ...: Up to 7 field types preceding the array allocation space.
**/
#define AML_INNER_MALLOC_4(a, b, c, sz, d) \
calloc(1, AML_SIZEOF_ALIGNED(struct { a __f1; b __f2; }, c) + \
(sizeof(c)*(sz)) + d)
#define AML_INNER_MALLOC_ARRAY(n, type, ...) \
calloc(1, AML_SIZEOF_ALIGNED_ARRAY(n, type, __VA_ARGS__))
/** Returns the next pointer after an AML_INNER_MALLOC.
*
* Can be used to iterate over the pointers we need, using the last two types as
* parameters.
/**
* Allocate zeroed space for up to 7 fields laid out as in a struct,
* followed by one array and arbitrary extra space.
* @param n: Number of elements in the array.
* @param type: Type of array elements.
* @param size: The extra space, in bytes, to allocate.
* @param ...: Up to 7 field types preceding the array allocation space.
**/
#define AML_INNER_MALLOC_NEXTPTR(ptr, a, b) \
(void *)(((intptr_t) ptr) + AML_OFFSETOF_ALIGNED(a, b))
#define AML_INNER_MALLOC_EXTRA(n, type, size, ...) \
calloc(1, AML_SIZEOF_ALIGNED_ARRAY(n, type, __VA_ARGS__) + size)
/** Returns a pointer inside the extra zone after an AML_INNER_MALLOC_EXTRA.
*
* Can be used to iterate over the pointers we need.
/**
* Returns the nth __VA_ARGS__ field pointer from AML_INNER_MALLOC*()
* allocation.
* @param ptr: A pointer obtained from AML_INNER_MALLOC*()
* @param N: The field number. N must be a number (1, 2, 3, 4, 5, 6, 7, 8)
* and not a variable.
* @param ...: types contained in allocation. (Up to 8)
* @return A pointer to Nth field after ptr.
**/
#define AML_INNER_MALLOC_GET_FIELD(ptr, N, ...) \
(void *)(((intptr_t) ptr) + AML_OFFSETOF_ALIGNED(N, __VA_ARGS__))
/**
* Returns a pointer to the array after __VA_ARGS__ fields.
* @param ptr: Pointer returned by AML_INNER_MALLOC_ARRAY() or
* AML_INNER_MALLOC_EXTRA().
* @param type: Type of array elements.
* @param ...: Other types contained in allocation. (Up to 7)
**/
#define AML_INNER_MALLOC_GET_ARRAY(ptr, type, ...) \
AML_INNER_MALLOC_GET_FIELD(ptr, \
PLUS_1(VA_NARG(__VA_ARGS__)), \
__VA_ARGS__, type)
/**
* Returns a pointer to extra space allocated with
* AML_INNER_MALLOC_EXTRA().
* @param ptr: Pointer returned by AML_INNER_MALLOC_EXTRA().
* @param n: Number of elements in the array.
* @param type: Type of elements in the array.
* @param ...: Other types contained in allocation. (Up to 7)
**/
#define AML_INNER_MALLOC_EXTRA_NEXTPTR(ptr, a, b, c, off) \
(void *)(((intptr_t) ptr) + \
AML_OFFSETOF_ALIGNED(struct { a __f1; b __f2; }, c) + \
((off)*sizeof(c)))
#define AML_INNER_MALLOC_GET_EXTRA(ptr, n, type, ...) \
(void *)(((intptr_t) ptr) + \
AML_SIZEOF_ALIGNED_ARRAY(n, type, __VA_ARGS__))
/**
* @}
......
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
#ifndef AML_QUEUE_H
#define AML_QUEUE_H
/**
* @defgroup aml_queue "AML Queue API"
* @brief AML Queue API
*
* Generic queue type allocated on the heap:
* a serial queue for pushing and popping pointers.
* @{
**/
/** queue struct definition **/
struct aml_queue {
/** Maximum capacity; extended when reached. **/
size_t max;
/** Index of head **/
size_t head;
/** Index of tail **/
size_t tail;
/** Elements in the queue **/
void **elems;
};
/**
* Create a queue with pre-allocated space for max elements.
* @param[in] max: The number of elements fitting in the queue before
* triggering a resize.
* @return NULL if memory allocation failed.
**/
struct aml_queue *aml_queue_create(const size_t max);
/**
* Forget about elements stored in the queue.
**/
void aml_queue_clear(struct aml_queue *q);
/**
* Free queue. Calling free() directly on queue is ok.
**/
void aml_queue_destroy(struct aml_queue *q);
/**
* Get the number of elements in the queue.
*@return 0 if q is NULL.
**/
size_t aml_queue_len(const struct aml_queue *q);
/**
* Add an element at the queue tail.
* @return -AML_ENOMEM if queue needed to be extended and allocation failed.
**/
int aml_queue_push(struct aml_queue **q, void *element);
/**
* Get an element out of the queue.
* @return NULL if queue is empty.
**/
void *aml_queue_pop(struct aml_queue *q);
/**
* Take a specific element out of the queue.
* @return NULL if queue does not contain the element.
**/
void *aml_queue_take(struct aml_queue *q, void *element);
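/**
 * Illustrative sketch of typical use (not part of the patch; "a" and "b"
 * stand for any pointers owned by the caller):
 * @code
 * struct aml_queue *q = aml_queue_create(8);
 *
 * if (q == NULL)
 *         return -AML_ENOMEM;
 * aml_queue_push(&q, a);          // add at the tail, resizing if needed
 * aml_queue_push(&q, b);
 * void *e = aml_queue_pop(q);     // take an element out of the queue
 * size_t left = aml_queue_len(q); // 1 element remaining
 * aml_queue_destroy(q);
 * @endcode
 **/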
/**
* @}
**/
#endif //AML_QUEUE_H
......@@ -22,7 +22,7 @@
* Major version changes in AML
* denote ABI changes which prevent
* compatibility with previous major version ABI.
*
*
**/
#define AML_VERSION_MAJOR @PACKAGE_VERSION_MAJOR@
......
......@@ -35,6 +35,8 @@ UTILS_SOURCES = \
utils/bitmap.c \
utils/error.c \
utils/vector.c \
utils/queue.c \
utils/async.c \
utils/features.c
LIB_SOURCES = \
......@@ -63,8 +65,11 @@ AM_LDFLAGS += $(CUDA_LIBS)
# Build .c sources using cuda runtime library.
libaml_la_SOURCES+=area/cuda.c
# Build files not requiring additional libraries
LAYOUT_SOURCES+=layout/cuda.c
DMA_SOURCES+=dma/cuda.c
# Build .cu sources containing device code.
#
# CUDA_LO_FILES= # .lo files result of .cu files.
# CUDA_FLAGS=--x=cu
#
......
......@@ -223,12 +223,13 @@ int aml_area_cuda_create(struct aml_area **area,
if (device >= max_devices)
return -AML_EINVAL;
ret = AML_INNER_MALLOC_2(struct aml_area, struct aml_area_cuda_data);
ret = AML_INNER_MALLOC(struct aml_area,
struct aml_area_cuda_data);
if (ret == NULL)
return -AML_ENOMEM;
data = AML_INNER_MALLOC_NEXTPTR(ret, struct aml_area,
struct aml_area_cuda_data);
data = AML_INNER_MALLOC_GET_FIELD(ret, 2, struct aml_area,
struct aml_area_cuda_data);
ret->ops = &aml_area_cuda_ops;
ret->data = (struct aml_area_data *)data;
......
......@@ -196,12 +196,14 @@ int aml_area_linux_create(struct aml_area **area,
*area = NULL;
ret = AML_INNER_MALLOC_2(struct aml_area, struct aml_area_linux_data);
ret = AML_INNER_MALLOC(struct aml_area,
struct aml_area_linux_data);
if (ret == NULL)
return -AML_ENOMEM;
ret->data = AML_INNER_MALLOC_NEXTPTR(ret, struct aml_area,
struct aml_area_linux_data);
ret->data = AML_INNER_MALLOC_GET_FIELD(ret, 2,
struct aml_area,
struct aml_area_linux_data);
ret->ops = &aml_area_linux_ops;
data = (struct aml_area_linux_data *)ret->data;
......
#include "aml.h"
#include "aml/dma/cuda.h"
struct aml_dma_ops aml_dma_cuda_ops = {
.create_request = aml_dma_cuda_request_create,
.destroy_request = aml_dma_cuda_request_destroy,
.wait_request = aml_dma_cuda_request_wait,
};
int aml_dma_cuda_create(struct aml_dma **dma,
const enum cudaMemcpyKind kind,
const unsigned int num_streams)
{
int err = AML_SUCCESS;
struct aml_dma *cuda_dma;
struct aml_dma_cuda_data *data;
cudaStream_t *streams;
if (dma == NULL)
return -AML_EINVAL;
cuda_dma = AML_INNER_MALLOC_ARRAY(num_streams, cudaStream_t,
struct aml_dma,
struct aml_dma_cuda_data);
if (cuda_dma == NULL)
return -AML_ENOMEM;
data = AML_INNER_MALLOC_GET_FIELD(cuda_dma, 2,
struct aml_dma,
struct aml_dma_cuda_data);
data->kind = kind;
data->num_streams = num_streams;
data->stream_cursor = 0;
data->streams = AML_INNER_MALLOC_GET_ARRAY(data,
cudaStream_t,
struct aml_dma,
struct aml_dma_cuda_data);
cuda_dma->ops = &aml_dma_cuda_ops;
cuda_dma->data = (struct aml_dma_data *)data;
for (size_t i = 0; i < num_streams; i++)
switch (cudaStreamCreateWithFlags(&data->streams[i],
cudaStreamNonBlocking)) {
case cudaSuccess:
break;
default:
while (i--)
cudaStreamDestroy(data->streams[i]);
free(cuda_dma);
return -AML_FAILURE;
}
*dma = cuda_dma;
return AML_SUCCESS;
}
int aml_dma_cuda_destroy(struct aml_dma **dma, const int wait)
{
int err = AML_SUCCESS;
struct aml_dma_cuda_data *data;
if (dma == NULL)
return -AML_EINVAL;
if (*dma == NULL)
return AML_SUCCESS;
data = (struct aml_dma_cuda_data *)(*dma)->data;
for (size_t i = 0; i < data->num_streams; i++) {
if (wait) {
switch (cudaStreamSynchronize(data->streams[i])) {
case cudaSuccess:
break;
case cudaErrorInvalidResourceHandle:
err = -AML_FAILURE;
break;
default:
break;
}
}
switch (cudaStreamDestroy(data->streams[i])) {
case cudaSuccess:
break;
case cudaErrorInvalidValue:
err = -AML_FAILURE;
break;
case cudaErrorInvalidResourceHandle:
err = -AML_FAILURE;
break;
default:
break;
}
}
free(*dma);
*dma = NULL;
return err;
}
/**
* Struct used in aml_dma_cuda_request_create().
* It is used to pick dma streams and set request streams.
* This is different from op_arg in aml_dma_cuda_request_create(),
* aml_dma_async_copy() and aml_dma_copy().
**/
struct aml_dma_cuda_op_args {
/** The dma data for stream selection **/
struct aml_dma_cuda_data *dma_data;
/** The dma request where to record the streams used. **/
struct aml_dma_cuda_request *dma_req;
/** Additional args provided in op_args **/
void *extra_args;
};
int aml_dma_cuda_request_create(struct aml_dma_data *data,
struct aml_dma_request **req,
struct aml_layout *dest,
struct aml_layout *src,
aml_dma_operator op, void *op_arg)
{
struct aml_dma_cuda_op_args args = { NULL, NULL, op_arg };
struct aml_dma_cuda_request *request;
struct aml_dma_cuda_data *dma_data;
if (data == NULL || req == NULL || dest == NULL || src == NULL)
return -AML_EINVAL;
dma_data = (struct aml_dma_cuda_data *)data;
request = AML_INNER_MALLOC_ARRAY(dma_data->num_streams, cudaStream_t,
struct aml_dma_cuda_request);
if (request == NULL)
return -AML_ENOMEM;
request->kind = dma_data->kind;
request->dest = dest;
request->src = src;
request->streams = AML_INNER_MALLOC_GET_ARRAY(request,
cudaStream_t,
struct aml_dma_cuda_request);
*req = (struct aml_dma_request *)request;
if (dma_data->num_streams > 0) {
args.dma_data = dma_data;
args.dma_req = request;
}
return op(dest, src, &args, NULL);
}
int aml_dma_cuda_request_wait(struct aml_dma_data *dma,
struct aml_dma_request **req)
{
int err = AML_SUCCESS;
(void)dma;
struct aml_dma_cuda_request *dma_req;
if (req == NULL || *req == NULL)
return -AML_EINVAL;
dma_req = (struct aml_dma_cuda_request *)(*req);
for (size_t i = 0; i < dma_req->num_streams; i++)
switch (cudaStreamSynchronize(dma_req->streams[i])) {
case cudaSuccess:
break;
case cudaErrorInvalidResourceHandle:
err = -AML_FAILURE;
break;
default:
break;
}
return err != AML_SUCCESS ? err : aml_dma_cuda_request_destroy(dma,
req);
}
int aml_dma_cuda_request_destroy(struct aml_dma_data *dma,
struct aml_dma_request **req)
{
int err = AML_SUCCESS;
(void)dma;
if (req == NULL || *req == NULL)
return -AML_EINVAL;
free(*req);
*req = NULL;
return AML_SUCCESS;
}
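/**
 * Illustrative sketch of creating and tearing down this engine (not part of
 * the patch; the stream count and copy kind are example values):
 * @code
 * struct aml_dma *dma;
 *
 * if (aml_dma_cuda_create(&dma, cudaMemcpyHostToDevice, 4) != AML_SUCCESS)
 *         return -AML_FAILURE;
 * // ... aml_dma_copy(dma, dest, src, op, op_arg) with a cuda operator ...
 * aml_dma_cuda_destroy(&dma, 1);
 * @endcode
 **/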
......@@ -13,61 +13,6 @@
#include <assert.h>
/*******************************************************************************
* Generic DMA Copy implementations
*
* Needed by most DMAs. We don't provide introspection or any fancy API to it at
* this point.
******************************************************************************/
static inline void aml_copy_layout_generic_helper(size_t d,
struct aml_layout *dst,
const struct aml_layout *src,
const size_t *elem_number,
size_t elem_size,
size_t *coords)
{
if (d == 1) {
for (size_t i = 0; i < elem_number[0]; i += 1) {
coords[0] = i;
memcpy(aml_layout_deref_native(dst, coords),
aml_layout_deref_native(src, coords),
elem_size);
}
} else {
for (size_t i = 0; i < elem_number[d - 1]; i += 1) {
coords[d - 1] = i;
aml_copy_layout_generic_helper(d - 1, dst, src,
elem_number, elem_size,
coords);
}
}
}
int aml_copy_layout_generic(struct aml_layout *dst,
const struct aml_layout *src, void *arg)
{
size_t d;
size_t elem_size;
(void)arg;
assert(aml_layout_ndims(dst) == aml_layout_ndims(src));
d = aml_layout_ndims(dst);
assert(aml_layout_element_size(dst) == aml_layout_element_size(src));
elem_size = aml_layout_element_size(dst);
size_t coords[d];
size_t elem_number[d];
size_t elem_number2[d];
aml_layout_dims_native(src, elem_number);
aml_layout_dims_native(dst, elem_number2);
for (size_t i = 0; i < d; i += 1)
assert(elem_number[i] == elem_number2[i]);
aml_copy_layout_generic_helper(d, dst, src, elem_number, elem_size,
coords);
return 0;
}
/*******************************************************************************
* Generic DMA API:
* Most of the stuff is dispatched to a different layer, using type-specific
......@@ -77,7 +22,7 @@ int aml_copy_layout_generic(struct aml_layout *dst,
* abstract the request creation after this layer.
******************************************************************************/
int aml_dma_copy_custom(struct aml_dma *dma, struct aml_layout *dest,
int aml_dma_copy(struct aml_dma *dma, struct aml_layout *dest,
struct aml_layout *src, aml_dma_operator op, void *op_arg)
{
int ret;
......@@ -93,7 +38,7 @@ int aml_dma_copy_custom(struct aml_dma *dma, struct aml_layout *dest,
return ret;
}
int aml_dma_async_copy_custom(struct aml_dma *dma, struct aml_dma_request **req,
int aml_dma_async_copy(struct aml_dma *dma, struct aml_dma_request **req,
struct aml_layout *dest, struct aml_layout *src,
aml_dma_operator op, void *op_arg)
{
......
......@@ -9,6 +9,7 @@
*******************************************************************************/
#include "aml.h"
#include "aml/dma/linux-seq.h"
#include "aml/dma/linux-par.h"
#include "aml/layout/dense.h"
......@@ -60,7 +61,7 @@ void *aml_dma_linux_par_do_thread(void *arg)
pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
if (req->type != AML_DMA_REQUEST_TYPE_INVALID)
req->op(req->dest, req->src, req->op_arg);
req->op(req->dest, req->src, req->op_arg, NULL);
return NULL;
}
......@@ -175,12 +176,13 @@ int aml_dma_linux_par_create(struct aml_dma **dma, size_t nbreqs,
*dma = NULL;
ret = AML_INNER_MALLOC_2(struct aml_dma, struct aml_dma_linux_par);
ret = AML_INNER_MALLOC(struct aml_dma, struct aml_dma_linux_par);
if (ret == NULL)
return -AML_ENOMEM;
ret->data = AML_INNER_MALLOC_NEXTPTR(ret, struct aml_dma,
struct aml_dma_linux_par);
ret->data = AML_INNER_MALLOC_GET_FIELD(ret, 2,
struct aml_dma,
struct aml_dma_linux_par);
ret->ops = &aml_dma_linux_par_ops;
d = (struct aml_dma_linux_par *)ret->data;
d->ops = aml_dma_linux_par_inner_ops;
......
......@@ -11,11 +11,70 @@
#include "aml.h"
#include "aml/dma/linux-seq.h"
#include "aml/layout/dense.h"
#include "aml/layout/native.h"
#include <assert.h>
#include <errno.h>
#include <sys/mman.h>
/*******************************************************************************
* Generic DMA Copy implementations
*
* Needed by most DMAs. We don't provide introspection or any fancy API to it at
* this point.
******************************************************************************/
static inline void aml_copy_layout_generic_helper(size_t d,
struct aml_layout *dst,
const struct aml_layout *src,
const size_t *elem_number,
size_t elem_size,
size_t *coords)
{
if (d == 1) {
for (size_t i = 0; i < elem_number[0]; i += 1) {
coords[0] = i;
memcpy(aml_layout_deref_native(dst, coords),
aml_layout_deref_native(src, coords),
elem_size);
}
} else {
for (size_t i = 0; i < elem_number[d - 1]; i += 1) {
coords[d - 1] = i;
aml_copy_layout_generic_helper(d - 1, dst, src,
elem_number, elem_size,
coords);
}
}
}
int aml_copy_layout_generic(struct aml_layout *dst,
const struct aml_layout *src,
void *arg,
void **out)
{
size_t d;
size_t elem_size;
(void)arg;
(void)out;
assert(aml_layout_ndims(dst) == aml_layout_ndims(src));
d = aml_layout_ndims(dst);
assert(aml_layout_element_size(dst) == aml_layout_element_size(src));
elem_size = aml_layout_element_size(dst);
size_t coords[d];
size_t elem_number[d];
size_t elem_number2[d];
aml_layout_dims_native(src, elem_number);
aml_layout_dims_native(dst, elem_number2);
for (size_t i = 0; i < d; i += 1)
assert(elem_number[i] == elem_number2[i]);
aml_copy_layout_generic_helper(d, dst, src, elem_number, elem_size,
coords);
return 0;
}
/*******************************************************************************
* Linux-backed, sequential dma
* The dma itself is organized into several different components
......@@ -60,7 +119,7 @@ int aml_dma_linux_seq_do_copy(struct aml_dma_linux_seq_data *dma,
assert(dma != NULL);
assert(req != NULL);
assert(req->op != NULL);
return req->op(req->dest, req->src, req->op_arg);
return req->op(req->dest, req->src, req->op_arg, NULL);
}
struct aml_dma_linux_seq_inner_ops aml_dma_linux_seq_inner_ops = {
......@@ -161,12 +220,13 @@ int aml_dma_linux_seq_create(struct aml_dma **dma, size_t nbreqs,
*dma = NULL;
ret = AML_INNER_MALLOC_2(struct aml_dma, struct aml_dma_linux_seq);
ret = AML_INNER_MALLOC(struct aml_dma, struct aml_dma_linux_seq);
if (ret == NULL)
return -AML_ENOMEM;
ret->data = AML_INNER_MALLOC_NEXTPTR(ret, struct aml_dma,
struct aml_dma_linux_seq);
ret->data = AML_INNER_MALLOC_GET_FIELD(ret, 2,
struct aml_dma,
struct aml_dma_linux_seq);
ret->ops = &aml_dma_linux_seq_ops;
d = (struct aml_dma_linux_seq *)ret->data;
......
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
******************************************************************************/
#include "aml.h"
#include "aml/utils/inner-malloc.h"
#include "aml/layout/cuda.h"
int aml_layout_cuda_create(struct aml_layout **out,
void *device_ptr,
const size_t element_size,
const int order,
const size_t ndims,
const size_t *dims,
const size_t *stride,
const size_t *pitch,
struct aml_area_cuda_data *data)
{
struct aml_layout *layout;
struct aml_layout_cuda_data *layout_data;
layout = AML_INNER_MALLOC_ARRAY(3*ndims, size_t,
struct aml_layout,
struct aml_layout_cuda_data);
if (layout == NULL)
return -AML_ENOMEM;
layout_data = AML_INNER_MALLOC_GET_FIELD(layout, 2,
struct aml_layout,
struct aml_layout_cuda_data);
layout_data->device_ptr = device_ptr;
layout_data->order = order;
layout_data->ndims = ndims;
layout_data->data = data;
// dims, stride and cpitch share the trailing array of 3*ndims size_t.
layout_data->dims = AML_INNER_MALLOC_GET_ARRAY(layout,
size_t,
struct aml_layout,
struct aml_layout_cuda_data);
layout_data->stride = layout_data->dims + ndims;
layout_data->cpitch = layout_data->stride + ndims;
// dims, stride and cpitch are stored internally in row major order.
layout_data->cpitch[0] = element_size;
if (order == AML_LAYOUT_ORDER_COLUMN_MAJOR) {
layout_data->dims[0] = dims[ndims-1];
layout_data->stride[0] = stride[ndims-1];
for (size_t i = 1; i < ndims; i++) {
layout_data->dims[i] = dims[ndims-1-i];
layout_data->stride[i] = stride[ndims-1-i];
layout_data->cpitch[i] = layout_data->cpitch[i-1] *
pitch[ndims-1-i];
}
} else {
memcpy(layout_data->dims, dims, ndims * sizeof(size_t));
memcpy(layout_data->stride, stride, ndims * sizeof(size_t));
for (size_t i = 1; i < ndims; i++)
layout_data->cpitch[i] = layout_data->cpitch[i-1] *
pitch[i];
}
layout->data = (struct aml_layout_data *) layout_data;
layout->ops = &aml_layout_cuda_ops;
*out = layout;
return AML_SUCCESS;
}
int aml_layout_cuda_destroy(struct aml_layout **layout)
{
if (layout == NULL || *layout == NULL)
return -AML_EINVAL;
free(*layout);
*layout = NULL;
return AML_SUCCESS;
}
void *aml_layout_cuda_deref(const struct aml_layout_data *data,
const size_t *coords)
{
struct aml_layout_cuda_data *cudata;
(void)coords;
cudata = (struct aml_layout_cuda_data *)data;
return cudata->device_ptr;
}
void *aml_layout_cuda_deref_native(const struct aml_layout_data *data,
const size_t *coords)
{
struct aml_layout_cuda_data *cudata;
(void)coords;
cudata = (struct aml_layout_cuda_data *)data;
return cudata->device_ptr;
}
int aml_layout_cuda_order(const struct aml_layout_data *data)
{
struct aml_layout_cuda_data *cudata;
cudata = (struct aml_layout_cuda_data *)data;
return cudata->order;
}
int aml_layout_cuda_dims(const struct aml_layout_data *data, size_t *dims)
{
struct aml_layout_cuda_data *cudata;
cudata = (struct aml_layout_cuda_data *)data;
if (cudata->order == AML_LAYOUT_ORDER_ROW_MAJOR)
memcpy(dims, cudata->dims, sizeof(*dims) * cudata->ndims);
else
for (size_t i = 0; i < cudata->ndims; i++)
dims[i] = cudata->dims[cudata->ndims - 1 - i];
return AML_SUCCESS;
}
int aml_layout_cuda_dims_native(const struct aml_layout_data *data,
size_t *dims)
{
struct aml_layout_cuda_data *cudata;
cudata = (struct aml_layout_cuda_data *)data;
memcpy(dims, cudata->dims, sizeof(*dims) * cudata->ndims);
return AML_SUCCESS;
}
size_t aml_layout_cuda_ndims(const struct aml_layout_data *data)
{
struct aml_layout_cuda_data *cudata;
cudata = (struct aml_layout_cuda_data *)data;
return cudata->ndims;
}
size_t aml_layout_cuda_element_size(const struct aml_layout_data *data)
{
struct aml_layout_cuda_data *cudata;
cudata = (struct aml_layout_cuda_data *)data;
/* The element size is stored as the first cumulative pitch. */
return cudata->cpitch[0];
}
struct aml_layout_ops aml_layout_cuda_ops = {
.deref = aml_layout_cuda_deref,
.deref_native = aml_layout_cuda_deref_native,
.order = aml_layout_cuda_order,
.dims = aml_layout_cuda_dims,
.dims_native = aml_layout_cuda_dims_native,
.ndims = aml_layout_cuda_ndims,
.element_size = aml_layout_cuda_element_size,
.reshape = NULL,
.slice = NULL,
.slice_native = NULL
};
......@@ -18,37 +18,32 @@ static int aml_layout_dense_alloc(struct aml_layout **ret,
struct aml_layout *layout;
struct aml_layout_dense *data;
layout = AML_INNER_MALLOC_EXTRA(struct aml_layout,
struct aml_layout_dense,
size_t, 3*ndims);
layout = AML_INNER_MALLOC_ARRAY(3*ndims, size_t,
struct aml_layout,
struct aml_layout_dense);
if (layout == NULL) {
*ret = NULL;
return -AML_ENOMEM;
}
data = AML_INNER_MALLOC_NEXTPTR(layout,
struct aml_layout,
struct aml_layout_dense);
data = AML_INNER_MALLOC_GET_FIELD(layout, 2,
struct aml_layout,
struct aml_layout_dense);
layout->data = (struct aml_layout_data *) data;
data->ptr = NULL;
data->ndims = ndims;
data->dims = AML_INNER_MALLOC_EXTRA_NEXTPTR(layout,
struct aml_layout,
struct aml_layout_dense,
size_t, 0);
data->stride = AML_INNER_MALLOC_EXTRA_NEXTPTR(layout,
struct aml_layout,
struct aml_layout_dense,
size_t, ndims);
data->dims = AML_INNER_MALLOC_GET_ARRAY(layout,
size_t,
struct aml_layout,
struct aml_layout_dense);
data->stride = data->dims + ndims;
for (size_t i = 0; i < ndims; i++)
data->stride[i] = 1;
data->cpitch = AML_INNER_MALLOC_EXTRA_NEXTPTR(layout,
struct aml_layout,
struct aml_layout_dense,
size_t, ndims*2);
data->cpitch = data->stride + ndims;
*ret = layout;
return AML_SUCCESS;
}
......
......@@ -17,30 +17,30 @@ static int aml_layout_pad_alloc(struct aml_layout **ret,
struct aml_layout *layout;
struct aml_layout_pad *data;
layout = AML_INNER_MALLOC_4(struct aml_layout,
struct aml_layout_pad,
size_t, 2*ndims, element_size);
layout = AML_INNER_MALLOC_EXTRA(2*ndims, size_t,
element_size,
struct aml_layout,
struct aml_layout_pad);
if (layout == NULL) {
*ret = NULL;
return -AML_ENOMEM;
}
data = AML_INNER_MALLOC_NEXTPTR(layout,