Commit 1b8d5db5 authored by Swann Perarnau's avatar Swann Perarnau
Browse files

Merge branch 'no_layout_cuda' into 'staging'

Remove Cuda Layout

See merge request !167
parents 610cc3ef ad2d1dce
Pipeline #12058 passed with stages
in 6 minutes and 57 seconds
......@@ -198,5 +198,4 @@ AC_CONFIG_FILES([Makefile
aml.pc
include/aml/utils/version.h
include/aml/utils/features.h])
AC_CONFIG_FILES([o2lo], [chmod +x o2lo])
AC_OUTPUT
......@@ -44,6 +44,5 @@ endif
if HAVE_CUDA
include_aml_area_HEADERS += aml/area/cuda.h
include_aml_layout_HEADERS += aml/layout/cuda.h
include_aml_dma_HEADERS += aml/dma/cuda.h
endif
......@@ -106,7 +106,29 @@ int aml_dma_cuda_destroy(struct aml_dma **dma);
//--- DMA copy operators --------------------------------------------------//
/** aml_dma_cuda copy operator for 1D to 1D layouts **/
/**
* Embed a pair of devices in a void* to use as dma copy_operator argument
* when copying from device to device.
**/
#define AML_DMA_CUDA_DEVICE_PAIR(src, dst) \
(void *)(((intptr_t)dst << 32) | ((intptr_t)src))
/**
* Translate back a pair of device ids stored in `pair` (void*) into to
* device id integers.
**/
#define AML_DMA_CUDA_DEVICE_FROM_PAIR(pair, src, dst) \
src = dst = 0; \
src = ((intptr_t)pair & 0xffffffff); \
dst = ((intptr_t)pair >> 32);
/**
* aml_dma_cuda copy operator for 1D to 1D layouts.
* @param [in] dst: The destination layout of the copy.
* @param [in] src: The source layout of the copy.
* @param [in] arg: Either a device id (int). or two device ids
* @return an AML error code.
**/
int aml_dma_cuda_copy_1D(struct aml_layout *dst,
const struct aml_layout *src,
void *arg);
......
/******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
******************************************************************************/
#ifndef AML_LAYOUT_CUDA_H
#define AML_LAYOUT_CUDA_H
/**
* @defgroup aml_layout_cuda "AML Layout Cuda"
* @brief Layout on device pointer.
*
* Cuda layout is a wrapper on other layout.
* All operations are deferred to the embedded layout.
* deref operation of the embedded layout is used to compute offset
* on device pointer and return the appropriate offset.
* Operations on this layout cannot be used on device side.
* However the layout pointer (if it is a device pointer) can be used
* on device side.
*
* @code
* #include <aml/layout/cuda.h>
* @endcode
* @see aml_layout
* @{
**/
/** aml_layout data structure **/
struct aml_layout_cuda_data {
/** Pointer to data on device. **/
void *device_ptr;
/** device id where ptr is located **/
int device;
/** user expected layout order **/
int order;
/** layout num dims **/
size_t ndims;
/** layout dims stored in row major order **/
size_t *dims;
/**
* Offset between elements of the same dimension.
* Offset is in number of elements.
**/
size_t *stride;
/**
* cumulative distances between two elements in the same
* dimension (pitch[0] is the element size in bytes).
**/
size_t *cpitch;
};
/**
* Create a new layout on device pointer with embedded layout.
* @param[out] out: A pointer to receive the newly allocated layout.
* @param[in] device_ptr: The pointer on which the layout has to work.
* @param[in] device: The device id where the device_ptr is allocated.
* @param[in] element_size: The size of elements in this layout.
* @param[in] order: Order of dimensions in the layout.
* @param[in] ndims: The number of dimensions in the layout.
* @param[in] dims: The dimensions in the layout.
* @param[in] stride: The empty -- in number of elements -- space between
* consecutive elements of the same dimension, in number of elements.
* @param[in] pitch: The space -- in number of element -- between 2 elements in
* the next dimension.
* @return AML_SUCCESS or -AML_ENOMEM if the memory allocation for layout
* failed.
**/
int aml_layout_cuda_create(struct aml_layout **out,
void *device_ptr,
int device,
const size_t element_size,
const int order,
const size_t ndims,
const size_t *dims,
const size_t *stride,
const size_t *pitch);
/** Always returns the pointer to device_ptr, whatever the coordinates. **/
void *aml_layout_cuda_deref(const struct aml_layout_data *data,
const size_t *coords);
/** Always returns the pointer to device_ptr, whatever the coordinates. **/
void *aml_layout_cuda_deref_native(const struct aml_layout_data *data,
const size_t *coords);
/** Returns layout order **/
int aml_layout_cuda_order(const struct aml_layout_data *data);
/** Copies layout dims with user order. **/
int aml_layout_cuda_dims(const struct aml_layout_data *data, size_t *dims);
/** Copies layout dims in row major order. **/
int aml_layout_cuda_dims_native(const struct aml_layout_data *data,
size_t *dims);
/** Returns the number of dimensions in the layout. **/
size_t aml_layout_cuda_ndims(const struct aml_layout_data *data);
/** Returns the size of an element in the layout. **/
size_t aml_layout_cuda_element_size(const struct aml_layout_data *data);
/** Cuda layout operations **/
extern struct aml_layout_ops aml_layout_cuda_ops;
/**
* @}
**/
#endif // AML_LAYOUT_CUDA_H
#!/bin/sh
## ---------------------------------------------------------- ##
## $1: .lo file ##
## $2: .cu.o file object file with dynamic symbol relocation ##
## $3: .o file object file with static symbol relocation ##
## ---------------------------------------------------------- ##
AWK=@AWK@
GREP=@GREP@
LIBTOOL=@abs_top_builddir@/libtool
if test $# -lt 3; then
echo "Create a libtool .lo file provided the target.o file"
echo ""
echo "\t$0 <file.lo> <file.cu.o> <file.o>"
fi
LO_FILE=$1
PIC_FILE=$(echo $2 | awk -F '/' '{print $NF}')
O_FILE=$(echo $3 | awk -F '/' '{print $NF}')
LIB_PIC_FILE=.libs/$O_FILE
BASENAME=$(echo $2 | awk -F '/' '{for(i=1; i<NF; i++) {print $i "/"}}')
PROGRAM=$($GREP -m 1 PROGRAM= $LIBTOOL | cut -d = -f 2)
PACKAGE=$($GREP -m 1 PACKAGE= $LIBTOOL | cut -d = -f 2)
VERSION=$($GREP -m 1 VERSION= $LIBTOOL | cut -d = -f 2)
must_libtool_define() {
eval "var=\$$1"
if [ -z "$var" ]; then
echo "libtool script did not contain $1 variable used to build .lo file."
exit
fi
}
must_libtool_define "PROGRAM"
must_libtool_define "PACKAGE"
must_libtool_define "VERSION"
must_exist() {
if [ ! -f $1 ]; then
echo "File $1 is missing."
exit
fi
}
must_exist $PWD/$BASENAME$O_FILE
must_exist $PWD/$BASENAME$PIC_FILE
mv $PWD/$BASENAME$PIC_FILE $PWD/$BASENAME$LIB_PIC_FILE
echo "# Generated by $PROGRAM (GNU $PACKAGE) $VERSION" > ${LO_FILE}
echo "# Generated by $PROGRAM (GNU $PACKAGE) $VERSION" >> ${LO_FILE}
echo "# $LO_FILE - a libtool object file" >> ${LO_FILE}
echo "# Please DO NOT delete this file!" >> ${LO_FILE}
echo "# It is necessary for linking the library." >> ${LO_FILE}
echo "" >> ${LO_FILE}
echo "# Name of the PIC object." >> ${LO_FILE}
echo "pic_object=$LIB_PIC_FILE" >> ${LO_FILE}
echo "" >> ${LO_FILE}
echo "# Name of the non-PIC object" >> ${LO_FILE}
echo "non_pic_object=$O_FILE" >> ${LO_FILE}
......@@ -62,23 +62,7 @@ if HAVE_CUDA
AM_CPPFLAGS += $(CUDA_CFLAGS)
AM_LDFLAGS += $(CUDA_LIBS)
# Build .c sources using cuda runtime library.
libaml_la_SOURCES+=area/cuda.c layout/cuda.c dma/cuda.c
# Build .cu sources containing device code.
# CUDA_LO_FILES= # .lo files result of .cu files.
# CUDA_FLAGS=--x=cu
#
# .cu.lo: .cu
# $(NVCC) $(CUDA_FLAGS) -I$(top_srcdir)/include -dc -o $<.cu.o $<
# $(NVCC) $(CUDA_FLAGS) -I$(top_srcdir)/include -c -o $<.o $<
# $(top_builddir)/o2lo $@ $<.cu.o $<.o
#
# libamlcuda.la: $(CUDA_LO_FILES)
# $(LIBTOOL) --tag=CC --mode=link $(NVCC) -dlink -o $@ $< -lcudart
#
# libaml_la_LIBADD+=libamlcuda.la
libaml_la_SOURCES+=area/cuda.c dma/cuda.c
endif
......
......@@ -11,7 +11,6 @@
#include "aml.h"
#include "aml/dma/cuda.h"
#include "aml/layout/cuda.h"
/**
* Callback on dma stream to update all requests status
......@@ -190,16 +189,16 @@ int aml_dma_cuda_copy_1D(struct aml_layout *dst,
void *arg)
{
int err;
int src_device;
int dst_device;
const void *src_ptr = aml_layout_rawptr(src);
void *dst_ptr = aml_layout_rawptr(dst);
struct aml_dma_cuda_data *dma_data = (struct aml_dma_cuda_data *)arg;
const struct aml_layout_cuda_data *cu_src =
(struct aml_layout_cuda_data *)(src->data);
struct aml_layout_cuda_data *cu_dst =
(struct aml_layout_cuda_data *)(dst->data);
size_t n = 0;
size_t size = 0;
AML_DMA_CUDA_DEVICE_FROM_PAIR(arg, src_device, dst_device);
err = aml_layout_dims(src, &n);
if (err != AML_SUCCESS)
return err;
......@@ -211,8 +210,8 @@ int aml_dma_cuda_copy_1D(struct aml_layout *dst,
dma_data->stream) != cudaSuccess)
return -AML_FAILURE;
} else if (dma_data->kind == cudaMemcpyDeviceToDevice) {
if (cudaMemcpyPeerAsync(dst_ptr, cu_dst->device, src_ptr,
cu_src->device, size,
if (cudaMemcpyPeerAsync(dst_ptr, dst_device, src_ptr,
src_device, size,
dma_data->stream) != cudaSuccess)
return -AML_FAILURE;
} else
......
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
******************************************************************************/
#include "aml.h"
#include "aml/layout/cuda.h"
#include "aml/utils/inner-malloc.h"
int aml_layout_cuda_create(struct aml_layout **out,
void *device_ptr,
int device,
const size_t element_size,
const int order,
const size_t ndims,
const size_t *dims,
const size_t *stride,
const size_t *pitch)
{
struct aml_layout *layout;
struct aml_layout_cuda_data *layout_data;
layout = AML_INNER_MALLOC_ARRAY(3 * ndims, size_t, struct aml_layout,
struct aml_layout_cuda_data);
if (layout == NULL)
return -AML_ENOMEM;
layout_data = AML_INNER_MALLOC_GET_FIELD(layout, 2, struct aml_layout,
struct aml_layout_cuda_data);
layout_data->device_ptr = device_ptr;
layout_data->device = device;
layout_data->order = order;
layout_data->ndims = ndims;
layout_data->dims = AML_INNER_MALLOC_GET_ARRAY(
layout, size_t, struct aml_layout, struct aml_layout_cuda_data);
layout_data->stride = layout_data->dims + ndims;
layout_data->cpitch = layout_data->stride + ndims;
// Store dims, stride and cpitch are internally stored in fortran
// row major.
layout_data->cpitch[0] = element_size;
if (order == AML_LAYOUT_ORDER_COLUMN_MAJOR) {
layout_data->dims[0] = dims[ndims - 1];
layout_data->stride[0] = stride[ndims - 1];
for (size_t i = 1; i < ndims; i++) {
layout_data->dims[i] = dims[ndims - 1 - i];
layout_data->stride[i] = stride[ndims - 1 - i];
layout_data->cpitch[i] = layout_data->cpitch[i - 1] *
pitch[ndims - 1 - i];
}
} else {
memcpy(layout_data->dims, dims, ndims * sizeof(size_t));
memcpy(layout_data->stride, stride, ndims * sizeof(size_t));
for (size_t i = 1; i < ndims; i++)
layout_data->cpitch[i] =
layout_data->cpitch[i - 1] * pitch[i];
}
layout->data = (struct aml_layout_data *)layout_data;
layout->ops = &aml_layout_cuda_ops;
*out = layout;
return AML_SUCCESS;
}
void *aml_layout_cuda_deref(const struct aml_layout_data *data,
const size_t *coords)
{
struct aml_layout_cuda_data *cudata;
(void)coords;
cudata = (struct aml_layout_cuda_data *)data;
return cudata->device_ptr;
}
void *aml_layout_cuda_deref_native(const struct aml_layout_data *data,
const size_t *coords)
{
struct aml_layout_cuda_data *cudata;
(void)coords;
cudata = (struct aml_layout_cuda_data *)data;
return cudata->device_ptr;
}
int aml_layout_cuda_order(const struct aml_layout_data *data)
{
struct aml_layout_cuda_data *cudata;
cudata = (struct aml_layout_cuda_data *)data;
return cudata->order;
}
int aml_layout_cuda_dims(const struct aml_layout_data *data, size_t *dims)
{
struct aml_layout_cuda_data *cudata;
cudata = (struct aml_layout_cuda_data *)data;
if (cudata->order == AML_LAYOUT_ORDER_ROW_MAJOR)
memcpy(dims, cudata->dims, sizeof(*dims) * cudata->ndims);
else
for (size_t i = 0; i < cudata->ndims; i++)
dims[i] = cudata->dims[cudata->ndims - 1 - i];
return AML_SUCCESS;
}
int aml_layout_cuda_dims_native(const struct aml_layout_data *data,
size_t *dims)
{
struct aml_layout_cuda_data *cudata;
cudata = (struct aml_layout_cuda_data *)data;
memcpy(dims, cudata->dims, sizeof(*dims) * cudata->ndims);
return AML_SUCCESS;
}
size_t aml_layout_cuda_ndims(const struct aml_layout_data *data)
{
struct aml_layout_cuda_data *cudata;
cudata = (struct aml_layout_cuda_data *)data;
return cudata->ndims;
}
size_t aml_layout_cuda_element_size(const struct aml_layout_data *data)
{
struct aml_layout_cuda_data *cudata;
cudata = (struct aml_layout_cuda_data *)data;
return cudata->cpitch[0];
}
void *aml_layout_cuda_rawptr(const struct aml_layout_data *data)
{
struct aml_layout_cuda_data *cudata;
cudata = (struct aml_layout_cuda_data *)data;
return cudata->device_ptr;
}
struct aml_layout_ops aml_layout_cuda_ops = {
.deref = aml_layout_cuda_deref,
.deref_native = aml_layout_cuda_deref_native,
.rawptr = aml_layout_cuda_rawptr,
.order = aml_layout_cuda_order,
.dims = aml_layout_cuda_dims,
.dims_native = aml_layout_cuda_dims_native,
.ndims = aml_layout_cuda_ndims,
.element_size = aml_layout_cuda_element_size,
.reshape = NULL,
.slice = NULL,
.slice_native = NULL,
.fprintf = NULL,
.duplicate = NULL,
.destroy = NULL,
};
......@@ -14,7 +14,6 @@
#include "aml.h"
#include "aml/dma/cuda.h"
#include "aml/layout/cuda.h"
#include "aml/layout/dense.h"
// Data
......@@ -47,10 +46,10 @@ void setup()
AML_LAYOUT_ORDER_COLUMN_MAJOR,
element_size, ndims, &dims, &stride,
&pitch) == AML_SUCCESS);
assert(aml_layout_cuda_create(&device_layout, device_data, 0,
element_size,
AML_LAYOUT_ORDER_COLUMN_MAJOR, ndims,
&dims, &stride, &pitch) == AML_SUCCESS);
assert(aml_layout_dense_create(&device_layout, device_data,
AML_LAYOUT_ORDER_COLUMN_MAJOR,
element_size, ndims, &dims, &stride,
&pitch) == AML_SUCCESS);
// Dma
assert(aml_dma_cuda_create(&host_device, cudaMemcpyHostToDevice) ==
......@@ -66,7 +65,7 @@ void teardown()
cudaFree(device_data);
// Layout
aml_layout_destroy(&device_layout);
aml_layout_destroy(&host_layout);
aml_layout_destroy(&device_layout);
// Dma
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment