Commit a41b0412 authored by Swann Perarnau

[refactor] Rework code for better abstractions

This is a rewrite of the existing code into a memory library exposing
more of its internal abstractions. This refactoring is required to:
- make faster progress by focusing on the core new features
- abstract more of the underlying components and expose those
abstractions to users
- build upon existing libraries (memkind) for the internals.

Memkind is used as a crutch here; we do not intend to use it in the long
term, as some of its internals are at odds with what we want (topology
management in particular).

Nevertheless, for now it provides a good internal allocator and decent
access to deep memory.

Over time, we figured out that the best way to build this API was to
create several layers of APIs, each adding more abstraction over the
devices. At the same time, we want each layer to expose its internal
mechanisms, so that a user can customize any of them (see the sketch
below).
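
As a rough illustration of that goal, the sketch below plugs a
user-provided copy hook into a dma engine. It assumes the struct aml_dma
declared in the new aml.h further down; the init-then-override pattern is
an illustration of the intent, not a documented contract.

    #include <aml.h>
    #include <string.h>

    /* hypothetical user-supplied hook; matches the copy member of
     * struct aml_dma as declared in aml.h */
    static int my_copy(struct aml_dma *dma, void *dest, const void *src,
                       size_t size)
    {
        memcpy(dest, src, size);  /* trivial synchronous copy */
        return 0;
    }

    void setup_custom_dma(struct aml_dma *dma)
    {
        aml_dma_init(dma, 0);
        dma->copy = my_copy;  /* swap in our own internal mechanism */
    }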

This is why we end up with areas and dma engines; in the future we will
add other components, such as data decomposition and distribution
methods, as well as direct support for "pipelining".
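
For reference, end-to-end use of the two layers looks roughly like the
sketch below, mirroring the updated stream_add tests in this diff; the
availability of an HBM area and the minimal error handling are
assumptions.

    #include <aml.h>
    #include <assert.h>

    int main(int argc, char *argv[])
    {
        struct aml_area slow, fast;
        struct aml_dma dma;
        size_t size = 1UL << 20;

        assert(!aml_init(&argc, &argv));
        /* areas: where memory lives and how it is bound */
        assert(!aml_area_from_nodestring(&slow, AML_AREA_TYPE_REGULAR, "0"));
        assert(!aml_area_init(&fast, AML_AREA_TYPE_HBM));
        /* dma engines: how bytes move between areas */
        assert(!aml_dma_init(&dma, 0));

        void *src = aml_area_malloc(&slow, size);
        void *dst = aml_area_malloc(&fast, size);
        assert(src != NULL && dst != NULL);
        /* copy data across areas through the engine */
        assert(!aml_dma_copy(&dma, dst, src, size));

        aml_area_free(&fast, dst);
        aml_area_free(&slow, src);
        aml_dma_destroy(&dma);
        aml_area_destroy(&slow);
        aml_area_destroy(&fast);
        aml_finalize();
        return 0;
    }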
parent 044dc0c2
@@ -40,6 +40,9 @@ AM_CONDITIONAL([TEST_VALGRIND],[test "x$valgrind" = xtrue])
AC_CHECK_HEADERS(numa.h)
AC_CHECK_LIB(numa, move_pages)
# memkind
AC_CHECK_LIB(memkind, memkind_malloc)
AC_CONFIG_HEADERS([src/config.h])
AC_CONFIG_FILES([Makefile
......
lib_LTLIBRARIES = libaml.la
LIBCSOURCES = aml.c allocator.c
LIBHSOURCES = aml.h allocator.h
LIBCSOURCES = aml.c area.c dma.c
LIBHSOURCES = aml.h
libaml_la_SOURCES = $(LIBCSOURCES) $(LIBHSOURCES)
include_HEADERS = $(LIBHSOURCES)
@@ -27,86 +27,3 @@ int aml_finalize(void)
{
return 0;
}
int aml_node_init(struct aml_node *node, unsigned int nid)
{
assert(node != NULL);
assert(nid < MAX_NUMNODES);
node->numaid = nid;
node->mask = numa_bitmask_alloc(MAX_NUMNODES);
numa_bitmask_setbit(node->mask, nid);
return 0;
}
int aml_node_destroy(struct aml_node *node)
{
assert(node != NULL);
free(node->mask);
return 0;
}
int aml_malloc(struct aml_alloc *a, size_t memsize, size_t blocksize,
struct aml_node *node)
{
assert(a != NULL);
assert(memsize % blocksize == 0);
assert(blocksize % PAGE_SIZE == 0);
/* TODO: convert to SICM */
struct bitmask *oldbind = numa_get_membind();
numa_set_membind(node->mask);
void *m = mmap(NULL, memsize, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
assert(m != MAP_FAILED);
memset(m, 0, memsize);
numa_set_membind(oldbind);
/* start tracking blocks */
a->start = m;
a->memsize = memsize;
a->blocksize = blocksize;
a->numblocks = memsize/blocksize;
a->nodemap = calloc(a->numblocks, sizeof(*a->nodemap));
for(unsigned long i = 0; i < a->numblocks; i++)
a->nodemap[i] = node;
return 0;
}
int aml_free(struct aml_alloc *a)
{
assert(a != NULL);
free(a->nodemap);
a->nodemap = NULL;
return munmap(a->start, a->memsize);
}
int aml_block_address(struct aml_alloc *a, size_t block, void **ret)
{
assert(a != NULL);
assert(block < a->numblocks);
*ret = (void*)((char*)a->start + block*a->blocksize);
return 0;
}
int aml_block_move(struct aml_alloc *a, size_t block, struct aml_node *node)
{
assert(a != NULL);
assert(block < a->numblocks);
if(a->nodemap[block] != node) {
unsigned long count = a->blocksize/PAGE_SIZE;
int *nodes = calloc(count, sizeof(*nodes));
void **pages = calloc(count, sizeof(*pages));
int *status = calloc(count, sizeof(*status));
for(unsigned long i = 0; i < count; i++) {
nodes[i] = node->numaid;
pages[i] = (void*)((char*)a->start + i*PAGE_SIZE);
}
move_pages(0, count, pages, nodes, status, MPOL_MF_MOVE);
}
return 0;
}
int aml_block_copy(struct aml_alloc *src, size_t srcblock,
struct aml_alloc *dest, size_t destblock)
{
return 0;
}
#ifndef AML_H
#define AML_H 1
#include<numa.h>
#include <numa.h>
#include <memkind.h>
#include <stdlib.h>
/* An allocation.
*
* Contains information about an allocation.
*/
struct aml_node;
struct aml_alloc;
struct aml_alloc {
void *start;
size_t memsize;
size_t blocksize;
size_t numblocks;
struct aml_node **nodemap;
};
/*******************************************************************************
* Areas:
* embeds information about a byte-addressable physical memory location as well
* as binding policies over it.
******************************************************************************/
struct aml_node {
struct bitmask *mask;
int numaid;
/* WARNING: kind must be the first member for this library to work */
struct aml_area {
memkind_t kind;
struct bitmask *nodemask;
};
int aml_init(int *argc, char **argv[]);
int aml_finalize(void);
#define AML_AREA_TYPE_HBM 0
#define AML_AREA_TYPE_REGULAR 1
#define AML_AREA_TYPE_MAX 2
int aml_area_init(struct aml_area *, unsigned int type);
int aml_area_from_nodestring(struct aml_area *, unsigned int, const char *);
int aml_area_from_nodemask(struct aml_area *, unsigned int, struct bitmask *);
int aml_area_destroy(struct aml_area *);
/*******************************************************************************
* Area allocations:
* Low-level, direct allocation of memory from an area.
******************************************************************************/
int aml_node_init(struct aml_node *, unsigned int);
int aml_node_destroy(struct aml_node *);
void *aml_area_malloc(struct aml_area *, size_t);
void aml_area_free(struct aml_area *, void *);
void *aml_area_calloc(struct aml_area *, size_t, size_t);
void *aml_area_realloc(struct aml_area *, void *, size_t);
void *aml_area_acquire(struct aml_area *, size_t);
void aml_area_release(struct aml_area *, void *);
int aml_malloc(struct aml_alloc *, size_t, size_t, struct aml_node *);
int aml_free(struct aml_alloc *);
/*******************************************************************************
* DMA Engines:
* Low-level, direct movement of memory.
* We haven't decided in our design how we want to deal with memcpy/move_pages
* differences yet.
******************************************************************************/
inline size_t aml_block_size(struct aml_alloc *a) {
return a->blocksize;
}
struct aml_dma {
int (*copy)(struct aml_dma *, void *, const void *, size_t);
int (*move)(struct aml_dma *, struct aml_area *, struct aml_area *,
void *, size_t);
};
int aml_dma_init(struct aml_dma *, unsigned int);
int aml_dma_destroy(struct aml_dma *);
int aml_dma_copy(struct aml_dma *, void *, const void *, size_t);
int aml_dma_move(struct aml_dma *, struct aml_area *, struct aml_area *,
void *, size_t);
/*******************************************************************************
* General functions:
* Initialize internal structures, cleanup everything at the end.
******************************************************************************/
int aml_block_address(struct aml_alloc *, size_t, void **);
int aml_init(int *argc, char **argv[]);
int aml_finalize(void);
int aml_block_move(struct aml_alloc *, size_t, struct aml_node *);
int aml_block_copy(struct aml_alloc *, size_t, struct aml_alloc *, size_t);
#endif
#include <aml.h>
#include <assert.h>
memkind_t *type2kind[AML_AREA_TYPE_MAX] = {
&MEMKIND_HBW_ALL,
&MEMKIND_REGULAR,
};
/*******************************************************************************
* memkind additional functions:
* memkind is missing some features that we add by re-implementing some of the
* needed hooks.
******************************************************************************/
int aml_memkind_areanodemask(struct memkind *kind, unsigned long *nodemask,
unsigned long maxnode)
{
/* transform back kind into an area */
struct aml_area *area = (struct aml_area*)kind;
struct bitmask ret = {maxnode, nodemask};
copy_bitmask_to_bitmask(area->nodemask, &ret);
return 0;
}
/*******************************************************************************
* area implementation
* At this point, use memkind internally to implement our stuff
******************************************************************************/
int aml_area_init(struct aml_area *area, unsigned int type)
{
assert(type < AML_AREA_TYPE_MAX);
area->kind = *type2kind[type];
area->nodemask = numa_allocate_nodemask();
copy_bitmask_to_bitmask(area->nodemask,numa_all_nodes_ptr);
return 0;
}
int aml_area_from_nodestring(struct aml_area *area, unsigned int type,
const char *nodes)
{
aml_area_init(area, type);
area->nodemask = numa_parse_nodestring(nodes);
return 0;
}
int aml_area_from_nodemask(struct aml_area *area, unsigned int type,
struct bitmask *nodes)
{
aml_area_init(area, type);
copy_bitmask_to_bitmask(area->nodemask, nodes);
return 0;
}
int aml_area_destroy(struct aml_area *area)
{
numa_bitmask_free(area->nodemask);
return 0;
}
void *aml_area_malloc(struct aml_area *area, size_t size)
{
return memkind_malloc(area->kind, size);
}
void aml_area_free(struct aml_area *area, void *ptr)
{
memkind_free(area->kind, ptr);
}
void *aml_area_calloc(struct aml_area *area, size_t num, size_t size)
{
return memkind_calloc(area->kind, num, size);
}
void *aml_area_realloc(struct aml_area *area, void *ptr, size_t size)
{
return memkind_realloc(area->kind, ptr, size);
}
void *aml_area_acquire(struct aml_area *area, size_t size)
{
/* as far as we know memkind doesn't zero new areas
* TODO: find a way to assert it
*/
return aml_area_malloc(area,size);
}
void aml_area_release(struct aml_area *area, void *ptr)
{
/* As far as we know memkind doesn't decommit new areas
* TODO: find a way to assert it
*/
aml_area_free(area, ptr);
}
#include <aml.h>
#include <assert.h>
/*******************************************************************************
* DMA implementation
* At this point, implement the single threaded synchronous stuff
******************************************************************************/
int aml_dma_init(struct aml_dma *dma, unsigned int type)
{
return 0;
}
int aml_dma_destroy(struct aml_dma *dma)
{
return 0;
}
int aml_dma_copy(struct aml_dma *dma, void *dest, const void *src, size_t size)
{
return 0;
}
int aml_dma_move(struct aml_dma *dma, struct aml_area *dest,
struct aml_area *src, void *ptr, size_t size)
{
return 0;
}
@@ -10,7 +10,7 @@ TESTS_ENVIRONMENT= @LIBTOOL@ --mode=execute @VALGRIND@ --tool=memcheck -q --leak
endif
# all check programs
TST_PROGS = stream_add_pth
TST_PROGS = stream_add_pth stream_add_omp stream_vanilla
check_PROGRAMS = $(TST_PROGS)
TESTS = $(TST_PROGS)
@@ -25,13 +25,15 @@ int main(int argc, char *argv[])
/* we want to back our array on the slow node and use the fast node as
* a faster buffer.
*/
struct aml_node slow, fast;
struct bitmask *mask = numa_parse_nodestring_all("0");
assert(!aml_node_init(&slow, mask, MEMSIZE*3));
assert(!aml_node_init(&fast, mask, MEMSIZE*3));
struct aml_area slow, fast;
int type = AML_AREA_TYPE_REGULAR;
assert(!aml_area_from_nodestring(&slow, type, "0"));
assert(!aml_area_from_nodestring(&fast, type, "0"));
/* we are only dealing with one contiguous array */
struct aml_alloc a,b,c;
struct aml_dma dma;
assert(!aml_dma_init(&dma, 0));
void *a, *b, *c;
/* describe the allocation */
size_t chunk_msz, esz;
@@ -43,14 +45,15 @@ int main(int argc, char *argv[])
chunk_msz = MEMSIZE/(numthreads*CHUNKING);
esz = chunk_msz/sizeof(unsigned long);
}
assert(!aml_malloc(&a, MEMSIZE, chunk_msz, &slow));
assert(!aml_malloc(&b, MEMSIZE, chunk_msz, &slow));
assert(!aml_malloc(&c, MEMSIZE, chunk_msz, &slow));
a = aml_area_malloc(&slow, MEMSIZE);
b = aml_area_malloc(&slow, MEMSIZE);
c = aml_area_malloc(&slow, MEMSIZE);
assert(a != NULL && b != NULL && c != NULL);
/* create virtually accessible address range, backed by slow memory */
unsigned long *wa = (unsigned long*)a.start;
unsigned long *wb = (unsigned long*)b.start;
unsigned long *wc = (unsigned long*)c.start;
unsigned long *wa = (unsigned long*)a;
unsigned long *wb = (unsigned long*)b;
unsigned long *wc = (unsigned long*)c;
unsigned long esize = MEMSIZE/sizeof(unsigned long);
for(unsigned long i = 0; i < esize; i++) {
wa[i] = i;
@@ -64,19 +67,19 @@ int main(int argc, char *argv[])
{
for(unsigned long i = 0; i < numthreads*CHUNKING; i++) {
#pragma omp task depend(inout: wa[i*esz:esz])
assert(!aml_pull_sync(&a, i, &fast));
assert(!aml_dma_move(&dma, &fast, &slow, &wa[i*esz], esz));
#pragma omp task depend(inout: wb[i*esz:esz])
assert(!aml_pull_sync(&b, i, &fast));
assert(!aml_dma_move(&dma, &fast, &slow, &wb[i*esz], esz));
#pragma omp task depend(inout: wc[i*esz:esz])
assert(!aml_pull_sync(&c, i, &fast));
assert(!aml_dma_move(&dma, &fast, &slow, &wc[i*esz], esz));
#pragma omp task depend(in: wa[i*esz:esz], wb[i*esz:esz]) depend(out: wc[i*esz:esz])
kernel(&wa[i*esz], &wb[i*esz], &wc[i*esz], esz);
#pragma omp task depend(inout: wa[i*esz:esz])
assert(!aml_push_sync(&a, i, &slow));
assert(!aml_dma_move(&dma, &slow, &fast, &wa[i*esz], esz));
#pragma omp task depend(inout: wb[i*esz:esz])
assert(!aml_push_sync(&b, i, &slow));
assert(!aml_dma_move(&dma, &slow, &fast, &wb[i*esz], esz));
#pragma omp task depend(inout: wc[i*esz:esz])
assert(!aml_push_sync(&c, i, &slow));
assert(!aml_dma_move(&dma, &slow, &fast, &wc[i*esz], esz));
}
}
@@ -85,11 +88,12 @@ int main(int argc, char *argv[])
assert(wc[i] == esize);
}
aml_free(&a);
aml_free(&b);
aml_free(&c);
aml_node_destroy(&slow);
aml_node_destroy(&fast);
aml_area_free(&slow, a);
aml_area_free(&slow, b);
aml_area_free(&slow, c);
aml_area_destroy(&slow);
aml_area_destroy(&fast);
aml_dma_destroy(&dma);
aml_finalize();
return 0;
}
@@ -10,7 +10,8 @@
#define MEMSIZE (1UL<<20)
#define CHUNKING 4
struct aml_node slow, fast;
struct aml_area slow, fast;
struct aml_dma dma;
int kernel(unsigned long *a, unsigned long *b, unsigned long *c, size_t n)
{
@@ -22,23 +23,23 @@ int kernel(unsigned long *a, unsigned long *b, unsigned long *c, size_t n)
}
struct cinfo {
struct aml_alloc *tab;
unsigned long *tab;
pthread_t tid;
unsigned long chunk;
size_t size;
};
void *th_copy(void *arg)
{
struct cinfo *ci = arg;
aml_block_move(ci->tab, ci->chunk, &fast);
aml_dma_move(&dma, &fast, &slow, ci->tab, ci->size*sizeof(unsigned long));
return arg;
}
struct winfo {
struct aml_alloc *a, *b, *c;
unsigned long *a, *b, *c;
pthread_t *ca, *cb;
pthread_t tid;
unsigned long chunk;
size_t size;
};
void *th_work(void *arg)
@@ -47,16 +48,7 @@ void *th_work(void *arg)
pthread_join(*(wi->ca), NULL);
pthread_join(*(wi->cb), NULL);
void *aa,*bb,*cc;
size_t esize = aml_block_size(wi->c)/sizeof(unsigned long);
aml_block_address(wi->a, wi->chunk, &aa);
aml_block_address(wi->b, wi->chunk, &bb);
aml_block_address(wi->c, wi->chunk, &cc);
printf("%p[%lu]:%p\n",wi->a->start, wi->chunk, aa);
printf("%p[%lu]:%p\n",wi->b->start, wi->chunk, bb);
printf("%p[%lu]:%p\n",wi->c->start, wi->chunk, cc);
kernel(aa, bb, cc, esize);
kernel(wi->a, wi->b, wi->c, wi->size);
return arg;
}
int main(int argc, char *argv[])
@@ -67,11 +59,13 @@ int main(int argc, char *argv[])
/* we want to back our array on the slow node and use the fast node as
* a faster buffer.
*/
assert(!aml_node_init(&slow, 0));
assert(!aml_node_init(&fast, 0));
assert(!aml_area_from_nodestring(&slow, AML_AREA_TYPE_REGULAR, "0"));
assert(!aml_area_from_nodestring(&fast, AML_AREA_TYPE_REGULAR, "0"));
struct aml_dma dma;
assert(!aml_dma_init(&dma, 0));
void *a, *b, *c;
struct aml_alloc a,b,c;
/* describe the allocation */
size_t chunk_msz, esz;
int numthreads, copythreads;
@@ -85,15 +79,15 @@ int main(int argc, char *argv[])
chunk_msz = MEMSIZE/(numthreads*CHUNKING);
esz = chunk_msz/sizeof(unsigned long);
}
printf("th: %lu, mem: %zi, chunk: %zi\n",numthreads,MEMSIZE,chunk_msz);
assert(!aml_malloc(&a, MEMSIZE, chunk_msz, &slow));
assert(!aml_malloc(&b, MEMSIZE, chunk_msz, &slow));
assert(!aml_malloc(&c, MEMSIZE, chunk_msz, &fast));
a = aml_area_malloc(&slow, MEMSIZE);
b = aml_area_malloc(&slow, MEMSIZE);
c = aml_area_malloc(&fast, MEMSIZE);
assert(a != NULL && b != NULL && c != NULL);
/* create virtually accessible address range, backed by slow memory */
unsigned long *wa = (unsigned long*)a.start;
unsigned long *wb = (unsigned long*)b.start;
unsigned long *wc = (unsigned long*)c.start;
unsigned long *wa = (unsigned long*)a;
unsigned long *wb = (unsigned long*)b;
unsigned long *wc = (unsigned long*)c;
unsigned long esize = MEMSIZE/sizeof(unsigned long);
for(unsigned long i = 0; i < esize; i++) {
wa[i] = i;
@@ -107,16 +101,16 @@ int main(int argc, char *argv[])
struct winfo *wis = calloc(numthreads, sizeof(struct winfo));
for(unsigned long i = 0; i < CHUNKING; i++) {
for(unsigned long j = 0; j < numthreads; j++) {
cas[j].tab = &a;
cas[j].chunk = i*CHUNKING + j;
cbs[j].tab = &b;
cbs[j].chunk = i*CHUNKING + j;
wis[j].a = &a;
wis[j].b = &b;
wis[j].c = &c;
cas[j].tab = &wa[i*CHUNKING +j];
cas[j].size = esize;
cbs[j].tab = &wb[i*CHUNKING +j];
cbs[j].size = esize;
wis[j].a = &wa[i*CHUNKING +j];
wis[j].b = &wb[i*CHUNKING +j];
wis[j].c = &wc[i*CHUNKING +j];
wis[j].ca = &cas[j].tid;
wis[j].cb = &cbs[j].tid;
wis[j].chunk = i*CHUNKING + j;
wis[j].size = esize;
pthread_create(&cas[j].tid, NULL, &th_copy, (void*)&cas[j]);
pthread_create(&cbs[j].tid, NULL, &th_copy, (void*)&cbs[j]);
pthread_create(&wis[j].tid, NULL, &th_work, (void*)&wis[j]);
@@ -134,11 +128,12 @@ int main(int argc, char *argv[])
assert(wc[i] == esize);
}
aml_free(&a);
aml_free(&b);
aml_free(&c);
aml_node_destroy(&slow);
aml_node_destroy(&fast);
aml_area_free(&slow, a);
aml_area_free(&slow, b);
aml_area_free(&fast, c);
aml_area_destroy(&slow);
aml_area_destroy(&fast);
aml_dma_destroy(&dma);
aml_finalize();
return 0;
}