Commit f47dc685 authored by Swann Perarnau

[refactor] make use of functional tests again

This patch reintroduces the first functional test: a stream add
implementation using pthreads for parallelism. We use our scratch_par
implementation to build a pipelined version of the application, in
which each worker thread works on its own batch of tiles and migrates
data asynchronously.
parent 7260868d
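The core of the new test is classic double buffering: while a worker computes
on the tiles currently resident in fast memory, the next tiles are pulled in
asynchronously, so migration cost hides behind compute. Below is a minimal
standalone sketch of that pipeline; a memcpy on a helper pthread stands in for
AML's aml_scratch_async_pull/aml_scratch_wait pair, and all names here
(slow_mem, fast_mem, do_pull) are illustrative, not part of AML.

	#include <pthread.h>
	#include <string.h>

	#define TILES  4
	#define TILESZ 1024

	static unsigned long slow_mem[TILES][TILESZ]; /* backing store ("slow" area) */
	static unsigned long fast_mem[2][TILESZ];     /* two scratch buffers ("fast" area) */

	struct pull { int tile; int buf; pthread_t th; };

	/* stand-in for aml_scratch_async_pull(): copy one tile slow -> fast */
	static void *do_pull(void *arg)
	{
		struct pull *p = arg;
		memcpy(fast_mem[p->buf], slow_mem[p->tile], sizeof(fast_mem[p->buf]));
		return arg;
	}

	static void compute(unsigned long *tile) { (void)tile; /* kernel on one tile */ }

	static void pipeline(void)
	{
		int cur = 0;
		struct pull p;

		/* prime the pipeline: tile 0 must be resident before we start */
		memcpy(fast_mem[cur], slow_mem[0], sizeof(fast_mem[cur]));
		for (int i = 0; i < TILES; i++) {
			if (i + 1 < TILES) { /* start migrating tile i+1... */
				p.tile = i + 1;
				p.buf = 1 - cur;
				pthread_create(&p.th, NULL, do_pull, &p);
			}
			compute(fast_mem[cur]); /* ...while computing on tile i */
			if (i + 1 < TILES) {
				pthread_join(p.th, NULL); /* aml_scratch_wait() analogue */
				cur = 1 - cur; /* swap buffers; the old one is free again */
			}
		}
	}

	int main(void)
	{
		pipeline();
		return 0;
	}

The test below follows the same shape, with one scratch pad per input array (sa
for a, sb for b) and an explicit aml_scratch_release() once the swap is done.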
@@ -41,9 +41,9 @@ UNIT_TESTS = $(ARENA_JEMALLOC_TESTS) \
 	$(SCRATCH_TESTS)

 # functional tests
-FUNC_TESTS = stream_add_pth stream_add_omp stream_vanilla
+FUNC_TESTS = stream_add_pth

 # all tests
-TST_PROGS = $(UNIT_TESTS)
+TST_PROGS = $(UNIT_TESTS) $(FUNC_TESTS)
 check_PROGRAMS = $(TST_PROGS)
 TESTS = $(TST_PROGS)
@@ -10,8 +10,13 @@
 #define MEMSIZE (1UL<<20)
 #define CHUNKING 4

-struct aml_area slow, fast;
-struct aml_dma dma;
+size_t numthreads, tilesz, esz;
+unsigned long *a, *b, *c;
+AML_TILING_1D_DECL(tiling);
+AML_AREA_LINUX_DECL(slow);
+AML_AREA_LINUX_DECL(fast);
+AML_SCRATCH_PAR_DECL(sa);
+AML_SCRATCH_PAR_DECL(sb);

 int kernel(unsigned long *a, unsigned long *b, unsigned long *c, size_t n)
 {
@@ -22,53 +27,49 @@ int kernel(unsigned long *a, unsigned long *b, unsigned long *c, size_t n)
 	return 0;
 }

-struct cinfo {
-	unsigned long *tab;
-	pthread_t tid;
-	size_t size;
-};
-
-void *th_copy(void *arg)
-{
-	struct cinfo *ci = arg;
-	aml_dma_move(&dma, &fast, &slow, ci->tab, ci->size*sizeof(unsigned long));
-	return arg;
-}
-
 struct winfo {
-	unsigned long *a, *b, *c;
-	pthread_t *ca, *cb;
-	pthread_t tid;
-	size_t size;
+	int tid;
+	pthread_t th;
 };

 void *th_work(void *arg)
 {
+	int offset, i, ai, bi, oldai, oldbi;
+	unsigned long *ap, *bp;
+	void *abaseptr, *bbaseptr;
 	struct winfo *wi = arg;
-	pthread_join(*(wi->ca), NULL);
-	pthread_join(*(wi->cb), NULL);
+	offset = wi->tid*CHUNKING;
+	ap = aml_tiling_tilestart(&tiling, a, offset);
+	bp = aml_tiling_tilestart(&tiling, b, offset);
+	abaseptr = aml_scratch_baseptr(&sa);
+	bbaseptr = aml_scratch_baseptr(&sb);
+	ai = -1; bi = -1;
+	for(i = 0; i < CHUNKING-1; i++) {
+		struct aml_scratch_request *ar, *br;
+		oldai = ai; oldbi = bi;
+		aml_scratch_async_pull(&sa, &ar, abaseptr, &ai, a, offset+i+1);
+		aml_scratch_async_pull(&sb, &br, bbaseptr, &bi, b, offset+i+1);
+		kernel(ap, bp, &c[(offset+i)*esz], esz);
+		aml_scratch_wait(&sa, ar);
+		aml_scratch_wait(&sb, br);
+		ap = aml_tiling_tilestart(&tiling, abaseptr, ai);
+		bp = aml_tiling_tilestart(&tiling, bbaseptr, bi);
+		aml_scratch_release(&sa, oldai);
+		aml_scratch_release(&sb, oldbi);
+	}
+	kernel(ap, bp, &c[(offset+i)*esz], esz);
-	kernel(wi->a, wi->b, wi->c, wi->size);
 	return arg;
 }

 int main(int argc, char *argv[])
 {
-	assert(argc == 1);
+	AML_BINDING_SINGLE_DECL(binding);
+	AML_ARENA_JEMALLOC_DECL(arena);
+	AML_DMA_LINUX_SEQ_DECL(dma);
+	unsigned long nodemask[AML_NODEMASK_SZ];
 	aml_init(&argc, &argv);
-	/* we want to back our array on the slow node and use the fast node as
-	 * a faster buffer.
-	 */
-	assert(!aml_area_from_nodestring(&slow, AML_AREA_TYPE_REGULAR, "all"));
-	assert(!aml_area_from_nodestring(&fast, AML_AREA_TYPE_REGULAR, "all"));
-	struct aml_dma dma;
-	assert(!aml_dma_init(&dma, 0));
-	void *a, *b, *c;
-	/* describe the allocation */
-	size_t chunk_msz, esz;
-	int numthreads, copythreads;
+	assert(argc == 1);

 	/* use openmp env to figure out how many threads we want
 	 * (we actually use 3x as much)
@@ -76,64 +77,72 @@ int main(int argc, char *argv[])
 	#pragma omp parallel
 	{
 		numthreads = omp_get_num_threads();
-		chunk_msz = MEMSIZE/(numthreads*CHUNKING);
-		esz = chunk_msz/sizeof(unsigned long);
+		tilesz = MEMSIZE/(numthreads*CHUNKING);
+		esz = tilesz/sizeof(unsigned long);
 	}

+	/* initialize all the supporting struct */
+	assert(!aml_binding_init(&binding, AML_BINDING_TYPE_SINGLE, 0));
+	assert(!aml_tiling_init(&tiling, AML_TILING_TYPE_1D, tilesz, MEMSIZE));
+	AML_NODEMASK_ZERO(nodemask);
+	AML_NODEMASK_SET(nodemask, 0);
+	assert(!aml_arena_jemalloc_init(&arena, AML_ARENA_JEMALLOC_TYPE_REGULAR));
+	assert(!aml_area_linux_init(&slow,
+				    AML_AREA_LINUX_MANAGER_TYPE_SINGLE,
+				    AML_AREA_LINUX_MBIND_TYPE_REGULAR,
+				    AML_AREA_LINUX_MMAP_TYPE_ANONYMOUS,
+				    &arena, MPOL_BIND, nodemask));
+	assert(!aml_area_linux_init(&fast,
+				    AML_AREA_LINUX_MANAGER_TYPE_SINGLE,
+				    AML_AREA_LINUX_MBIND_TYPE_REGULAR,
+				    AML_AREA_LINUX_MMAP_TYPE_ANONYMOUS,
+				    &arena, MPOL_BIND, nodemask));
+	assert(!aml_dma_linux_seq_init(&dma, numthreads*2));
+	assert(!aml_scratch_par_init(&sa, &fast, &slow, &dma, &tiling,
+				     2*numthreads, numthreads));
+	assert(!aml_scratch_par_init(&sb, &fast, &slow, &dma, &tiling,
+				     2*numthreads, numthreads));

 	/* allocation */
 	a = aml_area_malloc(&slow, MEMSIZE);
 	b = aml_area_malloc(&slow, MEMSIZE);
 	c = aml_area_malloc(&fast, MEMSIZE);
 	assert(a != NULL && b != NULL && c != NULL);

-	/* create virtually accessible address range, backed by slow memory */
-	unsigned long *wa = (unsigned long*)a;
-	unsigned long *wb = (unsigned long*)b;
-	unsigned long *wc = (unsigned long*)c;
 	unsigned long esize = MEMSIZE/sizeof(unsigned long);
 	for(unsigned long i = 0; i < esize; i++) {
-		wa[i] = i;
-		wb[i] = esize - i;
-		wc[i] = 0;
+		a[i] = i;
+		b[i] = esize - i;
+		c[i] = 0;
 	}

 	/* run kernel */
-	struct cinfo *cas = calloc(numthreads, sizeof(struct cinfo));
-	struct cinfo *cbs = calloc(numthreads, sizeof(struct cinfo));
-	struct winfo *wis = calloc(numthreads, sizeof(struct winfo));
-	for(unsigned long i = 0; i < CHUNKING; i++) {
-		for(unsigned long j = 0; j < numthreads; j++) {
-			cas[j].tab = &wa[i*CHUNKING +j];
-			cas[j].size = esize;
-			cbs[j].tab = &wb[i*CHUNKING +j];
-			cbs[j].size = esize;
-			wis[j].a = &wa[i*CHUNKING +j];
-			wis[j].b = &wb[i*CHUNKING +j];
-			wis[j].c = &wc[i*CHUNKING +j];
-			wis[j].ca = &cas[j].tid;
-			wis[j].cb = &cbs[j].tid;
-			wis[j].size = esize;
-			pthread_create(&cas[j].tid, NULL, &th_copy, (void*)&cas[j]);
-			pthread_create(&cbs[j].tid, NULL, &th_copy, (void*)&cbs[j]);
-			pthread_create(&wis[j].tid, NULL, &th_work, (void*)&wis[j]);
-		}
-		for(unsigned long j = 0; j < numthreads; j++) {
-			pthread_join(wis[j].tid, NULL);
-		}
-	}
-	free(cas);
-	free(cbs);
-	free(wis);
+	struct winfo *wis = aml_area_calloc(&slow, numthreads, sizeof(struct winfo));
+	for(unsigned long i = 0; i < numthreads; i++) {
+		wis[i].tid = i;
+		pthread_create(&wis[i].th, NULL, &th_work, (void*)&wis[i]);
+	}
+	for(unsigned long j = 0; j < numthreads; j++) {
+		pthread_join(wis[j].th, NULL);
+	}
+	aml_area_free(&slow, wis);

 	/* validate */
 	for(unsigned long i = 0; i < esize; i++) {
-		assert(wc[i] == esize);
+		assert(c[i] == esize);
 	}

+	aml_scratch_par_destroy(&sa);
+	aml_scratch_par_destroy(&sb);
+	aml_dma_linux_seq_destroy(&dma);
 	aml_area_free(&slow, a);
 	aml_area_free(&slow, b);
-	aml_area_free(&fast, c);
-	aml_area_destroy(&slow);
-	aml_area_destroy(&fast);
-	aml_dma_destroy(&dma);
+	aml_area_free(&slow, c);
+	aml_area_linux_destroy(&slow);
+	aml_area_linux_destroy(&fast);
+	aml_tiling_destroy(&tiling, AML_TILING_TYPE_1D);
+	aml_binding_destroy(&binding, AML_BINDING_TYPE_SINGLE);
 	aml_finalize();
 	return 0;
 }
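Note that the diff hunks elide the body of kernel(), but the validation loop
pins down its contract: with a[i] = i and b[i] = esize - i, every output
element must equal esize, i + (esize - i). The body is presumably the
element-wise add sketched below (an assumption, not the file's actual code):

	/* presumed body of kernel(), elided by the diff hunks above */
	int kernel(unsigned long *a, unsigned long *b, unsigned long *c, size_t n)
	{
		for (size_t i = 0; i < n; i++)
			c[i] = a[i] + b[i]; /* i + (esize - i) == esize, as asserted */
		return 0;
	}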