/*******************************************************************************
 * Copyright 2019 UChicago Argonne, LLC.
 * (c.f. AUTHORS, LICENSE)
 *
 * This file is part of the AML project.
 * For more info, see https://xgitlab.cels.anl.gov/argo/aml
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *******************************************************************************/

#include <assert.h>
#include <errno.h>
#include <omp.h>
#include <aml.h>
#include <stdlib.h>

#define ITER 10
#define MEMSIZE (1UL<<26)
#define PHASES 20
#define CHUNKING 4
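
/* each array is MEMSIZE bytes (64 MiB) and is processed in
 * numthreads*CHUNKING chunks; the example assumes MEMSIZE divides evenly
 * into chunks. ITER and PHASES are unused in this version. */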

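/* STREAM-style "add" kernel: c[i] = a[i] + b[i] over n elements */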
int kernel(unsigned long *a, unsigned long *b, unsigned long *c, size_t n)
{
	size_t i;
	for(i = 0; i < n; i++)
		c[i] = a[i] + b[i];
	return 0;
}

int main(int argc, char *argv[])
{
	assert(argc == 1);
	aml_init(&argc, &argv);

	/* we want to back our arrays on the slow node and use the fast node
	 * as a faster buffer.
	 */
	struct aml_area slow, fast;
	int type = AML_AREA_TYPE_REGULAR;
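	/* note: both areas are created from the "all" nodestring here; a real
	 * setup would presumably bind each area to its own NUMA nodes */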
	assert(!aml_area_from_nodestring(&slow, type, "all"));
	assert(!aml_area_from_nodestring(&fast, type, "all"));

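	/* a DMA engine to move data between the two areas */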
	struct aml_dma dma;
	assert(!aml_dma_init(&dma, 0));

	void *a, *b, *c;

	/* describe the allocation */
	size_t chunk_msz, esz;
	int numthreads;

	/* every thread would write the same values, but concurrent writes to
	 * shared variables are still a data race; let one thread do it */
	#pragma omp parallel
	#pragma omp single
	{
		numthreads = omp_get_num_threads();
		chunk_msz = MEMSIZE/(numthreads*CHUNKING);
		esz = chunk_msz/sizeof(unsigned long);
	}
	a = aml_area_malloc(&slow, MEMSIZE);
	b = aml_area_malloc(&slow, MEMSIZE);
	c = aml_area_malloc(&slow, MEMSIZE);
	assert(a != NULL && b != NULL && c != NULL);

	/* view the slow-memory allocations as arrays and initialize them */
	unsigned long *wa = (unsigned long*)a;
	unsigned long *wb = (unsigned long*)b;
	unsigned long *wc = (unsigned long*)c;
	unsigned long esize = MEMSIZE/sizeof(unsigned long);
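	/* initialize so that a[i] + b[i] == esize for every i; the
	 * validation loop below checks exactly that */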
	for(unsigned long i = 0; i < esize; i++) {
		wa[i] = i;
		wb[i] = esize - i;
		wc[i] = 0;
	}

	/* run kernel: per chunk, copy the inputs to fast memory, compute
	 * there, then copy the results back; the task dependencies order the
	 * stages of one chunk while letting distinct chunks overlap */
	#pragma omp parallel
	#pragma omp single nowait
	{
		for(unsigned long i = 0; i < numthreads*CHUNKING; i++) {
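			/* stage 1: move this chunk of a, b and c to fast memory */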
			#pragma omp task depend(inout: wa[i*esz:esz])
			assert(!aml_dma_move(&dma, &fast, &slow, &wa[i*esz], esz));
			#pragma omp task depend(inout: wb[i*esz:esz])
			assert(!aml_dma_move(&dma, &fast, &slow, &wb[i*esz], esz));
			#pragma omp task depend(inout: wc[i*esz:esz])
			assert(!aml_dma_move(&dma, &fast, &slow, &wc[i*esz], esz));
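			/* stage 2: compute on the chunk once its inputs arrive */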
			#pragma omp task depend(in: wa[i*esz:esz], wb[i*esz:esz]) depend(out: wc[i*esz:esz])
			kernel(&wa[i*esz], &wb[i*esz], &wc[i*esz], esz);
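			/* stage 3: move the chunk back to slow memory */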
			#pragma omp task depend(inout: wa[i*esz:esz])
			assert(!aml_dma_move(&dma, &slow, &fast, &wa[i*esz], esz));
			#pragma omp task depend(inout: wb[i*esz:esz])
			assert(!aml_dma_move(&dma, &slow, &fast, &wb[i*esz], esz));
			#pragma omp task depend(inout: wc[i*esz:esz])
			assert(!aml_dma_move(&dma, &slow, &fast, &wc[i*esz], esz));
		}
	}

	/* validate: every element of c should equal esize */
	for(unsigned long i = 0; i < esize; i++) {
		assert(wc[i] == esize);
	}

	aml_area_free(&slow, a);
	aml_area_free(&slow, b);
	aml_area_free(&slow, c);
	aml_area_destroy(&slow);
	aml_area_destroy(&fast);
	aml_dma_destroy(&dma);
	aml_finalize();
	return 0;
}