Commit a41b0412 authored by Swann Perarnau

[refactor] Rework code for better abstractions

This is a rewrite of the existing code into a memory library that exposes
more of its internal abstractions. This refactoring is required to:
- make progress faster by focusing on the core new features
- abstract more of the underlying components and expose those
abstractions to users
- build upon existing libraries (memkind) for the internals.

Memkind is used as a crutch here; we do not intend to rely on it in the
long term, as some of its internals conflict with what we want (topology
management in particular).

Nevertheless, it currently provides a good internal allocator and decent
access to deep memory.

Over time, we figured out that the best way to build this API was to
create several layers of APIs, each adding more abstraction over the
devices. At the same time, we want each layer to expose its internal
mechanisms, so that a user can customize any of them.

This is why we end up with areas and DMA engines; in the future we will
add other layers, such as data decomposition and distribution methods, as
well as direct support for "pipelining".
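
For illustration, here is a rough sketch of how the two layers compose,
using only entry points introduced by this commit (error handling elided,
size arbitrary):

    struct aml_area slow, fast;
    struct aml_dma dma;
    size_t size = 1UL << 20;

    aml_init(&argc, &argv);
    aml_area_from_nodestring(&slow, AML_AREA_TYPE_REGULAR, "0");
    aml_area_init(&fast, AML_AREA_TYPE_HBM);
    aml_dma_init(&dma, 0);

    void *buf = aml_area_malloc(&slow, size);
    /* stage the buffer into fast memory before computing on it */
    aml_dma_move(&dma, &fast, &slow, buf, size);

    aml_area_free(&slow, buf);
    aml_dma_destroy(&dma);
    aml_area_destroy(&fast);
    aml_area_destroy(&slow);
    aml_finalize();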
parent 044dc0c2
@@ -40,6 +40,9 @@ AM_CONDITIONAL([TEST_VALGRIND],[test "x$valgrind" = xtrue])
AC_CHECK_HEADERS(numa.h)
AC_CHECK_LIB(numa, move_pages)
# memkind
AC_CHECK_LIB(memkind, memkind_malloc)
AC_CONFIG_HEADERS([src/config.h])
AC_CONFIG_FILES([Makefile
......
lib_LTLIBRARIES = libaml.la
LIBCSOURCES = aml.c area.c dma.c
LIBHSOURCES = aml.h
libaml_la_SOURCES = $(LIBCSOURCES) $(LIBHSOURCES)
include_HEADERS = $(LIBHSOURCES)
@@ -27,86 +27,3 @@ int aml_finalize(void)
{
return 0;
}
int aml_node_init(struct aml_node *node, unsigned int nid)
{
assert(node != NULL);
assert(nid < MAX_NUMNODES);
node->numaid = nid;
node->mask = numa_bitmask_alloc(MAX_NUMNODES);
numa_bitmask_setbit(node->mask, nid);
return 0;
}
int aml_node_destroy(struct aml_node *node)
{
assert(node != NULL);
numa_bitmask_free(node->mask);
return 0;
}
int aml_malloc(struct aml_alloc *a, size_t memsize, size_t blocksize,
struct aml_node *node)
{
assert(a != NULL);
assert(memsize % blocksize == 0);
assert(blocksize % PAGE_SIZE == 0);
/* TODO: convert to SICM */
struct bitmask *oldbind = numa_get_membind();
numa_set_membind(node->mask);
void *m = mmap(NULL, memsize, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
assert(m != MAP_FAILED);
memset(m, 0, memsize);
numa_set_membind(oldbind);
/* start tracking blocks */
a->start = m;
a->memsize = memsize;
a->blocksize = blocksize;
a->numblocks = memsize/blocksize;
a->nodemap = calloc(a->numblocks, sizeof(*a->nodemap));
for(unsigned long i = 0; i < a->numblocks; i++)
a->nodemap[i] = node;
return 0;
}
int aml_free(struct aml_alloc *a)
{
assert(a != NULL);
free(a->nodemap);
a->nodemap = NULL;
return munmap(a->start, a->memsize);
}
int aml_block_address(struct aml_alloc *a, size_t block, void **ret)
{
assert(a != NULL);
assert(block < a->numblocks);
*ret = (void*)((char*)a->start + block*a->blocksize);
return 0;
}
int aml_block_move(struct aml_alloc *a, size_t block, struct aml_node *node)
{
assert(a != NULL);
assert(block < a->numblocks);
if(a->nodemap[block] != node) {
unsigned long count = a->blocksize/PAGE_SIZE;
int *nodes = calloc(count, sizeof(*nodes));
void **pages = calloc(count, sizeof(*pages));
int *status = calloc(count, sizeof(*status));
for(unsigned long i = 0; i < count; i++) {
nodes[i] = node->numaid;
/* pages of this block, not of the whole allocation */
pages[i] = (void*)((char*)a->start + block*a->blocksize + i*PAGE_SIZE);
}
move_pages(0, count, pages, nodes, status, MPOL_MF_MOVE);
/* remember the block's new location */
a->nodemap[block] = node;
free(nodes);
free(pages);
free(status);
}
return 0;
}
int aml_block_copy(struct aml_alloc *src, size_t srcblock,
struct aml_alloc *dest, size_t destblock)
{
/* TODO: was never implemented */
return 0;
}
#ifndef AML_H
#define AML_H 1
#include <numa.h>
#include <memkind.h>
#include <stdlib.h>
/* An allocation.
 *
 * Tracks a block-based allocation: its start address, block geometry, and
 * the node each block is currently bound to.
 */
struct aml_node;
struct aml_alloc;
struct aml_alloc {
void *start;
size_t memsize;
size_t blocksize;
size_t numblocks;
struct aml_node **nodemap;
};
/*******************************************************************************
 * Areas:
 * embeds information about a byte-addressable physical memory location as
 * well as binding policies over it.
 ******************************************************************************/
struct aml_node {
struct bitmask *mask;
int numaid;
};

/* WARNING: kind must be the first member for this library to work */
struct aml_area {
memkind_t kind;
struct bitmask *nodemask;
};
#define AML_AREA_TYPE_HBM 0
#define AML_AREA_TYPE_REGULAR 1
#define AML_AREA_TYPE_MAX 2
int aml_area_init(struct aml_area *, unsigned int type);
int aml_area_from_nodestring(struct aml_area *, unsigned int, const char *);
int aml_area_from_nodemask(struct aml_area *, unsigned int, struct bitmask *);
int aml_area_destroy(struct aml_area *);
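/*
 * Example (informal sketch, error handling elided): create an area backed
 * by high-bandwidth memory on node 0, then tear it down. The nodestring
 * syntax is the one accepted by numa_parse_nodestring().
 *
 * struct aml_area area;
 * aml_area_from_nodestring(&area, AML_AREA_TYPE_HBM, "0");
 * ...
 * aml_area_destroy(&area);
 */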
/*******************************************************************************
 * Area allocations:
 * Low-level, direct allocation of memory from an area.
 ******************************************************************************/
int aml_node_init(struct aml_node *, unsigned int);
int aml_node_destroy(struct aml_node *);
void *aml_area_malloc(struct aml_area *, size_t);
void aml_area_free(struct aml_area *, void *);
void *aml_area_calloc(struct aml_area *, size_t, size_t);
void *aml_area_realloc(struct aml_area *, void *, size_t);
void *aml_area_acquire(struct aml_area *, size_t);
void aml_area_release(struct aml_area *, void *);
int aml_malloc(struct aml_alloc *, size_t, size_t, struct aml_node *);
int aml_free(struct aml_alloc *);
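/*
 * Example (informal sketch): an area behaves like a libc-style allocator
 * whose memory is bound to the area's nodes; n is an arbitrary count.
 *
 * double *x = aml_area_malloc(&area, n * sizeof(*x));
 * assert(x != NULL);
 * ...
 * aml_area_free(&area, x);
 */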
/*******************************************************************************
 * DMA Engines:
 * Low-level, direct movement of memory.
 * We have not yet decided how this design should handle the differences
 * between memcpy and move_pages.
 ******************************************************************************/
static inline size_t aml_block_size(struct aml_alloc *a) {
return a->blocksize;
}
struct aml_dma {
int (*copy)(struct aml_dma *, void *, const void *, size_t);
int (*move)(struct aml_dma *, struct aml_area *, struct aml_area *,
void *, size_t);
};
int aml_dma_init(struct aml_dma *, unsigned int);
int aml_dma_destroy(struct aml_dma *);
int aml_dma_copy(struct aml_dma *, void *, const void *, size_t);
int aml_dma_move(struct aml_dma *, struct aml_area *, struct aml_area *,
void *, size_t);
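/*
 * Example (informal sketch): synchronous copy through a dma engine; dst,
 * src and size are arbitrary. The engine type argument is ignored for now.
 *
 * struct aml_dma dma;
 * aml_dma_init(&dma, 0);
 * aml_dma_copy(&dma, dst, src, size);
 * aml_dma_destroy(&dma);
 */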
/*******************************************************************************
 * General functions:
 * Initialize internal structures, clean up everything at the end.
 ******************************************************************************/
int aml_block_address(struct aml_alloc *, size_t, void **);
int aml_init(int *argc, char **argv[]);
int aml_finalize(void);
int aml_block_move(struct aml_alloc *, size_t, struct aml_node *);
int aml_block_copy(struct aml_alloc *, size_t, struct aml_alloc *, size_t);
#endif
#include <aml.h>
#include <assert.h>
/* map AML_AREA_TYPE_* to the underlying memkind */
memkind_t *type2kind[AML_AREA_TYPE_MAX] = {
&MEMKIND_HBW_ALL,
&MEMKIND_REGULAR,
};
/*******************************************************************************
 * memkind additional functions:
 * memkind is missing some features; we add them by re-implementing some of
 * the hooks we need.
 ******************************************************************************/
int aml_memkind_areanodemask(struct memkind *kind, unsigned long *nodemask,
unsigned long maxnode)
{
/* transform the kind back into an area; this cast relies on kind being
 * the first member of struct aml_area (see the warning in aml.h) */
struct aml_area *area = (struct aml_area*)kind;
struct bitmask ret = {maxnode, nodemask};
copy_bitmask_to_bitmask(area->nodemask, &ret);
return 0;
}
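/*
 * Note: presumably this hook is meant to be registered with a custom
 * memkind so that allocations get mbind()ed to the area's nodemask; this
 * is an assumption, nothing registers it in this commit yet.
 */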
/*******************************************************************************
 * Area implementation:
 * For now, use memkind internally to implement our areas.
 ******************************************************************************/
int aml_area_init(struct aml_area *area, unsigned int type)
{
assert(type < AML_AREA_TYPE_MAX);
area->kind = *type2kind[type];
area->nodemask = numa_allocate_nodemask();
/* default binding: all nodes; note copy_bitmask_to_bitmask(from, to) */
copy_bitmask_to_bitmask(numa_all_nodes_ptr, area->nodemask);
return 0;
}
int aml_area_from_nodestring(struct aml_area *area, unsigned int type,
const char *nodes)
{
aml_area_init(area, type);
/* aml_area_init allocated a nodemask; free it before replacing it */
numa_bitmask_free(area->nodemask);
area->nodemask = numa_parse_nodestring(nodes);
return 0;
}
int aml_area_from_nodemask(struct aml_area *area, unsigned int type,
struct bitmask *nodes)
{
aml_area_init(area, type);
copy_bitmask_to_bitmask(nodes, area->nodemask);
return 0;
}
int aml_area_destroy(struct aml_area *area)
{
numa_bitmask_free(area->nodemask);
return 0;
}
void *aml_area_malloc(struct aml_area *area, size_t size)
{
return memkind_malloc(area->kind, size);
}
void aml_area_free(struct aml_area *area, void *ptr)
{
memkind_free(area->kind, ptr);
}
void *aml_area_calloc(struct aml_area *area, size_t num, size_t size)
{
return memkind_calloc(area->kind, num, size);
}
void *aml_area_realloc(struct aml_area *area, void *ptr, size_t size)
{
return memkind_realloc(area->kind, ptr, size);
}
void *aml_area_acquire(struct aml_area *area, size_t size)
{
/* as far as we know memkind doesn't zero new areas
 * TODO: find a way to assert it
 */
return aml_area_malloc(area, size);
}
void aml_area_release(struct aml_area *area, void *ptr)
{
/* As far as we know memkind doesn't decommit areas on free
 * TODO: find a way to assert it
 */
aml_area_free(area, ptr);
}
#include <aml.h>
#include <assert.h>
#include <string.h> /* memcpy */
/*******************************************************************************
 * DMA implementation:
 * For now, a single-threaded, synchronous implementation.
 ******************************************************************************/
int aml_dma_init(struct aml_dma *dma, unsigned int type)
{
/* only one engine type for now; nothing to initialize yet */
return 0;
}
int aml_dma_destroy(struct aml_dma *dma)
{
return 0;
}
int aml_dma_copy(struct aml_dma *dma, void *dest, const void *src, size_t size)
{
/* single-threaded synchronous version: a plain memcpy */
memcpy(dest, src, size);
return 0;
}
int aml_dma_move(struct aml_dma *dma, struct aml_area *dest,
struct aml_area *src, void *ptr, size_t size)
{
/* TODO: not implemented yet; see the sketch below */
return 0;
}
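/*
 * Informal sketch of what the synchronous move could look like once
 * implemented, assuming ptr is page-aligned and the destination area maps
 * to a single NUMA node (dest_nid is hypothetical; none of this is wired
 * up yet):
 *
 * size_t count = size / PAGE_SIZE;
 * void **pages = calloc(count, sizeof(*pages));
 * int *nodes = calloc(count, sizeof(*nodes));
 * int *status = calloc(count, sizeof(*status));
 * for (size_t i = 0; i < count; i++) {
 *         pages[i] = (char *)ptr + i * PAGE_SIZE;
 *         nodes[i] = dest_nid;
 * }
 * move_pages(0, count, pages, nodes, status, MPOL_MF_MOVE);
 * free(pages); free(nodes); free(status);
 */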
@@ -10,7 +10,7 @@ TESTS_ENVIRONMENT= @LIBTOOL@ --mode=execute @VALGRIND@ --tool=memcheck -q --leak
endif
# all check programs
TST_PROGS = stream_add_pth stream_add_omp stream_vanilla
check_PROGRAMS = $(TST_PROGS)
TESTS = $(TST_PROGS)
@@ -25,13 +25,15 @@ int main(int argc, char *argv[])
/* we want to back our array on the slow node and use the fast node as
* a faster buffer.
*/
struct aml_area slow, fast;
int type = AML_AREA_TYPE_REGULAR;
assert(!aml_area_from_nodestring(&slow, type, "0"));
assert(!aml_area_from_nodestring(&fast, type, "0"));
/* we are only dealing with one contiguous array */
struct aml_dma dma;
assert(!aml_dma_init(&dma, 0));
void *a, *b, *c;
/* describe the allocation */
size_t chunk_msz, esz;
@@ -43,14 +45,15 @@ int main(int argc, char *argv[])
chunk_msz = MEMSIZE/(numthreads*CHUNKING);
esz = chunk_msz/sizeof(unsigned long);
}
a = aml_area_malloc(&slow, MEMSIZE);
b = aml_area_malloc(&slow, MEMSIZE);
c = aml_area_malloc(&slow, MEMSIZE);
assert(a != NULL && b != NULL && c != NULL);
/* create virtually accessible address range, backed by slow memory */
unsigned long *wa = (unsigned long*)a;
unsigned long *wb = (unsigned long*)b;
unsigned long *wc = (unsigned long*)c;
unsigned long esize = MEMSIZE/sizeof(unsigned long);
for(unsigned long i = 0; i < esize; i++) {
wa[i] = i;
@@ -64,19 +67,19 @@ int main(int argc, char *argv[])
{
for(unsigned long i = 0; i < numthreads*CHUNKING; i++) {
#pragma omp task depend(inout: wa[i*esz:esz])
assert(!aml_dma_move(&dma, &fast, &slow, &wa[i*esz], esz*sizeof(unsigned long)));
#pragma omp task depend(inout: wb[i*esz:esz])
assert(!aml_dma_move(&dma, &fast, &slow, &wb[i*esz], esz*sizeof(unsigned long)));
#pragma omp task depend(inout: wc[i*esz:esz])
assert(!aml_dma_move(&dma, &fast, &slow, &wc[i*esz], esz*sizeof(unsigned long)));
#pragma omp task depend(in: wa[i*esz:esz], wb[i*esz:esz]) depend(out: wc[i*esz:esz])
kernel(&wa[i*esz], &wb[i*esz], &wc[i*esz], esz);
#pragma omp task depend(inout: wa[i*esz:esz])
assert(!aml_dma_move(&dma, &slow, &fast, &wa[i*esz], esz*sizeof(unsigned long)));
#pragma omp task depend(inout: wb[i*esz:esz])
assert(!aml_dma_move(&dma, &slow, &fast, &wb[i*esz], esz*sizeof(unsigned long)));
#pragma omp task depend(inout: wc[i*esz:esz])
assert(!aml_dma_move(&dma, &slow, &fast, &wc[i*esz], esz*sizeof(unsigned long)));
}
}
@@ -85,11 +88,12 @@ int main(int argc, char *argv[])
assert(wc[i] == esize);
}
aml_area_free(&slow, a);
aml_area_free(&slow, b);
aml_area_free(&slow, c);
aml_area_destroy(&slow);
aml_area_destroy(&fast);
aml_dma_destroy(&dma);
aml_finalize();
return 0;
}
@@ -10,7 +10,8 @@
#define MEMSIZE (1UL<<20)
#define CHUNKING 4
struct aml_area slow, fast;
struct aml_dma dma;
int kernel(unsigned long *a, unsigned long *b, unsigned long *c, size_t n)
{
@@ -22,23 +23,23 @@ int kernel(unsigned long *a, unsigned long *b, unsigned long *c, size_t n)
}
struct cinfo {
unsigned long *tab;
pthread_t tid;
size_t size;
};
void *th_copy(void *arg)
{
struct cinfo *ci = arg;
aml_dma_move(&dma, &fast, &slow, ci->tab, ci->size*sizeof(unsigned long));
return arg;
}
struct winfo {
unsigned long *a, *b, *c;
pthread_t *ca, *cb;
pthread_t tid;
size_t size;
};
void *th_work(void *arg)
@@ -47,16 +48,7 @@ void *th_work(void *arg)
pthread_join(*(wi->ca), NULL);
pthread_join(*(wi->cb), NULL);
kernel(wi->a, wi->b, wi->c, wi->size);
return arg;
}
int main(int argc, char *argv[])
@@ -67,11 +59,13 @@ int main(int argc, char *argv[])
/* we want to back our array on the slow node and use the fast node as
* a faster buffer.
*/
assert(!aml_area_from_nodestring(&slow, AML_AREA_TYPE_REGULAR, "0"));
assert(!aml_area_from_nodestring(&fast, AML_AREA_TYPE_REGULAR, "0"));
/* dma is declared at file scope so the copy threads can use it */
assert(!aml_dma_init(&dma, 0));
void *a, *b, *c;
/* describe the allocation */
size_t chunk_msz, esz;
int numthreads, copythreads;
@@ -85,15 +79,15 @@ int main(int argc, char *argv[])
chunk_msz = MEMSIZE/(numthreads*CHUNKING);
esz = chunk_msz/sizeof(unsigned long);
}
printf("th: %lu, mem: %zi, chunk: %zi\n",numthreads,MEMSIZE,chunk_msz);
a = aml_area_malloc(&slow, MEMSIZE);
b = aml_area_malloc(&slow, MEMSIZE);
c = aml_area_malloc(&fast, MEMSIZE);
assert(a != NULL && b != NULL && c != NULL);
/* create virtually accessible address range, backed by slow memory */
unsigned long *wa = (unsigned long*)a;
unsigned long *wb = (unsigned long*)b;
unsigned long *wc = (unsigned long*)c;
unsigned long esize = MEMSIZE/sizeof(unsigned long);
for(unsigned long i = 0; i < esize; i++) {
wa[i] = i;
@@ -107,16 +101,16 @@ int main(int argc, char *argv[])
struct winfo *wis = calloc(numthreads, sizeof(struct winfo));
for(unsigned long i = 0; i < CHUNKING; i++) {
for(unsigned long j = 0; j < numthreads; j++) {
/* chunk (i,j): numthreads chunks per round, esz elements per chunk */
cas[j].tab = &wa[(i*numthreads + j)*esz];
cas[j].size = esz;
cbs[j].tab = &wb[(i*numthreads + j)*esz];
cbs[j].size = esz;
wis[j].a = &wa[(i*numthreads + j)*esz];
wis[j].b = &wb[(i*numthreads + j)*esz];
wis[j].c = &wc[(i*numthreads + j)*esz];
wis[j].ca = &cas[j].tid;
wis[j].cb = &cbs[j].tid;
wis[j].size = esz;
pthread_create(&cas[j].tid, NULL, &th_copy, (void*)&cas[j]);
pthread_create(&cbs[j].tid, NULL, &th_copy, (void*)&cbs[j]);
pthread_create(&wis[j].tid, NULL, &th_work, (void*)&wis[j]);
@@ -134,11 +128,12 @@ int main(int argc, char *argv[])
assert(wc[i] == esize);
}
aml_area_free(&slow, a);
aml_area_free(&slow, b);
aml_area_free(&fast, c);
aml_area_destroy(&slow);
aml_area_destroy(&fast);
aml_dma_destroy(&dma);
aml_finalize();
return 0;
}
/*-----------------------------------------------------------------------*/
/* Program: STREAM */
/* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */
/* Original code developed by John D. McCalpin */
/* Programmers: John D. McCalpin */
/* Joe R. Zagar */
/* */
/* This program measures memory transfer rates in MB/s for simple */
/* computational kernels coded in C. */
/*-----------------------------------------------------------------------*/
/* Copyright 1991-2013: John D. McCalpin */
/*-----------------------------------------------------------------------*/
/* License: */
/* 1. You are free to use this program and/or to redistribute */
/* this program. */
/* 2. You are free to modify this program for your own use, */
/* including commercial use, subject to the publication */
/* restrictions in item 3. */
/* 3. You are free to publish results obtained from running this */
/* program, or from works that you derive from this program, */
/* with the following limitations: */
/* 3a. In order to be referred to as "STREAM benchmark results", */
/* published results must be in conformance to the STREAM */
/* Run Rules, (briefly reviewed below) published at */
/* http://www.cs.virginia.edu/stream/ref.html */
/* and incorporated herein by reference. */
/* As the copyright holder, John McCalpin retains the */
/* right to determine conformity with the Run Rules. */
/* 3b. Results based on modified source code or on runs not in */
/* accordance with the STREAM Run Rules must be clearly */
/* labelled whenever they are published. Examples of */
/* proper labelling include: */
/* "tuned STREAM benchmark results" */
/* "based on a variant of the STREAM benchmark code" */
/* Other comparable, clear, and reasonable labelling is */
/* acceptable. */
/* 3c. Submission of results to the STREAM benchmark web site */
/* is encouraged, but not required. */
/* 4. Use of this program or creation of derived works based on this */
/* program constitutes acceptance of these licensing restrictions. */
/* 5. Absolutely no warranty is expressed or implied. */
/*-----------------------------------------------------------------------*/
# include <aml.h>
# include <stdio.h>
# include <unistd.h>
# include <math.h>
# include <float.h>
# include <limits.h>
# include <sys/time.h>
/*-----------------------------------------------------------------------
* INSTRUCTIONS:
*
* 1) STREAM requires different amounts of memory to run on different
* systems, depending on both the system cache size(s) and the
* granularity of the system timer.
* You should adjust the value of 'STREAM_ARRAY_SIZE' (below)
* to meet *both* of the following criteria:
* (a) Each array must be at least 4 times the size of the
* available cache memory. I don't worry about the difference
* between 10^6 and 2^20, so in practice the minimum array size
* is about 3.8 times the cache size.
* Example 1: One Xeon E3 with 8 MB L3 cache
* STREAM_ARRAY_SIZE should be >= 4 million, giving
* an array size of 30.5 MB and a total memory requirement
* of 91.5 MB.
* Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP)
* STREAM_ARRAY_SIZE should be >= 20 million, giving
* an array size of 153 MB and a total memory requirement
* of 458 MB.
* (b) The size should be large enough so that the 'timing calibration'
* output by the program is at least 20 clock-ticks.
* Example: most versions of Windows have a 10 millisecond timer
* granularity. 20 "ticks" at 10 ms/tic is 200 milliseconds.
* If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec.
* This means that each array must be at least 1 GB, or 128M elements.
*
* Version 5.10 increases the default array size from 2 million
* elements to 10 million elements in response to the increasing
* size of L3 caches. The new default size is large enough for caches
* up to 20 MB.
* Version 5.10 changes the loop index variables from "register int"
* to "ssize_t", which allows array indices >2^32 (4 billion)
* on properly configured 64-bit systems. Additional compiler options
* (such as "-mcmodel=medium") may be required for large memory runs.
*
* Array size can be set at compile time without modifying the source
* code for the (many) compilers that support preprocessor definitions
* on the compile line. E.g.,
* gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M
* will override the default size of 10M with a new size of 100M elements
* per array.
*/
#ifndef STREAM_ARRAY_SIZE
# define STREAM_ARRAY_SIZE 1000000
#endif
/* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result
* for any iteration after the first, therefore the minimum value
* for NTIMES is 2.
* There are no rules on maximum allowable values for NTIMES, but
* values larger than the default are unlikely to noticeably
* increase the reported performance.
* NTIMES can also be set on the compile line without changing the source
* code using, for example, "-DNTIMES=7".
*/
#ifdef NTIMES
#if NTIMES<=1
# define NTIMES 10
#endif
#endif
#ifndef NTIMES
# define NTIMES 10
#endif
/* Users are allowed to modify the "OFFSET" variable, which *may* change the
* relative alignment of the arrays (though compilers may change the
* effective offset by making the arrays non-contiguous on some systems).
* Use of non-zero values for OFFSET can be especially helpful if the
* STREAM_ARRAY_SIZE is set to a value close to a large power of 2.
* OFFSET can also be set on the compile line without changing the source
* code using, for example, "-DOFFSET=56".
*/
#ifndef OFFSET
# define OFFSET 0
#endif
/*
* 3) Compile the code with optimization. Many compilers generate
* unreasonably bad code before the optimizer tightens things up.
* If the results are unreasonably good, on the other hand, the
* optimizer might be too smart for me!
*
* For a simple single-core version, try compiling with:
* cc -O stream.c -o stream
* This is known to work on many, many systems....
*
* To use multiple cores, you need to tell the compiler to obey the OpenMP
* directives in the code. This varies by compiler, but a common example is
* gcc -O -fopenmp stream.c -o stream_omp
* The environment variable OMP_NUM_THREADS allows runtime control of the
* number of threads/cores used when the resulting "stream_omp" program
* is executed.