Commit 47c021c8 authored by Swann Perarnau

[refactor] use new tilings for dgemm_prefetch

Use the new 2D tilings for dgemm_prefetch, and refactor the code to
match the rest of the benchmarks. The code should be a lot cleaner
now.
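
For reference, a rough sketch of how the tiling setup changes (based on the
initialization calls in this file; the byte-size argument names on the old
call are illustrative):

    /* before: generic 2D tiling described by per-tile byte sizes */
    aml_tiling_init(&tiling, AML_TILING_TYPE_2D,
                    tilerowsize, tilecolsize, tilesize, memsize);
    /* after: an explicit (N/T) x (N/T) grid of contiguous row-major tiles */
    aml_tiling_init(&tiling_row, AML_TILING_TYPE_2D_CONTIG_ROWMAJOR,
                    tilesize, memsize, N/T, N/T);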
parent 9764f3c6
#include <aml.h>
#include <assert.h>
#include <errno.h>
#include <mkl.h>
#include <numa.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
AML_TILING_2D_CONTIG_ROWMAJOR_DECL(tiling_row);
AML_TILING_2D_CONTIG_COLMAJOR_DECL(tiling_col);
AML_TILING_1D_DECL(tiling_prefetch);
AML_AREA_LINUX_DECL(slow);
AML_AREA_LINUX_DECL(fast);
AML_SCRATCH_PAR_DECL(sa);
AML_SCRATCH_PAR_DECL(sb);
size_t memsize, tilesize, N, T;
double *a, *b, *c;
struct timespec start, stop;
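/*
 * Tiled DGEMM with explicit prefetch: while the tiles of the current column
 * of A and B are being multiplied, the next column is pulled asynchronously
 * into fast memory through the scratchpads.
 */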
void do_work()
{
	int lda = (int)T, ldb, ldc;
	ldb = lda;
	ldc = lda;
	size_t ndims[2];
	double *prea, *preb;
	int ai, bi, oldai, oldbi;
	void *abaseptr, *bbaseptr;
	struct aml_scratch_request *ar, *br;

	aml_tiling_ndims(&tiling_row, &ndims[0], &ndims[1]);
	abaseptr = aml_scratch_baseptr(&sa);
	bbaseptr = aml_scratch_baseptr(&sb);
	prea = aml_tiling_tilestart(&tiling_prefetch, a, 0);
	preb = aml_tiling_tilestart(&tiling_prefetch, b, 0);
	ai = -1; bi = -1;
	for(int k = 0; k < ndims[1]; k++)
	{
		oldbi = bi;
		oldai = ai;
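		/* start pulling the next column of A and B tiles into fast memory */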
		aml_scratch_async_pull(&sa, &ar, abaseptr, &ai, a, k + 1);
		aml_scratch_async_pull(&sb, &br, bbaseptr, &bi, b, k + 1);
		#pragma omp parallel for
		for(int i = 0; i < ndims[0]; i++)
		{
			for(int j = 0; j < ndims[1]; j++)
			{
				/* per-iteration locals stay private to each OpenMP thread */
				double *ap, *bp, *cp;
				size_t coff;
				ap = aml_tiling_tilestart(&tiling_row, prea, i);
				bp = aml_tiling_tilestart(&tiling_row, preb, j);
				coff = i*ndims[1] + j;
				cp = aml_tiling_tilestart(&tiling_row, c, coff);
				cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, ldc, lda, ldb, 1.0, ap, lda, bp, ldb, 1.0, cp, ldc);
			}
		}
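		/* wait for the prefetches, swap in the new tiles, release the old ones */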
		aml_scratch_wait(&sa, ar);
		aml_scratch_wait(&sb, br);
		prea = aml_tiling_tilestart(&tiling_prefetch, abaseptr, ai);
		preb = aml_tiling_tilestart(&tiling_prefetch, bbaseptr, bi);
		aml_scratch_release(&sa, oldai);
		aml_scratch_release(&sb, oldbi);
	}
}
int main(int argc, char* argv[])
{
	AML_ARENA_JEMALLOC_DECL(arena);
	AML_DMA_LINUX_SEQ_DECL(dma);
	struct bitmask *slowb, *fastb;
	aml_init(&argc, &argv);
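	/* arguments: <fast NUMA node(s)> <slow NUMA node(s)> <N> <T> */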
	assert(argc == 5);
	fastb = numa_parse_nodestring_all(argv[1]);
	slowb = numa_parse_nodestring_all(argv[2]);
	N = atol(argv[3]);
	T = atol(argv[4]);
	/* let's not handle messy tile sizes */
	assert(N % T == 0);
	memsize = sizeof(double)*N*N;
	tilesize = sizeof(double)*T*T;
	/* the initial tiling: a 2D grid of tiles */
	assert(!aml_tiling_init(&tiling_row, AML_TILING_TYPE_2D_CONTIG_ROWMAJOR,
				tilesize, memsize, N/T, N/T));
	assert(!aml_tiling_init(&tiling_col, AML_TILING_TYPE_2D_CONTIG_COLMAJOR,
				tilesize, memsize, N/T, N/T));
	/* the prefetch tiling: a 1D sequence of columns of tiles */
	assert(!aml_tiling_init(&tiling_prefetch, AML_TILING_TYPE_1D,
				tilesize*(N/T), memsize));
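	/* the inputs (a, b) are bound to the slow nodes; the output (c) and the
	 * prefetch scratchpads are bound to the fast nodes */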
	assert(!aml_arena_jemalloc_init(&arena, AML_ARENA_JEMALLOC_TYPE_REGULAR));
	assert(!aml_area_linux_init(&slow,
				    AML_AREA_LINUX_MANAGER_TYPE_SINGLE,
				    AML_AREA_LINUX_MBIND_TYPE_REGULAR,
				    AML_AREA_LINUX_MMAP_TYPE_ANONYMOUS,
				    &arena, MPOL_BIND, slowb->maskp));
	assert(!aml_area_linux_init(&fast,
				    AML_AREA_LINUX_MANAGER_TYPE_SINGLE,
				    AML_AREA_LINUX_MBIND_TYPE_REGULAR,
				    AML_AREA_LINUX_MMAP_TYPE_ANONYMOUS,
				    &arena, MPOL_BIND, fastb->maskp));
	assert(!aml_dma_linux_seq_init(&dma, 2));
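	/* parallel scratchpads in fast memory; the trailing (2, 2) arguments are
	 * assumed to be the tile and request counts, i.e. room for two prefetch
	 * tiles (tile columns) so one can be fetched while the other is in use */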
	assert(!aml_scratch_par_init(&sa, &fast, &slow, &dma, &tiling_prefetch, 2, 2));
	assert(!aml_scratch_par_init(&sb, &fast, &slow, &dma, &tiling_prefetch, 2, 2));
	/* allocation */
	a = aml_area_malloc(&slow, memsize);
	b = aml_area_malloc(&slow, memsize);
	c = aml_area_malloc(&fast, memsize);
	assert(a != NULL && b != NULL && c != NULL);
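	/* initialize the matrices: random inputs, zeroed output */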
	for(unsigned long i = 0; i < N*N; i++) {
		a[i] = (double)rand();
		b[i] = (double)rand();
		c[i] = 0.0;
	}
	clock_gettime(CLOCK_REALTIME, &start);
	do_work();
	clock_gettime(CLOCK_REALTIME, &stop);
	long long int time = 0;
	time = (stop.tv_nsec - start.tv_nsec) +
		1e9 * (stop.tv_sec - start.tv_sec);
	double flops = (2.0*N*N*N)/(time/1e9);
	/* print the flops in GFLOPS */
	printf("dgemm-prefetch: %zu %zu %lld %f\n", N, memsize, time,
	       flops/1e9);
	aml_scratch_par_destroy(&sa);
	aml_scratch_par_destroy(&sb);
	aml_dma_linux_seq_destroy(&dma);
	aml_area_free(&slow, a);
	aml_area_free(&slow, b);
	aml_area_free(&fast, c);
	aml_area_linux_destroy(&slow);
	aml_area_linux_destroy(&fast);
	aml_tiling_destroy(&tiling_row, AML_TILING_TYPE_2D_CONTIG_ROWMAJOR);
	aml_tiling_destroy(&tiling_col, AML_TILING_TYPE_2D_CONTIG_COLMAJOR);
	aml_tiling_destroy(&tiling_prefetch, AML_TILING_TYPE_1D);
	aml_finalize();
	return 0;
}