dgemm_prefetch.c 6.38 KB
Newer Older
Swann Perarnau's avatar
Swann Perarnau committed
1 2 3 4 5 6 7 8 9 10
/*******************************************************************************
 * Copyright 2019 UChicago Argonne, LLC.
 * (c.f. AUTHORS, LICENSE)
 *
 * This file is part of the AML project.
 * For more info, see https://xgitlab.cels.anl.gov/argo/aml
 *
 * SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/

11
#include "aml.h"
Nicolas Denoyelle's avatar
Nicolas Denoyelle committed
12
#include "aml/area/linux.h"
13
#include "aml/dma/linux-seq.h"
14
#include "aml/scratch/par.h"
15 16
#include "aml/tiling/1d.h"
#include "aml/tiling/2d.h"
17 18
#include <assert.h>
#include <errno.h>
19
#include <mkl.h>
20 21 22 23 24 25 26
#include <omp.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>

27 28
AML_TILING_2D_ROWMAJOR_DECL(tiling_row);
AML_TILING_2D_COLMAJOR_DECL(tiling_col);
29
AML_TILING_1D_DECL(tiling_prefetch);
30 31 32
AML_SCRATCH_PAR_DECL(sa);
AML_SCRATCH_PAR_DECL(sb);

Nicolas Denoyelle's avatar
Nicolas Denoyelle committed
33
struct aml_area *slow, *fast;
34
size_t memsize, tilesize, N, T;
35
double *a, *b, *c;
36
struct timespec start, stop;
37 38 39

void do_work()
{
40
	int lda = (int)T, ldb, ldc;
41 42
	ldb = lda;
	ldc = lda;
43 44 45 46 47 48
	size_t ndims[2];
	double *prea, *preb;
	int ai, bi, oldai, oldbi;
	void *abaseptr, *bbaseptr;
	struct aml_scratch_request *ar, *br;
	aml_tiling_ndims(&tiling_row, &ndims[0], &ndims[1]);
49 50
	abaseptr = aml_scratch_baseptr(&sa);
	bbaseptr = aml_scratch_baseptr(&sb);
51 52
	prea = aml_tiling_tilestart(&tiling_prefetch, a, 0);
	preb = aml_tiling_tilestart(&tiling_prefetch, b, 0);
53 54
	ai = -1; bi = -1;

55 56
	for(int k = 0; k < ndims[1]; k++)
	{
57 58
		oldbi = bi;
		oldai = ai;
59 60
		aml_scratch_async_pull(&sa, &ar, abaseptr, &ai, a, k + 1);
		aml_scratch_async_pull(&sb, &br, bbaseptr, &bi, b, k + 1);
61
		#pragma omp parallel for
62
		for(int i = 0; i < ndims[0]; i++)
63
		{
64 65
			for(int j = 0; j < ndims[1]; j++)
			{
66 67
				size_t coff;
				double *ap, *bp, *cp;
68 69
				ap = aml_tiling_tilestart(&tiling_row, prea, i);
				bp = aml_tiling_tilestart(&tiling_row, preb, j);
70
				coff = aml_tiling_tileid(&tiling_row, i, j);
71 72
				cp = aml_tiling_tilestart(&tiling_row, c, coff);
				cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, ldc, lda, ldb, 1.0, ap, lda, bp, ldb, 1.0, cp, ldc);
73
			}
74
		}
75 76
		aml_scratch_wait(&sa, ar);
		aml_scratch_wait(&sb, br);
77 78
		prea = aml_tiling_tilestart(&tiling_prefetch, abaseptr, ai);
		preb = aml_tiling_tilestart(&tiling_prefetch, bbaseptr, bi);
79 80 81 82 83
		aml_scratch_release(&sa, oldai);
		aml_scratch_release(&sb, oldbi);
	}
}

84
int main(int argc, char* argv[])
85
{
86
	AML_DMA_LINUX_SEQ_DECL(dma);
Nicolas Denoyelle's avatar
Nicolas Denoyelle committed
87
	struct aml_bitmap slowb, fastb;
88 89
	aml_init(&argc, &argv);
	assert(argc == 5);
Nicolas Denoyelle's avatar
Nicolas Denoyelle committed
90 91
	assert(aml_bitmap_from_string(&fastb, argv[1]) == 0);
	assert(aml_bitmap_from_string(&slowb, argv[2]) == 0);
92 93 94 95 96 97 98 99
	N = atol(argv[3]);
	T = atol(argv[4]);
	/* let's not handle messy tile sizes */
	assert(N % T == 0);
	memsize = sizeof(double)*N*N;
	tilesize = sizeof(double)*T*T;

	/* the initial tiling, 2d grid of tiles */
100
	assert(!aml_tiling_2d_init(&tiling_row, AML_TILING_TYPE_2D_ROWMAJOR,
101
				tilesize, memsize, N/T , N/T));
102
	assert(!aml_tiling_2d_init(&tiling_col, AML_TILING_TYPE_2D_COLMAJOR,
103 104
				tilesize, memsize, N/T , N/T));
	/* the prefetch tiling, 1D sequence of columns of tiles */
105
	assert(!aml_tiling_1d_init(&tiling_prefetch,
106
				tilesize*(N/T), memsize));
107

108
	aml_area_linux_create(&slow, AML_AREA_LINUX_MMAP_FLAG_PRIVATE,
Nicolas Denoyelle's avatar
Nicolas Denoyelle committed
109 110
				     &slowb, AML_AREA_LINUX_BINDING_FLAG_BIND);
	assert(slow != NULL);
111
	aml_area_linux_create(&fast, AML_AREA_LINUX_MMAP_FLAG_PRIVATE,
Nicolas Denoyelle's avatar
Nicolas Denoyelle committed
112 113 114
				     &fastb, AML_AREA_LINUX_BINDING_FLAG_BIND);
	assert(fast != NULL);
	
115
	assert(!aml_dma_linux_seq_init(&dma, 2));
116 117
	assert(!aml_scratch_par_init(&sa, &fast, &slow, &dma, &tiling_prefetch, (size_t)2, (size_t)2));
	assert(!aml_scratch_par_init(&sb, &fast, &slow, &dma, &tiling_prefetch, (size_t)2, (size_t)2));
118
	/* allocation */
Nicolas Denoyelle's avatar
Nicolas Denoyelle committed
119 120 121
	a = aml_area_mmap(slow, NULL, memsize);
	b = aml_area_mmap(slow, NULL, memsize);
	c = aml_area_mmap(fast, NULL, memsize);
122
	assert(a != NULL && b != NULL && c != NULL);
123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154

	size_t ntilerows, ntilecols, tilerowsize, tilecolsize, rowsize, colsize;
	rowsize = colsize = N;
	tilerowsize = tilecolsize = T;
	ntilerows = ntilecols = N/T;
	for(unsigned long i = 0; i < N*N; i+=tilerowsize) {
		size_t tilerow, tilecol, row, column;
		/* Tile row index (row-major).  */
		tilerow = i / (tilerowsize * tilecolsize * ntilerows);
		/* Tile column index (row-major).  */
		tilecol = (i / tilerowsize) % ntilerows;
		/* Row index within a tile (row-major).  */
		row = (i / rowsize) % tilecolsize;
		/* Column index within a tile (row-major).  */
		/* column = i % tilerowsize; */

		size_t a_offset, b_offset;
		/* Tiles in A need to be transposed (column-major).  */
		a_offset = (tilecol * ntilecols + tilerow) *
			tilerowsize * tilecolsize +
			row * tilerowsize;
		/* Tiles in B are in row-major order.  */
		b_offset = (tilerow * ntilerows + tilecol) *
			tilerowsize * tilecolsize +
			row * tilerowsize;
		for (column = 0; column < tilerowsize; column++) {
			a[a_offset + column] = (double)rand();
			b[b_offset + column] = (double)rand();
			/* C is tiled as well (row-major) but since it's
			   all-zeros at this point, we don't bother.  */
			c[i+column] = 0.0;
		}
155
	}
156

157 158 159 160 161 162 163
	clock_gettime(CLOCK_REALTIME, &start);
	do_work();
	clock_gettime(CLOCK_REALTIME, &stop);
	long long int time = 0;
	time =  (stop.tv_nsec - start.tv_nsec) +
                1e9* (stop.tv_sec - start.tv_sec);
	double flops = (2.0*N*N*N)/(time/1e9);
164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181

	/* De-tile the result matrix (C).  I couldn't figure out how to do
	   it in-place so we are de-tiling to the A matrix.  */
	for(unsigned long i = 0; i < N*N; i+=tilerowsize) {
		size_t tilerow, tilecol, row;
		/* Tile row index (row-major).  */
		tilerow = i / (tilerowsize * tilecolsize * ntilerows);
		/* Tile column index (row-major).  */
		tilecol = (i / tilerowsize) % ntilerows;
		/* Row index within a tile (row-major).  */
		row = (i / rowsize) % tilecolsize;
		/* i converted to tiled.  */
		unsigned long tiledi = (tilerow * ntilerows + tilecol) *
			tilerowsize * tilecolsize + row * tilerowsize;

		memcpy(&a[i], &c[tiledi], tilerowsize*sizeof(double));
	}

182 183 184
	/* print the flops in GFLOPS */
	printf("dgemm-prefetch: %llu %lld %lld %f\n", N, memsize, time,
	       flops/1e9);
185 186
	aml_scratch_par_fini(&sa);
	aml_scratch_par_fini(&sb);
187
	aml_dma_linux_seq_fini(&dma);
Nicolas Denoyelle's avatar
Nicolas Denoyelle committed
188 189 190
	aml_area_munmap(slow, a, memsize);
	aml_area_munmap(slow, b, memsize);
	aml_area_munmap(fast, c, memsize);
191 192
	aml_area_linux_destroy(&slow);
	aml_area_linux_destroy(&fast);
193 194 195
	aml_tiling_2d_fini(&tiling_row);
	aml_tiling_2d_fini(&tiling_col);
	aml_tiling_1d_fini(&tiling_prefetch);
196
	aml_finalize();
197 198
	return 0;
}