dgemm_noprefetch.c 5.1 KB
Newer Older
Swann Perarnau's avatar
Swann Perarnau committed
1 2 3 4 5 6 7 8 9 10
/*******************************************************************************
 * Copyright 2019 UChicago Argonne, LLC.
 * (c.f. AUTHORS, LICENSE)
 *
 * This file is part of the AML project.
 * For more info, see https://xgitlab.cels.anl.gov/argo/aml
 *
 * SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/

11
#include "aml.h"
Nicolas Denoyelle's avatar
Nicolas Denoyelle committed
12
#include "aml/area/linux.h"
13
#include "aml/tiling/2d.h"
14 15
#include <assert.h>
#include <errno.h>
16
#include <mkl.h>
17 18 19 20 21 22 23
#include <omp.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>

24 25
AML_TILING_2D_ROWMAJOR_DECL(tiling_row);
AML_TILING_2D_COLMAJOR_DECL(tiling_col);
Nicolas Denoyelle's avatar
Nicolas Denoyelle committed
26
struct aml_area *slow, *fast;
27
size_t memsize, tilesize, N, T;
28
double *a, *b, *c;
29
struct timespec start, stop;
30 31 32

void do_work()
{
33
	int lda = (int)T, ldb, ldc;
34 35
	ldb = lda;
	ldc = lda;
36
	size_t ndims[2];
37
	aml_tiling_ndims(&tiling_row, &ndims[0], &ndims[1]);
38

39
	for(int k = 0; k < ndims[1]; k++)
40
	{
41 42
		#pragma omp parallel for
		for(int i = 0; i < ndims[0]; i++)
43
		{
44
			for(int j = 0; j < ndims[1]; j++)
45
			{
46 47
				size_t aoff, boff, coff;
				double *ap, *bp, *cp;
48 49 50
				aoff = aml_tiling_tileid(&tiling_col, i, k);
				boff = aml_tiling_tileid(&tiling_row, k, j);
				coff = aml_tiling_tileid(&tiling_row, i, j);
51 52 53
				ap = aml_tiling_tilestart(&tiling_col, a, aoff);
				bp = aml_tiling_tilestart(&tiling_row, b, boff);
				cp = aml_tiling_tilestart(&tiling_row, c, coff);
54
				cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, ldc, lda, ldb, 1.0, ap, lda, bp, ldb, 1.0, cp, ldc);
55 56
			}
		}
57
	}
58 59
}

60
int main(int argc, char* argv[])
61
{
Nicolas Denoyelle's avatar
Nicolas Denoyelle committed
62
	struct aml_bitmap slowb, fastb;
63 64
	aml_init(&argc, &argv);
	assert(argc == 5);
Nicolas Denoyelle's avatar
Nicolas Denoyelle committed
65 66
	assert(aml_bitmap_from_string(&fastb, argv[1]) == 0);
	assert(aml_bitmap_from_string(&slowb, argv[2]) == 0);
67 68 69 70 71 72 73 74
	N = atol(argv[3]);
	T = atol(argv[4]);
	/* let's not handle messy tile sizes */
	assert(N % T == 0);
	memsize = sizeof(double)*N*N;
	tilesize = sizeof(double)*T*T;

	/* the initial tiling, of 2D square tiles */
75
	assert(!aml_tiling_2d_init(&tiling_row, AML_TILING_TYPE_2D_ROWMAJOR,
76
				tilesize, memsize, N/T , N/T));
77
	assert(!aml_tiling_2d_init(&tiling_col, AML_TILING_TYPE_2D_COLMAJOR,
78
				tilesize, memsize, N/T , N/T));
79

80
	aml_area_linux_create(&slow, AML_AREA_LINUX_MMAP_FLAG_PRIVATE,
Nicolas Denoyelle's avatar
Nicolas Denoyelle committed
81 82
				     &slowb, AML_AREA_LINUX_BINDING_FLAG_BIND);
	assert(slow != NULL);
83
	aml_area_linux_create(&fast, AML_AREA_LINUX_MMAP_FLAG_PRIVATE,
Nicolas Denoyelle's avatar
Nicolas Denoyelle committed
84 85 86
				     &fastb, AML_AREA_LINUX_BINDING_FLAG_BIND);
	assert(fast != NULL);

87
	/* allocation */
Nicolas Denoyelle's avatar
Nicolas Denoyelle committed
88 89 90
	a = aml_area_mmap(slow, NULL, memsize);
	b = aml_area_mmap(slow, NULL, memsize);
	c = aml_area_mmap(fast, NULL, memsize);
91
	assert(a != NULL && b != NULL && c != NULL);
92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123

	size_t ntilerows, ntilecols, tilerowsize, tilecolsize, rowsize, colsize;
	rowsize = colsize = N;
	tilerowsize = tilecolsize = T;
	ntilerows = ntilecols = N/T;
	for(unsigned long i = 0; i < N*N; i+=tilerowsize) {
		size_t tilerow, tilecol, row, column;
		/* Tile row index (row-major).  */
		tilerow = i / (tilerowsize * tilecolsize * ntilerows);
		/* Tile column index (row-major).  */
		tilecol = (i / tilerowsize) % ntilerows;
		/* Row index within a tile (row-major).  */
		row = (i / rowsize) % tilecolsize;
		/* Column index within a tile (row-major).  */
		/* column = i % tilerowsize; */

		size_t a_offset, b_offset;
		/* Tiles in A need to be transposed (column-major).  */
		a_offset = (tilecol * ntilecols + tilerow) *
			tilerowsize * tilecolsize +
			row * tilerowsize;
		/* Tiles in B are in row-major order.  */
		b_offset = (tilerow * ntilerows + tilecol) *
			tilerowsize * tilecolsize +
			row * tilerowsize;
		for (column = 0; column < tilerowsize; column++) {
			a[a_offset + column] = (double)rand();
			b[b_offset + column] = (double)rand();
			/* C is tiled as well (row-major) but since it's
			   all-zeros at this point, we don't bother.  */
			c[i+column] = 0.0;
		}
124
	}
125

126 127 128 129 130 131 132
	clock_gettime(CLOCK_REALTIME, &start);
	do_work();
	clock_gettime(CLOCK_REALTIME, &stop);
	long long int time = 0;
	time =  (stop.tv_nsec - start.tv_nsec) +
                1e9* (stop.tv_sec - start.tv_sec);
	double flops = (2.0*N*N*N)/(time/1e9);
133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150

	/* De-tile the result matrix (C).  I couldn't figure out how to do
	   it in-place so we are de-tiling to the A matrix.  */
	for(unsigned long i = 0; i < N*N; i+=tilerowsize) {
		size_t tilerow, tilecol, row;
		/* Tile row index (row-major).  */
		tilerow = i / (tilerowsize * tilecolsize * ntilerows);
		/* Tile column index (row-major).  */
		tilecol = (i / tilerowsize) % ntilerows;
		/* Row index within a tile (row-major).  */
		row = (i / rowsize) % tilecolsize;
		/* i converted to tiled.  */
		unsigned long tiledi = (tilerow * ntilerows + tilecol) *
			tilerowsize * tilecolsize + row * tilerowsize;

		memcpy(&a[i], &c[tiledi], tilerowsize*sizeof(double));
	}

151
	/* print the flops in GFLOPS */
152 153
	printf("dgemm-noprefetch: %llu %lld %lld %f\n", N, memsize, time,
	       flops/1e9);
Nicolas Denoyelle's avatar
Nicolas Denoyelle committed
154 155 156
	aml_area_munmap(slow, a, memsize);
	aml_area_munmap(slow, b, memsize);
	aml_area_munmap(fast, c, memsize);
157 158
	aml_area_linux_destroy(&slow);
	aml_area_linux_destroy(&fast);
159 160
	aml_tiling_2d_fini(&tiling_row);
	aml_tiling_2d_fini(&tiling_col);
161
	aml_finalize();
162 163
	return 0;
}