Commit 0244130a authored by Florence Monna's avatar Florence Monna Committed by Swann Perarnau
Browse files

[ci/features] New benchmarks based on BLAS L1

BLAS Level 1 benchmarks with flat and tile examples are implemented, except for the rotations generator functions.
parent 1c4b94d4
......@@ -14,7 +14,7 @@ repoquality:
- /^wip.*/
- /^WIP.*/
script:
- git ls-files '*.c' '*.h' | grep -vE "benchmarks" > .repoquality
- git ls-files '*.c' '*.h' > .repoquality
- nix run -f "$ARGOPKGS" repoquality --command repoquality
tags:
- nix
......@@ -74,7 +74,7 @@ make:generic:
- /^wip.*/
- /^WIP.*/
variables:
CFLAGS: "-std=c99 -pedantic -Wall -Wextra -Werror -Wno-unused-but-set-parameter"
CFLAGS: "-std=c99 -pedantic -Wall -Wextra -Werror -Wno-unused-but-set-parameter -Wno-builtin-declaration-mismatch"
script:
- |
nix-shell --run bash <<< '
......@@ -100,7 +100,7 @@ make:out-of-tree:
- /^wip.*/
- /^WIP.*/
variables:
CFLAGS: "-std=c99 -pedantic -Wall -Wextra -Werror -Wno-unused-but-set-parameter"
CFLAGS: "-std=c99 -pedantic -Wall -Wextra -Werror -Wno-unused-but-set-parameter -Wno-builtin-declaration-mismatch"
script:
- |
nix-shell --run bash <<< '
......@@ -123,13 +123,16 @@ make:knl:
except:
- /^wip.*/
- /^WIP.*/
variables:
CFLAGS: "-mkl -xhost"
CC: "icc"
tags:
- knl
script:
- source /opt/intel/compilers_and_libraries/linux/bin/compilervars.sh intel64
- ./autogen.sh
- mkdir build
- CC=icc CFLAGS="-mkl -xhost" ./configure --prefix=`pwd`/build --enable-benchmarks
- ./configure --prefix=`pwd`/build
- make -j64
- make -C tests check
- make install
......
ACLOCAL_AMFLAGS = -I m4
SUBDIRS = src include tests doc
#if ADD_BENCHMARKS
#SUBDIRS += benchmarks
#endif
SUBDIRS = src include tests doc benchmarks
pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = aml.pc
......
AM_CFLAGS = -I$(top_srcdir)/include -I. $(OPENMP_CFLAGS)
AM_LDFLAGS = ../src/libaml.la $(OPENMP_CFLAGS)
noinst_LIBRARIES = libutils.a
libutils_a_SOURCES = utils.c utils.h
LDADD = libutils.a
noinst_PROGRAMS = stream_add_pth_st \
stream_add_omp_st \
stream_add_omp_mt \
dgemm_vanilla \
dgemm_mkl \
dgemm_prefetch \
dgemm_noprefetch
AM_COLOR_TESTS = yes
AM_CFLAGS = -I$(top_srcdir)/include $(PTHREAD_CFLAGS) $(OPENMP_CFLAGS)
AM_LDFLAGS = ../src/libaml.la $(PTHREAD_LIBS) $(OPENMP_CFLAGS)
noinst_LIBRARIES = libkernel.a
libkernel_a_SOURCES = utils.c utils.h blas_l1_kernel.c blas_l1_kernel.h verify_blas_l1.c verify_blas_l1.h
LDADD = libkernel.a
NOPREFETCH = noprefetch/flat_blas_l1 \
noprefetch/tiled_blas_l1
BENCHMARKS = $(NOPREFETCH)
# all tests
check_PROGRAMS = $(BENCHMARKS)
TESTS = $(BENCHMARKS)
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
******************************************************************************/
/*
* This is a benchmark for the BLAS Level 1 operations for AML.
*/
#include "blas_l1_kernel.h"
/*
 * sign(a): evaluates to 1 when a > 0, to -1 when a < 0 and to 0 otherwise
 * (including when a is NaN, since both comparisons are then false).
 * The argument is parenthesized so that an expression built from
 * low-precedence operators is compared as a whole. It can be evaluated
 * twice, so it must not have side effects.
 * TODO: look into another way to define this.
 */
#define sign(a) (((a) > 0) ? 1 : (((a) < 0) ? -1 : 0))
/*
 * Sum of absolute values: returns fabs(a[0]) + ... + fabs(a[n-1]), which is
 * 0 when n == 0.
 * b, c and scalar are unused; they are part of the signature shared by the
 * non-rotation kernels. b and c are never dereferenced, so NULL is accepted.
 */
double dasum(size_t n, double *a, double *b, double *c, double scalar)
{
	/* Mark the parameters as unused without touching what they point to. */
	(void)b;
	(void)c;
	(void)scalar;

	double sum = 0;

	for (size_t i = 0; i < n; i++)
		sum = sum + fabs(a[i]);
	return sum;
}
/*
 * Scaled vector addition: stores b[k] + scalar * a[k] into c[k] for every
 * k in [0, n). Iterations are independent and are distributed over threads
 * when OpenMP is enabled. Always returns 1.
 */
double daxpy(size_t n, double *a, double *b, double *c, double scalar)
{
	size_t k;

#pragma omp parallel for
	for (k = 0; k < n; ++k) {
		c[k] = b[k] + scalar * a[k];
	}

	return 1.0;
}
/*
 * Vector copy: b[i] = a[i] for every i in [0, n). Always returns 1.
 * c and scalar are unused; c is never dereferenced, so NULL is accepted.
 */
double dcopy(size_t n, double *a, double *b, double *c, double scalar)
{
	/* Mark the parameters as unused without touching what c points to. */
	(void)c;
	(void)scalar;

	size_t i;

#pragma omp parallel for
	for (i = 0; i < n; i++)
		b[i] = a[i];
	return 1;
}
/*
 * Dot product: returns the sum of a[i] * b[i] for i in [0, n).
 * The double products are accumulated in a long double (through a "+"
 * reduction when OpenMP is enabled) and the total is converted back to
 * double on return.
 * c and scalar are unused; c is never dereferenced, so NULL is accepted.
 */
double ddot(size_t n, double *a, double *b, double *c, double scalar)
{
	/* Mark the parameters as unused without touching what c points to. */
	(void)c;
	(void)scalar;

	size_t i;
	long double dot = 0.0;

#pragma omp parallel for reduction(+ : dot)
	for (i = 0; i < n; i++) {
		long double temp;

		temp = a[i] * b[i];
		dot += temp;
	}
	return (double)dot;
}
double dnrm2(size_t n, double *a, double *b, double *c, double scalar)
{
(void)*b;
(void)*c;
(void)scalar;
size_t i;
double scale, ssq, temp;
scale = 0.0;
ssq = 1.0;
for (i = 0; i < n; i++) {
if (a[i] != 0.0) {
temp = fabs(a[i]);
if (scale < temp) {
ssq = 1.0 + ssq * pow(scale / temp, 2);
scale = temp;
} else
ssq = ssq + pow(temp / scale, 2);
}
}
return scale * sqrt(ssq);
}
/*
 * Vector scaling: b[i] = scalar * a[i] for every i in [0, n); a itself is
 * not modified. Always returns 1.
 * c is unused and never dereferenced, so NULL is accepted.
 */
double dscal(size_t n, double *a, double *b, double *c, double scalar)
{
	/* Mark the parameter as unused without touching what it points to. */
	(void)c;

	size_t i;

#pragma omp parallel for
	for (i = 0; i < n; i++)
		b[i] = scalar * a[i];
	return 1;
}
/*
 * Vector swap: exchanges a[i] and b[i] for every i in [0, n).
 * Always returns 1.
 * c and scalar are unused; c is never dereferenced, so NULL is accepted.
 */
double dswap(size_t n, double *a, double *b, double *c, double scalar)
{
	/* Mark the parameters as unused without touching what c points to. */
	(void)c;
	(void)scalar;

	size_t i;

#pragma omp parallel for
	for (i = 0; i < n; i++) {
		/*
		 * The temporary is declared inside the loop body so that each
		 * iteration (and therefore each thread) has its own copy; a
		 * variable declared outside the parallel loop would be shared.
		 */
		double temp = a[i];

		a[i] = b[i];
		b[i] = temp;
	}
	return 1;
}
/*
 * Index of the largest-magnitude entry: returns, as a double, the 0-based
 * index of the first element of a[0..n-1] whose absolute value is maximal.
 * Returns 0 when n is 0 or 1, without reading a.
 * b, c and scalar are unused; b and c are never dereferenced, so NULL is
 * accepted.
 */
double idmax(size_t n, double *a, double *b, double *c, double scalar)
{
	/* Mark the parameters as unused without touching what they point to. */
	(void)b;
	(void)c;
	(void)scalar;

	/* Nothing to compare; this also keeps a[0] unread when n == 0. */
	if (n <= 1)
		return 0;

	size_t id_max = 0;
	/* Start from the magnitude of a[0] so a negative a[0] can still win. */
	double max = fabs(a[0]);

	for (size_t i = 1; i < n; i++) {
		const double mag = fabs(a[i]);

		/* Strict comparison: on ties the earliest index is kept. */
		if (mag > max) {
			id_max = i;
			max = mag;
		}
	}
	return (double)id_max;
}
/* The rotations. Not included in the array of functions because of their
parameters */
/*
 * Plane rotation applied in place to the pairs (a[i], b[i]), i in [0, n):
 *   a[i] <- x * a[i] + y * b[i]
 *   b[i] <- x * b[i] - y * a[i]
 * where the right-hand sides use the values of a[i] and b[i] from before
 * the update.
 */
void drot(size_t n, double *a, double *b, double x, double y)
{
	size_t i;

#pragma omp parallel for
	for (i = 0; i < n; i++) {
		/*
		 * Declared inside the loop body so that each iteration (and
		 * therefore each thread) has its own copy; a variable declared
		 * outside the parallel loop would be shared.
		 */
		double temp = x * a[i] + y * b[i];

		b[i] = x * b[i] - y * a[i];
		a[i] = temp;
	}
}
/*
 * Create a plane rotation. TODO: Verify
 *
 * NOTE(review): x, y, c and s are all received by value, so every result
 * computed here is stored in a local copy only and nothing reaches the
 * caller. The outputs would have to be passed by pointer for this function
 * to have an observable effect.
 */
void drotg(double x, double y, double c, double s)
{
	const double abs_x = fabs(x);
	const double abs_y = fabs(y);
	const double norm1 = abs_x + abs_y;
	/* The input of larger magnitude decides the sign of the radius. */
	const double pivot = (abs_x > abs_y) ? x : y;
	double radius, recon;

	if (norm1 == 0.0) {
		c = 1.0;
		s = 0.0;
		radius = 0.0;
		recon = 0.0;
	} else {
		radius = norm1 * sqrt(pow(x / norm1, 2) + pow(y / norm1, 2));
		radius = sign(pivot) * radius;
		c = x / radius;
		s = y / radius;
		recon = 1.0;
		if (abs_x > abs_y)
			recon = s;
		if (abs_y >= abs_x && c != 0.0)
			recon = 1.0 / c;
	}
	x = radius;
	y = recon;
}
/*
 * Apply a modified rotation, described by param, in place to the pairs
 * (a[i], b[i]) for i in [0, n):
 *   a[i] <- a[i] * h11 + b[i] * h12
 *   b[i] <- a[i] * h21 + b[i] * h22
 * where the right-hand sides use the values from before the update.
 *
 * param[0] is a flag selecting how the 2x2 matrix is read:
 *   flag <  0: h11 = param[1], h21 = param[2], h12 = param[3], h22 = param[4]
 *   flag == 0: h11 = h22 = 1,  h21 = param[2], h12 = param[3]
 *   flag >  0: h11 = param[1], h22 = param[4], h12 = 1, h21 = -1
 *
 * NOTE(review): every negative flag, -2 included, takes the full matrix
 * from param[1..4]; confirm this is the intended treatment of -2.
 */
void drotm(size_t n, double *a, double *b, double *param)
{
	const double flag = param[0];
	double h11, h12, h21, h22;
	size_t i;

	if (flag < 0.0) {
		h11 = param[1];
		h12 = param[3];
		h21 = param[2];
		h22 = param[4];
	} else if (flag == 0) {
		h11 = 1.0;
		h12 = param[3];
		h21 = param[2];
		h22 = 1.0;
	} else {
		h11 = param[1];
		h12 = 1.0;
		h21 = -1.0;
		h22 = param[4];
	}

#pragma omp parallel for
	for (i = 0; i < n; i++) {
		/*
		 * Declared inside the loop body so that each iteration (and
		 * therefore each thread) has its own copies; variables
		 * declared outside the parallel loop would be shared.
		 */
		const double w = a[i];
		const double z = b[i];

		a[i] = w * h11 + z * h12;
		b[i] = w * h21 + z * h22;
	}
}
/*
 * Build the parameters of a modified rotation. TODO: Verify
 *
 * Output layout, matching what drotm() reads:
 *   param[0] = flag, param[1] = h11, param[2] = h21,
 *   param[3] = h12,  param[4] = h22.
 *
 * Cases:
 *   d1 < 0          -> flag = -1 and a zero matrix.
 *   d2 * y == 0     -> flag = -2 and the identity matrix; returns at once.
 *   |q1| > |q2|     -> flag = 0 (h21 and h12 computed), or flag = -1 with a
 *                      zero matrix if the factor u is not positive.
 *   otherwise       -> flag = 1 (h11 and h22 computed), or flag = -1 with a
 *                      zero matrix if q2 < 0.
 * The two loops at the end rescale by powers of gam until d1 and d2 lie
 * strictly between rgamsq and gamsq, switching to flag = -1.
 *
 * NOTE(review): d1, d2 and x are received by value, so their updated values
 * never reach the caller; param[] is the only output.
 */
void drotmg(double d1, double d2, double x, double y, double *param)
{
	const double gam = 4096.0;
	const double gamsq = 16777216.0; /* gam * gam */
	const double rgamsq = 5.9604645e-8; /* roughly 1 / gamsq */
	double flag;
	double h11 = 0.0, h12 = 0.0, h21 = 0.0, h22 = 0.0;
	double p1, p2, q1, q2, u;

	if (d1 < 0) {
		flag = -1.0;
		d1 = 0.0;
		d2 = 0.0;
		x = 0.0;
	} else {
		p2 = d2 * y;
		if (p2 == 0) {
			/*
			 * The second component carries no weight: no rotation
			 * is needed. The identity is stored explicitly because
			 * drotm() reads param[1..4] for every negative flag.
			 */
			param[0] = -2.0;
			param[1] = 1.0;
			param[2] = 0.0;
			param[3] = 0.0;
			param[4] = 1.0;
			return;
		}
		p1 = d1 * x;
		q2 = p2 * y;
		q1 = p1 * x;
		if (fabs(q1) > fabs(q2)) {
			h21 = -y / x;
			h12 = p2 / p1;
			u = 1.0 - h12 * h21;
			if (u > 0) {
				flag = 0.0;
				d1 = d1 / u;
				d2 = d2 / u;
				x = x * u;
			} else {
				/*
				 * Only reachable through rounding; fall back to
				 * the zero transform so flag is always set.
				 */
				flag = -1.0;
				h11 = 0.0;
				h12 = 0.0;
				h21 = 0.0;
				h22 = 0.0;
				d1 = 0.0;
				d2 = 0.0;
				x = 0.0;
			}
		} else {
			if (q2 < 0.0) {
				flag = -1.0;
				d1 = 0.0;
				d2 = 0.0;
				x = 0.0;
			} else {
				double temp;

				flag = 1.0;
				h11 = p1 / p2;
				h22 = x / y;
				u = 1.0 + h11 * h22;
				temp = d2 / u;
				d2 = d1 / u;
				d1 = temp;
				x = y * u;
			}
		}
		if (d1 != 0.0) {
			while (fabs(d1) <= rgamsq || d1 >= gamsq) {
				if (flag == 0.0) {
					h11 = 1.0;
					h22 = 1.0;
				} else {
					h21 = -1.0;
					h12 = 1.0;
				}
				flag = -1.0;
				if (d1 <= rgamsq) {
					d1 = d1 * gamsq;
					x = x / gam;
					h11 = h11 / gam;
					h12 = h12 / gam;
				} else {
					d1 = d1 / gamsq;
					x = x * gam;
					h11 = h11 * gam;
					h12 = h12 * gam;
				}
			}
		}
		if (d2 != 0) {
			while (fabs(d2) <= rgamsq || fabs(d2) >= gamsq) {
				if (flag == 0.0) {
					h11 = 1.0;
					h22 = 1.0;
				} else {
					h21 = -1.0;
					h12 = 1.0;
				}
				flag = -1.0;
				if (fabs(d2) <= rgamsq) {
					d2 = d2 * gamsq;
					h21 = h21 / gam;
					h22 = h22 / gam;
				} else {
					d2 = d2 / gamsq;
					h21 = h21 * gam;
					h22 = h22 * gam;
				}
			}
		}
	}
	param[1] = h11;
	param[2] = h21;
	param[3] = h12;
	param[4] = h22;
	param[0] = flag;
}
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
******************************************************************************/
/*
* This is a benchmark for the BLAS Level 1 operations for AML.
*/
#include <float.h>
#include <limits.h>
#include <math.h>
#include <stdlib.h>
#include <unistd.h>
/*
 * Kernels sharing one signature. Each works on the first n elements of its
 * vectors; parameters a given kernel does not need are ignored.
 */
/* Returns the sum of fabs(a[i]). b, c and scalar are ignored. */
double dasum(size_t n, double *a, double *b, double *c, double scalar);
/* c[i] = b[i] + scalar * a[i]. Returns 1. */
double daxpy(size_t n, double *a, double *b, double *c, double scalar);
/* b[i] = a[i]. Returns 1. c and scalar are ignored. */
double dcopy(size_t n, double *a, double *b, double *c, double scalar);
/* Returns the sum of a[i] * b[i]. c and scalar are ignored. */
double ddot(size_t n, double *a, double *b, double *c, double scalar);
/* Returns the Euclidean norm of a. b, c and scalar are ignored. */
double dnrm2(size_t n, double *a, double *b, double *c, double scalar);
/* b[i] = scalar * a[i]. Returns 1. c is ignored. */
double dscal(size_t n, double *a, double *b, double *c, double scalar);
/* Exchanges a[i] and b[i]. Returns 1. c and scalar are ignored. */
double dswap(size_t n, double *a, double *b, double *c, double scalar);
/*
 * Scans a comparing absolute values and returns the 0-based index of the
 * selected maximum, converted to double. b, c and scalar are ignored.
 */
double idmax(size_t n, double *a, double *b, double *c, double scalar);
/*
 * Rotation routines; their parameters differ from the kernels above.
 */
/* In place: a[i] <- c * a[i] + s * b[i], b[i] <- c * b[i] - s * a[i]. */
void drot(size_t n, double *a, double *b, double c, double s);
/*
 * Creates a plane rotation from x and y.
 * NOTE(review): all four parameters are passed by value, so no result can
 * be returned to the caller through this prototype.
 */
void drotg(double x, double y, double c, double s);
/*
 * Applies to (a[i], b[i]) the 2x2 matrix described by param: param[0] is a
 * flag, param[1..4] hold h11, h21, h12, h22.
 */
void drotm(size_t n, double *a, double *b, double *param);
/*
 * Fills param[0..4] (flag, h11, h21, h12, h22) from d1, d2, x and y.
 * NOTE(review): d1, d2 and x are passed by value; only param is an output.
 */
void drotmg(double d1, double d2, double x, double y, double *param);
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
#include "aml.h"
#include "aml/area/linux.h"
#include <assert.h>
#include <errno.h>
#include <mkl.h>
#include <omp.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
/*
 * dgemm-mkl benchmark driver.
 *
 * Expects three arguments after the ones consumed by aml_init():
 *   argv[1]  string parsed into the bitmap of the "fast" area,
 *   argv[2]  string parsed into the bitmap of the "slow" area,
 *   argv[3]  matrix dimension N.
 *
 * Maps three N x N matrices of doubles (a and b from the slow area, c from
 * the fast area), fills a and b with rand() values and c with zeros, times
 * one cblas_dgemm() call and prints
 *   "dgemm-mkl: <N> <bytes per matrix> <elapsed ns> <GFLOPS>".
 * Returns 0; failed checks abort through assert().
 */
int main(int argc, char *argv[])
{
	/* NULL so the checks below are meaningful if creation sets nothing. */
	struct aml_area *slow = NULL, *fast = NULL;
	struct aml_bitmap slowb, fastb;
	struct timespec start, stop;
	double *a, *b, *c;
	int err;

	aml_init(&argc, &argv);
	assert(argc == 4);

	/*
	 * The parsing calls are kept out of assert() so that they still run
	 * when assertions are compiled out with NDEBUG.
	 */
	err = aml_bitmap_from_string(&fastb, argv[1]);
	assert(err == 0);
	err = aml_bitmap_from_string(&slowb, argv[2]);
	assert(err == 0);
	(void)err;

	long int N = atol(argv[3]);
	unsigned long memsize = sizeof(double) * N * N;
	/* Element count computed in unsigned arithmetic, like the index. */
	const unsigned long nelems = (unsigned long)N * (unsigned long)N;

	aml_area_linux_create(&slow, &slowb, AML_AREA_LINUX_POLICY_BIND);
	assert(slow != NULL);
	aml_area_linux_create(&fast, &fastb, AML_AREA_LINUX_POLICY_BIND);
	assert(fast != NULL);

	a = aml_area_mmap(slow, memsize, NULL);
	b = aml_area_mmap(slow, memsize, NULL);
	c = aml_area_mmap(fast, memsize, NULL);
	assert(a != NULL && b != NULL && c != NULL);

	double alpha = 1.0, beta = 1.0;

	for (unsigned long i = 0; i < nelems; i++) {
		a[i] = (double)rand();
		b[i] = (double)rand();
		c[i] = 0.0;
	}

	clock_gettime(CLOCK_REALTIME, &start);
	cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, N, N, N, alpha,
		    a, N, b, N, beta, c, N);
	clock_gettime(CLOCK_REALTIME, &stop);

	/* Elapsed wall-clock time in nanoseconds. */
	long long int elapsed_ns = 0;

	elapsed_ns = (stop.tv_nsec - start.tv_nsec) +
		     1e9 * (stop.tv_sec - start.tv_sec);
	double flops = (2.0 * N * N * N) / (elapsed_ns / 1e9);

	/* print the flops in GFLOPS; specifiers match the argument types */
	printf("dgemm-mkl: %ld %lu %lld %f\n", N, memsize, elapsed_ns,
	       flops / 1e9);

	aml_area_munmap(slow, a, memsize);
	aml_area_munmap(slow, b, memsize);
	aml_area_munmap(fast, c, memsize);
	aml_area_linux_destroy(&slow);
	aml_area_linux_destroy(&fast);
	aml_finalize();
	return 0;
}
/*******************************************************************************
* Copyright 2019 UChicago Argonne, LLC.
* (c.f. AUTHORS, LICENSE)
*
* This file is part of the AML project.
* For more info, see https://xgitlab.cels.anl.gov/argo/aml
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
#include "aml.h"
#include "aml/area/linux.h"
#include "aml/tiling/2d.h"
#include <assert.h>
#include <errno.h>
#include <mkl.h>
#include <omp.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
/*
 * File-scope state shared between main() and do_work().
 */
/* Tiling created in main() with AML_TILING_TYPE_2D_ROWMAJOR. */
struct aml_tiling *tiling_row;
/* Tiling created in main() with AML_TILING_TYPE_2D_COLMAJOR. */
struct aml_tiling *tiling_col;
/* Areas created in main() from the slow and fast bitmaps respectively. */
struct aml_area *slow, *fast;
/*
 * N and T are read from argv[3] and argv[4]; main() asserts N % T == 0.
 * memsize = sizeof(double)*N*N and tilesize = sizeof(double)*T*T.
 */
size_t memsize, tilesize, N, T;
/* Base pointers handed to aml_tiling_tilestart() in do_work(). */
double *a, *b, *c;
/* Presumably the benchmark timestamps; their use is outside this view. */
struct timespec start, stop;
void do_work()
{
int lda = (int)T, ldb, ldc;
ldb = lda;
ldc = lda;
size_t ndims[2];
aml_tiling_ndims(tiling_row, &ndims[0], &ndims[1]);
for(int k = 0; k < ndims[1]; k++)
{
#pragma omp parallel for
for(int i = 0; i < ndims[0]; i++)
{
for(int j = 0; j < ndims[1]; j++)
{
size_t aoff, boff, coff;
double *ap, *bp, *cp;
aoff = aml_tiling_tileid(tiling_col, i, k);
boff = aml_tiling_tileid(tiling_row, k, j);
coff = aml_tiling_tileid(tiling_row, i, j);
ap = aml_tiling_tilestart(tiling_col, a, aoff);
bp = aml_tiling_tilestart(tiling_row, b, boff);
cp = aml_tiling_tilestart(tiling_row, c, coff);
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, ldc, lda, ldb, 1.0, ap, lda, bp, ldb, 1.0, cp, ldc);
}
}
}
}
int main(int argc, char* argv[])
{
struct aml_bitmap slowb, fastb;
aml_init(&argc, &argv);
assert(argc == 5);
assert(aml_bitmap_from_string(&fastb, argv[1]) == 0);
assert(aml_bitmap_from_string(&slowb, argv[2]) == 0);
N = atol(argv[3]);
T = atol(argv[4]);
/* let's not handle messy tile sizes */
assert(N % T == 0);
memsize = sizeof(double)*N*N;
tilesize = sizeof(double)*T*T;
/* the initial tiling, of 2D square tiles */
assert(!aml_tiling_2d_create(&tiling_row, AML_TILING_TYPE_2D_ROWMAJOR,
tilesize, memsize, N/T , N/T));
assert(!aml_tiling_2d_create(&tiling_col, AML_TILING_TYPE_2D_COLMAJOR,
tilesize, memsize, N/T , N/T));
aml_area_linux_create(&slow, &slowb, AML_AREA_LINUX_POLICY_BIND);
assert(slow != NULL);
aml_area_linux_create(&fast, &fastb, AML_AREA_LINUX_POLICY_BIND);
assert(fast != NULL);