Unverified Commit dc90a496 authored by Malachi's avatar Malachi Committed by GitHub
Browse files

Lin alg t fixes (#220)

* Realloc buffers as required for reductions in linAlg_t
* Bug fix for linAlg_t::max/min kernels
* Add ciTest for linAlg_t reductions to ensure correctness
* Remove accidental change to makenrs
parent 5407349d
build/
CMakeFiles
*.tgz
.vscode/
......@@ -53,6 +53,53 @@ void ciSetup(MPI_Comm comm, setupAide &options)
options.setArgs("VARIABLEPROPERTIES", "FALSE");
}
dfloat sum(dfloat const * const array, const int size)
{
dfloat sumr = 0.0;
for(int i = 0 ; i < size; ++i)
sumr += array[i];
return sumr;
}
dfloat max(dfloat const * const array, const int size)
{
dfloat maxr = -9e30;
for(int i = 0 ; i < size; ++i)
maxr = (array[i] > maxr) ? array[i] : maxr;
return maxr;
}
dfloat min(dfloat const * const array, const int size)
{
dfloat minr = 9e30;
for(int i = 0 ; i < size; ++i)
minr = (array[i] < minr) ? array[i] : minr;
return minr;
}
void ciTestLinAlg(nrs_t *nrs, const int N)
{
linAlg_t* linAlg = nrs->linAlg;
MPI_Comm comm = nrs->mesh->comm;
int rank = nrs->mesh->rank;
dfloat* x = (dfloat*) calloc(N, sizeof(dfloat));
for(int i = 0 ; i < N; ++i)
x[i] = drand48();
occa::memory o_x = nrs->mesh->device.malloc(N*sizeof(dfloat), x);
const dfloat err_sum = abs(
sum(x,N) - linAlg->sum(N, o_x, comm)
);
const dfloat err_max = abs(
max(x,N) - linAlg->max(N, o_x, comm)
);
const dfloat err_min = abs(
min(x,N) - linAlg->min(N, o_x, comm)
);
printf("linAlg errs (N=%d): max=%g, min=%g, sum=%g\n", N, err_max, err_min, err_sum);
free(x);
o_x.free();
const dfloat testTol = 1e-10;
if(err_max > testTol || err_min > testTol || err_sum > testTol)
FAIL;
}
void ciTestErrors(nrs_t *nrs, dfloat time, int tstep)
{
if (!nrs->lastStep) return;
......@@ -70,6 +117,16 @@ void ciTestErrors(nrs_t *nrs, dfloat time, int tstep)
int pIterErr;
int velIterErr;
ciTestLinAlg(nrs, BLOCKSIZE / 16);
ciTestLinAlg(nrs, BLOCKSIZE / 8);
ciTestLinAlg(nrs, BLOCKSIZE / 4);
ciTestLinAlg(nrs, BLOCKSIZE / 2);
ciTestLinAlg(nrs, BLOCKSIZE);
ciTestLinAlg(nrs, 2 * BLOCKSIZE);
ciTestLinAlg(nrs, 4 * BLOCKSIZE);
ciTestLinAlg(nrs, 8 * BLOCKSIZE);
ciTestLinAlg(nrs, 16 * BLOCKSIZE);
switch (ciMode) {
case 1 : velIterErr = abs(nrs->NiterU - 10);
s1Err = abs((err[2] - 5.25E-12)/err[2]);
......
......@@ -27,7 +27,7 @@ SOFTWARE.
@kernel void max(const dlong Nblocks,
const dlong N,
@restrict const dfloat *x,
@restrict dfloat *sum){
@restrict dfloat *result){
for(dlong b=0;b<Nblocks;++b;@outer(0)){
......@@ -66,6 +66,6 @@ SOFTWARE.
for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 8) s[t] = (s[t] > s[t + 8]) ? s[t]:s[t + 8];
for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 4) s[t] = (s[t] > s[t + 4]) ? s[t]:s[t + 4];
for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 2) s[t] = (s[t] > s[t + 2]) ? s[t]:s[t + 2];
for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 1) s[b] = (s[0] > s[1] ) ? s[0]:s[1 ];
for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 1) result[b] = (s[0] > s[1] ) ? s[0]:s[1 ];
}
}
......@@ -27,7 +27,7 @@ SOFTWARE.
@kernel void min(const dlong Nblocks,
const dlong N,
@restrict const dfloat *x,
@restrict dfloat *sum){
@restrict dfloat *result){
for(dlong b=0;b<Nblocks;++b;@outer(0)){
......@@ -66,6 +66,6 @@ SOFTWARE.
for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 8) s[t] = (s[t] < s[t + 8]) ? s[t]:s[t + 8];
for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 4) s[t] = (s[t] < s[t + 4]) ? s[t]:s[t + 4];
for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 2) s[t] = (s[t] < s[t + 2]) ? s[t]:s[t + 2];
for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 1) s[b] = (s[0] < s[1] ) ? s[0]:s[1 ];
for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 1) result[b] = (s[0] < s[1] ) ? s[0]:s[1 ];
}
}
......@@ -26,6 +26,19 @@ SOFTWARE.
#include "linAlg.hpp"
void linAlg_t::reallocBuffers(const dlong Nbytes)
{
if(h_scratch.size()) h_scratch.free();
if(o_scratch.size()) o_scratch.free();
//pinned scratch buffer
{
occa::properties props = kernelInfo;
props["mapped"] = true;
h_scratch = device.malloc(Nbytes, props);
scratch = (dfloat*) h_scratch.ptr(props);
}
o_scratch = device.malloc(Nbytes);
}
void linAlg_t::setup() {
int rank;
......@@ -34,14 +47,8 @@ void linAlg_t::setup() {
//add defines
kernelInfo["defines/" "p_blockSize"] = blocksize;
//pinned scratch buffer
{
occa::properties props = kernelInfo;
props["mapped"] = true;
h_scratch = device.malloc(BLOCKSIZE*sizeof(dfloat), props);
scratch = (dfloat*) h_scratch.ptr(props);
}
o_scratch = device.malloc(BLOCKSIZE*sizeof(dfloat));
reallocBuffers(BLOCKSIZE * sizeof(dfloat));
string oklDir;
oklDir.assign(getenv("NEKRS_INSTALL_DIR"));
......@@ -211,7 +218,8 @@ void linAlg_t::axdyz(const dlong N, const dfloat alpha,
// \sum o_a
dfloat linAlg_t::sum(const dlong N, occa::memory& o_a, MPI_Comm _comm) {
int Nblock = (N+BLOCKSIZE-1)/BLOCKSIZE;
Nblock = (Nblock>BLOCKSIZE) ? BLOCKSIZE : Nblock; //limit to BLOCKSIZE entries
const dlong Nbytes = Nblock * sizeof(dfloat);
if(o_scratch.size() < Nbytes) reallocBuffers(Nbytes);
sumKernel(Nblock, N, o_a, o_scratch);
......@@ -231,7 +239,8 @@ dfloat linAlg_t::sum(const dlong N, occa::memory& o_a, MPI_Comm _comm) {
// \min o_a
dfloat linAlg_t::min(const dlong N, occa::memory& o_a, MPI_Comm _comm) {
int Nblock = (N+BLOCKSIZE-1)/BLOCKSIZE;
Nblock = (Nblock>BLOCKSIZE) ? BLOCKSIZE : Nblock; //limit to BLOCKSIZE entries
const dlong Nbytes = Nblock * sizeof(dfloat);
if(o_scratch.size() < Nbytes) reallocBuffers(Nbytes);
minKernel(Nblock, N, o_a, o_scratch);
......@@ -250,7 +259,8 @@ dfloat linAlg_t::min(const dlong N, occa::memory& o_a, MPI_Comm _comm) {
// \max o_a
dfloat linAlg_t::max(const dlong N, occa::memory& o_a, MPI_Comm _comm) {
int Nblock = (N+BLOCKSIZE-1)/BLOCKSIZE;
Nblock = (Nblock>BLOCKSIZE) ? BLOCKSIZE : Nblock; //limit to BLOCKSIZE entries
const dlong Nbytes = Nblock * sizeof(dfloat);
if(o_scratch.size() < Nbytes) reallocBuffers(Nbytes);
maxKernel(Nblock, N, o_a, o_scratch);
......@@ -258,7 +268,7 @@ dfloat linAlg_t::max(const dlong N, occa::memory& o_a, MPI_Comm _comm) {
dfloat max = -9e30;
for(dlong n=0;n<Nblock;++n){
max = (scratch[n] < max) ? scratch[n]:max;
max = (scratch[n] > max) ? scratch[n]:max;
}
if (_comm != MPI_COMM_NULL)
......@@ -293,7 +303,8 @@ dfloat linAlg_t::norm2(const dlong N, occa::memory& o_a, MPI_Comm _comm) {
dfloat linAlg_t::innerProd(const dlong N, occa::memory& o_x, occa::memory& o_y,
MPI_Comm _comm) {
int Nblock = (N+BLOCKSIZE-1)/BLOCKSIZE;
Nblock = (Nblock>BLOCKSIZE) ? BLOCKSIZE : Nblock; //limit to BLOCKSIZE entries
const dlong Nbytes = Nblock * sizeof(dfloat);
if(o_scratch.size() < Nbytes) reallocBuffers(Nbytes);
innerProdKernel(Nblock, N, o_x, o_y, o_scratch);
......@@ -315,7 +326,8 @@ dfloat linAlg_t::weightedInnerProd(const dlong N, occa::memory& o_w,
occa::memory& o_x, occa::memory& o_y,
MPI_Comm _comm) {
int Nblock = (N+BLOCKSIZE-1)/BLOCKSIZE;
Nblock = (Nblock>BLOCKSIZE) ? BLOCKSIZE : Nblock; //limit to BLOCKSIZE entries
const dlong Nbytes = Nblock * sizeof(dfloat);
if(o_scratch.size() < Nbytes) reallocBuffers(Nbytes);
weightedInnerProdKernel(Nblock, N, o_w, o_x, o_y, o_scratch);
......@@ -336,7 +348,8 @@ dfloat linAlg_t::weightedInnerProd(const dlong N, occa::memory& o_w,
dfloat linAlg_t::weightedNorm2(const dlong N, occa::memory& o_w,
occa::memory& o_a, MPI_Comm _comm) {
int Nblock = (N+BLOCKSIZE-1)/BLOCKSIZE;
Nblock = (Nblock>BLOCKSIZE) ? BLOCKSIZE : Nblock; //limit to BLOCKSIZE entries
const dlong Nbytes = Nblock * sizeof(dfloat);
if(o_scratch.size() < Nbytes) reallocBuffers(Nbytes);
weightedNorm2Kernel(Nblock, N, o_w, o_a, o_scratch);
......
......@@ -44,6 +44,7 @@ private:
occa::memory o_scratch;
void setup();
void reallocBuffers(const dlong Nbytes);
public:
linAlg_t(occa::device& _device, occa::properties*& _kernelInfo, MPI_Comm& _comm) {
blocksize = BLOCKSIZE;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment