Unverified Commit dd304b9a authored by stgeke's avatar stgeke Committed by GitHub
Browse files

Import next (#222)

* Add early prepostRecv option
* Enable FDM overlap by default
* fix accuracy issue when printing device memory usage
* fuse pack/unpack of halos + comm buf
* initialize gs->mode properly
* abort in build-only mode if udf failed to compile
* use correct diffusivity in k-tau example
* set correct default initial guess method for scalars
* do no duplicate device memory for CPU backend
* bypass unnecessary parAlmond setup steps
* add missing mcmodel in CFLAGS
parent 82a68b6b
......@@ -234,6 +234,9 @@ typedef struct {
ogs_t *ogs;
MPI_Comm comm;
int rank;
occa::memory h_buffSend, h_buffRecv;
unsigned char *bufSend, *bufRecv;
......@@ -242,11 +245,13 @@ typedef struct {
occa::memory o_scatterOffsets, o_gatherOffsets;
occa::memory o_scatterIds, o_gatherIds;
occa::kernel packBufDoubleKernel, packBufFloatKernel;
occa::kernel packBufFloatToHalfKernel;
occa::kernel unpackBufDoubleAddKernel, unpackBufDoubleMinKernel, unpackBufDoubleMaxKernel;
occa::kernel unpackBufFloatAddKernel;
occa::kernel unpackBufHalfToFloatAddKernel;
occa::kernel packBufFloatToHalfAddKernel, unpackBufHalfToFloatAddKernel;
occa::kernel packBufFloatAddKernel, unpackBufFloatAddKernel;
occa::kernel packBufDoubleAddKernel, unpackBufDoubleAddKernel;
occa::kernel packBufDoubleMinKernel, unpackBufDoubleMinKernel;
occa::kernel packBufDoubleMaxKernel, unpackBufDoubleMaxKernel;
int earlyPrepostRecv;
oogs_mode mode;
......
#include <cuda_fp16.h>
#define p_blockSize 256
extern "C" __global__ void packBuf_half(
const int Nscatter,
extern "C" __global__ void packBuf_halfAdd(
const int N,
const int Nentries,
const int stride,
const int * __restrict__ gatherStarts,
const int * __restrict__ gatherIds,
const int * __restrict__ scatterStarts,
const int * __restrict__ scatterIds,
const float * __restrict__ q,
half * __restrict__ scatterq
)
float * __restrict__ q,
half * __restrict__ qout)
{
int tile = p_blockSize * blockIdx.x;
{
int s = tile + threadIdx.x;
if (s < Nscatter * Nentries) {
const float qs = q[s];
const int sid = s % Nscatter;
const int k = s / Nscatter;
const int start = scatterStarts[sid];
const int end = scatterStarts[sid + 1];
for (int n = start; n < end; ++n) {
const int id = scatterIds[n];
scatterq[id * Nentries + k] = __float2half(qs);
}
const int id = blockDim.x * blockIdx.x + threadIdx.x;
if (id < N * Nentries) {
const int sid = id % N;
const int k = id / N;
const int startGather = gatherStarts[sid];
const int endGather = gatherStarts[sid + 1];
const int startScatter = scatterStarts[sid];
const int endScatter= scatterStarts[sid + 1];
float gq = 0.0f;
for(dlong n=startGather;n<endGather;++n){
const dlong id = gatherIds[n];
gq += q[id+k*stride];
}
for(dlong n=startGather;n<endGather;++n){
const dlong id = gatherIds[n];
q[id+k*stride] = gq;
}
for(dlong n=startScatter;n<endScatter;++n){
const dlong id = scatterIds[n];
qout[id*Nentries+k] = __float2half(gq);
}
}
}
extern "C" __global__ void unpackBuf_halfAdd(const int Ngather,
const int Nentries,
const int * __restrict__ gatherStarts,
const int * __restrict__ gatherIds,
const half * __restrict__ q,
float * __restrict__ gatherq) {
{
int tile = p_blockSize * blockIdx.x;
{
int g = tile + threadIdx.x;
if (g < Ngather * Nentries) {
const int gid = g % Ngather;
const int k = g / Ngather;
const int start = gatherStarts[gid];
const int end = gatherStarts[gid + 1];
float gq = 0.00000000e+00f;
for (int n = start; n < end; ++n) {
const int id = gatherIds[n];
gq += __half2float(q[id * Nentries + k]);
}
extern "C" __global__ void unpackBuf_halfAdd(
const int N,
const int Nentries,
const int stride,
const int * __restrict__ gatherStarts,
const int * __restrict__ gatherIds,
const int * __restrict__ scatterStarts,
const int * __restrict__ scatterIds,
const half * __restrict__ q,
float * __restrict__ qout)
{
const int id = blockDim.x * blockIdx.x + threadIdx.x;
if (id < N * Nentries) {
const int gid = id % N;
const int k = id / N;
const dlong startGather = gatherStarts[gid];
const dlong endGather = gatherStarts[gid+1];
const dlong startScatter = scatterStarts[gid];
const dlong endScatter = scatterStarts[gid+1];
//contiguously packed
gatherq[g] += gq;
}
float gq = 0.0f;
for(dlong n=startGather;n<endGather;++n){
const dlong id = gatherIds[n];
gq += __half2float(q[id*Nentries+k]);
}
for(dlong n=startScatter;n<endScatter;++n){
const dlong id = scatterIds[n];
qout[id+k*stride] += gq;
}
}
}
#include "hip/hip_runtime.h"
#include <hip/hip_fp16.h>
#define p_blockSize 256
extern "C" __global__ void packBuf_half(
const int Nscatter,
extern "C" __global__ void packBuf_halfAdd(
const int N,
const int Nentries,
const int stride,
const int * __restrict__ gatherStarts,
const int * __restrict__ gatherIds,
const int * __restrict__ scatterStarts,
const int * __restrict__ scatterIds,
const float * __restrict__ q,
half * __restrict__ scatterq
)
float * __restrict__ q,
half * __restrict__ qout)
{
int tile = p_blockSize * blockIdx.x;
{
int s = tile + threadIdx.x;
if (s < Nscatter * Nentries) {
const float qs = q[s];
const int sid = s % Nscatter;
const int k = s / Nscatter;
const int start = scatterStarts[sid];
const int end = scatterStarts[sid + 1];
for (int n = start; n < end; ++n) {
const int id = scatterIds[n];
scatterq[id * Nentries + k] = __float2half(qs);
}
const int id = blockDim.x * blockIdx.x + threadIdx.x;
if (id < N * Nentries) {
const int sid = id % N;
const int k = id / N;
const int startGather = gatherStarts[sid];
const int endGather = gatherStarts[sid + 1];
const int startScatter = scatterStarts[sid];
const int endScatter= scatterStarts[sid + 1];
float gq = 0.0f;
for(dlong n=startGather;n<endGather;++n){
const dlong id = gatherIds[n];
gq += q[id+k*stride];
}
for(dlong n=startGather;n<endGather;++n){
const dlong id = gatherIds[n];
q[id+k*stride] = gq;
}
for(dlong n=startScatter;n<endScatter;++n){
const dlong id = scatterIds[n];
qout[id*Nentries+k] = __float2half(gq);
}
}
}
extern "C" __global__ void unpackBuf_halfAdd(const int Ngather,
const int Nentries,
const int * __restrict__ gatherStarts,
const int * __restrict__ gatherIds,
const half * __restrict__ q,
float * __restrict__ gatherq) {
{
int tile = p_blockSize * blockIdx.x;
{
int g = tile + threadIdx.x;
if (g < Ngather * Nentries) {
const int gid = g % Ngather;
const int k = g / Ngather;
const int start = gatherStarts[gid];
const int end = gatherStarts[gid + 1];
float gq = 0.00000000e+00f;
for (int n = start; n < end; ++n) {
const int id = gatherIds[n];
gq += __half2float(q[id * Nentries + k]);
}
extern "C" __global__ void unpackBuf_halfAdd(
const int N,
const int Nentries,
const int stride,
const int * __restrict__ gatherStarts,
const int * __restrict__ gatherIds,
const int * __restrict__ scatterStarts,
const int * __restrict__ scatterIds,
const half * __restrict__ q,
float * __restrict__ qout)
{
const int id = blockDim.x * blockIdx.x + threadIdx.x;
if (id < N * Nentries) {
const int gid = id % N;
const int k = id / N;
const dlong startGather = gatherStarts[gid];
const dlong endGather = gatherStarts[gid+1];
const dlong startScatter = scatterStarts[gid];
const dlong endScatter = scatterStarts[gid+1];
//contiguously packed
gatherq[g] += gq;
}
float gq = 0.0f;
for(dlong n=startGather;n<endGather;++n){
const dlong id = gatherIds[n];
gq += __half2float(q[id*Nentries+k]);
}
for(dlong n=startScatter;n<endScatter;++n){
const dlong id = scatterIds[n];
qout[id+k*stride] += gq;
}
}
}
@kernel void packBuf_float(const dlong Nscatter,
const int Nentries,
@restrict const dlong * scatterStarts,
@restrict const dlong * scatterIds,
@restrict const float * q,
@restrict float * scatterq){
@kernel void packBuf_floatAdd(const dlong N,
const int Nentries,
const dlong stride,
@restrict const dlong * gatherStarts,
@restrict const dlong * gatherIds,
@restrict const dlong * scatterStarts,
@restrict const dlong * scatterIds,
@restrict float * q,
@restrict float * qout)
{
for(dlong g=0;g<N*Nentries;++g;@tile(256,@outer,@inner)){
const dlong gid = g%N;
const int k = g/N;
const dlong startGather = gatherStarts[gid];
const dlong endGather = gatherStarts[gid+1];
const dlong startScatter = scatterStarts[gid];
const dlong endScatter = scatterStarts[gid+1];
for(dlong s=0;s<Nscatter*Nentries;++s;@tile(256,@outer,@inner)){
float gq = 0.0f;
for(dlong n=startGather;n<endGather;++n){
const dlong id = gatherIds[n];
gq += q[id+k*stride];
}
for(dlong n=startGather;n<endGather;++n){
const dlong id = gatherIds[n];
q[id+k*stride] = gq;
}
const float qs = q[s];
for(dlong n=startScatter;n<endScatter;++n){
const dlong id = scatterIds[n];
qout[id*Nentries+k] = gq;
}
}
}
const dlong sid = s%Nscatter;
const int k = s/Nscatter;
const dlong start = scatterStarts[sid];
const dlong end = scatterStarts[sid+1];
@kernel void unpackBuf_floatAdd(const dlong N,
const int Nentries,
const dlong stride,
@restrict const dlong * gatherStarts,
@restrict const dlong * gatherIds,
@restrict const dlong * scatterStarts,
@restrict const dlong * scatterIds,
@restrict const float * q,
@restrict float * qout)
{
for(dlong g=0;g<N*Nentries;++g;@tile(256,@outer,@inner)){
for(dlong n=start;n<end;++n){
const dlong gid = g%N;
const int k = g/N;
const dlong startGather = gatherStarts[gid];
const dlong endGather = gatherStarts[gid+1];
const dlong startScatter = scatterStarts[gid];
const dlong endScatter = scatterStarts[gid+1];
float gq = 0.0f;
for(dlong n=startGather;n<endGather;++n){
const dlong id = gatherIds[n];
gq += q[id*Nentries+k];
}
for(dlong n=startScatter;n<endScatter;++n){
const dlong id = scatterIds[n];
scatterq[id*Nentries+k] = qs;
qout[id+k*stride] += gq;
}
}
}
@kernel void packBuf_double(const dlong Nscatter,
const int Nentries,
@restrict const dlong * scatterStarts,
@restrict const dlong * scatterIds,
@restrict const double * q,
@restrict double * scatterq){
@kernel void packBuf_doubleAdd(const dlong N,
const int Nentries,
const dlong stride,
@restrict const dlong * gatherStarts,
@restrict const dlong * gatherIds,
@restrict const dlong * scatterStarts,
@restrict const dlong * scatterIds,
@restrict double * q,
@restrict double * qout)
{
for(dlong g=0;g<N*Nentries;++g;@tile(256,@outer,@inner)){
const dlong gid = g%N;
const int k = g/N;
const dlong startGather = gatherStarts[gid];
const dlong endGather = gatherStarts[gid+1];
const dlong startScatter = scatterStarts[gid];
const dlong endScatter = scatterStarts[gid+1];
for(dlong s=0;s<Nscatter*Nentries;++s;@tile(256,@outer,@inner)){
double gq = 0.0;
for(dlong n=startGather;n<endGather;++n){
const dlong id = gatherIds[n];
gq += q[id+k*stride];
}
for(dlong n=startGather;n<endGather;++n){
const dlong id = gatherIds[n];
q[id+k*stride] = gq;
}
const double qs = q[s];
const dlong sid = s%Nscatter;
const int k = s/Nscatter;
const dlong start = scatterStarts[sid];
const dlong end = scatterStarts[sid+1];
for(dlong n=start;n<end;++n){
for(dlong n=startScatter;n<endScatter;++n){
const dlong id = scatterIds[n];
scatterq[id*Nentries+k] = qs;
qout[id*Nentries+k] = gq;
}
}
}
@kernel void unpackBuf_floatAdd(const dlong Ngather,
const int Nentries,
@restrict const dlong * gatherStarts,
@restrict const dlong * gatherIds,
@restrict const float * q,
@restrict float * gatherq){
@kernel void packBuf_doubleMin(const dlong N,
const int Nentries,
const dlong stride,
@restrict const dlong * gatherStarts,
@restrict const dlong * gatherIds,
@restrict const dlong * scatterStarts,
@restrict const dlong * scatterIds,
@restrict double * q,
@restrict double * qout)
{
for(dlong g=0;g<N*Nentries;++g;@tile(256,@outer,@inner)){
const dlong gid = g%N;
const int k = g/N;
const dlong startGather = gatherStarts[gid];
const dlong endGather = gatherStarts[gid+1];
const dlong startScatter = scatterStarts[gid];
const dlong endScatter = scatterStarts[gid+1];
for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
double gq = q[gatherIds[startGather]+k*stride];
for(dlong n=startGather;n<endGather;++n){
const dlong id = gatherIds[n];
gq = (q[id+k*stride] < gq) ? q[id+k*stride] : gq;
}
for(dlong n=startGather;n<endGather;++n){
const dlong id = gatherIds[n];
q[id+k*stride] = gq;
}
const dlong gid = g%Ngather;
const int k = g/Ngather;
const dlong start = gatherStarts[gid];
const dlong end = gatherStarts[gid+1];
float gq = 0.f;
for(dlong n=start;n<end;++n){
const dlong id = gatherIds[n];
gq += q[id*Nentries+k];
for(dlong n=startScatter;n<endScatter;++n){
const dlong id = scatterIds[n];
qout[id*Nentries+k] = gq;
}
}
}
//contiguously packed
gatherq[g] += gq;
@kernel void packBuf_doubleMax(const dlong N,
const int Nentries,
const dlong stride,
@restrict const dlong * gatherStarts,
@restrict const dlong * gatherIds,
@restrict const dlong * scatterStarts,
@restrict const dlong * scatterIds,
@restrict double * q,
@restrict double * qout)
{
for(dlong g=0;g<N*Nentries;++g;@tile(256,@outer,@inner)){
const dlong gid = g%N;
const int k = g/N;
const dlong startGather = gatherStarts[gid];
const dlong endGather = gatherStarts[gid+1];
const dlong startScatter = scatterStarts[gid];
const dlong endScatter = scatterStarts[gid+1];
double gq = q[gatherIds[startGather]+k*stride];
for(dlong n=startGather;n<endGather;++n){
const dlong id = gatherIds[n];
gq = (q[id+k*stride] > gq) ? q[id+k*stride] : gq;
}
for(dlong n=startGather;n<endGather;++n){
const dlong id = gatherIds[n];
q[id+k*stride] = gq;
}
for(dlong n=startScatter;n<endScatter;++n){
const dlong id = scatterIds[n];
qout[id*Nentries+k] = gq;
}
}
}
@kernel void unpackBuf_doubleAdd(const dlong Ngather,
const int Nentries,
@restrict const dlong * gatherStarts,
@restrict const dlong * gatherIds,
@restrict const double * q,
@restrict double * gatherq){
for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
const dlong gid = g%Ngather;
const int k = g/Ngather;
const dlong start = gatherStarts[gid];
const dlong end = gatherStarts[gid+1];
double gq = 0.f;
for(dlong n=start;n<end;++n){
@kernel void unpackBuf_doubleAdd(const dlong N,
const int Nentries,
const dlong stride,
@restrict const dlong * gatherStarts,
@restrict const dlong * gatherIds,
@restrict const dlong * scatterStarts,
@restrict const dlong * scatterIds,
@restrict const double * q,
@restrict double * qout)
{
for(dlong g=0;g<N*Nentries;++g;@tile(256,@outer,@inner)){
const dlong gid = g%N;
const int k = g/N;
const dlong startGather = gatherStarts[gid];
const dlong endGather = gatherStarts[gid+1];
const dlong startScatter = scatterStarts[gid];
const dlong endScatter = scatterStarts[gid+1];
double gq = 0;
for(dlong n=startGather;n<endGather;++n){
const dlong id = gatherIds[n];
gq += q[id*Nentries+k];
}
//contiguously packed
gatherq[g] += gq;
for(dlong n=startScatter;n<endScatter;++n){
const dlong id = scatterIds[n];
qout[id+k*stride] += gq;
}
}
}
@kernel void unpackBuf_doubleMin(const dlong Ngather,
const int Nentries,
@restrict const dlong * gatherStarts,
@restrict const dlong * gatherIds,
@restrict const double * q,
@restrict double * gatherq){
for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
const dlong gid = g%Ngather;
const int k = g/Ngather;
const dlong start = gatherStarts[gid];
const dlong end = gatherStarts[gid+1];
const dlong startId = gatherIds[start];
double gq = q[startId*Nentries+k];
for(dlong n=start;n<end;++n){
@kernel void unpackBuf_doubleMin(const dlong N,
const int Nentries,
const dlong stride,
@restrict const dlong * gatherStarts,
@restrict const dlong * gatherIds,
@restrict const dlong * scatterStarts,
@restrict const dlong * scatterIds,
@restrict const double * q,
@restrict double * qout)
{
for(dlong g=0;g<N*Nentries;++g;@tile(256,@outer,@inner)){
const dlong gid = g%N;
const int k = g/N;
const dlong startGather = gatherStarts[gid];
const dlong endGather = gatherStarts[gid+1];
const dlong startScatter = scatterStarts[gid];
const dlong endScatter = scatterStarts[gid+1];
double gq = q[gatherIds[startGather]*Nentries+k];
for(dlong n=startGather;n<endGather;++n){
const dlong id = gatherIds[n];
gq = (q[id*Nentries+k] < gq) ? q[id*Nentries+k] : gq;
}
//contiguously packed
gatherq[g] = gq;
for(dlong n=startScatter;n<endScatter;++n){
const dlong id = scatterIds[n];
qout[id+k*stride] = (qout[id+k*stride] < gq) ? qout[id+k*stride] : gq;
}
}
}
@kernel void unpackBuf_doubleMax(const dlong Ngather,
const int Nentries,
@restrict const dlong * gatherStarts,
@restrict const dlong * gatherIds,
@restrict const double * q,
@restrict double * gatherq){
for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
const dlong gid = g%Ngather;
const int k = g/Ngather;
const dlong start = gatherStarts[gid];
const dlong end = gatherStarts[gid+1];
const dlong startId = gatherIds[start];
double gq = q[startId*Nentries+k];
for(dlong n=start;n<end;++n){
@kernel void unpackBuf_doubleMax(const dlong N,
const int Nentries,
const dlong stride,
@restrict const dlong * gatherStarts,
@restrict const dlong * gatherIds,
@restrict const dlong * scatterStarts,
@restrict const dlong * scatterIds,
@restrict const double * q,
@restrict double * qout)
{
for(dlong g=0;g<N*Nentries;++g;@tile(256,@outer,@inner)){
const dlong gid = g%N;
const int k = g/N;
const dlong startGather = gatherStarts[gid];
const dlong endGather = gatherStarts[gid+1];
const dlong startScatter = scatterStarts[gid];
const dlong endScatter = scatterStarts[gid+1];
double gq = q[gatherIds[startGather]*Nentries+k];