Commit 6e9e77f7 authored by Adrian Pope

Updates to README, GNUmakefiles, and test Fortran driver

parent d534826b
@@ -60,7 +60,7 @@ DFFT_MPI_CC ?= mpicc
 DFFT_MPI_CXX ?= mpicxx
 # MPI Fortran compiler
-DFFT_MPI_FC = mpif90
+DFFT_MPI_FC ?= mpif90
 # pre-processor flags
 DFFT_MPI_CPPFLAGS ?= -DDFFT_TIMING=2
@@ -72,10 +72,29 @@ DFFT_MPI_CFLAGS ?= -g -O3 -Wall -Wno-deprecated -std=gnu99
 DFFT_MPI_CXXFLAGS ?= -g -O3 -Wall
 # Fortran flags
-DFFT_MPI_FFLAGS ?= -g -O3 -fpp
+# -cpp seems to work with GNU and Intel
+# though -fpp might be more correct for Intel
+DFFT_MPI_FFLAGS ?= -g -O3 -cpp
 # linker flags
-DFFT_MPI_LDFLAGS ?= -lfftw3 -lm
+DFFT_MPI_LDFLAGS ?=
+# additional Fortran linker flags
+# sometimes this also needs -lmpi++
+DFFT_MPI_FLDFLAGS ?= -lstdc++
+# FFTW3
+DFFT_FFTW_HOME ?= $(shell dirname $(shell dirname $(shell which fftw-wisdom)))
+DFFT_FFTW_CPPFLAGS ?= -I$(DFFT_FFTW_HOME)/include
+DFFT_FFTW_LDFLAGS ?= -L$(DFFT_FFTW_HOME)/lib -lfftw3 -lm
+# these should not usually require modification
+DFFT_MPI_CPPFLAGS += $(DFFT_FFTW_CPPFLAGS)
+DFFT_MPI_LDFLAGS += $(DFFT_FFTW_LDFLAGS)
 all: nativec utilities fortran
@@ -85,6 +104,11 @@ fortran: $(DFFT_MPI_DIR)/TestFDfft
 utilities: $(DFFT_MPI_DIR)/CheckDecomposition
+clean:
+	rm -rf $(DFFT_MPI_DIR) *.mod
 $(DFFT_MPI_DIR):
 	mkdir -p $(DFFT_MPI_DIR)
@@ -95,7 +119,9 @@ $(DFFT_MPI_DIR)/%.o: %.cpp | $(DFFT_MPI_DIR)
 	$(DFFT_MPI_CXX) $(DFFT_MPI_CXXFLAGS) $(DFFT_MPI_CPPFLAGS) -c -o $@ $<
 $(DFFT_MPI_DIR)/%.o: %.f90 | $(DFFT_MPI_DIR)
-	$(DFFT_MPI_FC) $(DFFT_MPI_FFLAGS) -c -o $@ $<
+	$(DFFT_MPI_FC) $(DFFT_MPI_FFLAGS) $(DFFT_MPI_CPPFLAGS) -c -o $@ $<
 $(DFFT_MPI_DIR)/TestDfft: $(DFFT_MPI_DIR)/TestDfft.o $(DFFT_MPI_DIR)/distribution.o
 	$(DFFT_MPI_CXX) $(DFFT_MPI_CXXFLAGS) -o $@ $^ $(DFFT_MPI_LDFLAGS)
@@ -104,10 +130,6 @@ $(DFFT_MPI_DIR)/CheckDecomposition: $(DFFT_MPI_DIR)/CheckDecomposition.o $(DFFT_
 	$(DFFT_MPI_CC) $(DFFT_MPI_CFLAGS) -o $@ $^ $(DFFT_MPI_LDFLAGS)
 $(DFFT_MPI_DIR)/TestFDfft.o: TestFDfft.f90 $(DFFT_MPI_DIR)/FDistribution.o $(DFFT_MPI_DIR)/FDfft.o
-	$(DFFT_MPI_FC) $(DFFT_MPI_FFLAGS) -c -o $@ $<
 $(DFFT_MPI_DIR)/TestFDfft: $(DFFT_MPI_DIR)/TestFDfft.o $(DFFT_MPI_DIR)/FDistribution.o $(DFFT_MPI_DIR)/FDfft.o $(DFFT_MPI_DIR)/DistributionC.o $(DFFT_MPI_DIR)/DfftC.o $(DFFT_MPI_DIR)/distribution.o
-	$(DFFT_MPI_FC) $(DFFT_MPI_FFLAGS) -o $@ $^ $(DFFT_MPI_LDFLAGS) -lstdc++
+	$(DFFT_MPI_FC) $(DFFT_MPI_FFLAGS) -o $@ $^ $(DFFT_MPI_LDFLAGS) $(DFFT_MPI_FLDFLAGS)
-clean:
-	rm -rf $(DFFT_MPI_DIR) *.mod
@@ -60,9 +60,10 @@ DFFT_MPI_CFLAGS ?= -g -O3 -fopenmp -Wall -Wno-deprecated -std=gnu99
 DFFT_MPI_CXXFLAGS ?= -g -O3 -fopenmp -Wall
 # Fortran flags
-DFFT_MPI_FFLAGS ?= -g -O3 -fopenmp -fpp
+DFFT_MPI_FFLAGS ?= -g -O3 -fopenmp -cpp
-# linker flags
-DFFT_MPI_LDFLAGS ?= -lfftw3_omp -lfftw3 -lm
+# FFTW3
+DFFT_FFTW_HOME ?= $(shell dirname $(shell dirname $(shell which fftw-wisdom)))
+DFFT_FFTW_LDFLAGS ?= -L$(DFFT_FFTW_HOME)/lib -lfftw3_omp -lfftw3 -lm
 include GNUmakefile
-HACC DFFT MiniApp
+SWFFT (HACC)
 Adrian Pope (et al)
 apope@anl.gov
-2017-03-09
+2017-10-02
 ========
 Overview
@@ -9,8 +9,9 @@ Overview
 This directory contains the code necessary to run the Hardware Accelerated
 Cosmology Code (HACC) 3D distributed memory discrete fast Fourier transform,
-a lightweight make system, a driver for an example test code (TestDfft),
-and a utility that checks grid sizes and MPI rank layouts (CheckDecomposition).
+a lightweight make system, drivers for an example test code in C++ (TestDfft)
+as well as Fortran (TestFDfft), and a utility that checks grid sizes and
+MPI rank layouts (CheckDecomposition).
 This code assumes that the global grid will originally be distributed between
 MPI ranks using a 3D Cartesian communicator. That data needs to be
@@ -36,7 +37,7 @@ must be present in the set of prime factors of the grid, eg. if you have
 20 MPI ranks then ng must be a multiple of 5 and 2. The "CheckDecomposition"
 utility is provided to check (on one rank) whether a proposed grid size and
 number of MPI ranks will work, which can be done before submitting a large
-test with TestDfft.
+test with TestDfft/TestFDfft.
 ========
 Building
@@ -46,7 +47,7 @@ Building
 System Requirements
 -------------------
-MPI (version?)
+MPI-1
 FFTW3 (double precision, OpenMP optional, does not use FFTW3's MPI interface)
 ---------
@@ -58,6 +59,14 @@ MPI-only version of the code:
 $ make
+By default the GNUmakefile will make the C/C++ interface and driver, the
+Fortran interface and driver, and the CheckDecomposition utility. There are
+also rules to make subsets of these:
+$ make nativec
+$ make utilities
+$ make fortran
 There is also an example of how to modify the makefile to compile with OpenMP
 threading on each MPI rank:
@@ -76,6 +85,21 @@ environment variables:
 $ export DFFT_MPI_CC=cc
 $ export DFFT_MPI_CXX=CC
+$ export DFFT_MPI_FC=ftn
+When using the Intel compilers without the Cray wrappers it may be easier to
+use the Intel-specific MPI wrappers, eg:
+$ export DFFT_MPI_CC=mpiicc
+$ export DFFT_MPI_CXX=mpiicpc
+$ export DFFT_MPI_FC=mpiifort
+If FFTW3 is in your PATH, then the GNUmakefile will find it by searching for
+the fftw-wisdom executable. An alternate FFTW3 installation can be specified by
+setting an environment variable to the directory above the "include" and "lib"
+directories:
+$ export DFFT_FFTW_HOME=/path/to/fftw3
 See the base GNUmakefile for a commented list of variables.
@@ -257,6 +281,40 @@ ALCF/Mira/BGQ has incomplete support for C++11 and is unlikely to add
 features in the remaining lifetime of the machine (2019-2020) so we use
 C++11 features very sparingly.
+===========================
+TestFDfft (Fortran Example)
+===========================
+-----
+Usage
+-----
+This test program provides an example of using the Fortran wrappers
+built around the C++ Distribution and Dfft classes. The structure
+and usage of this program is identical to the C++ TestDfft driver. The
+command line signature is
+$ build/TestFDfft <n_repetitions> <ngx> [ngy ngz]
+with the same interpretation as described above for TestDfft.
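For example, assuming an MPI launcher named mpirun is available on the system,
one repetition on a 128^3 grid across 8 ranks could be run as:

$ mpirun -n 8 build/TestFDfft 1 128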
+The Fortran wrappers are contained in the FDistribution and FDfft modules,
+which are built around the F2003 standard ISO_C_BINDING module. An
+important point is that the Fortran wrappers interface with C
+pointers to the Fortran arrays containing the data to be transformed. This
+is achieved using the F2003 standard C_F_POINTER subroutine. In addition,
+optimal memory alignment is obtained using the fftw_alloc_complex function.
+The TestFDfft driver provides a good example of each of these procedures;
+a minimal sketch of the allocation pattern follows below. See further notes
+below regarding the usage of multidimensional arrays within the Fortran
+interface.
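As an illustration only (not code from this repository), the following sketch
shows the allocation pattern described above using FFTW3's Fortran 2003
header fftw3.f03; the size used here is a placeholder for the value the Dfft
wrapper would supply (e.g. via localSize):

  program alloc_sketch
    use, intrinsic :: iso_c_binding
    implicit none
    include 'fftw3.f03'

    integer(C_SIZE_T) :: nlocal
    type(C_PTR) :: buf
    complex(C_DOUBLE_COMPLEX), pointer :: a(:)

    nlocal = 64_C_SIZE_T                 ! placeholder local grid size
    buf = fftw_alloc_complex(nlocal)     ! FFTW-aligned allocation, returns a C pointer
    call c_f_pointer(buf, a, [nlocal])   ! view the C buffer as the Fortran array a(1:nlocal)
    a = (0.0d0, 0.0d0)                   ! a can now be used as the transform buffer
    call fftw_free(buf)
  end program alloc_sketch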
+--------------
+Example Output
+--------------
+The output from TestFDfft will match that of TestDfft shown above, including
+the hexadecimal cast to check bit-wise numerical accuracy.
 ============================
 CheckDecomposition (Utility)
 ============================
@@ -310,3 +368,17 @@ for iteration through the grids values themselves, so the total number of
 grid vertexes locally and globally should not be limited by 32-bit integer
 size. This distribution code has been tested up to 16384^3 global grid and
 on >~10^6 MPI-ranks.
+-------------------------------
+Fortran Multidimensional Arrays
+-------------------------------
+The linear storage in memory of multidimensional arrays differs between
+C (row-major) and Fortran (column-major). The Fortran interface
+provided here implicitly assumes that the one-dimensional memory storage
+of the arrays to be transformed conforms to the C convention. The
+returned transformed data is also arranged in row-major format. Hence, care
+must be taken to ensure that data is arranged in this way when interfacing
+with the Fortran wrappers. In general, this involves a transpose of data when
+using a multidimensional Fortran array to store the 3D data in memory, as
+illustrated in the sketch below.
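As a purely illustrative sketch (not part of the repository, with made-up
grid sizes), the following shows one way to pack a column-major Fortran array
into the row-major one-dimensional ordering that the wrappers expect:

  program rowmajor_pack
    use, intrinsic :: iso_fortran_env, only: dp => real64
    implicit none
    integer, parameter :: nx = 2, ny = 3, nz = 4
    complex(dp) :: b(nz, ny, nx)    ! natural column-major Fortran storage
    complex(dp) :: a(nx*ny*nz)      ! 1D buffer in row-major (C) order
    integer :: i, j, k, idx

    b = (0.0_dp, 0.0_dp)
    b(1, 1, 1) = (1.0_dp, 0.0_dp)   ! delta function at the origin

    idx = 1
    do i = 1, nx                    ! slowest-varying index in C order
      do j = 1, ny
        do k = 1, nz                ! fastest-varying index in C order
          a(idx) = b(k, j, i)       ! effective transpose into row-major layout
          idx = idx + 1
        enddo
      enddo
    enddo
  end program rowmajor_pack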
@@ -88,7 +88,7 @@ program main
 #ifdef _OPENMP
 ierr = fftw_init_threads()
-if (.not. ierr) then
+if (ierr == 0) then
 write(*,*) "fftw_init_threads() failed!"
 call MPI_Abort(MPI_COMM_WORLD, ierr, ierr)
 endif
@@ -224,9 +224,9 @@ subroutine test
 numg = 1.0*ng(1)*ng(2)*ng(3)
 write(*,*)
 write(*,*) "Hex representations of double precision floats"
-write(*,fmt="(F18.3,A,Z)") zero, " = ", zero
-write(*,fmt="(F18.3,A,Z)") one, " = ", one
-write(*,fmt="(F18.3,A,Z)") numg, " = ", numg
+write(*,fmt="(F18.3,A,Z18)") zero, " = ", zero
+write(*,fmt="(F18.3,A,Z18)") one, " = ", one
+write(*,fmt="(F18.3,A,Z18)") numg, " = ", numg
 write(*,*)
 endif
@@ -286,7 +286,8 @@ subroutine assign_delta_function(dfft, a, n)
 !! Determine local grid dimensions in r-space
 call nlocalRspace(dfft, local_ng)
-!! Fill in the delta function
+!! Fill in the delta function.
+!! NOTE: We are filling in one-dimensional memory using the row-major C convention.
 local_indx = 1
 do i = 1, local_ng(1)
 global_i = local_ng(1)*self(1) + i
@@ -294,11 +295,10 @@ subroutine assign_delta_function(dfft, a, n)
 global_j = local_ng(2)*self(2) + j
 do k = 1, local_ng(3)
 global_k = local_ng(3)*self(3) + k
-a(local_indx)%im = 0.
 if (global_i == 1 .and. global_j == 1 .and. global_k == 1) then
-a(local_indx)%re = 1.
+a(local_indx) = cmplx(1.,0.)
 else
-a(local_indx)%re = 0.
+a(local_indx) = cmplx(0.,0.)
 endif
 local_indx = local_indx + 1
 enddo
@@ -325,12 +325,12 @@ subroutine check_kspace(dfft, a)
 nlocal = localSize(dfft)
-LocalRealMin = a(1)%re ; LocalRealMax = a(1)%re
-LocalImagMin = a(1)%im ; LocalImagMax = a(1)%im
+LocalRealMin = real(a(2)) ; LocalRealMax = real(a(2))
+LocalImagMin = aimag(a(2)) ; LocalImagMax = aimag(a(2))
 do i = 1, nlocal
-re = a(i)%re
-im = a(i)%im
+re = real(a(i))
+im = aimag(a(i))
 if (re < LocalRealMin) LocalRealMin = re
 if (re > LocalRealMax) LocalRealMax = re
 if (im < LocalImagMin) LocalImagMin = im
@@ -348,8 +348,8 @@ subroutine check_kspace(dfft, a)
 if (parRank == 0) then
 write(*,*)
 write(*,*) "k-space:"
-write(*,fmt="(A,F18.3,F18.3,A,Z,Z,A)") "real in [", GlobalRealMin, GlobalRealMax, "] = [", GlobalRealMin, GlobalRealMax, "]"
-write(*,fmt="(A,F18.3,F18.3,A,Z,Z,A)") "imag in [", GlobalImagMin, GlobalImagMax, "] = [", GlobalImagMin, GlobalImagMax, "]"
+write(*,fmt="(A,F18.3,F18.3,A,Z18,Z18,A)") "real in [", GlobalRealMin, GlobalRealMax, "] = [", GlobalRealMin, GlobalRealMax, "]"
+write(*,fmt="(A,F18.3,F18.3,A,Z18,Z18,A)") "imag in [", GlobalImagMin, GlobalImagMax, "] = [", GlobalImagMin, GlobalImagMax, "]"
 write(*,*)
 endif
@@ -378,8 +378,8 @@ subroutine check_rspace(dfft, a)
 call selfRspace(dfft, self)
 call nlocalRspace(dfft, local_ng)
-LocalRealMin = a(1)%re ; LocalRealMax = a(1)%re
-LocalImagMin = a(1)%im ; LocalImagMax = a(1)%im
+LocalRealMin = real(a(2)) ; LocalRealMax = real(a(2))
+LocalImagMin = aimag(a(2)) ; LocalImagMax = aimag(a(2))
 parComm = parentComm(dfft)
 call MPI_Comm_rank(parComm, parRank, ierr)
@@ -397,10 +397,11 @@ subroutine check_rspace(dfft, a)
 do k = 1, local_ng(3)
 global_k = local_ng(3)*self(3) + k
 if (global_i == 1 .and. global_j == 1 .and. global_k == 1) then
-write(*,fmt="(A,F18.3,F18.3,A,Z,Z,A)") "a[0,0,0] = ", a(local_indx)%re, a(local_indx)%im, "= (", a(local_indx)%re, a(local_indx)%im, ")"
+write(*,fmt="(A,F18.3,F18.3,A,Z18,Z18,A)") "a[0,0,0] = ", real(a(local_indx)), aimag(a(local_indx)), &
+"= (", real(a(local_indx)), aimag(a(local_indx)), ")"
 else
-re = a(i)%re
-im = a(i)%im
+re = real(a(local_indx))
+im = aimag(a(local_indx))
 if (re < LocalRealMin) LocalRealMin = re
 if (re > LocalRealMax) LocalRealMax = re
 if (im < LocalImagMin) LocalImagMin = im
@@ -411,9 +412,14 @@ subroutine check_rspace(dfft, a)
 enddo
 enddo
+call MPI_Allreduce(LocalRealMin, GlobalRealMin, 1, MPI_DOUBLE, MPI_MIN, parComm, ierr)
+call MPI_Allreduce(LocalRealMax, GlobalRealMax, 1, MPI_DOUBLE, MPI_MAX, parComm, ierr)
+call MPI_Allreduce(LocalImagMin, GlobalImagMin, 1, MPI_DOUBLE, MPI_MIN, parComm, ierr)
+call MPI_Allreduce(LocalImagMax, GlobalImagMax, 1, MPI_DOUBLE, MPI_MAX, parComm, ierr)
 if (parRank == 0) then
-write(*,fmt="(A,F18.3,F18.3,A,Z,Z,A)") "real in [", GlobalRealMin, GlobalRealMax, "] = [", GlobalRealMin, GlobalRealMax, "]"
-write(*,fmt="(A,F18.3,F18.3,A,Z,Z,A)") "imag in [", GlobalImagMin, GlobalImagMax, "] = [", GlobalImagMin, GlobalImagMax, "]"
+write(*,fmt="(A,F18.3,F18.3,A,Z18,Z18,A)") "real in [", GlobalRealMin, GlobalRealMax, "] = [", GlobalRealMin, GlobalRealMax, "]"
+write(*,fmt="(A,F18.3,F18.3,A,Z18,Z18,A)") "imag in [", GlobalImagMin, GlobalImagMax, "] = [", GlobalImagMin, GlobalImagMax, "]"
 write(*,*)
 endif