add pffft

This commit is contained in:
2024-11-09 14:57:18 -06:00
parent 78a00f71cc
commit a1790b8977
69 changed files with 25719 additions and 0 deletions

279
pffft/.github/workflows/c-cpp.yml vendored Normal file
View File

@@ -0,0 +1,279 @@
# Workflow identity, triggers and shared environment for the C/C++ CI jobs.
name: C/C++ CI

# Run for pushes to, and pull requests against, the two CI branches.
on:
  push:
    branches: [master, github_actions]
  pull_request:
    branches: [master, github_actions]

env:
  # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
  BUILD_TYPE: Release
jobs:
# Ubuntu build of every PFFFT configuration, run after the MIPP headers have
# been installed under $HOME/.local.
build_w_mipp_ubuntu-amd64:
  runs-on: ubuntu-latest
  steps:
    - name: check out MIPP
      # Pinned to a release tag rather than the moving 'master' branch.
      uses: actions/checkout@v4
      with:
        repository: hayguen/MIPP
        path: ./MIPP
    - name: cmake configure MIPP
      run: cmake -S MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$HOME/.local
    - name: cmake install MIPP headers
      # The listings only make the installed layout visible in the job log.
      run: >-
        cmake --build MIPP_build --target install
        && ls -alh $HOME/.local/
        && ls -alh $HOME/.local/include/
    - uses: actions/checkout@v4
    - name: cmake_make_simd_float_double
      run: >-
        mkdir build_simd_full
        && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_simd_full
    - name: cmake_make_simd_float
      run: >-
        mkdir build_simd_float
        && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_TYPE_DOUBLE=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_simd_float
    - name: cmake_make_simd_double
      run: >-
        mkdir build_simd_double
        && cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_TYPE_FLOAT=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_simd_double
    - name: cmake_make_no-simd_float_double
      run: >-
        mkdir build_no-simd_full
        && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_no-simd_full
    - name: cmake_make_no-simd_scalar_float_double
      run: >-
        mkdir build_no-simd_scalar_full
        && cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_no-simd_scalar_full
    - name: compress
      # Archive the build trees without CMake's own bookkeeping files.
      run: >-
        tar zcvf pffft_w_mipp_ubuntu-amd64.tar.gz
        --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt
        build_simd_full build_simd_float build_simd_double
        build_no-simd_full build_no-simd_scalar_full
    - name: 'Upload Artifact'
      # upload-artifact v2 is no longer served by GitHub. v4 artifacts are
      # immutable, so this job needs a name no other job in the run uses
      # (the non-MIPP Ubuntu job uploads 'pffft_ubuntu_builds').
      uses: actions/upload-artifact@v4
      with:
        name: pffft_w_mipp_ubuntu_builds
        path: pffft_w_mipp_ubuntu-amd64.tar.gz
# Ubuntu build of every PFFFT configuration without installing MIPP first.
build_ubuntu-amd64:
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
    - name: cmake_make_simd_float_double
      run: >-
        mkdir build_simd_full
        && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_simd_full
    - name: cmake_make_simd_float
      run: >-
        mkdir build_simd_float
        && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_TYPE_DOUBLE=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_simd_float
    - name: cmake_make_simd_double
      run: >-
        mkdir build_simd_double
        && cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_TYPE_FLOAT=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_simd_double
    - name: cmake_make_no-simd_float_double
      run: >-
        mkdir build_no-simd_full
        && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_no-simd_full
    - name: cmake_make_no-simd_scalar_float_double
      run: >-
        mkdir build_no-simd_scalar_full
        && cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_no-simd_scalar_full
    - name: compress
      # Archive the build trees without CMake's own bookkeeping files.
      run: >-
        tar zcvf pffft_ubuntu-amd64.tar.gz
        --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt
        build_simd_full build_simd_float build_simd_double
        build_no-simd_full build_no-simd_scalar_full
    - name: 'Upload Artifact'
      # upload-artifact v2 is no longer served by GitHub; v4 takes the same
      # 'name' and 'path' inputs.
      uses: actions/upload-artifact@v4
      with:
        name: pffft_ubuntu_builds
        path: pffft_ubuntu-amd64.tar.gz
# Cross-compiles 32- and 64-bit Windows binaries on Linux with the MinGW-w64
# toolchain, using the repository's cross_build_mingw32.sh / cross_build_mingw64.sh.
cross_build_win_from_linux:
  # NOTE(review): confirm the ubuntu-20.04 hosted image is still offered.
  runs-on: ubuntu-20.04
  steps:
    - name: prerequisites
      run: sudo apt -qq update && sudo apt -yqq install gcc-mingw-w64 g++-mingw-w64
    - name: check out MIPP
      # Pinned to a release tag rather than the moving 'master' branch.
      uses: actions/checkout@v4
      with:
        repository: hayguen/MIPP
        path: ./MIPP
    - name: cmake configure MIPP
      # Runs in runner.workspace; the MIPP checkout is addressed as pffft/MIPP
      # from there and the install prefix is that same directory.
      working-directory: ${{runner.workspace}}
      run: cmake -S pffft/MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$(pwd)
    - name: cmake install MIPP headers
      working-directory: ${{runner.workspace}}
      run: cmake --build MIPP_build --target install
    - uses: actions/checkout@v4
    - name: build_w32_no-simd
      working-directory: ${{runner.workspace}}
      run: >-
        cd $GITHUB_WORKSPACE
        && bash ./cross_build_mingw32.sh no-simd -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
    - name: build_w32_simd_full
      # X remembers runner.workspace (the MIPP install prefix) before changing
      # into the checkout.
      working-directory: ${{runner.workspace}}
      run: >-
        X=$(pwd) && cd $GITHUB_WORKSPACE
        && bash ./cross_build_mingw32.sh simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=pentium4 -DTARGET_C_ARCH=pentium4
        -DMIPP_INCLUDE_DIRS=$X/include/mipp
    - name: build_w64_no-simd
      working-directory: ${{runner.workspace}}
      run: >-
        cd $GITHUB_WORKSPACE
        && bash ./cross_build_mingw64.sh no-simd -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
    - name: build_w64_simd_full
      working-directory: ${{runner.workspace}}
      run: >-
        X=$(pwd) && cd $GITHUB_WORKSPACE
        && bash ./cross_build_mingw64.sh simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=sandybridge -DTARGET_C_ARCH=sandybridge
        -DMIPP_INCLUDE_DIRS=$X/include/mipp
    - name: compress
      # Archive the build trees without CMake's own bookkeeping files.
      run: >-
        tar zcvf pffft_cross-build-windows-from-linux-amd64.tar.gz
        --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt
        build_w32_no-simd build_w32_simd_full build_w64_no-simd build_w64_simd_full
    - name: 'Upload Artifact'
      # upload-artifact v2 is no longer served by GitHub; v4 takes the same
      # 'name' and 'path' inputs.
      uses: actions/upload-artifact@v4
      with:
        name: pffft_windows_from_cross_builds
        path: pffft_cross-build-windows-from-linux-amd64.tar.gz
# Native Windows builds with the Visual Studio generator: one configuration
# without SIMD plus SSE2, AVX and AVX2 variants that are pointed at the
# installed MIPP headers.
build_win_msvc:
  # The CMake configure and build commands are platform agnostic and should work equally
  # well on Windows or Mac. You can convert this to a matrix build if you need
  # cross-platform coverage.
  # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
  # NOTE(review): the "Visual Studio 16 2019" generator used below ties this job
  # to the windows-2019 image - confirm that image is still offered.
  runs-on: windows-2019
  steps:
    - name: check out MIPP
      # Pinned to a release tag rather than the moving 'master' branch.
      uses: actions/checkout@v4
      with:
        repository: hayguen/MIPP
        path: ./MIPP
    - name: cmake configure MIPP
      shell: bash
      working-directory: ${{runner.workspace}}
      run: cmake -S pffft/MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$(pwd)
    - name: cmake install MIPP headers
      working-directory: ${{runner.workspace}}
      run: cmake --build MIPP_build --target install
    - uses: actions/checkout@v4
    - name: Configure CMake No-SIMD
      shell: bash
      working-directory: ${{runner.workspace}}
      run: >-
        cmake -S $GITHUB_WORKSPACE -B build_no-simd -G "Visual Studio 16 2019" -A x64
        -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DENABLE_PFDSP=ON -DPFFFT_USE_SIMD=OFF
        -DTARGET_CXX_ARCH=none -DTARGET_C_ARCH=none
    - name: Build No-SIMD
      shell: bash
      working-directory: ${{runner.workspace}}
      # Execute the build. You can specify a specific target with "--target <NAME>"
      run: cmake --build build_no-simd --config $BUILD_TYPE
    - name: Configure CMake SSE2
      shell: bash
      working-directory: ${{runner.workspace}}
      run: >-
        cmake -S $GITHUB_WORKSPACE -B build_sse2 -G "Visual Studio 16 2019" -A x64
        -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DENABLE_PFDSP=ON
        -DTARGET_CXX_ARCH=SSE2 -DTARGET_C_ARCH=SSE2
        -DMIPP_INCLUDE_DIRS=$(pwd)/include/mipp
    - name: Build SSE2
      shell: bash
      working-directory: ${{runner.workspace}}
      # Execute the build. You can specify a specific target with "--target <NAME>"
      run: cmake --build build_sse2 --config $BUILD_TYPE
    - name: Configure CMake AVX
      # Use a bash shell so we can use the same syntax for environment variable
      # access regardless of the host operating system
      shell: bash
      working-directory: ${{runner.workspace}}
      run: >-
        cmake -S $GITHUB_WORKSPACE -B build_avx -G "Visual Studio 16 2019" -A x64
        -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DENABLE_PFDSP=ON
        -DTARGET_CXX_ARCH=AVX -DTARGET_C_ARCH=AVX
        -DMIPP_INCLUDE_DIRS=$(pwd)/include/mipp
    - name: Build AVX
      working-directory: ${{runner.workspace}}
      shell: bash
      # Execute the build. You can specify a specific target with "--target <NAME>"
      run: cmake --build build_avx --config $BUILD_TYPE
    - name: Configure CMake AVX2
      # Use a bash shell so we can use the same syntax for environment variable
      # access regardless of the host operating system
      shell: bash
      working-directory: ${{runner.workspace}}
      run: >-
        cmake -S $GITHUB_WORKSPACE -B build_avx2 -G "Visual Studio 16 2019" -A x64
        -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DENABLE_PFDSP=ON
        -DTARGET_CXX_ARCH=AVX2 -DTARGET_C_ARCH=AVX2
        -DMIPP_INCLUDE_DIRS=$(pwd)/include/mipp
    - name: Build AVX2
      working-directory: ${{runner.workspace}}
      shell: bash
      # Execute the build. You can specify a specific target with "--target <NAME>"
      run: cmake --build build_avx2 --config $BUILD_TYPE
    - name: compress
      # The build trees live in runner.workspace, so the archive is created there.
      working-directory: ${{runner.workspace}}
      run: >-
        tar zcvf pffft_windows-msvc-amd64.tar.gz
        --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt
        build_no-simd build_sse2 build_avx build_avx2
    - name: 'Upload Artifact'
      # upload-artifact v2 is no longer served by GitHub; v4 takes the same
      # 'name' and 'path' inputs.
      uses: actions/upload-artifact@v4
      with:
        name: pffft_windows_msvc_builds
        path: ${{runner.workspace}}/pffft_windows-msvc-amd64.tar.gz
# Native Windows build inside an MSYS2 environment.
build_win_mingw:
  # NOTE(review): confirm the windows-2019 hosted image is still offered.
  runs-on: windows-2019
  strategy:
    matrix:
      compiler: [gcc]
      msystem: [MINGW64]
  defaults:
    run:
      # Every 'run' step executes in the MSYS2 shell set up below.
      shell: msys2 {0}
  steps:
    - uses: actions/checkout@v4
    - uses: msys2/setup-msys2@v2
      with:
        # Taken from the matrix (previously hard-coded to the same value).
        msystem: ${{ matrix.msystem }}
        install: gcc cmake make
    - name: Configure cmake
      run: >-
        CC=${{ matrix.compiler }} cmake -DMINGW=ON -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        -S . -B build_mgw64
    - name: Build
      run: cmake --build build_mgw64
    - name: compress
      # Archive the build tree without CMake's own bookkeeping files.
      run: >-
        tar zcvf pffft_windows-mingw-amd64.tar.gz
        --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt
        build_mgw64
    - name: 'Upload Artifact'
      # upload-artifact v2 is no longer served by GitHub; v4 takes the same
      # 'name' and 'path' inputs.
      uses: actions/upload-artifact@v4
      with:
        name: pffft_windows_mingw_builds
        path: pffft_windows-mingw-amd64.tar.gz
# macOS build of every PFFFT configuration without installing MIPP first.
build_macos11:
  # copied from build_ubuntu-amd64 with minor renaming
  # The macos-11 hosted image has been retired, so jobs requesting it never
  # start. macos-13 is used as a replacement; job, archive and artifact names
  # are kept unchanged.
  # NOTE(review): confirm macos-13 is the intended replacement image.
  runs-on: macos-13
  steps:
    - uses: actions/checkout@v4
    - name: cmake_make_simd_float_double
      run: >-
        mkdir build_simd_full
        && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_simd_full
    - name: cmake_make_simd_float
      run: >-
        mkdir build_simd_float
        && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_TYPE_DOUBLE=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_simd_float
    - name: cmake_make_simd_double
      run: >-
        mkdir build_simd_double
        && cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_TYPE_FLOAT=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_simd_double
    - name: cmake_make_no-simd_float_double
      run: >-
        mkdir build_no-simd_full
        && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_no-simd_full
    - name: cmake_make_no-simd_scalar_float_double
      run: >-
        mkdir build_no-simd_scalar_full
        && cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_no-simd_scalar_full
    - name: compress
      # Archive the build trees without CMake's own bookkeeping files.
      run: >-
        tar zcvf pffft_macos-11.tar.gz
        --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt
        build_simd_full build_simd_float build_simd_double
        build_no-simd_full build_no-simd_scalar_full
    - name: 'Upload Artifact'
      # upload-artifact v2 is no longer served by GitHub; v4 takes the same
      # 'name' and 'path' inputs.
      uses: actions/upload-artifact@v4
      with:
        name: pffft_macos_builds
        path: pffft_macos-11.tar.gz
# macOS build of every PFFFT configuration, run after the MIPP headers have
# been installed under $HOME/.local.
build_w_mipp_macos11:
  # copied from build_w_mipp_ubuntu-amd64 with minor renaming
  # The macos-11 hosted image has been retired, so jobs requesting it never
  # start. macos-13 is used as a replacement; job and archive names are kept.
  # NOTE(review): confirm macos-13 is the intended replacement image.
  runs-on: macos-13
  steps:
    - name: check out MIPP
      # Pinned to a release tag rather than the moving 'master' branch.
      uses: actions/checkout@v4
      with:
        repository: hayguen/MIPP
        path: ./MIPP
    - name: cmake configure MIPP
      run: cmake -S MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$HOME/.local
    - name: cmake install MIPP headers
      # The listings only make the installed layout visible in the job log.
      run: >-
        cmake --build MIPP_build --target install
        && ls -alh $HOME/.local/
        && ls -alh $HOME/.local/include/
    - uses: actions/checkout@v4
    - name: cmake_make_simd_float_double
      run: >-
        mkdir build_simd_full
        && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_simd_full
    - name: cmake_make_simd_float
      run: >-
        mkdir build_simd_float
        && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_TYPE_DOUBLE=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_simd_float
    - name: cmake_make_simd_double
      run: >-
        mkdir build_simd_double
        && cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_TYPE_FLOAT=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_simd_double
    - name: cmake_make_no-simd_float_double
      run: >-
        mkdir build_no-simd_full
        && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_no-simd_full
    - name: cmake_make_no-simd_scalar_float_double
      run: >-
        mkdir build_no-simd_scalar_full
        && cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        && cmake --build build_no-simd_scalar_full
    - name: compress
      # Archive the build trees without CMake's own bookkeeping files.
      run: >-
        tar zcvf pffft_w_mipp_macos-11.tar.gz
        --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt
        build_simd_full build_simd_float build_simd_double
        build_no-simd_full build_no-simd_scalar_full
    - name: 'Upload Artifact'
      # upload-artifact v2 is no longer served by GitHub. v4 artifacts are
      # immutable, so this job needs a name no other job in the run uses
      # (the non-MIPP macOS job uploads 'pffft_macos_builds').
      uses: actions/upload-artifact@v4
      with:
        name: pffft_w_mipp_macos_builds
        path: pffft_w_mipp_macos-11.tar.gz

4
pffft/.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
build
build_benches
build_*
.vscode

9
pffft/.gitmodules vendored Normal file
View File

@@ -0,0 +1,9 @@
[submodule "greenffts"]
path = greenffts
url = https://github.com/hayguen/greenffts.git
[submodule "kissfft"]
path = kissfft
url = https://github.com/hayguen/kissfft.git
[submodule "pocketfft"]
path = pocketfft
url = https://github.com/hayguen/pocketfft.git

663
pffft/CMakeLists.txt Normal file
View File

@@ -0,0 +1,663 @@
# Top-level build script for PFFFT: declares the project, its user-facing
# options and the C/C++ language standards used by default.
# NOTE(review): CMAKE_C_STANDARD / CMAKE_CXX_STANDARD (set below) are only
# honored by CMake 3.1 and newer, so the declared 2.8 minimum looks too low -
# confirm against the modules in cmake/ before raising it.
cmake_minimum_required(VERSION 2.8)
project(PRETTY_FAST_FFT)
# smaller library size?
option(PFFFT_USE_TYPE_FLOAT "activate single precision 'float'?" ON)
option(PFFFT_USE_TYPE_DOUBLE "activate 'double' precision float?" ON)
# architecture/optimization options
option(PFFFT_USE_SIMD "use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? - " ON)
option(PFFFT_USE_SCALAR_VECT "use 4-element vector scalar operations (if no other SIMD)" ON)
# what to install?
option(INSTALL_PFFFT "install pffft to CMAKE_INSTALL_PREFIX?" ON)
option(INSTALL_PFDSP "install pfdsp to CMAKE_INSTALL_PREFIX?" OFF)
option(INSTALL_PFFASTCONV "install pffastconv to CMAKE_INSTALL_PREFIX?" OFF)
# test options
option(PFFFT_USE_BENCH_FFTW "use (system-installed) FFTW3 in fft benchmark?" OFF)
option(PFFFT_USE_BENCH_GREEN "use Green FFT in fft benchmark? - if exists in subdir" ON)
option(PFFFT_USE_BENCH_KISS "use KissFFT in fft benchmark? - if exists in subdir" ON)
option(PFFFT_USE_BENCH_POCKET "use PocketFFT in fft benchmark? - if exists in subdir" ON)
option(PFFFT_USE_BENCH_MKL "use Intel MKL in fft benchmark? needs to be installed" OFF)
option(PFFFT_USE_FFTPACK "compile and use FFTPACK in fft benchmark & validation?" ON)
option(PFFFT_USE_DEBUG_ASAN "use GCC's address sanitizer?" OFF)
option(PFFFT_DISABLE_LINK_WITH_M "Disables linking with m library to build with clangCL from MSVC" OFF)
# C90 requires the gcc extensions for function attributes like always_inline
# C99 provides the function attributes: no gcc extensions required
set(CMAKE_C_STANDARD 99)
set(CMAKE_C_EXTENSIONS OFF)
# Default for C++ targets; individual targets further down override this with
# CXX_STANDARD 11 where they need it.
set(CMAKE_CXX_STANDARD 98)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
# Accumulators for the install() rules; targets and headers are appended to
# them further down as the libraries are declared.
set(INSTALL_TARGETS "")
set(INSTALL_HEADERS "")

# Building with neither precision enabled makes no sense: stop right away.
if (NOT (PFFFT_USE_TYPE_FLOAT OR PFFFT_USE_TYPE_DOUBLE))
  message(FATAL_ERROR "activate at least one of PFFFT_USE_TYPE_FLOAT or PFFFT_USE_TYPE_DOUBLE")
endif()

# Make the project's own modules in cmake/ visible, then pull in the helpers
# for architecture flags and compiler warnings.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
include(cmake/target_optimizations.cmake)
include(cmake/compiler_warnings.cmake)

# Optional packages; only the MIPP lookup result is reported here.
find_package(PAPI)
find_package(MIPP)
if (NOT MIPP_FOUND)
  message(STATUS "NOT found MIPP")
else()
  # if (TARGET MIPP)
  message(STATUS "found MIPP")
endif()
# Library name handed to target_link_libraries() for sanitizer builds; it
# stays empty unless PFFFT_USE_DEBUG_ASAN is switched on.
set(ASANLIB "")
if (PFFFT_USE_DEBUG_ASAN)
  set(ASANLIB "asan")
endif()

# Log what CMake detected about compilers and platform.
message(STATUS "INFO: CMAKE_C_COMPILER_ID is ${CMAKE_C_COMPILER_ID}")
message(STATUS "INFO: CMAKE_CXX_COMPILER_ID is ${CMAKE_CXX_COMPILER_ID}")

if (NOT WIN32)
  message(STATUS "INFO: NOT WIN32")
else()
  message(STATUS "INFO: detected WIN32")
endif()

if (NOT MINGW)
  message(STATUS "INFO: NOT MINGW")
else()
  message(STATUS "INFO: detected MINGW with compiler ${CMAKE_C_COMPILER_ID}")
endif()

if (CMAKE_C_COMPILER_ID MATCHES "MSVC")
  message(STATUS "INFO: detected MSVC with compiler ${CMAKE_C_COMPILER_ID}")
endif()
# Optional third-party FFT implementations for the benchmark. Each one is
# added only when its option is ON and the marker file exists in the matching
# subdirectory; an enabled but missing library only produces a warning.
# PATH_GREEN / PATH_KISS / PATH_POCKET are set only when the library was added.

if (PFFFT_USE_BENCH_GREEN AND EXISTS "${CMAKE_CURRENT_LIST_DIR}/greenffts/CMakeLists.txt")
  message(STATUS "found subdir greenffts")
  set(PATH_GREEN "${CMAKE_CURRENT_LIST_DIR}/greenffts")
  add_subdirectory( "${PATH_GREEN}" )
elseif (PFFFT_USE_BENCH_GREEN)
  message(WARNING "GreenFFT not found in subdir greenffts")
endif()

# git submodule add https://github.com/hayguen/kissfft.git
if (PFFFT_USE_BENCH_KISS AND EXISTS "${CMAKE_CURRENT_LIST_DIR}/kissfft/CMakeLists.txt")
  message(STATUS "found subdir kissfft")
  set(PATH_KISS "${CMAKE_CURRENT_LIST_DIR}/kissfft")
  add_subdirectory( "${PATH_KISS}" )
elseif (PFFFT_USE_BENCH_KISS)
  message(WARNING "KissFFT not found in subdir kissfft")
endif()

# git submodule add https://github.com/hayguen/pocketfft.git
# Unlike the two above, PocketFFT is detected through one of its source files.
if (PFFFT_USE_BENCH_POCKET AND EXISTS "${CMAKE_CURRENT_LIST_DIR}/pocketfft/pocketfft_double.c")
  message(STATUS "found subdir pocketfft")
  set(PATH_POCKET "${CMAKE_CURRENT_LIST_DIR}/pocketfft")
  add_subdirectory( "${PATH_POCKET}" )
elseif (PFFFT_USE_BENCH_POCKET)
  message(WARNING "PocketFFT not found in subdir pocketfft")
endif()
########################################################################
# Fall back to a Release build when the caller did not choose a build
# type, so that optimization flags are applied.
########################################################################
if (NOT CMAKE_BUILD_TYPE)
  message(STATUS "Build type not specified: defaulting to release.")
  set(CMAKE_BUILD_TYPE "Release")
endif()
# Decide which math library (MATHLIB) the targets link against.
if (CMAKE_C_COMPILER_ID MATCHES "MSVC")
  # using Visual Studio C++
  message(STATUS "INFO: detected MSVC: will not link math lib m")
  add_definitions("/D_CRT_SECURE_NO_WARNINGS")
  set(MATHLIB "")
  set(MSVC_DISABLED_WARNINGS_LIST "C4996")
elseif (NOT PFFFT_DISABLE_LINK_WITH_M)
  message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m")
  set(MATHLIB "m")
endif()
# With PFFFT_DISABLE_LINK_WITH_M on a non-MSVC compiler MATHLIB is left unset.

# The C++ runtime is named explicitly only for MinGW.
if (MINGW)
  set(STDCXXLIB "stdc++")
else()
  set(STDCXXLIB "")
endif()
# Per-precision SIMD abstraction headers.
set(SIMD_FLOAT_HDRS
  simd/pf_float.h
  simd/pf_sse1_float.h
  simd/pf_altivec_float.h
  simd/pf_neon_float.h
  simd/pf_scalar_float.h
)
set(SIMD_DOUBLE_HDRS
  simd/pf_double.h
  simd/pf_avx_double.h
  simd/pf_scalar_double.h
)

# Single precision sources; the list stays empty when 'float' is disabled.
set(FLOAT_SOURCES)
if (PFFFT_USE_TYPE_FLOAT)
  set(FLOAT_SOURCES pffft.c pffft.h ${SIMD_FLOAT_HDRS})
  if (INSTALL_PFFFT)
    list(APPEND INSTALL_HEADERS pffft.h)
  endif()
endif()

# Double precision sources; the list stays empty when 'double' is disabled.
set(DOUBLE_SOURCES)
if (PFFFT_USE_TYPE_DOUBLE)
  set(DOUBLE_SOURCES pffft_double.c pffft_double.h ${SIMD_DOUBLE_HDRS})
  if (INSTALL_PFFFT)
    list(APPEND INSTALL_HEADERS pffft_double.h)
  endif()
endif()
######################################################
# PFFFT: the core FFT static library (output name "pffft"), built from the
# enabled precision sources plus the shared implementation files.
add_library(PFFFT STATIC
  ${FLOAT_SOURCES}
  ${DOUBLE_SOURCES}
  pffft_common.c
  pffft_priv_impl.h
  pffft.hpp
)
set_target_properties(PFFFT PROPERTIES OUTPUT_NAME "pffft")
target_compile_definitions(PFFFT PRIVATE _USE_MATH_DEFINES)
target_activate_c_compiler_warnings(PFFFT)

if (PFFFT_USE_SCALAR_VECT)
  target_compile_definitions(PFFFT PRIVATE PFFFT_SCALVEC_ENABLED=1)
endif()
if (PFFFT_USE_DEBUG_ASAN)
  target_compile_options(PFFFT PRIVATE "-fsanitize=address")
endif()

# Architecture flags are always requested; SIMD is switched off separately
# through a compile definition.
target_set_c_arch_flags(PFFFT)
if (NOT PFFFT_USE_SIMD)
  target_compile_definitions(PFFFT PRIVATE PFFFT_SIMD_DISABLE=1)
endif()

target_link_libraries( PFFFT ${ASANLIB} ${MATHLIB} )

# Consumers inside the build tree get this directory on their include path.
set_property(TARGET PFFFT APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
)

if (INSTALL_PFFFT)
  list(APPEND INSTALL_TARGETS PFFFT)
  list(APPEND INSTALL_HEADERS pffft.hpp)
endif()
######################################################
# PFDSP: C++11 static library (output name "pfdsp") with the mixer, carrier
# and CIC sources. Only declared when single precision is enabled.
if (PFFFT_USE_TYPE_FLOAT)
  add_library(PFDSP STATIC
    pf_mixer.cpp pf_mixer.h
    pf_cplx.h
    pf_carrier.cpp pf_carrier.h
    pf_cic.cpp pf_cic.h
    fmv.h
  )
  set_target_properties(PFDSP PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED ON
    OUTPUT_NAME "pfdsp"
  )
  target_compile_definitions(PFDSP PRIVATE _USE_MATH_DEFINES)
  target_activate_cxx_compiler_warnings(PFDSP)

  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(PFDSP PRIVATE "-fsanitize=address")
  endif()

  # Either disable SIMD via define or request the architecture flags.
  if (NOT PFFFT_USE_SIMD)
    target_compile_definitions(PFDSP PRIVATE PFFFT_SIMD_DISABLE=1)
  else()
    target_set_cxx_arch_flags(PFDSP)
  endif()

  target_link_libraries( PFDSP ${MATHLIB} )
  set_property(TARGET PFDSP APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
  )

  if (INSTALL_PFDSP)
    list(APPEND INSTALL_TARGETS PFDSP)
    list(APPEND INSTALL_HEADERS pf_mixer.h pf_cplx.h pf_carrier.h pf_cic.h)
  endif()
endif()
######################################################
# FFTPACK reference implementation: the same source is compiled into a float
# and a double library, plus one self-test executable per precision.
if (PFFFT_USE_FFTPACK)

  # float / single precision
  add_library(FFTPACK_FLOAT STATIC fftpack.c fftpack.h)
  target_compile_definitions(FFTPACK_FLOAT PRIVATE _USE_MATH_DEFINES)
  target_activate_c_compiler_warnings(FFTPACK_FLOAT)
  target_link_libraries( FFTPACK_FLOAT ${MATHLIB} )
  set_property(TARGET FFTPACK_FLOAT APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
  )

  # double precision
  # FFTPACK_DOUBLE_PRECISION is PUBLIC so targets linking this library see it too.
  add_library(FFTPACK_DOUBLE STATIC fftpack.c fftpack.h)
  target_compile_definitions(FFTPACK_DOUBLE
    PRIVATE _USE_MATH_DEFINES
    PUBLIC  FFTPACK_DOUBLE_PRECISION
  )
  target_activate_c_compiler_warnings(FFTPACK_DOUBLE)
  target_link_libraries( FFTPACK_DOUBLE ${MATHLIB} )
  set_property(TARGET FFTPACK_DOUBLE APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
  )

  # builtin test program of fftpack
  add_executable(test_fftpack_float fftpack.c fftpack.h)
  target_compile_definitions(test_fftpack_float
    PRIVATE _USE_MATH_DEFINES TESTING_FFTPACK
  )
  target_link_libraries(test_fftpack_float ${MATHLIB})

  add_executable(test_fftpack_double fftpack.c fftpack.h)
  target_compile_definitions(test_fftpack_double
    PRIVATE _USE_MATH_DEFINES FFTPACK_DOUBLE_PRECISION TESTING_FFTPACK
  )
  target_link_libraries(test_fftpack_double ${MATHLIB})

endif()
######################################################
# PFFASTCONV: static library (output name "pffastconv") linked against PFFFT.
if (PFFFT_USE_TYPE_FLOAT)
  # only 'float' supported in PFFASTCONV
  add_library(PFFASTCONV STATIC
    pffastconv.c
    pffastconv.h
    pffft.h
  )
  set_target_properties(PFFASTCONV PROPERTIES OUTPUT_NAME "pffastconv")
  target_compile_definitions(PFFASTCONV PRIVATE _USE_MATH_DEFINES)
  target_activate_c_compiler_warnings(PFFASTCONV)

  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(PFFASTCONV PRIVATE "-fsanitize=address")
  endif()

  target_link_libraries( PFFASTCONV PFFFT ${ASANLIB} ${MATHLIB} )
  set_property(TARGET PFFASTCONV APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
  )

  if (INSTALL_PFFASTCONV)
    list(APPEND INSTALL_TARGETS PFFASTCONV)
    list(APPEND INSTALL_HEADERS pffastconv.h)
  endif()
endif()
######################################################
# Install the libraries and headers collected in INSTALL_TARGETS and
# INSTALL_HEADERS.
install( TARGETS ${INSTALL_TARGETS} DESTINATION lib)
install( FILES ${INSTALL_HEADERS} DESTINATION include)

# 'uninstall' runs the uninstall.cmake script in script mode.
# The script path is resolved against this CMakeLists.txt's own directory:
# CMAKE_SOURCE_DIR would point at the parent project when this tree is pulled
# in through add_subdirectory(), while for a standalone build both are the same.
# The target is skipped when an enclosing project already defines 'uninstall',
# because a second definition would abort the configure step.
# NOTE(review): assumes uninstall.cmake sits next to this file - confirm.
if (NOT TARGET uninstall)
  add_custom_target(uninstall
    "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_SOURCE_DIR}/uninstall.cmake"
  )
endif()
#######################################################
# test_pffft.c is compiled once per enabled precision; the precision is
# selected with a PFFFT_ENABLE_* definition.
if (PFFFT_USE_TYPE_FLOAT)
  add_executable( test_pffft_float test_pffft.c )
  target_compile_definitions(test_pffft_float
    PRIVATE _USE_MATH_DEFINES PFFFT_ENABLE_FLOAT
  )
  target_link_libraries( test_pffft_float PFFFT ${ASANLIB} )
endif()

######################################################
if (PFFFT_USE_TYPE_DOUBLE)
  add_executable( test_pffft_double test_pffft.c )
  target_compile_definitions(test_pffft_double
    PRIVATE _USE_MATH_DEFINES PFFFT_ENABLE_DOUBLE
  )
  target_link_libraries( test_pffft_double PFFFT ${ASANLIB} )
endif()
######################################################
# test_fft_factors: a single executable that gets one PFFFT_ENABLE_*
# definition for every enabled precision.
add_executable( test_fft_factors test_fft_factors.c )
target_link_libraries(test_fft_factors PFFFT ${ASANLIB} ${MATHLIB})

if (PFFFT_USE_TYPE_FLOAT)
  target_compile_definitions(test_fft_factors PRIVATE PFFFT_ENABLE_FLOAT)
endif()
if (PFFFT_USE_TYPE_DOUBLE)
  target_compile_definitions(test_fft_factors PRIVATE PFFFT_ENABLE_DOUBLE)
endif()
######################################################
# test_pffft.cpp built with the project's default C++ standard.
add_executable( test_pffft_cpp test_pffft.cpp )
target_compile_definitions(test_pffft_cpp PRIVATE _USE_MATH_DEFINES)
target_link_libraries( test_pffft_cpp PFFFT ${STDCXXLIB} ${ASANLIB} )
if (PFFFT_USE_TYPE_FLOAT)
  target_compile_definitions(test_pffft_cpp PRIVATE PFFFT_ENABLE_FLOAT)
endif()
if (PFFFT_USE_TYPE_DOUBLE)
  target_compile_definitions(test_pffft_cpp PRIVATE PFFFT_ENABLE_DOUBLE)
endif()

######################################################
# The same source again, this time required to build as C++11.
add_executable( test_pffft_cpp_11 test_pffft.cpp )
set_target_properties(test_pffft_cpp_11 PROPERTIES
  CXX_STANDARD 11
  CXX_STANDARD_REQUIRED ON
)
target_compile_definitions(test_pffft_cpp_11 PRIVATE _USE_MATH_DEFINES)
target_link_libraries( test_pffft_cpp_11 PFFFT ${STDCXXLIB} ${ASANLIB} )
if (PFFFT_USE_TYPE_FLOAT)
  target_compile_definitions(test_pffft_cpp_11 PRIVATE PFFFT_ENABLE_FLOAT)
endif()
if (PFFFT_USE_TYPE_DOUBLE)
  target_compile_definitions(test_pffft_cpp_11 PRIVATE PFFFT_ENABLE_DOUBLE)
endif()
######################################################
# test_pffastconv: test program for PFFASTCONV. It receives the same
# architecture flags / SIMD switch as the library targets.
if (PFFFT_USE_TYPE_FLOAT)
  add_executable(test_pffastconv
    test_pffastconv.c
    ${SIMD_FLOAT_HDRS}
    ${SIMD_DOUBLE_HDRS}
  )
  target_compile_definitions(test_pffastconv PRIVATE _USE_MATH_DEFINES)

  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(test_pffastconv PRIVATE "-fsanitize=address")
  endif()

  target_set_c_arch_flags(test_pffastconv)
  if (NOT PFFFT_USE_SIMD)
    target_compile_definitions(test_pffastconv PRIVATE PFFFT_SIMD_DISABLE=1)
  endif()

  target_link_libraries( test_pffastconv PFFASTCONV ${ASANLIB} ${MATHLIB} )
endif()
######################################################
# bench_pffft_float: single precision FFT benchmark. Every optional reference
# implementation adds a HAVE_* definition and the library to link against.
if (PFFFT_USE_TYPE_FLOAT)
  add_executable(bench_pffft_float bench_pffft.c pffft.h)
  target_compile_definitions(bench_pffft_float
    PRIVATE _USE_MATH_DEFINES PFFFT_ENABLE_FLOAT
  )
  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(bench_pffft_float PRIVATE "-fsanitize=address")
  endif()
  target_link_libraries( bench_pffft_float PFFFT ${ASANLIB} )

  if (PFFFT_USE_FFTPACK)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_FFTPACK=1)
    target_link_libraries(bench_pffft_float FFTPACK_FLOAT)
  endif()

  if (PFFFT_USE_BENCH_FFTW)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_FFTW=1)
    target_link_libraries(bench_pffft_float fftw3f)
  endif()

  # The PATH_* variables are only set when the sub-project was actually found.
  if (PFFFT_USE_BENCH_GREEN AND PATH_GREEN)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_GREEN_FFTS=1)
    target_link_libraries(bench_pffft_float GreenFFT)
  endif()

  if (PFFFT_USE_BENCH_KISS AND PATH_KISS)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_KISS_FFT=1)
    target_link_libraries(bench_pffft_float KissFFT)
  endif()

  if (PFFFT_USE_BENCH_POCKET AND PATH_POCKET)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_POCKET_FFT=1)
    target_link_libraries(bench_pffft_float PocketFFT)
  endif()

  if (PFFFT_USE_BENCH_MKL)
    # "i686" and "x86_64" have chances to work.
    # other PROCESSORs could be "ppc", "ppc64", "arm", "aarch64", "armv7l" - or something else?!
    if (NOT ((CMAKE_SYSTEM_PROCESSOR STREQUAL "i686") OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")))
      message(WARNING "using Intel MKL on '${CMAKE_SYSTEM_PROCESSOR}' might fail.")
    endif()
    message(STATUS "In case compiling/linking with Intel MKL fails, check CMakeLists.txt or deactivate PFFFT_USE_BENCH_MKL")
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_MKL=1)
    target_link_libraries(bench_pffft_float mkl_intel_lp64 mkl_sequential -lmkl_core)
  endif()
endif()
# bench_pffft_double: double precision FFT benchmark. Of the optional
# reference implementations only FFTPACK, FFTW, PocketFFT and MKL are wired
# in for this precision.
if (PFFFT_USE_TYPE_DOUBLE)
  add_executable(bench_pffft_double bench_pffft.c pffft.h)
  target_compile_definitions(bench_pffft_double
    PRIVATE _USE_MATH_DEFINES PFFFT_ENABLE_DOUBLE
  )
  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(bench_pffft_double PRIVATE "-fsanitize=address")
  endif()
  target_link_libraries( bench_pffft_double PFFFT ${ASANLIB} )

  if (PFFFT_USE_FFTPACK)
    target_compile_definitions(bench_pffft_double PRIVATE HAVE_FFTPACK=1)
    target_link_libraries(bench_pffft_double FFTPACK_DOUBLE)
  endif()

  if (PFFFT_USE_BENCH_FFTW)
    target_compile_definitions(bench_pffft_double PRIVATE HAVE_FFTW=1)
    target_link_libraries(bench_pffft_double fftw3)
  endif()

  # PATH_POCKET is only set when the sub-project was actually found.
  if (PFFFT_USE_BENCH_POCKET AND PATH_POCKET)
    target_compile_definitions(bench_pffft_double PRIVATE HAVE_POCKET_FFT=1)
    target_link_libraries(bench_pffft_double PocketFFT)
  endif()

  if (PFFFT_USE_BENCH_MKL)
    # "i686" and "x86_64" have chances to work.
    # other PROCESSORs could be "ppc", "ppc64", "arm", "aarch64", "armv7l" - or something else?!
    if (NOT ((CMAKE_SYSTEM_PROCESSOR STREQUAL "i686") OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")))
      message(WARNING "using Intel MKL on '${CMAKE_SYSTEM_PROCESSOR}' might fail.")
    endif()
    message(STATUS "In case compiling/linking with Intel MKL fails, check CMakeLists.txt or deactivate PFFFT_USE_BENCH_MKL")
    target_compile_definitions(bench_pffft_double PRIVATE HAVE_MKL=1)
    target_link_libraries(bench_pffft_double mkl_intel_lp64 mkl_sequential -lmkl_core)
  endif()
endif()
######################################################
# Float-only targets: the mixer benchmark, the pf_conv libraries (one build of
# pf_conv.cpp per architecture variant plus a dispatcher library) and the
# convolution benchmark that links against the dispatcher.
if (PFFFT_USE_TYPE_FLOAT)

  # --- mixer benchmark -------------------------------------------------------
  add_executable(bench_pf_mixer_float bench_mixers.cpp papi_perf_counter.h)
  target_compile_definitions(bench_pf_mixer_float PRIVATE _USE_MATH_DEFINES)
  target_compile_definitions(bench_pf_mixer_float PRIVATE PFFFT_ENABLE_FLOAT)
  target_link_libraries( bench_pf_mixer_float ${ASANLIB} )
  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(bench_pf_mixer_float PRIVATE "-fsanitize=address")
  endif()
  if (PAPI_FOUND)
    target_compile_definitions(bench_pf_mixer_float PRIVATE HAVE_PAPI=1)
    target_link_libraries(bench_pf_mixer_float ${PAPI_LIBRARIES})
  endif()
  target_link_libraries( bench_pf_mixer_float PFDSP $<$<CXX_COMPILER_ID:GNU>:stdc++> )

  ############################################################################

  # --- pf_conv libraries -----------------------------------------------------
  # pf_conv.cpp is compiled once per variant; each build receives its own
  # CONV_ARCH_POST definition (presumably used by the source to derive distinct
  # symbol names - confirm in pf_conv.cpp).

  # Variant "none": built with MIPP_NO_INTRINSICS=1.
  add_library(pf_conv_arch_none pf_conv.cpp pf_conv.h pf_cplx.h)
  target_compile_definitions(pf_conv_arch_none PRIVATE CONV_ARCH_POST=none MIPP_NO_INTRINSICS=1)
  set_property(TARGET pf_conv_arch_none PROPERTY CXX_STANDARD 11)
  set_property(TARGET pf_conv_arch_none PROPERTY CXX_STANDARD_REQUIRED ON)
  target_activate_cxx_compiler_warnings(pf_conv_arch_none)

  # Dispatcher library; every variant library is linked into it.
  add_library(pf_conv_dispatcher pf_conv_dispatcher.cpp pf_conv_dispatcher.h pf_conv.h pf_cplx.h)
  set_property(TARGET pf_conv_dispatcher PROPERTY CXX_STANDARD 11)
  set_property(TARGET pf_conv_dispatcher PROPERTY CXX_STANDARD_REQUIRED ON)
  target_activate_cxx_compiler_warnings(pf_conv_dispatcher)

  # Variant "dflt": built with the project's default C++ architecture flags
  # (applied by target_set_cxx_arch_flags(), defined elsewhere in this file).
  add_library(pf_conv_arch_dflt pf_conv.cpp pf_conv.h pf_cplx.h)
  target_compile_definitions(pf_conv_arch_dflt PRIVATE CONV_ARCH_POST=dflt)
  set_property(TARGET pf_conv_arch_dflt PROPERTY CXX_STANDARD 11)
  set_property(TARGET pf_conv_arch_dflt PROPERTY CXX_STANDARD_REQUIRED ON)
  target_activate_cxx_compiler_warnings(pf_conv_arch_dflt)
  target_set_cxx_arch_flags(pf_conv_arch_dflt)

  target_link_libraries(pf_conv_dispatcher pf_conv_arch_none pf_conv_arch_dflt)

  # Select the additional architecture variants for the current processor and
  # compiler. PF_CONV_ARCHES lists the variant names; PF_CONV_OPT_<name> and
  # PF_CONV_EXTRA_<name> emulate maps from variant name to compiler options.
  if ((CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64"))
    if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
      set(PF_CONV_ARCHES "sse3;sse4;avx;avx2")
      set(PF_CONV_OPT_sse3 "core2")  # emulate a map
      set(PF_CONV_OPT_sse4 "nehalem")
      set(PF_CONV_OPT_avx "sandybridge")
      set(PF_CONV_OPT_avx2 "haswell")
      target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_GCC_AMD64)
    elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
      set(PF_CONV_ARCHES "sse2;avx;avx2")
      set(PF_CONV_OPT_sse2 "SSE2")  # emulate a map
      set(PF_CONV_OPT_avx "AVX")
      set(PF_CONV_OPT_avx2 "AVX2")
      target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_MSVC_AMD64)
    else()
      set(PF_CONV_ARCHES "")
      message(WARNING "unknown compiler ${CMAKE_CXX_COMPILER_ID} on CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
    endif()
  elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
      set(PF_CONV_ARCHES "armv8a")
      set(PF_CONV_OPT_armv8a "armv8-a")  # emulate a map for arch
      target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_GCC_AARCH64)
    else()
      set(PF_CONV_ARCHES "")
      message(WARNING "unknown compiler ${CMAKE_CXX_COMPILER_ID} on CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
    endif()
  elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "armv7l")
    if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
      set(PF_CONV_ARCHES "neon_vfpv4;neon_rpi3_a53;neon_rpi4_a72")
      set(PF_CONV_OPT_neon_vfpv4 "armv7-a")  # emulate a map for arch
      set(PF_CONV_EXTRA_neon_vfpv4 "neon_vfpv4")  # emulate a map for additional options (EXTRA)
      set(PF_CONV_OPT_neon_rpi3_a53 "armv7-a")
      set(PF_CONV_EXTRA_neon_rpi3_a53 "neon_rpi3_a53")
      set(PF_CONV_OPT_neon_rpi4_a72 "armv7-a")
      set(PF_CONV_EXTRA_neon_rpi4_a72 "neon_rpi4_a72")
      target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_GCC_ARM32NEON)
    else()
      set(PF_CONV_ARCHES "")
      message(WARNING "unknown compiler ${CMAKE_CXX_COMPILER_ID} on CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
    endif()
  else()
    # Reset the list like every other fallback branch does, so that a value
    # inherited from an enclosing scope or the cache cannot leak into the
    # foreach() loops below.
    set(PF_CONV_ARCHES "")
    message(WARNING "this is unforseen CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
  endif()

  # One library per selected variant, each linked into the dispatcher.
  foreach (arch_opt ${PF_CONV_ARCHES})
    add_library(pf_conv_arch_${arch_opt} pf_conv.cpp pf_conv.h pf_cplx.h)
    set_property(TARGET pf_conv_arch_${arch_opt} PROPERTY CXX_STANDARD 11)
    set_property(TARGET pf_conv_arch_${arch_opt} PROPERTY CXX_STANDARD_REQUIRED ON)
    target_activate_cxx_compiler_warnings(pf_conv_arch_${arch_opt})
    target_compile_definitions(pf_conv_arch_${arch_opt} PRIVATE CONV_ARCH_POST=${arch_opt})
    # target_set_cxx_arch_option() is defined elsewhere in this file; it is
    # given the arch option, the EXTRA option and the arch option once more.
    target_set_cxx_arch_option(pf_conv_arch_${arch_opt} "${PF_CONV_OPT_${arch_opt}}" "${PF_CONV_EXTRA_${arch_opt}}" "${PF_CONV_OPT_${arch_opt}}")
    target_link_libraries(pf_conv_dispatcher pf_conv_arch_${arch_opt})
    message(STATUS "added library pf_conv_arch_${arch_opt} with CONV_ARCH_POST=${arch_opt}")
  endforeach()

  # AddressSanitizer instrumentation for all pf_conv libraries.
  if (PFFFT_USE_DEBUG_ASAN)
    foreach (arch_opt ${PF_CONV_ARCHES})
      target_compile_options(pf_conv_arch_${arch_opt} PRIVATE "-fsanitize=address")
      target_link_libraries( pf_conv_arch_${arch_opt} ${ASANLIB})
    endforeach()
    target_compile_options(pf_conv_arch_none PRIVATE "-fsanitize=address")
    target_link_libraries( pf_conv_arch_none ${ASANLIB})
    # pf_conv_arch_dflt is instrumented too; it used to be the only pf_conv
    # library left out, which kept its code outside of ASan's coverage.
    target_compile_options(pf_conv_arch_dflt PRIVATE "-fsanitize=address")
    target_link_libraries( pf_conv_arch_dflt ${ASANLIB})
    target_compile_options(pf_conv_dispatcher PRIVATE "-fsanitize=address")
    target_link_libraries(pf_conv_dispatcher ${ASANLIB})
  endif()

  # Link the variant libraries and the "none" variant against MIPP when found.
  if(MIPP_FOUND)
    foreach (arch_opt ${PF_CONV_ARCHES})
      message(STATUS "link pf_conv_arch_${arch_opt} against MIPP")
      target_link_libraries(pf_conv_arch_${arch_opt} MIPP)
    endforeach()
    message(STATUS "link pf_conv_arch_none against MIPP")
    target_link_libraries(pf_conv_arch_none MIPP)
  endif()

  ############################################################################

  # --- convolution benchmark -------------------------------------------------
  add_executable(bench_pf_conv_float bench_conv.cpp papi_perf_counter.h)
  set_property(TARGET bench_pf_conv_float PROPERTY CXX_STANDARD 11)
  set_property(TARGET bench_pf_conv_float PROPERTY CXX_STANDARD_REQUIRED ON)
  target_compile_definitions(bench_pf_conv_float PRIVATE _USE_MATH_DEFINES)
  target_compile_definitions(bench_pf_conv_float PRIVATE PFFFT_ENABLE_FLOAT)
  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(bench_pf_conv_float PRIVATE "-fsanitize=address")
  endif()
  target_link_libraries( bench_pf_conv_float ${ASANLIB} )
  if (PAPI_FOUND)
    target_compile_definitions(bench_pf_conv_float PRIVATE HAVE_PAPI=1)
    target_link_libraries(bench_pf_conv_float ${PAPI_LIBRARIES})
  endif()
  if(MIPP_FOUND)
    target_link_libraries(bench_pf_conv_float MIPP)
  endif()
  target_link_libraries( bench_pf_conv_float pf_conv_dispatcher PFDSP $<$<CXX_COMPILER_ID:GNU>:stdc++> )

endif()
######################################################
# Pull in the targets declared in the examples/ subdirectory.
add_subdirectory(examples)

######################################################

# CTest registration. Every test runs an executable out of the current binary
# directory, using that directory as its working directory.
enable_testing()

add_test(
  NAME test_fft_factors
  COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_fft_factors"
  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)

if (PFFFT_USE_FFTPACK)
  add_test(
    NAME test_fftpack_float
    COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_fftpack_float"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )
  add_test(
    NAME test_fftpack_double
    COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_fftpack_double"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )
endif()

if (PFFFT_USE_TYPE_FLOAT)
  # Short benchmark runs (--quick) of bench_pffft_float for power-of-two and
  # non-power-of-two lengths.
  add_test(
    NAME bench_pffft_pow2
    COMMAND "${CMAKE_CURRENT_BINARY_DIR}/bench_pffft_float" --max-len 128 --quick
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )
  add_test(
    NAME bench_pffft_non2
    COMMAND "${CMAKE_CURRENT_BINARY_DIR}/bench_pffft_float" --non-pow2 --max-len 192 --quick
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )

  # Disabled test that would run plots.sh from the source directory:
  # add_test(NAME bench_plots
  #   COMMAND bash "-c" "${CMAKE_CURRENT_SOURCE_DIR}/plots.sh"
  #   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  # )

  # test_pffastconv is run four times: with --no-bench and with --no-len,
  # each with and without --sym.
  add_test(
    NAME test_pfconv_lens_symetric
    COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" --no-bench --quick --sym
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )
  add_test(
    NAME test_pfconv_lens_non_sym
    COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" --no-bench --quick
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )
  add_test(
    NAME bench_pfconv_symetric
    COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" --no-len --quick --sym
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )
  add_test(
    NAME bench_pfconv_non_sym
    COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" --no-len --quick
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )
endif()

38
pffft/LICENSE.txt Normal file
View File

@@ -0,0 +1,38 @@
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de )
Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Copyright (c) 2004 the University Corporation for Atmospheric
Research ("UCAR"). All rights reserved. Developed by NCAR's
Computational and Information Systems Laboratory, UCAR,
www.cisl.ucar.edu.
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.

352
pffft/README.md Normal file
View File

@@ -0,0 +1,352 @@
---
# PFFFT: a pretty fast FFT and fast convolution with PFFASTCONV
---
<!-- toc -->
- [Brief Description](#brief-description)
- [Why does it exist?](#why-does-it-exist)
- [CMake](#cmake)
- [History / Origin / Changes](#history--origin--changes)
- [Comparison with other FFTs](#comparison-with-other-ffts)
- [Dependencies / Required Linux packages](#dependencies--required-linux-packages)
- [Benchmarks and results](#benchmarks-and-results)
<!-- tocstop -->
---
## Brief description:
PFFFT does 1D Fast Fourier Transforms, of single precision real and
complex vectors. It tries to do it fast, it tries to be correct, and it
tries to be small. Computations do take advantage of SSE1 instructions
on x86 cpus, Altivec on powerpc cpus, and NEON on ARM cpus. The
license is BSD-like.
PFFFT is a fork of [Julien Pommier's library on bitbucket](https://bitbucket.org/jpommier/pffft/)
with some changes and additions.
PFFASTCONV does fast convolution (FIR filtering), of single precision
real vectors, utilizing the PFFFT library. The license is BSD-like.
PFDSP contains a few other signal processing functions.
Currently, mixing and carrier generation functions are contained.
It is work in progress - also the API!
The fast convolution from PFFASTCONV might get merged into PFDSP.
## Why does it exist:
I (Julien Pommier) was in search of a good performing FFT library ,
preferably very small and with a very liberal license.
When one says "fft library", FFTW ("Fastest Fourier Transform in the
West") is probably the first name that comes to mind -- I guess that
99% of open-source projects that need a FFT do use FFTW, and are happy
with it. However, it is quite a large library , which does everything
fft related (2d transforms, 3d transforms, other transformations such
as discrete cosine , or fast hartley). And it is licensed under the
GNU GPL , which means that it cannot be used in non open-source
products.
An alternative to FFTW that is really small, is the venerable FFTPACK
v4, which is available on NETLIB. A more recent version (v5) exists,
but it is larger as it deals with multi-dimensional transforms. This
is a library that is written in FORTRAN 77, a language that is now
considered as a bit antiquated by many. FFTPACKv4 was written in 1985,
by Dr Paul Swarztrauber of NCAR, more than 25 years ago ! And despite
its age, benchmarks show that it is still a very good performing FFT
library, see for example the 1d single precision benchmarks
[here](http://www.fftw.org/speed/opteron-2.2GHz-32bit/). It is however not
competitive with the fastest ones, such as FFTW, Intel MKL, AMD ACML,
Apple vDSP. The reason for that is that those libraries do take
advantage of the SSE SIMD instructions available on Intel CPUs,
available since the days of the Pentium III. These instructions deal
with small vectors of 4 floats at a time, instead of a single float
for a traditional FPU, so when using these instructions one may expect
a 4-fold performance improvement.
The idea was to take this fortran fftpack v4 code, translate to C,
modify it to deal with those SSE instructions, and check that the
final performance is not completely ridiculous when compared to other
SIMD FFT libraries. Translation to C was performed with [f2c](
http://www.netlib.org/f2c/). The resulting file was a bit edited in
order to remove the thousands of gotos that were introduced by
f2c. You will find the fftpack.h and fftpack.c sources in the
repository, this is a complete translation of [fftpack](
http://www.netlib.org/fftpack/), with the discrete cosine transform
and the test program. There is no license information in the netlib
repository, but it was confirmed to me by the fftpack v5 curators that
the [same terms do apply to fftpack v4](http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html).
This is a
"BSD-like" license, it is compatible with proprietary projects.
Adapting fftpack to deal with the SIMD 4-element vectors instead of
scalar single precision numbers was more complex than I originally
thought, especially with the real transforms, and I ended up writing
more code than I planned..
## The code:
### Good old C:
The FFT API is very very simple, just make sure that you read the comments in `pffft.h`.
The Fast convolution's API is also very simple, just make sure that you read the comments
in `pffastconv.h`.
### C++:
A simple C++ wrapper is available in `pffft.hpp`.
### Git:
This archive's source can be downloaded with git (without the submodules):
```
git clone https://github.com/marton78/pffft.git
```
### Only two files?:
_"Only two files, in good old C, pffft.c and pffft.h"_
This statement does **NO LONGER** hold!
With new functionality and support for AVX, there was need to restructure the sources.
But you can compile and link **pffft** as a static library.
## CMake:
There's now CMake support to build the static libraries `libPFFFT.a`
and `libPFFASTCONV.a` from the source files, plus the additional
`libFFTPACK.a` library. The latter's sources are there anyway for the benchmark.
There are several CMake options to modify library size and optimization.
You can explore all available options with `cmake-gui` or `ccmake`,
the console version - after having installed (on Debian/Ubuntu Linux) one of
```
sudo apt-get install cmake-qt-gui
sudo apt-get install cmake-curses-gui
```
Some of the options:
* `PFFFT_USE_TYPE_FLOAT` to activate single precision 'float' (default: ON)
* `PFFFT_USE_TYPE_DOUBLE` to activate 'double' precision float (default: ON)
* `PFFFT_USE_SIMD` to use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? (default: ON)
* `DISABLE_SIMD_AVX` to disable AVX CPU features (default: OFF)
* `PFFFT_USE_SIMD_NEON` to force using NEON on ARM (requires PFFFT_USE_SIMD) (default: OFF)
* `PFFFT_USE_SCALAR_VECT` to use 4-element vector scalar operations (if no other SIMD) (default: ON)
Options can be passed to `cmake` at command line, e.g.
```
cmake -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_TYPE_DOUBLE=ON
```
My Linux distribution defaults to GCC. With installed CLANG and the bash shell, you can use it with
```
mkdir build
cd build
CC=/usr/bin/clang CXX=/usr/bin/clang++ cmake -DCMAKE_BUILD_TYPE=Debug ../
cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=~ ../
ccmake . # or: cmake-gui .
cmake --build . # or simply: make
ctest # to execute some tests - including benchmarks
cmake --build . --target install # or simply: [sudo] make install
```
With MSVC on Windows, you need some different options. Following ones to build a 64-bit Release with Visual Studio 2019:
```
mkdir build
cd build
cmake -G "Visual Studio 16 2019" -A x64 ..
cmake --build . --config Release
ctest -C Release
```
see [https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators](https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators)
## History / Origin / Changes:
Origin for this code/fork is Julien Pommier's pffft on bitbucket:
[https://bitbucket.org/jpommier/pffft/](https://bitbucket.org/jpommier/pffft/)
Git history shows following first commits of the major contributors:
* Julien Pommier: November 19, 2011
* Marton Danoczy: September 30, 2015
* Hayati Ayguen: December 22, 2019
* Dario Mambro: March 24, 2020
There are a few other contributors not listed here.
The main changes include:
* improved benchmarking, see [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks)
* double support
* avx(2) support
* c++ headers (wrapper)
* additional API helper functions
* additional library for fast convolution
* cmake support
* ctest
## Comparison with other FFTs:
The idea was not to break speed records, but to get a decently fast
fft that is at least 50% as fast as the fastest FFT -- especially on
slowest computers . I'm more focused on getting the best performance
on slow cpus (Atom, Intel Core 1, old Athlons, ARM Cortex-A9...), than
on getting top performance on today's fastest cpus.
It can be used in a real-time context as the fft functions do not
perform any memory allocation -- that is why they accept a 'work'
array in their arguments.
It is also a bit focused on performing 1D convolutions, that is why it
provides "unordered" FFTs , and a fourier domain convolution
operation.
Very interesting is [https://www.nayuki.io/page/free-small-fft-in-multiple-languages](https://www.nayuki.io/page/free-small-fft-in-multiple-languages).
It shows how small an FFT can be - including the Bluestein algorithm, but it's anything but fast.
The whole C++ implementation file is 161 lines, including the Copyright header, see
[https://github.com/nayuki/Nayuki-web-published-code/blob/master/free-small-fft-in-multiple-languages/FftComplex.cpp](https://github.com/nayuki/Nayuki-web-published-code/blob/master/free-small-fft-in-multiple-languages/FftComplex.cpp)
## Dependencies / Required Linux packages
On Debian/Ubuntu Linux following packages should be installed:
```
sudo apt-get install build-essential gcc g++ cmake
```
## Benchmarks and results
#### Quicklink
Find results at [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks).
#### General
My (Hayati Ayguen) first look at FFT-benchmarks was with [benchFFT](http://www.fftw.org/benchfft/)
and especially the results of the benchmarks [results](http://www.fftw.org/speed/),
which demonstrate the performance of the [FFTW](http://www.fftw.org/).
Looking at the benchmarked computer systems from today's view (2021), these are quite outdated.
Having a look into the [benchFFT source code](http://www.fftw.org/benchfft/benchfft-3.1.tar.gz),
the latest source changes, including competitive fft implementations, are dated November 2003.
In 2019, when pffft got my attention at [bitbucket](https://bitbucket.org/jpommier/pffft/src/master/),
there were also some benchmark results.
Unfortunately the results are tables with numbers - without graphical plots.
Without the plots, i could not get an impression. That was, why i started
[https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks),
which includes GnuPlot figures.
Today in June 2021, i realized the existence of [https://github.com/FFTW/benchfft](https://github.com/FFTW/benchfft).
This repository is much more up-to-date with a commit in December 2020.
Unfortunately, it does not look so simple to get it running - including the generation of plots.
Is there any website showing benchFFT results of more recent computer systems?
Of course, it's very important, that a benchmark can be compared with a bunch
of different FFT algorithms/implementations.
This requires to have these compiled/built and utilizable.
#### Git submodules for Green-, Kiss- and Pocket-FFT
Sources for [Green-](https://github.com/hayguen/greenffts),
[Kiss-](https://github.com/hayguen/kissfft)
and [Pocket-FFT](https://github.com/hayguen/pocketfft)
can be downloaded directly with the sources of this repository - using git submodules:
```
git clone --recursive https://github.com/marton78/pffft.git
```
Important is `--recursive`, that does also fetch the submodules directly.
But you might retrieve the submodules later, too:
```
git submodule update --init
```
#### Fastest Fourier Transform in the West: FFTW
To allow comparison with FFTW [http://www.fftw.org/](http://www.fftw.org/),
cmake option `-DPFFFT_USE_BENCH_FFTW=ON` has to be used with following commands.
The cmake option requires previous setup of following (debian/ubuntu) package:
```
sudo apt-get install libfftw3-dev
```
#### Intel Math Kernel Library: MKL
Intel's MKL [https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl.html](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl.html)
currently looks even faster than FFTW.
On Ubuntu-Linux it's easy to setup with the package `intel-mkl`.
Similar on Debian: `intel-mkl-full`.
There are special repositories for following Linux distributions:
* Debian/apt: [https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html](https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html)
* RedHat/yum: [https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-yum-repo.html](https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-yum-repo.html)
* Gentoo/ebuild: [https://packages.gentoo.org/packages/sci-libs/mkl](https://packages.gentoo.org/packages/sci-libs/mkl)
#### Performing the benchmarks - with CMake
Benchmarks should be prepared by creating a special build folder
```
mkdir build_benches
cd build_benches
cmake ../bench
```
There are several CMake options to parametrize, which fft implementations should be benched.
You can explore all available options with `cmake-gui` or `ccmake`, see [CMake](#cmake).
Some of the options:
* `BENCH_ID` name the benchmark - used in filename
* `BENCH_ARCH` target architecture passed to compiler for code optimization
* `PFFFT_USE_BENCH_FFTW` use (system-installed) FFTW3 in fft benchmark? (default: OFF)
* `PFFFT_USE_BENCH_GREEN` use Green FFT in fft benchmark? (default: ON)
* `PFFFT_USE_BENCH_KISS` use KissFFT in fft benchmark? (default: ON)
* `PFFFT_USE_BENCH_POCKET` use PocketFFT in fft benchmark? (default: ON)
* `PFFFT_USE_BENCH_MKL` use Intel MKL in fft benchmark? (default: OFF)
These options can be passed to `cmake` at command line, e.g.
```
cmake -DBENCH_ARCH=native -DPFFFT_USE_BENCH_FFTW=ON -DPFFFT_USE_BENCH_MKL=ON ../bench
```
The benchmarks are built and executed with
```
cmake --build .
```
You can also specify to use a different compiler/version with the cmake step, e.g.:
```
CC=/usr/bin/gcc-9 CXX=/usr/bin/g++-9 cmake -DBENCH_ID=gcc9 -DBENCH_ARCH=native -DPFFFT_USE_BENCH_FFTW=ON -DPFFFT_USE_BENCH_MKL=ON ../bench
```
```
CC=/usr/bin/clang-11 CXX=/usr/bin/clang++-11 cmake -DBENCH_ID=clang11 -DBENCH_ARCH=native -DPFFFT_USE_BENCH_FFTW=ON -DPFFFT_USE_BENCH_MKL=ON ../bench
```
For using MSVC/Windows, the cmake command requires/needs the generator and architecture options and to be called from the VS Developer prompt:
```
cmake -G "Visual Studio 16 2019" -A x64 ../bench/
```
see [https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators](https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators)
For running with different compiler version(s):
* copy the result file (.tgz), e.g. `cp *.tgz ../`
* delete the build directory: `rm -rf *`
* then continue with the cmake step
#### Benchmark results and contribution
You might contribute by providing us the results of your computer(s).
The benchmark results are stored in a separate git-repository:
See [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks).
This is to keep this repository's sources small.

224
pffft/bench/CMakeLists.txt Normal file
View File

@@ -0,0 +1,224 @@
# Driver project for the PFFFT benchmarks: declares the benchmark options and
# the directory layout used by the custom targets further below.
#
# The <min>...<max> form keeps 2.8 as the minimum version while requesting
# policies up to 3.5. A plain "VERSION 2.8" is rejected by CMake >= 4.0, and
# leaves CMP0054 OLD, under which the quoted "MSVC" in the compiler-id
# comparison below may be dereferenced as a variable instead of being compared
# as a string. CMake versions older than 3.12 ignore the "...3.5" part.
cmake_minimum_required(VERSION 2.8...3.5)
project(BENCH_PFFFT)

set(BENCH_ID "default" CACHE STRING "ID: use single word without spaces. gets part of result filename")
option(BENCH_FAST_MATH "Build with fast math - non IEEE compliant" ON)

# Default target architecture, depending on the C compiler in use.
if (CMAKE_C_COMPILER_ID STREQUAL "GNU")
  set(BENCH_ARCH "native" CACHE STRING "target architecture (-march): native/SSE:core2/AVX:sandybridge/ARM-NEON:armv7-a")
elseif (CMAKE_C_COMPILER_ID STREQUAL "Clang")
  set(BENCH_ARCH "native" CACHE STRING "target architecture (-march): native/SSE:core2/AVX:sandybridge")
elseif (CMAKE_C_COMPILER_ID STREQUAL "MSVC")  # others: "Intel"
  set(BENCH_ARCH "AVX" CACHE STRING "target architecture (/arch): SSE2/AVX")
else()
  set(BENCH_ARCH "" CACHE STRING "target architecture - use full compiler option!")
endif()

# architecture/optimization options
option(PFFFT_USE_SIMD "use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? - " ON)
option(DISABLE_SIMD_AVX "disable AVX CPU features? - " OFF)
option(PFFFT_USE_SIMD_NEON "force using NEON on ARM? (requires PFFFT_USE_SIMD)" OFF)
option(PFFFT_USE_SCALAR_VECT "use 4-element vector scalar operations (if no other SIMD)" ON)

# reference FFT implementations to include in the benchmark
option(PFFFT_USE_BENCH_FFTW "use (system-installed) FFTW3 in fft benchmark?" OFF)
option(PFFFT_USE_BENCH_GREEN "use Green FFT in fft benchmark? - if exists in subdir" ON)
option(PFFFT_USE_BENCH_KISS "use KissFFT in fft benchmark? - if exists in subdir" ON)
option(PFFFT_USE_BENCH_POCKET "use PocketFFT in fft benchmark? - if exists in subdir" ON)
option(PFFFT_USE_BENCH_MKL "use Intel MKL in fft benchmark? needs to be installed" OFF)

# Coarse OS name written to info.txt. The two checks are kept separate so
# that "Unix" wins if both WIN32 and UNIX are set.
set(OSSTR "")
if (WIN32)
  set(OSSTR "Win32")
endif()
if (UNIX)
  set(OSSTR "Unix")
endif()

# Extra arguments for the nested cmake runs and the sub-directory in which
# the built executables are looked up; all empty unless MSVC is used.
set(BUILD_DIR_TO_EXE "")
set(CMAKE_PLATFORM_OPT "")
set(CMAKE_MAKE_OPT "")
if (MSVC)
  set(BUILD_DIR_TO_EXE "Release/")
  set(CMAKE_PLATFORM_OPT "-A \"${CMAKE_GENERATOR_PLATFORM}\"")
  set(CMAKE_MAKE_OPT "-DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}")
endif()

# Result directories (bench_*) and nested build directories (build_*).
set(benchdir "${CMAKE_BINARY_DIR}/bench_${BENCH_ID}")
set(benchdir_flt "${CMAKE_BINARY_DIR}/bench_${BENCH_ID}/float")
set(benchdir_dbl "${CMAKE_BINARY_DIR}/bench_${BENCH_ID}/double")
set(builddir_flt "${CMAKE_BINARY_DIR}/build_${BENCH_ID}_float")
set(builddir_dbl "${CMAKE_BINARY_DIR}/build_${BENCH_ID}_double")
# Generates "${benchdir}/info.txt": creates the result directory and writes
# one line per item describing the benchmark ID, the CMake version, the host
# system, the compilers and the configured benchmark options.
# The first echo uses ">" and all later ones use ">>"; these are presumably
# interpreted as output redirections by the shell that executes the commands
# (NOTE(review): confirm for non-shell generators).
add_custom_command(OUTPUT "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E make_directory "${benchdir}"
COMMAND ${CMAKE_COMMAND} -E echo "benchmark ${BENCH_ID}" > "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "CMake major: ${CMAKE_MAJOR_VERSION}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "CMake minor: ${CMAKE_MINOR_VERSION}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "OS: ${OSSTR}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "System: ${CMAKE_SYSTEM_NAME}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "System CPU: ${CMAKE_SYSTEM_PROCESSOR}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "System Version: ${CMAKE_HOST_SYSTEM_VERSION}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "C Compiler: ${CMAKE_C_COMPILER_ID}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "C Version: ${CMAKE_C_COMPILER_VERSION}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "C++ Compiler: ${CMAKE_CXX_COMPILER_ID}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "C++ Version: ${CMAKE_CXX_COMPILER_VERSION}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "MSVC Version: ${MSVC_VERSION}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "MSVC Toolset: ${MSVC_TOOLSET_VERSION}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "Exe Suffix: ${CMAKE_EXECUTABLE_SUFFIX}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "C Byte Order: ${CMAKE_C_BYTE_ORDER}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "C++ Byte Order: ${CMAKE_CXX_BYTE_ORDER}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "" >> "${benchdir}/info.txt"
# benchmark configuration as selected through the cache options of this project
COMMAND ${CMAKE_COMMAND} -E echo "Architecture: ${BENCH_ARCH}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "Fast math: ${BENCH_FAST_MATH}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_SIMD=${PFFFT_USE_SIMD}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "config DISABLE_SIMD_AVX=${DISABLE_SIMD_AVX}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_SIMD_NEON=${PFFFT_USE_SIMD_NEON}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_SCALAR_VECT=${PFFFT_USE_SCALAR_VECT}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_FFTW=${PFFFT_USE_BENCH_FFTW}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_GREEN=${PFFFT_USE_BENCH_GREEN}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_KISS=${PFFFT_USE_BENCH_KISS}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_POCKET=${PFFFT_USE_BENCH_POCKET}" >> "${benchdir}/info.txt"
COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_MKL=${PFFFT_USE_BENCH_MKL}" >> "${benchdir}/info.txt"
)
# Produces "${benchdir}/unix_info.txt" once info.txt exists. On non-UNIX
# systems the file is only touched; on UNIX systems unix_info.sh is run in
# addition, with the result directory as working directory.
if (NOT UNIX)
  add_custom_command(
    OUTPUT "${benchdir}/unix_info.txt"
    DEPENDS "${benchdir}/info.txt"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
    COMMAND ${CMAKE_COMMAND} -E touch "${benchdir}/unix_info.txt"
  )
else()
  add_custom_command(
    OUTPUT "${benchdir}/unix_info.txt"
    DEPENDS "${benchdir}/info.txt"
    WORKING_DIRECTORY ${benchdir}
    COMMAND ${CMAKE_COMMAND} -E touch "${benchdir}/unix_info.txt"
    COMMAND bash -c "${CMAKE_CURRENT_SOURCE_DIR}/unix_info.sh"
  )
endif()
# For each nested build directory and each result directory: create the
# directory and touch a "directory.txt" marker file inside it. The marker is
# the OUTPUT that the build_* and bench_* targets depend on.
foreach (marker_dir "${builddir_flt}" "${builddir_dbl}" "${benchdir_flt}" "${benchdir_dbl}")
  add_custom_command(
    OUTPUT "${marker_dir}/directory.txt"
    COMMAND ${CMAKE_COMMAND} -E make_directory "${marker_dir}"
    COMMAND ${CMAKE_COMMAND} -E touch "${marker_dir}/directory.txt"
  )
endforeach()
# Target build_float: runs a nested cmake configure of the parent source
# directory inside ${builddir_flt} with float enabled and double disabled,
# forwarding the benchmark options of this project, then builds it in the
# Release configuration.
add_custom_target(build_float
  DEPENDS "${builddir_flt}/directory.txt"
  WORKING_DIRECTORY "${builddir_flt}"
  COMMAND ${CMAKE_COMMAND} -E echo "start cmake for float in ${builddir_flt}"
  COMMAND ${CMAKE_COMMAND}
    -G "${CMAKE_GENERATOR}"
    ${CMAKE_PLATFORM_OPT}
    "${CMAKE_MAKE_OPT}"
    -DCMAKE_BUILD_TYPE=Release
    "-DARCH=${BENCH_ARCH}"
    -DUSE_FAST_MATH=${BENCH_FAST_MATH}
    -DPFFFT_USE_TYPE_FLOAT=ON
    -DPFFFT_USE_TYPE_DOUBLE=OFF
    -DUSE_FLOAT_PREC=ON
    -DPFFFT_USE_SIMD=${PFFFT_USE_SIMD}
    -DDISABLE_SIMD_AVX=${DISABLE_SIMD_AVX}
    -DPFFFT_USE_SIMD_NEON=${PFFFT_USE_SIMD_NEON}
    -DPFFFT_USE_SCALAR_VECT=${PFFFT_USE_SCALAR_VECT}
    -DPFFFT_USE_BENCH_FFTW=${PFFFT_USE_BENCH_FFTW}
    -DPFFFT_USE_BENCH_GREEN=${PFFFT_USE_BENCH_GREEN}
    -DPFFFT_USE_BENCH_KISS=${PFFFT_USE_BENCH_KISS}
    -DPFFFT_USE_BENCH_POCKET=${PFFFT_USE_BENCH_POCKET}
    -DPFFFT_USE_BENCH_MKL=${PFFFT_USE_BENCH_MKL}
    "${CMAKE_SOURCE_DIR}/.."
  # COMMAND ${CMAKE_COMMAND} -E echo "start cmake --build . for float in ${builddir_flt}"
  COMMAND ${CMAKE_COMMAND} --build . --config Release
)
# Target build_double: runs a nested cmake configure of the parent source
# directory inside ${builddir_dbl} with double enabled and float disabled,
# forwarding the benchmark options of this project, then builds it in the
# Release configuration.
add_custom_target(build_double
  DEPENDS "${builddir_dbl}/directory.txt"
  WORKING_DIRECTORY "${builddir_dbl}"
  COMMAND ${CMAKE_COMMAND} -E echo "start cmake for double in ${builddir_dbl}"
  COMMAND ${CMAKE_COMMAND}
    -G "${CMAKE_GENERATOR}"
    ${CMAKE_PLATFORM_OPT}
    "${CMAKE_MAKE_OPT}"
    -DCMAKE_BUILD_TYPE=Release
    "-DARCH=${BENCH_ARCH}"
    -DUSE_FAST_MATH=${BENCH_FAST_MATH}
    -DPFFFT_USE_TYPE_FLOAT=OFF
    -DPFFFT_USE_TYPE_DOUBLE=ON
    -DUSE_FLOAT_PREC=OFF
    -DPFFFT_USE_SIMD=${PFFFT_USE_SIMD}
    -DDISABLE_SIMD_AVX=${DISABLE_SIMD_AVX}
    -DPFFFT_USE_SIMD_NEON=${PFFFT_USE_SIMD_NEON}
    -DPFFFT_USE_SCALAR_VECT=${PFFFT_USE_SCALAR_VECT}
    -DPFFFT_USE_BENCH_FFTW=${PFFFT_USE_BENCH_FFTW}
    -DPFFFT_USE_BENCH_GREEN=${PFFFT_USE_BENCH_GREEN}
    -DPFFFT_USE_BENCH_KISS=${PFFFT_USE_BENCH_KISS}
    -DPFFFT_USE_BENCH_POCKET=${PFFFT_USE_BENCH_POCKET}
    -DPFFFT_USE_BENCH_MKL=${PFFFT_USE_BENCH_MKL}
    "${CMAKE_SOURCE_DIR}/.."
  COMMAND ${CMAKE_COMMAND} -E echo "start cmake --build . for double in ${builddir_dbl}"
  COMMAND ${CMAKE_COMMAND} --build . --config Release
)
# Targets bench_float / bench_double: run the benchmark executable produced
# by the matching build_* target, using the matching result directory as
# working directory.
add_custom_target(bench_float
  DEPENDS "${benchdir_flt}/directory.txt" build_float
  WORKING_DIRECTORY "${benchdir_flt}"
  COMMAND ${CMAKE_COMMAND} -E echo "start benchmark for float"
  COMMAND "${builddir_flt}/${BUILD_DIR_TO_EXE}bench_pffft_float${CMAKE_EXECUTABLE_SUFFIX}"
)

add_custom_target(bench_double
  DEPENDS "${benchdir_dbl}/directory.txt" build_double
  WORKING_DIRECTORY "${benchdir_dbl}"
  COMMAND ${CMAKE_COMMAND} -E echo "start benchmark for double"
  COMMAND "${builddir_dbl}/${BUILD_DIR_TO_EXE}bench_pffft_double${CMAKE_EXECUTABLE_SUFFIX}"
)
# Packaging targets: each one archives the result directory into
# bench_${BENCH_ID}.tgz inside the top-level binary directory and prints a
# hint to mail that file. They differ only in which benchmarks they depend on.

# Default target (ALL): float and double benchmarks.
add_custom_target(bench ALL
  # DEPENDS "${benchdir}/info.txt" "${benchdir}/unix_info.txt"
  DEPENDS "${benchdir}/info.txt" bench_float bench_double "${benchdir}/unix_info.txt"
  WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E tar cvz "bench_${BENCH_ID}.tgz" ${benchdir}
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E echo "now mail result file bench_${BENCH_ID}.tgz"
)

# Float benchmark only.
add_custom_target(bench_float_tar
  DEPENDS "${benchdir}/info.txt" bench_float "${benchdir}/unix_info.txt"
  WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E tar cvz "bench_${BENCH_ID}.tgz" ${benchdir}
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E echo "now mail result file bench_${BENCH_ID}.tgz"
)

# Double benchmark only.
add_custom_target(bench_double_tar
  DEPENDS "${benchdir}/info.txt" bench_double "${benchdir}/unix_info.txt"
  WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E tar cvz "bench_${BENCH_ID}.tgz" ${benchdir}
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E echo "now mail result file bench_${BENCH_ID}.tgz"
)
# Target clean_results: removes the two nested build directories
# (${builddir_flt} and ${builddir_dbl}); the result directory is not touched.
add_custom_target(clean_results
  WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
  COMMAND ${CMAKE_COMMAND} -E remove_directory "${builddir_flt}"
  COMMAND ${CMAKE_COMMAND} -E remove_directory "${builddir_dbl}"
)

9
pffft/bench/unix_info.sh Executable file
View File

@@ -0,0 +1,9 @@
#!/bin/bash
# Collects CPU and distribution information into text files in the
# current working directory.
lscpu > unix_lscpu.txt
cat /proc/cpuinfo > unix_cpuinfo.txt
lsb_release -a > unix_lsb_release.txt
# Copy every /etc/*-release file. Iterating over the glob avoids parsing the
# output of ls (and its error message when nothing matches).
for release_file in /etc/*-release; do
    # an unmatched glob is left as the literal pattern: check that it exists
    if [ -e "$release_file" ]; then
        cp "$release_file" ./
    fi
done

345
pffft/bench_conv.cpp Normal file
View File

@@ -0,0 +1,345 @@
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <algorithm>
#include <random>
#include <cstdint>
#include <complex>
#include "papi_perf_counter.h"
//#if defined(HAVE_MIPP) && !defined(NO_MIPP)
#if defined(HAVE_MIPP)
#include <mipp.h>
#define MIPP_VECTOR mipp::vector
#else
#define MIPP_VECTOR std::vector
#endif
#include "pf_conv_dispatcher.h"
#include "pf_conv.h"
#define TEST_WITH_MIN_LEN 0
/*
 * Builds a test vector: pseudo random samples followed by zero padding.
 *
 *   M           number of leading pseudo random samples
 *   N           total vector length; a negative value means "use M"
 *   seed_value  seed for the std::mt19937 generator
 *
 * The random samples are scaled into the range [-1, 1].
 * Returns the filled vector by value.
 */
MIPP_VECTOR<float> generate_rng_vec(int M, int N = -1, int seed_value = 1)
{
    const int total = (N < 0) ? M : N;
    MIPP_VECTOR<float> v(total);
    // never write past the end when fewer total than random samples are requested
    const int n_random = std::min(M, total);
    std::mt19937 g;
    g.seed(seed_value);
    // Use the exact 32 bit type: int_fast32_t may be wider than 32 bits
    // (it is 64 bits on common 64-bit Linux ABIs), which made the scale
    // factor platform dependent and squeezed all samples close to zero.
    constexpr float scale = 1.0F / (1.0F + float(INT32_MAX));
    for (int k = 0; k < n_random; ++k)
        v[k] = float(std::int32_t(g())) * scale;
    for (int k = n_random; k < total; ++k)
        v[k] = 0.0F;
    return v;
}
/*
 * Benchmarks the out-of-place float convolution kernel directly on the
 * signal, without any intermediate buffer handling.
 *
 * For every complete block of blockLen samples that fits into sz_signal,
 * state.size grows by blockLen and the kernel is called with the start of
 * signal and the start of y.
 *
 * Returns the sum of the output counts reported by the kernel.
 * NOTE(review): perf_counter presumably measures from its construction to
 * the end of this function - confirm in papi_perf_counter.h.
 */
int bench_oop_core(
    const conv_f_ptrs & conv_arch,
    const float * signal, const int sz_signal,
    const float * filter, const int sz_filter,
    const int blockLen,
    float * y
    )
{
    const auto convolve = conv_arch.fp_conv_float_oop;
    conv_buffer_state state;
    state.offset = 0;
    state.size = 0;
    int total_out = 0;

    papi_perf_counter perf_counter(1);
    int pos = 0;
    while (pos + blockLen <= sz_signal)
    {
        state.size += blockLen;
        total_out += convolve(signal, &state, filter, sz_filter, y);
        pos += blockLen;
    }
    return total_out;
}
/*
 * Benchmarks the in-place float convolution kernel directly on the signal.
 *
 * For every complete block of blockLen samples that fits into sz_signal,
 * state.size grows by blockLen and the kernel is called on the start of
 * signal, which it may overwrite.
 *
 * Returns the sum of the output counts reported by the kernel.
 */
int bench_inplace_core(
    const conv_f_ptrs & conv_arch,
    float * signal, const int sz_signal,
    const float * filter, const int sz_filter,
    const int blockLen
    )
{
    const auto convolve = conv_arch.fp_conv_float_inplace;
    conv_buffer_state state;
    state.offset = 0;
    state.size = 0;
    int total_out = 0;

    papi_perf_counter perf_counter(1);
    int pos = 0;
    while (pos + blockLen <= sz_signal)
    {
        state.size += blockLen;
        total_out += convolve(signal, &state, filter, sz_filter);
        pos += blockLen;
    }
    return total_out;
}
/*
 * Benchmarks the out-of-place float convolution in a streaming setup.
 *
 * Per complete block of blockLen samples:
 *   1. move_rest() is called on the work buffer and its state,
 *   2. the next block of signal is appended at buffer[state.size],
 *   3. the kernel convolves the buffer and writes behind the outputs
 *      produced so far (y + total_out).
 *
 * Returns the total number of output samples reported by the kernel.
 */
int bench_oop(
    const conv_f_ptrs & conv_arch,
    float * buffer,
    const float * signal, const int sz_signal,
    const float * filter, const int sz_filter,
    const int blockLen,
    float * y
    )
{
    const auto convolve = conv_arch.fp_conv_float_oop;
    const auto keep_tail = conv_arch.fp_conv_float_move_rest;
    conv_buffer_state state;
    state.offset = 0;
    state.size = 0;
    int total_out = 0;

    papi_perf_counter perf_counter(1);
    int pos = 0;
    while (pos + blockLen <= sz_signal)
    {
        keep_tail(buffer, &state);
        // append the next input block behind the samples still in the buffer
        const float * block_begin = signal + pos;
        std::copy(block_begin, block_begin + blockLen, buffer + state.size);
        state.size += blockLen;
        total_out += convolve(buffer, &state, filter, sz_filter, y + total_out);
        pos += blockLen;
    }
    return total_out;
}
int bench_cx_real_oop(
const conv_f_ptrs & conv_arch,
complexf * buffer,
const float * signal_re, const int sz_signal_re,
const float * filter, const int sz_filter,
const int blockLen,
float * y_re
)
{
conv_buffer_state state;
const auto conv_oop = conv_arch.fp_conv_cplx_float_oop;
const auto move_rest = conv_arch.fp_conv_cplx_move_rest;
// interpret buffer, signal and output vector y as complex data
complexf * y = reinterpret_cast<complexf *>(y_re);
const complexf * signal = reinterpret_cast<const complexf *>(signal_re);
const int sz_signal = sz_signal_re / 2;
int n_out_sum = 0;
state.offset = 0;
state.size = 0;
papi_perf_counter perf_counter(1);
for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
{
move_rest(buffer, &state);
//memcpy(buffer+state.size, &s[off], B * sizeof(s[0]));
std::copy(&signal[off], &signal[off+blockLen], &buffer[state.size]);
state.size += blockLen;
int n_out = conv_oop(buffer, &state, filter, sz_filter, &y[n_out_sum]);
n_out_sum += n_out;
}
return n_out_sum;
}
/*
 * Command line driver of the convolution benchmark.
 *
 * Parses the options, lists the available architectures, generates a random
 * signal and filter and then runs the out-of-place core, the buffered
 * out-of-place, the complex/real out-of-place and the in-place core benchmark
 * for the selected architecture. One guard element behind each buffer is
 * checked with assert() to detect writes past the end.
 *
 * Returns 1 when the selected architecture is not available, otherwise 0
 * (also after printing the usage for invalid arguments).
 */
int main(int argc, char *argv[])
{
    // cli defaults:
    // process 64 MSamples of float data (256 MByte per vector) in blocks of 1 kSamples (= 4 kByte) with filterLen 128
    int arch = 0, N = 64 * 1024 * 1024;
    int filterLen = 128, blockLen = 1024;
    int seed_sig = 1, seed_filter = 2;
    bool verbose = false, exitFromUsage = false, showUsage = (argc <= 1);

    for (int i = 1; i < argc; ++i)
    {
        if (i+1 < argc && !strcmp(argv[i], "-a"))
            arch = atoi(argv[++i]);
        else if (i+1 < argc && !strcmp(argv[i], "-n"))
            N = atoi(argv[++i]) * 1024 * 1024;
        else if (i+1 < argc && !strcmp(argv[i], "-f"))
            filterLen = atoi(argv[++i]);
        else if (i+1 < argc && !strcmp(argv[i], "-b"))
            blockLen = atoi(argv[++i]);
        else if (i+1 < argc && !strcmp(argv[i], "-ss"))
            seed_sig = atoi(argv[++i]);
        else if (i+1 < argc && !strcmp(argv[i], "-sf"))
            seed_filter = atoi(argv[++i]);
        else if (!strcmp(argv[i], "-v"))
            verbose = true;
        else if (!strcmp(argv[i], "-h"))
            showUsage = exitFromUsage = true;
        else
            fprintf(stderr, "warning: ignoring/skipping unknown option '%s'\n", argv[i]);
    }

    int num_arch = 0;
    const ptr_to_conv_f_ptrs * conv_arch_ptrs = get_all_conv_arch_ptrs(&num_arch);

    if (verbose)
    {
        fprintf(stderr, "num_arch is %d\n", num_arch);
        for (int a = 0; a < num_arch; ++a)
        {
            if (conv_arch_ptrs[a])
                fprintf(stderr, " arch %d is '%s'\n", a, conv_arch_ptrs[a]->id );
            else
                fprintf(stderr, " arch %d is nullptr !!!\n", a );
        }
        fprintf(stderr, "\n");
    }

    // Zero AND negative lengths are rejected: a negative block length would
    // never terminate the block loops, negative sizes break the buffer setup.
    const bool invalidArgs = ( arch < 0 || arch >= num_arch
        || blockLen <= 0 || N <= 0 || filterLen <= 0 );
    if ( invalidArgs || showUsage )
    {
        fprintf(stderr, "%s [-v] [-a <arch>] [-n <total # of MSamples>] [-f <filter length>] [-b <blockLength in samples>]\n", argv[0]);
        fprintf(stderr, " [-ss <random seed for signal>] [-sf <random seed for filter coeffs>]\n");
        fprintf(stderr, "arch is one of:");
        for (int a = 0; a < num_arch; ++a)
            if (conv_arch_ptrs[a])
                fprintf(stderr, " %d for '%s'%s", a, conv_arch_ptrs[a]->id, (a < num_arch-1 ? ",":"") );
        fprintf(stderr, "\n");
        if ( exitFromUsage || invalidArgs )
            return 0;
    }

    if (verbose)
    {
#ifdef HAVE_PAPI
        fprintf(stderr, "PAPI is available\n");
#else
        fprintf(stderr, "PAPI is NOT available!\n");
#endif
    }
#if !defined(HAVE_MIPP)
    fprintf(stderr, "MIPP is NOT available!\n");
#endif

    // determine the widest float SIMD size over all available architectures
    int max_simd_size = -1;
    for (int a = 0; a < num_arch; ++a)
    {
        if (conv_arch_ptrs[a])
        {
            const int sz = conv_arch_ptrs[a]->fp_conv_float_simd_size();
            if (max_simd_size < sz)
                max_simd_size = sz;
            if (verbose)
                fprintf(stderr, "float simd size for '%s': %d\n", conv_arch_ptrs[a]->id, sz);
        }
    }
    if (verbose)
        fprintf(stderr, "max float simd size: %d\n", max_simd_size);

#if TEST_WITH_MIN_LEN
    filterLen = 2;
#endif
    // round up filter length to a multiple of the widest SIMD size;
    // skipped when no usable size was reported (would divide by zero)
    if (max_simd_size > 0)
        filterLen = max_simd_size * ( ( filterLen + max_simd_size -1 ) / max_simd_size );
#if TEST_WITH_MIN_LEN
    blockLen = 1;
    N = 2 * (3 + filterLen); // produce 3+1 samples
#endif

    if (!conv_arch_ptrs[arch])
    {
        fprintf(stderr, "Error: architecture %d is NOT available!\n", arch);
        return 1;
    }
    const conv_f_ptrs & conv_arch = *conv_arch_ptrs[arch];
    if (verbose)
        fprintf(stderr, "arch is using mipp: %d\n", conv_arch.using_mipp);

    fprintf(stderr, "processing N = %d MSamples with block length of %d samples with filter length %d taps on '%s'\n",
        N / (1024 * 1024), blockLen, filterLen, conv_arch.id );

    // every buffer gets one extra guard element at its end
    MIPP_VECTOR<float> s = generate_rng_vec(N + 1, N + 1, seed_sig);
    MIPP_VECTOR<float> y(N + 1, 0.0F);
    MIPP_VECTOR<float> filter = generate_rng_vec(filterLen, filterLen, seed_filter);
    MIPP_VECTOR<float> buffer(blockLen + filterLen + 1, 0.0F);
    MIPP_VECTOR<complexf> buffer_cx(blockLen + filterLen + 1);

#if 1 && TEST_WITH_MIN_LEN
    for (int k = 0; k < N; ++k)
        s[k] = (k+1);
    for (int k = 0; k < filterLen; ++k)
        filter[k] = (k+1);
#endif

    // guard values: checked with assert() after the benchmark runs
    s[N] = 123.0F;
    y[N] = 321.0F;
    buffer[blockLen + filterLen] = 789.0F;
    buffer_cx[blockLen + filterLen].i = 987.0F;

    fprintf(stderr, "\nrunning out-of-place convolution core for '%s':\n", conv_arch.id);
    int n_oop_out = bench_oop_core(conv_arch, s.data(), N, filter.data(), filterLen, blockLen, y.data());
    fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
#if TEST_WITH_MIN_LEN
    for (int k = 0; k < n_oop_out; ++k )
        fprintf(stderr, "y[%2d] = %g\n", k, y[k]);
    fprintf(stderr, "\n");
#endif

    fprintf(stderr, "\nrunning out-of-place convolution for '%s':\n", conv_arch.id);
    n_oop_out = bench_oop(conv_arch, buffer.data(), s.data(), N, filter.data(), filterLen, blockLen, y.data());
    fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
    assert(s[N] == 123.0F);
    assert(y[N] == 321.0F);
    assert(buffer[blockLen + filterLen] == 789.0F);
    assert(buffer_cx[blockLen + filterLen].i == 987.0F);
#if TEST_WITH_MIN_LEN
    for (int k = 0; k < n_oop_out; ++k )
        fprintf(stderr, "y[%2d] = %g\n", k, y[k]);
    fprintf(stderr, "\n");
#endif

    fprintf(stderr, "\nrunning out-of-place complex/real convolution for '%s':\n", conv_arch.id);
    n_oop_out = bench_cx_real_oop(conv_arch, buffer_cx.data(), s.data(), N, filter.data(), filterLen, blockLen, y.data());
    fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
    assert(s[N] == 123.0F);
    assert(y[N] == 321.0F);
    assert(buffer[blockLen + filterLen] == 789.0F);
    assert(buffer_cx[blockLen + filterLen].i == 987.0F);
#if TEST_WITH_MIN_LEN
    fprintf(stderr, "complex output (%d complex samples):\n", n_oop_out);
    for (int k = 0; k < n_oop_out; ++k )
        fprintf(stderr, "y[%2d] = %g %+g * i\n", k, y[2*k], y[2*k+1]);
    fprintf(stderr, "\n");
    const std::complex<float> * sc = reinterpret_cast< std::complex<float>* >( s.data() );
    const int Nc = N /2;
    fprintf(stderr, "reference with std::complex<float>:\n");
    for (int off = 0; off +filterLen <= Nc; ++off )
    {
        std::complex<float> sum(0.0F, 0.0F);
        for (int k=0; k < filterLen; ++k)
            sum += sc[off+k] * filter[k];
        fprintf(stderr, "yv[%2d] = %g %+g * i\n", off, sum.real(), sum.imag() );
    }
#endif

    fprintf(stderr, "\nrunning inplace convolution core for '%s':\n", conv_arch.id);
    int n_inp_out = bench_inplace_core(conv_arch, s.data(), N, filter.data(), filterLen, blockLen);
    fprintf(stderr, "inp produced %d output samples\n", n_inp_out);
    assert(s[N] == 123.0F);
    assert(y[N] == 321.0F);
    assert(buffer[blockLen + filterLen] == 789.0F);
    assert(buffer_cx[blockLen + filterLen].i == 987.0F);
#if TEST_WITH_MIN_LEN
    for (int k = 0; k < n_inp_out; ++k )
        fprintf(stderr, "y[%2d] = %g\n", k, s[k]);
    fprintf(stderr, "\n");
#endif

    fprintf(stderr, "\n");
    return 0;
}

889
pffft/bench_mixers.cpp Normal file
View File

@@ -0,0 +1,889 @@
/*
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
bench for mixer algorithm/implementations
*/
#include <pf_mixer.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <assert.h>
#include <string.h>
#include "papi_perf_counter.h"
#if defined(__linux__)
#define HAVE_SYS_TIMES
#endif
#ifdef HAVE_SYS_TIMES
# include <sys/times.h>
# include <unistd.h>
#endif
#ifdef WIN32
#define WIN32_LEAN_AND_MEAN
#define VC_EXTRALEAN
#include <windows.h>
#endif
#define BENCH_REF_TRIG_FUNC 1
#define BENCH_OUT_OF_PLACE_ALGOS 0
#define BENCH_INPLACE_ALGOS 1
#define SAVE_BY_DEFAULT 0
#define SAVE_LIMIT_MSPS 16
#if 0
#define BENCH_FILE_SHIFT_MATH_CC "/home/ayguen/WindowsDesktop/mixer_test/A_shift_math_cc.bin"
#define BENCH_FILE_ADD_FAST_CC "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_cc.bin"
#define BENCH_FILE_ADD_FAST_INP_C "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_inp_c.bin"
#define BENCH_FILE_UNROLL_INP_C "/home/ayguen/WindowsDesktop/mixer_test/D_shift_unroll_inp_c.bin"
#define BENCH_FILE_LTD_UNROLL_INP_C "/home/ayguen/WindowsDesktop/mixer_test/E_shift_limited_unroll_inp_c.bin"
#define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/F_shift_limited_unroll_A_sse_inp_c.bin"
#define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/G_shift_limited_unroll_B_sse_inp_c.bin"
#define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/H_shift_limited_unroll_C_sse_inp_c.bin"
#define BENCH_FILE_REC_OSC_CC ""
#define BENCH_FILE_REC_OSC_INP_C "/home/ayguen/WindowsDesktop/mixer_test/I_shift_recursive_osc_inp_c.bin"
#define BENCH_FILE_REC_OSC_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/J_shift_recursive_osc_sse_inp_c.bin"
#else
#define BENCH_FILE_SHIFT_MATH_CC ""
#define BENCH_FILE_ADD_FAST_CC ""
#define BENCH_FILE_ADD_FAST_INP_C ""
#define BENCH_FILE_UNROLL_INP_C ""
#define BENCH_FILE_LTD_UNROLL_INP_C ""
#define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C ""
#define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C ""
#define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C ""
#define BENCH_FILE_REC_OSC_CC ""
#define BENCH_FILE_REC_OSC_INP_C ""
#define BENCH_FILE_REC_OSC_SSE_INP_C ""
#endif
#if defined(HAVE_SYS_TIMES)
/* clock ticks per second from sysconf(); 0 means "not queried yet" */
static double ttclk = 0.;

/*
 * Returns the user CPU time of this process in seconds, based on times().
 * With find_start != 0 the call spins until the user time tick counter
 * changes, so that a measurement starts right at a tick boundary.
 */
static double uclock_sec(int find_start)
{
    struct tms now;
    if (ttclk == 0.)
    {
        ttclk = sysconf(_SC_CLK_TCK);
        fprintf(stderr, "sysconf(_SC_CLK_TCK) => %f\n", ttclk);
    }
    times(&now);
    if (find_start)
    {
        const struct tms first = now;
        do {
            times(&now);
        } while (first.tms_utime == now.tms_utime);
    }
    /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
    return ((double)now.tms_utime) / ttclk;
}
#elif defined(WIN32)
// https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getprocesstimes
/*
 * Returns the total user time of this process in seconds, or 0 when
 * GetProcessTimes() fails. find_start is ignored by this variant.
 */
double uclock_sec(int find_start)
{
    FILETIME t_creation, t_exit, t_kernel, t_user;
    if (GetProcessTimes(GetCurrentProcess(), &t_creation, &t_exit, &t_kernel, &t_user) == 0)
    {
        // Handle error
        return 0;
    }
    // Only the user time is used.
    // Can be tweaked to include kernel times as well.
    return
        (double)(t_user.dwLowDateTime |
        ((unsigned long long)t_user.dwHighDateTime << 32)) * 0.0000001;
}
#else
/* Fallback based on clock(); find_start is ignored by this variant. */
double uclock_sec(int find_start)
{
    return (double)clock()/(double)CLOCKS_PER_SEC;
}
#endif
/*
 * Writes benchmark results to a binary file for later inspection.
 *
 *   d   samples to write
 *   B   block length: the data is written in chunks of B samples
 *   N   number of valid samples in d; at most SAVE_LIMIT_MSPS Mi samples
 *       are written
 *   fn  output file name; NULL or "" selects /dev/shm/bench.bin when
 *       SAVE_BY_DEFAULT is set, otherwise nothing is written
 *
 * Errors are reported on stderr; write errors stop the output early.
 */
void save(complexf * d, int B, int N, const char * fn)
{
    if (!fn || !fn[0])
    {
        if (! SAVE_BY_DEFAULT)
            return;
        fn = "/dev/shm/bench.bin";
    }
    FILE * f = fopen(fn, "wb");
    if (!f) {
        fprintf(stderr, "error writing result to %s\n", fn);
        return;
    }
    if ( N >= SAVE_LIMIT_MSPS * 1024 * 1024 )
        N = SAVE_LIMIT_MSPS * 1024 * 1024;
    bool ok = true;
    // B > 0 is required for the loop to advance and terminate
    for (int off = 0; B > 0 && off + B <= N; off += B)
    {
        if (fwrite(d+off, sizeof(complexf), B, f) != (size_t)B)
        {
            ok = false;
            break;
        }
    }
    // buffered data is flushed on close: a failure there is a write error, too
    if (fclose(f) != 0)
        ok = false;
    if (!ok)
        fprintf(stderr, "error writing result to %s\n", fn);
}
/*
 * Timed core loop for shift_math_cc() (out-of-place).
 *
 *   B            block length in samples
 *   N            number of samples available in input and output
 *   ignore_time  true: run through the whole buffer; false: stop after
 *                about 500 ms of user time
 *   iters_out    receives the number of processed blocks
 *   off_out      receives the number of processed samples
 *
 * Returns the elapsed time in seconds as measured with uclock_sec().
 * When B is not positive or exceeds N, nothing is processed and 0 is
 * returned with both counters set to 0.
 */
double bench_core_shift_math_cc(
    const int B, const int N, const bool ignore_time,
    const complexf *input,
    complexf *output,
    int &iters_out, int &off_out
    )
{
    // the loop processes one block before testing any bound:
    // reject block lengths that would run past the N sample buffers
    if (B <= 0 || B > N)
    {
        iters_out = 0;
        off_out = 0;
        return 0.0;
    }
    const double t0 = uclock_sec(1);
    const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
    double t1;
    float phase = 0.0F;
    int off = 0, iter = 0;
    papi_perf_counter perf_counter(1);
    do {
        // work
        phase = shift_math_cc(input+off, output+off, B, -0.0009F, phase);
        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( off + B < N && (ignore_time || t1 < tstop) );
    iters_out = iter;
    off_out = off;
    return t1 - t0;
}
/*
 * Benchmarks shift_math_cc() (out-of-place): allocates N input and N output
 * samples, fills the input with gen_recursive_osc_c(), runs the timed core
 * loop and optionally saves the output.
 *
 * Returns the processed samples per second, or 0 when the buffers could
 * not be allocated.
 */
double bench_shift_math_cc(const int B, const int N, const bool ignore_time) {
    int iter, off;
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    complexf *output = (complexf *)malloc(N * sizeof(complexf));
    // the default size is 512 MByte per buffer: allocation may well fail
    if (!input || !output)
    {
        fprintf(stderr, "error: cannot allocate buffers for %d samples\n", N);
        free(input);
        free(output);
        return 0.0;
    }
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    double T = bench_core_shift_math_cc(B, N, ignore_time, input, output, iter, off);
    save(output, B, off, BENCH_FILE_SHIFT_MATH_CC);
    free(input);
    free(output);
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
    return (nI / T); /* normalized iterations per second */
}
/*
 * Benchmarks shift_table_cc() (out-of-place) with a table of 65536 entries
 * for up to about 500 ms of user time.
 *
 * Returns the processed samples per second, or 0 when the block length
 * does not fit into N samples or the buffers could not be allocated.
 */
double bench_shift_table_cc(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    int table_size=65536;
    float phase = 0.0F;
    // the loop processes one block before testing any bound
    if (B <= 0 || B > N)
    {
        fprintf(stderr, "error: block length %d does not fit into %d samples\n", B, N);
        return 0.0;
    }
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    complexf *output = (complexf *)malloc(N * sizeof(complexf));
    if (!input || !output)
    {
        fprintf(stderr, "error: cannot allocate buffers for %d samples\n", N);
        free(input);
        free(output);
        return 0.0;
    }
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    // NOTE(review): table_data is never released in this function - check
    // whether pf_mixer.h offers a matching deinit function.
    shift_table_data_t table_data = shift_table_init(table_size);
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5; /* benchmark duration: 500 ms */
    do {
        // work
        phase = shift_table_cc(input+off, output+off, B, -0.0009F, table_data, phase);
        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );
    save(output, B, off, NULL);
    free(input);
    free(output);
    T = ( t1 - t0 ); /* total duration of the timed loop */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
    return (nI / T); /* normalized iterations per second */
}
/*
 * Benchmarks shift_addfast_cc() (out-of-place) for up to about 500 ms of
 * user time and optionally saves the output.
 *
 * Returns the processed samples per second, or 0 when the block length
 * does not fit into N samples or the buffers could not be allocated.
 */
double bench_shift_addfast(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    float phase = 0.0F;
    // the loop processes one block before testing any bound
    if (B <= 0 || B > N)
    {
        fprintf(stderr, "error: block length %d does not fit into %d samples\n", B, N);
        return 0.0;
    }
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    complexf *output = (complexf *)malloc(N * sizeof(complexf));
    if (!input || !output)
    {
        fprintf(stderr, "error: cannot allocate buffers for %d samples\n", N);
        free(input);
        free(output);
        return 0.0;
    }
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_addfast_data_t state = shift_addfast_init(-0.0009F);
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5; /* benchmark duration: 500 ms */
    do {
        // work
        phase = shift_addfast_cc(input+off, output+off, B, &state, phase);
        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );
    save(output, B, off, BENCH_FILE_ADD_FAST_CC);
    free(input);
    free(output);
    T = ( t1 - t0 ); /* total duration of the timed loop */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
    return (nI / T); /* normalized iterations per second */
}
/*
 * Timed core loop for shift_addfast_inp_c() (in-place).
 *
 *   B            block length in samples
 *   N            number of samples available in data
 *   ignore_time  true: run through the whole buffer; false: stop after
 *                about 500 ms of user time
 *   iters_out    receives the number of processed blocks
 *   off_out      receives the number of processed samples
 *
 * Returns the elapsed time in seconds as measured with uclock_sec().
 * When B is not positive or exceeds N, nothing is processed and 0 is
 * returned with both counters set to 0.
 */
double bench_core_shift_addfast_inplace(
    const int B, const int N, const bool ignore_time,
    complexf *data,
    shift_addfast_data_t &state,
    int &iters_out, int &off_out
    )
{
    // the loop processes one block before testing any bound:
    // reject block lengths that would run past the N sample buffer
    if (B <= 0 || B > N)
    {
        iters_out = 0;
        off_out = 0;
        return 0.0;
    }
    const double t0 = uclock_sec(1);
    const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
    double t1;
    float phase = 0.0F;
    int off = 0, iter = 0;
    papi_perf_counter perf_counter(1);
    do {
        // work
        phase = shift_addfast_inp_c(data+off, B, &state, phase);
        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( off + B < N && (ignore_time || t1 < tstop) );
    iters_out = iter;
    off_out = off;
    return t1 - t0;
}
/*
 * Benchmarks shift_addfast_inp_c() (in-place): allocates N samples, fills
 * them with gen_recursive_osc_c(), runs the timed core loop and optionally
 * saves the result.
 *
 * Returns the processed samples per second, or 0 when the buffer could
 * not be allocated.
 */
double bench_shift_addfast_inp(int B, int N, const bool ignore_time) {
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    // the default size is 512 MByte: allocation may well fail
    if (!input)
    {
        fprintf(stderr, "error: cannot allocate buffer for %d samples\n", N);
        return 0.0;
    }
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_addfast_data_t state = shift_addfast_init(-0.0009F);
    int iter, off;
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    double T = bench_core_shift_addfast_inplace(
        B, N, ignore_time, input, state,
        iter, off
        );
    save(input, B, off, BENCH_FILE_ADD_FAST_INP_C);
    free(input);
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
    return (nI / T); /* normalized iterations per second */
}
double bench_shift_unroll_oop(int B, int N) {
double t0, t1, tstop, T, nI;
int iter, off;
float phase = 0.0F;
complexf *input = (complexf *)malloc(N * sizeof(complexf));
complexf *output = (complexf *)malloc(N * sizeof(complexf));
shift_recursive_osc_t gen_state;
shift_recursive_osc_conf_t gen_conf;
shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
iter = 0;
off = 0;
t0 = uclock_sec(1);
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
do {
// work
phase = shift_unroll_cc(input+off, output+off, B, &state, phase);
off += B;
++iter;
t1 = uclock_sec(0);
} while ( t1 < tstop && off + B < N );
save(output, B, off, NULL);
free(input);
free(output);
T = ( t1 - t0 ); /* duration per fft() */
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
return (nI / T); /* normalized iterations per second */
}
/*
 * Timed core loop for shift_unroll_inp_c() (in-place).
 *
 *   B            block length in samples
 *   N            number of samples available in data
 *   ignore_time  true: run through the whole buffer; false: stop after
 *                about 500 ms of user time
 *   iters_out    receives the number of processed blocks
 *   off_out      receives the number of processed samples
 *
 * Returns the elapsed time in seconds as measured with uclock_sec().
 * When B is not positive or exceeds N, nothing is processed and 0 is
 * returned with both counters set to 0.
 */
double bench_core_shift_unroll_inplace(
    const int B, const int N, const bool ignore_time,
    complexf *data,
    shift_unroll_data_t &state,
    int &iters_out, int &off_out
    )
{
    // the loop processes one block before testing any bound:
    // reject block lengths that would run past the N sample buffer
    if (B <= 0 || B > N)
    {
        iters_out = 0;
        off_out = 0;
        return 0.0;
    }
    const double t0 = uclock_sec(1);
    const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
    double t1;
    float phase = 0.0F;
    int off = 0, iter = 0;
    papi_perf_counter perf_counter(1);
    do {
        // work
        phase = shift_unroll_inp_c(data+off, B, &state, phase);
        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( off + B < N && (ignore_time || t1 < tstop) );
    iters_out = iter;
    off_out = off;
    return t1 - t0;
}
/*
 * Benchmarks shift_unroll_inp_c() (in-place): allocates N samples, fills
 * them with gen_recursive_osc_c(), runs the timed core loop and optionally
 * saves the result.
 *
 * Returns the processed samples per second, or 0 when the buffer could
 * not be allocated.
 */
double bench_shift_unroll_inp(const int B, const int N, const bool ignore_time) {
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    // checked before the unroll state is created, so nothing else needs cleanup
    if (!input)
    {
        fprintf(stderr, "error: cannot allocate buffer for %d samples\n", N);
        return 0.0;
    }
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);
    int iter, off;
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    double T = bench_core_shift_unroll_inplace(
        B, N, ignore_time, input, state,
        iter, off
        );
    save(input, B, off, BENCH_FILE_UNROLL_INP_C);
    free(input);
    shift_unroll_deinit(&state);
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
    return (nI / T); /* normalized iterations per second */
}
/*
 * Benchmarks shift_limited_unroll_cc() (out-of-place) for up to about
 * 500 ms of user time.
 *
 * Returns the processed samples per second, or 0 when the block length
 * does not fit into N samples or the buffers could not be allocated.
 */
double bench_shift_limited_unroll_oop(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    // the loop processes one block before testing any bound
    if (B <= 0 || B > N)
    {
        fprintf(stderr, "error: block length %d does not fit into %d samples\n", B, N);
        return 0.0;
    }
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    complexf *output = (complexf *)malloc(N * sizeof(complexf));
    if (!input || !output)
    {
        fprintf(stderr, "error: cannot allocate buffers for %d samples\n", N);
        free(input);
        free(output);
        return 0.0;
    }
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5; /* benchmark duration: 500 ms */
    do {
        // work
        shift_limited_unroll_cc(input+off, output+off, B, &state);
        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );
    save(output, B, off, NULL);
    free(input);
    free(output);
    T = ( t1 - t0 ); /* total duration of the timed loop */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
    return (nI / T); /* normalized iterations per second */
}
/*
 * Timed core loop for shift_limited_unroll_inp_c() (in-place).
 *
 *   B            block length in samples
 *   N            number of samples available in data
 *   ignore_time  true: run through the whole buffer; false: stop after
 *                about 500 ms of user time
 *   iters_out    receives the number of processed blocks
 *   off_out      receives the number of processed samples
 *
 * Returns the elapsed time in seconds as measured with uclock_sec().
 * When B is not positive or exceeds N, nothing is processed and 0 is
 * returned with both counters set to 0.
 */
double bench_core_shift_limited_unroll_inplace(
    const int B, const int N, const bool ignore_time,
    complexf *data,
    shift_limited_unroll_data_t &state,
    int &iters_out, int &off_out
    )
{
    // the loop processes one block before testing any bound:
    // reject block lengths that would run past the N sample buffer
    if (B <= 0 || B > N)
    {
        iters_out = 0;
        off_out = 0;
        return 0.0;
    }
    const double t0 = uclock_sec(1);
    const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
    double t1;
    int off = 0, iter = 0;
    papi_perf_counter perf_counter(1);
    do {
        // work
        shift_limited_unroll_inp_c(data+off, B, &state);
        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( off + B < N && (ignore_time || t1 < tstop) );
    iters_out = iter;
    off_out = off;
    return t1 - t0;
}
/*
 * Benchmarks shift_limited_unroll_inp_c() (in-place): allocates N samples,
 * fills them with gen_recursive_osc_c(), runs the timed core loop and
 * optionally saves the result.
 *
 * Returns the processed samples per second, or 0 when the buffer could
 * not be allocated.
 */
double bench_shift_limited_unroll_inp(const int B, const int N, const bool ignore_time) {
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    // the default size is 512 MByte: allocation may well fail
    if (!input)
    {
        fprintf(stderr, "error: cannot allocate buffer for %d samples\n", N);
        return 0.0;
    }
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);
    int iter, off;
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    double T = bench_core_shift_limited_unroll_inplace(
        B, N, ignore_time, input, state,
        iter, off
        );
    save(input, B, off, BENCH_FILE_LTD_UNROLL_INP_C);
    free(input);
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
    return (nI / T); /* normalized iterations per second */
}
/*
 * Timed core loop for shift_limited_unroll_A_sse_inp_c() (in-place).
 *
 *   B            block length in samples
 *   N            number of samples available in data
 *   ignore_time  true: run through the whole buffer; false: stop after
 *                about 500 ms of user time
 *   iters_out    receives the number of processed blocks
 *   off_out      receives the number of processed samples
 *
 * Returns the elapsed time in seconds as measured with uclock_sec().
 * When B is not positive or exceeds N, nothing is processed and 0 is
 * returned with both counters set to 0.
 */
double bench_core_shift_limited_unroll_A_sse_inplace(
    const int B, const int N, const bool ignore_time,
    complexf *data,
    shift_limited_unroll_A_sse_data_t &state,
    int &iters_out, int &off_out
    )
{
    // the loop processes one block before testing any bound:
    // reject block lengths that would run past the N sample buffer
    if (B <= 0 || B > N)
    {
        iters_out = 0;
        off_out = 0;
        return 0.0;
    }
    const double t0 = uclock_sec(1);
    const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
    double t1;
    int off = 0, iter = 0;
    papi_perf_counter perf_counter(1);
    do {
        // work
        shift_limited_unroll_A_sse_inp_c(data+off, B, &state);
        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( off + B < N && (ignore_time || t1 < tstop) );
    iters_out = iter;
    off_out = off;
    return t1 - t0;
}
/*
 * Benchmarks shift_limited_unroll_A_sse_inp_c() (in-place): allocates N
 * samples and the heap-allocated shifter state, fills the samples with
 * gen_recursive_osc_c(), runs the timed core loop and optionally saves the
 * result.
 *
 * Returns the processed samples per second, or 0 when an allocation failed.
 */
double bench_shift_limited_unroll_A_sse_inp(const int B, const int N, const bool ignore_time) {
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_limited_unroll_A_sse_data_t *state = (shift_limited_unroll_A_sse_data_t*)malloc(sizeof(shift_limited_unroll_A_sse_data_t));
    int iter, off;
    // both pointers are dereferenced below: stop on a failed allocation
    if (!input || !state)
    {
        fprintf(stderr, "error: cannot allocate buffers for %d samples\n", N);
        free(input);
        free(state);
        return 0.0;
    }
    *state = shift_limited_unroll_A_sse_init(-0.0009F, 0.0F);
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    double T = bench_core_shift_limited_unroll_A_sse_inplace(
        B, N, ignore_time, input, *state,
        iter, off
        );
    save(input, B, off, BENCH_FILE_LTD_UNROLL_A_SSE_INP_C);
    free(input);
    free(state);
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
    return (nI / T); /* normalized iterations per second */
}
/*
 * Timed core loop for shift_limited_unroll_B_sse_inp_c() (in-place).
 *
 *   B            block length in samples
 *   N            number of samples available in data
 *   ignore_time  true: run through the whole buffer; false: stop after
 *                about 500 ms of user time
 *   iters_out    receives the number of processed blocks
 *   off_out      receives the number of processed samples
 *
 * Returns the elapsed time in seconds as measured with uclock_sec().
 * When B is not positive or exceeds N, nothing is processed and 0 is
 * returned with both counters set to 0.
 */
double bench_core_shift_limited_unroll_B_sse_inplace(
    const int B, const int N, const bool ignore_time,
    complexf *data,
    shift_limited_unroll_B_sse_data_t &state,
    int &iters_out, int &off_out
    )
{
    // the loop processes one block before testing any bound:
    // reject block lengths that would run past the N sample buffer
    if (B <= 0 || B > N)
    {
        iters_out = 0;
        off_out = 0;
        return 0.0;
    }
    const double t0 = uclock_sec(1);
    const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
    double t1;
    int off = 0, iter = 0;
    papi_perf_counter perf_counter(1);
    do {
        // work
        shift_limited_unroll_B_sse_inp_c(data+off, B, &state);
        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( off + B < N && (ignore_time || t1 < tstop) );
    iters_out = iter;
    off_out = off;
    return t1 - t0;
}
/*
 * Benchmarks shift_limited_unroll_B_sse_inp_c() (in-place): allocates N
 * samples and the heap-allocated shifter state, fills the samples with
 * gen_recursive_osc_c(), runs the timed core loop and optionally saves the
 * result.
 *
 * Returns the processed samples per second, or 0 when an allocation failed.
 */
double bench_shift_limited_unroll_B_sse_inp(const int B, const int N, const bool ignore_time) {
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_limited_unroll_B_sse_data_t *state = (shift_limited_unroll_B_sse_data_t*)malloc(sizeof(shift_limited_unroll_B_sse_data_t));
    int iter, off;
    // both pointers are dereferenced below: stop on a failed allocation
    if (!input || !state)
    {
        fprintf(stderr, "error: cannot allocate buffers for %d samples\n", N);
        free(input);
        free(state);
        return 0.0;
    }
    *state = shift_limited_unroll_B_sse_init(-0.0009F, 0.0F);
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    //shift_recursive_osc_init(0.0F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    double T = bench_core_shift_limited_unroll_B_sse_inplace(
        B, N, ignore_time, input, *state,
        iter, off
        );
    save(input, B, off, BENCH_FILE_LTD_UNROLL_B_SSE_INP_C);
    free(input);
    free(state);
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
    return (nI / T); /* normalized iterations per second */
}
/*
 * Timed core loop for shift_limited_unroll_C_sse_inp_c() (in-place).
 *
 *   B            block length in samples
 *   N            number of samples available in data
 *   ignore_time  true: run through the whole buffer; false: stop after
 *                about 500 ms of user time
 *   iters_out    receives the number of processed blocks
 *   off_out      receives the number of processed samples
 *
 * Returns the elapsed time in seconds as measured with uclock_sec().
 * When B is not positive or exceeds N, nothing is processed and 0 is
 * returned with both counters set to 0.
 */
double bench_core_shift_limited_unroll_C_sse_inplace(
    const int B, const int N, const bool ignore_time,
    complexf *data,
    shift_limited_unroll_C_sse_data_t &state,
    int &iters_out, int &off_out
    )
{
    // the loop processes one block before testing any bound:
    // reject block lengths that would run past the N sample buffer
    if (B <= 0 || B > N)
    {
        iters_out = 0;
        off_out = 0;
        return 0.0;
    }
    const double t0 = uclock_sec(1);
    const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
    double t1;
    int off = 0, iter = 0;
    papi_perf_counter perf_counter(1);
    do {
        // work
        shift_limited_unroll_C_sse_inp_c(data+off, B, &state);
        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( off + B < N && (ignore_time || t1 < tstop) );
    iters_out = iter;
    off_out = off;
    return t1 - t0;
}
/*
 * Benchmarks shift_limited_unroll_C_sse_inp_c() (in-place): allocates N
 * samples and the heap-allocated shifter state, fills the samples with
 * gen_recursive_osc_c(), runs the timed core loop and optionally saves the
 * result.
 *
 * Returns the processed samples per second, or 0 when an allocation failed.
 */
double bench_shift_limited_unroll_C_sse_inp(const int B, const int N, const bool ignore_time) {
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_limited_unroll_C_sse_data_t *state = (shift_limited_unroll_C_sse_data_t*)malloc(sizeof(shift_limited_unroll_C_sse_data_t));
    int iter, off;
    // both pointers are dereferenced below: stop on a failed allocation
    if (!input || !state)
    {
        fprintf(stderr, "error: cannot allocate buffers for %d samples\n", N);
        free(input);
        free(state);
        return 0.0;
    }
    *state = shift_limited_unroll_C_sse_init(-0.0009F, 0.0F);
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    double T = bench_core_shift_limited_unroll_C_sse_inplace(
        B, N, ignore_time, input, *state,
        iter, off
        );
    save(input, B, off, BENCH_FILE_LTD_UNROLL_C_SSE_INP_C);
    free(input);
    free(state);
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
    return (nI / T); /* normalized iterations per second */
}
/*
 * Benchmarks shift_recursive_osc_cc() (out-of-place) for up to about 500 ms
 * of user time.
 *
 * Returns the processed samples per second, or 0 when the block length
 * does not fit into N samples or the buffers could not be allocated.
 */
double bench_shift_rec_osc_cc_oop(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    // the loop processes one block before testing any bound
    if (B <= 0 || B > N)
    {
        fprintf(stderr, "error: block length %d does not fit into %d samples\n", B, N);
        return 0.0;
    }
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    complexf *output = (complexf *)malloc(N * sizeof(complexf));
    if (!input || !output)
    {
        fprintf(stderr, "error: cannot allocate buffers for %d samples\n", N);
        free(input);
        free(output);
        return 0.0;
    }
    shift_recursive_osc_t gen_state, shift_state;
    shift_recursive_osc_conf_t gen_conf, shift_conf;
    shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5; /* benchmark duration: 500 ms */
    do {
        // work
        shift_recursive_osc_cc(input+off, output+off, B, &shift_conf, &shift_state);
        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );
    // NOTE(review): the named file receives the INPUT signal here, while the
    // other out-of-place benchmarks save their output - confirm the intent.
    save(input, B, off, BENCH_FILE_REC_OSC_CC);
    save(output, B, off, NULL);
    free(input);
    free(output);
    T = ( t1 - t0 ); /* total duration of the timed loop */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
    return (nI / T); /* normalized iterations per second */
}
/*
 * Timed core loop for shift_recursive_osc_inp_c() (in-place).
 *
 *   B            block length in samples
 *   N            number of samples available in data
 *   ignore_time  true: run through the whole buffer; false: stop after
 *                about 500 ms of user time
 *   iters_out    receives the number of processed blocks
 *   off_out      receives the number of processed samples
 *
 * Returns the elapsed time in seconds as measured with uclock_sec().
 * When B is not positive or exceeds N, nothing is processed and 0 is
 * returned with both counters set to 0.
 */
double bench_core_shift_rec_osc_cc_inplace(
    const int B, const int N, const bool ignore_time,
    complexf *data,
    shift_recursive_osc_conf_t &conf, shift_recursive_osc_t &state,
    int &iters_out, int &off_out
    )
{
    // the loop processes one block before testing any bound:
    // reject block lengths that would run past the N sample buffer
    if (B <= 0 || B > N)
    {
        iters_out = 0;
        off_out = 0;
        return 0.0;
    }
    const double t0 = uclock_sec(1);
    const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
    double t1;
    int off = 0, iter = 0;
    papi_perf_counter perf_counter(1);
    do {
        // work
        shift_recursive_osc_inp_c(data+off, B, &conf, &state);
        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( off + B < N && (ignore_time || t1 < tstop) );
    iters_out = iter;
    off_out = off;
    return t1 - t0;
}
/*
 * Benchmarks shift_recursive_osc_inp_c() (in-place): allocates N samples,
 * fills them with gen_recursive_osc_c(), runs the timed core loop and
 * optionally saves the result.
 *
 * Returns the processed samples per second, or 0 when the buffer could
 * not be allocated.
 */
double bench_shift_rec_osc_cc_inp(const int B, const int N, const bool ignore_time) {
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    // the default size is 512 MByte: allocation may well fail
    if (!input)
    {
        fprintf(stderr, "error: cannot allocate buffer for %d samples\n", N);
        return 0.0;
    }
    shift_recursive_osc_t gen_state, shift_state;
    shift_recursive_osc_conf_t gen_conf, shift_conf;
    int iter, off;
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);
    double T = bench_core_shift_rec_osc_cc_inplace(
        B, N, ignore_time, input, shift_conf, shift_state,
        iter, off
        );
    save(input, B, off, BENCH_FILE_REC_OSC_INP_C);
    free(input);
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
    return (nI / T); /* normalized iterations per second */
}
/*
 * Timed core loop for shift_recursive_osc_sse_inp_c() (in-place).
 *
 *   B            block length in samples
 *   N            number of samples available in data
 *   ignore_time  true: run through the whole buffer; false: stop after
 *                about 500 ms of user time
 *   iters_out    receives the number of processed blocks
 *   off_out      receives the number of processed samples
 *
 * Returns the elapsed time in seconds as measured with uclock_sec().
 * When B is not positive or exceeds N, nothing is processed and 0 is
 * returned with both counters set to 0.
 */
double bench_core_shift_rec_osc_sse_c_inplace(
    const int B, const int N, const bool ignore_time,
    complexf *data,
    shift_recursive_osc_sse_conf_t &conf, shift_recursive_osc_sse_t &state,
    int &iters_out, int &off_out
    )
{
    // the loop processes one block before testing any bound:
    // reject block lengths that would run past the N sample buffer
    if (B <= 0 || B > N)
    {
        iters_out = 0;
        off_out = 0;
        return 0.0;
    }
    const double t0 = uclock_sec(1);
    const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
    double t1;
    int off = 0, iter = 0;
    papi_perf_counter perf_counter(1);
    do {
        // work
        shift_recursive_osc_sse_inp_c(data+off, B, &conf, &state);
        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( off + B < N && (ignore_time || t1 < tstop) );
    iters_out = iter;
    off_out = off;
    return t1 - t0;
}
/*
 * Benchmarks shift_recursive_osc_sse_inp_c() (in-place): allocates N samples
 * and the heap-allocated shifter state, fills the samples with
 * gen_recursive_osc_c(), runs the timed core loop and optionally saves the
 * result.
 *
 * Returns the processed samples per second, or 0 when an allocation failed.
 */
double bench_shift_rec_osc_sse_c_inp(const int B, const int N, const bool ignore_time) {
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_recursive_osc_sse_t *shift_state = (shift_recursive_osc_sse_t*)malloc(sizeof(shift_recursive_osc_sse_t));
    shift_recursive_osc_sse_conf_t shift_conf;
    int iter, off;
    // both pointers are used below: stop on a failed allocation
    if (!input || !shift_state)
    {
        fprintf(stderr, "error: cannot allocate buffers for %d samples\n", N);
        free(input);
        free(shift_state);
        return 0.0;
    }
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    shift_recursive_osc_sse_init(-0.0009F, 0.0F, &shift_conf, shift_state);
    double T = bench_core_shift_rec_osc_sse_c_inplace(
        B, N, ignore_time, input, shift_conf, *shift_state,
        iter, off
        );
    save(input, B, off, BENCH_FILE_REC_OSC_SSE_INP_C);
    free(input);
    free(shift_state);
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
    return (nI / T); /* normalized iterations per second */
}
/* Command line entry point of the mixer benchmark.
 *
 * Usage: <prog> [<blockLength in samples> [<total # of MSamples>] ]
 *
 * Without arguments the usage line is printed and the benchmarks run with
 * the defaults. Invalid arguments (non-positive values, a total that does
 * not fit into an int, or a block longer than the total) print the usage
 * line and exit with status 0, as zero-valued arguments always did.
 */
int main(int argc, char **argv)
{
    double rt;
    // process up to 64 MSample (512 MByte) in blocks of 8 kSamples (=64 kByte)
    int B = 8 * 1024;
    int N = 64 * 1024 * 1024;
    int showUsage = 0;
    bool ignore_time = true;

    if (argc == 1)
        showUsage = 1;
    if (1 < argc)
        B = atoi(argv[1]);
    if (2 < argc)
    {
        // scale MSamples to samples in 64 bit, so that large values cannot
        // overflow the int multiplication; values not representable as int
        // are mapped to 0 and rejected below
        const long long total = (long long)atoi(argv[2]) * 1024 * 1024;
        N = (int)total;
        if ( (long long)N != total )
            N = 0;
    }

    // every bench core processes at least one block of B samples:
    // B must be positive and must not exceed the buffer length N
    const bool badArgs = ( B <= 0 || N <= 0 || B > N );
    if ( badArgs || showUsage )
    {
        fprintf(stderr, "%s [<blockLength in samples> [<total # of MSamples>] ]\n", argv[0]);
        if ( badArgs )
            return 0;
    }

    fprintf(stderr, "processing up to N = %d MSamples with block length of %d samples\n",
        N / (1024 * 1024), B );

#if BENCH_REF_TRIG_FUNC
    printf("\nstarting bench of shift_math_cc (out-of-place) with trig functions ..\n");
    rt = bench_shift_math_cc(B, N, ignore_time);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);
#endif

#if BENCH_OUT_OF_PLACE_ALGOS
    printf("starting bench of shift_table_cc (out-of-place) ..\n");
    rt = bench_shift_table_cc(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("starting bench of shift_addfast_cc (out-of-place) ..\n");
    rt = bench_shift_addfast(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("\nstarting bench of shift_unroll_cc (out-of-place) ..\n");
    rt = bench_shift_unroll_oop(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("\nstarting bench of shift_limited_unroll_cc (out-of-place) ..\n");
    rt = bench_shift_limited_unroll_oop(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("\nstarting bench of shift_recursive_osc_cc (out-of-place) ..\n");
    rt = bench_shift_rec_osc_cc_oop(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);
#endif

#if BENCH_INPLACE_ALGOS
    printf("starting bench of shift_addfast_inp_c in-place ..\n");
    rt = bench_shift_addfast_inp(B, N, ignore_time);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("starting bench of shift_unroll_inp_c in-place ..\n");
    rt = bench_shift_unroll_inp(B, N, ignore_time);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("starting bench of shift_limited_unroll_inp_c in-place ..\n");
    rt = bench_shift_limited_unroll_inp(B, N, ignore_time);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    if ( have_sse_shift_mixer_impl() )
    {
        printf("starting bench of shift_limited_unroll_A_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_A_sse_inp(B, N, ignore_time);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);

        printf("starting bench of shift_limited_unroll_B_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_B_sse_inp(B, N, ignore_time);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);

        printf("starting bench of shift_limited_unroll_C_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_C_sse_inp(B, N, ignore_time);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);
    }

    printf("starting bench of shift_recursive_osc_cc in-place ..\n");
    rt = bench_shift_rec_osc_cc_inp(B, N, ignore_time);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    if ( have_sse_shift_mixer_impl() )
    {
        printf("starting bench of shift_recursive_osc_sse_c in-place ..\n");
        rt = bench_shift_rec_osc_sse_c_inp(B, N, ignore_time);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);
    }
#endif

    return 0;
}

1402
pffft/bench_pffft.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,26 @@
# Find the header-only MIPP library.
# Once done this will define
#   MIPP_FOUND        - mipp.h was located
#   MIPP_INCLUDE_DIRS - directory containing mipp.h
# and create the interface library target MIPP, which carries the include
# directory, the definition HAVE_MIPP=1 and the C++11 requirement.
# Search hints: ${MIPP_ROOT} and $ENV{HOME}/.local

if(MIPP_INCLUDE_DIRS)
  # already in cache: be silent
  set(MIPP_FIND_QUIETLY TRUE)
endif()

find_path(MIPP_INCLUDE_DIRS NAMES mipp.h
  HINTS
    ${MIPP_ROOT}
    $ENV{HOME}/.local
  PATH_SUFFIXES include/mipp
)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MIPP DEFAULT_MSG MIPP_INCLUDE_DIRS)

if(MIPP_FOUND)
  # create the target only once; a repeated inclusion of this module must
  # neither redefine the target nor report MIPP as missing
  if(NOT TARGET MIPP)
    message(STATUS "MIPP_FOUND -> creating interface library MIPP at ${MIPP_INCLUDE_DIRS}")
    add_library(MIPP INTERFACE)
    target_compile_definitions(MIPP INTERFACE HAVE_MIPP=1)
    target_include_directories(MIPP INTERFACE ${MIPP_INCLUDE_DIRS})
    target_compile_features(MIPP INTERFACE cxx_std_11)
  endif()
else()
  message(WARNING "MIPP not found.")
endif()

mark_as_advanced(MIPP_INCLUDE_DIRS)

View File

@@ -0,0 +1,25 @@
# Find PAPI libraries
# Once done this will define
#  PAPI_FOUND - System has PAPI
#  PAPI_INCLUDE_DIRS - The PAPI include directories
#  PAPI_LIBRARIES - The libraries needed to use PAPI
# and create the imported target PAPI::PAPI.
# Search hint: ${PAPI_ROOT}

if(PAPI_INCLUDE_DIRS AND PAPI_LIBRARIES)
  # already in cache: be silent
  set(PAPI_FIND_QUIETLY TRUE)
endif()

find_path(PAPI_INCLUDE_DIRS NAMES papi.h HINTS ${PAPI_ROOT} PATH_SUFFIXES include)
find_library(PAPI_LIBRARIES NAMES papi HINTS ${PAPI_ROOT} PATH_SUFFIXES lib lib64)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(PAPI DEFAULT_MSG PAPI_LIBRARIES PAPI_INCLUDE_DIRS)

if(PAPI_FOUND AND NOT TARGET PAPI::PAPI)
  # find_library() may return a shared or a static library: UNKNOWN lets
  # CMake handle both
  add_library(PAPI::PAPI UNKNOWN IMPORTED)
  # IMPORTED_LOCATION must name exactly one file, so it is set from the
  # find_library() result before 'rt' is appended to PAPI_LIBRARIES;
  # the 'rt' dependency is attached to the target as a link interface
  set_target_properties(PAPI::PAPI PROPERTIES
    INTERFACE_INCLUDE_DIRECTORIES "${PAPI_INCLUDE_DIRS}"
    IMPORTED_LOCATION "${PAPI_LIBRARIES}"
    INTERFACE_LINK_LIBRARIES "rt")
  # variable-based consumers still get the library plus 'rt'
  set(PAPI_LIBRARIES ${PAPI_LIBRARIES} rt)
endif()

mark_as_advanced(PAPI_INCLUDE_DIRS PAPI_LIBRARIES)

View File

@@ -0,0 +1,11 @@
# Enable -Wall -Wextra -pedantic as PRIVATE compile options of <target>
# when the C++ compiler is GNU or Clang; other compilers get no options.
function(target_activate_cxx_compiler_warnings target)
  foreach(compiler_id GNU Clang)
    # one generator expression per compiler id, expanding to the option list
    target_compile_options(${target} PRIVATE
      "$<$<CXX_COMPILER_ID:${compiler_id}>:-Wall;-Wextra;-pedantic>")
  endforeach()
endfunction()
# Enable -Wall -Wextra -pedantic as PRIVATE compile options of <target>
# when the C compiler is GNU or Clang; other compilers get no options.
function(target_activate_c_compiler_warnings target)
  foreach(compiler_id GNU Clang)
    # one generator expression per compiler id, expanding to the option list
    target_compile_options(${target} PRIVATE
      "$<$<C_COMPILER_ID:${compiler_id}>:-Wall;-Wextra;-pedantic>")
  endforeach()
endfunction()

View File

@@ -0,0 +1,197 @@
# cmake options: TARGET_C_ARCH / TARGET_CXX_ARCH:
# and optionally: TARGET_C_EXTRA TARGET_CXX_EXTRA
#
# provided:
# - function: target_set_c_arch_flags(<target>) # uses options TARGET_C_ARCH and TARGET_C_EXTRA
# - function: target_set_cxx_arch_flags(<target>) # uses options TARGET_CXX_ARCH and TARGET_CXX_EXTRA
# - macro: target_set_cxx_arch_option(<target> <gcc/clang_march> <gcc/clang_extra> <msvc_arch>)
#
# see https://en.wikichip.org/wiki/x86/extensions
# and https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html
# for gcc specific architecture options
# and https://docs.microsoft.com/en-us/cpp/build/reference/arch-x64
# or https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86
# for msvc specific architecture options
# https://en.wikichip.org/wiki/arm/versions
# https://en.wikipedia.org/wiki/Raspberry_Pi
# https://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html#ARM-Options
# https://en.wikipedia.org/wiki/Comparison_of_ARMv7-A_cores
# https://en.wikipedia.org/wiki/Comparison_of_ARMv8-A_cores
# arm32_rpi1 untested
# -mcpu=arm1176jzf-s -mfloat-abi=hard -mfpu=vfp -mtune=arm1176jzf-s
# arm32_rpi2 untested
# "-march=armv7-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
# "-march=armv8-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
# arm32_rpi3 with "armv7-a" tested on Raspbian GNU/Linux 10 (buster), 32-bit => MIPP test reports: NEONv1, 128 bits
# "-march=armv7-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
# arm32_rpi3 with "armv8-a" tested on Raspbian GNU/Linux 10 (buster), 32-bit => MIPP test reports: NEONv1, 128 bits
# "-march=armv8-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
# arm32_rpi3 with "armv8-a" tested on Raspbian GNU/Linux 10 (buster), 32-bit => MIPP test reports: NEONv1, 128 bits
# "-march=armv8-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4" "-mtune=cortex-a53"
# arm32_rpi4 untested
# RPi 4 Model B: Cortex-A72 => "-mtune=cortex-a72" ?
# "-mcpu=cortex-a72 -mfloat-abi=hard -mfpu=neon-fp-armv8 -mneon-for-64bits -mtune=cortex-a72"
# Lookup tables for the "extra" options: the value of TARGET_C_EXTRA /
# TARGET_CXX_EXTRA is appended to the GCC_EXTRA_OPT_ prefix to select one of
# the flag lists below.
set(MSVC_EXTRA_OPT_none "")
set(GCC_EXTRA_OPT_none "")
set(GCC_EXTRA_OPT_neon_vfpv4 "-mfloat-abi=hard" "-mfpu=neon-vfpv4")
set(GCC_EXTRA_OPT_neon_rpi3_a53 "-mfloat-abi=hard" "-mfpu=neon-vfpv4" "-mtune=cortex-a53")
set(GCC_EXTRA_OPT_neon_rpi4_a72 "-mfloat-abi=hard" "-mfpu=neon-fp-armv8" "-mtune=cortex-a72")
# Choose the offered -march values (GCC_MARCH_VALUES), their description
# (GCC_MARCH_DESC) and the offered extra-option keys (GCC_EXTRA_VALUES)
# depending on CMAKE_SYSTEM_PROCESSOR.
# An empty GCC_EXTRA_VALUES means no TARGET_*_EXTRA cache option is created below.
if ( (CMAKE_SYSTEM_PROCESSOR STREQUAL "i686") OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") )
# x86: 32 and 64 bit
set(GCC_MARCH_DESC "native/SSE2:pentium4/SSE3:core2/SSE4:nehalem/AVX:sandybridge/AVX2:haswell")
set(GCC_MARCH_VALUES "none;native;pentium4;core2;nehalem;sandybridge;haswell" CACHE INTERNAL "List of possible architectures")
set(GCC_EXTRA_VALUES "" CACHE INTERNAL "List of possible EXTRA options")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
# 64 bit ARM
set(GCC_MARCH_DESC "native/ARMwNEON:armv8-a")
set(GCC_MARCH_VALUES "none;native;armv8-a" CACHE INTERNAL "List of possible architectures")
set(GCC_EXTRA_VALUES "" CACHE INTERNAL "List of possible additional options")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "armv7l")
# 32 bit ARM: the only case offering the neon_* extra options
set(GCC_MARCH_DESC "native/ARMwNEON:armv7-a")
set(GCC_MARCH_VALUES "none;native;armv7-a" CACHE INTERNAL "List of possible architectures")
set(GCC_EXTRA_VALUES "none;neon_vfpv4;neon_rpi3_a53;neon_rpi4_a72" CACHE INTERNAL "List of possible additional options")
else()
message(WARNING "unsupported CMAKE_SYSTEM_PROCESSOR '${CMAKE_SYSTEM_PROCESSOR}'")
# other PROCESSORs could be "ppc", "ppc64", "arm" - or something else?!
# NOTE(review): "AMD64" (Windows) and "arm64" (macOS) would also end up here - confirm whether that is intended
set(GCC_MARCH_DESC "native")
set(GCC_MARCH_VALUES "none;native" CACHE INTERNAL "List of possible architectures")
set(GCC_EXTRA_VALUES "" CACHE INTERNAL "List of possible additional options")
endif()
# cmake options - depending on C/C++ compiler
# how are chances, that C and C++ compilers are from different vendors?

# C compiler: create the cache options TARGET_C_ARCH and - when extra options
# are offered for the processor - TARGET_C_EXTRA, both defaulting to "none"
if (CMAKE_C_COMPILER_ID STREQUAL "GNU")
  set(TARGET_C_ARCH "none" CACHE STRING "gcc target C architecture (-march): ${GCC_MARCH_DESC}")
  set_property(CACHE TARGET_C_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES})
  if ( NOT (GCC_EXTRA_VALUES STREQUAL "") )
    set(TARGET_C_EXTRA "none" CACHE STRING "gcc additional options for C")
    set_property(CACHE TARGET_C_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES})
  endif()
elseif (CMAKE_C_COMPILER_ID STREQUAL "Clang")
  set(TARGET_C_ARCH "none" CACHE STRING "clang target C architecture (-march): ${GCC_MARCH_DESC}")
  set_property(CACHE TARGET_C_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES})
  if ( NOT (GCC_EXTRA_VALUES STREQUAL "") )
    # help text names clang here, matching the C++ option
    set(TARGET_C_EXTRA "none" CACHE STRING "clang additional options for C")
    set_property(CACHE TARGET_C_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES})
  endif()
elseif (CMAKE_C_COMPILER_ID MATCHES "MSVC")
  set(TARGET_C_ARCH "none" CACHE STRING "msvc target C architecture (/arch): SSE2/AVX/AVX2/AVX512")
  set(TARGET_C_EXTRA "none" CACHE STRING "msvc additional options")
else()
  message(WARNING "unsupported C compiler '${CMAKE_C_COMPILER_ID}', see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
endif()
# C++ compiler: create the cache options TARGET_CXX_ARCH and - when extra
# options are offered for the processor - TARGET_CXX_EXTRA; both default to "none"
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(TARGET_CXX_ARCH "none" CACHE STRING "gcc target C++ architecture (-march): ${GCC_MARCH_DESC}")
# offer the processor specific -march values as selectable strings
set_property(CACHE TARGET_CXX_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES})
if ( NOT (GCC_EXTRA_VALUES STREQUAL "") )
set(TARGET_CXX_EXTRA "none" CACHE STRING "gcc additional options for C++")
set_property(CACHE TARGET_CXX_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES})
endif()
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
# clang gets the same value lists as gcc
set(TARGET_CXX_ARCH "none" CACHE STRING "clang target C++ architecture (-march): ${GCC_MARCH_DESC}")
set_property(CACHE TARGET_CXX_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES})
if ( NOT (GCC_EXTRA_VALUES STREQUAL "") )
set(TARGET_CXX_EXTRA "none" CACHE STRING "clang additional options for C++")
set_property(CACHE TARGET_CXX_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES})
endif()
elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
# MSVC: free-text options without a list of selectable values
set(TARGET_CXX_ARCH "none" CACHE STRING "msvc target C++ architecture (/arch): SSE2/AVX/AVX2/AVX512")
set(TARGET_CXX_EXTRA "none" CACHE STRING "msvc additional options")
else()
message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}', see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
endif()
######################################################
# Apply the options TARGET_C_ARCH and TARGET_C_EXTRA to <target> as PRIVATE
# compile options. An empty value or "none" means: nothing to apply.
# PFFFT_ENABLE_NEON=1 is defined for the target when an extra option is
# set and either its key starts with "neon_" or the processor is aarch64.
function(target_set_c_arch_flags target)
  # classify the C compiler once: gcc and clang share the option syntax
  set(is_gnu_like FALSE)
  set(is_msvc FALSE)
  if ( (CMAKE_C_COMPILER_ID STREQUAL "GNU") OR (CMAKE_C_COMPILER_ID STREQUAL "Clang") )
    set(is_gnu_like TRUE)
  elseif (CMAKE_C_COMPILER_ID MATCHES "MSVC")
    set(is_msvc TRUE)
  endif()

  # --- architecture option ---
  set(arch_requested TRUE)
  if ( ("${TARGET_C_ARCH}" STREQUAL "") OR ("${TARGET_C_ARCH}" STREQUAL "none") )
    set(arch_requested FALSE)
  endif()

  if (NOT arch_requested)
    message(STATUS "C ARCH for target ${target} is not set!")
  elseif (is_gnu_like OR is_msvc)
    if (is_gnu_like)
      set(arch_switch "-march=")
    else()
      set(arch_switch "/arch:")
    endif()
    target_compile_options(${target} PRIVATE "${arch_switch}${TARGET_C_ARCH}")
    message(STATUS "C ARCH for target ${target} set: ${TARGET_C_ARCH}")
  else()
    message(WARNING "unsupported C compiler '${CMAKE_C_COMPILER_ID}' for target_set_c_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
  endif()

  # --- additional options ---
  set(extra_requested TRUE)
  if ( ("${TARGET_C_EXTRA}" STREQUAL "") OR ("${TARGET_C_EXTRA}" STREQUAL "none") )
    set(extra_requested FALSE)
  endif()

  if (NOT extra_requested)
    message(STATUS "C additional options for target ${target} is not set!")
  else()
    if (is_gnu_like)
      # the key selects one of the GCC_EXTRA_OPT_<key> flag lists
      target_compile_options(${target} PRIVATE "${GCC_EXTRA_OPT_${TARGET_C_EXTRA}}")
      message(STATUS "C additional options for target ${target} set: ${GCC_EXTRA_OPT_${TARGET_C_EXTRA}}")
    elseif (is_msvc)
      # target_compile_options(${target} PRIVATE "${MSVC_EXTRA_OPT_${TARGET_C_EXTRA}}")
      message(STATUS "C additional options for target ${target} not usable with MSVC")
    else()
      message(WARNING "unsupported C compiler '${CMAKE_C_COMPILER_ID}' for target_set_c_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
    endif()

    if ( ("${TARGET_C_EXTRA}" MATCHES "^neon_.*") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") )
      message(STATUS "additional option contains neon: setting PFFFT_ENABLE_NEON for C target ${target}")
      target_compile_definitions(${target} PRIVATE PFFFT_ENABLE_NEON=1)
    endif()
  endif()
endfunction()
# Apply the options TARGET_CXX_ARCH and TARGET_CXX_EXTRA to <target> as
# PRIVATE compile options. An empty value or "none" means: nothing to apply.
# All decisions are based on the C++ compiler (CMAKE_CXX_COMPILER_ID).
function(target_set_cxx_arch_flags target)
  # --- architecture option ---
  if ( ("${TARGET_CXX_ARCH}" STREQUAL "") OR ("${TARGET_CXX_ARCH}" STREQUAL "none") )
    message(STATUS "C++ ARCH for target ${target} is not set!")
  else()
    if ( (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") )
      target_compile_options(${target} PRIVATE "-march=${TARGET_CXX_ARCH}")
      message(STATUS "C++ ARCH for target ${target} set: ${TARGET_CXX_ARCH}")
    elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
      target_compile_options(${target} PRIVATE "/arch:${TARGET_CXX_ARCH}")
      message(STATUS "C++ ARCH for target ${target} set: ${TARGET_CXX_ARCH}")
    else()
      message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}' for target_set_cxx_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
    endif()
  endif()

  # --- additional options ---
  if ( ("${TARGET_CXX_EXTRA}" STREQUAL "") OR ("${TARGET_CXX_EXTRA}" STREQUAL "none") )
    message(STATUS "C++ additional options for target ${target} is not set!")
  else()
    # the C++ compiler decides here as well: C and C++ compilers may come
    # from different vendors
    if ( (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") )
      # the key selects one of the GCC_EXTRA_OPT_<key> flag lists
      target_compile_options(${target} PRIVATE "${GCC_EXTRA_OPT_${TARGET_CXX_EXTRA}}")
      message(STATUS "C++ additional options for target ${target} set: ${GCC_EXTRA_OPT_${TARGET_CXX_EXTRA}}")
    elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
      # target_compile_options(${target} PRIVATE "${MSVC_EXTRA_OPT_${TARGET_CXX_EXTRA}}")
      message(STATUS "C++ additional options for target ${target} not usable with MSVC")
    else()
      message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}' for target_set_cxx_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
    endif()

    # NOTE(review): this check is only reached when TARGET_CXX_EXTRA is set
    # to something other than "none", so the aarch64 clause has no effect
    # with default options - confirm whether it should apply unconditionally
    if ( ("${TARGET_CXX_EXTRA}" MATCHES "^neon_.*") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") )
      message(STATUS "additional option contains 'neon': setting PFFFT_ENABLE_NEON for C++ target ${target}")
      target_compile_definitions(${target} PRIVATE PFFFT_ENABLE_NEON=1)
    endif()
  endif()
endfunction()
# Set explicitly given architecture options on <target>, independent of the
# TARGET_CXX_ARCH / TARGET_CXX_EXTRA cache options:
#   gcc_clang_arch  - value for -march with gcc/clang
#   gcc_clang_extra - key of a GCC_EXTRA_OPT_<key> flag list for gcc/clang
#   msvc_arch       - value for /arch: with MSVC
# An empty value or "none" skips the respective option.
macro(target_set_cxx_arch_option target gcc_clang_arch gcc_clang_extra msvc_arch )
  if (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
    if ( (NOT ("${msvc_arch}" STREQUAL "")) AND (NOT ("${msvc_arch}" STREQUAL "none")) )
      target_compile_options(${target} PRIVATE "/arch:${msvc_arch}")
      message(STATUS "C++ ARCH for target ${target} set: ${msvc_arch}")
    endif()
  elseif ( (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") )
    if ( (NOT ("${gcc_clang_arch}" STREQUAL "")) AND (NOT ("${gcc_clang_arch}" STREQUAL "none")) )
      target_compile_options(${target} PRIVATE "-march=${gcc_clang_arch}")
      message(STATUS "C++ ARCH for target ${target}: ${gcc_clang_arch}")
    endif()
    if ( (NOT ("${gcc_clang_extra}" STREQUAL "")) AND (NOT ("${gcc_clang_extra}" STREQUAL "none")) )
      target_compile_options(${target} PRIVATE "${GCC_EXTRA_OPT_${gcc_clang_extra}}")
      message(STATUS "C++ additional options for target ${target}: ${GCC_EXTRA_OPT_${gcc_clang_extra}}")
    endif()
  else()
    message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}' for target_set_cxx_arch_option(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
  endif()
endmacro()

25
pffft/cross_build_mingw32.sh Executable file
View File

@@ -0,0 +1,25 @@
#!/bin/bash

# Cross-build pffft for 32-bit Windows with the MinGW-w64 toolchain.
# requires debian/ubuntu packages: zip gcc-mingw-w64
#
# usage: <script> <zip-post> <any other cmake options>
#   <zip-post> is the suffix of the build and install directory names;
#   all further arguments are passed to the cmake configure step.

if [ -z "$1" ]; then
  echo "usage: $0 <zip-post> <any other cmake options>"
  exit 1
fi

ZIP_POST="$1"
shift

CROSS="i686-w64-mingw32"   # not referenced below
WN="w32"
TOOLCHAIN="mingw-w32-i686.cmake"
BUILD_DIR="build_${WN}_${ZIP_POST}"

# all expansions are quoted: <zip-post> is user input and must not be
# word-split or glob-expanded, in particular not for 'rm -rf'
rm -rf -- "${BUILD_DIR}"
echo -e "\n\n********************************************************"
echo "start build of pffft_${WN}_${ZIP_POST}"
mkdir -- "${BUILD_DIR}" && \
cmake -S . -B "${BUILD_DIR}" \
  "-DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN}" \
  "-DCMAKE_INSTALL_PREFIX=pffft_bin-${WN}_${ZIP_POST}" \
  "$@" && \
cmake --build "${BUILD_DIR}"

25
pffft/cross_build_mingw64.sh Executable file
View File

@@ -0,0 +1,25 @@
#!/bin/bash

# Cross-build pffft for 64-bit Windows with the MinGW-w64 toolchain.
# requires debian/ubuntu packages: zip gcc-mingw-w64
#
# usage: <script> <zip-post> <any other cmake options>
#   <zip-post> is the suffix of the build and install directory names;
#   all further arguments are passed to the cmake configure step.

if [ -z "$1" ]; then
  echo "usage: $0 <zip-post> <any other cmake options>"
  exit 1
fi

ZIP_POST="$1"
shift

# CROSS="x86_64-w64-mingw32"
WN="w64"
TOOLCHAIN="mingw-w64-x64_64.cmake"
BUILD_DIR="build_${WN}_${ZIP_POST}"

# all expansions are quoted: <zip-post> is user input and must not be
# word-split or glob-expanded, in particular not for 'rm -rf'
rm -rf -- "${BUILD_DIR}"
echo -e "\n\n********************************************************"
echo "start build of pffft_${WN}_${ZIP_POST}"
mkdir -- "${BUILD_DIR}" && \
cmake -S . -B "${BUILD_DIR}" \
  "-DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN}" \
  "-DCMAKE_INSTALL_PREFIX=pffft_bin-${WN}_${ZIP_POST}" \
  "$@" && \
cmake --build "${BUILD_DIR}"

View File

@@ -0,0 +1,63 @@
cmake_minimum_required(VERSION 3.1)
project(examples)

# MATHLIB: math library to link. Set to "m" for non-MSVC compilers unless
# PFFFT_DISABLE_LINK_WITH_M is set, and to "" for MSVC.
if ( NOT (CMAKE_C_COMPILER_ID MATCHES "MSVC") )
  if (NOT PFFFT_DISABLE_LINK_WITH_M)
    message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m")
    set(MATHLIB "m")
  endif()
else()
  # using Visual Studio C++
  message(STATUS "INFO: detected MSVC: will not link math lib m")
  set(MATHLIB "")
  add_definitions("/D_CRT_SECURE_NO_WARNINGS")
  set(MSVC_DISABLED_WARNINGS_LIST "C4996")
endif()

# STDCXXLIB: stdc++ is linked explicitly with MinGW only
if (MINGW)
  set(STDCXXLIB "stdc++")
else()
  set(STDCXXLIB "")
endif()

# build the C++ examples without compiler specific language extensions
set(CMAKE_CXX_EXTENSIONS OFF)
# double precision examples: only built when PFFFT_USE_TYPE_DOUBLE is on
if (PFFFT_USE_TYPE_DOUBLE)
  # C++11 example: real input
  add_executable(example_cpp11_real_dbl_fwd example_cpp11_real_dbl_fwd.cpp)
  target_compile_definitions(example_cpp11_real_dbl_fwd PRIVATE PFFFT_ENABLE_DOUBLE)
  target_link_libraries(example_cpp11_real_dbl_fwd PFFFT ${STDCXXLIB} ${MATHLIB})
  set_property(TARGET example_cpp11_real_dbl_fwd PROPERTY CXX_STANDARD 11)
  set_property(TARGET example_cpp11_real_dbl_fwd PROPERTY CXX_STANDARD_REQUIRED ON)

  # C++11 example: complex input
  add_executable(example_cpp11_cplx_dbl_fwd example_cpp11_cplx_dbl_fwd.cpp)
  target_compile_definitions(example_cpp11_cplx_dbl_fwd PRIVATE PFFFT_ENABLE_DOUBLE)
  target_link_libraries(example_cpp11_cplx_dbl_fwd PFFFT ${STDCXXLIB} ${MATHLIB})
  set_property(TARGET example_cpp11_cplx_dbl_fwd PROPERTY CXX_STANDARD 11)
  set_property(TARGET example_cpp11_cplx_dbl_fwd PROPERTY CXX_STANDARD_REQUIRED ON)

  # C example: complex input; a double precision example, so it gets
  # PFFFT_ENABLE_DOUBLE like the other targets of this section
  add_executable(example_c_cplx_dbl_fwd example_c_cplx_dbl_fwd.c)
  target_compile_definitions(example_c_cplx_dbl_fwd PRIVATE PFFFT_ENABLE_DOUBLE)
  target_link_libraries(example_c_cplx_dbl_fwd PFFFT ${MATHLIB})
endif()
# single precision examples: only built when PFFFT_USE_TYPE_FLOAT is on
if (PFFFT_USE_TYPE_FLOAT)
  # both C++98 examples are set up identically: source file <name>.cpp,
  # definition PFFFT_ENABLE_FLOAT and a required C++98 standard
  foreach(example_name example_cpp98_real_flt_fwd example_cpp98_cplx_flt_fwd)
    add_executable(${example_name} ${example_name}.cpp)
    target_compile_definitions(${example_name} PRIVATE PFFFT_ENABLE_FLOAT)
    target_link_libraries(${example_name} PFFFT ${STDCXXLIB} ${MATHLIB})
    set_target_properties(${example_name} PROPERTIES
      CXX_STANDARD 98
      CXX_STANDARD_REQUIRED ON)
  endforeach()

  # C example: real input
  add_executable(example_c_real_flt_fwd example_c_real_flt_fwd.c)
  target_compile_definitions(example_c_real_flt_fwd PRIVATE PFFFT_ENABLE_FLOAT)
  target_link_libraries(example_c_real_flt_fwd PFFFT ${MATHLIB})
endif()

View File

@@ -0,0 +1,69 @@
#include "pffft_double.h"
#include <stdio.h>
#include <stdlib.h>
/* Example: forward complex FFT in double precision using the C API.
 *
 * Checks the requested length, creates the setup, fills the input with a
 * test pattern, runs the ordered forward transform and prints all
 * transformLen complex bins. Errors are reported on stderr; the function
 * then returns without transforming.
 */
void c_forward_complex_double(const int transformLen)
{
  printf("running %s()\n", __FUNCTION__);

  /* first check - might be skipped */
  if (transformLen < pffftd_min_fft_size(PFFFT_COMPLEX))
  {
    fprintf(stderr, "Error: minimum FFT transformation length is %d\n", pffftd_min_fft_size(PFFFT_COMPLEX));
    return;
  }

  /* instantiate FFT and prepare transformation for length N */
  PFFFTD_Setup *ffts = pffftd_new_setup(transformLen, PFFFT_COMPLEX);

  /* one more check */
  if (!ffts)
  {
    fprintf(stderr,
            "Error: transformation length %d is not decomposable into small prime factors. "
            "Next valid transform size is: %d ; next power of 2 is: %d\n",
            transformLen,
            pffftd_nearest_transform_size(transformLen, PFFFT_COMPLEX, 1),
            pffftd_next_power_of_two(transformLen) );
    return;
  }

  /* allocate aligned vectors for input X, output Y and work area W;
   * complex data is stored with re/im interleaved: 2 doubles per sample */
  const size_t numBytes = (size_t)transformLen * 2 * sizeof(double);
  double *X = (double*)pffftd_aligned_malloc(numBytes);
  double *Y = (double*)pffftd_aligned_malloc(numBytes);
  double *W = (double*)pffftd_aligned_malloc(numBytes);
  if (!X || !Y || !W)
  {
    fprintf(stderr, "Error: failed to allocate buffers for %d complex samples\n", transformLen);
    if (W) pffftd_aligned_free(W);
    if (Y) pffftd_aligned_free(Y);
    if (X) pffftd_aligned_free(X);
    pffftd_destroy_setup(ffts);
    return;
  }

  /* prepare some input data: real part is +n for even and -n for odd
   * sample index n, imaginary part is 0. Filling sample by sample keeps
   * all writes inside the buffer for odd lengths, too. */
  for (int n = 0; n < transformLen; ++n)
  {
    X[2*n]   = (n & 1) ? -n : n;  /* real */
    X[2*n+1] = 0;                 /* imag */
  }

  /* do the forward transform; write complex spectrum result into Y */
  pffftd_transform_ordered(ffts, X, Y, W, PFFFT_FORWARD);

  /* print spectral output */
  printf("output should be complex spectrum with %d complex bins\n", transformLen);
  for (int k = 0; k < 2 * transformLen; k += 2)
    printf("Y[%d] = %f + i * %f\n", k/2, Y[k], Y[k+1]);

  pffftd_aligned_free(W);
  pffftd_aligned_free(Y);
  pffftd_aligned_free(X);
  pffftd_destroy_setup(ffts);
}
/* Runs the example with the transform length given as first command line
 * argument; defaults to 16. */
int main(int argc, char *argv[])
{
  int N = 16;
  if (argc > 1)
    N = atoi(argv[1]);
  c_forward_complex_double(N);
  return 0;
}

View File

@@ -0,0 +1,66 @@
#include "pffft.h"
#include <stdio.h>
#include <stdlib.h>
/* Example: forward real FFT in single precision using the C API.
 *
 * Checks the requested length, creates the setup, fills the real input
 * with a test pattern, runs the ordered forward transform and prints the
 * transformLen/2 complex bins. Errors are reported on stderr; the function
 * then returns without transforming.
 */
void c_forward_real_float(const int transformLen)
{
  printf("running %s()\n", __FUNCTION__);

  /* first check - might be skipped */
  if (transformLen < pffft_min_fft_size(PFFFT_REAL))
  {
    fprintf(stderr, "Error: minimum FFT transformation length is %d\n", pffft_min_fft_size(PFFFT_REAL));
    return;
  }

  /* instantiate FFT and prepare transformation for length N */
  PFFFT_Setup *ffts = pffft_new_setup(transformLen, PFFFT_REAL);

  /* one more check */
  if (!ffts)
  {
    fprintf(stderr,
            "Error: transformation length %d is not decomposable into small prime factors. "
            "Next valid transform size is: %d ; next power of 2 is: %d\n",
            transformLen,
            pffft_nearest_transform_size(transformLen, PFFFT_REAL, 1),
            pffft_next_power_of_two(transformLen) );
    return;
  }

  /* allocate aligned vectors for input X, output Y and work area W */
  const size_t numBytes = (size_t)transformLen * sizeof(float);
  float *X = (float*)pffft_aligned_malloc(numBytes);
  float *Y = (float*)pffft_aligned_malloc(numBytes);  /* complex: re/im interleaved */
  float *W = (float*)pffft_aligned_malloc(numBytes);
  if (!X || !Y || !W)
  {
    fprintf(stderr, "Error: failed to allocate buffers for %d samples\n", transformLen);
    if (W) pffft_aligned_free(W);
    if (Y) pffft_aligned_free(Y);
    if (X) pffft_aligned_free(X);
    pffft_destroy_setup(ffts);
    return;
  }

  /* prepare some input data: +n for even and -n for odd sample index n.
   * Filling sample by sample never writes behind the end of X. */
  for (int n = 0; n < transformLen; ++n)
    X[n] = (n & 1) ? -n : n;

  /* do the forward transform; write complex spectrum result into Y */
  pffft_transform_ordered(ffts, X, Y, W, PFFFT_FORWARD);

  /* print spectral output; only complete re/im pairs are read */
  printf("output should be complex spectrum with %d complex bins\n", transformLen /2);
  for (int k = 0; k + 1 < transformLen; k += 2)
    printf("Y[%d] = %f + i * %f\n", k/2, Y[k], Y[k+1]);

  pffft_aligned_free(W);
  pffft_aligned_free(Y);
  pffft_aligned_free(X);
  pffft_destroy_setup(ffts);
}
/* Runs the example with the transform length given as first command line
 * argument; defaults to 32. */
int main(int argc, char *argv[])
{
  int N = 32;
  if (argc > 1)
    N = atoi(argv[1]);
  c_forward_real_float(N);
  return 0;
}

View File

@@ -0,0 +1,66 @@
#include "pffft.hpp"
#include <complex>
#include <iostream>
// Example: forward complex FFT in double precision using the C++11 API.
// Checks the requested length, fills the input with a test pattern, runs
// the forward transform and prints every element of the spectrum vector.
// Errors are reported on std::cerr.
void cxx11_forward_complex_double(const int transformLen)
{
  std::cout << "running " << __FUNCTION__ << "()" << std::endl;

  // first check - might be skipped
  using FFT_T = pffft::Fft< std::complex<double> >;
  if (transformLen < FFT_T::minFFtsize())
  {
    std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
    return;
  }

  // instantiate FFT and prepare transformation for length N
  pffft::Fft< std::complex<double> > fft(transformLen);

  // one more check
  if (!fft.isValid())
  {
    std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
              << "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
              << "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
    return;
  }

  // allocate aligned vectors for input X and output Y
  auto X = fft.valueVector();
  auto Y = fft.spectrumVector();

  // alternative access: get raw pointers to aligned vectors
  std::complex<double> *Xs = X.data();
  std::complex<double> *Ys = Y.data();

  // prepare some input data: real part is +k for even and -k for odd index k,
  // imaginary part is 0. Every index is visited on its own, so an odd
  // length does not lead to a write behind the end of the vector.
  for (int k = 0; k < transformLen; ++k)
  {
    const std::complex<double> sample((k & 1) ? -k : k, 0);
    if (k & 1)
      Xs[k] = sample;  // access through raw pointer
    else
      X[k] = sample;   // access through the aligned vector
  }

  // do the forward transform; write complex spectrum result into Y
  fft.forward(X, Y);

  // print spectral output: even bins through the vector, odd bins through
  // the raw pointer; never reads behind the last bin
  std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
  std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
  for (unsigned k = 0; k < Y.size(); ++k)
    std::cout << "Y[" << k << "] = " << ((k & 1) ? Ys[k] : Y[k]) << std::endl;
}
// Runs the example with the transform length given as first command line
// argument; defaults to 16.
int main(int argc, char *argv[])
{
  int N = 16;
  if (argc > 1)
    N = atoi(argv[1]);
  cxx11_forward_complex_double(N);
  return 0;
}

View File

@@ -0,0 +1,66 @@
#include "pffft.hpp"
#include <complex>
#include <iostream>
// Example: forward real FFT in double precision using the C++11 API.
// Checks the requested length, fills the real input with a test pattern,
// runs the forward transform and prints every element of the spectrum
// vector. Errors are reported on std::cerr.
void cxx11_forward_real_double(const int transformLen)
{
  std::cout << "running " << __FUNCTION__ << "()" << std::endl;

  // first check - might be skipped
  using FFT_T = pffft::Fft<double>;
  if (transformLen < FFT_T::minFFtsize())
  {
    std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
    return;
  }

  // instantiate FFT and prepare transformation for length N
  pffft::Fft<double> fft { transformLen };

  // one more check
  if (!fft.isValid())
  {
    std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
              << "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
              << "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
    return;
  }

  // allocate aligned vectors for (real) input X and (complex) output Y
  auto X = fft.valueVector();     // input vector;  type is AlignedVector<double>
  auto Y = fft.spectrumVector();  // output vector; type is AlignedVector< std::complex<double> >

  // alternative access: get raw pointers to aligned vectors
  double *Xs = X.data();
  std::complex<double> *Ys = Y.data();

  // prepare some input data: +k for even and -k for odd index k.
  // Every index is visited on its own, so no element behind the end
  // of the vector is written.
  for (int k = 0; k < transformLen; ++k)
  {
    if (k & 1)
      Xs[k] = -k;  // access through raw pointer
    else
      X[k] = k;    // access through AlignedVector<double>
  }

  // do the forward transform; write complex spectrum result into Y
  fft.forward(X, Y);

  // print spectral output: even bins through the vector, odd bins through
  // the raw pointer; never reads behind the last bin, also when the
  // number of bins is odd
  std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
  std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
  for (unsigned k = 0; k < Y.size(); ++k)
    std::cout << "Y[" << k << "] = " << ((k & 1) ? Ys[k] : Y[k]) << std::endl;
}
// Runs the example with the transform length given as first command line
// argument; defaults to 32.
int main(int argc, char *argv[])
{
  int N = 32;
  if (argc > 1)
    N = atoi(argv[1]);
  cxx11_forward_real_double(N);
  return 0;
}

View File

@@ -0,0 +1,66 @@
#include "pffft.hpp"
#include <complex>
#include <iostream>
// Example: forward complex FFT in single precision using the C++98 API.
// Checks the requested length, fills the input with a test pattern, runs
// the forward transform and prints every element of the spectrum vector.
// Errors are reported on std::cerr.
void cxx98_forward_complex_float(const int transformLen)
{
  std::cout << "running " << __FUNCTION__ << "()" << std::endl;

  // first check - might be skipped
  typedef pffft::Fft< std::complex<float> > FFT_T;
  if (transformLen < FFT_T::minFFtsize())
  {
    std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
    return;
  }

  // instantiate FFT and prepare transformation for length N
  pffft::Fft< std::complex<float> > fft(transformLen);

  // one more check
  if (!fft.isValid())
  {
    std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
              << "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
              << "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
    return;
  }

  // allocate aligned vectors for input X and output Y
  pffft::AlignedVector< std::complex<float> > X = fft.valueVector();
  pffft::AlignedVector< std::complex<float> > Y = fft.spectrumVector();

  // alternative access: get raw pointers to aligned vectors
  std::complex<float> *Xs = X.data();
  std::complex<float> *Ys = Y.data();

  // prepare some input data: real part is +k for even and -k for odd index k,
  // imaginary part is 0. Every index is visited on its own, so an odd
  // length does not lead to a write behind the end of the vector.
  for (int k = 0; k < transformLen; ++k)
  {
    const std::complex<float> sample((k & 1) ? -k : k, 0);
    if (k & 1)
      Xs[k] = sample;  // access through raw pointer
    else
      X[k] = sample;   // access through the aligned vector
  }

  // do the forward transform; write complex spectrum result into Y
  fft.forward(X, Y);

  // print spectral output: even bins through the vector, odd bins through
  // the raw pointer; never reads behind the last bin
  std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
  std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
  for (unsigned k = 0; k < Y.size(); ++k)
    std::cout << "Y[" << k << "] = " << ((k & 1) ? Ys[k] : Y[k]) << std::endl;
}
// Runs the example with the transform length given as first command line
// argument; defaults to 16.
int main(int argc, char *argv[])
{
  int N = 16;
  if (argc > 1)
    N = atoi(argv[1]);
  cxx98_forward_complex_float(N);
  return 0;
}

View File

@@ -0,0 +1,66 @@
#include "pffft.hpp"
#include <complex>
#include <iostream>
// Example: forward real FFT in single precision using the C++98 API.
// Checks the requested length, fills the real input with a test pattern,
// runs the forward transform and prints every element of the spectrum
// vector. Errors are reported on std::cerr.
void cxx98_forward_real_float(const int transformLen)
{
  std::cout << "running " << __FUNCTION__ << "()" << std::endl;

  // first check - might be skipped
  typedef pffft::Fft<float> FFT_T;
  if (transformLen < FFT_T::minFFtsize())
  {
    std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
    return;
  }

  // instantiate FFT and prepare transformation for length N
  pffft::Fft<float> fft(transformLen);

  // one more check
  if (!fft.isValid())
  {
    std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
              << "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
              << "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
    return;
  }

  // allocate aligned vectors for input X and output Y
  pffft::AlignedVector<float> X = fft.valueVector();
  pffft::AlignedVector< std::complex<float> > Y = fft.spectrumVector();

  // alternative access: get raw pointers to aligned vectors
  float *Xs = X.data();
  std::complex<float> *Ys = Y.data();

  // prepare some input data: +k for even and -k for odd index k.
  // Every index is visited on its own, so no element behind the end
  // of the vector is written.
  for (int k = 0; k < transformLen; ++k)
  {
    if (k & 1)
      Xs[k] = -k;  // access through raw pointer
    else
      X[k] = k;    // access through AlignedVector<float>
  }

  // do the forward transform; write complex spectrum result into Y
  fft.forward(X, Y);

  // print spectral output: even bins through the vector, odd bins through
  // the raw pointer; never reads behind the last bin, also when the
  // number of bins is odd
  std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
  std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
  for (unsigned k = 0; k < Y.size(); ++k)
    std::cout << "Y[" << k << "] = " << ((k & 1) ? Ys[k] : Y[k]) << std::endl;
}
// Runs the example with the transform length given as first command line
// argument; defaults to 32.
int main(int argc, char *argv[])
{
  int N = 32;
  if (argc > 1)
    N = atoi(argv[1]);
  cxx98_forward_real_float(N);
  return 0;
}

3130
pffft/fftpack.c Normal file

File diff suppressed because it is too large Load Diff

799
pffft/fftpack.h Normal file
View File

@@ -0,0 +1,799 @@
/*
Interface for the f2c translation of fftpack as found on http://www.netlib.org/fftpack/
FFTPACK license:
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
Copyright (c) 2004 the University Corporation for Atmospheric
Research ("UCAR"). All rights reserved. Developed by NCAR's
Computational and Information Systems Laboratory, UCAR,
www.cisl.ucar.edu.
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
ChangeLog:
2011/10/02: this is my first release of this file.
*/
#ifndef FFTPACK_H
#define FFTPACK_H

#ifdef __cplusplus
extern "C" {
#endif

/* scalar types of the interface: just define FFTPACK_DOUBLE_PRECISION
   if you want to build it as a double precision fft */
#ifdef FFTPACK_DOUBLE_PRECISION
typedef double fftpack_real;
#else
typedef float fftpack_real;
#endif
typedef int fftpack_int;

/* Each xxxi() routine initializes the work array wsave for a given n.
   The required minimum size of wsave differs per transform family and
   is listed in the FFTPACK description at the end of this file. */

/* complex periodic sequences: init, forward, backward */
void cffti(fftpack_int n, fftpack_real *wsave);
void cfftf(fftpack_int n, fftpack_real *c, fftpack_real *wsave);
void cfftb(fftpack_int n, fftpack_real *c, fftpack_real *wsave);

/* real periodic sequences: init, forward, backward */
void rffti(fftpack_int n, fftpack_real *wsave);
void rfftf(fftpack_int n, fftpack_real *r, fftpack_real *wsave);
void rfftb(fftpack_int n, fftpack_real *r, fftpack_real *wsave);

/* cosine transform with odd wave numbers: init, forward, backward */
void cosqi(fftpack_int n, fftpack_real *wsave);
void cosqf(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
void cosqb(fftpack_int n, fftpack_real *x, fftpack_real *wsave);

/* cosine transform of a real even sequence: init, transform */
void costi(fftpack_int n, fftpack_real *wsave);
void cost(fftpack_int n, fftpack_real *x, fftpack_real *wsave);

/* sine transform with odd wave numbers: init, forward, backward */
void sinqi(fftpack_int n, fftpack_real *wsave);
void sinqf(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
void sinqb(fftpack_int n, fftpack_real *x, fftpack_real *wsave);

/* sine transform of a real odd sequence: init, transform */
void sinti(fftpack_int n, fftpack_real *wsave);
void sint(fftpack_int n, fftpack_real *x, fftpack_real *wsave);

#ifdef __cplusplus
}
#endif

#endif /* FFTPACK_H */
/*
FFTPACK
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
version 4 april 1985
a package of fortran subprograms for the fast fourier
transform of periodic and other symmetric sequences
by
paul n swarztrauber
national center for atmospheric research boulder,colorado 80307
which is sponsored by the national science foundation
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
this package consists of programs which perform fast fourier
transforms for both complex and real periodic sequences and
certain other symmetric sequences that are listed below.
1. rffti initialize rfftf and rfftb
2. rfftf forward transform of a real periodic sequence
3. rfftb backward transform of a real coefficient array
4. ezffti initialize ezfftf and ezfftb
5. ezfftf a simplified real periodic forward transform
6. ezfftb a simplified real periodic backward transform
7. sinti initialize sint
8. sint sine transform of a real odd sequence
9. costi initialize cost
10. cost cosine transform of a real even sequence
11. sinqi initialize sinqf and sinqb
12. sinqf forward sine transform with odd wave numbers
13. sinqb unnormalized inverse of sinqf
14. cosqi initialize cosqf and cosqb
15. cosqf forward cosine transform with odd wave numbers
16. cosqb unnormalized inverse of cosqf
17. cffti initialize cfftf and cfftb
18. cfftf forward transform of a complex periodic sequence
19. cfftb unnormalized inverse of cfftf
******************************************************************
subroutine rffti(n,wsave)
****************************************************************
subroutine rffti initializes the array wsave which is used in
both rfftf and rfftb. the prime factorization of n together with
a tabulation of the trigonometric functions are computed and
stored in wsave.
input parameter
n the length of the sequence to be transformed.
output parameter
wsave a work array which must be dimensioned at least 2*n+15.
the same work array can be used for both rfftf and rfftb
as long as n remains unchanged. different wsave arrays
are required for different values of n. the contents of
wsave must not be changed between calls of rfftf or rfftb.
******************************************************************
subroutine rfftf(n,r,wsave)
******************************************************************
subroutine rfftf computes the fourier coefficients of a real
periodic sequence (fourier analysis). the transform is defined
below at output parameter r.
input parameters
n the length of the array r to be transformed. the method
is most efficient when n is a product of small primes.
n may change so long as different work arrays are provided
r a real array of length n which contains the sequence
to be transformed
wsave a work array which must be dimensioned at least 2*n+15.
in the program that calls rfftf. the wsave array must be
initialized by calling subroutine rffti(n,wsave) and a
different wsave array must be used for each different
value of n. this initialization does not have to be
repeated so long as n remains unchanged thus subsequent
transforms can be obtained faster than the first.
the same wsave array can be used by rfftf and rfftb.
output parameters
r r(1) = the sum from i=1 to i=n of r(i)
if n is even set l =n/2 , if n is odd set l = (n+1)/2
then for k = 2,...,l
r(2*k-2) = the sum from i = 1 to i = n of
r(i)*cos((k-1)*(i-1)*2*pi/n)
r(2*k-1) = the sum from i = 1 to i = n of
-r(i)*sin((k-1)*(i-1)*2*pi/n)
if n is even
r(n) = the sum from i = 1 to i = n of
(-1)**(i-1)*r(i)
***** note
this transform is unnormalized since a call of rfftf
followed by a call of rfftb will multiply the input
sequence by n.
wsave contains results which must not be destroyed between
calls of rfftf or rfftb.
******************************************************************
subroutine rfftb(n,r,wsave)
******************************************************************
subroutine rfftb computes the real periodic sequence from its
fourier coefficients (fourier synthesis). the transform is defined
below at output parameter r.
input parameters
n the length of the array r to be transformed. the method
is most efficient when n is a product of small primes.
n may change so long as different work arrays are provided
r a real array of length n which contains the sequence
to be transformed
wsave a work array which must be dimensioned at least 2*n+15.
in the program that calls rfftb. the wsave array must be
initialized by calling subroutine rffti(n,wsave) and a
different wsave array must be used for each different
value of n. this initialization does not have to be
repeated so long as n remains unchanged thus subsequent
transforms can be obtained faster than the first.
the same wsave array can be used by rfftf and rfftb.
output parameters
r for n even and for i = 1,...,n
r(i) = r(1)+(-1)**(i-1)*r(n)
plus the sum from k=2 to k=n/2 of
2.*r(2*k-2)*cos((k-1)*(i-1)*2*pi/n)
-2.*r(2*k-1)*sin((k-1)*(i-1)*2*pi/n)
for n odd and for i = 1,...,n
r(i) = r(1) plus the sum from k=2 to k=(n+1)/2 of
2.*r(2*k-2)*cos((k-1)*(i-1)*2*pi/n)
-2.*r(2*k-1)*sin((k-1)*(i-1)*2*pi/n)
***** note
this transform is unnormalized since a call of rfftf
followed by a call of rfftb will multiply the input
sequence by n.
wsave contains results which must not be destroyed between
calls of rfftb or rfftf.
******************************************************************
subroutine sinti(n,wsave)
******************************************************************
subroutine sinti initializes the array wsave which is used in
subroutine sint. the prime factorization of n together with
a tabulation of the trigonometric functions are computed and
stored in wsave.
input parameter
n the length of the sequence to be transformed. the method
is most efficient when n+1 is a product of small primes.
output parameter
wsave a work array with at least int(2.5*n+15) locations.
different wsave arrays are required for different values
of n. the contents of wsave must not be changed between
calls of sint.
******************************************************************
subroutine sint(n,x,wsave)
******************************************************************
subroutine sint computes the discrete fourier sine transform
of an odd sequence x(i). the transform is defined below at
output parameter x.
sint is the unnormalized inverse of itself since a call of sint
followed by another call of sint will multiply the input sequence
x by 2*(n+1).
the array wsave which is used by subroutine sint must be
initialized by calling subroutine sinti(n,wsave).
input parameters
n the length of the sequence to be transformed. the method
is most efficient when n+1 is the product of small primes.
x an array which contains the sequence to be transformed
wsave a work array with dimension at least int(2.5*n+15)
in the program that calls sint. the wsave array must be
initialized by calling subroutine sinti(n,wsave) and a
different wsave array must be used for each different
value of n. this initialization does not have to be
repeated so long as n remains unchanged thus subsequent
transforms can be obtained faster than the first.
output parameters
x for i=1,...,n
x(i)= the sum from k=1 to k=n
2*x(k)*sin(k*i*pi/(n+1))
a call of sint followed by another call of
sint will multiply the sequence x by 2*(n+1).
hence sint is the unnormalized inverse
of itself.
wsave contains initialization calculations which must not be
destroyed between calls of sint.
******************************************************************
subroutine costi(n,wsave)
******************************************************************
subroutine costi initializes the array wsave which is used in
subroutine cost. the prime factorization of n together with
a tabulation of the trigonometric functions are computed and
stored in wsave.
input parameter
n the length of the sequence to be transformed. the method
is most efficient when n-1 is a product of small primes.
output parameter
wsave a work array which must be dimensioned at least 3*n+15.
different wsave arrays are required for different values
of n. the contents of wsave must not be changed between
calls of cost.
******************************************************************
subroutine cost(n,x,wsave)
******************************************************************
subroutine cost computes the discrete fourier cosine transform
of an even sequence x(i). the transform is defined below at output
parameter x.
cost is the unnormalized inverse of itself since a call of cost
followed by another call of cost will multiply the input sequence
x by 2*(n-1). the transform is defined below at output parameter x
the array wsave which is used by subroutine cost must be
initialized by calling subroutine costi(n,wsave).
input parameters
n the length of the sequence x. n must be greater than 1.
the method is most efficient when n-1 is a product of
small primes.
x an array which contains the sequence to be transformed
wsave a work array which must be dimensioned at least 3*n+15
in the program that calls cost. the wsave array must be
initialized by calling subroutine costi(n,wsave) and a
different wsave array must be used for each different
value of n. this initialization does not have to be
repeated so long as n remains unchanged thus subsequent
transforms can be obtained faster than the first.
output parameters
x for i=1,...,n
x(i) = x(1)+(-1)**(i-1)*x(n)
+ the sum from k=2 to k=n-1
2*x(k)*cos((k-1)*(i-1)*pi/(n-1))
a call of cost followed by another call of
cost will multiply the sequence x by 2*(n-1)
hence cost is the unnormalized inverse
of itself.
wsave contains initialization calculations which must not be
destroyed between calls of cost.
******************************************************************
subroutine sinqi(n,wsave)
******************************************************************
subroutine sinqi initializes the array wsave which is used in
both sinqf and sinqb. the prime factorization of n together with
a tabulation of the trigonometric functions are computed and
stored in wsave.
input parameter
n the length of the sequence to be transformed. the method
is most efficient when n is a product of small primes.
output parameter
wsave a work array which must be dimensioned at least 3*n+15.
the same work array can be used for both sinqf and sinqb
as long as n remains unchanged. different wsave arrays
are required for different values of n. the contents of
wsave must not be changed between calls of sinqf or sinqb.
******************************************************************
subroutine sinqf(n,x,wsave)
******************************************************************
subroutine sinqf computes the fast fourier transform of quarter
wave data. that is , sinqf computes the coefficients in a sine
series representation with only odd wave numbers. the transform
is defined below at output parameter x.
sinqb is the unnormalized inverse of sinqf since a call of sinqf
followed by a call of sinqb will multiply the input sequence x
by 4*n.
the array wsave which is used by subroutine sinqf must be
initialized by calling subroutine sinqi(n,wsave).
input parameters
n the length of the array x to be transformed. the method
is most efficient when n is a product of small primes.
x an array which contains the sequence to be transformed
wsave a work array which must be dimensioned at least 3*n+15.
in the program that calls sinqf. the wsave array must be
initialized by calling subroutine sinqi(n,wsave) and a
different wsave array must be used for each different
value of n. this initialization does not have to be
repeated so long as n remains unchanged thus subsequent
transforms can be obtained faster than the first.
output parameters
x for i=1,...,n
x(i) = (-1)**(i-1)*x(n)
+ the sum from k=1 to k=n-1 of
2*x(k)*sin((2*i-1)*k*pi/(2*n))
a call of sinqf followed by a call of
sinqb will multiply the sequence x by 4*n.
therefore sinqb is the unnormalized inverse
of sinqf.
wsave contains initialization calculations which must not
be destroyed between calls of sinqf or sinqb.
******************************************************************
subroutine sinqb(n,x,wsave)
******************************************************************
subroutine sinqb computes the fast fourier transform of quarter
wave data. that is , sinqb computes a sequence from its
representation in terms of a sine series with odd wave numbers.
the transform is defined below at output parameter x.
sinqf is the unnormalized inverse of sinqb since a call of sinqb
followed by a call of sinqf will multiply the input sequence x
by 4*n.
the array wsave which is used by subroutine sinqb must be
initialized by calling subroutine sinqi(n,wsave).
input parameters
n the length of the array x to be transformed. the method
is most efficient when n is a product of small primes.
x an array which contains the sequence to be transformed
wsave a work array which must be dimensioned at least 3*n+15.
in the program that calls sinqb. the wsave array must be
initialized by calling subroutine sinqi(n,wsave) and a
different wsave array must be used for each different
value of n. this initialization does not have to be
repeated so long as n remains unchanged thus subsequent
transforms can be obtained faster than the first.
output parameters
x for i=1,...,n
x(i)= the sum from k=1 to k=n of
4*x(k)*sin((2*k-1)*i*pi/(2*n))
a call of sinqb followed by a call of
sinqf will multiply the sequence x by 4*n.
therefore sinqf is the unnormalized inverse
of sinqb.
wsave contains initialization calculations which must not
be destroyed between calls of sinqb or sinqf.
******************************************************************
subroutine cosqi(n,wsave)
******************************************************************
subroutine cosqi initializes the array wsave which is used in
both cosqf and cosqb. the prime factorization of n together with
a tabulation of the trigonometric functions are computed and
stored in wsave.
input parameter
n the length of the array to be transformed. the method
is most efficient when n is a product of small primes.
output parameter
wsave a work array which must be dimensioned at least 3*n+15.
the same work array can be used for both cosqf and cosqb
as long as n remains unchanged. different wsave arrays
are required for different values of n. the contents of
wsave must not be changed between calls of cosqf or cosqb.
******************************************************************
subroutine cosqf(n,x,wsave)
******************************************************************
subroutine cosqf computes the fast fourier transform of quarter
wave data. that is , cosqf computes the coefficients in a cosine
series representation with only odd wave numbers. the transform
is defined below at output parameter x
cosqf is the unnormalized inverse of cosqb since a call of cosqf
followed by a call of cosqb will multiply the input sequence x
by 4*n.
the array wsave which is used by subroutine cosqf must be
initialized by calling subroutine cosqi(n,wsave).
input parameters
n the length of the array x to be transformed. the method
is most efficient when n is a product of small primes.
x an array which contains the sequence to be transformed
wsave a work array which must be dimensioned at least 3*n+15
in the program that calls cosqf. the wsave array must be
initialized by calling subroutine cosqi(n,wsave) and a
different wsave array must be used for each different
value of n. this initialization does not have to be
repeated so long as n remains unchanged thus subsequent
transforms can be obtained faster than the first.
output parameters
x for i=1,...,n
x(i) = x(1) plus the sum from k=2 to k=n of
2*x(k)*cos((2*i-1)*(k-1)*pi/(2*n))
a call of cosqf followed by a call of
cosqb will multiply the sequence x by 4*n.
therefore cosqb is the unnormalized inverse
of cosqf.
wsave contains initialization calculations which must not
be destroyed between calls of cosqf or cosqb.
******************************************************************
subroutine cosqb(n,x,wsave)
******************************************************************
subroutine cosqb computes the fast fourier transform of quarter
wave data. that is , cosqb computes a sequence from its
representation in terms of a cosine series with odd wave numbers.
the transform is defined below at output parameter x.
cosqb is the unnormalized inverse of cosqf since a call of cosqb
followed by a call of cosqf will multiply the input sequence x
by 4*n.
the array wsave which is used by subroutine cosqb must be
initialized by calling subroutine cosqi(n,wsave).
input parameters
n the length of the array x to be transformed. the method
is most efficient when n is a product of small primes.
x an array which contains the sequence to be transformed
wsave a work array that must be dimensioned at least 3*n+15
in the program that calls cosqb. the wsave array must be
initialized by calling subroutine cosqi(n,wsave) and a
different wsave array must be used for each different
value of n. this initialization does not have to be
repeated so long as n remains unchanged thus subsequent
transforms can be obtained faster than the first.
output parameters
x for i=1,...,n
x(i)= the sum from k=1 to k=n of
4*x(k)*cos((2*k-1)*(i-1)*pi/(2*n))
a call of cosqb followed by a call of
cosqf will multiply the sequence x by 4*n.
therefore cosqf is the unnormalized inverse
of cosqb.
wsave contains initialization calculations which must not
be destroyed between calls of cosqb or cosqf.
******************************************************************
subroutine cffti(n,wsave)
******************************************************************
subroutine cffti initializes the array wsave which is used in
both cfftf and cfftb. the prime factorization of n together with
a tabulation of the trigonometric functions are computed and
stored in wsave.
input parameter
n the length of the sequence to be transformed
output parameter
wsave a work array which must be dimensioned at least 4*n+15
the same work array can be used for both cfftf and cfftb
as long as n remains unchanged. different wsave arrays
are required for different values of n. the contents of
wsave must not be changed between calls of cfftf or cfftb.
******************************************************************
subroutine cfftf(n,c,wsave)
******************************************************************
subroutine cfftf computes the forward complex discrete fourier
transform (the fourier analysis). equivalently , cfftf computes
the fourier coefficients of a complex periodic sequence.
the transform is defined below at output parameter c.
the transform is not normalized. to obtain a normalized transform
the output must be divided by n. otherwise a call of cfftf
followed by a call of cfftb will multiply the sequence by n.
the array wsave which is used by subroutine cfftf must be
initialized by calling subroutine cffti(n,wsave).
input parameters
n the length of the complex sequence c. the method is
more efficient when n is the product of small primes.
c a complex array of length n which contains the sequence
wsave a real work array which must be dimensioned at least 4n+15
in the program that calls cfftf. the wsave array must be
initialized by calling subroutine cffti(n,wsave) and a
different wsave array must be used for each different
value of n. this initialization does not have to be
repeated so long as n remains unchanged thus subsequent
transforms can be obtained faster than the first.
the same wsave array can be used by cfftf and cfftb.
output parameters
c for j=1,...,n
c(j)=the sum from k=1,...,n of
c(k)*exp(-i*(j-1)*(k-1)*2*pi/n)
where i=sqrt(-1)
wsave contains initialization calculations which must not be
destroyed between calls of subroutine cfftf or cfftb
******************************************************************
subroutine cfftb(n,c,wsave)
******************************************************************
subroutine cfftb computes the backward complex discrete fourier
transform (the fourier synthesis). equivalently , cfftb computes
a complex periodic sequence from its fourier coefficients.
the transform is defined below at output parameter c.
a call of cfftf followed by a call of cfftb will multiply the
sequence by n.
the array wsave which is used by subroutine cfftb must be
initialized by calling subroutine cffti(n,wsave).
input parameters
n the length of the complex sequence c. the method is
more efficient when n is the product of small primes.
c a complex array of length n which contains the sequence
wsave a real work array which must be dimensioned at least 4n+15
in the program that calls cfftb. the wsave array must be
initialized by calling subroutine cffti(n,wsave) and a
different wsave array must be used for each different
value of n. this initialization does not have to be
repeated so long as n remains unchanged thus subsequent
transforms can be obtained faster than the first.
the same wsave array can be used by cfftf and cfftb.
output parameters
c for j=1,...,n
c(j)=the sum from k=1,...,n of
c(k)*exp(i*(j-1)*(k-1)*2*pi/n)
where i=sqrt(-1)
wsave contains initialization calculations which must not be
destroyed between calls of subroutine cfftf or cfftb
*/

20
pffft/fmv.h Normal file
View File

@@ -0,0 +1,20 @@
/* Function multi-versioning (FMV) helper.
 *
 * Provides the macro PF_TARGET_CLONES, to be placed in front of a
 * function definition. It expands to a GCC target_clones attribute when
 * HAVE_FUNC_ATTRIBUTE_IFUNC is set, the compiler reports support for the
 * attribute and the target is x86_64; in every other case it expands to
 * nothing. HAVE_PF_TARGET_CLONES is defined to 1 only in the first case.
 */
#ifndef FMV_H
#define FMV_H  /* was missing: without it the include guard had no effect */

#if HAVE_FUNC_ATTRIBUTE_IFUNC
#if defined(__has_attribute)
#if __has_attribute(target_clones)
#if defined(__x86_64)
/* see https://gcc.gnu.org/wiki/FunctionMultiVersioning */
#define PF_TARGET_CLONES __attribute__((target_clones("avx","sse4.2","sse3","sse2","sse","default")))
#define HAVE_PF_TARGET_CLONES 1
#endif
#endif
#endif
#endif

/* fallback: no multi-versioning, the macro expands to nothing */
#ifndef PF_TARGET_CLONES
#define PF_TARGET_CLONES
#endif

#endif /* FMV_H */

View File

@@ -0,0 +1,25 @@
# Sample toolchain file for building for Windows from an Ubuntu Linux system.
#
# Typical usage:
# *) install cross compiler: `sudo apt-get install mingw-w64`
# *) cd build
# *) cmake -DCMAKE_TOOLCHAIN_FILE=~/mingw-w32-i686.cmake ..
#
# build for Windows' 32 bit architecture
set(CMAKE_SYSTEM_NAME Windows)
# the i686 toolchain produces 32 bit binaries, so the target processor
# must not be announced as x86_64
set(CMAKE_SYSTEM_PROCESSOR i686)
set(TOOLCHAIN_PREFIX i686-w64-mingw32)
# cross compilers to use for C, C++ and Windows resource files (RC)
set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}-gcc)
set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}-g++)
set(CMAKE_RC_COMPILER ${TOOLCHAIN_PREFIX}-windres)
# target environment on the build host system
set(CMAKE_FIND_ROOT_PATH /usr/${TOOLCHAIN_PREFIX})
# modify default behavior of FIND_XXX() commands:
# programs are never searched in the target environment, libraries and
# headers only there
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)

View File

@@ -0,0 +1,25 @@
# Sample CMake toolchain file: cross-build for 64 bit Windows on an
# Ubuntu Linux host using the mingw-w64 compilers.
#
# How to use it:
# *) install cross compiler: `sudo apt-get install mingw-w64`
# *) cd build
# *) cmake -DCMAKE_TOOLCHAIN_FILE=~/mingw-w64-x86_64.cmake ..
#
# description of the target system: Windows on x86_64
set(CMAKE_SYSTEM_NAME "Windows")
set(CMAKE_SYSTEM_PROCESSOR "x86_64")
# common name prefix of all cross tools
set(TOOLCHAIN_PREFIX "x86_64-w64-mingw32")
# cross tools for C, C++ and Windows resource files (RC)
set(CMAKE_C_COMPILER "${TOOLCHAIN_PREFIX}-gcc")
set(CMAKE_CXX_COMPILER "${TOOLCHAIN_PREFIX}-g++")
set(CMAKE_RC_COMPILER "${TOOLCHAIN_PREFIX}-windres")
# location of the target environment on the build host
set(CMAKE_FIND_ROOT_PATH "/usr/${TOOLCHAIN_PREFIX}")
# search behavior of the FIND_XXX() commands: libraries and headers are
# looked up only in the target environment, programs never
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)

97
pffft/papi_perf_counter.h Normal file
View File

@@ -0,0 +1,97 @@
#pragma once
/* for measurement of CPU cycles ..
*
* requires
* sudo apt-get install libpapi-dev papi-tools
* on debian/ubuntu linux distributions
*
*/
#ifdef HAVE_PAPI
#include <papi.h>
#endif
#include <stdio.h>
/* Small measurement helper built on PAPI_ipc().
 *
 * start() takes a first reading, finish() takes a second one and stores
 * the differences in the public members, print() writes them to a FILE.
 * When the header is compiled without HAVE_PAPI, start() always fails and
 * nothing is ever measured or printed.
 * NOTE(review): the subtraction in finish() presumably relies on
 * PAPI_ipc() reporting values accumulated since its first call - confirm
 * against the PAPI documentation.
 */
struct papi_perf_counter
{
  /* creates an idle counter: nothing is measured until start() is called
     and nothing is printed at destruction */
  papi_perf_counter()
    : realTime(0.0F), processTime(0.0F), instructions(0LL), ipc(0.0F)
    , started(false), finished(false), print_at_destruction(false)
  { }

  /* creates a counter and starts it immediately.
     _start is ignored; it only selects this constructor.
     print_at_destruction_ controls whether the destructor prints to stderr.
     All members are initialized here, so that they hold defined values
     even when start() fails (e.g. without HAVE_PAPI). */
  papi_perf_counter(int _start, bool print_at_destruction_ = true)
    : realTime(0.0F), processTime(0.0F), instructions(0LL), ipc(0.0F)
    , started(false), finished(false), print_at_destruction(print_at_destruction_)
  {
    (void)_start;
    start();
  }

  /* prints the measurement to stderr if requested at construction */
  ~papi_perf_counter()
  {
    if (print_at_destruction)
      print(stderr);
  }

  /* takes the start reading; returns true on success.
     An error is reported on stderr only once per program run. */
  bool start()
  {
    static bool reported_start_error = false;
#ifdef HAVE_PAPI
    int ret = PAPI_ipc(&realTime, &processTime, &instructions, &ipc);
    if (ret && !reported_start_error)
    {
      reported_start_error = true;
      fprintf(stderr, "papi_perf_counter::start(): PAPI_ipc() returned error %d\n", ret);
    }
#else
    if (!reported_start_error)
    {
      reported_start_error = true;
      fprintf(stderr, "papi_perf_counter::start(): no HAVE_PAPI\n");
    }
    int ret = 1;
#endif
    started = (!ret);
    finished = false;
    return started;
  }

  /* takes the end reading and replaces the members by the differences to
     the start reading (ipc is taken over from the end reading).
     Returns true if a measurement was completed by this call. */
  bool finish()
  {
    /* a temporary, non printing counter delivers the end reading */
    papi_perf_counter end(1, false);
    if (started && !finished && end.started)
    {
      realTime = end.realTime - realTime;
      processTime = end.processTime - processTime;
      instructions = end.instructions - instructions;
      ipc = end.ipc;
      finished = true;
      return true;
    }
    return false;
  }

  /* writes the measurement to f; finishes a running measurement first.
     Does nothing if there is no completed measurement. */
  void print(FILE *f = stdout)
  {
    if (started && !finished)
      finish();
    if (!started || !finished)
      return;
    /* NOTE(review): ipc is not checked for zero before the division */
    double cycles = instructions / ipc;
    fprintf(f, "real %g, process %g, instructions %lld, ins/cycle %f => cycles %g\n"
      , realTime, processTime, instructions, ipc, cycles
      );
    /* mark as idle, so the same measurement is not printed a second time */
    started = false;
  }

  float realTime;             /* set by PAPI_ipc(); difference after finish() */
  float processTime;          /* set by PAPI_ipc(); difference after finish() */
  long long instructions;     /* set by PAPI_ipc(); difference after finish() */
  float ipc;                  /* instructions per cycle as reported by PAPI_ipc() */
  bool started;               /* last start() succeeded and result not yet printed */
  bool finished;              /* finish() completed the measurement */
  bool print_at_destruction;  /* destructor calls print(stderr) */
};

298
pffft/pf_carrier.cpp Normal file
View File

@@ -0,0 +1,298 @@
/*
This software is part of pffft/pfdsp, a set of simple DSP routines.
Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* include own header first, to see missing includes */
#include "pf_carrier.h"
#include "fmv.h"
#include <limits.h>
#include <assert.h>
PF_TARGET_CLONES
void generate_dc_f(float* output, int size)
{
    /* writes `size` complex samples (re, im interleaved) of a DC carrier:
       exp(i*0) = 1+i*0, with the amplitude 1 scaled to 127/128 */
    const float amplitude = (127.0F / 128.0F);
    float* cursor = output;
    for (int remaining = size; remaining > 0; --remaining)
    {
        *cursor++ = amplitude;  /* real part */
        *cursor++ = 0.0F;       /* imaginary part */
    }
}
PF_TARGET_CLONES
void generate_dc_s16(short* output, int size)
{
    /* writes `size` complex samples (re, im interleaved) of a DC carrier:
       exp(i*0) = 1+i*0, with the amplitude 1 mapped to SHRT_MAX */
    short* cursor = output;
    for (int remaining = size; remaining > 0; --remaining)
    {
        *cursor++ = SHRT_MAX;  /* real part */
        *cursor++ = 0;         /* imaginary part */
    }
}
PF_TARGET_CLONES
void generate_pos_fs4_f(float* output, int size)
{
    /* one period (4 complex samples, re/im interleaved) of the carrier:
       exp(i*0)     =  1+i*0
       exp(i*+pi/2) =  0+i*1
       exp(i*+pi)   = -1+i*0
       exp(i*-pi/2) =  0+i*-1
       with the amplitude 1 scaled to 127/128 */
    static const float period[8] = {
        (127.0F / 128.0F),  0.0F,
        0.0F,               (127.0F / 128.0F),
        (-127.0F / 128.0F), 0.0F,
        0.0F,               (-127.0F / 128.0F)
    };
    /* size must be multiple of 4 */
    assert(!(size&3));
    const int total = 2*size;
    /* copy the period over and over, 8 floats at a time */
    for (int base = 0; base < total; base += 8)
        for (int j = 0; j < 8; ++j)
            output[base + j] = period[j];
}
PF_TARGET_CLONES
void generate_pos_fs4_s16(short* output, int size)
{
    /* one period (4 complex samples, re/im interleaved) of the carrier:
       exp(i*0)     =  1+i*0
       exp(i*+pi/2) =  0+i*1
       exp(i*+pi)   = -1+i*0
       exp(i*-pi/2) =  0+i*-1
       with the amplitude 1 mapped to SHRT_MAX */
    static const short period[8] = {
        SHRT_MAX,  0,
        0,         SHRT_MAX,
        -SHRT_MAX, 0,
        0,         -SHRT_MAX
    };
    /* size must be multiple of 4 */
    assert(!(size&3));
    const int total = 2*size;
    /* copy the period over and over, 8 values at a time */
    for (int base = 0; base < total; base += 8)
        for (int j = 0; j < 8; ++j)
            output[base + j] = period[j];
}
PF_TARGET_CLONES
void generate_neg_fs4_f(float* output, int size)
{
    /* one period (4 complex samples, re/im interleaved) of the carrier:
       exp(i*0)     =  1+i*0
       exp(i*-pi/2) =  0+i*-1
       exp(i*+pi)   = -1+i*0
       exp(i*+pi/2) =  0+i*1
       with the amplitude 1 scaled to 127/128 */
    static const float period[8] = {
        (127.0F / 128.0F),  0.0F,
        0.0F,               (-127.0F / 128.0F),
        (-127.0F / 128.0F), 0.0F,
        0.0F,               (127.0F / 128.0F)
    };
    /* size must be multiple of 4 */
    assert(!(size&3));
    const int total = 2*size;
    /* copy the period over and over, 8 floats at a time */
    for (int base = 0; base < total; base += 8)
        for (int j = 0; j < 8; ++j)
            output[base + j] = period[j];
}
PF_TARGET_CLONES
void generate_neg_fs4_s16(short* output, int size)
{
    /* Write a complex carrier at -fs/4 as interleaved (re, im) 16-bit samples.
     * One period is four samples: 1, -i, -1, +i with 1 mapped to SHRT_MAX. */
    static const short period[8] = {
        SHRT_MAX,  0,            /* exp(i*  0   ) =  1 + i* 0 */
        0,        -SHRT_MAX,     /* exp(i* -pi/2) =  0 + i*-1 */
        -SHRT_MAX, 0,            /* exp(i* +pi  ) = -1 + i* 0 */
        0,         SHRT_MAX      /* exp(i* +pi/2) =  0 + i* 1 */
    };
    /* size must be multiple of 4 */
    assert((size & 3) == 0);
    const int n_shorts = 2 * size;
    for (int base = 0; base < n_shorts; base += 8)
    {
        for (int j = 0; j < 8; ++j)
            output[base + j] = period[j];
    }
}
/****************************************************/
PF_TARGET_CLONES
void generate_dc_pos_fs4_s16(short* output, int size)
{
    /* Sum of a DC component and a +fs/4 carrier, each with amplitude
     * m = SHRT_MAX/2, written as interleaved (re, im) 16-bit samples. */
    const short m = (short)(SHRT_MAX / 2);
    const short period[8] = {
        (short)(m + m), 0,            /* dc + exp(i*  0   ) = 1 + 1 + i* 0 */
        m,              m,            /* dc + exp(i* +pi/2) = 1 + 0 + i* 1 */
        0,              0,            /* dc + exp(i* +pi  ) = 1 - 1 + i* 0 */
        m,              (short)(-m)   /* dc + exp(i* -pi/2) = 1 + 0 + i*-1 */
    };
    /* size must be multiple of 4 */
    assert((size & 3) == 0);
    const int n_shorts = 2 * size;
    for (int base = 0; base < n_shorts; base += 8)
    {
        for (int j = 0; j < 8; ++j)
            output[base + j] = period[j];
    }
}
PF_TARGET_CLONES
void generate_dc_neg_fs4_s16(short* output, int size)
{
    /* Sum of a DC component and a -fs/4 carrier, each with amplitude
     * m = SHRT_MAX/2, written as interleaved (re, im) 16-bit samples. */
    const short m = (short)(SHRT_MAX / 2);
    /* size must be multiple of 4 */
    assert((size & 3) == 0);
    const int n_shorts = 2 * size;
    for (int base = 0; base < n_shorts; base += 8)
    {
        short* p = output + base;
        /* dc + exp(i*  0   ) = 1 + 1 + i* 0 */
        p[0] = (short)(m + m);  p[1] = 0;
        /* dc + exp(i* -pi/2) = 1 + 0 + i*-1 */
        p[2] = m;               p[3] = (short)(-m);
        /* dc + exp(i* +pi  ) = 1 - 1 + i* 0 */
        p[4] = 0;               p[5] = 0;
        /* dc + exp(i* +pi/2) = 1 + 0 + i* 1 */
        p[6] = m;               p[7] = m;
    }
}
PF_TARGET_CLONES
void generate_pos_neg_fs4_s16(short* output, int size)
{
    /* Test signal containing both a +fs/4 and a -fs/4 component, written as
     * interleaved (re, im) 16-bit samples with m = SHRT_MAX/2.
     *
     * The four samples of one period are
     *   (m,-m), (-m,m), (-m,m), (m,-m)  =  m*(1 - i) * { +1, -1, -1, +1 }
     * The real sequence {+1,-1,-1,+1} has period 4 and zero mean, and its
     * 4-point DFT is non-zero only in bins 1 and 3, i.e. at +fs/4 and -fs/4.
     *
     * NOTE(review): this is not the plain sum pos(n) + neg(n) = {2, 0, -2, 0};
     * presumably the phase offset is intentional - confirm against the users
     * of this generator before changing any value. */
    const short m = (short)(SHRT_MAX / 2);
    const short hi = m;
    const short lo = (short)(-m);
    const short period[8] = {
        hi, lo,
        lo, hi,
        lo, hi,
        hi, lo
    };
    /* size must be multiple of 4 */
    assert((size & 3) == 0);
    const int n_shorts = 2 * size;
    for (int base = 0; base < n_shorts; base += 8)
    {
        for (int j = 0; j < 8; ++j)
            output[base + j] = period[j];
    }
}
PF_TARGET_CLONES
void generate_dc_pos_neg_fs4_s16(short* output, int size)
{
    /* Test signal with a DC component plus +fs/4 and -fs/4 components,
     * written as interleaved (re, im) 16-bit samples with m = SHRT_MAX/2.
     *
     * Each sample is the real DC offset (m, 0) added to the pattern
     *   (m,-m), (-m,m), (-m,m), (m,-m)
     * which gives (2m,-m), (0,m), (0,m), (2m,-m).
     *
     * NOTE(review): as with the DC-free variant, the +-fs/4 part is not the
     * plain sum pos(n) + neg(n); presumably intentional - confirm. */
    const short m = (short)(SHRT_MAX / 2);
    /* size must be multiple of 4 */
    assert((size & 3) == 0);
    const int n_shorts = 2 * size;
    for (int base = 0; base < n_shorts; base += 8)
    {
        short* p = output + base;
        p[0] = (short)(m + m);  p[1] = (short)(-m);
        p[2] = 0;               p[3] = m;
        p[4] = 0;               p[5] = m;
        p[6] = (short)(m + m);  p[7] = (short)(-m);
    }
}
PF_TARGET_CLONES
void generate_pos_neg_fs2_s16(short* output, int size)
{
    /* Carrier at fs/2 as interleaved (re, im) 16-bit samples: the real part
     * alternates +m, -m (exp(i*0) = +1, exp(i*pi) = -1) with m = SHRT_MAX/2,
     * the imaginary part stays 0. Four samples are written per iteration. */
    const short m = (short)(SHRT_MAX / 2);
    /* size must be multiple of 4 */
    assert((size & 3) == 0);
    const int n_shorts = 2 * size;
    for (int base = 0; base < n_shorts; base += 8)
    {
        for (int n = 0; n < 4; ++n)
        {
            /* even samples: +m, odd samples: -m */
            output[base + 2 * n]     = (n & 1) ? (short)(-m) : m;
            output[base + 2 * n + 1] = 0;
        }
    }
}
PF_TARGET_CLONES
void generate_dc_pos_neg_fs2_s16(short* output, int size)
{
    /* Carrier at fs/2 plus an imaginary DC offset (dc = i*1), written as
     * interleaved (re, im) 16-bit samples with m = SHRT_MAX/2:
     *   dc + exp(i*0 ) = +1 + i*1  ->  ( m, m)
     *   dc + exp(i*pi) = -1 + i*1  ->  (-m, m)
     * Four samples are written per iteration. */
    const short m = (short)(SHRT_MAX / 2);
    /* size must be multiple of 4 */
    assert((size & 3) == 0);
    const int n_shorts = 2 * size;
    for (int base = 0; base < n_shorts; base += 8)
    {
        for (int n = 0; n < 4; ++n)
        {
            output[base + 2 * n]     = (n & 1) ? (short)(-m) : m;  /* alternating real part */
            output[base + 2 * n + 1] = m;                          /* constant imaginary dc */
        }
    }
}

75
pffft/pf_carrier.h Normal file
View File

@@ -0,0 +1,75 @@
/*
This software is part of pffft/pfdsp, a set of simple DSP routines.
Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <stdio.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
_____ _
/ ____| | |
| | ___ _ __ ___ _ __ | | _____ __
| | / _ \| '_ ` _ \| '_ \| |/ _ \ \/ /
| |___| (_) | | | | | | |_) | | __/> <
\_____\___/|_| |_| |_| .__/|_|\___/_/\_\
| |
|_|
*/
/* Complex float sample: i = in-phase (real) part, q = quadrature (imaginary) part.
 * The same struct is also defined by other pfdsp headers (pf_cic.h, pf_cplx.h);
 * the guard keeps the definition from being repeated when several of them
 * end up in one translation unit. */
#ifndef PF_COMPLEXF_DEFINED
#define PF_COMPLEXF_DEFINED
typedef struct complexf_s { float i; float q; } complexf;
#endif
/* generation functions
 *
 * All generators write `size` complex samples as interleaved (re, im) pairs,
 * so `output` must have room for 2*size elements.
 * The _f variants use an amplitude of 127/128; the plain _s16 carriers use
 * SHRT_MAX, the combined _s16 signals use SHRT_MAX/2 per component.
 * Every generator except generate_dc_* asserts that size is a multiple of 4.
 */
/* constant (0 Hz) carrier */
void generate_dc_f(float* output, int size);
void generate_dc_s16(short* output, int size);
/* carrier at +fs/4: 1, +i, -1, -i */
void generate_pos_fs4_f(float* output, int size);
void generate_pos_fs4_s16(short* output, int size);
/* carrier at -fs/4: 1, -i, -1, +i */
void generate_neg_fs4_f(float* output, int size);
void generate_neg_fs4_s16(short* output, int size);
/* combinations of DC, +fs/4 and -fs/4 components */
void generate_dc_pos_fs4_s16(short* output, int size);
void generate_dc_neg_fs4_s16(short* output, int size);
void generate_pos_neg_fs4_s16(short* output, int size);
void generate_dc_pos_neg_fs4_s16(short* output, int size);
/* carrier at fs/2 (real part alternating +/-), without and with an imaginary DC offset */
void generate_pos_neg_fs2_s16(short* output, int size);
void generate_dc_pos_neg_fs2_s16(short* output, int size);
#ifdef __cplusplus
}
#endif

255
pffft/pf_cic.cpp Normal file
View File

@@ -0,0 +1,255 @@
/*
This software is part of pffft/pfdsp, a set of simple DSP routines.
Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* gcc requires this for M_PI !? */
#undef __STRICT_ANSI__
/* include own header first, to see missing includes */
#include "pf_cic.h"
#include "fmv.h"
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
/*
____ ___ ____ ____ ____ ____
/ ___|_ _/ ___| | _ \| _ \ / ___|
| | | | | | | | | | | | |
| |___ | | |___ | |_| | |_| | |___
\____|___\____| |____/|____/ \____|
*/
/* log2 of the number of oscillator table entries per full cycle */
#define SINESHIFT 12
/* number of table entries per full cycle (4096) */
#define SINESIZE (1<<SINESHIFT)
typedef int64_t cic_dt; // data type used for integrators and combs
/* State of one CIC digital down-converter; created by cicddc_init(),
 * carried across calls of the cicddc_*_c() processing functions. */
typedef struct {
int factor; // decimation factor: input samples consumed per output sample
uint64_t phase; // oscillator phase accumulator; the top SINESHIFT bits index the table
float gain; // output scaling applied when converting the accumulators to float
cic_dt ig0a, ig0b, ig1a, ig1b; // integrator states (a/b = the two output channels i/q)
cic_dt comb0a, comb0b, comb1a, comb1b; // comb filter delay elements
int16_t *sinetable; // cosine table with SINESIZE * 5/4 entries, owned by this struct
} cicddc_t;
/* Allocate and initialise the state for a CIC digital down-converter.
 *
 * factor: decimation factor, i.e. the number of input samples consumed per
 *         output sample by the cicddc_*_c() functions.
 *         NOTE(review): factor == 0 makes the gain computation divide by
 *         zero; presumably callers always pass factor >= 1 - confirm.
 *
 * Returns an opaque state pointer that must be released with cicddc_free(),
 * or NULL when memory allocation fails.
 */
void *cicddc_init(int factor) {
    const int sinesize2 = SINESIZE * 5/4; // 25% extra to get cosine from the same table
    const float sineamp = 32767.0f;

    /* calloc gives a zeroed state: phase, integrators and combs start at 0 */
    cicddc_t *s = (cicddc_t *)calloc(1, sizeof(cicddc_t));
    if (!s)
        return NULL;

    s->sinetable = (int16_t *)malloc(sinesize2 * sizeof(*s->sinetable));
    if (!s->sinetable) {
        /* do not leak the half-built state */
        free(s);
        return NULL;
    }

    s->factor = factor;
    s->gain = 1.0f / SHRT_MAX / sineamp / factor / factor / factor; // compensate for gain of 3 integrators

    /* one full cosine cycle spread over SINESIZE entries, plus a quarter cycle extra */
    const double f = 2.0 * M_PI / (double)SINESIZE;
    for (int i = 0; i < sinesize2; i++) {
        s->sinetable[i] = (int16_t)(sineamp * cos(f * i));
    }
    return s;
}
/* Release a state created by cicddc_init(), including its oscillator table.
 * Passing NULL is allowed and does nothing (like free()). */
void cicddc_free(void *state) {
    cicddc_t *s = (cicddc_t *)state;
    if (!s)
        return;
    free(s->sinetable);
    free(s);
}
PF_TARGET_CLONES
/* Down-convert and decimate real 16-bit input to complex float output.
 *
 * state   : state from cicddc_init(); integrators, combs and oscillator phase
 *           are read at entry and written back at exit, so consecutive calls
 *           continue the same stream.
 * input   : outsize * factor real samples.
 * output  : outsize complex samples.
 * outsize : number of output samples to produce.
 * rate    : oscillator frequency in cycles per input sample. Only the
 *           fractional part matters: -0.25 and 0.75 give the same phase
 *           increment. Non-finite values are treated as 0.
 */
void cicddc_s16_c(void *state, int16_t *input, complexf *output, int outsize, float rate) {
    cicddc_t *s = (cicddc_t *)state;
    int k;
    int factor = s->factor;
    cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
    cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
    uint64_t phase = s->phase, freq;
    int16_t *sinetable = s->sinetable;
    float gain = s->gain;

    /* Phase increment = frac(rate) * 2^64.
       Converting a negative or too large floating point value directly to
       uint64_t is undefined behaviour (and saturates to 0 on some CPUs),
       so reduce the rate to [0, 1) first. The arithmetic is done in double
       so that rates already inside [0, 1) give exactly the same increment
       as the plain float product. */
    double cycles = (double)rate - floor((double)rate);
    if (!(cycles >= 0.0 && cycles < 1.0))
        cycles = 0.0;   /* NaN/inf input, or a tiny negative rate that rounded up to 1.0 */
    freq = (uint64_t)(cycles * 18446744073709551616.0);   /* 2^64 */

    int16_t *inp = input;
    for(k = 0; k < outsize; k++) {
        int i;
        cic_dt out0a, out0b, out1a, out1b;
        cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
        for(i = 0; i < factor; i++) {
            cic_dt in_a, in_b;
            /* top SINESHIFT bits of the phase select the table entry */
            int sinep = phase >> (64-SINESHIFT);
            /* mix the real input with the table value a quarter cycle ahead (a) and the table value itself (b) */
            in_a = (int32_t)inp[i] * (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];
            in_b = (int32_t)inp[i] * (int32_t)sinetable[sinep];
            phase += freq;
            /* integrators:
            The calculations are ordered so that each integrator
            takes a result from previous loop iteration
            to make the code more "pipeline-friendly". */
            ig2a += ig1a; ig2b += ig1b;
            ig1a += ig0a; ig1b += ig0b;
            ig0a += in_a; ig0b += in_b;
        }
        inp += factor;
        // comb filters:
        out0a = ig2a - comb0a; out0b = ig2b - comb0b;
        comb0a = ig2a; comb0b = ig2b;
        out1a = out0a - comb1a; out1b = out0b - comb1b;
        comb1a = out0a; comb1b = out0b;
        output[k].i = (float)out1a * gain;
        output[k].q = (float)out1b * gain;
    }
    /* store the running state for the next call */
    s->ig0a = ig0a; s->ig0b = ig0b;
    s->ig1a = ig1a; s->ig1b = ig1b;
    s->comb0a = comb0a; s->comb0b = comb0b;
    s->comb1a = comb1a; s->comb1b = comb1b;
    s->phase = phase;
}
PF_TARGET_CLONES
/* Down-convert and decimate complex 16-bit input to complex float output.
 *
 * state   : state from cicddc_init(); integrators, combs and oscillator phase
 *           are read at entry and written back at exit, so consecutive calls
 *           continue the same stream.
 * input   : outsize * factor complex samples as interleaved (re, im) int16.
 * output  : outsize complex samples.
 * outsize : number of output samples to produce.
 * rate    : oscillator frequency in cycles per input sample. Only the
 *           fractional part matters: -0.25 and 0.75 give the same phase
 *           increment. Non-finite values are treated as 0.
 */
void cicddc_cs16_c(void *state, int16_t *input, complexf *output, int outsize, float rate) {
    cicddc_t *s = (cicddc_t *)state;
    int k;
    int factor = s->factor;
    cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
    cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
    uint64_t phase = s->phase, freq;
    int16_t *sinetable = s->sinetable;
    float gain = s->gain;

    /* Phase increment = frac(rate) * 2^64.
       Converting a negative or too large floating point value directly to
       uint64_t is undefined behaviour (and saturates to 0 on some CPUs),
       so reduce the rate to [0, 1) first. The arithmetic is done in double
       so that rates already inside [0, 1) give exactly the same increment
       as the plain float product. */
    double cycles = (double)rate - floor((double)rate);
    if (!(cycles >= 0.0 && cycles < 1.0))
        cycles = 0.0;   /* NaN/inf input, or a tiny negative rate that rounded up to 1.0 */
    freq = (uint64_t)(cycles * 18446744073709551616.0);   /* 2^64 */

    int16_t *inp = input;
    for(k = 0; k < outsize; k++) {
        int i;
        cic_dt out0a, out0b, out1a, out1b;
        cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
        for(i = 0; i < factor; i++) {
            cic_dt in_a, in_b;
            int32_t m_a, m_b, m_c, m_d;
            /* top SINESHIFT bits of the phase select the table entry */
            int sinep = phase >> (64-SINESHIFT);
            m_a = inp[2*i];     /* input real part */
            m_b = inp[2*i+1];   /* input imaginary part */
            m_c = (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];   /* table value a quarter cycle ahead */
            m_d = (int32_t)sinetable[sinep];
            // complex multiplication:
            in_a = m_a*m_c - m_b*m_d;
            in_b = m_a*m_d + m_b*m_c;
            phase += freq;
            /* integrators:
            The calculations are ordered so that each integrator
            takes a result from previous loop iteration
            to make the code more "pipeline-friendly". */
            ig2a += ig1a; ig2b += ig1b;
            ig1a += ig0a; ig1b += ig0b;
            ig0a += in_a; ig0b += in_b;
        }
        inp += 2*factor;
        // comb filters:
        out0a = ig2a - comb0a; out0b = ig2b - comb0b;
        comb0a = ig2a; comb0b = ig2b;
        out1a = out0a - comb1a; out1b = out0b - comb1b;
        comb1a = out0a; comb1b = out0b;
        output[k].i = (float)out1a * gain;
        output[k].q = (float)out1b * gain;
    }
    /* store the running state for the next call */
    s->ig0a = ig0a; s->ig0b = ig0b;
    s->ig1a = ig1a; s->ig1b = ig1b;
    s->comb0a = comb0a; s->comb0b = comb0b;
    s->comb1a = comb1a; s->comb1b = comb1b;
    s->phase = phase;
}
/* This is almost copy paste from cicddc_cs16_c.
I'm afraid this is going to be annoying to maintain... */
PF_TARGET_CLONES
/* Down-convert and decimate complex unsigned 8-bit input to complex float output.
 *
 * state   : state from cicddc_init(); integrators, combs and oscillator phase
 *           are read at entry and written back at exit, so consecutive calls
 *           continue the same stream.
 * input   : outsize * factor complex samples as interleaved (re, im) uint8.
 *           Each byte is scaled by 256 and a fixed offset of 32614
 *           (about 127.4 * 256) is subtracted.
 * output  : outsize complex samples.
 * outsize : number of output samples to produce.
 * rate    : oscillator frequency in cycles per input sample. Only the
 *           fractional part matters: -0.25 and 0.75 give the same phase
 *           increment. Non-finite values are treated as 0.
 */
void cicddc_cu8_c(void *state, uint8_t *input, complexf *output, int outsize, float rate) {
    cicddc_t *s = (cicddc_t *)state;
    int k;
    int factor = s->factor;
    cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
    cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
    uint64_t phase = s->phase, freq;
    int16_t *sinetable = s->sinetable;
    float gain = s->gain;

    /* Phase increment = frac(rate) * 2^64.
       Converting a negative or too large floating point value directly to
       uint64_t is undefined behaviour (and saturates to 0 on some CPUs),
       so reduce the rate to [0, 1) first. The arithmetic is done in double
       so that rates already inside [0, 1) give exactly the same increment
       as the plain float product. */
    double cycles = (double)rate - floor((double)rate);
    if (!(cycles >= 0.0 && cycles < 1.0))
        cycles = 0.0;   /* NaN/inf input, or a tiny negative rate that rounded up to 1.0 */
    freq = (uint64_t)(cycles * 18446744073709551616.0);   /* 2^64 */

    uint8_t *inp = input;
    for(k = 0; k < outsize; k++) {
        int i;
        cic_dt out0a, out0b, out1a, out1b;
        cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
        for(i = 0; i < factor; i++) {
            cic_dt in_a, in_b;
            int32_t m_a, m_b, m_c, m_d;
            /* top SINESHIFT bits of the phase select the table entry */
            int sinep = phase >> (64-SINESHIFT);
            // subtract 127.4 (good for rtl-sdr)
            m_a = (((int32_t)inp[2*i]) << 8) - 32614;
            m_b = (((int32_t)inp[2*i+1]) << 8) - 32614;
            m_c = (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];   /* table value a quarter cycle ahead */
            m_d = (int32_t)sinetable[sinep];
            // complex multiplication:
            in_a = m_a*m_c - m_b*m_d;
            in_b = m_a*m_d + m_b*m_c;
            phase += freq;
            /* integrators:
            The calculations are ordered so that each integrator
            takes a result from previous loop iteration
            to make the code more "pipeline-friendly". */
            ig2a += ig1a; ig2b += ig1b;
            ig1a += ig0a; ig1b += ig0b;
            ig0a += in_a; ig0b += in_b;
        }
        inp += 2*factor;
        // comb filters:
        out0a = ig2a - comb0a; out0b = ig2b - comb0b;
        comb0a = ig2a; comb0b = ig2b;
        out1a = out0a - comb1a; out1b = out0b - comb1b;
        comb1a = out0a; comb1b = out0b;
        output[k].i = (float)out1a * gain;
        output[k].q = (float)out1b * gain;
    }
    /* store the running state for the next call */
    s->ig0a = ig0a; s->ig0b = ig0b;
    s->ig1a = ig1a; s->ig1b = ig1b;
    s->comb0a = comb0a; s->comb0b = comb0b;
    s->comb1a = comb1a; s->comb1b = comb1b;
    s->phase = phase;
}

58
pffft/pf_cic.h Normal file
View File

@@ -0,0 +1,58 @@
/*
This software is part of pffft/pfdsp, a set of simple DSP routines.
Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
____ ___ ____ ____ ____ ____
/ ___|_ _/ ___| | _ \| _ \ / ___|
| | | | | | | | | | | | |
| |___ | | |___ | |_| | |_| | |___
\____|___\____| |____/|____/ \____|
*/
/* Complex float sample: i = in-phase (real) part, q = quadrature (imaginary) part.
 * The same struct is also defined by other pfdsp headers (pf_carrier.h, pf_cplx.h);
 * the guard keeps the definition from being repeated when several of them
 * end up in one translation unit. */
#ifndef PF_COMPLEXF_DEFINED
#define PF_COMPLEXF_DEFINED
typedef struct complexf_s { float i; float q; } complexf;
#endif
/* Create the state for a CIC down-converter that consumes `factor` input
 * samples per output sample. Release it with cicddc_free(). */
void *cicddc_init(int factor);
/* Free a state returned by cicddc_init(). */
void cicddc_free(void *state);
/* Mix the input with the internal oscillator, decimate by the factor given to
 * cicddc_init() and write `outsize` complex float samples to `output`.
 * `rate` is the oscillator frequency in cycles per input sample.
 *   cicddc_s16_c : input is outsize*factor real int16 samples
 *   cicddc_cs16_c: input is outsize*factor complex samples, interleaved int16 (re, im)
 *   cicddc_cu8_c : input is outsize*factor complex samples, interleaved uint8 (re, im)
 * The state keeps phase and filter memory between calls. */
void cicddc_s16_c(void *state, int16_t *input, complexf *output, int outsize, float rate);
void cicddc_cs16_c(void *state, int16_t *input, complexf *output, int outsize, float rate);
void cicddc_cu8_c(void *state, uint8_t *input, complexf *output, int outsize, float rate);
#ifdef __cplusplus
}
#endif

322
pffft/pf_conv.cpp Normal file
View File

@@ -0,0 +1,322 @@
#include "pf_conv.h"
#include <string.h>
#include <assert.h>
#include <algorithm>
/* Debug tracing: change the condition to `#if 1` to print diagnostics to
 * stderr; otherwise DPRINT() expands to an empty statement and its arguments
 * are not evaluated. */
#if 0
#include <stdio.h>
#define DPRINT(...) fprintf(stderr, __VA_ARGS__)
#else
#define DPRINT(...) do { } while (0)
#endif
#ifdef HAVE_MIPP
#include <mipp.h>
#endif
/* CONV_ARCH_POST is the architecture suffix of this translation unit and has
 * to be supplied on the compiler command line. */
#ifndef CONV_ARCH_POST
#error CONV_ARCH_POST not defined
#endif
/* Two-level stringify / concatenate helpers, so that macro arguments are
 * expanded before `#` / `##` is applied. */
#define PP_STRINGIFY(X) #X
#define PP_TOSTRING(X) PP_STRINGIFY(X)
#define PP_CONCAT_IMPL(x, y) x##y
#define PP_CONCAT(x, y) PP_CONCAT_IMPL( x, y )
/* ARCHFUNCNAME(foo) expands to foo_<CONV_ARCH_POST>, e.g. foo_avx2 */
#define ARCHFUNCNAME(X) PP_CONCAT(X##_,CONV_ARCH_POST)
// Name of the architecture variant this translation unit was compiled for,
// i.e. the stringified CONV_ARCH_POST.
const char * ARCHFUNCNAME(id)()
{
    const char * const arch_name = PP_TOSTRING(CONV_ARCH_POST);
    return arch_name;
}
// Number of floats processed per SIMD register by this variant;
// 1 when the kernels are built without MIPP intrinsics.
int ARCHFUNCNAME(conv_float_simd_size)()
{
#if !defined(MIPP_NO_INTRINSICS) && defined(HAVE_MIPP)
    return mipp::N<float>();
#else
    // have a completely MIPP independent implementation
    return 1;
#endif
}
// Move the samples that the previous convolution call left unprocessed
// (s[offset] .. s[size-1]) to the start of the buffer and update `state`
// to describe them: offset becomes 0, size the number of kept samples.
void ARCHFUNCNAME(conv_float_move_rest)(float * RESTRICT s, conv_buffer_state * RESTRICT state)
{
    const int unprocessed = state->size - state->offset;
    const int kept = (unprocessed > 0) ? unprocessed : 0;
    if (kept != 0)
    {
        // source and destination may overlap; copying forwards to a lower address is fine
        std::copy(s + state->offset, s + state->size, s);
    }
    state->offset = 0;   // data - to be processed - is at begin
    state->size = kept;  // this many unprocessed samples
}
// Complex counterpart of conv_float_move_rest: move the complex samples that
// the previous convolution call left unprocessed (s[offset] .. s[size-1]) to
// the start of the buffer and update `state` accordingly.
void ARCHFUNCNAME(conv_cplx_move_rest)(complexf * RESTRICT s, conv_buffer_state * RESTRICT state)
{
    const int unprocessed = state->size - state->offset;
    const int kept = (unprocessed > 0) ? unprocessed : 0;
    if (kept != 0)
    {
        // source and destination may overlap; copying forwards to a lower address is fine
        std::copy(s + state->offset, s + state->size, s);
    }
    state->offset = 0;   // data - to be processed - is at begin
    state->size = kept;  // this many unprocessed samples
}
#if defined(MIPP_NO_INTRINSICS)
// have a completely MIPP independent implementation
// #error missing HAVE_MIPP: there is no MIPP-independent implementation
// Scalar in-place convolution: for every position that still has a full
// window of sz_filter samples, replace s[pos] by sum_k s[pos+k] * filter[k].
// Starts at state->offset, advances state->offset past the processed
// positions and returns how many output samples were produced.
int ARCHFUNCNAME(conv_float_inplace)(
    float * RESTRICT s, conv_buffer_state * RESTRICT state,
    const float * RESTRICT filter, const int sz_filter
)
{
    const int first = state->offset;
    const int n_samples = state->size;
    int pos = first;
    while (pos + sz_filter <= n_samples)
    {
        float * const window = s + pos;
        float sum = 0.0F;
        for (int tap = 0; tap < sz_filter; ++tap)
            sum += window[tap] * filter[tap];
        // s[pos] is not read again by later windows, so it can be overwritten
        window[0] = sum;
        ++pos;
    }
    state->offset = pos;
    return pos - first;
}
// Scalar out-of-place convolution: for every position that still has a full
// window of sz_filter samples, write y[pos] = sum_k s[pos+k] * filter[k].
// Note that y is indexed with the same absolute position as s.
// Starts at state->offset, advances state->offset past the processed
// positions and returns how many output samples were produced.
int ARCHFUNCNAME(conv_float_oop)(
    const float * RESTRICT s, conv_buffer_state * RESTRICT state,
    const float * RESTRICT filter, const int sz_filter,
    float * RESTRICT y
)
{
    const int first = state->offset;
    const int n_samples = state->size;
    int pos = first;
    while (pos + sz_filter <= n_samples)
    {
        const float * const window = s + pos;
        float sum = 0.0F;
        for (int tap = 0; tap < sz_filter; ++tap)
            sum += window[tap] * filter[tap];
        y[pos] = sum;
        ++pos;
    }
    state->offset = pos;
    return pos - first;
}
// Scalar out-of-place convolution of complex samples with a real filter:
// for every position that still has a full window of sz_filter samples,
//   y_cplx[pos] = sum_k s_cplx[pos+k] * filter[k]
// computed separately for the real (.i) and imaginary (.q) parts.
// y_cplx is indexed with the same absolute position as s_cplx.
// Starts at state->offset, advances state->offset past the processed
// positions and returns how many output samples were produced.
int ARCHFUNCNAME(conv_cplx_float_oop)(
    const complexf * RESTRICT s_cplx, conv_buffer_state * RESTRICT state,
    const float * RESTRICT filter, const int sz_filter,
    complexf * RESTRICT y_cplx
)
{
    const int off0 = state->offset;
    const int sz_s = state->size;
    const int sz_f = sz_filter;
    int offset;
    for ( offset = off0; offset + sz_f <= sz_s; ++offset)
    {
        float accu_re = 0.0F;
        float accu_im = 0.0F;
        for (int k = 0; k < sz_filter; ++k)
        {
            // accumulate over all taps: a plain assignment here would keep
            // only the contribution of the last tap
            accu_re += s_cplx[offset+k].i * filter[k];
            accu_im += s_cplx[offset+k].q * filter[k];
        }
        y_cplx[offset].i = accu_re; // sum of real parts
        y_cplx[offset].q = accu_im; // sum of imag parts
    }
    state->offset = offset;
    return offset - off0;
}
#elif defined(HAVE_MIPP)
// SIMD (MIPP) in-place convolution: for every position that still has a full
// window of sz_filter samples, replace s[pos] by sum_k s[pos+k] * filter[k].
// sz_filter must be a multiple of the SIMD width (conv_float_simd_size()).
// `filter` is read with aligned loads, so it has to be suitably aligned.
// Starts at state->offset, advances state->offset past the processed
// positions and returns how many output samples were produced.
int ARCHFUNCNAME(conv_float_inplace)(
    float * RESTRICT s, conv_buffer_state * RESTRICT state,
    const float * RESTRICT filter, const int sz_filter
)
{
    assert( (sz_filter % mipp::N<float>()) == 0 ); // size of filter must be divisible by conv_float_simd_size()
    mipp::Reg<float> accu, rS, rH;
    const int off0 = state->offset;
    const int sz_s = state->size;
    int offset;
    for ( offset = off0; offset + sz_filter <= sz_s; ++offset)
    {
        accu.set0();
        for (int k = 0; k < sz_filter; k += mipp::N<float>())
        {
            // the window start moves one float per output sample, so
            // &s[offset+k] is generally not SIMD-aligned: unaligned load
            rS.loadu(&s[offset+k]);
            rH.load(&filter[k]);
            accu = mipp::fmadd(rS, rH, accu); // accu += rS * rH;
        }
        s[offset] = accu.sum(); // == hadd()
    }
    state->offset = offset;
    return offset - off0;
}
// SIMD (MIPP) out-of-place convolution: for every position that still has a
// full window of sz_filter samples, write y[pos] = sum_k s[pos+k] * filter[k].
// sz_filter must be a multiple of the SIMD width (conv_float_simd_size()).
// y is indexed with the same absolute position as s.
// Starts at state->offset, advances state->offset past the processed
// positions and returns how many output samples were produced.
int ARCHFUNCNAME(conv_float_oop)(
    const float * RESTRICT s, conv_buffer_state * RESTRICT state,
    const float * RESTRICT filter, const int sz_filter,
    float * RESTRICT y
)
{
    assert( (sz_filter % mipp::N<float>()) == 0 ); // size of filter must be divisible by conv_float_simd_size()
    const int first = state->offset;
    const int n_samples = state->size;
    mipp::Reg<float> acc, samples, taps;
    int pos = first;
    while (pos + sz_filter <= n_samples)
    {
        acc.set0();
        for (int k = 0; k < sz_filter; k += mipp::N<float>())
        {
            samples.loadu(&s[pos+k]);   // window start is not SIMD-aligned in general
            taps.load(&filter[k]);
            acc = mipp::fmadd(samples, taps, acc); // acc += samples * taps;
        }
        y[pos] = acc.sum(); // horizontal add of all lanes
        ++pos;
    }
    state->offset = pos;
    return pos - first;
}
// SIMD (MIPP) out-of-place convolution of complex samples with a real filter.
// The complex buffers are treated as interleaved float arrays (re, im, re, im, ..),
// so all local offsets and sizes below count floats, i.e. twice the sample counts.
// sz_filter must be a multiple of the SIMD width (conv_float_simd_size()).
// Advances state->offset (in complex samples) past the processed positions and
// returns how many complex output samples were produced.
int ARCHFUNCNAME(conv_cplx_float_oop)(
const complexf * RESTRICT s_cplx, conv_buffer_state * RESTRICT state,
const float * RESTRICT filter, const int sz_filter,
complexf * RESTRICT y_cplx
)
{
assert( (sz_filter % mipp::N<float>()) == 0 ); // size of filter must be divisible by conv_float_simd_size()
// view the complex arrays as plain float arrays
const float * RESTRICT s = &(s_cplx[0].i);
float * RESTRICT y = &(y_cplx[0].i);
mipp::Regx2<float> accu_x2, rS_x2, H_x2;
// offsets / sizes in floats: two floats per complex sample
const int off0 = 2 * state->offset;
const int sz_s = 2 * state->size;
const int sz_f2 = 2 * sz_filter;
int offset;
// step by 2 floats == one complex sample
for ( offset = off0; offset + sz_f2 <= sz_s; offset += 2)
{
accu_x2.val[0].set0();
accu_x2.val[1].set0();
for (int k = 0; k < sz_filter; k += mipp::N<float>())
{
mipp::Reg<float> rH;
// two registers worth of interleaved input floats per register of filter taps
rS_x2.loadu(&s[offset+2*k]);
rH.load(&filter[k]);
// presumably yields h0,h0,h1,h1,.. so each tap lines up with the (re, im) pair
// of its sample - confirm against the MIPP documentation
H_x2 = mipp::interleave<float>(rH, rH);
accu_x2.val[0] = mipp::fmadd(rS_x2.val[0], H_x2.val[0], accu_x2.val[0]); // accu += rS * rH;
accu_x2.val[1] = mipp::fmadd(rS_x2.val[1], H_x2.val[1], accu_x2.val[1]); // accu += rS * rH;
}
// H_x2 is reused as a temporary here: separate the real and imaginary partial sums
H_x2 = mipp::deinterleave(accu_x2);
y[offset] = H_x2.val[0].sum(); // == hadd() == sum of real parts
y[offset+1] = H_x2.val[1].sum(); // == hadd() == sum of imag parts
}
// convert the float offsets back to complex sample counts
state->offset = offset /2;
return (offset - off0) / 2;
}
#endif
// Function table of this architecture variant; the initializers follow the
// member order of struct conv_f_ptrs:
// id, using_mipp, fp_id, fp_conv_float_simd_size, fp_conv_float_move_rest,
// fp_conv_float_inplace, fp_conv_float_oop, fp_conv_cplx_move_rest, fp_conv_cplx_float_oop
static const conv_f_ptrs conv_ptrs =
{
PP_TOSTRING(CONV_ARCH_POST),
// using_mipp: 1 unless MIPP_NO_INTRINSICS is defined.
// NOTE(review): this is also 1 when HAVE_MIPP is not defined at all - confirm that is intended
#ifndef MIPP_NO_INTRINSICS
1,
#else
0,
#endif
ARCHFUNCNAME(id),
ARCHFUNCNAME(conv_float_simd_size),
// the kernels only exist when one of the two implementations above was compiled
#if defined(MIPP_NO_INTRINSICS) || defined(HAVE_MIPP)
ARCHFUNCNAME(conv_float_move_rest),
ARCHFUNCNAME(conv_float_inplace),
ARCHFUNCNAME(conv_float_oop),
ARCHFUNCNAME(conv_cplx_move_rest),
ARCHFUNCNAME(conv_cplx_float_oop)
#else
nullptr,
nullptr,
nullptr,
nullptr,
nullptr
#endif
};
// Returns the function table of this architecture variant, or nullptr when
// the variant is not usable as built:
//  - the variant named "none" always returns its table
//  - with MIPP_NO_INTRINSICS defined, the table is returned
//  - with HAVE_MIPP defined, the table is returned only if MIPP is in use and
//    the SIMD width is greater than 1
//  - otherwise nullptr is returned
const conv_f_ptrs* ARCHFUNCNAME(conv_ptrs)()
{
DPRINT("arch pointer for '%s':\n", conv_ptrs.id);
if (!strcmp(conv_ptrs.id, "none"))
return &conv_ptrs;
#if defined(MIPP_NO_INTRINSICS)
DPRINT("arch pointer for '%s' - BUT defined(MIPP_NO_INTRINSICS)\n", conv_ptrs.id);
return &conv_ptrs;
#elif defined(HAVE_MIPP)
DPRINT("arch pointer for '%s' - defined(HAVE_MIPP)\n", conv_ptrs.id);
DPRINT("'%s': conv_ptrs.using_mipp %d\n", conv_ptrs.id, conv_ptrs.using_mipp);
DPRINT("'%s': simd_size() %d\n", conv_ptrs.id, conv_ptrs.fp_conv_float_simd_size());
if (conv_ptrs.using_mipp && conv_ptrs.fp_conv_float_simd_size() > 1)
return &conv_ptrs;
else
DPRINT("arch pointer for '%s': HAVE_MIPP BUT using_mipp %d, float_simd_size %d\n", conv_ptrs.id, conv_ptrs.using_mipp, conv_ptrs.fp_conv_float_simd_size());
#else
DPRINT("arch pointer for '%s': neither MIPP_NO_INTRINSICS nor HAVE_MIPP\n", conv_ptrs.id);
#endif
// reached when the variant was rejected above
DPRINT("arch pointer for '%s' => nullptr\n", conv_ptrs.id);
return nullptr;
}
// Compile-time check: assigning the accessor to a variable of type f_conv_ptrs
// fails to compile if its signature drifts from the typedef in pf_conv.h.
// The variable itself is not referenced in the code shown here, hence [[maybe_unused]].
#if defined(__cplusplus) && (__cplusplus >= 201703L)
[[maybe_unused]]
#endif
static f_conv_ptrs test_f_ptrs = ARCHFUNCNAME(conv_ptrs);

109
pffft/pf_conv.h Normal file
View File

@@ -0,0 +1,109 @@
#pragma once
/* pf_conv.h/.cpp implements linear "slow" convolution.
* this code is primarily for test/demonstration of runtime dispatching.
* each "kernel" is compiled with different compiler/architecture options,
* that activates different implementations in the MIPP headers.
*
* the dispatcher library 'pf_conv_dispatcher' collects (links against)
* all the pf_conv_arch_<opt> libraries ..
* and provides the get_all_conv_arch_ptrs() function,
* which delivers an array of pointers to the struct (conv_f_ptrs)
* containing the function pointers for the different implementations.
*
* requirement(s):
* - installed MIPP headers
* - compiler definitions for the different architecture types:
* see CMakeLists.txt CONV_ARCH_MSVC_AMD64, CONV_ARCH_GCC_ARM32NEON, ..
* - one cmake library target pf_conv_arch_<opt> for each architecture option.
* each one gets its specific architecture/compiler options
* utilizing the target_set_cxx_arch_option() macro in the CMakeLists.txt
*/
#include "pf_cplx.h"
// RESTRICT: portable spelling of the non-standard C++ `restrict` qualifier;
// expands to nothing for compilers that are neither MSVC nor GNU-compatible.
#if defined(_MSC_VER)
# define RESTRICT __restrict
#elif defined(__GNUC__)
# define RESTRICT __restrict
#else
# define RESTRICT
#endif
// Describes which part of a sample buffer still has to be processed.
// The convolution functions advance `offset`; the move_rest functions move
// the unprocessed tail to the buffer start and reset `offset` to 0.
struct conv_buffer_state
{
int offset; // sample index where data (to process) starts
int size; // actual - or previous - size in amount of samples from buffer start (NOT offset)
};
// declare provided function pointer types
// name of the architecture variant
typedef const char * (*f_conv_id)();
// number of floats per SIMD register (1 for the non-SIMD implementation)
typedef int (*f_conv_float_simd_size)();
// move the unprocessed samples to the buffer start and update the state
typedef void (*f_conv_float_move_rest)(float * RESTRICT s, conv_buffer_state * RESTRICT state);
typedef void (*f_conv_cplx_move_rest)(complexf * RESTRICT s, conv_buffer_state * RESTRICT state);
// convolution writing the result back into s; returns the number of produced samples
typedef int (*f_conv_float_inplace)(
float * RESTRICT s, conv_buffer_state * RESTRICT state,
const float * RESTRICT filter, const int sz_filter
);
// convolution writing the result to y (indexed like s); returns the number of produced samples
typedef int (*f_conv_float_oop)(
const float * RESTRICT s, conv_buffer_state * RESTRICT state,
const float * RESTRICT filter, const int sz_filter,
float * RESTRICT y
);
// as f_conv_float_oop, for complex samples and a real-valued filter
typedef int (*f_conv_cplx_float_oop)(
const complexf * RESTRICT s, conv_buffer_state * RESTRICT state,
const float * RESTRICT filter, const int sz_filter,
complexf * RESTRICT y
);
// struct with the provided function pointers
// one instance exists per architecture variant (see pf_conv.cpp);
// the kernel pointers are nullptr when the variant was built without any implementation
struct conv_f_ptrs
{
const char * id; // name of the architecture variant, e.g. "none" or "avx2"
const int using_mipp; // 1 unless the variant was built with MIPP_NO_INTRINSICS
f_conv_id fp_id;
f_conv_float_simd_size fp_conv_float_simd_size;
f_conv_float_move_rest fp_conv_float_move_rest;
f_conv_float_inplace fp_conv_float_inplace;
f_conv_float_oop fp_conv_float_oop;
f_conv_cplx_move_rest fp_conv_cplx_move_rest;
f_conv_cplx_float_oop fp_conv_cplx_float_oop;
};
typedef const conv_f_ptrs * ptr_to_conv_f_ptrs;
// function pointer type, delivering the struct with the function pointers
typedef const conv_f_ptrs* (*f_conv_ptrs)();
// helper for systematic function names
#define CONV_FN_ARCH(FN, ARCH) FN##_##ARCH
// declare all functions - returning the structs with the function pointers
// each accessor may return nullptr when its variant is not usable (see pf_conv.cpp);
// only the accessors of the variants actually built for the target are defined
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, none)(); // = conv_ptrs_none()
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, dflt)(); // simd / mipp is activated
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, sse3)(); // = conv_ptrs_sse3()
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, sse4)();
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx)();
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx2)();
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, sse2)();
//extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx)(); // already declared
//extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx2)(); // already declared
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, neon_vfpv4)(); // for armv7l / 32-bit ARM
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, neon_rpi3_a53)();
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, neon_rpi4_a72)();
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, armv8a)(); // for aarch64

View File

@@ -0,0 +1,61 @@
#include "pf_conv_dispatcher.h"
#if 0
#include <stdio.h>
#define DPRINT(...) fprintf(stderr, __VA_ARGS__)
#else
#define DPRINT(...) do { } while (0)
#endif
#define N_DEFAULT_ARCHES 2
// 0 is "none"
// 1 "dflt"
// Returns the array of function tables for all architecture variants compiled
// into this library and, if p_num_arch is not null, stores the number of
// array entries in *p_num_arch.
// Entry 0 is the "none" variant, entry 1 the "dflt" variant, followed by the
// variants selected at compile time through the CONV_ARCH_* definition.
// The array is built on the first call and reused afterwards.
// Entries may be nullptr: each conv_ptrs_<arch>() accessor returns nullptr when
// its variant is not usable (see pf_conv.cpp), so callers have to check.
// NOTE(review): the lazy initialisation uses plain statics without any locking
// visible here - confirm that the first call cannot race.
ptr_to_conv_f_ptrs * get_all_conv_arch_ptrs(int * p_num_arch)
{
static ptr_to_conv_f_ptrs * all_arches = nullptr;
static int n_arch = 0;
if (!all_arches)
{
// the first N_DEFAULT_ARCHES slots are filled after the #if chain below
n_arch = N_DEFAULT_ARCHES;
// @TODO: runtime check if actual CPU supports specific architecture
#if defined(CONV_ARCH_GCC_AMD64)
static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+4] = {0};
DPRINT("CONV_ARCH_GCC_AMD64: sse3, sse4, avx, avx2\n");
conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, sse3)();
conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, sse4)();
conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx) ();
conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx2)();
#elif defined(CONV_ARCH_MSVC_AMD64)
static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+3] = {0};
DPRINT("CONV_ARCH_MSVC_AMD64: sse2, avx, avx2\n");
conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, sse2)();
conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx) ();
conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx2)();
#elif defined(CONV_ARCH_GCC_ARM32NEON)
static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+3] = {0};
// note: the debug message lists two variants, three are registered (neon_rpi4_a72 as well)
DPRINT("CONV_ARCH_GCC_ARM32NEON: neon_vfpv4, neon_rpi3_a53\n");
conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, neon_vfpv4)();
conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, neon_rpi3_a53)();
conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, neon_rpi4_a72)();
#elif defined(CONV_ARCH_GCC_AARCH64)
static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+1] = {0};
DPRINT("CONV_ARCH_GCC_AARCH64: -\n");
conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, armv8a)();
#else
static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES] = {0};
DPRINT("unknown CONV_ARCH: -\n");
#endif
// the two default variants are always present
conv_arch_ptrs[0] = CONV_FN_ARCH(conv_ptrs, none)();
conv_arch_ptrs[1] = CONV_FN_ARCH(conv_ptrs, dflt)();
all_arches = conv_arch_ptrs;
}
if (p_num_arch)
*p_num_arch = n_arch;
return all_arches;
}

View File

@@ -0,0 +1,6 @@
#pragma once
#include "pf_conv.h"
// Returns the array of function tables of all compiled-in architecture variants
// and stores the number of entries in *p_num_arch (if not null).
// Entries can be nullptr for variants that are not usable; check before use.
ptr_to_conv_f_ptrs * get_all_conv_arch_ptrs(int * p_num_arch);

44
pffft/pf_cplx.h Normal file
View File

@@ -0,0 +1,44 @@
/*
This software is part of pffft/pfdsp, a set of simple DSP routines.
Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
/*
_____ _
/ ____| | |
| | ___ _ __ ___ _ __ | | _____ __
| | / _ \| '_ ` _ \| '_ \| |/ _ \ \/ /
| |___| (_) | | | | | | |_) | | __/> <
\_____\___/|_| |_| |_| .__/|_|\___/_/\_\
| |
|_|
*/
/* Single-precision complex sample made of two floats, `i` followed by `q`.
 * NOTE(review): presumably i = in-phase (real) and q = quadrature (imaginary)
 * component - confirm against the users in pf_mixer.cpp. */
typedef struct complexf_s { float i; float q; } complexf;

1148
pffft/pf_mixer.cpp Normal file

File diff suppressed because it is too large Load Diff

270
pffft/pf_mixer.h Normal file
View File

@@ -0,0 +1,270 @@
/*
This software is part of pffft/pfdsp, a set of simple DSP routines.
Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <stdio.h>
#include <stdint.h>
#include "pf_cplx.h"
#ifdef __cplusplus
extern "C" {
#endif
// =================================================================================
/* Query for the SSE mixer variants declared below.
 * NOTE(review): presumably returns non-zero when the *_sse_* implementations
 * were compiled in - confirm in pf_mixer.cpp.
 * Declared with (void): in C an empty parameter list is not a prototype. */
int have_sse_shift_mixer_impl(void);
/*********************************************************************/
/**************/
/*** ALGO A ***/
/**************/
/* Shifts `input_size` complex samples from `input` into `output`.
 * NOTE(review): implementation is not visible here. Presumably `rate` is the
 * shift relative to the sample rate, and the returned float is the phase to
 * pass as `starting_phase` to the next call - confirm in pf_mixer.cpp. */
float shift_math_cc(const complexf *input, complexf* output, int input_size, float rate, float starting_phase);
/*********************************************************************/
/**************/
/*** ALGO B ***/
/**************/
/* State of the table-based shifter: a float table and its entry count.
 * The struct is passed and returned by value, so copies share `table`. */
typedef struct shift_table_data_s
{
float* table;
int table_size;
} shift_table_data_t;
/* NOTE(review): presumably releases the table created by shift_table_init()
 * - confirm in pf_mixer.cpp. */
void shift_table_deinit(shift_table_data_t table_data);
/* Creates the state for a table of `table_size` entries. */
shift_table_data_t shift_table_init(int table_size);
/* Shifts `input_size` complex samples from `input` into `output` using
 * `table_data`.
 * NOTE(review): the returned float is presumably the phase for the next
 * call's `starting_phase` - confirm. */
float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase);
/*********************************************************************/
/**************/
/*** ALGO C ***/
/**************/
/* State of the "addfast" shifter: four sine and four cosine values plus the
 * phase increment.
 * NOTE(review): what the four dsin/dcos entries hold is not visible in this
 * header - see shift_addfast_init() in pf_mixer.cpp. */
typedef struct shift_addfast_data_s
{
float dsin[4];
float dcos[4];
float phase_increment;
} shift_addfast_data_t;
/* Builds the state for the given `rate`. */
shift_addfast_data_t shift_addfast_init(float rate);
/* Out-of-place variant: reads `input`, writes `output`.
 * NOTE(review): the returned float is presumably the next `starting_phase`. */
float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase);
/* In-place variant operating on `N_cplx` complex samples in `in_out`. */
float shift_addfast_inp_c(complexf *in_out, int N_cplx, shift_addfast_data_t* d, float starting_phase);
/*********************************************************************/
/**************/
/*** ALGO D ***/
/**************/
/* State of the "unroll" shifter: two float arrays (dsin, dcos), the phase
 * increment and `size`.
 * NOTE(review): dsin/dcos are presumably heap tables of `size` entries
 * allocated by shift_unroll_init() and released by shift_unroll_deinit()
 * - confirm in pf_mixer.cpp. */
typedef struct shift_unroll_data_s
{
float* dsin;
float* dcos;
float phase_increment;
int size;
} shift_unroll_data_t;
/* Builds the state for the given `rate` and `size`. */
shift_unroll_data_t shift_unroll_init(float rate, int size);
/* Tears down a state created by shift_unroll_init(). */
void shift_unroll_deinit(shift_unroll_data_t* d);
/* Out-of-place variant: reads `input`, writes `output`.
 * NOTE(review): the returned float is presumably the next `starting_phase`. */
float shift_unroll_cc(complexf *input, complexf* output, int size, shift_unroll_data_t* d, float starting_phase);
/* In-place variant operating on `in_out`. */
float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase);
/*********************************************************************/
/**************/
/*** ALGO E ***/
/**************/
/* similar to shift_unroll_cc() - but, have fixed and limited precalc size
 * idea: smaller cache usage by table
 * size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ (= 4)
 */
/* number of entries in each of the fixed-size dcos[] / dsin[] tables */
#define PF_SHIFT_LIMITED_UNROLL_SIZE 128
/* granularity required for the `size` argument of the functions below */
#define PF_SHIFT_LIMITED_SIMD_SZ 4
/* State of the "limited unroll" shifter. Unlike shift_unroll_data_t the
 * tables are embedded in the struct, and the running phase is stored in
 * `complex_phase` instead of being passed in and returned per call. */
typedef struct shift_limited_unroll_data_s
{
float dcos[PF_SHIFT_LIMITED_UNROLL_SIZE];
float dsin[PF_SHIFT_LIMITED_UNROLL_SIZE];
complexf complex_phase;
float phase_increment;
} shift_limited_unroll_data_t;
/* Builds the state for the given `rate`. */
shift_limited_unroll_data_t shift_limited_unroll_init(float rate);
/* size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ */
/* starting_phase for next call is kept internal in state */
void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d);
void shift_limited_unroll_inp_c(complexf* in_out, int size, shift_limited_unroll_data_t* d);
/*********************************************************************/
/**************/
/*** ALGO F ***/
/**************/
/* State of SSE variant A of the "limited unroll" shifter: separate cos and
 * sin tables, each PF_SHIFT_LIMITED_SIMD_SZ entries longer than
 * PF_SHIFT_LIMITED_UNROLL_SIZE, plus the running phase kept in the state.
 * NOTE(review): alignment requirements of the arrays are not visible here. */
typedef struct shift_limited_unroll_A_sse_data_s
{
/* small/limited trig table */
float dcos[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
float dsin[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
/* 4 times complex phase */
float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
/* N_cplx_per_block times increment - for future parallel variants */
float dcos_blk;
float dsin_blk;
/* */
float phase_increment;
} shift_limited_unroll_A_sse_data_t;
/* Builds the state from a relative frequency and a start phase in radians. */
shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad);
/* In-place shift of `N_cplx` complex samples; the phase carries over in `d`. */
void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d);
/*********************************************************************/
/**************/
/*** ALGO G ***/
/**************/
/* State of SSE variant B of the "limited unroll" shifter. Differs from
 * variant A in holding one combined `dtrig` table instead of separate
 * dcos/dsin tables.
 * NOTE(review): the layout of `dtrig` is not documented in this header
 * - see pf_mixer.cpp. */
typedef struct shift_limited_unroll_B_sse_data_s
{
/* small/limited trig table */
float dtrig[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
/* 4 times complex phase */
float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
/* N_cplx_per_block times increment - for future parallel variants */
float dcos_blk;
float dsin_blk;
/* */
float phase_increment;
} shift_limited_unroll_B_sse_data_t;
/* Builds the state from a relative frequency and a start phase in radians. */
shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad);
/* In-place shift of `N_cplx` complex samples; the phase carries over in `d`. */
void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d);
/*********************************************************************/
/**************/
/*** ALGO H ***/
/**************/
/* State of SSE variant C of the "limited unroll" shifter. Uses a single
 * interleaved table twice the length of the variant A tables
 * (groups of 4 cos values followed by 4 sin values). */
typedef struct shift_limited_unroll_C_sse_data_s
{
/* small/limited trig table - interleaved: 4 cos, 4 sin, 4 cos, .. */
float dinterl_trig[2*(PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ)];
/* 4 times complex phase */
float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
/* N_cplx_per_block times increment - for future parallel variants */
float dcos_blk;
float dsin_blk;
/* */
float phase_increment;
} shift_limited_unroll_C_sse_data_t;
/* Builds the state from a relative frequency and a start phase in radians. */
shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad);
/* In-place shift of `N_cplx` complex samples; the phase carries over in `d`. */
void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d);
/*********************************************************************/
/**************/
/*** ALGO I ***/
/**************/
/* Recursive Quadrature Oscillator functions "recursive_osc"
 * see https://www.vicanek.de/articles/QuadOsc.pdf
 */
/* number of oscillator lanes held in shift_recursive_osc_t */
#define PF_SHIFT_RECURSIVE_SIMD_SZ 8
/* Oscillator state: PF_SHIFT_RECURSIVE_SIMD_SZ cos/sin value pairs. */
typedef struct shift_recursive_osc_s
{
float u_cos[PF_SHIFT_RECURSIVE_SIMD_SZ];
float v_sin[PF_SHIFT_RECURSIVE_SIMD_SZ];
} shift_recursive_osc_t;
/* Oscillator configuration: the two coefficients k1 and k2.
 * NOTE(review): presumably the recursion coefficients of the referenced
 * paper, derived from `rate` - confirm in pf_mixer.cpp. */
typedef struct shift_recursive_osc_conf_s
{
float k1;
float k2;
} shift_recursive_osc_conf_t;
/* Fills `conf` and `state` for the given rate and starting phase. */
void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state);
/* Changes the rate of an already initialised oscillator. */
void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
/* size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ */
/* NOTE(review): the line above names PF_SHIFT_LIMITED_SIMD_SZ (4), although
 * this section defines PF_SHIFT_RECURSIVE_SIMD_SZ (8) - confirm which
 * granularity the implementation actually requires. */
/* starting_phase for next call is kept internal in state */
void shift_recursive_osc_cc(const complexf *input, complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
/* NOTE(review): the "_inp_" name suggests in-place operation on `output`
 * - confirm. */
void shift_recursive_osc_inp_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
/* Takes no input buffer.
 * NOTE(review): presumably writes the generated oscillator samples to
 * `output` - confirm in pf_mixer.cpp. */
void gen_recursive_osc_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
/*********************************************************************/
/**************/
/*** ALGO J ***/
/**************/
/* number of oscillator lanes held in shift_recursive_osc_sse_t */
#define PF_SHIFT_RECURSIVE_SIMD_SSE_SZ 4
/* State of the SSE recursive oscillator: same layout as
 * shift_recursive_osc_t, but with PF_SHIFT_RECURSIVE_SIMD_SSE_SZ lanes. */
typedef struct shift_recursive_osc_sse_s
{
float u_cos[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ];
float v_sin[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ];
} shift_recursive_osc_sse_t;
/* Configuration of the SSE recursive oscillator: coefficients k1 and k2. */
typedef struct shift_recursive_osc_sse_conf_s
{
float k1;
float k2;
} shift_recursive_osc_sse_conf_t;
/* Fills `conf` and `state` for the given rate and starting phase. */
void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state);
/* Changes the rate of an already initialised oscillator. */
void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state);
/* In-place shift of `N_cplx` complex samples in `in_out`.
 * NOTE(review): any required granularity of N_cplx is not stated in this
 * header - check pf_mixer.cpp. */
void shift_recursive_osc_sse_inp_c(complexf* in_out, int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext);
#ifdef __cplusplus
}
#endif

264
pffft/pffastconv.c Normal file
View File

@@ -0,0 +1,264 @@
/*
Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de )
*/
#include "pffastconv.h"
#include "pffft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <math.h>
#include <assert.h>
#include <string.h>
/* set to 1 to print the chosen block sizes in pffastconv_new_setup() */
#define FASTCONV_DBG_OUT 0
/* detect compiler flavour and map RESTRICT to its restrict keyword */
#if defined(_MSC_VER)
#  define RESTRICT __restrict
#pragma warning( disable : 4244 4305 4204 4456 )
#elif defined(__GNUC__)
#  define RESTRICT __restrict
#else
/* unknown compiler: the qualifier is only an optimisation hint, so drop it
 * instead of leaving RESTRICT undefined (which would break compilation) */
#  define RESTRICT
#endif
/* Allocate nb_bytes through pffft_aligned_malloc(), so the buffer satisfies
 * the alignment the FFT routines expect. Release with pffastconv_free(). */
void *pffastconv_malloc(size_t nb_bytes)
{
  void *buffer = pffft_aligned_malloc(nb_bytes);
  return buffer;
}
/* Release a buffer obtained from pffastconv_malloc(); forwards the pointer
 * unchanged to pffft_aligned_free(). */
void pffastconv_free(void *p)
{
  void *buffer = p;
  pffft_aligned_free(buffer);
}
/* Report the SIMD vector size of the underlying FFT; forwards to
 * pffft_simd_size(). Defined with (void) so the definition is a real
 * prototype in C. */
int pffastconv_simd_size(void)
{
  return pffft_simd_size();
}
/* Internal state of one fast-convolution instance. All float buffers hold
 * Nfft floats and are allocated in pffastconv_new_setup(). Xt, Xf and Mf are
 * overwritten by every pffastconv_apply() call, so one setup must not be
 * used from several threads at once. */
struct PFFASTCONV_Setup
{
float * Xt; /* input == x in time domain - copy for alignment; NULL when PFFASTCONV_DIRECT_INP is set without PFFASTCONV_CPLX_INP_OUT */
float * Xf; /* input == X in freq domain */
float * Hf; /* filterCoeffs == H in freq domain */
float * Mf; /* input * filterCoeffs in freq domain */
PFFFT_Setup *st; /* real FFT setup of length Nfft */
int filterLen; /* convolution length; 2*filterLen-1 floats in single-FFT complex mode */
int Nfft; /* FFT/block length */
int flags; /* bitmask of pffastconv_flags_t given to pffastconv_new_setup() */
float scale; /* 1/Nfft, applied in the frequency-domain multiplication */
};
PFFASTCONV_Setup * pffastconv_new_setup( const float * filterCoeffs, int filterLen, int * blockLen, int flags )
{
PFFASTCONV_Setup * s = NULL;
const int cplxFactor = ( (flags & PFFASTCONV_CPLX_INP_OUT) && (flags & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 2 : 1;
const int minFftLen = 2*pffft_simd_size()*pffft_simd_size();
int i, Nfft = 2 * pffft_next_power_of_two(filterLen -1);
#if FASTCONV_DBG_OUT
const int iOldBlkLen = *blockLen;
#endif
if ( Nfft < minFftLen )
Nfft = minFftLen;
if ( flags & PFFASTCONV_CPLX_FILTER )
return NULL;
s = pffastconv_malloc( sizeof(struct PFFASTCONV_Setup) );
if ( *blockLen > Nfft ) {
Nfft = *blockLen;
Nfft = pffft_next_power_of_two(Nfft);
}
*blockLen = Nfft; /* this is in (complex) samples */
Nfft *= cplxFactor;
if ( (flags & PFFASTCONV_DIRECT_INP) && !(flags & PFFASTCONV_CPLX_INP_OUT) )
s->Xt = NULL;
else
s->Xt = pffastconv_malloc((unsigned)Nfft * sizeof(float));
s->Xf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
s->Hf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
s->Mf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
s->st = pffft_new_setup(Nfft, PFFFT_REAL); /* with complex: we do 2 x fft() */
s->filterLen = filterLen; /* filterLen == convolution length == length of impulse response */
if ( cplxFactor == 2 )
s->filterLen = 2 * filterLen - 1;
s->Nfft = Nfft; /* FFT/block length */
s->flags = flags;
s->scale = (float)( 1.0 / Nfft );
memset( s->Xt, 0, (unsigned)Nfft * sizeof(float) );
if ( flags & PFFASTCONV_CORRELATION ) {
for ( i = 0; i < filterLen; ++i )
s->Xt[ ( Nfft - cplxFactor * i ) & (Nfft -1) ] = filterCoeffs[ i ];
} else {
for ( i = 0; i < filterLen; ++i )
s->Xt[ ( Nfft - cplxFactor * i ) & (Nfft -1) ] = filterCoeffs[ filterLen - 1 - i ];
}
pffft_transform(s->st, s->Xt, s->Hf, /* tmp = */ s->Mf, PFFFT_FORWARD);
#if FASTCONV_DBG_OUT
printf("\n fastConvSetup(filterLen = %d, blockLen %d) --> blockLen %d, OutLen = %d\n"
, filterLen, iOldBlkLen, *blockLen, Nfft - filterLen +1 );
#endif
return s;
}
/* Release a setup created by pffastconv_new_setup(): the FFT setup, the
 * work buffers and the struct itself. A NULL setup is ignored. */
void pffastconv_destroy_setup( PFFASTCONV_Setup * s )
{
  if ( s != NULL )
  {
    pffft_destroy_setup(s->st);
    pffastconv_free(s->Mf);
    pffastconv_free(s->Hf);
    pffastconv_free(s->Xf);
    /* Xt is not allocated for direct real input */
    if ( s->Xt != NULL )
      pffastconv_free(s->Xt);
    pffastconv_free(s);
  }
}
/*
 * Run the fast convolution block by block over the input.
 *
 * s            : setup from pffastconv_new_setup()
 * input_       : input samples (interleaved re/im with PFFASTCONV_CPLX_INP_OUT)
 * cplxInputLen : number of input samples (complex samples for complex input)
 * output_      : receives the produced samples
 * applyFlush   : non-zero to also process a final block shorter than Nfft
 *
 * Returns the number of (complex) samples consumed from the input, which is
 * also the number of samples produced. Each block of procLen input floats
 * yields procLen - filterLen + 1 outputs, and the input offset advances by
 * exactly that amount.
 */
int pffastconv_apply(PFFASTCONV_Setup * s, const float *input_, int cplxInputLen, float *output_, int applyFlush)
{
const float * RESTRICT X = input_;
float * RESTRICT Y = output_;
const int Nfft = s->Nfft;
const int filterLen = s->filterLen;
const int flags = s->flags;
/* 2 only when complex data is processed with one single FFT over the interleaved floats */
const int cplxFactor = ( (flags & PFFASTCONV_CPLX_INP_OUT) && (flags & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 2 : 1;
/* input length counted in floats handed to one FFT */
const int inputLen = cplxFactor * cplxInputLen;
int inpOff, procLen, numOut = 0, j, part, cplxOff;
/* applyFlush != 0:
 * inputLen - inpOff -filterLen + 1 > 0
 * <=> inputLen -filterLen + 1 > inpOff
 * <=> inpOff < inputLen -filterLen + 1
 *
 * applyFlush == 0:
 * inputLen - inpOff >= Nfft
 * <=> inputLen - Nfft >= inpOff
 * <=> inpOff <= inputLen - Nfft
 * <=> inpOff < inputLen - Nfft + 1
 */
if ( cplxFactor == 2 )
{
/* single-FFT complex path: offsets and lengths are in floats */
const int maxOff = applyFlush ? (inputLen -filterLen + 1) : (inputLen - Nfft + 1);
#if 0
printf( "*** inputLen %d, filterLen %d, Nfft %d => maxOff %d\n", inputLen, filterLen, Nfft, maxOff);
#endif
for ( inpOff = 0; inpOff < maxOff; inpOff += numOut )
{
procLen = ( (inputLen - inpOff) >= Nfft ) ? Nfft : (inputLen - inpOff);
/* round down to an even float count, i.e. whole complex samples */
numOut = ( procLen - filterLen + 1 ) & ( ~1 );
/* nothing left to produce: stop instead of looping forever */
if (!numOut)
break;
#if 0
if (!inpOff)
printf("*** inpOff = %d, numOut = %d\n", inpOff, numOut);
if (inpOff + filterLen + 2 >= maxOff )
printf("*** inpOff = %d, inpOff + numOut = %d\n", inpOff, inpOff + numOut);
#endif
if ( flags & PFFASTCONV_DIRECT_INP )
{
/* transform straight from the caller's buffer (reads Nfft floats) */
pffft_transform(s->st, X + inpOff, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
}
else
{
/* copy into the internal buffer and zero-pad a short last block */
memcpy( s->Xt, X + inpOff, (unsigned)procLen * sizeof(float) );
if ( procLen < Nfft )
memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
}
/* Mf = Xf * Hf * scale */
pffft_zconvolve_no_accu(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale);
if ( flags & PFFASTCONV_DIRECT_OUT )
{
/* transform straight into the caller's buffer (writes Nfft floats) */
pffft_transform(s->st, s->Mf, Y + inpOff, s->Xf, PFFFT_BACKWARD);
}
else
{
/* only the first numOut floats of the result are copied out */
pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
memcpy( Y + inpOff, s->Xf, (unsigned)numOut * sizeof(float) );
}
}
/* convert the float offset back to complex samples */
return inpOff / cplxFactor;
}
else
{
/* real path, or complex path with one FFT per real/imag component:
 * offsets and lengths are in (complex) samples */
const int maxOff = applyFlush ? (inputLen -filterLen + 1) : (inputLen - Nfft + 1);
const int numParts = (flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1;
for ( inpOff = 0; inpOff < maxOff; inpOff += numOut )
{
procLen = ( (inputLen - inpOff) >= Nfft ) ? Nfft : (inputLen - inpOff);
numOut = procLen - filterLen + 1;
for ( part = 0; part < numParts; ++part ) /* iterate per real/imag component */
{
if ( flags & PFFASTCONV_CPLX_INP_OUT )
{
/* de-interleave one component (part 0 or 1 of each pair) into Xt */
cplxOff = 2 * inpOff + part;
for ( j = 0; j < procLen; ++j )
s->Xt[j] = X[cplxOff + 2 * j];
if ( procLen < Nfft )
memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
}
else if ( flags & PFFASTCONV_DIRECT_INP )
{
/* transform straight from the caller's buffer (reads Nfft floats) */
pffft_transform(s->st, X + inpOff, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
}
else
{
/* copy into the internal buffer and zero-pad a short last block */
memcpy( s->Xt, X + inpOff, (unsigned)procLen * sizeof(float) );
if ( procLen < Nfft )
memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
}
/* Mf = Xf * Hf * scale */
pffft_zconvolve_no_accu(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale);
if ( flags & PFFASTCONV_CPLX_INP_OUT )
{
/* re-interleave the component into the complex output */
pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
cplxOff = 2 * inpOff + part;
for ( j = 0; j < numOut; ++j )
Y[ cplxOff + 2 * j ] = s->Xf[j];
}
else if ( flags & PFFASTCONV_DIRECT_OUT )
{
/* transform straight into the caller's buffer (writes Nfft floats) */
pffft_transform(s->st, s->Mf, Y + inpOff, s->Xf, PFFFT_BACKWARD);
}
else
{
/* s->Xt is NULL here when PFFASTCONV_DIRECT_INP was given at setup;
 * it is then passed as the NULL `work` pointer of pffft_transform() */
pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
memcpy( Y + inpOff, s->Xf, (unsigned)numOut * sizeof(float) );
}
}
}
return inpOff;
}
}

171
pffft/pffastconv.h Normal file
View File

@@ -0,0 +1,171 @@
/* Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de )
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of PFFFT, PFFASTCONV, nor the names of its
sponsors or contributors may be used to endorse or promote products
derived from this Software without specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
*/
/*
PFFASTCONV : a Pretty Fast Fast Convolution
This is basically the implementation of fast convolution,
utilizing the FFT (pffft).
Restrictions:
- 1D transforms only, with 32-bit single precision.
- all (float*) pointers in the functions below are expected to
have an "simd-compatible" alignment, that is 16 bytes on x86 and
powerpc CPUs.
You can allocate such buffers with the functions
pffft_aligned_malloc / pffft_aligned_free (or with stuff like
posix_memalign..)
*/
#ifndef PFFASTCONV_H
#define PFFASTCONV_H
#include <stddef.h> /* for size_t */
#include "pffft.h"
#ifdef __cplusplus
extern "C" {
#endif
/* opaque struct holding internal stuff
this struct can't be shared by many threads as it contains
temporary data, computed within the convolution
*/
typedef struct PFFASTCONV_Setup PFFASTCONV_Setup;
/* Option bits for the 'flags' argument of pffastconv_new_setup();
 * combine them with bitwise OR. */
typedef enum {
PFFASTCONV_CPLX_INP_OUT = 1,
/* set when input and output is complex,
 * with real and imag part interleaved in both vectors.
 * input[] has inputLen complex values: 2 * inputLen floats,
 * output[] is also written with complex values.
 * without this flag, the input is interpreted as real vector
 */
PFFASTCONV_CPLX_FILTER = 2,
/* set when filterCoeffs is complex,
 * with real and imag part interleaved.
 * filterCoeffs[] has filterLen complex values: 2 * filterLen floats
 * without this flag, the filter is interpreted as real vector
 * ATTENTION: this is not implemented yet!
 * pffastconv_new_setup() returns NULL when this flag is set.
 */
PFFASTCONV_DIRECT_INP = 4,
/* set PFFASTCONV_DIRECT_INP only, when following conditions are met:
 * 1- input vector X must be aligned
 * 2- (all) inputLen <= output blockLen
 * 3- X must have minimum length of output BlockLen
 * 4- the additional samples from inputLen .. BlockLen-1
 * must contain valid small and non-NAN samples (ideally zero)
 *
 * this option is ignored when PFFASTCONV_CPLX_INP_OUT is set
 * NOTE(review): pffastconv_apply() does honour this flag when both
 * PFFASTCONV_CPLX_INP_OUT and PFFASTCONV_CPLX_SINGLE_FFT are set;
 * it is only ignored in the two-FFT complex path.
 */
PFFASTCONV_DIRECT_OUT = 8,
/* set PFFASTCONV_DIRECT_OUT only when following conditions are met:
 * 1- output vector Y must be aligned
 * 2- (all) inputLen <= output blockLen
 * 3- Y must have minimum length of output blockLen
 *
 * this option is ignored when PFFASTCONV_CPLX_INP_OUT is set
 * NOTE(review): as for PFFASTCONV_DIRECT_INP, the flag is honoured
 * when PFFASTCONV_CPLX_SINGLE_FFT is set as well.
 */
PFFASTCONV_CPLX_SINGLE_FFT = 16,
/* hint to process complex data with one single FFT;
 * default is to use 2 FFTs: one for real part, one for imag part
 * */
PFFASTCONV_SYMMETRIC = 32,
/* just informal, that filter is symmetric .. and filterLen is multiple of 8 */
PFFASTCONV_CORRELATION = 64,
/* filterCoeffs[] of pffastconv_new_setup are for correlation;
 * thus, do not flip them for the internal fft calculation
 * - as necessary for the fast convolution */
} pffastconv_flags_t;
/*
prepare for performing fast convolution(s) of 'filterLen' with input 'blockLen'.
The output 'blockLen' might be bigger to allow the fast convolution.
'blockLen' is an in/out parameter: the block length actually used is
written back through the pointer.
'flags' are bitmask over the 'pffastconv_flags_t' enum.
PFFASTCONV_Setup structure can't be shared across multiple filters
or concurrent threads.
returns NULL when PFFASTCONV_CPLX_FILTER is set (not implemented).
*/
PFFASTCONV_Setup * pffastconv_new_setup( const float * filterCoeffs, int filterLen, int * blockLen, int flags );
/* release a setup created by pffastconv_new_setup(); NULL is ignored */
void pffastconv_destroy_setup(PFFASTCONV_Setup *);
/*
Perform the fast convolution.
'input' and 'output' don't need to be aligned - unless any of
PFFASTCONV_DIRECT_INP or PFFASTCONV_DIRECT_OUT is set in 'flags'.
inputLen > output 'blockLen' (from pffastconv_new_setup()) is allowed.
in this case, multiple FFTs are called internally, to process the
input[].
with PFFASTCONV_CPLX_INP_OUT, 'inputLen' counts complex samples.
'output' vector must have size >= (inputLen - filterLen + 1)
set bool option 'applyFlush' to process the full input[].
with this option, 'tail samples' of input are also processed.
This might be inefficient, because the FFT is called to produce
fewer output samples than possible.
This option is useful to process the last samples of an input (file)
or to reduce latency.
return value is the number of produced samples in output[].
the same amount of samples is processed from input[]. to continue
processing, the caller must save/move the remaining samples of
input[].
*/
int pffastconv_apply(PFFASTCONV_Setup * s, const float *input, int inputLen, float *output, int applyFlush);
/* allocate / release buffers; these forward to pffft_aligned_malloc() and
 * pffft_aligned_free() */
void *pffastconv_malloc(size_t nb_bytes);
void pffastconv_free(void *);
/* return 4 or 1 depending on whether support for SSE/Altivec instructions
 * was enabled when building pffft.c.
 * declared with (void): an empty parameter list is not a prototype in C */
int pffastconv_simd_size(void);
#ifdef __cplusplus
}
#endif
#endif /* PFFASTCONV_H */

134
pffft/pffft.c Normal file
View File

@@ -0,0 +1,134 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
Based on original fortran 77 code from FFTPACKv4 from NETLIB
(http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
of NCAR, in 1985.
As confirmed by the NCAR fftpack software curators, the following
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
released under the same terms.
FFTPACK license:
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
Copyright (c) 2004 the University Corporation for Atmospheric
Research ("UCAR"). All rights reserved. Developed by NCAR's
Computational and Information Systems Laboratory, UCAR,
www.cisl.ucar.edu.
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
PFFFT : a Pretty Fast FFT.
This file is largely based on the original FFTPACK implementation, modified in
order to take advantage of SIMD instructions of modern CPUs.
*/
/*
ChangeLog:
- 2011/10/02, version 1: This is the very first release of this file.
*/
#include "pffft.h"
/* detect compiler flavour */
/* Exactly one of COMPILER_MSVC / COMPILER_GCC is defined for the compilers
 * recognised here; for any other compiler neither is defined, and the helper
 * macros further down in this file stay undefined. */
#if defined(_MSC_VER)
# define COMPILER_MSVC
#elif defined(__GNUC__)
# define COMPILER_GCC
#endif
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <math.h>
#include <assert.h>
/* Compiler-specific helpers:
 *   ALWAYS_INLINE(type) / NEVER_INLINE(type) - function return type plus
 *       the force-inline / no-inline decoration
 *   RESTRICT            - the compiler's restrict qualifier
 *   VLA_ARRAY_ON_STACK  - declares a stack array of runtime size: a C99
 *       variable-length array on GCC, an _alloca() pointer on MSVC
 * Note that the GCC form of VLA_ARRAY_ON_STACK already ends in ';' while the
 * MSVC form does not. */
#if defined(COMPILER_GCC)
# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
# define RESTRICT __restrict
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
#elif defined(COMPILER_MSVC)
# define ALWAYS_INLINE(return_type) __forceinline return_type
# define NEVER_INLINE(return_type) __declspec(noinline) return_type
# define RESTRICT __restrict
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
#endif
/* silence MSVC warnings 4244, 4305, 4204 and 4456 for this translation unit */
#ifdef COMPILER_MSVC
#pragma warning( disable : 4244 4305 4204 4456 )
#endif
/*
vector support macros: the rest of the code is independent of
SSE/Altivec/NEON -- adding support for other platforms with 4-element
vectors should be limited to these macros
*/
#include "simd/pf_float.h"
/* have code comparable with this definition */
/* Map the generic names used by pffft_priv_impl.h onto the single-precision
 * public API (pffft_*) and the float math functions, then include that
 * implementation header.
 * NOTE(review): presumably the same implementation header is instantiated
 * with other mappings elsewhere (e.g. for double precision) - confirm. */
#define SETUP_STRUCT PFFFT_Setup
#define FUNC_NEW_SETUP pffft_new_setup
#define FUNC_DESTROY pffft_destroy_setup
#define FUNC_TRANSFORM_UNORDRD pffft_transform
#define FUNC_TRANSFORM_ORDERED pffft_transform_ordered
#define FUNC_ZREORDER pffft_zreorder
#define FUNC_ZCONVOLVE_ACCUMULATE pffft_zconvolve_accumulate
#define FUNC_ZCONVOLVE_NO_ACCU pffft_zconvolve_no_accu
#define FUNC_ALIGNED_MALLOC pffft_aligned_malloc
#define FUNC_ALIGNED_FREE pffft_aligned_free
#define FUNC_SIMD_SIZE pffft_simd_size
#define FUNC_MIN_FFT_SIZE pffft_min_fft_size
#define FUNC_IS_VALID_SIZE pffft_is_valid_size
#define FUNC_NEAREST_SIZE pffft_nearest_transform_size
#define FUNC_SIMD_ARCH pffft_simd_arch
#define FUNC_VALIDATE_SIMD_A validate_pffft_simd
#define FUNC_VALIDATE_SIMD_EX validate_pffft_simd_ex
#define FUNC_CPLX_FINALIZE pffft_cplx_finalize
#define FUNC_CPLX_PREPROCESS pffft_cplx_preprocess
#define FUNC_REAL_PREPROCESS_4X4 pffft_real_preprocess_4x4
#define FUNC_REAL_PREPROCESS pffft_real_preprocess
#define FUNC_REAL_FINALIZE_4X4 pffft_real_finalize_4x4
#define FUNC_REAL_FINALIZE pffft_real_finalize
#define FUNC_TRANSFORM_INTERNAL pffft_transform_internal
/* single-precision trigonometric functions */
#define FUNC_COS cosf
#define FUNC_SIN sinf
#include "pffft_priv_impl.h"

241
pffft/pffft.h Normal file
View File

@@ -0,0 +1,241 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Based on original fortran 77 code from FFTPACKv4 from NETLIB,
authored by Dr Paul Swarztrauber of NCAR, in 1985.
As confirmed by the NCAR fftpack software curators, the following
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
released under the same terms.
FFTPACK license:
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
Copyright (c) 2004 the University Corporation for Atmospheric
Research ("UCAR"). All rights reserved. Developed by NCAR's
Computational and Information Systems Laboratory, UCAR,
www.cisl.ucar.edu.
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
*/
/*
PFFFT : a Pretty Fast FFT.
This is basically an adaptation of the single precision fftpack
(v4) as found on netlib taking advantage of SIMD instruction found
on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
For architectures where no SIMD instruction is available, the code
falls back to a scalar version.
Restrictions:
- 1D transforms only, with 32-bit single precision.
- supports only transforms for inputs of length N of the form
N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
144, 160, etc are all acceptable lengths). Performance is best for
128<=N<=8192.
- all (float*) pointers in the functions below are expected to
have an "simd-compatible" alignment, that is 16 bytes on x86 and
powerpc CPUs.
You can allocate such buffers with the functions
pffft_aligned_malloc / pffft_aligned_free (or with stuff like
posix_memalign..)
*/
#ifndef PFFFT_H
#define PFFFT_H
#include <stddef.h> /* for size_t */
#ifdef __cplusplus
extern "C" {
#endif
/* opaque struct holding internal stuff (precomputed twiddle factors)
this struct can be shared by many threads as it contains only
read-only data.
*/
typedef struct PFFFT_Setup PFFFT_Setup;
/* The guard keeps the enums from being defined twice when another header
 * protected by the same PFFFT_COMMON_ENUMS macro is included as well. */
#ifndef PFFFT_COMMON_ENUMS
#define PFFFT_COMMON_ENUMS
/* direction of the transform */
typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
/* type of transform */
typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
#endif
/*
prepare for performing transforms of size N -- the returned
PFFFT_Setup structure is read-only so it can safely be shared by
multiple concurrent threads.
N must satisfy the length restrictions listed at the top of this file.
*/
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
/* release a setup obtained from pffft_new_setup() */
void pffft_destroy_setup(PFFFT_Setup *);
/*
Perform a Fourier transform. The z-domain data is stored in the
most efficient order for transforming it back, or using it for
convolution. If you need to have its content sorted in the
"usual" way, that is as an array of interleaved complex numbers,
either use pffft_transform_ordered, or call pffft_zreorder after
the forward fft, and before the backward fft.
Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
Typically you will want to scale the backward transform by 1/N.
The 'work' pointer should point to an area of N (2*N for complex
fft) floats, properly aligned. If 'work' is NULL, then stack will
be used instead (this is probably the best strategy for small
FFTs, say for N < 16384). Threads usually have a small stack, which
may not provide enough memory for this, usually leading to a crash!
Use the heap with pffft_aligned_malloc() in this case.
For a real forward transform (PFFFT_REAL | PFFFT_FORWARD) with real
input with input(=transformation) length N, the output array is
'mostly' complex:
index k in 1 .. N/2 -1 corresponds to frequency k * Samplerate / N
index k == 0 is a special case:
the real() part contains the result for the DC frequency 0,
the imag() part contains the result for the Nyquist frequency Samplerate/2
both 0-frequency and half frequency components, which are real,
are assembled in the first entry as F(0)+i*F(N/2).
With the output size of N/2 complex values (=N real/imag values), it is
obvious that the results for negative frequencies are not output,
because of symmetry.
input and output may alias.
*/
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
/*
Similar to pffft_transform, but makes sure that the output is
ordered as expected (interleaved complex numbers). This is
similar to calling pffft_transform and then pffft_zreorder.
input and output may alias.
*/
void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
/*
call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
PFFFT_FORWARD) if you want to have the frequency components in
the correct "canonical" order, as interleaved complex numbers.
(for real transforms, both 0-frequency and half frequency
components, which are real, are assembled in the first entry as
F(0)+i*F(n/2+1). Note that the original fftpack did place
F(n/2+1) at the end of the arrays).
input and output should not alias.
*/
void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
/*
Perform a multiplication of the frequency components of dft_a and
dft_b and accumulate them into dft_ab. The arrays should have
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
*not* have been reordered with pffft_zreorder (otherwise just
perform the operation yourself as the dft coefs are stored as
interleaved complex numbers).
the operation performed is: dft_ab += (dft_a * dft_b)*scaling
The dft_a, dft_b and dft_ab pointers may alias.
*/
void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
/*
Perform a multiplication of the frequency components of dft_a and
dft_b and put result in dft_ab. The arrays should have
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
*not* have been reordered with pffft_zreorder (otherwise just
perform the operation yourself as the dft coefs are stored as
interleaved complex numbers).
the operation performed is: dft_ab = (dft_a * dft_b)*scaling
The dft_a, dft_b and dft_ab pointers may alias.
*/
void pffft_zconvolve_no_accu(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
/* return 4 or 1 depending on whether support for SSE/NEON/Altivec instructions was enabled when building pffft.c */
int pffft_simd_size();
/* return string identifier of used architecture (SSE/NEON/Altivec/..) */
const char * pffft_simd_arch();
/* following functions are identical to the pffftd_ functions */
/* simple helper to get minimum possible fft size */
int pffft_min_fft_size(pffft_transform_t transform);
/* simple helper to determine next power of 2
- without inexact/rounding floating point operations
*/
int pffft_next_power_of_two(int N);
/* simple helper to determine if power of 2 - returns bool */
int pffft_is_power_of_two(int N);
/* simple helper to determine size N is valid
- factorizable to pffft_min_fft_size() with factors 2, 3, 5
returns bool
*/
int pffft_is_valid_size(int N, pffft_transform_t cplx);
/* determine nearest valid transform size (by brute-force testing)
- factorizable to pffft_min_fft_size() with factors 2, 3, 5.
higher: bool-flag to find nearest higher value; else lower.
*/
int pffft_nearest_transform_size(int N, pffft_transform_t cplx, int higher);
/*
the float buffers must have the correct alignment (16-byte boundary
on intel and powerpc). This function may be used to obtain such
correctly aligned buffers.
*/
void *pffft_aligned_malloc(size_t nb_bytes);
void pffft_aligned_free(void *);
#ifdef __cplusplus
}
#endif
#endif /* PFFFT_H */

1060
pffft/pffft.hpp Normal file

File diff suppressed because it is too large Load Diff

53
pffft/pffft_common.c Normal file
View File

@@ -0,0 +1,53 @@
#include "pffft.h"
#include <stdlib.h>
/* SSE and co like 16-bytes aligned pointers
 * with a 64-byte alignment, we are even aligned on L2 cache lines... */
#define MALLOC_V4SF_ALIGNMENT 64

/*
 * Allocate nb_bytes whose start address is a multiple of
 * MALLOC_V4SF_ALIGNMENT.
 *
 * The block is over-allocated by MALLOC_V4SF_ALIGNMENT bytes; the pointer
 * returned by malloc() is stored in the pointer-sized slot directly below
 * the aligned address so that it can be recovered when freeing.
 *
 * Returns a null pointer when malloc() fails or when the padded size
 * cannot be represented in a size_t.
 */
static void * Valigned_malloc(size_t nb_bytes) {
  void *p, *p0;
  /* nb_bytes + MALLOC_V4SF_ALIGNMENT must not wrap around: a wrapped size
   * would make malloc() hand out a block far smaller than requested */
  if (nb_bytes > (size_t) -1 - MALLOC_V4SF_ALIGNMENT) return (void *) 0;
  p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
  if (!p0) return (void *) 0;
  /* advance by one full alignment unit, then round down to the boundary */
  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
  /* remember the address that has to be handed back to free() */
  *((void **) p - 1) = p0;
  return p;
}
/*
 * Release a buffer whose malloc() address is stored in the pointer-sized
 * slot directly below p. A null pointer is ignored.
 */
static void Valigned_free(void *p) {
  void **slot;
  if (!p) return;
  /* step back one pointer to reach the saved malloc() address */
  slot = (void **) p - 1;
  free(*slot);
}
/*
 * Round N up to the next power of two; a value that already is a power
 * of two is returned unchanged.
 * https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
 */
static int next_power_of_two(int N) {
  /* work on N-1 so that exact powers of two are not bumped to the next one */
  unsigned bits = (unsigned) N - 1u;
  unsigned shift;
  /* smear the highest set bit into every lower position (shifts 1,2,4,8,16) */
  for (shift = 1u; shift <= 16u; shift <<= 1)
    bits |= bits >> shift;
  /* all-ones below the top bit, plus one, is the power of two we want */
  return (int) (bits + 1u);
}
/*
 * Return 1 when N is a positive power of two, 0 otherwise.
 * https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2
 */
static int is_power_of_two(int N) {
  /* Require N > 0 before evaluating N - 1: for the most negative int the
   * subtraction overflows (undefined behaviour), and with wrap-around the
   * bit test would report that value as a power of two. */
  return N > 0 && !(N & (N - 1));
}
/* pffft_ entry points: each one forwards to the static helper of this file */

void *pffft_aligned_malloc(size_t nb_bytes)
{
  return Valigned_malloc(nb_bytes);
}

void pffft_aligned_free(void *p)
{
  Valigned_free(p);
}

int pffft_next_power_of_two(int N)
{
  return next_power_of_two(N);
}

int pffft_is_power_of_two(int N)
{
  return is_power_of_two(N);
}
/* pffftd_ entry points: each one forwards to the static helper of this file */

void *pffftd_aligned_malloc(size_t nb_bytes)
{
  return Valigned_malloc(nb_bytes);
}

void pffftd_aligned_free(void *p)
{
  Valigned_free(p);
}

int pffftd_next_power_of_two(int N)
{
  return next_power_of_two(N);
}

int pffftd_is_power_of_two(int N)
{
  return is_power_of_two(N);
}

147
pffft/pffft_double.c Normal file
View File

@@ -0,0 +1,147 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
Based on original fortran 77 code from FFTPACKv4 from NETLIB
(http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
of NCAR, in 1985.
As confirmed by the NCAR fftpack software curators, the following
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
released under the same terms.
FFTPACK license:
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
Copyright (c) 2004 the University Corporation for Atmospheric
Research ("UCAR"). All rights reserved. Developed by NCAR's
Computational and Information Systems Laboratory, UCAR,
www.cisl.ucar.edu.
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
PFFFT : a Pretty Fast FFT.
This file is largely based on the original FFTPACK implementation, modified in
order to take advantage of SIMD instructions of modern CPUs.
*/
/*
NOTE: This file is adapted from Julien Pommier's original PFFFT,
which works on 32 bit floating point precision using SSE instructions,
to work with 64 bit floating point precision using AVX instructions.
Author: Dario Mambro @ https://github.com/unevens/pffft
*/
#include "pffft_double.h"
/* detect compiler flavour */
#if defined(_MSC_VER)
# define COMPILER_MSVC
#elif defined(__GNUC__)
# define COMPILER_GCC
#endif
#ifdef COMPILER_MSVC
# define _USE_MATH_DEFINES
# include <malloc.h>
#elif defined(__MINGW32__) || defined(__MINGW64__)
# include <malloc.h>
#else
# include <alloca.h>
#endif
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <math.h>
#include <assert.h>
/*
  Compiler-specific helper macros. They are only defined when COMPILER_GCC
  or COMPILER_MSVC was detected above; there is no fallback branch for
  other compilers.
*/
#if defined(COMPILER_GCC)
/* wrap a function's return type to force / forbid inlining of that function */
# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
# define RESTRICT __restrict
/* declares a variable length array; NOTE: this variant already ends with ';',
   the MSVC variant below does not */
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
#elif defined(COMPILER_MSVC)
/* wrap a function's return type to force / forbid inlining of that function */
# define ALWAYS_INLINE(return_type) __forceinline return_type
# define NEVER_INLINE(return_type) __declspec(noinline) return_type
# define RESTRICT __restrict
/* no variable length arrays here: declares a pointer initialised from _alloca() */
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
#endif
#ifdef COMPILER_MSVC
#pragma warning( disable : 4244 4305 4204 4456 )
#endif
/*
vector support macros: the rest of the code is independent of
AVX -- adding support for other platforms with 4-element
vectors should be limited to these macros
*/
#include "simd/pf_double.h"
/* have code comparable with this definition */
#define float double
#define SETUP_STRUCT PFFFTD_Setup
#define FUNC_NEW_SETUP pffftd_new_setup
#define FUNC_DESTROY pffftd_destroy_setup
#define FUNC_TRANSFORM_UNORDRD pffftd_transform
#define FUNC_TRANSFORM_ORDERED pffftd_transform_ordered
#define FUNC_ZREORDER pffftd_zreorder
#define FUNC_ZCONVOLVE_ACCUMULATE pffftd_zconvolve_accumulate
#define FUNC_ZCONVOLVE_NO_ACCU pffftd_zconvolve_no_accu
#define FUNC_ALIGNED_MALLOC pffftd_aligned_malloc
#define FUNC_ALIGNED_FREE pffftd_aligned_free
#define FUNC_SIMD_SIZE pffftd_simd_size
#define FUNC_MIN_FFT_SIZE pffftd_min_fft_size
#define FUNC_IS_VALID_SIZE pffftd_is_valid_size
#define FUNC_NEAREST_SIZE pffftd_nearest_transform_size
#define FUNC_SIMD_ARCH pffftd_simd_arch
#define FUNC_VALIDATE_SIMD_A validate_pffftd_simd
#define FUNC_VALIDATE_SIMD_EX validate_pffftd_simd_ex
#define FUNC_CPLX_FINALIZE pffftd_cplx_finalize
#define FUNC_CPLX_PREPROCESS pffftd_cplx_preprocess
#define FUNC_REAL_PREPROCESS_4X4 pffftd_real_preprocess_4x4
#define FUNC_REAL_PREPROCESS pffftd_real_preprocess
#define FUNC_REAL_FINALIZE_4X4 pffftd_real_finalize_4x4
#define FUNC_REAL_FINALIZE pffftd_real_finalize
#define FUNC_TRANSFORM_INTERNAL pffftd_transform_internal
#define FUNC_COS cos
#define FUNC_SIN sin
#include "pffft_priv_impl.h"

236
pffft/pffft_double.h Normal file
View File

@@ -0,0 +1,236 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Based on original fortran 77 code from FFTPACKv4 from NETLIB,
authored by Dr Paul Swarztrauber of NCAR, in 1985.
As confirmed by the NCAR fftpack software curators, the following
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
released under the same terms.
FFTPACK license:
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
Copyright (c) 2004 the University Corporation for Atmospheric
Research ("UCAR"). All rights reserved. Developed by NCAR's
Computational and Information Systems Laboratory, UCAR,
www.cisl.ucar.edu.
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
*/
/*
NOTE: This file is adapted from Julien Pommier's original PFFFT,
which works on 32 bit floating point precision using SSE instructions,
to work with 64 bit floating point precision using AVX instructions.
Author: Dario Mambro @ https://github.com/unevens/pffft
*/
/*
PFFFT : a Pretty Fast FFT.
This is basically an adaptation of the single precision fftpack
(v4) as found on netlib taking advantage of SIMD instruction found
on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
For architectures where no SIMD instruction is available, the code
falls back to a scalar version.
Restrictions:
- 1D transforms only, with 64-bit double precision.
- supports only transforms for inputs of length N of the form
N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
144, 160, etc are all acceptable lengths). Performance is best for
128<=N<=8192.
- all (double*) pointers in the functions below are expected to
have a "simd-compatible" alignment, that is 32 bytes on x86 and
powerpc CPUs.
You can allocate such buffers with the functions
pffft_aligned_malloc / pffft_aligned_free (or with stuff like
posix_memalign..)
*/
#ifndef PFFFT_DOUBLE_H
#define PFFFT_DOUBLE_H
#include <stddef.h> /* for size_t */
#ifdef __cplusplus
extern "C" {
#endif
/* opaque struct holding internal stuff (precomputed twiddle factors)
this struct can be shared by many threads as it contains only
read-only data.
*/
typedef struct PFFFTD_Setup PFFFTD_Setup;
#ifndef PFFFT_COMMON_ENUMS
#define PFFFT_COMMON_ENUMS
/* direction of the transform */
typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
/* type of transform */
typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
#endif
/*
prepare for performing transforms of size N -- the returned
PFFFTD_Setup structure is read-only so it can safely be shared by
multiple concurrent threads.
*/
PFFFTD_Setup *pffftd_new_setup(int N, pffft_transform_t transform);
void pffftd_destroy_setup(PFFFTD_Setup *);
/*
Perform a Fourier transform , The z-domain data is stored in the
most efficient order for transforming it back, or using it for
convolution. If you need to have its content sorted in the
"usual" way, that is as an array of interleaved complex numbers,
either use pffft_transform_ordered , or call pffft_zreorder after
the forward fft, and before the backward fft.
Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
Typically you will want to scale the backward transform by 1/N.
The 'work' pointer should point to an area of N (2*N for complex
fft) doubles, properly aligned. If 'work' is NULL, then stack will
be used instead (this is probably the best strategy for small
FFTs, say for N < 16384). Threads usually have a small stack, so
there may not be enough stack memory, which usually leads to a crash!
Use the heap with pffft_aligned_malloc() in this case.
input and output may alias.
*/
void pffftd_transform(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction);
/*
Similar to pffft_transform, but makes sure that the output is
ordered as expected (interleaved complex numbers). This is
similar to calling pffft_transform and then pffft_zreorder.
input and output may alias.
*/
void pffftd_transform_ordered(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction);
/*
call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
PFFFT_FORWARD) if you want to have the frequency components in
the correct "canonical" order, as interleaved complex numbers.
(for real transforms, both 0-frequency and half frequency
components, which are real, are assembled in the first entry as
F(0)+i*F(n/2+1). Note that the original fftpack did place
F(n/2+1) at the end of the arrays).
input and output should not alias.
*/
void pffftd_zreorder(PFFFTD_Setup *setup, const double *input, double *output, pffft_direction_t direction);
/*
Perform a multiplication of the frequency components of dft_a and
dft_b and accumulate them into dft_ab. The arrays should have
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
*not* have been reordered with pffft_zreorder (otherwise just
perform the operation yourself as the dft coefs are stored as
interleaved complex numbers).
the operation performed is: dft_ab += (dft_a * dft_b)*scaling
The dft_a, dft_b and dft_ab pointers may alias.
*/
void pffftd_zconvolve_accumulate(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double *dft_ab, double scaling);
/*
Perform a multiplication of the frequency components of dft_a and
dft_b and put result in dft_ab. The arrays should have
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
*not* have been reordered with pffft_zreorder (otherwise just
perform the operation yourself as the dft coefs are stored as
interleaved complex numbers).
the operation performed is: dft_ab = (dft_a * dft_b)*scaling
The dft_a, dft_b and dft_ab pointers may alias.
*/
void pffftd_zconvolve_no_accu(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double*dft_ab, double scaling);
/* return 4 or 1 depending on whether support for AVX instructions was enabled when building pffft_double.c */
int pffftd_simd_size();
/* return string identifier of used architecture (AVX/..) */
const char * pffftd_simd_arch();
/* simple helper to get minimum possible fft size */
int pffftd_min_fft_size(pffft_transform_t transform);
/* simple helper to determine size N is valid
- factorizable to pffft_min_fft_size() with factors 2, 3, 5
*/
int pffftd_is_valid_size(int N, pffft_transform_t cplx);
/* determine nearest valid transform size (by brute-force testing)
- factorizable to pffft_min_fft_size() with factors 2, 3, 5.
higher: bool-flag to find nearest higher value; else lower.
*/
int pffftd_nearest_transform_size(int N, pffft_transform_t cplx, int higher);
/* following functions are identical to the pffft_ functions - both declared */
/* simple helper to determine next power of 2
- without inexact/rounding floating point operations
*/
int pffftd_next_power_of_two(int N);
int pffft_next_power_of_two(int N);
/* simple helper to determine if power of 2 - returns bool */
int pffftd_is_power_of_two(int N);
int pffft_is_power_of_two(int N);
/*
the double buffers must have the correct alignment (32-byte boundary
on intel and powerpc). This function may be used to obtain such
correctly aligned buffers.
*/
void *pffftd_aligned_malloc(size_t nb_bytes);
void *pffft_aligned_malloc(size_t nb_bytes);
void pffftd_aligned_free(void *);
void pffft_aligned_free(void *);
#ifdef __cplusplus
}
#endif
#endif /* PFFFT_DOUBLE_H */

2233
pffft/pffft_priv_impl.h Normal file

File diff suppressed because it is too large Load Diff

50
pffft/plots.sh Executable file
View File

@@ -0,0 +1,50 @@
#!/bin/bash
# Plot every CSV file matching *-4-*.csv or *-6-*.csv in the current
# directory with gnuplot, one chart per file.

# "1": write the chart to <name>.png; anything else: run gnuplot with --persist
OUTPNG="1"
# PNG size in pixels
W="1024"
H="768"
# PTS and LWS are only referenced by the disabled line-style loop further down
PTS="20"
LWS="20"

# Iterate over the glob results directly instead of parsing the output of ls,
# so file names containing whitespace stay intact. With nullglob a pattern
# that matches nothing simply contributes no iteration.
shopt -s nullglob

for f in *-4-*.csv *-6-*.csv; do
  b=$(basename "$f" ".csv")
  #echo $b
  # number of commas in the header line; used as the last column to plot
  LASTCOL="$(head -n 1 "$f" |sed 's/,/,\n/g' |grep -c ',')"
  echo "${b}: last column is $LASTCOL"

  # choose the Y axis label from the marker contained in the file name
  if [[ "$b" == *-1-* ]]; then
    YL="duration in ms; less is better"
  elif [[ "$b" == *-4-* ]]; then
    YL="duration relative to pffft; less is better"
  else
    YL=""
  fi

  # E accumulates the gnuplot commands, separated by ' ; '
  E=""
  if [ "${OUTPNG}" = "1" ]; then
    E="set terminal png size $W,$H"
    E="${E} ; set output '${b}.png'"
  fi
  if [ -z "${E}" ]; then
    E="set key outside"
  else
    E="${E} ; set key outside"
  fi
  E="${E} ; set datafile separator ','"
  E="${E} ; set title '${b}'"
  E="${E} ; set xlabel 'fft order: fft size N = 2\\^order'"
  if [ -n "${YL}" ]; then
    #echo " setting Y label to ${YL}"
    E="${E} ; set ylabel '${YL}'"
  fi
  # unfortunately no effect for
  #for LNO in $(seq 1 ${LASTCOL}) ; do
  #  E="${E} ; set style line ${LNO} ps ${PTS} lw ${LWS}"
  #done
  # column 2 is the x axis; columns 3..LASTCOL are drawn as one line each
  E="${E} ; plot for [col=3:${LASTCOL}] '${f}' using 2:col with lines title columnhead"

  if [ "${OUTPNG}" = "1" ]; then
    gnuplot -e "${E}"
  else
    gnuplot -e "${E}" --persist
  fi
done

View File

@@ -0,0 +1,81 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
*/
#ifndef PF_ALTIVEC_FLT_H
#define PF_ALTIVEC_FLT_H
/*
Altivec support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
#pragma message( __FILE__ ": ALTIVEC float macros are defined" )
typedef vector float v4sf;
# define SIMD_SZ 4
typedef union v4sf_union {
v4sf v;
float f[SIMD_SZ];
} v4sf_union;
# define VREQUIRES_ALIGN 1 /* not sure, if really required */
# define VARCH "ALTIVEC"
# define VZERO() ((vector float) vec_splat_u8(0))
# define VMUL(a,b) vec_madd(a,b, VZERO())
# define VADD(a,b) vec_add(a,b)
# define VMADD(a,b,c) vec_madd(a,b,c)
# define VSUB(a,b) vec_sub(a,b)
/*
  Load the float at p and replicate it into all lanes of a vector
  (vec_splat of element 0 after the vec_lde / vec_perm / vec_lvsl sequence).
  Declared 'static inline' because this definition lives in a header: a plain
  'inline' definition provides no external definition in C99, which can end
  in an unresolved symbol whenever the call is not actually inlined.
*/
static inline v4sf ld_ps1(const float *p) {
  v4sf v = vec_lde(0, p);
  return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0);
}
# define LD_PS1(p) ld_ps1(&p)
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
}
# define VTRANSPOSE4(x0,x1,x2,x3) { \
v4sf y0 = vec_mergeh(x0, x2); \
v4sf y1 = vec_mergel(x0, x2); \
v4sf y2 = vec_mergeh(x1, x3); \
v4sf y3 = vec_mergel(x1, x3); \
x0 = vec_mergeh(y0, y2); \
x1 = vec_mergel(y0, y2); \
x2 = vec_mergeh(y1, y3); \
x3 = vec_mergel(y1, y3); \
}
# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
#endif
#endif /* PF_ALTIVEC_FLT_H */

145
pffft/simd/pf_avx_double.h Normal file
View File

@@ -0,0 +1,145 @@
/*
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
*/
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
*/
#ifndef PF_AVX_DBL_H
#define PF_AVX_DBL_H
/*
vector support macros: the rest of the code is independent of
AVX -- adding support for other platforms with 4-element
vectors should be limited to these macros
*/
/*
AVX support macros
*/
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && defined(__AVX__)
#pragma message( __FILE__ ": AVX macros are defined" )
#include <immintrin.h>
typedef __m256d v4sf;
/* 4 doubles by simd vector */
# define SIMD_SZ 4
typedef union v4sf_union {
v4sf v;
double f[SIMD_SZ];
} v4sf_union;
# define VARCH "AVX"
# define VREQUIRES_ALIGN 1
# define VZERO() _mm256_setzero_pd()
# define VMUL(a,b) _mm256_mul_pd(a,b)
# define VADD(a,b) _mm256_add_pd(a,b)
# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
# define VSUB(a,b) _mm256_sub_pd(a,b)
# define LD_PS1(p) _mm256_set1_pd(p)
# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr)
# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr)
/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
out1 = [ in1[0], in2[0], in1[1], in2[1] ]
out2 = [ in1[2], in2[2], in1[3], in2[3] ]
*/
# define INTERLEAVE2(in1, in2, out1, out2) { \
__m128d low1__ = _mm256_castpd256_pd128(in1); \
__m128d low2__ = _mm256_castpd256_pd128(in2); \
__m128d high1__ = _mm256_extractf128_pd(in1, 1); \
__m128d high2__ = _mm256_extractf128_pd(in2, 1); \
__m256d tmp__ = _mm256_insertf128_pd( \
_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \
_mm_shuffle_pd(low1__, low2__, 3), \
1); \
out2 = _mm256_insertf128_pd( \
_mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \
_mm_shuffle_pd(high1__, high2__, 3), \
1); \
out1 = tmp__; \
}
/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
out1 = [ in1[0], in1[2], in2[0], in2[2] ]
out2 = [ in1[1], in1[3], in2[1], in2[3] ]
*/
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
__m128d low1__ = _mm256_castpd256_pd128(in1); \
__m128d low2__ = _mm256_castpd256_pd128(in2); \
__m128d high1__ = _mm256_extractf128_pd(in1, 1); \
__m128d high2__ = _mm256_extractf128_pd(in2, 1); \
__m256d tmp__ = _mm256_insertf128_pd( \
_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \
_mm_shuffle_pd(low2__, high2__, 0), \
1); \
out2 = _mm256_insertf128_pd( \
_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \
_mm_shuffle_pd(low2__, high2__, 3), \
1); \
out1 = tmp__; \
}
/* VTRANSPOSE4(row0, row1, row2, row3) pseudo code (in-place 4x4 transpose):
row0 = [ row0[0], row1[0], row2[0], row3[0] ]
row1 = [ row0[1], row1[1], row2[1], row3[1] ]
row2 = [ row0[2], row1[2], row2[2], row3[2] ]
row3 = [ row0[3], row1[3], row2[3], row3[3] ]
the shuffles pair up even / odd elements of two rows within each 128-bit
half; the permutes then combine the low halves (0x20) and the high
halves (0x31) of those intermediate vectors.
*/
# define VTRANSPOSE4(row0, row1, row2, row3) { \
__m256d tmp3, tmp2, tmp1, tmp0; \
\
tmp0 = _mm256_shuffle_pd((row0),(row1), 0x0); \
tmp2 = _mm256_shuffle_pd((row0),(row1), 0xF); \
tmp1 = _mm256_shuffle_pd((row2),(row3), 0x0); \
tmp3 = _mm256_shuffle_pd((row2),(row3), 0xF); \
\
(row0) = _mm256_permute2f128_pd(tmp0, tmp1, 0x20); \
(row1) = _mm256_permute2f128_pd(tmp2, tmp3, 0x20); \
(row2) = _mm256_permute2f128_pd(tmp0, tmp1, 0x31); \
(row3) = _mm256_permute2f128_pd(tmp2, tmp3, 0x31); \
}
/*VSWAPHL(a, b) pseudo code:
return [ b[0], b[1], a[2], a[3] ]
*/
# define VSWAPHL(a,b) \
_mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1)
/* reverse/flip all doubles */
# define VREV_S(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_permute_pd(_mm256_extractf128_pd(a, 1),1)), _mm_permute_pd(_mm256_castpd256_pd128(a), 1), 1)
/* reverse/flip complex doubles */
# define VREV_C(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1)
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
#endif
#endif /* PF_AVX_DBL_H */

84
pffft/simd/pf_double.h Normal file
View File

@@ -0,0 +1,84 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
*/
#ifndef PF_DBL_H
#define PF_DBL_H
#include <assert.h>
#include <string.h>
#include <stdint.h>
/*
* SIMD reference material:
*
* general SIMD introduction:
* https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
*
* SSE 1:
* https://software.intel.com/sites/landingpage/IntrinsicsGuide/
*
* ARM NEON:
* https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
*
* Altivec:
* https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
* https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
* better one?
*
*/
typedef double vsfscalar;
#include "pf_avx_double.h"
#include "pf_sse2_double.h"
#include "pf_neon_double.h"
#ifndef SIMD_SZ
# if !defined(PFFFT_SIMD_DISABLE)
# pragma message( "building double with simd disabled !" )
# define PFFFT_SIMD_DISABLE /* fallback to scalar code */
# endif
#endif
#include "pf_scalar_double.h"
/* shortcuts for complex multiplications */
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
#ifndef SVMUL
/* multiply a scalar with a vector */
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
#endif
#endif /* PF_DBL_H */

84
pffft/simd/pf_float.h Normal file
View File

@@ -0,0 +1,84 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
*/
#ifndef PF_FLT_H
#define PF_FLT_H
#include <assert.h>
#include <string.h>
#include <stdint.h>
/*
* SIMD reference material:
*
* general SIMD introduction:
* https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
*
* SSE 1:
* https://software.intel.com/sites/landingpage/IntrinsicsGuide/
*
* ARM NEON:
* https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
*
* Altivec:
* https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
* https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
* better one?
*
*/
/* scalar element type used by the float backends */
typedef float vsfscalar;
/* Candidate SIMD backends. pf_sse1_float.h and pf_neon_float.h each define
 * SIMD_SZ and the V* macros only when their own preprocessor condition holds.
 * NOTE(review): pf_altivec_float.h presumably follows the same pattern - confirm. */
#include "pf_sse1_float.h"
#include "pf_neon_float.h"
#include "pf_altivec_float.h"
/* No backend defined SIMD_SZ: force the scalar fallback. The message is only
 * emitted when PFFFT_SIMD_DISABLE was not already defined by the user. */
#ifndef SIMD_SZ
# if !defined(PFFFT_SIMD_DISABLE)
# pragma message( "building float with simd disabled !" )
# define PFFFT_SIMD_DISABLE /* fallback to scalar code */
# endif
#endif
/* scalar implementations; they only take effect while SIMD_SZ is still undefined */
#include "pf_scalar_float.h"
/* Shortcuts for complex multiplications on split real/imaginary vectors.
 * Both macros overwrite ar and ai in place. They declare a block-local v4sf
 * named tmp and expand their arguments more than once, so pass plain lvalues
 * that are not themselves named tmp. */
/* (ar + i*ai) <- (ar + i*ai) * (br + i*bi) */
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
/* (ar + i*ai) <- (ar + i*ai) * (br - i*bi), i.e. multiplication by the conjugate of b */
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
/* generic fallback, used only when the selected backend did not define SVMUL */
#ifndef SVMUL
/* multiply a scalar with a vector: broadcast f with LD_PS1, then VMUL */
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
#endif
#endif /* PF_FLT_H */

203
pffft/simd/pf_neon_double.h Normal file
View File

@@ -0,0 +1,203 @@
/*
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
*/
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
*/
#ifndef PF_NEON_DBL_H
#define PF_NEON_DBL_H
/*
NEON 64bit support macros
*/
/* Enabled only when SIMD is not disabled, NEON was requested through
 * PFFFT_ENABLE_NEON, and the target is 64-bit ARM. */
#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__))
#pragma message (__FILE__ ": NEON (from AVX) macros are defined" )
/* AVX-named __m256d type and _mm256_* helpers built on NEON intrinsics */
#include "pf_neon_double_from_avx.h"
typedef __m256d v4sf;
/* 4 doubles by simd vector */
# define SIMD_SZ 4
/* per-lane view of a vector as an array of SIMD_SZ doubles */
typedef union v4sf_union {
v4sf v;
double f[SIMD_SZ];
} v4sf_union;
# define VARCH "NEON"
# define VREQUIRES_ALIGN 1
/* element-wise arithmetic and loads, mapped onto the AVX-named helpers */
# define VZERO() _mm256_setzero_pd()
# define VMUL(a,b) _mm256_mul_pd(a,b)
# define VADD(a,b) _mm256_add_pd(a,b)
/* a*b + c, expressed as a separate multiply and add */
# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
# define VSUB(a,b) _mm256_sub_pd(a,b)
/* broadcast one scalar to all four lanes */
# define LD_PS1(p) _mm256_set1_pd(p)
# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr)
# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr)
/* Returns a vector whose lower 128-bit half comes from a and whose upper
 * 128-bit half is b. */
FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b)
{
    __m256d out;
    out.vect_f64[1] = b;              /* new upper half */
    out.vect_f64[0] = a.vect_f64[0];  /* lower half kept from a */
    return out;
}
/* Returns [ a[0], b[0] ]: the low lanes of both inputs. */
FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
{
    return vcombine_f64(vget_low_f64(a), vget_low_f64(b));
}
/* Returns [ a[1], b[1] ]: the high lanes of both inputs. */
FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b)
{
    return vcombine_f64(vget_high_f64(a), vget_high_f64(b));
}
/* Applies _mm_shuffle_pd_00 to each 128-bit half:
 * result = [ a[0], b[0], a[2], b[2] ]. */
FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b)
{
    __m256d out;
    int half;
    for (half = 0; half < 2; ++half) {
        out.vect_f64[half] = _mm_shuffle_pd_00(a.vect_f64[half], b.vect_f64[half]);
    }
    return out;
}
/* Applies _mm_shuffle_pd_11 to each 128-bit half:
 * result = [ a[1], b[1], a[3], b[3] ]. */
FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b)
{
    __m256d out;
    int half;
    for (half = 0; half < 2; ++half) {
        out.vect_f64[half] = _mm_shuffle_pd_11(a.vect_f64[half], b.vect_f64[half]);
    }
    return out;
}
/* Concatenates the lower halves: result = [ a[0], a[1], b[0], b[1] ]. */
FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b)
{
    __m256d out;
    out.vect_f64[1] = b.vect_f64[0];
    out.vect_f64[0] = a.vect_f64[0];
    return out;
}
/* Concatenates the upper halves: result = [ a[2], a[3], b[2], b[3] ]. */
FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b)
{
    __m256d out;
    out.vect_f64[1] = b.vect_f64[1];
    out.vect_f64[0] = a.vect_f64[1];
    return out;
}
/* Reverses the order of the four doubles:
 * result = [ x[3], x[2], x[1], x[0] ]. */
FORCE_INLINE __m256d _mm256_reverse(__m256d x)
{
    const float64x2_t lo = x.vect_f64[0];  /* x[0], x[1] */
    const float64x2_t hi = x.vect_f64[1];  /* x[2], x[3] */
    __m256d out;
    /* new lower half: upper half of x with its two lanes swapped */
    out.vect_f64[0] = vcombine_f64(vget_high_f64(hi), vget_low_f64(hi));
    /* new upper half: lower half of x with its two lanes swapped */
    out.vect_f64[1] = vcombine_f64(vget_high_f64(lo), vget_low_f64(lo));
    return out;
}
/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
out1 = [ in1[0], in2[0], in1[1], in2[1] ]
out2 = [ in1[2], in2[2], in1[3], in2[3] ]
in1 and in2 are read into locals before either output is assigned.
in1 and in2 are each expanded twice, so pass side-effect-free expressions.
*/
# define INTERLEAVE2(in1, in2, out1, out2) { \
__m128d low1__ = _mm256_castpd256_pd128(in1); \
__m128d low2__ = _mm256_castpd256_pd128(in2); \
__m128d high1__ = _mm256_extractf128_pd(in1, 1); \
__m128d high2__ = _mm256_extractf128_pd(in2, 1); \
__m256d tmp__ = _mm256_insertf128_pd_1( \
_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)), \
_mm_shuffle_pd_11(low1__, low2__)); \
out2 = _mm256_insertf128_pd_1( \
_mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)), \
_mm_shuffle_pd_11(high1__, high2__)); \
out1 = tmp__; \
}
/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
out1 = [ in1[0], in1[2], in2[0], in2[2] ]
out2 = [ in1[1], in1[3], in2[1], in2[3] ]
in1 and in2 are read into locals before either output is assigned.
in1 and in2 are each expanded twice, so pass side-effect-free expressions.
*/
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
__m128d low1__ = _mm256_castpd256_pd128(in1); \
__m128d low2__ = _mm256_castpd256_pd128(in2); \
__m128d high1__ = _mm256_extractf128_pd(in1, 1); \
__m128d high2__ = _mm256_extractf128_pd(in2, 1); \
__m256d tmp__ = _mm256_insertf128_pd_1( \
_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)), \
_mm_shuffle_pd_00(low2__, high2__)); \
out2 = _mm256_insertf128_pd_1( \
_mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)), \
_mm_shuffle_pd_11(low2__, high2__)); \
out1 = tmp__; \
}
/* VTRANSPOSE4(row0, row1, row2, row3): in-place transpose of the 4x4 matrix
 * whose rows are the four vectors, so that afterwards
 * rowK = [ row0[K], row1[K], row2[K], row3[K] ].
 * The block declares locals tmp0..tmp3; arguments must not use those names. */
# define VTRANSPOSE4(row0, row1, row2, row3) { \
__m256d tmp3, tmp2, tmp1, tmp0; \
\
tmp0 = _mm256_shuffle_pd_00((row0),(row1)); \
tmp2 = _mm256_shuffle_pd_11((row0),(row1)); \
tmp1 = _mm256_shuffle_pd_00((row2),(row3)); \
tmp3 = _mm256_shuffle_pd_11((row2),(row3)); \
\
(row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1); \
(row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3); \
(row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1); \
(row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3); \
}
/*VSWAPHL(a, b) pseudo code:
return [ b[0], b[1], a[2], a[3] ]
*/
# define VSWAPHL(a,b) \
_mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1))
/* reverse/flip all four doubles: [ a[3], a[2], a[1], a[0] ] */
# define VREV_S(a) _mm256_reverse(a)
/* reverse/flip the two complex values by swapping the 128-bit halves:
 * [ a[2], a[3], a[0], a[1] ] */
# define VREV_C(a) _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a))
/* true when ptr is 32-byte aligned (its low five address bits are zero) */
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
#endif /* NEON double backend enabled */
#endif /* PF_NEON_DBL_H */

View File

@@ -0,0 +1,123 @@
/*
* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
//see https://github.com/kunpengcompute/AvxToNeon
#ifndef PF_NEON_DBL_FROM_AVX_H
#define PF_NEON_DBL_FROM_AVX_H
#include <arm_neon.h>
/* Define FORCE_INLINE for the helpers below. On GCC/Clang any previous
 * definition is saved with push_macro first; no matching pop_macro appears
 * in this header. */
#if defined(__GNUC__) || defined(__clang__)
#pragma push_macro("FORCE_INLINE")
#define FORCE_INLINE static inline __attribute__((always_inline))
#else
/* NOTE(review): the #error stops the build for every other compiler, so the
 * fallback definition after it never takes effect - confirm which of the two
 * is intended. */
#error "Macro name collisions may happens with unknown compiler"
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif
#define FORCE_INLINE static inline
#endif
/* 256-bit vector of eight floats, stored as two 128-bit NEON registers */
typedef struct {
float32x4_t vect_f32[2];
} __m256;
/* 256-bit vector of four doubles, stored as two 128-bit NEON registers;
 * the load helpers below put elements 0..1 in vect_f64[0] and 2..3 in vect_f64[1] */
typedef struct {
float64x2_t vect_f64[2];
} __m256d;
/* 128-bit vector of two doubles maps directly onto the NEON type */
typedef float64x2_t __m128d;
/* Returns a vector with all four doubles set to 0.0. */
FORCE_INLINE __m256d _mm256_setzero_pd(void)
{
    const float64x2_t zero = vdupq_n_f64(0.0);
    __m256d out;
    out.vect_f64[0] = zero;
    out.vect_f64[1] = zero;
    return out;
}
/* Element-wise product a * b, computed one 128-bit half at a time. */
FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
{
    __m256d out;
    int half;
    for (half = 0; half < 2; ++half) {
        out.vect_f64[half] = vmulq_f64(a.vect_f64[half], b.vect_f64[half]);
    }
    return out;
}
/* Element-wise sum a + b, computed one 128-bit half at a time. */
FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
{
    __m256d out;
    int half;
    for (half = 0; half < 2; ++half) {
        out.vect_f64[half] = vaddq_f64(a.vect_f64[half], b.vect_f64[half]);
    }
    return out;
}
/* Element-wise difference a - b, computed one 128-bit half at a time. */
FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
{
    __m256d out;
    int half;
    for (half = 0; half < 2; ++half) {
        out.vect_f64[half] = vsubq_f64(a.vect_f64[half], b.vect_f64[half]);
    }
    return out;
}
/* Broadcasts the scalar a into all four doubles of the result. */
FORCE_INLINE __m256d _mm256_set1_pd(double a)
{
    const float64x2_t splat = vdupq_n_f64(a);
    __m256d out;
    out.vect_f64[0] = splat;
    out.vect_f64[1] = splat;
    return out;
}
/* Loads four consecutive doubles starting at mem_addr. Implemented with the
 * same vld1q_f64 loads as _mm256_loadu_pd. */
FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr)
{
    __m256d out;
    out.vect_f64[0] = vld1q_f64(&mem_addr[0]);  /* elements 0..1 */
    out.vect_f64[1] = vld1q_f64(&mem_addr[2]);  /* elements 2..3 */
    return out;
}
/* Loads four consecutive doubles starting at mem_addr. Implemented with the
 * same vld1q_f64 loads as _mm256_load_pd. */
FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr)
{
    __m256d out;
    out.vect_f64[0] = vld1q_f64(&mem_addr[0]);  /* elements 0..1 */
    out.vect_f64[1] = vld1q_f64(&mem_addr[2]);  /* elements 2..3 */
    return out;
}
/* Returns the lower 128-bit half of a, i.e. [ a[0], a[1] ]. */
FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
{
    const __m128d lower = a.vect_f64[0];
    return lower;
}
/* Returns one 128-bit half of a: imm8 == 0 selects the lower half,
 * imm8 == 1 the upper half. Other values are rejected by the assert, which
 * is compiled out when NDEBUG is defined.
 * NOTE(review): this header does not include <assert.h> itself and appears
 * to rely on the including file having done so - confirm. */
FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
{
assert(imm8 >= 0 && imm8 <= 1);
return a.vect_f64[imm8];
}
/* Widens a 128-bit vector to 256 bits: the lower half of the result is a.
 * The upper half is set to zero so that the returned struct never carries an
 * uninitialized member (copying one triggers uninitialized-value warnings and
 * is undefined behavior when the header is compiled as C++). Callers must
 * still not depend on the upper half: the AVX intrinsic this emulates leaves
 * it undefined. */
FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
{
    __m256d res;
    res.vect_f64[0] = a;
    res.vect_f64[1] = vdupq_n_f64(0.0);
    return res;
}
#endif /* PF_NEON_DBL_FROM_AVX_H */

View File

@@ -0,0 +1,87 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
*/
#ifndef PF_NEON_FLT_H
#define PF_NEON_FLT_H
/*
ARM NEON support macros
*/
/* Enabled only when SIMD is not disabled, NEON was requested through
 * PFFFT_ENABLE_NEON, and the target is 32- or 64-bit ARM. */
#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
#pragma message( __FILE__ ": ARM NEON macros are defined" )
# include <arm_neon.h>
typedef float32x4_t v4sf;
/* 4 floats per SIMD vector */
# define SIMD_SZ 4
/* per-lane view of a vector as an array of SIMD_SZ floats */
typedef union v4sf_union {
v4sf v;
float f[SIMD_SZ];
} v4sf_union;
# define VARCH "NEON"
# define VREQUIRES_ALIGN 0 /* usually no alignment required */
# define VZERO() vdupq_n_f32(0)
# define VMUL(a,b) vmulq_f32(a,b)
# define VADD(a,b) vaddq_f32(a,b)
/* a*b + c; note the accumulator is the first argument of vmlaq_f32 */
# define VMADD(a,b,c) vmlaq_f32(c,a,b)
# define VSUB(a,b) vsubq_f32(a,b)
/* broadcast one scalar to all lanes; takes the address of p, so p must be an lvalue */
# define LD_PS1(p) vld1q_dup_f32(&(p))
/* both loads are a plain dereference of ptr cast to v4sf* */
# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr)))
# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr)))
/* out1 = [ in1[0], in2[0], in1[1], in2[1] ], out2 = [ in1[2], in2[2], in1[3], in2[3] ] */
# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
/* out1 = [ in1[0], in1[2], in2[0], in2[2] ], out2 = [ in1[1], in1[3], in2[1], in2[3] ] */
# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
/* in-place 4x4 transpose built from two rounds of vzipq_f32:
 * afterwards xK = [ x0[K], x1[K], x2[K], x3[K] ] */
# define VTRANSPOSE4(x0,x1,x2,x3) { \
float32x4x2_t t0_ = vzipq_f32(x0, x2); \
float32x4x2_t t1_ = vzipq_f32(x1, x3); \
float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
}
// marginally faster version (inline assembly alternative, kept disabled)
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
/* [ b[0], b[1], a[2], a[3] ] */
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
/* reverse/flip all floats: [ a[3], a[2], a[1], a[0] ] */
# define VREV_S(a) vcombine_f32(vrev64_f32(vget_high_f32(a)), vrev64_f32(vget_low_f32(a)))
/* reverse/flip complex floats: [ a[2], a[3], a[0], a[1] ] */
# define VREV_C(a) vextq_f32(a, a, 2)
/* true when ptr is 4-byte aligned (its low two address bits are zero) */
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
#else
/* #pragma message( __FILE__ ": ARM NEON macros are not defined" ) */
#endif
#endif /* PF_NEON_FLT_H */

View File

@@ -0,0 +1,185 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
*/
#ifndef PF_SCAL_DBL_H
#define PF_SCAL_DBL_H
/*
fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead
*/
/* 4-lane "vector" emulated with plain scalars. Used only when no SIMD backend
 * has defined SIMD_SZ and PFFFT_SCALVEC_ENABLED is defined. */
#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
#pragma message( __FILE__ ": double SCALAR4 macros are defined" )
/* four scalar lanes; a is lane 0, d is lane 3 */
typedef struct {
vsfscalar a;
vsfscalar b;
vsfscalar c;
vsfscalar d;
} v4sf;
# define SIMD_SZ 4
/* per-lane view of a vector as an array of SIMD_SZ scalars */
typedef union v4sf_union {
v4sf v;
vsfscalar f[SIMD_SZ];
} v4sf_union;
# define VARCH "4xScalar"
# define VREQUIRES_ALIGN 0
/* Returns a vector with all four lanes set to zero.
 * Declared with (void) so the definition is a proper prototype in C, and
 * written with double literals to match this double-precision header. */
static ALWAYS_INLINE(v4sf) VZERO(void) {
    v4sf r = { 0.0, 0.0, 0.0, 0.0 };
    return r;
}
/* Lane-wise product A * B. */
static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) {
    v4sf prod;
    prod.a = A.a * B.a;
    prod.b = A.b * B.b;
    prod.c = A.c * B.c;
    prod.d = A.d * B.d;
    return prod;
}
/* Lane-wise sum A + B. */
static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) {
    v4sf sum;
    sum.a = A.a + B.a;
    sum.b = A.b + B.b;
    sum.c = A.c + B.c;
    sum.d = A.d + B.d;
    return sum;
}
/* Lane-wise multiply-add A * B + C. */
static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) {
    v4sf acc;
    acc.a = A.a * B.a + C.a;
    acc.b = A.b * B.b + C.b;
    acc.c = A.c * B.c + C.c;
    acc.d = A.d * B.d + C.d;
    return acc;
}
/* Lane-wise difference A - B. */
static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) {
    v4sf diff;
    diff.a = A.a - B.a;
    diff.b = A.b - B.b;
    diff.c = A.c - B.c;
    diff.d = A.d - B.d;
    return diff;
}
/* Broadcasts the scalar v into all four lanes. */
static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) {
    v4sf splat;
    splat.a = v;
    splat.b = v;
    splat.c = v;
    splat.d = v;
    return splat;
}
/* both loads are a plain dereference of ptr cast to v4sf* */
# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr)))
# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr)))
/* true when ptr is a multiple of sizeof(v4sf); the mask form assumes that size is a power of two */
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0)
/* The three macros below build their results in block-local temporaries
 * (Ar, Br, Cr, Dr) before assigning the outputs. Arguments are expanded
 * textually and more than once: pass plain v4sf variables that do not use
 * those temporary names. */
/* INTERLEAVE2(A, B, C, D):
 * C = [ A[0], B[0], A[1], B[1] ]
 * D = [ A[2], B[2], A[3], B[3] ] */
#define INTERLEAVE2( A, B, C, D) \
do { \
v4sf Cr = { A.a, B.a, A.b, B.b }; \
v4sf Dr = { A.c, B.c, A.d, B.d }; \
C = Cr; \
D = Dr; \
} while (0)
/* UNINTERLEAVE2(A, B, C, D):
 * C = [ A[0], A[2], B[0], B[2] ]
 * D = [ A[1], A[3], B[1], B[3] ] */
#define UNINTERLEAVE2(A, B, C, D) \
do { \
v4sf Cr = { A.a, A.c, B.a, B.c }; \
v4sf Dr = { A.b, A.d, B.b, B.d }; \
C = Cr; \
D = Dr; \
} while (0)
/* VTRANSPOSE4(A, B, C, D): in-place transpose of the 4x4 matrix whose rows
 * are A, B, C and D */
#define VTRANSPOSE4(A, B, C, D) \
do { \
v4sf Ar = { A.a, B.a, C.a, D.a }; \
v4sf Br = { A.b, B.b, C.b, D.b }; \
v4sf Cr = { A.c, B.c, C.c, D.c }; \
v4sf Dr = { A.d, B.d, C.d, D.d }; \
A = Ar; \
B = Br; \
C = Cr; \
D = Dr; \
} while (0)
/* VSWAPHL(A, B): lower two lanes from B, upper two lanes from A,
 * i.e. [ B[0], B[1], A[2], A[3] ] */
static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) {
    v4sf mixed;
    mixed.a = B.a;
    mixed.b = B.b;
    mixed.c = A.c;
    mixed.d = A.d;
    return mixed;
}
/* reverse/flip all four scalars: [ A[3], A[2], A[1], A[0] ] */
static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) {
    v4sf rev;
    rev.a = A.d;
    rev.b = A.c;
    rev.c = A.b;
    rev.d = A.a;
    return rev;
}
/* reverse/flip the two complex values (swap lane pairs): [ A[2], A[3], A[0], A[1] ] */
static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) {
    v4sf rev;
    rev.a = A.c;
    rev.b = A.d;
    rev.c = A.a;
    rev.d = A.b;
    return rev;
}
#else
/* #pragma message( __FILE__ ": double SCALAR4 macros are not defined" ) */
#endif
/* Last-resort fallback: a "vector" is a single scalar (SIMD_SZ == 1) and every
 * vector operation is the plain scalar expression. Used when nothing above has
 * defined SIMD_SZ. The build message now says "double", matching this header
 * (it previously announced the float variant). */
#if !defined(SIMD_SZ)
#pragma message( __FILE__ ": double SCALAR1 macros are defined" )
typedef vsfscalar v4sf;
# define SIMD_SZ 1
/* per-lane view; with SIMD_SZ == 1 this is a one-element array */
typedef union v4sf_union {
v4sf v;
vsfscalar f[SIMD_SZ];
} v4sf_union;
# define VARCH "Scalar"
# define VREQUIRES_ALIGN 0
# define VZERO() 0.0
# define VMUL(a,b) ((a)*(b))
# define VADD(a,b) ((a)+(b))
# define VMADD(a,b,c) ((a)*(b)+(c))
# define VSUB(a,b) ((a)-(b))
# define LD_PS1(p) (p)
# define VLOAD_UNALIGNED(ptr) (*(ptr))
# define VLOAD_ALIGNED(ptr) (*(ptr))
/* true when ptr is a multiple of sizeof(vsfscalar) */
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)
#else
/* #pragma message( __FILE__ ": double SCALAR1 macros are not defined" ) */
#endif
#endif /* PF_SCAL_DBL_H */

View File

@@ -0,0 +1,185 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
*/
#ifndef PF_SCAL_FLT_H
#define PF_SCAL_FLT_H
/*
fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead
*/
/* 4-lane "vector" emulated with plain scalars. Used only when no SIMD backend
 * has defined SIMD_SZ and PFFFT_SCALVEC_ENABLED is defined. */
#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
#pragma message( __FILE__ ": float SCALAR4 macros are defined" )
/* four scalar lanes; a is lane 0, d is lane 3 */
typedef struct {
vsfscalar a;
vsfscalar b;
vsfscalar c;
vsfscalar d;
} v4sf;
# define SIMD_SZ 4
/* per-lane view of a vector as an array of SIMD_SZ scalars */
typedef union v4sf_union {
v4sf v;
vsfscalar f[SIMD_SZ];
} v4sf_union;
# define VARCH "4xScalar"
# define VREQUIRES_ALIGN 0
/* Returns a vector with all four lanes set to zero.
 * Declared with (void) so the definition is a proper prototype in C. */
static ALWAYS_INLINE(v4sf) VZERO(void) {
    v4sf r = { 0.f, 0.f, 0.f, 0.f };
    return r;
}
/* Lane-wise product A * B. */
static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) {
    v4sf prod;
    prod.a = A.a * B.a;
    prod.b = A.b * B.b;
    prod.c = A.c * B.c;
    prod.d = A.d * B.d;
    return prod;
}
/* Lane-wise sum A + B. */
static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) {
    v4sf sum;
    sum.a = A.a + B.a;
    sum.b = A.b + B.b;
    sum.c = A.c + B.c;
    sum.d = A.d + B.d;
    return sum;
}
/* Lane-wise multiply-add A * B + C. */
static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) {
    v4sf acc;
    acc.a = A.a * B.a + C.a;
    acc.b = A.b * B.b + C.b;
    acc.c = A.c * B.c + C.c;
    acc.d = A.d * B.d + C.d;
    return acc;
}
/* Lane-wise difference A - B. */
static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) {
    v4sf diff;
    diff.a = A.a - B.a;
    diff.b = A.b - B.b;
    diff.c = A.c - B.c;
    diff.d = A.d - B.d;
    return diff;
}
/* Broadcasts the scalar v into all four lanes. */
static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) {
    v4sf splat;
    splat.a = v;
    splat.b = v;
    splat.c = v;
    splat.d = v;
    return splat;
}
/* both loads are a plain dereference of ptr cast to v4sf* */
# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr)))
# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr)))
/* true when ptr is a multiple of sizeof(v4sf); the mask form assumes that size is a power of two */
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0)
/* The three macros below build their results in block-local temporaries
 * (Ar, Br, Cr, Dr) before assigning the outputs. Arguments are expanded
 * textually and more than once: pass plain v4sf variables that do not use
 * those temporary names. */
/* INTERLEAVE2(A, B, C, D):
 * C = [ A[0], B[0], A[1], B[1] ]
 * D = [ A[2], B[2], A[3], B[3] ] */
#define INTERLEAVE2( A, B, C, D) \
do { \
v4sf Cr = { A.a, B.a, A.b, B.b }; \
v4sf Dr = { A.c, B.c, A.d, B.d }; \
C = Cr; \
D = Dr; \
} while (0)
/* UNINTERLEAVE2(A, B, C, D):
 * C = [ A[0], A[2], B[0], B[2] ]
 * D = [ A[1], A[3], B[1], B[3] ] */
#define UNINTERLEAVE2(A, B, C, D) \
do { \
v4sf Cr = { A.a, A.c, B.a, B.c }; \
v4sf Dr = { A.b, A.d, B.b, B.d }; \
C = Cr; \
D = Dr; \
} while (0)
/* VTRANSPOSE4(A, B, C, D): in-place transpose of the 4x4 matrix whose rows
 * are A, B, C and D */
#define VTRANSPOSE4(A, B, C, D) \
do { \
v4sf Ar = { A.a, B.a, C.a, D.a }; \
v4sf Br = { A.b, B.b, C.b, D.b }; \
v4sf Cr = { A.c, B.c, C.c, D.c }; \
v4sf Dr = { A.d, B.d, C.d, D.d }; \
A = Ar; \
B = Br; \
C = Cr; \
D = Dr; \
} while (0)
/* VSWAPHL(A, B): lower two lanes from B, upper two lanes from A,
 * i.e. [ B[0], B[1], A[2], A[3] ] */
static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) {
    v4sf mixed;
    mixed.a = B.a;
    mixed.b = B.b;
    mixed.c = A.c;
    mixed.d = A.d;
    return mixed;
}
/* reverse/flip all four floats: [ A[3], A[2], A[1], A[0] ] */
static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) {
    v4sf rev;
    rev.a = A.d;
    rev.b = A.c;
    rev.c = A.b;
    rev.d = A.a;
    return rev;
}
/* reverse/flip the two complex floats (swap lane pairs): [ A[2], A[3], A[0], A[1] ] */
static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) {
    v4sf rev;
    rev.a = A.c;
    rev.b = A.d;
    rev.c = A.a;
    rev.d = A.b;
    return rev;
}
#else
/* #pragma message( __FILE__ ": float SCALAR4 macros are not defined" ) */
#endif
/* Last-resort fallback: a "vector" is a single scalar (SIMD_SZ == 1) and every
 * vector operation is the plain scalar expression. Used when nothing above has
 * defined SIMD_SZ. */
#if !defined(SIMD_SZ)
#pragma message( __FILE__ ": float SCALAR1 macros are defined" )
typedef vsfscalar v4sf;
# define SIMD_SZ 1
/* per-lane view; with SIMD_SZ == 1 this is a one-element array */
typedef union v4sf_union {
v4sf v;
vsfscalar f[SIMD_SZ];
} v4sf_union;
# define VARCH "Scalar"
# define VREQUIRES_ALIGN 0
# define VZERO() 0.f
# define VMUL(a,b) ((a)*(b))
# define VADD(a,b) ((a)+(b))
# define VMADD(a,b,c) ((a)*(b)+(c))
# define VSUB(a,b) ((a)-(b))
# define LD_PS1(p) (p)
# define VLOAD_UNALIGNED(ptr) (*(ptr))
# define VLOAD_ALIGNED(ptr) (*(ptr))
/* true when ptr is a multiple of sizeof(vsfscalar) */
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)
#else
/* #pragma message( __FILE__ ": float SCALAR1 macros are not defined" ) */
#endif
#endif /* PF_SCAL_FLT_H */

View File

@@ -0,0 +1,82 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
*/
#ifndef PF_SSE1_FLT_H
#define PF_SSE1_FLT_H
/*
SSE1 support macros
*/
/* Enabled when no other backend has defined SIMD_SZ, SIMD is not disabled,
 * and the target is 32- or 64-bit x86. */
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86))
#pragma message( __FILE__ ": SSE1 float macros are defined" )
#include <xmmintrin.h>
typedef __m128 v4sf;
/* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions
 * anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */
# define SIMD_SZ 4
/* per-lane view of a vector as an array of SIMD_SZ floats */
typedef union v4sf_union {
v4sf v;
float f[SIMD_SZ];
} v4sf_union;
# define VARCH "SSE1"
# define VREQUIRES_ALIGN 1
# define VZERO() _mm_setzero_ps()
# define VMUL(a,b) _mm_mul_ps(a,b)
# define VADD(a,b) _mm_add_ps(a,b)
/* a*b + c, expressed as a separate multiply and add */
# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
# define VSUB(a,b) _mm_sub_ps(a,b)
/* broadcast one scalar to all four lanes */
# define LD_PS1(p) _mm_set1_ps(p)
# define VLOAD_UNALIGNED(ptr) _mm_loadu_ps(ptr)
# define VLOAD_ALIGNED(ptr) _mm_load_ps(ptr)
/* out1 = [ in1[0], in2[0], in1[1], in2[1] ], out2 = [ in1[2], in2[2], in1[3], in2[3] ];
 * out1 is assigned last, from a temporary */
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
/* out1 = [ in1[0], in1[2], in2[0], in2[2] ], out2 = [ in1[1], in1[3], in2[1], in2[3] ];
 * out1 is assigned last, from a temporary */
# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
/* in-place 4x4 transpose of the four row vectors */
# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
/* [ b[0], b[1], a[2], a[3] ] */
# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
/* reverse/flip all floats: [ a[3], a[2], a[1], a[0] ] */
# define VREV_S(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3))
/* reverse/flip complex floats: [ a[2], a[3], a[0], a[1] ] */
# define VREV_C(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(1,0,3,2))
/* true when ptr is 16-byte aligned (its low four address bits are zero) */
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
#else
/* #pragma message( __FILE__ ": SSE1 float macros are not defined" ) */
#endif
#endif /* PF_SSE1_FLT_H */

281
pffft/simd/pf_sse2_double.h Normal file
View File

@@ -0,0 +1,281 @@
/*
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
*/
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
*/
#ifndef PF_SSE2_DBL_H
#define PF_SSE2_DBL_H
//detect sse2 support under MSVC:
//when _M_IX86_FP is defined and equals 2, define __SSE2__ (unless it is
//already defined) so the feature test below can use a single macro name
#if defined ( _M_IX86_FP )
# if _M_IX86_FP == 2
# if !defined(__SSE2__)
# define __SSE2__
# endif
# endif
#endif
/*
SSE2 64bit support macros
*/
/* Enabled when no other backend has defined SIMD_SZ, SIMD is not disabled,
 * and either an SSE2-or-later feature macro or an x86-64 target macro is
 * defined. The feature tests are combined with logical || throughout (the
 * first pair was previously joined with a bitwise |). */
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined( __SSE4_2__ ) || defined( __SSE4_1__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || defined ( __x86_64__ ) || defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 ))
#pragma message (__FILE__ ": SSE2 double macros are defined" )
#include <emmintrin.h>
/* 256-bit vector of four doubles emulated with two 128-bit SSE2 registers */
typedef struct {
    __m128d d128[2];
} m256d;
typedef m256d v4sf;
/* 4 doubles per (emulated) SIMD vector */
# define SIMD_SZ 4
/* per-lane view of a vector as an array of SIMD_SZ doubles */
typedef union v4sf_union {
    v4sf v;
    double f[SIMD_SZ];
} v4sf_union;
/* Define FORCE_INLINE for the helpers below. On GCC/Clang any previous
 * definition is saved with push_macro first; no matching pop_macro appears
 * in this header. The MSVC branch defines the macro without saving. */
#if defined(__GNUC__) || defined(__clang__)
#pragma push_macro("FORCE_INLINE")
#define FORCE_INLINE static inline __attribute__((always_inline))
#elif defined (_MSC_VER)
#define FORCE_INLINE static __forceinline
#else
/* NOTE(review): the #error stops the build for every other compiler, so the
 * fallback definition after it never takes effect - confirm which of the two
 * is intended. */
#error "Macro name collisions may happens with unknown compiler"
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif
#define FORCE_INLINE static inline
#endif
/* Returns a vector with all four doubles set to 0.0. */
FORCE_INLINE m256d mm256_setzero_pd(void)
{
    const __m128d zero = _mm_setzero_pd();
    m256d out;
    out.d128[0] = zero;
    out.d128[1] = zero;
    return out;
}
/* Element-wise product a * b, computed one 128-bit half at a time. */
FORCE_INLINE m256d mm256_mul_pd(m256d a, m256d b)
{
    m256d out;
    int half;
    for (half = 0; half < 2; ++half) {
        out.d128[half] = _mm_mul_pd(a.d128[half], b.d128[half]);
    }
    return out;
}
/* Element-wise sum a + b, computed one 128-bit half at a time. */
FORCE_INLINE m256d mm256_add_pd(m256d a, m256d b)
{
    m256d out;
    int half;
    for (half = 0; half < 2; ++half) {
        out.d128[half] = _mm_add_pd(a.d128[half], b.d128[half]);
    }
    return out;
}
/* Element-wise difference a - b, computed one 128-bit half at a time. */
FORCE_INLINE m256d mm256_sub_pd(m256d a, m256d b)
{
    m256d out;
    int half;
    for (half = 0; half < 2; ++half) {
        out.d128[half] = _mm_sub_pd(a.d128[half], b.d128[half]);
    }
    return out;
}
/* Broadcasts the scalar a into all four doubles of the result. */
FORCE_INLINE m256d mm256_set1_pd(double a)
{
    const __m128d splat = _mm_set1_pd(a);
    m256d out;
    out.d128[0] = splat;
    out.d128[1] = splat;
    return out;
}
/* Loads four consecutive doubles starting at mem_addr using two aligned
 * 128-bit loads (_mm_load_pd). */
FORCE_INLINE m256d mm256_load_pd (double const * mem_addr)
{
    m256d out;
    out.d128[0] = _mm_load_pd(&mem_addr[0]);  /* elements 0..1 */
    out.d128[1] = _mm_load_pd(&mem_addr[2]);  /* elements 2..3 */
    return out;
}
/* Loads four consecutive doubles starting at mem_addr using two unaligned
 * 128-bit loads (_mm_loadu_pd). */
FORCE_INLINE m256d mm256_loadu_pd (double const * mem_addr)
{
    m256d out;
    out.d128[0] = _mm_loadu_pd(&mem_addr[0]);  /* elements 0..1 */
    out.d128[1] = _mm_loadu_pd(&mem_addr[2]);  /* elements 2..3 */
    return out;
}
# define VARCH "SSE2"
# define VREQUIRES_ALIGN 1
/* element-wise arithmetic and loads, mapped onto the mm256_* helpers */
# define VZERO() mm256_setzero_pd()
# define VMUL(a,b) mm256_mul_pd(a,b)
# define VADD(a,b) mm256_add_pd(a,b)
/* a*b + c, expressed as a separate multiply and add */
# define VMADD(a,b,c) mm256_add_pd(mm256_mul_pd(a,b), c)
# define VSUB(a,b) mm256_sub_pd(a,b)
/* broadcast one scalar to all four lanes */
# define LD_PS1(p) mm256_set1_pd(p)
# define VLOAD_UNALIGNED(ptr) mm256_loadu_pd(ptr)
# define VLOAD_ALIGNED(ptr) mm256_load_pd(ptr)
/* Returns the lower 128-bit half of a, i.e. [ a[0], a[1] ]. */
FORCE_INLINE __m128d mm256_castpd256_pd128(m256d a)
{
    const __m128d lower = a.d128[0];
    return lower;
}
/* Returns one 128-bit half of a: imm8 == 0 selects the lower half,
 * imm8 == 1 the upper half. Other values are rejected by the assert, which
 * is compiled out when NDEBUG is defined.
 * NOTE(review): this header does not include <assert.h> itself and appears
 * to rely on the including file having done so - confirm. */
FORCE_INLINE __m128d mm256_extractf128_pd (m256d a, const int imm8)
{
assert(imm8 >= 0 && imm8 <= 1);
return a.d128[imm8];
}
/* Returns a vector whose lower 128-bit half comes from a and whose upper
 * 128-bit half is b. */
FORCE_INLINE m256d mm256_insertf128_pd_1(m256d a, __m128d b)
{
    m256d out;
    out.d128[1] = b;          /* new upper half */
    out.d128[0] = a.d128[0];  /* lower half kept from a */
    return out;
}
/* Widens a 128-bit vector to 256 bits: the lower half of the result is a.
 * The upper half is set to zero so that the returned struct never carries an
 * uninitialized member (copying one triggers uninitialized-value warnings and
 * is undefined behavior when the header is compiled as C++). Callers must
 * still not depend on the upper half: the AVX intrinsic this emulates leaves
 * it undefined. */
FORCE_INLINE m256d mm256_castpd128_pd256(__m128d a)
{
    m256d res;
    res.d128[0] = a;
    res.d128[1] = _mm_setzero_pd();
    return res;
}
/* Takes the low lane of a and of b within each 128-bit half:
 * result = [ a[0], b[0], a[2], b[2] ]. */
FORCE_INLINE m256d mm256_shuffle_pd_00(m256d a, m256d b)
{
    m256d out;
    int half;
    for (half = 0; half < 2; ++half) {
        out.d128[half] = _mm_shuffle_pd(a.d128[half], b.d128[half], 0);
    }
    return out;
}
/* Takes the high lane of a and of b within each 128-bit half:
 * result = [ a[1], b[1], a[3], b[3] ]. */
FORCE_INLINE m256d mm256_shuffle_pd_11(m256d a, m256d b)
{
    m256d out;
    int half;
    for (half = 0; half < 2; ++half) {
        out.d128[half] = _mm_shuffle_pd(a.d128[half], b.d128[half], 3);
    }
    return out;
}
/* Concatenates the lower halves: result = [ a[0], a[1], b[0], b[1] ]. */
FORCE_INLINE m256d mm256_permute2f128_pd_0x20(m256d a, m256d b)
{
    m256d out;
    out.d128[1] = b.d128[0];
    out.d128[0] = a.d128[0];
    return out;
}
/* Concatenates the upper halves: result = [ a[2], a[3], b[2], b[3] ]. */
FORCE_INLINE m256d mm256_permute2f128_pd_0x31(m256d a, m256d b)
{
    m256d out;
    out.d128[1] = b.d128[1];
    out.d128[0] = a.d128[1];
    return out;
}
/* Reverses the order of the four doubles:
 * result = [ x[3], x[2], x[1], x[0] ]. */
FORCE_INLINE m256d mm256_reverse(m256d x)
{
    const __m128d lo = x.d128[0];  /* x[0], x[1] */
    const __m128d hi = x.d128[1];  /* x[2], x[3] */
    m256d out;
    /* shuffle immediate 1 swaps the two lanes of a register with itself */
    out.d128[0] = _mm_shuffle_pd(hi, hi, 1);
    out.d128[1] = _mm_shuffle_pd(lo, lo, 1);
    return out;
}
/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
out1 = [ in1[0], in2[0], in1[1], in2[1] ]
out2 = [ in1[2], in2[2], in1[3], in2[3] ]
in1 and in2 are read into locals before either output is assigned.
in1 and in2 are each expanded twice, so pass side-effect-free expressions.
*/
# define INTERLEAVE2(in1, in2, out1, out2) { \
__m128d low1__ = mm256_castpd256_pd128(in1); \
__m128d low2__ = mm256_castpd256_pd128(in2); \
__m128d high1__ = mm256_extractf128_pd(in1, 1); \
__m128d high2__ = mm256_extractf128_pd(in2, 1); \
m256d tmp__ = mm256_insertf128_pd_1( \
mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \
_mm_shuffle_pd(low1__, low2__, 3)); \
out2 = mm256_insertf128_pd_1( \
mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \
_mm_shuffle_pd(high1__, high2__, 3)); \
out1 = tmp__; \
}
/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
out1 = [ in1[0], in1[2], in2[0], in2[2] ]
out2 = [ in1[1], in1[3], in2[1], in2[3] ]
in1 and in2 are read into locals before either output is assigned.
in1 and in2 are each expanded twice, so pass side-effect-free expressions.
*/
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
__m128d low1__ = mm256_castpd256_pd128(in1); \
__m128d low2__ = mm256_castpd256_pd128(in2); \
__m128d high1__ = mm256_extractf128_pd(in1, 1); \
__m128d high2__ = mm256_extractf128_pd(in2, 1); \
m256d tmp__ = mm256_insertf128_pd_1( \
mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \
_mm_shuffle_pd(low2__, high2__, 0)); \
out2 = mm256_insertf128_pd_1( \
mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \
_mm_shuffle_pd(low2__, high2__, 3)); \
out1 = tmp__; \
}
/* VTRANSPOSE4(row0, row1, row2, row3): in-place transpose of the 4x4 matrix
 * whose rows are the four vectors, so that afterwards
 * rowK = [ row0[K], row1[K], row2[K], row3[K] ].
 * The block declares locals tmp0..tmp3; arguments must not use those names. */
# define VTRANSPOSE4(row0, row1, row2, row3) { \
m256d tmp3, tmp2, tmp1, tmp0; \
\
tmp0 = mm256_shuffle_pd_00((row0),(row1)); \
tmp2 = mm256_shuffle_pd_11((row0),(row1)); \
tmp1 = mm256_shuffle_pd_00((row2),(row3)); \
tmp3 = mm256_shuffle_pd_11((row2),(row3)); \
\
(row0) = mm256_permute2f128_pd_0x20(tmp0, tmp1); \
(row1) = mm256_permute2f128_pd_0x20(tmp2, tmp3); \
(row2) = mm256_permute2f128_pd_0x31(tmp0, tmp1); \
(row3) = mm256_permute2f128_pd_0x31(tmp2, tmp3); \
}
/*VSWAPHL(a, b) pseudo code:
return [ b[0], b[1], a[2], a[3] ]
*/
# define VSWAPHL(a,b) \
mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_castpd256_pd128(b)), mm256_extractf128_pd(a, 1))
/* reverse/flip all four doubles: [ a[3], a[2], a[1], a[0] ] */
# define VREV_S(a) mm256_reverse(a)
/* reverse/flip the two complex values by swapping the 128-bit halves:
 * [ a[2], a[3], a[0], a[1] ] */
# define VREV_C(a) mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_extractf128_pd(a, 1)), mm256_castpd256_pd128(a))
/* true when ptr is 32-byte aligned (its low five address bits are zero) */
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
#endif
#endif

5956
pffft/sse2neon.h Normal file

File diff suppressed because it is too large Load Diff

142
pffft/test_fft_factors.c Normal file
View File

@@ -0,0 +1,142 @@
#ifdef PFFFT_ENABLE_FLOAT
#include "pffft.h"
#endif
#ifdef PFFFT_ENABLE_DOUBLE
#include "pffft_double.h"
#endif
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#ifdef PFFFT_ENABLE_FLOAT
/*
 * Cross-checks the single-precision size helpers against a local
 * factorization.
 *
 * For both directions and both transform kinds (real / complex) it
 *   - prints the minimum transform size and the nearest transform size
 *     reported for TL (the third argument passed to
 *     pffft_nearest_transform_size() is !dir_i),
 *   - walks N from N_min/2 up to 12*N_min and checks that
 *     pffft_is_valid_size(N) agrees with "N == N_min * 2^a * 3^b * 5^c",
 *   - checks that pffft_new_setup(N) succeeds exactly for those sizes.
 *
 * TL: transform length used only for the "nearest transform" report.
 * Returns 0 on success, 1 on the first mismatch between setup creation and
 * factorizability.
 */
int test_float(int TL)
{
  PFFFT_Setup * S;
  for (int dir_i = 0; dir_i <= 1; ++dir_i)
  {
    for (int cplx_i = 0; cplx_i <= 1; ++cplx_i)
    {
      const pffft_transform_t cplx = (!cplx_i) ? PFFFT_REAL : PFFFT_COMPLEX;
      const int N_min = pffft_min_fft_size(cplx);
      const int N_max = N_min * 11 + N_min;
      /* never step by 0: a minimum size of 1 would otherwise loop forever */
      const int N_step = (N_min / 2 > 0) ? (N_min / 2) : 1;
      int NTL = pffft_nearest_transform_size(TL, cplx, (!dir_i));
      double near_off = (NTL - TL) * 100.0 / (double)TL;
      fprintf(stderr, "testing float, %s, %s ..\tminimum transform %d; nearest transform for %d is %d (%.2f%% off)\n",
          (!dir_i) ? "FORWARD" : "BACKWARD", (!cplx_i) ? "REAL" : "COMPLEX", N_min, TL, NTL, near_off );
      for (int N = (N_min/2); N <= N_max; N += N_step)
      {
        int R = N, f2 = 0, f3 = 0, f5 = 0, tmp_f;
        const int factorizable = pffft_is_valid_size(N, cplx);
        /* strip factors 5, 3, 2 while the remainder stays a multiple of N_min */
        while (R >= 5*N_min && (R % 5) == 0) { R /= 5; ++f5; }
        while (R >= 3*N_min && (R % 3) == 0) { R /= 3; ++f3; }
        while (R >= 2*N_min && (R % 2) == 0) { R /= 2; ++f2; }
        tmp_f = (R == N_min) ? 1 : 0;
        assert( factorizable == tmp_f );
        S = pffft_new_setup(N, cplx);
        if ( S && !factorizable )
        {
          fprintf(stderr, "fft setup successful, but NOT factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
          /* release the unexpected setup before bailing out */
          pffft_destroy_setup(S);
          return 1;
        }
        else if ( !S && factorizable)
        {
          fprintf(stderr, "fft setup UNsuccessful, but factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
          return 1;
        }
        if (S)
          pffft_destroy_setup(S);
      }
    }
  }
  return 0;
}
#endif
#ifdef PFFFT_ENABLE_DOUBLE
/*
 * Cross-checks the double-precision size helpers against a local
 * factorization.
 *
 * For both directions and both transform kinds (real / complex) it
 *   - prints the minimum transform size and the nearest transform size
 *     reported for TL (the third argument passed to
 *     pffftd_nearest_transform_size() is !dir_i),
 *   - walks N from N_min/2 up to 12*N_min and checks that
 *     pffftd_is_valid_size(N) agrees with "N == N_min * 2^a * 3^b * 5^c",
 *   - checks that pffftd_new_setup(N) succeeds exactly for those sizes.
 *
 * TL: transform length used only for the "nearest transform" report.
 * Returns 0 on success, 1 on the first mismatch between setup creation and
 * factorizability.
 */
int test_double(int TL)
{
  PFFFTD_Setup * S;
  for (int dir_i = 0; dir_i <= 1; ++dir_i)
  {
    for (int cplx_i = 0; cplx_i <= 1; ++cplx_i)
    {
      const pffft_transform_t cplx = (!cplx_i) ? PFFFT_REAL : PFFFT_COMPLEX;
      const int N_min = pffftd_min_fft_size(cplx);
      const int N_max = N_min * 11 + N_min;
      /* never step by 0: a minimum size of 1 would otherwise loop forever */
      const int N_step = (N_min / 2 > 0) ? (N_min / 2) : 1;
      int NTL = pffftd_nearest_transform_size(TL, cplx, (!dir_i));
      double near_off = (NTL - TL) * 100.0 / (double)TL;
      fprintf(stderr, "testing double, %s, %s ..\tminimum transform %d; nearest transform for %d is %d (%.2f%% off)\n",
          (!dir_i) ? "FORWARD" : "BACKWARD", (!cplx_i) ? "REAL" : "COMPLEX", N_min, TL, NTL, near_off );
      for (int N = (N_min/2); N <= N_max; N += N_step)
      {
        int R = N, f2 = 0, f3 = 0, f5 = 0, tmp_f;
        const int factorizable = pffftd_is_valid_size(N, cplx);
        /* strip factors 5, 3, 2 while the remainder stays a multiple of N_min */
        while (R >= 5*N_min && (R % 5) == 0) { R /= 5; ++f5; }
        while (R >= 3*N_min && (R % 3) == 0) { R /= 3; ++f3; }
        while (R >= 2*N_min && (R % 2) == 0) { R /= 2; ++f2; }
        tmp_f = (R == N_min) ? 1 : 0;
        assert( factorizable == tmp_f );
        S = pffftd_new_setup(N, cplx);
        if ( S && !factorizable )
        {
          fprintf(stderr, "fft setup successful, but NOT factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
          /* release the unexpected setup before bailing out */
          pffftd_destroy_setup(S);
          return 1;
        }
        else if ( !S && factorizable)
        {
          fprintf(stderr, "fft setup UNsuccessful, but factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
          return 1;
        }
        if (S)
          pffftd_destroy_setup(S);
      }
    }
  }
  return 0;
}
#endif
/*
 * Entry point: the optional first command line argument is the transform
 * length handed to the test functions (2 when no argument is given).
 * Runs the float test and then the double test, depending on which
 * precisions are compiled in, and returns the first non-zero result.
 */
int main(int argc, char *argv[])
{
  int status = 0;
  int length = 2;
  if (argc > 1)
    length = atoi(argv[1]);
#ifdef PFFFT_ENABLE_FLOAT
  status = test_float(length);
  if (status != 0)
    return status;
#endif
#ifdef PFFFT_ENABLE_DOUBLE
  status = test_double(length);
#endif
  return status;
}

991
pffft/test_pffastconv.c Normal file
View File

@@ -0,0 +1,991 @@
/*
Copyright (c) 2013 Julien Pommier.
Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de )
*/
#define _WANT_SNAN 1
#include "pffft.h"
#include "pffastconv.h"
#include <math.h>
#include <float.h>
#include <limits.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <assert.h>
#include <string.h>
#ifdef HAVE_SYS_TIMES
# include <sys/times.h>
# include <unistd.h>
#endif
/* benchmark duration: 500 ms (0.5 s) */
#define BENCH_TEST_DURATION_IN_SEC 0.5
/*
vector support macros: the rest of the code is independent of
SSE/Altivec/NEON -- adding support for other platforms with 4-element
vectors should be limited to these macros
*/
#if 0
#include "simd/pf_float.h"
#endif
#if defined(_MSC_VER)
# define RESTRICT __restrict
#elif defined(__GNUC__)
# define RESTRICT __restrict
#else
# define RESTRICT
#endif
#if defined(_MSC_VER)
#pragma warning( disable : 4244 )
#endif
#ifdef SNANF
#define INVALID_FLOAT_VAL SNANF
#elif defined(SNAN)
#define INVALID_FLOAT_VAL SNAN
#elif defined(NAN)
#define INVALID_FLOAT_VAL NAN
#elif defined(INFINITY)
#define INVALID_FLOAT_VAL INFINITY
#else
#define INVALID_FLOAT_VAL FLT_MAX
#endif
/*
 * uclock_sec(): returns a CPU-time stamp in seconds; only differences
 * between two calls are meaningful.
 *
 * With HAVE_SYS_TIMES the user time of this process is read via times()
 * and scaled by the clock tick rate from sysconf(_SC_CLK_TCK), which is
 * queried once and cached. Otherwise the standard clock() is used.
 *
 * Both variants are ordinary external functions: a plain C99 'inline'
 * definition without an external definition may fail to link.
 */
#if defined(HAVE_SYS_TIMES)
double uclock_sec(void) {
  static double ttclk = 0.;
  struct tms t;
  if (ttclk == 0.)
    ttclk = sysconf(_SC_CLK_TCK);
  times(&t);
  /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
  return ((double)t.tms_utime) / ttclk;
}
#else
double uclock_sec(void)
{ return (double)clock()/(double)CLOCKS_PER_SEC; }
#endif
/* Convolution entry point shared by all implementations under test:
 * processes 'len' input samples from X into Y using 'setup' and returns an int
 * (the slow_conv_* implementations return the number of output samples).
 * Yref and applyFlush are optional hints; several implementations ignore them. */
typedef int (*pfnConvolution) (void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush);
/* Creates an implementation-specific setup for the Nf filter coefficients in Hfwd;
 * BlkLen is passed by pointer, flags are PFFASTCONV_* bits. */
typedef void* (*pfnConvSetup) (float *Hfwd, int Nf, int * BlkLen, int flags);
/* Returns the convolution function that belongs to a given setup. */
typedef pfnConvolution (*pfnGetConvFnPtr) (void * setup);
/* Releases a setup created by the matching pfnConvSetup. */
typedef void (*pfnConvDestroy) (void * setup);
/* Setup used by the non-SIMD reference convolutions (see convSetupRev()). */
struct ConvSetup
{
/* optional convolution function; convSetupRev() leaves it NULL */
pfnConvolution pfn;
/* number of filter coefficients */
int N;
/* block length as passed to the setup function */
int B;
/* reversed filter coefficients, owned by this struct */
float * H;
/* PFFASTCONV_* flags the setup was created with */
int flags;
};
/*
 * Builds the setup for the non-SIMD reference convolutions.
 *
 * The N coefficients of H are stored in reversed order. With
 * PFFASTCONV_CPLX_INP_OUT every coefficient is written twice (once for the
 * real and once for the imaginary lane), so the stored filter has 2*N floats.
 * Four extra floats after the filter are set to INVALID_FLOAT_VAL to make
 * reads beyond the filter easier to detect.
 *
 * H:      filter coefficients (N floats)
 * N:      number of filter coefficients
 * BlkLen: block length; only read and stored in the setup
 * flags:  PFFASTCONV_* flags
 *
 * Returns the new setup (release with convDestroyRev()), or NULL when an
 * allocation fails.
 */
void * convSetupRev( float * H, int N, int * BlkLen, int flags )
{
  const int isCplx = (flags & PFFASTCONV_CPLX_INP_OUT) ? 1 : 0;
  const int Nfilt = isCplx ? 2 * N : N;   /* floats occupied by the filter */
  const int Nr = Nfilt + 4;               /* plus 4 guard floats */
  struct ConvSetup * s = pffastconv_malloc( sizeof(struct ConvSetup) );
  int i;
  if (!s)
    return NULL;
  s->H = pffastconv_malloc((unsigned)Nr * sizeof(float));
  if (!s->H)
  {
    pffastconv_free(s);
    return NULL;
  }
  s->pfn = NULL;
  s->N = N;
  s->B = *BlkLen;
  s->flags = flags;
  memset(s->H, 0, (unsigned)Nr * sizeof(float));
  if (isCplx)
  {
    /* the same real coefficient is applied to the real and the imaginary lane */
    for ( i = 0; i < N; ++i ) {
      s->H[2*(N-1 -i)  ] = H[i];
      s->H[2*(N-1 -i)+1] = H[i];
    }
  }
  else
  {
    for ( i = 0; i < N; ++i )
      s->H[ N-1 -i ] = H[i];
  }
  /* simpler detection of overruns */
  for ( i = 0; i < 4; ++i )
    s->H[ Nfilt + i ] = INVALID_FLOAT_VAL;
  return s;
}
/*
 * Releases a setup created by convSetupRev(): first the coefficient buffer,
 * then the setup structure itself. A NULL setup is ignored, so a failed
 * setup can be handed over unconditionally.
 */
void convDestroyRev( void * setup )
{
  struct ConvSetup * s = (struct ConvSetup*)setup;
  if (!s)
    return;
  pffastconv_free(s->H);
  pffastconv_free(setup);
}
pfnConvolution ConvGetFnPtrRev( void * setup )
{
struct ConvSetup * s = (struct ConvSetup*)setup;
if (!s)
return NULL;
return s->pfn;
}
/* Releases a setup by delegating to convDestroyRev(); the setup must therefore be a struct ConvSetup. */
void convSimdDestroy( void * setup )
{
convDestroyRev(setup);
}
/*
 * pfnConvSetup adapter for the pffastconv library: forwards all arguments to
 * pffastconv_new_setup() and prints a diagnostic line when no setup could be
 * created. Returns the setup, or NULL on failure.
 */
void * fastConvSetup( float * H, int N, int * BlkLen, int flags )
{
  void * setup = pffastconv_new_setup( H, N, BlkLen, flags );
  if (setup != NULL)
    return setup;
  printf("fastConvSetup(N = %d, *BlkLen = %d, flags = %d) = NULL\n", N, *BlkLen, flags);
  return NULL;
}
/* pfnConvDestroy adapter for the pffastconv library: casts the opaque setup back to PFFASTCONV_Setup and destroys it. */
void fastConvDestroy( void * setup )
{
pffastconv_destroy_setup( (PFFASTCONV_Setup*)setup );
}
int slow_conv_R(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
{
struct ConvSetup * p = (struct ConvSetup*)setup;
const float * RESTRICT X = input;
const float * RESTRICT Hrev = p->H;
float * RESTRICT Y = output;
const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
int i, j;
(void)Yref;
(void)applyFlush;
if (p->flags & PFFASTCONV_CPLX_INP_OUT)
{
for ( i = 0; i <= lenNr; i += 2 )
{
float sumRe = 0.0F, sumIm = 0.0F;
for ( j = 0; j < Nr; j += 2 )
{
sumRe += X[i+j ] * Hrev[j];
sumIm += X[i+j+1] * Hrev[j+1];
}
Y[i ] = sumRe;
Y[i+1] = sumIm;
}
return i/2;
}
else
{
for ( i = 0; i <= lenNr; ++i )
{
float sum = 0.0F;
for (j = 0; j < Nr; ++j )
sum += X[i+j] * Hrev[j];
Y[i] = sum;
}
return i;
}
}
/*
 * Reference convolution "A": direct-form sum like slow_conv_R(), but the
 * inner loop is unrolled by four with four independent partial sums
 * sum[0..3]; filter taps that do not fill a group of four are handled by a
 * separate tail loop.
 *
 * setup:      struct ConvSetup holding the reversed filter
 * input:      len samples (interleaved re/im pairs with PFFASTCONV_CPLX_INP_OUT)
 * output:     receives one value per valid window position
 * Yref, applyFlush: unused
 *
 * Returns the number of output samples written (complex samples in the
 * complex case).
 */
int slow_conv_A(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
{
float sum[4];
struct ConvSetup * p = (struct ConvSetup*)setup;
const float * RESTRICT X = input;
const float * RESTRICT Hrev = p->H;
float * RESTRICT Y = output;
/* Nr: number of floats in the filter; lenNr: last valid window offset in floats */
const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
int i, j;
(void)Yref;
(void)applyFlush;
if (p->flags & PFFASTCONV_CPLX_INP_OUT)
{
/* complex data: even indices (sum[0], sum[2]) are real, odd indices (sum[1], sum[3]) imaginary */
if ( (Nr & 3) == 0 )
{
/* filter length is a multiple of 4 floats: no tail needed */
for ( i = 0; i <= lenNr; i += 2 )
{
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
for (j = 0; j < Nr; j += 4 )
{
sum[0] += X[i+j] * Hrev[j];
sum[1] += X[i+j+1] * Hrev[j+1];
sum[2] += X[i+j+2] * Hrev[j+2];
sum[3] += X[i+j+3] * Hrev[j+3];
}
Y[i ] = sum[0] + sum[2];
Y[i+1] = sum[1] + sum[3];
}
}
else
{
/* M: filter length rounded down to a multiple of 4; the remaining complex tap goes to the tail sums */
const int M = Nr & (~3);
for ( i = 0; i <= lenNr; i += 2 )
{
float tailSumRe = 0.0F, tailSumIm = 0.0F;
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
for (j = 0; j < M; j += 4 )
{
sum[0] += X[i+j ] * Hrev[j ];
sum[1] += X[i+j+1] * Hrev[j+1];
sum[2] += X[i+j+2] * Hrev[j+2];
sum[3] += X[i+j+3] * Hrev[j+3];
}
for ( ; j < Nr; j += 2 ) {
tailSumRe += X[i+j ] * Hrev[j ];
tailSumIm += X[i+j+1] * Hrev[j+1];
}
Y[i ] = ( sum[0] + sum[2] ) + tailSumRe;
Y[i+1] = ( sum[1] + sum[3] ) + tailSumIm;
}
}
/* i counts floats; two floats per complex output sample */
return i/2;
}
else
{
/* real data */
if ( (Nr & 3) == 0 )
{
for ( i = 0; i <= lenNr; ++i )
{
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
for (j = 0; j < Nr; j += 4 )
{
sum[0] += X[i+j] * Hrev[j];
sum[1] += X[i+j+1] * Hrev[j+1];
sum[2] += X[i+j+2] * Hrev[j+2];
sum[3] += X[i+j+3] * Hrev[j+3];
}
Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
}
return i;
}
else
{
/* M: filter length rounded down to a multiple of 4; up to 3 remaining taps go to tailSum */
const int M = Nr & (~3);
/* printf("A: Nr = %d, M = %d, H[M] = %f, H[M+1] = %f, H[M+2] = %f, H[M+3] = %f\n", Nr, M, Hrev[M], Hrev[M+1], Hrev[M+2], Hrev[M+3] ); */
for ( i = 0; i <= lenNr; ++i )
{
float tailSum = 0.0;
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
for (j = 0; j < M; j += 4 )
{
sum[0] += X[i+j] * Hrev[j];
sum[1] += X[i+j+1] * Hrev[j+1];
sum[2] += X[i+j+2] * Hrev[j+2];
sum[3] += X[i+j+3] * Hrev[j+3];
}
for ( ; j < Nr; ++j )
tailSum += X[i+j] * Hrev[j];
Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
}
return i;
}
}
}
/*
 * Reference convolution "B": like slow_conv_A(), plus a dedicated path for
 * filters flagged PFFASTCONV_SYMMETRIC. In that path only the first half of
 * the taps is iterated and every tap multiplies the sum of the two input
 * samples it is paired with (one taken from the front, one from the back of
 * the window).
 *
 * setup:      struct ConvSetup holding the reversed filter
 * input:      len samples (interleaved re/im pairs with PFFASTCONV_CPLX_INP_OUT)
 * output:     receives one value per valid window position
 * Yref, applyFlush: unused
 *
 * Returns the number of output samples written (complex samples in the
 * complex case).
 */
int slow_conv_B(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
{
float sum[4];
struct ConvSetup * p = (struct ConvSetup*)setup;
(void)Yref;
(void)applyFlush;
if (p->flags & PFFASTCONV_SYMMETRIC)
{
const float * RESTRICT X = input;
const float * RESTRICT Hrev = p->H;
float * RESTRICT Y = output;
const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
/* h: start of the last group of four taps within the first filter half;
E: start of the last group of four floats of the window.
NOTE(review): there is no tail handling here, so this presumably requires
the filter length to be a multiple of 8 floats -- confirm against callers. */
const int h = Nr / 2 -4;
const int E = Nr -4;
int i, j;
if (p->flags & PFFASTCONV_CPLX_INP_OUT)
{
for ( i = 0; i <= lenNr; i += 2 )
{
/* k: offset of the last four floats of the current window */
const int k = i + E;
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
for (j = 0; j <= h; j += 4 )
{
/* mirrored partner keeps the re/im lane: re pairs with re, im with im */
sum[0] += Hrev[j ] * ( X[i+j ] + X[k-j+2] );
sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+3] );
sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j ] );
sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j+1] );
}
Y[i ] = sum[0] + sum[2];
Y[i+1] = sum[1] + sum[3];
}
return i/2;
}
else
{
for ( i = 0; i <= lenNr; ++i )
{
/* k: offset of the last four floats of the current window */
const int k = i + E;
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
for (j = 0; j <= h; j += 4 )
{
/* tap j is paired with the sample mirrored at the end of the window */
sum[0] += Hrev[j ] * ( X[i+j ] + X[k-j+3] );
sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+2] );
sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j+1] );
sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j ] );
}
Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
}
return i;
}
}
else
{
/* non-symmetric filter: same scheme as slow_conv_A() */
const float * RESTRICT X = input;
const float * RESTRICT Hrev = p->H;
float * RESTRICT Y = output;
const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
int i, j;
if (p->flags & PFFASTCONV_CPLX_INP_OUT)
{
/* NOTE(review): unlike slow_conv_A() there is no tail loop in this branch;
when Nr is not a multiple of 4 (odd N) the last iteration reads
Hrev[j+2] and Hrev[j+3] beyond the filter -- confirm whether odd
complex filter lengths are expected here. */
for ( i = 0; i <= lenNr; i += 2 )
{
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
for (j = 0; j < Nr; j += 4 )
{
sum[0] += X[i+j] * Hrev[j];
sum[1] += X[i+j+1] * Hrev[j+1];
sum[2] += X[i+j+2] * Hrev[j+2];
sum[3] += X[i+j+3] * Hrev[j+3];
}
Y[i ] = sum[0] + sum[2];
Y[i+1] = sum[1] + sum[3];
}
return i/2;
}
else
{
if ( (Nr & 3) == 0 )
{
for ( i = 0; i <= lenNr; ++i )
{
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
for (j = 0; j < Nr; j += 4 )
{
sum[0] += X[i+j] * Hrev[j];
sum[1] += X[i+j+1] * Hrev[j+1];
sum[2] += X[i+j+2] * Hrev[j+2];
sum[3] += X[i+j+3] * Hrev[j+3];
}
Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]);
}
return i;
}
else
{
/* M: filter length rounded down to a multiple of 4; remaining taps go to tailSum */
const int M = Nr & (~3);
/* printf("B: Nr = %d\n", Nr ); */
for ( i = 0; i <= lenNr; ++i )
{
float tailSum = 0.0;
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
for (j = 0; j < M; j += 4 )
{
sum[0] += X[i+j] * Hrev[j];
sum[1] += X[i+j+1] * Hrev[j+1];
sum[2] += X[i+j+2] * Hrev[j+2];
sum[3] += X[i+j+3] * Hrev[j+3];
}
for ( ; j < Nr; ++j )
tailSum += X[i+j] * Hrev[j];
Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
}
return i;
}
}
}
}
/* pfnConvolution adapter for the pffastconv library: forwards to pffastconv_apply() and returns its result; Yref is unused. */
int fast_conv(void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush)
{
(void)Yref;
return pffastconv_apply( (PFFASTCONV_Setup*)setup, X, len, Y, applyFlush );
}
/*
 * Debug helper that is currently disabled: it returns immediately without
 * printing anything. The code kept under '#if 0' would print the first N
 * values of V, perLine values per row, each row prefixed with 'st[index]'.
 */
void printFirst( const float * V, const char * st, const int N, const int perLine )
{
/* silence unused-parameter warnings while the body is disabled */
(void)V; (void)st; (void)N; (void)perLine;
return;
#if 0
int i;
for ( i = 0; i < N; ++i )
{
if ( (i % perLine) == 0 )
printf("\n%s[%d]", st, i);
printf("\t%.1f", V[i]);
}
printf("\n");
#endif
}
#define NUMY 15
/*
 * Runs all convolution implementations for one filter length and either
 * compares their outputs against the reference (testOutLen != 0) or
 * benchmarks them for BENCH_TEST_DURATION_IN_SEC each (testOutLen == 0).
 *
 * FILTERLEN:          number of filter coefficients
 * convFlags:          PFFASTCONV_* flags handed to every setup function
 * testOutLen:         non-zero: single pass over the input, verify output length;
 *                     zero: timed benchmark in blocks of aBlkLen[] samples
 * printDbg:           print per-algorithm timing / size details
 * printSpeed:         print the speed summary (or the benchmark table / CSV row)
 * abortFirstFastAlgo: run only the first eligible fast algorithm
 * printErrValues:     print (up to 6) individual samples exceeding the error limit
 * printAsCSV:         benchmark output as CSV instead of text
 * pIsFirstFilterLen:  in/out flag; when set, the CSV header is printed and the flag cleared
 *
 * Returns 0 on success, 1 when an algorithm returned a different output
 * length than the reference (only checked when testOutLen != 0).
 *
 * Algorithm index 0 ("R") is the reference for output length, error limit
 * and speed factor. NUMY is one larger than the 14 entries listed in the
 * tables below; the unlisted last slot is zero-initialised, so its
 * aRunAlgo[] entry is 0 and it is never run.
 */
int test(int FILTERLEN, int convFlags, const int testOutLen, int printDbg, int printSpeed, int abortFirstFastAlgo, int printErrValues, int printAsCSV, int *pIsFirstFilterLen) {
/* NOTE(review): td and tdref have no initial value; td is read below even when an algorithm's setup failed and it was never assigned */
double t0, t1, tstop, td, tdref;
float *X, *H;
float *Y[NUMY];
int64_t outN[NUMY];
/* 256 KFloats or 16 MFloats data */
#if 1
const int len = testOutLen ? (1 << 18) : (1 << 24);
#elif 0
const int len = testOutLen ? (1 << 18) : (1 << 13);
#else
const int len = testOutLen ? (1 << 18) : (1024);
#endif
/* floats per sample: 2 for interleaved complex data, else 1; lenC is the input length in samples */
const int cplxFactor = ( convFlags & PFFASTCONV_CPLX_INP_OUT ) ? 2 : 1;
const int lenC = len / cplxFactor;
int yi, yc, posMaxErr;
float yRangeMin, yRangeMax, yErrLimit, maxErr = 0.0;
int i, j, numErrOverLimit, iter;
int retErr = 0;
/* per-algorithm tables, indexed 0 .. 13: */
/* 0 1 2 3 4 5 6 7 8 9, 10, 11, 12, 13 */
pfnConvSetup aSetup[NUMY] = { convSetupRev, convSetupRev, convSetupRev, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, };
pfnConvDestroy aDestroy[NUMY] = { convDestroyRev, convDestroyRev, convDestroyRev, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, };
pfnGetConvFnPtr aGetFnPtr[NUMY] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, };
pfnConvolution aConv[NUMY] = { slow_conv_R, slow_conv_A, slow_conv_B, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, };
const char * convText[NUMY] = { "R(non-simd)", "A(non-simd)", "B(non-simd)", "fast_conv_64", "fast_conv_128", "fast_conv_256", "fast_conv_512", "fast_conv_1K", "fast_conv_2K", "fast_conv_4K", "fast_conv_8K", "fast_conv_16K", "fast_conv_32K", "fast_conv_64K", };
int aFastAlgo[NUMY] = { 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, };
void * aSetupCfg[NUMY] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, };
//int aBlkLen[NUMY] = { 1024, 1024, 1024, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, };
int aBlkLen[NUMY] = { 8192, 8192, 8192, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, };
/* a fast algorithm is only run when the filter is shorter than its block length */
#if 1
int aRunAlgo[NUMY] = { 1, 1, 1, FILTERLEN<64, FILTERLEN<128, FILTERLEN<256, FILTERLEN<512, FILTERLEN<1024, FILTERLEN<2048, FILTERLEN<4096, FILTERLEN<8192, FILTERLEN<16384, FILTERLEN<32768, FILTERLEN<65536, };
#elif 0
int aRunAlgo[NUMY] = { 1, 0, 0, 0 && FILTERLEN<64, 1 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048, 0 && FILTERLEN<4096, 0 && FILTERLEN<8192, 0 && FILTERLEN<16384, 0 && FILTERLEN<32768, 0 && FILTERLEN<65536, };
#else
int aRunAlgo[NUMY] = { 1, 1, 1, 0 && FILTERLEN<64, 0 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048, 0 && FILTERLEN<4096, 0 && FILTERLEN<8192, 0 && FILTERLEN<16384, 0 && FILTERLEN<32768, 0 && FILTERLEN<65536, };
#endif
double aSpeedFactor[NUMY], aDuration[NUMY], procSmpPerSec[NUMY];
int aNumIters[NUMY], aNumLoops[NUMY];
/* input buffer with 4 extra guard floats; allocation results are not checked */
X = pffastconv_malloc( (unsigned)(len+4) * sizeof(float) );
for ( i=0; i < NUMY; ++i)
{
/* '1 ||' makes every algorithm use its own output buffer */
if ( 1 || i < 2 )
Y[i] = pffastconv_malloc( (unsigned)len * sizeof(float) );
else
Y[i] = Y[1];
Y[i][0] = 123.F; /* test for pffft_zconvolve_no_accu() */
aSpeedFactor[i] = -1.0;
aDuration[i] = -1.0;
procSmpPerSec[i] = -1.0;
aNumIters[i] = 0;
aNumLoops[i] = 0;
}
H = pffastconv_malloc((unsigned)FILTERLEN * sizeof(float));
/* initialize input */
if ( convFlags & PFFASTCONV_CPLX_INP_OUT )
{
for ( i = 0; i < lenC; ++i )
{
X[2*i ] = (float)(i % 4093); /* 4093 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
X[2*i+1] = (float)((i+2048) % 4093);
}
}
else
{
for ( i = 0; i < len; ++i )
X[i] = (float)(i % 4093); /* 4093 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
}
/* guard values behind the input make reads beyond the end easier to notice */
X[ len ] = INVALID_FLOAT_VAL;
X[ len +1 ] = INVALID_FLOAT_VAL;
X[ len +2 ] = INVALID_FLOAT_VAL;
X[ len +3 ] = INVALID_FLOAT_VAL;
if (!testOutLen)
printFirst( X, "X", 64, 8 );
/* filter coeffs */
memset( H, 0, FILTERLEN * sizeof(float) );
#if 1
if ( convFlags & PFFASTCONV_SYMMETRIC )
{
/* mirrored pattern -1, 1, 0.5, ...; for odd FILTERLEN the centre tap stays 0 */
const int half = FILTERLEN / 2;
for ( j = 0; j < half; ++j ) {
switch (j % 3) {
case 0: H[j] = H[FILTERLEN-1-j] = -1.0F; break;
case 1: H[j] = H[FILTERLEN-1-j] = 1.0F; break;
case 2: H[j] = H[FILTERLEN-1-j] = 0.5F; break;
}
}
}
else
{
for ( j = 0; j < FILTERLEN; ++j ) {
switch (j % 3) {
case 0: H[j] = -1.0F; break;
case 1: H[j] = 1.0F; break;
case 2: H[j] = 0.5F; break;
}
}
}
#else
H[0] = 1.0F;
H[FILTERLEN -1] = 1.0F;
#endif
if (!testOutLen)
printFirst( H, "H", FILTERLEN, 8 );
if (!printAsCSV)
{
printf("\n");
printf("filterLen = %d\t%s%s\t%s:\n", FILTERLEN,
((convFlags & PFFASTCONV_CPLX_INP_OUT)?"cplx":"real"),
(convFlags & PFFASTCONV_CPLX_INP_OUT)?((convFlags & PFFASTCONV_CPLX_SINGLE_FFT)?" single":" 2x") : "",
((convFlags & PFFASTCONV_SYMMETRIC)?"symmetric":"non-sym") );
}
int hadFastAlgo = 0;
/* executed exactly once: the loop body ends with an unconditional break */
while (1)
{
/* pass 1: set up and run (or benchmark) every enabled algorithm */
for ( yi = 0; yi < NUMY; ++yi )
{
if (!aRunAlgo[yi])
continue;
if ( aFastAlgo[yi] && abortFirstFastAlgo && hadFastAlgo )
{
/* a fast algorithm already ran: disable all later ones */
aRunAlgo[yi] = 0;
continue;
}
hadFastAlgo = hadFastAlgo | aFastAlgo[yi];
aSetupCfg[yi] = aSetup[yi]( H, FILTERLEN, &aBlkLen[yi], convFlags );
/* get effective apply function ptr */
if ( aSetupCfg[yi] && aGetFnPtr[yi] )
aConv[yi] = aGetFnPtr[yi]( aSetupCfg[yi] );
if ( aSetupCfg[yi] && aConv[yi] )
{
if (testOutLen)
{
/* single call over the whole input; aDuration[yi] is not set in this mode */
t0 = uclock_sec();
outN[yi] = aConv[yi]( aSetupCfg[yi], X, lenC, Y[yi], Y[0], 1 /* applyFlush */ );
t1 = uclock_sec();
td = t1 - t0;
}
else
{
/* benchmark: process blocks of blkLen samples until the time budget is used up */
//const int blkLen = 4096; /* required for 'fast_conv_4K' */
const int blkLen = aBlkLen[yi];
int64_t offC = 0, offS, Nout;
int k;
iter = 0;
outN[yi] = 0;
aNumLoops[yi] = 1;
t0 = uclock_sec();
tstop = t0 + BENCH_TEST_DURATION_IN_SEC;
do
{
const int prev_iter = iter;
/* the clock is only read once per batch of up to 128 blocks */
for ( k = 0; k < 128 && offC +blkLen < lenC; ++k )
{
offS = cplxFactor * offC;
Nout = aConv[yi]( aSetupCfg[yi], X +offS, blkLen, Y[yi] +offS, Y[0], 0 /* applyFlush */ );
offC += Nout;
++iter;
if ( !Nout )
break;
}
//if ( !Nout )
// break;
t1 = uclock_sec();
if ( prev_iter == iter ) // restart from begin of input?
{
offC = 0;
++aNumLoops[yi];
}
} while ( t1 < tstop );
/* only the offset within the last pass over the input is recorded here */
outN[yi] = offC;
td = t1 - t0;
procSmpPerSec[yi] = cplxFactor * (double)outN[yi] * (1.0 / td);
aNumIters[yi] = iter;
aDuration[yi] = td;
//printf("algo '%s':\t%.2f MSmp\tin\t%.1f ms\t= %g kSmpPerSec\t%d iters\t%.1f ms\n",
// convText[yi], (double)outN[yi]/(1000.0 * 1000.0), 1000.0 * aDuration[yi], procSmpPerSec[yi] * 0.001, aNumIters[yi], 1000.0 * td );
}
}
else
{
/* setup failed or no convolution function: mark as 'not implemented' */
outN[yi] = 0;
}
if ( yi == 0 ) {
/* reference algorithm: remember its duration and derive the error limit from its output range */
const float * Yvals = Y[0];
const int64_t refOutLen = cplxFactor * outN[0];
tdref = td;
if (printDbg) {
printf("convolution '%s' took: %f ms\n", convText[yi], td*1000.0);
printf(" convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
}
aSpeedFactor[yi] = 1.0;
/* */
yRangeMin = FLT_MAX;
/* NOTE(review): FLT_MIN is the smallest positive float, not the most negative one; -FLT_MAX looks intended for a running maximum */
yRangeMax = FLT_MIN;
for ( i = 0; i < refOutLen; ++i )
{
if ( yRangeMax < Yvals[i] ) yRangeMax = Yvals[i];
if ( yRangeMin > Yvals[i] ) yRangeMin = Yvals[i];
}
/* error limit: 1e-5 of the reference output range */
yErrLimit = fabsf(yRangeMax - yRangeMin) / ( 100.0F * 1000.0F );
/* yErrLimit = 0.01F; */
if (testOutLen) {
if (1) {
printf("reference output len = %" PRId64 " smp\n", outN[0]);
printf("reference output range |%.1f ..%.1f| = %.1f ==> err limit = %f\n", yRangeMin, yRangeMax, yRangeMax - yRangeMin, yErrLimit);
}
printFirst( Yvals, "Yref", 64, 8 );
}
}
else
{
/* speed relative to the reference; > 1 means faster than algorithm 0 */
aSpeedFactor[yi] = tdref / td;
if (printDbg) {
printf("\nconvolution '%s' took: %f ms == %f %% == %f X\n", convText[yi], td*1000.0, td * 100 / tdref, tdref / td);
printf(" convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
}
}
}
/* pass 2: find the fastest slow / fast algorithm and print the speed summary */
int iMaxSpeedSlowAlgo = -1;
int iFirstFastAlgo = -1;
int iMaxSpeedFastAlgo = -1;
int iPrintedRefOutLen = 0;
{
for ( yc = 1; yc < NUMY; ++yc )
{
if (!aRunAlgo[yc])
continue;
if (aFastAlgo[yc]) {
if ( iMaxSpeedFastAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedFastAlgo] )
iMaxSpeedFastAlgo = yc;
if (iFirstFastAlgo < 0)
iFirstFastAlgo = yc;
}
else
{
if ( iMaxSpeedSlowAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedSlowAlgo] )
iMaxSpeedSlowAlgo = yc;
}
}
if (printSpeed)
{
if (testOutLen)
{
/* NOTE(review): aDuration[] is only filled in benchmark mode, so the durations printed in this branch are the initial -1.0 (shown as -1000 ms) */
if (iMaxSpeedSlowAlgo >= 0 )
printf("fastest slow algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedSlowAlgo], aSpeedFactor[iMaxSpeedSlowAlgo], 1000.0 * aDuration[iMaxSpeedSlowAlgo]);
if (0 != iMaxSpeedSlowAlgo && aRunAlgo[0])
printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[0], aSpeedFactor[0], 1000.0 * aDuration[0]);
if (1 != iMaxSpeedSlowAlgo && aRunAlgo[1])
printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[1], aSpeedFactor[1], 1000.0 * aDuration[1]);
if (iFirstFastAlgo >= 0 && iFirstFastAlgo != iMaxSpeedFastAlgo && aRunAlgo[iFirstFastAlgo])
printf("first fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo], aSpeedFactor[iFirstFastAlgo], 1000.0 * aDuration[iFirstFastAlgo]);
if (iFirstFastAlgo >= 0 && iFirstFastAlgo+1 != iMaxSpeedFastAlgo && iFirstFastAlgo+1 < NUMY && aRunAlgo[iFirstFastAlgo+1])
printf("2nd fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo+1], aSpeedFactor[iFirstFastAlgo+1], 1000.0 * aDuration[iFirstFastAlgo+1]);
if ( 0 <= iMaxSpeedFastAlgo && iMaxSpeedFastAlgo < NUMY && aRunAlgo[iMaxSpeedFastAlgo] )
{
printf("fastest fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedFastAlgo], aSpeedFactor[iMaxSpeedFastAlgo], 1000.0 * aDuration[iMaxSpeedFastAlgo]);
if ( 0 <= iMaxSpeedSlowAlgo && iMaxSpeedSlowAlgo < NUMY && aRunAlgo[iMaxSpeedSlowAlgo] )
printf("fast / slow ratio: %f X\n", aSpeedFactor[iMaxSpeedFastAlgo] / aSpeedFactor[iMaxSpeedSlowAlgo] );
}
printf("\n");
}
else
{
// print columns in 1st line
if (printAsCSV && *pIsFirstFilterLen)
{
printf("\n# filterLen, filterOrder, Re/Cx, type, sym, ");
for ( yc = 0; yc < NUMY; ++yc )
{
if (!aRunAlgo[yc] || procSmpPerSec[yc] <= 0.0)
continue;
if (printAsCSV)
printf("%s, ", convText[yc]);
}
*pIsFirstFilterLen = 0;
}
/* one row per filter length: row prefix first, then one throughput value (kSmp/s) per algorithm */
for ( yc = 0; yc < NUMY; ++yc )
{
if (!yc)
{
/* filterExp: log2 of the filter length */
double filterExp = log10((double)FILTERLEN) / log10(2.0);
printf("\n%5d, %5.1f, %s, %s, %s, ", FILTERLEN, filterExp,
((convFlags & PFFASTCONV_CPLX_INP_OUT)?"cplx":"real"),
(convFlags & PFFASTCONV_CPLX_INP_OUT)?((convFlags & PFFASTCONV_CPLX_SINGLE_FFT)?" single":" 2x") : "",
((convFlags & PFFASTCONV_SYMMETRIC)?"symmetric":"non-sym")
);
}
if (!aRunAlgo[yc] || procSmpPerSec[yc] <= 0.0)
continue;
if (printAsCSV)
printf("%.0f, ", procSmpPerSec[yc] * 0.001);
else
printf("algo '%s':\t%.2f MSmp\tin\t%.1f ms\t= %g kSmpPerSec\t%d iters\t%d loops\n",
convText[yc], (double)outN[yc]/(1000.0 * 1000.0), 1000.0 * aDuration[yc], procSmpPerSec[yc] * 0.001, aNumIters[yc], aNumLoops[yc] );
}
}
}
}
/* pass 3: compare every algorithm's output against the reference Y[0] */
for ( yc = 1; yc < NUMY; ++yc )
{
const float * Yref;
const float * Ycurr;
int outMin;
if (!aRunAlgo[yc])
continue;
if (printDbg)
printf("\n");
if ( outN[yc] == 0 )
{
if (!printAsCSV)
printf("output size 0: '%s' not implemented\n", convText[yc]);
}
else if ( outN[0] != outN[yc] /* && aFastAlgo[yc] */ && testOutLen )
{
/* a differing output length is the only condition that makes test() fail */
if (!iPrintedRefOutLen)
{
printf("reference output size = %" PRId64 ", delta to (cplx) input length = %" PRId64 " smp\n", outN[0], (len / cplxFactor) - outN[0]);
iPrintedRefOutLen = 1;
}
printf("output size doesn't match!: ref (FILTERLEN %d) returned %" PRId64 " smp, '%s' returned %" PRId64 " smp : delta = %" PRId64 " smp\n",
FILTERLEN, outN[0], convText[yc], outN[yc], outN[yc] - outN[0] );
retErr = 1;
}
posMaxErr = 0;
maxErr = -1.0;
Yref = Y[0];
Ycurr = Y[yc];
/* compare only the part both outputs have in common; outMin counts samples, so for complex data only the first half of the floats is compared */
outMin = ( outN[yc] < outN[0] ) ? outN[yc] : outN[0];
numErrOverLimit = 0;
for ( i = 0; i < outMin; ++i )
{
/* value errors are only reported (at most 6 per algorithm); they do not change retErr */
if ( numErrOverLimit < 6 && fabs(Ycurr[i] - Yref[i]) >= yErrLimit && printErrValues )
{
printf("algo '%s': at %d: ***ERROR*** = %f, errLimit = %f, ref = %f, actual = %f\n",
convText[yc], i, fabs(Ycurr[i] - Yref[i]), yErrLimit, Yref[i], Ycurr[i] );
++numErrOverLimit;
}
if ( fabs(Ycurr[i] - Yref[i]) > maxErr )
{
maxErr = fabsf(Ycurr[i] - Yref[i]);
posMaxErr = i;
}
}
/* NOTE(review): the algorithm indices are compared against the sample index i (which is outMin, or 0, after the loop), not against yc -- 'yc' looks intended; confirm before changing, since fixing it adds output lines */
if ( printDbg || (iMaxSpeedSlowAlgo == i) || (iMaxSpeedFastAlgo == i) )
printf("max difference for '%s' is %g at sample idx %d of max inp 4093-1 == %f %%\n", convText[yc], maxErr, posMaxErr, maxErr * 100.0 / 4092.0 );
}
break;
}
/* cleanup: buffers first, then the setups of all algorithms still marked as enabled */
pffastconv_free(X);
for ( i=0; i < NUMY; ++i)
{
if ( 1 || i < 2 )
pffastconv_free( Y[i] );
if (!aRunAlgo[i])
continue;
/* NOTE(review): aSetupCfg[i] is NULL when the setup failed; the destroy functions are called with it regardless */
aDestroy[i]( aSetupCfg[i] );
}
pffastconv_free(H);
return retErr;
}
/* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions */
void validate_pffft_simd();
int validate_pffft_simd_ex(FILE * DbgOut);
/*
 * Command line driver for the convolution tests.
 *
 * Options:
 *   --test-simd   run validate_pffft_simd_ex() only and exit (1 on errors)
 *   --no-len      skip the output-length tests
 *   --no-bench    skip the benchmarks
 *   --quick       reduced set of filter lengths
 *   --slow        skip the short filter lengths in the benchmark
 *   --real        real-valued data only
 *   --cplx        complex data only
 *   --sym         use symmetric filters
 * Any other argument prints the usage line and exits with status 1.
 *
 * Returns the OR-ed results of all test() calls (0 on success).
 */
int main(int argc, char **argv)
{
int result = 0;
int i, k, M, flagsA, flagsB, flagsC, testOutLen, printDbg, printSpeed;
int testOutLens = 1, benchConv = 1, quickTest = 0, slowTest = 0;
/* abortFirstFastAlgo, printErrValues and printAsCSV are fixed here; there is no command line option for them */
int testReal = 1, testCplx = 1, testSymetric = 0, abortFirstFastAlgo = 1, printErrValues = 0, printAsCSV = 1;
int isFirstFilterLen = 1;
for ( i = 1; i < argc; ++i ) {
if (!strcmp(argv[i], "--test-simd")) {
int numErrs = validate_pffft_simd_ex(stdout);
fprintf( ( numErrs != 0 ? stderr : stdout ), "validate_pffft_simd_ex() returned %d errors!\n", numErrs);
return ( numErrs > 0 ? 1 : 0 );
}
if (!strcmp(argv[i], "--no-len")) {
testOutLens = 0;
}
else if (!strcmp(argv[i], "--no-bench")) {
benchConv = 0;
}
else if (!strcmp(argv[i], "--quick")) {
quickTest = 1;
}
else if (!strcmp(argv[i], "--slow")) {
slowTest = 1;
}
else if (!strcmp(argv[i], "--real")) {
testCplx = 0;
}
else if (!strcmp(argv[i], "--cplx")) {
testReal = 0;
}
else if (!strcmp(argv[i], "--sym")) {
testSymetric = 1;
}
else /* if (!strcmp(argv[i], "--help")) */ {
printf("usage: %s [--test-simd] [--no-len] [--no-bench] [--quick|--slow] [--real|--cplx] [--sym]\n", argv[0]);
exit(1);
}
}
/* part 1: output-length tests; k = 0: real, k = 1: complex "2x", k = 2: complex "single" */
if (testOutLens)
{
for ( k = 0; k < 3; ++k )
{
if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
continue;
printf("\n\n==========\n");
printf("testing %s %s output lengths ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
printf("==========\n");
flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
/* NOTE(review): flagsC is computed but test() below is called with flagsB, so the
k == 1 ("2x") and k == 2 ("single") passes run with identical flags; the
benchmark part below uses flagsC with a 'k == 2' condition -- confirm which is intended */
flagsC = flagsB | PFFASTCONV_CPLX_SINGLE_FFT;
testOutLen = 1;
printDbg = 0;
printSpeed = 0;
for ( M = 128 - 4; M <= (quickTest ? 128+16 : 256); ++M )
{
/* symmetric filters are only tested for lengths that are multiples of 16 */
if ( (M % 16) != 0 && testSymetric )
continue;
result |= test(M, flagsB, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, 0, &isFirstFilterLen);
}
}
}
/* part 2: benchmarks over a fixed list of filter lengths */
if (benchConv)
{
printf("quickTest is %d\n", quickTest);
printf("slowTest is %d\n", slowTest);
for ( k = 0; k < 3; ++k )
{
if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
continue;
if (!printAsCSV)
{
printf("\n\n==========\n");
printf("starting %s %s benchmark against linear convolutions ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
printf("==========\n");
}
flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
/* the single-FFT variant is only selected for k == 2 */
flagsC = flagsB | ( k == 2 ? PFFASTCONV_CPLX_SINGLE_FFT : 0 );
testOutLen = 0;
printDbg = 0;
printSpeed = 1;
/* --slow skips lengths up to 128; --quick keeps only length 64 */
if (!slowTest) {
if (!quickTest) {
result |= test(32, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
result |= test(32 + 16, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
}
result |= test(64, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
if (!quickTest) {
result |= test(64 + 32, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
result |= test(128, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
}
}
if (!quickTest) {
result |= test(128+ 64, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
result |= test(256, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
result |= test(256+128, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
result |= test(512, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
result |= test(1024, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
result |= test(2048, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
result |= test(4096, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
result |= test(8192, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
result |= test(16384, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
result |= test(32768, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
}
if (printAsCSV)
printf("\n");
}
}
return result;
}

371
pffft/test_pffft.c Normal file
View File

@@ -0,0 +1,371 @@
/*
Copyright (c) 2013 Julien Pommier.
Small test for PFFFT
How to build:
on linux, with fftw3:
gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
on macos, without fftw3:
clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
on macos, with fftw3:
clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate
as alternative: replace clang by gcc.
on windows, with visual c++:
cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
build without SIMD instructions:
gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm
*/
#ifdef PFFFT_ENABLE_FLOAT
#include "pffft.h"
typedef float pffft_scalar;
#else
/*
Note: adapted for double precision dynamic range version.
*/
#include "pffft_double.h"
typedef double pffft_scalar;
#endif
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <assert.h>
#include <string.h>
/* define own constants required to turn off g++ extensions .. */
#ifndef M_PI
#define M_PI 3.14159265358979323846 /* pi */
#endif
/* EXPECTED_DYN_RANGE in dB:
* single precision float has 24 bits mantissa
* => 24 Bits * 6 dB = 144 dB
* allow a few dB tolerance (even 144 dB looks good on my PC)
*/
#ifdef PFFFT_ENABLE_FLOAT
#define EXPECTED_DYN_RANGE 140.0
#else
#define EXPECTED_DYN_RANGE 215.0
#endif
/* maximum allowed phase error in degree */
#define DEG_ERR_LIMIT 1E-4
/* maximum allowed magnitude error in amplitude (of 1.0 or 1.1) */
#define MAG_ERR_LIMIT 1E-6
#define PRINT_SPEC 0
#define PWR2LOG(PWR) ( (PWR) < 1E-30 ? 10.0*log10(1E-30) : 10.0*log10(PWR) )
int test(int N, int cplx, int useOrdered) {
int Nfloat = (cplx ? N*2 : N);
#ifdef PFFFT_ENABLE_FLOAT
pffft_scalar *X = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
pffft_scalar *Y = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
pffft_scalar *R = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
pffft_scalar *Z = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
pffft_scalar *W = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
#else
pffft_scalar *X = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
pffft_scalar *Y = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
pffft_scalar *R = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
pffft_scalar *Z = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
pffft_scalar *W = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
#endif
pffft_scalar amp = (pffft_scalar)1.0;
double freq, dPhi, phi, phi0;
double pwr, pwrCar, pwrOther, err, errSum, mag, expextedMag;
int k, j, m, iter, kmaxOther, retError = 0;
#ifdef PFFFT_ENABLE_FLOAT
assert( pffft_is_power_of_two(N) );
PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
#else
assert( pffftd_is_power_of_two(N) );
PFFFTD_Setup *s = pffftd_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
#endif
assert(s);
if (!s) {
printf("Error setting up PFFFT!\n");
return 1;
}
for ( k = m = 0; k < (cplx? N : (1 + N/2) ); k += N/16, ++m )
{
amp = (pffft_scalar)( ( (m % 3) == 0 ) ? 1.0 : 1.1 );
freq = (k < N/2) ? ((double)k / N) : ((double)(k-N) / N);
dPhi = 2.0 * M_PI * freq;
if ( dPhi < 0.0 )
dPhi += 2.0 * M_PI;
iter = -1;
while (1)
{
++iter;
if (iter)
printf("bin %d: dphi = %f for freq %f\n", k, dPhi, freq);
/* generate cosine carrier as time signal - start at defined phase phi0 */
phi = phi0 = (m % 4) * 0.125 * M_PI; /* have phi0 < 90 deg to be normalized */
for ( j = 0; j < N; ++j )
{
if (cplx) {
X[2*j] = amp * (pffft_scalar)cos(phi); /* real part */
X[2*j+1] = amp * (pffft_scalar)sin(phi); /* imag part */
}
else
X[j] = amp * (pffft_scalar)cos(phi); /* only real part */
/* phase increment .. stay normalized - cos()/sin() might degrade! */
phi += dPhi;
if ( phi >= M_PI )
phi -= 2.0 * M_PI;
}
/* forward transform from X --> Y .. using work buffer W */
#ifdef PFFFT_ENABLE_FLOAT
if ( useOrdered )
pffft_transform_ordered(s, X, Y, W, PFFFT_FORWARD );
else
{
pffft_transform(s, X, R, W, PFFFT_FORWARD ); /* use R for reordering */
pffft_zreorder(s, R, Y, PFFFT_FORWARD ); /* reorder into Y[] for power calculations */
}
#else
if ( useOrdered )
pffftd_transform_ordered(s, X, Y, W, PFFFT_FORWARD );
else
{
pffftd_transform(s, X, R, W, PFFFT_FORWARD ); /* use R for reordering */
pffftd_zreorder(s, R, Y, PFFFT_FORWARD ); /* reorder into Y[] for power calculations */
}
#endif
pwrOther = -1.0;
pwrCar = 0;
/* for positive frequencies: 0 to 0.5 * samplerate */
/* and also for negative frequencies: -0.5 * samplerate to 0 */
for ( j = 0; j < ( cplx ? N : (1 + N/2) ); ++j )
{
if (!cplx && !j) /* special treatment for DC for real input */
pwr = Y[j]*Y[j];
else if (!cplx && j == N/2) /* treat 0.5 * samplerate */
pwr = Y[1] * Y[1]; /* despite j (for freq calculation) we have index 1 */
else
pwr = Y[2*j] * Y[2*j] + Y[2*j+1] * Y[2*j+1];
if (iter || PRINT_SPEC)
printf("%s fft %d: pwr[j = %d] = %g == %f dB\n", (cplx ? "cplx":"real"), N, j, pwr, PWR2LOG(pwr) );
if (k == j)
pwrCar = pwr;
else if ( pwr > pwrOther ) {
pwrOther = pwr;
kmaxOther = j;
}
}
if ( PWR2LOG(pwrCar) - PWR2LOG(pwrOther) < EXPECTED_DYN_RANGE ) {
printf("%s fft %d amp %f iter %d:\n", (cplx ? "cplx":"real"), N, amp, iter);
printf(" carrier power at bin %d: %g == %f dB\n", k, pwrCar, PWR2LOG(pwrCar) );
printf(" carrier mag || at bin %d: %g\n", k, sqrt(pwrCar) );
printf(" max other pwr at bin %d: %g == %f dB\n", kmaxOther, pwrOther, PWR2LOG(pwrOther) );
printf(" dynamic range: %f dB\n\n", PWR2LOG(pwrCar) - PWR2LOG(pwrOther) );
retError = 1;
if ( iter == 0 )
continue;
}
if ( k > 0 && k != N/2 )
{
phi = atan2( Y[2*k+1], Y[2*k] );
if ( fabs( phi - phi0) > DEG_ERR_LIMIT * M_PI / 180.0 )
{
retError = 1;
printf("%s fft %d bin %d amp %f : phase mismatch! phase = %f deg expected = %f deg\n",
(cplx ? "cplx":"real"), N, k, amp, phi * 180.0 / M_PI, phi0 * 180.0 / M_PI );
}
}
expextedMag = cplx ? amp : ( (k == 0 || k == N/2) ? amp : (amp/2) );
mag = sqrt(pwrCar) / N;
if ( fabs(mag - expextedMag) > MAG_ERR_LIMIT )
{
retError = 1;
printf("%s fft %d bin %d amp %f : mag = %g expected = %g\n", (cplx ? "cplx":"real"), N, k, amp, mag, expextedMag );
}
/* now convert spectrum back */
#ifdef PFFFT_ENABLE_FLOAT
if (useOrdered)
pffft_transform_ordered(s, Y, Z, W, PFFFT_BACKWARD);
else
pffft_transform(s, R, Z, W, PFFFT_BACKWARD);
#else
if (useOrdered)
pffftd_transform_ordered(s, Y, Z, W, PFFFT_BACKWARD);
else
pffftd_transform(s, R, Z, W, PFFFT_BACKWARD);
#endif
errSum = 0.0;
for ( j = 0; j < (cplx ? (2*N) : N); ++j )
{
/* scale back */
Z[j] /= N;
/* square sum errors over real (and imag parts) */
err = (X[j]-Z[j]) * (X[j]-Z[j]);
errSum += err;
}
if ( errSum > N * 1E-7 )
{
retError = 1;
printf("%s fft %d bin %d : inverse FFT doesn't match original signal! errSum = %g ; mean err = %g\n", (cplx ? "cplx":"real"), N, k, errSum, errSum / N);
}
break;
}
}
#ifdef PFFFT_ENABLE_FLOAT
pffft_destroy_setup(s);
pffft_aligned_free(X);
pffft_aligned_free(Y);
pffft_aligned_free(Z);
pffft_aligned_free(R);
pffft_aligned_free(W);
#else
pffftd_destroy_setup(s);
pffftd_aligned_free(X);
pffftd_aligned_free(Y);
pffftd_aligned_free(Z);
pffftd_aligned_free(R);
pffftd_aligned_free(W);
#endif
return retError;
}
/* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions */
/* Declared here by hand; main() below calls only the *_ex() variants, for its
 * "--test-simd" option: validate_pffft_simd_ex() in the float build and
 * validate_pffftd_simd_ex() in the double build. The int result is treated
 * there as a number of errors; DbgOut receives stdout.
 * NOTE(review): presumably DbgOut is where the diagnostics are written -
 * confirm against the definitions. */
void validate_pffft_simd();
int validate_pffft_simd_ex(FILE * DbgOut);
void validate_pffftd_simd();
int validate_pffftd_simd_ex(FILE * DbgOut);
/*
 * Test driver.
 *
 * With the command line option "--test-simd" only the SIMD validation of
 * the library is run and its error count decides the exit code.
 *
 * Otherwise the power-of-two helper functions are checked against a
 * reference table, followed by test() for all power-of-two lengths from
 * 32 to 65536 in all four combinations of complex/real transform and
 * ordered/unordered API.
 *
 * returns 0 when everything succeeded, non-zero otherwise.
 */
int main(int argc, char **argv)
{
  int N, result, resN, resAll, i, resNextPw2, resIsPw2, resFFT;
  size_t k;
  int inp_power_of_two[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 511, 512, 513 };
  int ref_power_of_two[] = { 1, 2, 4, 4, 8, 8, 8, 8, 16, 512, 512, 1024 };
  /* size_t count/index: avoids a signed/unsigned comparison in the loop below */
  const size_t numPw2Cases = sizeof(inp_power_of_two) / sizeof(inp_power_of_two[0]);

  for ( i = 1; i < argc; ++i ) {
    if (!strcmp(argv[i], "--test-simd")) {
#ifdef PFFFT_ENABLE_FLOAT
      int numErrs = validate_pffft_simd_ex(stdout);
#else
      int numErrs = validate_pffftd_simd_ex(stdout);
#endif
      fprintf( ( numErrs != 0 ? stderr : stdout ), "validate_pffft_simd_ex() returned %d errors!\n", numErrs);
      /* same condition as for the output stream: any non-zero count is a failure */
      return ( numErrs != 0 ? 1 : 0 );
    }
  }

  resNextPw2 = 0;
  resIsPw2 = 0;
  for ( k = 0; k < numPw2Cases; ++k) {
#ifdef PFFFT_ENABLE_FLOAT
    N = pffft_next_power_of_two(inp_power_of_two[k]);
#else
    N = pffftd_next_power_of_two(inp_power_of_two[k]);
#endif
    if (N != ref_power_of_two[k]) {
      resNextPw2 = 1;
      printf("pffft_next_power_of_two(%d) does deliver %d, which is not reference result %d!\n",
             inp_power_of_two[k], N, ref_power_of_two[k] );
    }

#ifdef PFFFT_ENABLE_FLOAT
    result = pffft_is_power_of_two(inp_power_of_two[k]);
#else
    result = pffftd_is_power_of_two(inp_power_of_two[k]);
#endif
    /* an input is a power of two exactly when it equals its own reference result */
    if (inp_power_of_two[k] == ref_power_of_two[k]) {
      if (!result) {
        resIsPw2 = 1;
        printf("pffft_is_power_of_two(%d) delivers false; expected true!\n", inp_power_of_two[k]);
      }
    } else {
      if (result) {
        resIsPw2 = 1;
        printf("pffft_is_power_of_two(%d) delivers true; expected false!\n", inp_power_of_two[k]);
      }
    }
  }
  if (!resNextPw2)
    printf("tests for pffft_next_power_of_two() succeeded successfully.\n");
  if (!resIsPw2)
    printf("tests for pffft_is_power_of_two() succeeded successfully.\n");

  resFFT = 0;
  for ( N = 32; N <= 65536; N *= 2 )
  {
    result = test(N, 1 /* cplx fft */, 1 /* useOrdered */);
    resN = result;
    resFFT |= result;

    result = test(N, 0 /* real fft */, 1 /* useOrdered */);
    resN |= result;
    resFFT |= result;

    result = test(N, 1 /* cplx fft */, 0 /* unordered */);
    resN |= result;
    resFFT |= result;

    result = test(N, 0 /* real fft */, 0 /* unordered */);
    resN |= result;
    resFFT |= result;

    if (!resN)
      printf("tests for size %d succeeded successfully.\n", N);
  }

  if (!resFFT) {
#ifdef PFFFT_ENABLE_FLOAT
    printf("all pffft transform tests (FORWARD/BACKWARD, REAL/COMPLEX, float) succeeded successfully.\n");
#else
    printf("all pffft transform tests (FORWARD/BACKWARD, REAL/COMPLEX, double) succeeded successfully.\n");
#endif
  }

  resAll = resNextPw2 | resIsPw2 | resFFT;
  if (!resAll)
    printf("all tests succeeded successfully.\n");
  else
    printf("there are failed tests!\n");

  return resAll;
}

377
pffft/test_pffft.cpp Normal file
View File

@@ -0,0 +1,377 @@
/*
Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
Small test & bench for PFFFT, comparing its performance with the scalar
FFTPACK, FFTW, and Apple vDSP
How to build:
on linux, with fftw3:
gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c
test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
on macos, without fftw3:
clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c
-L/usr/local/lib -I/usr/local/include/ -framework Accelerate
on macos, with fftw3:
clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c
test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f
-framework Accelerate
as alternative: replace clang by gcc.
on windows, with visual c++:
cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
build without SIMD instructions:
gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c
fftpack.c -lm
*/
#include "pffft.hpp"
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* define own constants required to turn off g++ extensions .. */
#ifndef M_PI
#define M_PI 3.14159265358979323846 /* pi */
#endif
/* maximum allowed phase error in degree */
#define DEG_ERR_LIMIT 1E-4
/* maximum allowed magnitude error in amplitude (of 1.0 or 1.1) */
#define MAG_ERR_LIMIT 1E-6
#define PRINT_SPEC 0
#define PWR2LOG(PWR) ((PWR) < 1E-30 ? 10.0 * log10(1E-30) : 10.0 * log10(PWR))
template<typename T>
bool
Ttest(int N, bool useOrdered)
{
typedef pffft::Fft<T> Fft;
typedef typename pffft::Fft<T>::Scalar FftScalar;
typedef typename Fft::Complex FftComplex;
const bool cplx = pffft::Fft<T>::isComplexTransform();
const double EXPECTED_DYN_RANGE = Fft::isDoubleScalar() ? 215.0 : 140.0;
assert(Fft::isPowerOfTwo(N));
Fft fft = Fft(N); // instantiate and prepareLength() for length N
#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900)
// possible ways to declare/instatiate aligned vectors with C++11
// some lines require a typedef of above
auto X = fft.valueVector(); // for X = input vector
pffft::AlignedVector<typename Fft::Complex> Y = fft.spectrumVector(); // for Y = forward(X)
pffft::AlignedVector<FftScalar> R = fft.internalLayoutVector(); // for R = forwardInternalLayout(X)
pffft::AlignedVector<T> Z = fft.valueVector(); // for Z = inverse(Y) = inverse( forward(X) )
// or Z = inverseInternalLayout(R)
#else
// possible ways to declare/instatiate aligned vectors with C++98
pffft::AlignedVector<T> X = fft.valueVector(); // for X = input vector
pffft::AlignedVector<FftComplex> Y = fft.spectrumVector(); // for Y = forward(X)
pffft::AlignedVector<typename Fft::Scalar> R = fft.internalLayoutVector(); // for R = forwardInternalLayout(X)
pffft::AlignedVector<T> Z = fft.valueVector(); // for Z = inverse(Y) = inverse( forward(X) )
// or Z = inverseInternalLayout(R)
#endif
// work with complex - without the capabilities of a higher c++ standard
FftScalar* Xs = reinterpret_cast<FftScalar*>(X.data()); // for X = input vector
FftScalar* Ys = reinterpret_cast<FftScalar*>(Y.data()); // for Y = forward(X)
FftScalar* Zs = reinterpret_cast<FftScalar*>(Z.data()); // for Z = inverse(Y) = inverse( forward(X) )
int k, j, m, iter, kmaxOther;
bool retError = false;
double freq, dPhi, phi, phi0;
double pwr, pwrCar, pwrOther, err, errSum, mag, expextedMag;
double amp = 1.0;
for (k = m = 0; k < (cplx ? N : (1 + N / 2)); k += N / 16, ++m) {
amp = ((m % 3) == 0) ? 1.0F : 1.1F;
freq = (k < N / 2) ? ((double)k / N) : ((double)(k - N) / N);
dPhi = 2.0 * M_PI * freq;
if (dPhi < 0.0)
dPhi += 2.0 * M_PI;
iter = -1;
while (1) {
++iter;
if (iter)
printf("bin %d: dphi = %f for freq %f\n", k, dPhi, freq);
/* generate cosine carrier as time signal - start at defined phase phi0 */
phi = phi0 =
(m % 4) * 0.125 * M_PI; /* have phi0 < 90 deg to be normalized */
for (j = 0; j < N; ++j) {
if (cplx) {
Xs[2 * j] = (FftScalar)( amp * cos(phi) ); /* real part */
Xs[2 * j + 1] = (FftScalar)( amp * sin(phi) ); /* imag part */
} else
Xs[j] = (FftScalar)( amp * cos(phi) ); /* only real part */
/* phase increment .. stay normalized - cos()/sin() might degrade! */
phi += dPhi;
if (phi >= M_PI)
phi -= 2.0 * M_PI;
}
/* forward transform from X --> Y .. using work buffer W */
if (useOrdered)
fft.forward(X, Y);
else {
fft.forwardToInternalLayout(X, R); /* use R for reordering */
fft.reorderSpectrum(R, Y); /* have canonical order in Y[] for power calculations */
}
pwrOther = -1.0;
pwrCar = 0;
/* for positive frequencies: 0 to 0.5 * samplerate */
/* and also for negative frequencies: -0.5 * samplerate to 0 */
for (j = 0; j < (cplx ? N : (1 + N / 2)); ++j) {
if (!cplx && !j) /* special treatment for DC for real input */
pwr = Ys[j] * Ys[j];
else if (!cplx && j == N / 2) /* treat 0.5 * samplerate */
pwr = Ys[1] *
Ys[1]; /* despite j (for freq calculation) we have index 1 */
else
pwr = Ys[2 * j] * Ys[2 * j] + Ys[2 * j + 1] * Ys[2 * j + 1];
if (iter || PRINT_SPEC)
printf("%s fft %d: pwr[j = %d] = %g == %f dB\n",
(cplx ? "cplx" : "real"),
N,
j,
pwr,
PWR2LOG(pwr));
if (k == j)
pwrCar = pwr;
else if (pwr > pwrOther) {
pwrOther = pwr;
kmaxOther = j;
}
}
if (PWR2LOG(pwrCar) - PWR2LOG(pwrOther) < EXPECTED_DYN_RANGE) {
printf("%s fft %d amp %f iter %d:\n",
(cplx ? "cplx" : "real"),
N,
amp,
iter);
printf(" carrier power at bin %d: %g == %f dB\n",
k,
pwrCar,
PWR2LOG(pwrCar));
printf(" carrier mag || at bin %d: %g\n", k, sqrt(pwrCar));
printf(" max other pwr at bin %d: %g == %f dB\n",
kmaxOther,
pwrOther,
PWR2LOG(pwrOther));
printf(" dynamic range: %f dB\n\n",
PWR2LOG(pwrCar) - PWR2LOG(pwrOther));
retError = true;
if (iter == 0)
continue;
}
if (k > 0 && k != N / 2) {
phi = atan2(Ys[2 * k + 1], Ys[2 * k]);
if (fabs(phi - phi0) > DEG_ERR_LIMIT * M_PI / 180.0) {
retError = true;
printf("%s fft %d bin %d amp %f : phase mismatch! phase = %f deg "
"expected = %f deg\n",
(cplx ? "cplx" : "real"),
N,
k,
amp,
phi * 180.0 / M_PI,
phi0 * 180.0 / M_PI);
}
}
expextedMag = cplx ? amp : ((k == 0 || k == N / 2) ? amp : (amp / 2));
mag = sqrt(pwrCar) / N;
if (fabs(mag - expextedMag) > MAG_ERR_LIMIT) {
retError = true;
printf("%s fft %d bin %d amp %f : mag = %g expected = %g\n",
(cplx ? "cplx" : "real"),
N,
k,
amp,
mag,
expextedMag);
}
/* now convert spectrum back */
if (useOrdered)
fft.inverse(Y, Z);
else
fft.inverseFromInternalLayout(R, Z); /* inverse() from internal Layout */
errSum = 0.0;
for (j = 0; j < (cplx ? (2 * N) : N); ++j) {
/* scale back */
Zs[j] /= N;
/* square sum errors over real (and imag parts) */
err = (Xs[j] - Zs[j]) * (Xs[j] - Zs[j]);
errSum += err;
}
if (errSum > N * 1E-7) {
retError = true;
printf("%s fft %d bin %d : inverse FFT doesn't match original signal! "
"errSum = %g ; mean err = %g\n",
(cplx ? "cplx" : "real"),
N,
k,
errSum,
errSum / N);
}
break;
}
}
// using the std::vector<> base classes .. no need for alignedFree() for X, Y, Z and R
return retError;
}
/*
 * Runs Ttest<>() for every scalar precision that is compiled in
 * (PFFFT_ENABLE_FLOAT and/or PFFFT_ENABLE_DOUBLE; at least one of them is
 * expected to be defined).
 *
 * N           transform length (power of two)
 * useComplex  true: complex transform; false: real transform
 * useOrdered  forwarded to Ttest<>()
 *
 * returns true when at least one precision reported a FAILED check.
 *
 * Ttest<>() returns true on error, so the results are OR-ed. Each test is
 * called unconditionally as its own statement: combining the calls with a
 * short-circuit operator would skip the second precision and could hide
 * its failures.
 */
bool
test(int N, bool useComplex, bool useOrdered)
{
  bool anyError = false;

  if (useComplex) {
#ifdef PFFFT_ENABLE_FLOAT
    if (Ttest< std::complex<float> >(N, useOrdered))
      anyError = true;
#endif
#ifdef PFFFT_ENABLE_DOUBLE
    if (Ttest< std::complex<double> >(N, useOrdered))
      anyError = true;
#endif
  } else {
#ifdef PFFFT_ENABLE_FLOAT
    if (Ttest<float>(N, useOrdered))
      anyError = true;
#endif
#ifdef PFFFT_ENABLE_DOUBLE
    if (Ttest<double>(N, useOrdered))
      anyError = true;
#endif
  }

  return anyError;
}
/*
 * Test driver: checks the power-of-two helpers of pffft::Fft<> against a
 * reference table, then runs test() for all power-of-two lengths from 32 to
 * 65536 in all four combinations of complex/real transform and
 * ordered/unordered spectrum layout.
 *
 * returns 0 when everything succeeded, non-zero otherwise.
 */
int
main(int argc, char** argv)
{
  int N, result, resN, resAll, resNextPw2, resIsPw2, resFFT;
  size_t k;
  int inp_power_of_two[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 511, 512, 513 };
  int ref_power_of_two[] = { 1, 2, 4, 4, 8, 8, 8, 8, 16, 512, 512, 1024 };
  /* size_t count/index: avoids a signed/unsigned comparison in the loop below */
  const size_t numPw2Cases =
    sizeof(inp_power_of_two) / sizeof(inp_power_of_two[0]);

  /* no command line options are evaluated by this test */
  (void)argc;
  (void)argv;

  resNextPw2 = 0;
  resIsPw2 = 0;
  for (k = 0; k < numPw2Cases; ++k) {
#ifdef PFFFT_ENABLE_FLOAT
    N = pffft::Fft<float>::nextPowerOfTwo(inp_power_of_two[k]);
#else
    N = pffft::Fft<double>::nextPowerOfTwo(inp_power_of_two[k]);
#endif
    if (N != ref_power_of_two[k]) {
      resNextPw2 = 1;
      printf("pffft_next_power_of_two(%d) does deliver %d, which is not "
             "reference result %d!\n",
             inp_power_of_two[k],
             N,
             ref_power_of_two[k]);
    }

#ifdef PFFFT_ENABLE_FLOAT
    result = pffft::Fft<float>::isPowerOfTwo(inp_power_of_two[k]);
#else
    result = pffft::Fft<double>::isPowerOfTwo(inp_power_of_two[k]);
#endif
    /* an input is a power of two exactly when it equals its own reference result */
    if (inp_power_of_two[k] == ref_power_of_two[k]) {
      if (!result) {
        resIsPw2 = 1;
        printf("pffft_is_power_of_two(%d) delivers false; expected true!\n",
               inp_power_of_two[k]);
      }
    } else {
      if (result) {
        resIsPw2 = 1;
        printf("pffft_is_power_of_two(%d) delivers true; expected false!\n",
               inp_power_of_two[k]);
      }
    }
  }
  if (!resNextPw2)
    printf("tests for pffft_next_power_of_two() succeeded successfully.\n");
  if (!resIsPw2)
    printf("tests for pffft_is_power_of_two() succeeded successfully.\n");

  resFFT = 0;
  for (N = 32; N <= 65536; N *= 2) {
    /* test() returns true on failure; stored as 0/1 in the int flags */
    result = test(N, true /* cplx fft */, true /* useOrdered */);
    resN = result;
    resFFT |= result;

    result = test(N, false /* real fft */, true /* useOrdered */);
    resN |= result;
    resFFT |= result;

    result = test(N, true /* cplx fft */, false /* unordered */);
    resN |= result;
    resFFT |= result;

    result = test(N, false /* real fft */, false /* unordered */);
    resN |= result;
    resFFT |= result;

    if (!resN)
      printf("tests for size %d succeeded successfully.\n", N);
  }

  if (!resFFT)
    printf("all pffft transform tests (FORWARD/BACKWARD, REAL/COMPLEX, "
#ifdef PFFFT_ENABLE_FLOAT
           "float"
#endif
#if defined(PFFFT_ENABLE_FLOAT) && defined(PFFFT_ENABLE_DOUBLE)
           "/"
#endif
#ifdef PFFFT_ENABLE_DOUBLE
           "double"
#endif
           ") succeeded successfully.\n");

  resAll = resNextPw2 | resIsPw2 | resFFT;
  if (!resAll)
    printf("all tests succeeded successfully.\n");
  else
    printf("there are failed tests!\n");

  return resAll;
}

24
pffft/uninstall.cmake Normal file
View File

@@ -0,0 +1,24 @@
# Uninstall script: removes every file that is listed in the
# install_manifest.txt found in CMAKE_CURRENT_BINARY_DIR.
# Stops with a fatal error when the manifest is missing or a file cannot be
# removed; files that are already gone are only reported.
set(MANIFEST "${CMAKE_CURRENT_BINARY_DIR}/install_manifest.txt")

if(NOT EXISTS "${MANIFEST}")
  message(FATAL_ERROR "Cannot find install manifest: '${MANIFEST}'")
endif()

# one manifest line == one installed file
file(STRINGS "${MANIFEST}" files)
foreach(file ${files})
  if(EXISTS "${file}")
    message(STATUS "Removing file: '${file}'")
    # execute_process() replaces the deprecated exec_program(); passing the
    # path as its own quoted argument keeps paths with spaces intact
    execute_process(
      COMMAND "${CMAKE_COMMAND}" -E remove "${file}"
      OUTPUT_VARIABLE stdout
      RESULT_VARIABLE result
    )
    if(NOT "${result}" STREQUAL 0)
      message(FATAL_ERROR "Failed to remove file: '${file}'.")
    endif()
  else()
    message(STATUS "File '${file}' does not exist.")
  endif()
endforeach()

2
pffft/use_gcc8.inc Normal file
View File

@@ -0,0 +1,2 @@
# Export the paths of the gcc-8 / g++-8 executables found on PATH.
# NOTE(review): presumably sourced by the build scripts, which pass these
# variables on to CMake as C / C++ compiler - confirm against the consumers.
# The command substitutions are quoted so the result is never word-split,
# whichever POSIX shell sources this file.
export GCC_WITH_CMAKE="$(which gcc-8)"
export GPP_WITH_CMAKE="$(which g++-8)"