add pffft

2024-11-09 14:57:18 -06:00
parent 78a00f71cc
commit a1790b8977
69 changed files with 25719 additions and 0 deletions
--- a/pffft/.github/workflows/c-cpp.yml
+++ b/pffft/.github/workflows/c-cpp.yml
@@ -0,0 +1,279 @@
+# CI workflow: builds pffft in several configurations on Linux, Windows
+# (MSVC, MinGW, cross-compiled from Linux) and macOS, and uploads each job's
+# build trees as a tarball artifact.
+name: C/C++ CI
+
+# Triggered by pushes to, and pull requests against, the branches listed below.
+on:
+  push:
+    branches:
+      - master
+      - github_actions
+  pull_request:
+    branches:
+      - master
+      - github_actions
+
+# Workflow-wide environment; the steps read it as $BUILD_TYPE.
+env:
+  # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
+  BUILD_TYPE: Release
+
+jobs:
+  # Linux build with MIPP available: the MIPP headers are installed into
+  # $HOME/.local first, then pffft is configured and built in five variants
+  # (SIMD float+double, SIMD float, SIMD double, no-SIMD, no-SIMD scalar-vector).
+  build_w_mipp_ubuntu-amd64:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: check out MIPP
+      # pinned to a released major version instead of the moving 'master' ref
+      uses: actions/checkout@v4
+      with:
+        repository: hayguen/MIPP
+        path: ./MIPP
+    - name: cmake configure MIPP
+      run: cmake -S MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$HOME/.local
+    - name: cmake install MIPP headers
+      run: cmake --build MIPP_build --target install && ls -alh $HOME/.local/ && ls -alh $HOME/.local/include/
+
+    - uses: actions/checkout@v4
+    - name: cmake_make_simd_float_double
+      run: mkdir build_simd_full && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_full
+    - name: cmake_make_simd_float
+      run: mkdir build_simd_float && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_DOUBLE=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_float
+    - name: cmake_make_simd_double
+      run: mkdir build_simd_double && cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_double
+    - name: cmake_make_no-simd_float_double
+      run: mkdir build_no-simd_full && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_no-simd_full
+    - name: cmake_make_no-simd_scalar_float_double
+      run: mkdir build_no-simd_scalar_full && cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_scalar_full
+    - name: compress
+      # pack the build trees, leaving out CMake's own bookkeeping files
+      run: tar zcvf pffft_w_mipp_ubuntu-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_simd_full build_simd_float build_simd_double build_no-simd_full build_no-simd_scalar_full
+    - name: 'Upload Artifact'
+      # upload-artifact v1/v2 are retired; v4 requires an artifact name that is
+      # unique within the workflow run, hence the job-specific name.
+      uses: actions/upload-artifact@v4
+      with:
+        name: pffft_w_mipp_ubuntu_builds
+        path: pffft_w_mipp_ubuntu-amd64.tar.gz
+
+  # Plain Linux build (no MIPP installed): configures and builds pffft in five
+  # variants (SIMD float+double, SIMD float, SIMD double, no-SIMD,
+  # no-SIMD scalar-vector) and uploads the build trees.
+  build_ubuntu-amd64:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: cmake_make_simd_float_double
+      run: mkdir build_simd_full && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_full
+    - name: cmake_make_simd_float
+      run: mkdir build_simd_float && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_DOUBLE=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_float
+    - name: cmake_make_simd_double
+      run: mkdir build_simd_double && cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_double
+    - name: cmake_make_no-simd_float_double
+      run: mkdir build_no-simd_full && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_no-simd_full
+    - name: cmake_make_no-simd_scalar_float_double
+      run: mkdir build_no-simd_scalar_full && cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_scalar_full
+    - name: compress
+      # pack the build trees, leaving out CMake's own bookkeeping files
+      run: tar zcvf pffft_ubuntu-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_simd_full build_simd_float build_simd_double build_no-simd_full build_no-simd_scalar_full
+    - name: 'Upload Artifact'
+      # upload-artifact v1/v2 are retired; v4 is the supported major version
+      uses: actions/upload-artifact@v4
+      with:
+        name: pffft_ubuntu_builds
+        path: pffft_ubuntu-amd64.tar.gz
+
+  # Cross-compiles 32- and 64-bit Windows binaries on Linux with the mingw-w64
+  # toolchain. MIPP headers are installed into ${{runner.workspace}} (the parent
+  # of the checkout directory) and handed to the SIMD builds via
+  # -DMIPP_INCLUDE_DIRS.
+  cross_build_win_from_linux:
+    # NOTE(review): the ubuntu-20.04 hosted image appears to have been retired
+    # by GitHub - confirm and switch to a supported label.
+    runs-on: ubuntu-20.04
+
+    steps:
+    - name: prerequisites
+      run: sudo apt -qq update && sudo apt -yqq install gcc-mingw-w64 g++-mingw-w64
+
+    - name: check out MIPP
+      # pinned to a released major version instead of the moving 'master' ref
+      uses: actions/checkout@v4
+      with:
+        repository: hayguen/MIPP
+        path: ./MIPP
+    - name: cmake configure MIPP
+      working-directory: ${{runner.workspace}}
+      # 'pffft/MIPP' assumes the repository (and so the checkout directory) is
+      # named 'pffft'
+      run: cmake -S pffft/MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$(pwd)
+    - name: cmake install MIPP headers
+      working-directory: ${{runner.workspace}}
+      run: cmake --build MIPP_build --target install
+
+    - uses: actions/checkout@v4
+    - name: build_w32_no-simd
+      working-directory: ${{runner.workspace}}
+      run: cd $GITHUB_WORKSPACE && bash ./cross_build_mingw32.sh no-simd -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
+    - name: build_w32_simd_full
+      working-directory: ${{runner.workspace}}
+      # X remembers the MIPP install prefix before changing into the checkout
+      run: X=$(pwd) && cd $GITHUB_WORKSPACE && bash ./cross_build_mingw32.sh simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=pentium4 -DTARGET_C_ARCH=pentium4 -DMIPP_INCLUDE_DIRS=$X/include/mipp
+
+    - name: build_w64_no-simd
+      working-directory: ${{runner.workspace}}
+      run: cd $GITHUB_WORKSPACE && bash ./cross_build_mingw64.sh no-simd -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
+    - name: build_w64_simd_full
+      working-directory: ${{runner.workspace}}
+      # X remembers the MIPP install prefix before changing into the checkout
+      run: X=$(pwd) && cd $GITHUB_WORKSPACE && bash ./cross_build_mingw64.sh simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=sandybridge -DTARGET_C_ARCH=sandybridge -DMIPP_INCLUDE_DIRS=$X/include/mipp
+
+    - name: compress
+      run: tar zcvf pffft_cross-build-windows-from-linux-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt  build_w32_no-simd build_w32_simd_full build_w64_no-simd build_w64_simd_full
+    - name: 'Upload Artifact'
+      # upload-artifact v1/v2 are retired; v4 is the supported major version
+      uses: actions/upload-artifact@v4
+      with:
+        name: pffft_windows_from_cross_builds
+        path: pffft_cross-build-windows-from-linux-amd64.tar.gz
+
+
+  # Native Windows build with Visual Studio 2019 (x64): one no-SIMD build and
+  # three SIMD builds (SSE2, AVX, AVX2). All build trees live in
+  # ${{runner.workspace}}, next to the checkout directory.
+  build_win_msvc:
+    # The CMake configure and build commands are platform agnostic and should work equally
+    # well on Windows or Mac.  You can convert this to a matrix build if you need
+    # cross-platform coverage.
+    # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
+    # NOTE(review): the windows-2019 hosted image appears to have been retired
+    # by GitHub - confirm; a newer image also needs a matching "-G" generator.
+    runs-on: windows-2019
+
+    steps:
+    - name: check out MIPP
+      # pinned to a released major version instead of the moving 'master' ref
+      uses: actions/checkout@v4
+      with:
+        repository: hayguen/MIPP
+        path: ./MIPP
+    - name: cmake configure MIPP
+      shell: bash
+      working-directory: ${{runner.workspace}}
+      # 'pffft/MIPP' assumes the repository (and so the checkout directory) is
+      # named 'pffft'
+      run: cmake -S pffft/MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$(pwd)
+    - name: cmake install MIPP headers
+      working-directory: ${{runner.workspace}}
+      run: cmake --build MIPP_build --target install
+
+    - uses: actions/checkout@v4
+
+    - name: Configure CMake No-SIMD
+      shell: bash
+      working-directory: ${{runner.workspace}}
+      run: cmake -S $GITHUB_WORKSPACE -B build_no-simd -G "Visual Studio 16 2019" -A x64 -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DENABLE_PFDSP=ON -DPFFFT_USE_SIMD=OFF -DTARGET_CXX_ARCH=none -DTARGET_C_ARCH=none
+    - name: Build No-SIMD
+      shell: bash
+      working-directory: ${{runner.workspace}}
+      # Execute the build.  You can specify a specific target with "--target <NAME>"
+      run: cmake --build build_no-simd --config $BUILD_TYPE
+
+    - name: Configure CMake SSE2
+      shell: bash
+      working-directory: ${{runner.workspace}}
+      run: cmake -S $GITHUB_WORKSPACE -B build_sse2 -G "Visual Studio 16 2019" -A x64 -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DENABLE_PFDSP=ON -DTARGET_CXX_ARCH=SSE2 -DTARGET_C_ARCH=SSE2 -DMIPP_INCLUDE_DIRS=$(pwd)/include/mipp
+    - name: Build SSE2
+      shell: bash
+      working-directory: ${{runner.workspace}}
+      # Execute the build.  You can specify a specific target with "--target <NAME>"
+      run: cmake --build build_sse2 --config $BUILD_TYPE
+
+    - name: Configure CMake AVX
+      # Use a bash shell so we can use the same syntax for environment variable
+      # access regardless of the host operating system
+      shell: bash
+      working-directory: ${{runner.workspace}}
+      run: cmake -S $GITHUB_WORKSPACE -B build_avx -G "Visual Studio 16 2019" -A x64 -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DENABLE_PFDSP=ON -DTARGET_CXX_ARCH=AVX -DTARGET_C_ARCH=AVX -DMIPP_INCLUDE_DIRS=$(pwd)/include/mipp
+    - name: Build AVX
+      working-directory: ${{runner.workspace}}
+      shell: bash
+      # Execute the build.  You can specify a specific target with "--target <NAME>"
+      run: cmake --build build_avx --config $BUILD_TYPE
+
+    - name: Configure CMake AVX2
+      # Use a bash shell so we can use the same syntax for environment variable
+      # access regardless of the host operating system
+      shell: bash
+      working-directory: ${{runner.workspace}}
+      run: cmake -S $GITHUB_WORKSPACE -B build_avx2 -G "Visual Studio 16 2019" -A x64 -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DENABLE_PFDSP=ON -DTARGET_CXX_ARCH=AVX2 -DTARGET_C_ARCH=AVX2 -DMIPP_INCLUDE_DIRS=$(pwd)/include/mipp
+    - name: Build AVX2
+      working-directory: ${{runner.workspace}}
+      shell: bash
+      # Execute the build.  You can specify a specific target with "--target <NAME>"
+      run: cmake --build build_avx2 --config $BUILD_TYPE
+
+    - name: compress
+      working-directory: ${{runner.workspace}}
+      run: tar zcvf pffft_windows-msvc-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt  build_no-simd build_sse2 build_avx build_avx2
+    - name: 'Upload Artifact'
+      # upload-artifact v1/v2 are retired; v4 is the supported major version
+      uses: actions/upload-artifact@v4
+      with:
+        name: pffft_windows_msvc_builds
+        path: ${{runner.workspace}}/pffft_windows-msvc-amd64.tar.gz
+
+
+  # Native Windows build inside an MSYS2 environment. The matrix currently has
+  # a single entry (gcc / MINGW64); its values are now actually consumed by the
+  # steps so that adding entries takes effect.
+  build_win_mingw:
+    # NOTE(review): the windows-2019 hosted image appears to have been retired
+    # by GitHub - confirm and switch to a supported label.
+    runs-on: windows-2019
+    strategy:
+      matrix:
+        compiler: [gcc]
+        msystem: [MINGW64]
+    defaults:
+      run:
+        # every 'run' step executes in the MSYS2 shell set up below
+        shell: msys2 {0}
+    steps:
+    - uses: actions/checkout@v4
+    - uses: msys2/setup-msys2@v2
+      with:
+        msystem: ${{ matrix.msystem }}
+        install: gcc cmake make
+    - name: Configure cmake
+      run: CC=${{ matrix.compiler }} cmake -DMINGW=ON -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native -S . -B build_mgw64
+    - name: Build
+      run: cmake --build build_mgw64
+
+    - name: compress
+      run: tar zcvf pffft_windows-mingw-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt  build_mgw64
+    - name: 'Upload Artifact'
+      # upload-artifact v1/v2 are retired; v4 is the supported major version
+      uses: actions/upload-artifact@v4
+      with:
+        name: pffft_windows_mingw_builds
+        path: pffft_windows-mingw-amd64.tar.gz
+
+
+  # macOS build (no MIPP installed): same five variants as build_ubuntu-amd64.
+  build_macos11:
+    # copied from build_ubuntu-amd64 with minor renaming
+    # NOTE(review): the macos-11 hosted image appears to have been retired by
+    # GitHub - confirm and switch to a supported label.
+    runs-on: macos-11
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: cmake_make_simd_float_double
+      run: mkdir build_simd_full && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_full
+    - name: cmake_make_simd_float
+      run: mkdir build_simd_float && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_DOUBLE=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_float
+    - name: cmake_make_simd_double
+      run: mkdir build_simd_double && cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_double
+    - name: cmake_make_no-simd_float_double
+      run: mkdir build_no-simd_full && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_no-simd_full
+    - name: cmake_make_no-simd_scalar_float_double
+      run: mkdir build_no-simd_scalar_full && cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_scalar_full
+    - name: compress
+      # pack the build trees, leaving out CMake's own bookkeeping files
+      run: tar zcvf pffft_macos-11.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_simd_full build_simd_float build_simd_double build_no-simd_full build_no-simd_scalar_full
+    - name: 'Upload Artifact'
+      # upload-artifact v1/v2 are retired; v4 is the supported major version
+      uses: actions/upload-artifact@v4
+      with:
+        name: pffft_macos_builds
+        path: pffft_macos-11.tar.gz
+
+  # macOS build with MIPP available: the MIPP headers are installed into
+  # $HOME/.local first, then the same five variants as the Linux job are built.
+  build_w_mipp_macos11:
+    # copied from build_w_mipp_ubuntu-amd64 with minor renaming
+    # NOTE(review): the macos-11 hosted image appears to have been retired by
+    # GitHub - confirm and switch to a supported label.
+    runs-on: macos-11
+
+    steps:
+    - name: check out MIPP
+      # pinned to a released major version instead of the moving 'master' ref
+      uses: actions/checkout@v4
+      with:
+        repository: hayguen/MIPP
+        path: ./MIPP
+    - name: cmake configure MIPP
+      run: cmake -S MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$HOME/.local
+    - name: cmake install MIPP headers
+      run: cmake --build MIPP_build --target install && ls -alh $HOME/.local/ && ls -alh $HOME/.local/include/
+
+    - uses: actions/checkout@v4
+    - name: cmake_make_simd_float_double
+      run: mkdir build_simd_full && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_full
+    - name: cmake_make_simd_float
+      run: mkdir build_simd_float && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_DOUBLE=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_float
+    - name: cmake_make_simd_double
+      run: mkdir build_simd_double && cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_double
+    - name: cmake_make_no-simd_float_double
+      run: mkdir build_no-simd_full && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_no-simd_full
+    - name: cmake_make_no-simd_scalar_float_double
+      run: mkdir build_no-simd_scalar_full && cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_scalar_full
+    - name: compress
+      # pack the build trees, leaving out CMake's own bookkeeping files
+      run: tar zcvf pffft_w_mipp_macos-11.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_simd_full build_simd_float build_simd_double build_no-simd_full build_no-simd_scalar_full
+    - name: 'Upload Artifact'
+      # upload-artifact v1/v2 are retired; v4 requires an artifact name that is
+      # unique within the workflow run, hence the job-specific name.
+      uses: actions/upload-artifact@v4
+      with:
+        name: pffft_w_mipp_macos_builds
+        path: pffft_w_mipp_macos-11.tar.gz
--- a/pffft/.gitignore
+++ b/pffft/.gitignore
@@ -0,0 +1,4 @@
+# editor settings
+.vscode
+# CMake build trees: plain "build" plus any "build_<variant>" directory
+build
+build_*
+build_benches
--- a/pffft/.gitmodules
+++ b/pffft/.gitmodules
@@ -0,0 +1,9 @@
+# Third-party FFT implementations, one submodule per subdirectory.
+# CMakeLists.txt only adds a subdirectory when the matching
+# PFFFT_USE_BENCH_GREEN / _KISS / _POCKET option is ON and the directory is populated.
+[submodule "greenffts"]
+	path = greenffts
+	url = https://github.com/hayguen/greenffts.git
+[submodule "kissfft"]
+	path = kissfft
+	url = https://github.com/hayguen/kissfft.git
+[submodule "pocketfft"]
+	path = pocketfft
+	url = https://github.com/hayguen/pocketfft.git
--- a/pffft/CMakeLists.txt
+++ b/pffft/CMakeLists.txt
@@ -0,0 +1,663 @@
+# This file relies on CMAKE_C_STANDARD / CMAKE_CXX_STANDARD (CMake >= 3.1) and
+# on target_compile_definitions()/target_compile_options(), so 2.8 was never a
+# truthful minimum. CMake >= 4.0 additionally rejects projects that request
+# compatibility with versions older than 3.5.
+# NOTE(review): raising the minimum switches the policies introduced up to 3.5
+# to NEW (e.g. CMP0025: Apple's compiler reports "AppleClang") - verify the
+# compiler-ID checks in cmake/*.cmake.
+cmake_minimum_required(VERSION 3.5)
+project(PRETTY_FAST_FFT)
+
+# --- precision selection: switching one off yields a smaller library ---
+option(PFFFT_USE_TYPE_FLOAT "activate single precision 'float'?" ON)
+option(PFFFT_USE_TYPE_DOUBLE "activate 'double' precision float?" ON)
+
+# --- architecture / optimization ---
+option(PFFFT_USE_SIMD "use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? - " ON)
+option(PFFFT_USE_SCALAR_VECT "use 4-element vector scalar operations (if no other SIMD)" ON)
+
+# --- which libraries/headers the install rules pick up ---
+option(INSTALL_PFFFT "install pffft to CMAKE_INSTALL_PREFIX?" ON)
+option(INSTALL_PFDSP "install pfdsp to CMAKE_INSTALL_PREFIX?" OFF)
+option(INSTALL_PFFASTCONV "install pffastconv to CMAKE_INSTALL_PREFIX?" OFF)
+
+# --- reference FFT implementations for the benchmark / validation programs ---
+option(PFFFT_USE_BENCH_FFTW "use (system-installed) FFTW3 in fft benchmark?" OFF)
+option(PFFFT_USE_BENCH_GREEN "use Green FFT in fft benchmark? - if exists in subdir" ON)
+option(PFFFT_USE_BENCH_KISS "use KissFFT in fft benchmark? - if exists in subdir" ON)
+option(PFFFT_USE_BENCH_POCKET "use PocketFFT in fft benchmark? - if exists in subdir" ON)
+option(PFFFT_USE_BENCH_MKL "use Intel MKL in fft benchmark? needs to be installed" OFF)
+option(PFFFT_USE_FFTPACK "compile and use FFTPACK in fft benchmark & validation?" ON)
+
+# --- debugging aid: adds -fsanitize=address and links "asan" ---
+option(PFFFT_USE_DEBUG_ASAN "use GCC's address sanitizer?" OFF)
+
+# --- toolchain work-around ---
+option(PFFFT_DISABLE_LINK_WITH_M "Disables linking with m library to build with clangCL from MSVC" OFF)
+
+# C++ default for all targets: strict C++98, no compiler extensions.
+# Individual targets below raise CXX_STANDARD where they need C++11.
+set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_STANDARD 98)
+
+# C default: C99 without gcc extensions. C99 already provides the function
+# attributes (e.g. always_inline) for which C90 would need the extensions.
+set(CMAKE_C_EXTENSIONS OFF)
+set(CMAKE_C_STANDARD 99)
+
+# Accumulators for the install() rules; the target sections below append to
+# them depending on the INSTALL_* options.
+set(INSTALL_HEADERS "")
+set(INSTALL_TARGETS "")
+
+
+# Building with neither precision enabled makes no sense: stop early.
+if (NOT (PFFFT_USE_TYPE_FLOAT OR PFFFT_USE_TYPE_DOUBLE))
+  message(FATAL_ERROR "activate at least one of PFFFT_USE_TYPE_FLOAT or PFFFT_USE_TYPE_DOUBLE")
+endif()
+
+# Make the bundled cmake/ directory visible to include() and find_package().
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
+
+# Project helper scripts.
+include(cmake/target_optimizations.cmake)
+include(cmake/compiler_warnings.cmake)
+
+# Optional packages: neither is REQUIRED, the result is only reported for MIPP.
+find_package(PAPI)
+find_package(MIPP)
+if (NOT MIPP_FOUND)
+    message(STATUS "NOT found MIPP")
+else()
+    message(STATUS "found MIPP")
+endif()
+
+
+# Library name handed to target_link_libraries(); empty unless ASAN is requested.
+set(ASANLIB "")
+if (PFFFT_USE_DEBUG_ASAN)
+  set(ASANLIB "asan")
+endif()
+
+# Purely informational: report the compilers and the detected platform.
+message(STATUS "INFO: CMAKE_C_COMPILER_ID is ${CMAKE_C_COMPILER_ID}")
+message(STATUS "INFO: CMAKE_CXX_COMPILER_ID is ${CMAKE_CXX_COMPILER_ID}")
+
+if (NOT WIN32)
+  message(STATUS "INFO: NOT WIN32")
+else()
+  message(STATUS "INFO: detected WIN32")
+endif()
+
+if (NOT MINGW)
+  message(STATUS "INFO: NOT MINGW")
+else()
+  message(STATUS "INFO: detected MINGW with compiler ${CMAKE_C_COMPILER_ID}")
+endif()
+
+if (CMAKE_C_COMPILER_ID MATCHES "MSVC")
+  message(STATUS "INFO: detected MSVC with compiler ${CMAKE_C_COMPILER_ID}")
+endif()
+
+
+# Optional reference FFT implementations for the benchmark.
+# Each one is added only if its option is ON and the subdirectory is populated;
+# PATH_<NAME> is set only in that case and is tested again further below.
+
+# Green FFT
+if (PFFFT_USE_BENCH_GREEN)
+  if (NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/greenffts/CMakeLists.txt")
+    message(WARNING "GreenFFT not found in subdir greenffts")
+  else()
+    message(STATUS "found subdir greenffts")
+    set(PATH_GREEN "${CMAKE_CURRENT_LIST_DIR}/greenffts")
+    add_subdirectory("${PATH_GREEN}")
+  endif()
+endif()
+
+# KissFFT
+#   git submodule add https://github.com/hayguen/kissfft.git
+if (PFFFT_USE_BENCH_KISS)
+  if (NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/kissfft/CMakeLists.txt")
+    message(WARNING "KissFFT not found in subdir kissfft")
+  else()
+    message(STATUS "found subdir kissfft")
+    set(PATH_KISS "${CMAKE_CURRENT_LIST_DIR}/kissfft")
+    add_subdirectory("${PATH_KISS}")
+  endif()
+endif()
+
+# PocketFFT (detected through one of its source files)
+#   git submodule add https://github.com/hayguen/pocketfft.git
+if (PFFFT_USE_BENCH_POCKET)
+  if (NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/pocketfft/pocketfft_double.c")
+    message(WARNING "PocketFFT not found in subdir pocketfft")
+  else()
+    message(STATUS "found subdir pocketfft")
+    set(PATH_POCKET "${CMAKE_CURRENT_LIST_DIR}/pocketfft")
+    add_subdirectory("${PATH_POCKET}")
+  endif()
+endif()
+
+
+########################################################################
+# Default to a Release build when the user did not choose a build type,
+# so that optimization flags are applied.
+########################################################################
+if (NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE "Release")
+  message(STATUS "Build type not specified: defaulting to release.")
+endif()
+
+# MATHLIB: name of the math library passed to target_link_libraries().
+# It is always assigned here - the original left it untouched when
+# PFFFT_DISABLE_LINK_WITH_M was ON, so a value inherited from an enclosing
+# project could leak into the link lines.
+set(MATHLIB "")
+
+if ( CMAKE_C_COMPILER_ID MATCHES "MSVC" )
+  # using Visual Studio C++
+  message(STATUS "INFO: detected MSVC: will not link math lib m")
+
+  add_definitions("/D_CRT_SECURE_NO_WARNINGS")
+
+  set(MSVC_DISABLED_WARNINGS_LIST
+      "C4996"
+  )
+
+elseif (NOT PFFFT_DISABLE_LINK_WITH_M)
+  message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m")
+  set(MATHLIB "m")
+endif()
+
+# C++ runtime library to name explicitly on link lines: only for MinGW.
+if (MINGW)
+  set(STDCXXLIB "stdc++")
+else()
+  set(STDCXXLIB "")
+endif()
+
+
+# SIMD abstraction headers, grouped by precision.
+set(SIMD_FLOAT_HDRS
+  simd/pf_float.h
+  simd/pf_sse1_float.h
+  simd/pf_altivec_float.h
+  simd/pf_neon_float.h
+  simd/pf_scalar_float.h
+)
+set(SIMD_DOUBLE_HDRS
+  simd/pf_double.h
+  simd/pf_avx_double.h
+  simd/pf_scalar_double.h
+)
+
+# Single-precision sources; the variable stays unset when 'float' is disabled.
+set(FLOAT_SOURCES)
+if (PFFFT_USE_TYPE_FLOAT)
+  set(FLOAT_SOURCES pffft.c pffft.h ${SIMD_FLOAT_HDRS})
+  if (INSTALL_PFFFT)
+    list(APPEND INSTALL_HEADERS pffft.h)
+  endif()
+endif()
+
+
+# Double-precision sources; the variable stays unset when 'double' is disabled.
+set(DOUBLE_SOURCES)
+if (PFFFT_USE_TYPE_DOUBLE)
+  set(DOUBLE_SOURCES pffft_double.c pffft_double.h ${SIMD_DOUBLE_HDRS})
+  if (INSTALL_PFFFT)
+    list(APPEND INSTALL_HEADERS pffft_double.h)
+  endif()
+endif()
+
+######################################################
+
+# Core FFT library (static); the produced file is named "pffft".
+add_library(PFFFT STATIC
+  ${FLOAT_SOURCES}
+  ${DOUBLE_SOURCES}
+  pffft_common.c
+  pffft_priv_impl.h
+  pffft.hpp
+)
+set_target_properties(PFFFT PROPERTIES OUTPUT_NAME "pffft")
+target_compile_definitions(PFFFT PRIVATE _USE_MATH_DEFINES)
+target_activate_c_compiler_warnings(PFFFT)
+
+if (PFFFT_USE_SCALAR_VECT)
+  target_compile_definitions(PFFFT PRIVATE PFFFT_SCALVEC_ENABLED=1)
+endif()
+if (PFFFT_USE_DEBUG_ASAN)
+  target_compile_options(PFFFT PRIVATE "-fsanitize=address")
+endif()
+
+# Architecture flags are applied in every case; SIMD code paths are then
+# switched off by a definition when PFFFT_USE_SIMD is OFF.
+target_set_c_arch_flags(PFFFT)
+if (NOT PFFFT_USE_SIMD)
+  target_compile_definitions(PFFFT PRIVATE PFFFT_SIMD_DISABLE=1)
+endif()
+
+target_link_libraries(PFFFT ${ASANLIB} ${MATHLIB})
+
+# Consumers inside the build tree find the headers in this source directory.
+set_property(TARGET PFFFT
+  APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+)
+
+if (INSTALL_PFFFT)
+  list(APPEND INSTALL_TARGETS PFFFT)
+  list(APPEND INSTALL_HEADERS pffft.hpp)
+endif()
+
+######################################################
+
+if (PFFFT_USE_TYPE_FLOAT)
+  # DSP helper library (mixer, carrier, CIC), built as C++11; output name "pfdsp".
+  add_library(PFDSP STATIC
+    pf_mixer.cpp pf_mixer.h
+    pf_cplx.h
+    pf_carrier.cpp pf_carrier.h
+    pf_cic.cpp pf_cic.h
+    fmv.h
+  )
+  set_target_properties(PFDSP PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED ON
+    OUTPUT_NAME "pfdsp"
+  )
+  target_compile_definitions(PFDSP PRIVATE _USE_MATH_DEFINES)
+  target_activate_cxx_compiler_warnings(PFDSP)
+
+  if (PFFFT_USE_DEBUG_ASAN)
+      target_compile_options(PFDSP PRIVATE "-fsanitize=address")
+  endif()
+
+  # Without SIMD no architecture flags are set; a definition disables SIMD code.
+  if (NOT PFFFT_USE_SIMD)
+      target_compile_definitions(PFDSP PRIVATE PFFFT_SIMD_DISABLE=1)
+  else()
+      target_set_cxx_arch_flags(PFDSP)
+  endif()
+
+  target_link_libraries(PFDSP ${MATHLIB})
+
+  # Consumers inside the build tree find the headers in this source directory.
+  set_property(TARGET PFDSP
+    APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+  )
+
+  if (INSTALL_PFDSP)
+      list(APPEND INSTALL_TARGETS PFDSP)
+      list(APPEND INSTALL_HEADERS pf_mixer.h pf_cplx.h pf_carrier.h pf_cic.h)
+  endif()
+endif()
+
+######################################################
+
+if (PFFFT_USE_FFTPACK)
+
+  # FFTPACK is compiled twice from the same source: once as-is (single
+  # precision) and once with FFTPACK_DOUBLE_PRECISION defined.
+
+  # library, float / single precision
+  add_library(FFTPACK_FLOAT STATIC
+    fftpack.c
+    fftpack.h
+  )
+  target_compile_definitions(FFTPACK_FLOAT PRIVATE _USE_MATH_DEFINES)
+  target_activate_c_compiler_warnings(FFTPACK_FLOAT)
+  target_link_libraries(FFTPACK_FLOAT ${MATHLIB})
+  set_property(TARGET FFTPACK_FLOAT
+    APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+  )
+
+  # library, double precision; the PUBLIC definition is passed on to consumers
+  add_library(FFTPACK_DOUBLE STATIC
+    fftpack.c
+    fftpack.h
+  )
+  target_compile_definitions(FFTPACK_DOUBLE PRIVATE _USE_MATH_DEFINES)
+  target_compile_definitions(FFTPACK_DOUBLE PUBLIC FFTPACK_DOUBLE_PRECISION)
+  target_activate_c_compiler_warnings(FFTPACK_DOUBLE)
+  target_link_libraries(FFTPACK_DOUBLE ${MATHLIB})
+  set_property(TARGET FFTPACK_DOUBLE
+    APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+  )
+
+  # test programs built into fftpack.c, switched on by TESTING_FFTPACK
+  add_executable(test_fftpack_float
+    fftpack.c
+    fftpack.h
+  )
+  target_compile_definitions(test_fftpack_float PRIVATE
+    _USE_MATH_DEFINES
+    TESTING_FFTPACK
+  )
+  target_link_libraries(test_fftpack_float ${MATHLIB})
+
+  add_executable(test_fftpack_double
+    fftpack.c
+    fftpack.h
+  )
+  target_compile_definitions(test_fftpack_double PRIVATE
+    _USE_MATH_DEFINES
+    FFTPACK_DOUBLE_PRECISION
+    TESTING_FFTPACK
+  )
+  target_link_libraries(test_fftpack_double ${MATHLIB})
+
+endif()
+
+######################################################
+
+if (PFFFT_USE_TYPE_FLOAT)
+  # Fast convolution on top of PFFFT - only 'float' supported in PFFASTCONV.
+  add_library(PFFASTCONV STATIC
+    pffastconv.c
+    pffastconv.h
+    pffft.h
+  )
+  set_target_properties(PFFASTCONV PROPERTIES OUTPUT_NAME "pffastconv")
+  target_compile_definitions(PFFASTCONV PRIVATE _USE_MATH_DEFINES)
+  target_activate_c_compiler_warnings(PFFASTCONV)
+
+  if (PFFFT_USE_DEBUG_ASAN)
+    target_compile_options(PFFASTCONV PRIVATE "-fsanitize=address")
+  endif()
+
+  target_link_libraries(PFFASTCONV PFFFT ${ASANLIB} ${MATHLIB})
+
+  # Consumers inside the build tree find the headers in this source directory.
+  set_property(TARGET PFFASTCONV
+    APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+  )
+
+  if (INSTALL_PFFASTCONV)
+    list(APPEND INSTALL_TARGETS PFFASTCONV)
+    list(APPEND INSTALL_HEADERS pffastconv.h)
+  endif()
+endif()
+
+
+######################################################
+
+# Install whatever the INSTALL_* options collected above.
+install( TARGETS ${INSTALL_TARGETS}  DESTINATION lib)
+install( FILES  ${INSTALL_HEADERS}  DESTINATION include)
+
+# 'uninstall' convenience target: runs uninstall.cmake in script mode.
+# The script is located relative to THIS directory: CMAKE_SOURCE_DIR would
+# point at the enclosing project when pffft is pulled in via add_subdirectory().
+# For a stand-alone build both variables name the same directory.
+# The guard avoids a clash if an enclosing project already defines 'uninstall'.
+if (NOT TARGET uninstall)
+  add_custom_target(uninstall
+      "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_SOURCE_DIR}/uninstall.cmake"
+  )
+endif()
+
+#######################################################
+
+# test_pffft.c is compiled once per enabled precision; the PFFFT_ENABLE_*
+# definition tells the source which precision it is built for.
+if (PFFFT_USE_TYPE_FLOAT)
+  add_executable(test_pffft_float test_pffft.c)
+  target_compile_definitions(test_pffft_float PRIVATE
+    _USE_MATH_DEFINES
+    PFFFT_ENABLE_FLOAT
+  )
+  target_link_libraries(test_pffft_float PFFFT ${ASANLIB})
+endif()
+
+######################################################
+
+if (PFFFT_USE_TYPE_DOUBLE)
+  add_executable(test_pffft_double test_pffft.c)
+  target_compile_definitions(test_pffft_double PRIVATE
+    _USE_MATH_DEFINES
+    PFFFT_ENABLE_DOUBLE
+  )
+  target_link_libraries(test_pffft_double PFFFT ${ASANLIB})
+endif()
+
+######################################################
+
+# Always built; each enabled precision is announced through its own definition.
+add_executable(test_fft_factors test_fft_factors.c)
+if (PFFFT_USE_TYPE_FLOAT)
+  target_compile_definitions(test_fft_factors PRIVATE PFFFT_ENABLE_FLOAT)
+endif()
+if (PFFFT_USE_TYPE_DOUBLE)
+  target_compile_definitions(test_fft_factors PRIVATE PFFFT_ENABLE_DOUBLE)
+endif()
+target_link_libraries(test_fft_factors
+  PFFFT
+  ${ASANLIB}
+  ${MATHLIB}
+)
+
+######################################################
+
+# C++ test, compiled with the project-wide default C++ standard.
+add_executable(test_pffft_cpp test_pffft.cpp)
+target_compile_definitions(test_pffft_cpp PRIVATE _USE_MATH_DEFINES)
+if (PFFFT_USE_TYPE_FLOAT)
+  target_compile_definitions(test_pffft_cpp PRIVATE PFFFT_ENABLE_FLOAT)
+endif()
+if (PFFFT_USE_TYPE_DOUBLE)
+  target_compile_definitions(test_pffft_cpp PRIVATE PFFFT_ENABLE_DOUBLE)
+endif()
+target_link_libraries(test_pffft_cpp
+  PFFFT
+  ${STDCXXLIB}
+  ${ASANLIB}
+)
+
+######################################################
+
+# Same source as test_pffft_cpp, but this target is compiled as C++11.
+add_executable(test_pffft_cpp_11 test_pffft.cpp)
+target_compile_definitions(test_pffft_cpp_11 PRIVATE _USE_MATH_DEFINES)
+if (PFFFT_USE_TYPE_FLOAT)
+  target_compile_definitions(test_pffft_cpp_11 PRIVATE PFFFT_ENABLE_FLOAT)
+endif()
+if (PFFFT_USE_TYPE_DOUBLE)
+  target_compile_definitions(test_pffft_cpp_11 PRIVATE PFFFT_ENABLE_DOUBLE)
+endif()
+target_link_libraries(test_pffft_cpp_11
+  PFFFT
+  ${STDCXXLIB}
+  ${ASANLIB}
+)
+
+set_target_properties(test_pffft_cpp_11 PROPERTIES
+  CXX_STANDARD 11
+  CXX_STANDARD_REQUIRED ON
+)
+
+######################################################
+
+if (PFFFT_USE_TYPE_FLOAT)
+  # Test program for PFFASTCONV; it gets the same architecture flags and
+  # SIMD switch as the PFFFT library.
+  add_executable(test_pffastconv
+    test_pffastconv.c
+    ${SIMD_FLOAT_HDRS}
+    ${SIMD_DOUBLE_HDRS}
+  )
+  target_compile_definitions(test_pffastconv PRIVATE _USE_MATH_DEFINES)
+
+  if (PFFFT_USE_DEBUG_ASAN)
+    target_compile_options(test_pffastconv PRIVATE "-fsanitize=address")
+  endif()
+
+  target_set_c_arch_flags(test_pffastconv)
+  if (NOT PFFFT_USE_SIMD)
+    target_compile_definitions(test_pffastconv PRIVATE PFFFT_SIMD_DISABLE=1)
+  endif()
+
+  target_link_libraries(test_pffastconv
+    PFFASTCONV
+    ${ASANLIB}
+    ${MATHLIB}
+  )
+
+endif()
+
+######################################################
+
+if (PFFFT_USE_TYPE_FLOAT)
+  # Single-precision benchmark. Every optional reference implementation adds
+  # one HAVE_* definition plus the library to link against.
+  add_executable(bench_pffft_float
+    bench_pffft.c
+    pffft.h
+  )
+  target_compile_definitions(bench_pffft_float PRIVATE
+    _USE_MATH_DEFINES
+    PFFFT_ENABLE_FLOAT
+  )
+  if (PFFFT_USE_DEBUG_ASAN)
+    target_compile_options(bench_pffft_float PRIVATE "-fsanitize=address")
+  endif()
+
+  target_link_libraries(bench_pffft_float PFFFT ${ASANLIB})
+
+  # FFTPACK (built by this project)
+  if (PFFFT_USE_FFTPACK)
+    target_compile_definitions(bench_pffft_float PRIVATE HAVE_FFTPACK=1)
+    target_link_libraries(bench_pffft_float FFTPACK_FLOAT)
+  endif()
+
+  # FFTW3, single-precision library
+  if (PFFFT_USE_BENCH_FFTW)
+    target_compile_definitions(bench_pffft_float PRIVATE HAVE_FFTW=1)
+    target_link_libraries(bench_pffft_float fftw3f)
+  endif()
+
+  # Subdirectory projects: PATH_* is only set when the subdirectory was added.
+  if (PFFFT_USE_BENCH_GREEN AND PATH_GREEN)
+    target_compile_definitions(bench_pffft_float PRIVATE HAVE_GREEN_FFTS=1)
+    target_link_libraries(bench_pffft_float GreenFFT)
+  endif()
+
+  if (PFFFT_USE_BENCH_KISS AND PATH_KISS)
+    target_compile_definitions(bench_pffft_float PRIVATE HAVE_KISS_FFT=1)
+    target_link_libraries(bench_pffft_float KissFFT)
+  endif()
+
+  if (PFFFT_USE_BENCH_POCKET AND PATH_POCKET)
+    target_compile_definitions(bench_pffft_float PRIVATE HAVE_POCKET_FFT=1)
+    target_link_libraries(bench_pffft_float PocketFFT)
+  endif()
+
+  # Intel MKL: warn on processors other than i686 / x86_64
+  if (PFFFT_USE_BENCH_MKL)
+    if ( NOT ( (CMAKE_SYSTEM_PROCESSOR STREQUAL "i686") OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") ) )
+      # other PROCESSORs could be "ppc", "ppc64",  "arm", "aarch64", "armv7l" - or something else?!
+      message(WARNING "using Intel MKL on '${CMAKE_SYSTEM_PROCESSOR}' might fail.")
+    endif()
+    message(STATUS "In case compiling/linking with Intel MKL fails, check CMakeLists.txt or deactivate PFFFT_USE_BENCH_MKL")
+    target_compile_definitions(bench_pffft_float PRIVATE HAVE_MKL=1)
+    target_link_libraries(bench_pffft_float mkl_intel_lp64 mkl_sequential -lmkl_core)
+  endif()
+endif()
+
+if (PFFFT_USE_TYPE_DOUBLE)
+  # Double-precision FFT benchmark: bench_pffft.c compiled with
+  # PFFFT_ENABLE_DOUBLE. Every optional reference implementation below adds
+  # one HAVE_<lib>=1 definition plus the library to link against.
+  add_executable(bench_pffft_double   bench_pffft.c pffft.h)
+  target_compile_definitions(bench_pffft_double PRIVATE _USE_MATH_DEFINES)
+  target_compile_definitions(bench_pffft_double PRIVATE PFFFT_ENABLE_DOUBLE)
+  if (PFFFT_USE_DEBUG_ASAN)
+    target_compile_options(bench_pffft_double PRIVATE "-fsanitize=address")
+  endif()
+  target_link_libraries( bench_pffft_double  PFFFT ${ASANLIB} )
+
+  if (PFFFT_USE_FFTPACK)
+    target_compile_definitions(bench_pffft_double PRIVATE HAVE_FFTPACK=1)
+    target_link_libraries(bench_pffft_double  FFTPACK_DOUBLE)
+  endif()
+
+  # fftw3 is the double-precision FFTW library.
+  if (PFFFT_USE_BENCH_FFTW)
+    target_compile_definitions(bench_pffft_double PRIVATE HAVE_FFTW=1)
+    target_link_libraries(bench_pffft_double  fftw3)
+  endif()
+
+  # PocketFFT is used only when the option is ON *and* PATH_POCKET is set.
+  if (PATH_POCKET AND PFFFT_USE_BENCH_POCKET)
+    target_compile_definitions(bench_pffft_double PRIVATE HAVE_POCKET_FFT=1)
+    target_link_libraries(bench_pffft_double  PocketFFT)
+  endif()
+
+  if (PFFFT_USE_BENCH_MKL)
+    if ( (CMAKE_SYSTEM_PROCESSOR STREQUAL "i686") OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") )
+      # has chances to work
+    else()
+      # other PROCESSORs could be "ppc", "ppc64",  "arm", "aarch64", "armv7l" - or something else?!
+      message(WARNING "using Intel MKL on '${CMAKE_SYSTEM_PROCESSOR}' might fail.")
+    endif()
+    message(STATUS "In case compiling/linking with Intel MKL fails, check CMakeLists.txt or deactivate PFFFT_USE_BENCH_MKL")
+    target_compile_definitions(bench_pffft_double PRIVATE HAVE_MKL=1)
+    # All three MKL libraries are plain library names (no raw "-l..." flag),
+    # so CMake can translate them for every linker, not only GNU-style ones.
+    target_link_libraries(bench_pffft_double  mkl_intel_lp64 mkl_sequential mkl_core)
+  endif()
+endif()
+
+######################################################
+
+if (PFFFT_USE_TYPE_FLOAT)
+
+    # Mixer benchmark (float only). Link order is kept as
+    # ASANLIB -> PAPI (optional) -> PFDSP -> stdc++ (GNU compilers only).
+    add_executable(bench_pf_mixer_float
+        bench_mixers.cpp
+        papi_perf_counter.h
+    )
+    target_compile_definitions(bench_pf_mixer_float PRIVATE
+        _USE_MATH_DEFINES
+        PFFFT_ENABLE_FLOAT
+    )
+    target_link_libraries(bench_pf_mixer_float ${ASANLIB})
+    if (PFFFT_USE_DEBUG_ASAN)
+        target_compile_options(bench_pf_mixer_float PRIVATE "-fsanitize=address")
+    endif()
+
+    # PAPI is optional; when found, HAVE_PAPI=1 is defined and its libraries are linked.
+    if (PAPI_FOUND)
+        target_compile_definitions(bench_pf_mixer_float PRIVATE HAVE_PAPI=1)
+        target_link_libraries(bench_pf_mixer_float ${PAPI_LIBRARIES})
+    endif()
+
+    target_link_libraries(bench_pf_mixer_float PFDSP $<$<CXX_COMPILER_ID:GNU>:stdc++>)
+
+
+  ############################################################################
+
+  # Always-built convolution libraries:
+  #   pf_conv_arch_none  - pf_conv.cpp with CONV_ARCH_POST=none and MIPP_NO_INTRINSICS=1
+  #   pf_conv_dispatcher - pf_conv_dispatcher.cpp; links the pf_conv_arch_* variants
+  #   pf_conv_arch_dflt  - pf_conv.cpp with CONV_ARCH_POST=dflt and the flags
+  #                        applied by target_set_cxx_arch_flags()
+  add_library(pf_conv_arch_none  pf_conv.cpp pf_conv.h pf_cplx.h)
+  add_library(pf_conv_dispatcher pf_conv_dispatcher.cpp pf_conv_dispatcher.h pf_conv.h pf_cplx.h)
+  add_library(pf_conv_arch_dflt  pf_conv.cpp pf_conv.h pf_cplx.h)
+
+  # Settings shared by all three: C++11 required, project warning flags.
+  foreach (conv_lib pf_conv_arch_none pf_conv_dispatcher pf_conv_arch_dflt)
+    set_target_properties(${conv_lib} PROPERTIES
+      CXX_STANDARD 11
+      CXX_STANDARD_REQUIRED ON
+    )
+    target_activate_cxx_compiler_warnings(${conv_lib})
+  endforeach()
+
+  # Per-variant settings.
+  target_compile_definitions(pf_conv_arch_none PRIVATE CONV_ARCH_POST=none MIPP_NO_INTRINSICS=1)
+  target_compile_definitions(pf_conv_arch_dflt PRIVATE CONV_ARCH_POST=dflt)
+  target_set_cxx_arch_flags(pf_conv_arch_dflt)
+
+  target_link_libraries(pf_conv_dispatcher pf_conv_arch_none pf_conv_arch_dflt)
+
+  # Select the extra architecture variants of pf_conv.cpp to build.
+  #
+  # Result variables, consumed by the foreach() loops that follow:
+  #   PF_CONV_ARCHES        list of variant names ("" = no extra variants)
+  #   PF_CONV_OPT_<name>    architecture option for that variant
+  #   PF_CONV_EXTRA_<name>  additional option (only set for 32-bit ARM)
+  # In addition pf_conv_dispatcher gets one CONV_ARCH_<compiler>_<cpu>
+  # definition for the detected compiler/processor combination.
+  # Every branch that cannot select variants sets PF_CONV_ARCHES to "".
+  if ((CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64"))
+
+    if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        set(PF_CONV_ARCHES "sse3;sse4;avx;avx2")
+        set(PF_CONV_OPT_sse3 "core2")  # emulate a map
+        set(PF_CONV_OPT_sse4 "nehalem")
+        set(PF_CONV_OPT_avx  "sandybridge")
+        set(PF_CONV_OPT_avx2 "haswell")
+        target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_GCC_AMD64)
+    elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+        set(PF_CONV_ARCHES "sse2;avx;avx2")
+        set(PF_CONV_OPT_sse2 "SSE2")  # emulate a map
+        set(PF_CONV_OPT_avx  "AVX")
+        set(PF_CONV_OPT_avx2 "AVX2")
+        target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_MSVC_AMD64)
+    else()
+        set(PF_CONV_ARCHES "")
+        message(WARNING "unknown compiler ${CMAKE_CXX_COMPILER_ID} on CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
+    endif()
+
+  elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+
+      if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+          set(PF_CONV_ARCHES "armv8a")
+          set(PF_CONV_OPT_armv8a   "armv8-a")  # emulate a map for arch
+
+          target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_GCC_AARCH64)
+      else()
+          set(PF_CONV_ARCHES "")
+          message(WARNING "unknown compiler ${CMAKE_CXX_COMPILER_ID} on CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
+      endif()
+
+  elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "armv7l")
+
+    if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        set(PF_CONV_ARCHES "neon_vfpv4;neon_rpi3_a53;neon_rpi4_a72")
+        set(PF_CONV_OPT_neon_vfpv4        "armv7-a")    # emulate a map for arch
+        set(PF_CONV_EXTRA_neon_vfpv4      "neon_vfpv4") # emulate a map for additional options (EXTRA)
+        set(PF_CONV_OPT_neon_rpi3_a53     "armv7-a")
+        set(PF_CONV_EXTRA_neon_rpi3_a53   "neon_rpi3_a53")
+        set(PF_CONV_OPT_neon_rpi4_a72     "armv7-a")
+        set(PF_CONV_EXTRA_neon_rpi4_a72   "neon_rpi4_a72")
+
+        target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_GCC_ARM32NEON)
+    else()
+        set(PF_CONV_ARCHES "")
+        message(WARNING "unknown compiler ${CMAKE_CXX_COMPILER_ID} on CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
+    endif()
+
+  else()
+      # Clear the list here as well, so that a PF_CONV_ARCHES value inherited
+      # from a parent project or the cache cannot drive the loops below.
+      set(PF_CONV_ARCHES "")
+      message(WARNING "this is unforseen CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
+  endif()
+
+  # Build one additional pf_conv.cpp variant per entry of PF_CONV_ARCHES and
+  # link each of them into pf_conv_dispatcher. The compiler options are looked
+  # up in the PF_CONV_OPT_<arch> / PF_CONV_EXTRA_<arch> variables; a variable
+  # that was never set expands to an empty string.
+  foreach (arch_opt ${PF_CONV_ARCHES})
+      add_library(pf_conv_arch_${arch_opt} pf_conv.cpp pf_conv.h pf_cplx.h)
+      set_target_properties(pf_conv_arch_${arch_opt} PROPERTIES
+          CXX_STANDARD 11
+          CXX_STANDARD_REQUIRED ON
+      )
+      target_activate_cxx_compiler_warnings(pf_conv_arch_${arch_opt})
+      target_compile_definitions(pf_conv_arch_${arch_opt} PRIVATE CONV_ARCH_POST=${arch_opt})
+
+      # NOTE(review): PF_CONV_OPT_<arch> is passed as both the 2nd and the 4th
+      # argument; the helper is defined outside this section - presumably the
+      # two slots serve different compilers, confirm against its definition.
+      target_set_cxx_arch_option(
+          pf_conv_arch_${arch_opt}
+          "${PF_CONV_OPT_${arch_opt}}"
+          "${PF_CONV_EXTRA_${arch_opt}}"
+          "${PF_CONV_OPT_${arch_opt}}"
+      )
+
+      target_link_libraries(pf_conv_dispatcher pf_conv_arch_${arch_opt})
+      message(STATUS "added library pf_conv_arch_${arch_opt}  with CONV_ARCH_POST=${arch_opt}")
+  endforeach()
+
+  # With PFFFT_USE_DEBUG_ASAN every pf_conv library is compiled with the
+  # address sanitizer and linked against ASANLIB: the per-architecture
+  # variants, the "none" and "dflt" variants, and the dispatcher.
+  # pf_conv_arch_dflt is included so that no variant runs uninstrumented.
+  if (PFFFT_USE_DEBUG_ASAN)
+      foreach (arch_opt ${PF_CONV_ARCHES} none dflt)
+          target_compile_options(pf_conv_arch_${arch_opt} PRIVATE "-fsanitize=address")
+          target_link_libraries( pf_conv_arch_${arch_opt} ${ASANLIB})
+      endforeach()
+
+      target_compile_options(pf_conv_dispatcher  PRIVATE "-fsanitize=address")
+      target_link_libraries(pf_conv_dispatcher ${ASANLIB})
+  endif()
+
+  # When MIPP was found, link the per-architecture variants and then the
+  # "none" variant against it, announcing each link on the configure log.
+  # NOTE(review): pf_conv_arch_dflt is not linked against MIPP here -
+  # confirm that this is intentional.
+  if (MIPP_FOUND)
+      foreach (arch_opt ${PF_CONV_ARCHES} none)
+          message(STATUS "link pf_conv_arch_${arch_opt} against MIPP")
+          target_link_libraries(pf_conv_arch_${arch_opt} MIPP)
+      endforeach()
+  endif()
+
+  ############################################################################
+
+  # Convolution benchmark (float only), built as C++11.
+  # Link order is kept as
+  # ASANLIB -> PAPI (optional) -> MIPP (optional) -> pf_conv_dispatcher -> PFDSP -> stdc++ (GNU only).
+  add_executable(bench_pf_conv_float
+    bench_conv.cpp
+    papi_perf_counter.h
+  )
+  set_target_properties(bench_pf_conv_float PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED ON
+  )
+  target_compile_definitions(bench_pf_conv_float PRIVATE
+    _USE_MATH_DEFINES
+    PFFFT_ENABLE_FLOAT
+  )
+  if (PFFFT_USE_DEBUG_ASAN)
+    target_compile_options(bench_pf_conv_float PRIVATE "-fsanitize=address")
+  endif()
+  target_link_libraries(bench_pf_conv_float ${ASANLIB})
+
+  # Optional dependencies.
+  if (PAPI_FOUND)
+    target_compile_definitions(bench_pf_conv_float PRIVATE HAVE_PAPI=1)
+    target_link_libraries(bench_pf_conv_float ${PAPI_LIBRARIES})
+  endif()
+  if (MIPP_FOUND)
+    target_link_libraries(bench_pf_conv_float MIPP)
+  endif()
+
+  target_link_libraries(bench_pf_conv_float pf_conv_dispatcher PFDSP $<$<CXX_COMPILER_ID:GNU>:stdc++>)
+
+endif()
+
+######################################################
+
+# Descend into examples/, which is configured by its own CMakeLists.txt.
+add_subdirectory(examples)
+
+######################################################
+
+# Turn on CTest support for this directory; the add_test() calls that follow
+# register the executables run by `ctest`.
+enable_testing()
+
+# Runs the test_fft_factors binary from the build directory, without arguments.
+add_test(
+  NAME test_fft_factors
+  COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_fft_factors"
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+)
+
+# FFTPACK self tests: one test per precision (test_fftpack_float and
+# test_fftpack_double), each run from the build directory without arguments.
+if (PFFFT_USE_FFTPACK)
+  foreach (fftpack_type float double)
+    add_test(NAME test_fftpack_${fftpack_type}
+      COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_fftpack_${fftpack_type}"
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    )
+  endforeach()
+endif()
+
+
+# Tests that need the float build. COMMAND names the executable *targets*
+# (bench_pffft_float, test_pffastconv); CMake replaces a target name with the
+# location of the built binary, which also works for multi-config generators
+# (e.g. Visual Studio's Release/ sub-directory) and custom output directories.
+if (PFFFT_USE_TYPE_FLOAT)
+
+  # Quick FFT benchmark runs: power-of-two lengths, then non-power-of-two.
+  add_test(NAME bench_pffft_pow2
+    COMMAND bench_pffft_float "--max-len" "128" "--quick"
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  )
+
+  add_test(NAME bench_pffft_non2
+    COMMAND bench_pffft_float "--non-pow2" "--max-len" "192" "--quick"
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  )
+
+  # add_test(NAME bench_plots
+  #   COMMAND bash "-c" "${CMAKE_CURRENT_SOURCE_DIR}/plots.sh"
+  #   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  # )
+
+  # Fast convolution: "--no-bench" runs, with and without "--sym" ...
+  add_test(NAME test_pfconv_lens_symetric
+    COMMAND test_pffastconv "--no-bench" "--quick" "--sym"
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  )
+
+  add_test(NAME test_pfconv_lens_non_sym
+    COMMAND test_pffastconv "--no-bench" "--quick"
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  )
+
+  # ... and "--no-len" runs, with and without "--sym".
+  add_test(NAME bench_pfconv_symetric
+    COMMAND test_pffastconv "--no-len" "--quick" "--sym"
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  )
+
+  add_test(NAME bench_pfconv_non_sym
+    COMMAND test_pffastconv "--no-len" "--quick"
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  )
+
+endif()
+
--- a/pffft/LICENSE.txt
+++ b/pffft/LICENSE.txt
@@ -0,0 +1,38 @@
+
+Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
+Copyright (c) 2019  Hayati Ayguen ( h_ayguen@web.de )
+Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+Copyright (c) 2004 the University Corporation for Atmospheric
+Research ("UCAR"). All rights reserved. Developed by NCAR's
+Computational and Information Systems Laboratory, UCAR,
+www.cisl.ucar.edu.
+
+Redistribution and use of the Software in source and binary forms,
+with or without modification, is permitted provided that the
+following conditions are met:
+
+- Neither the names of NCAR's Computational and Information Systems
+Laboratory, the University Corporation for Atmospheric Research,
+nor the names of its sponsors or contributors may be used to
+endorse or promote products derived from this Software without
+specific prior written permission.  
+
+- Redistributions of source code must retain the above copyright
+notices, this list of conditions, and the disclaimer below.
+
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions, and the disclaimer below in the
+documentation and/or other materials provided with the
+distribution.
+
+THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+SOFTWARE.
+
--- a/pffft/README.md
+++ b/pffft/README.md
@@ -0,0 +1,352 @@
+
+---
+
+# PFFFT: a pretty fast FFT and fast convolution with PFFASTCONV
+
+---
+
+<!-- toc -->
+
+- [Brief Description](#brief-description)
+- [Why does it exist?](#why-does-it-exist)
+- [CMake](#cmake)
+- [History / Origin / Changes](#history--origin--changes)
+- [Comparison with other FFTs](#comparison-with-other-ffts)
+- [Dependencies / Required Linux packages](#dependencies--required-linux-packages)
+- [Benchmarks and results](#benchmarks-and-results)
+
+<!-- tocstop -->
+
+---
+
+## Brief description:
+
+PFFFT does 1D Fast Fourier Transforms, of single precision real and
+complex vectors. It tries to do it fast, it tries to be correct, and it
+tries to be small. Computations do take advantage of SSE1 instructions
+on x86 cpus, Altivec on powerpc cpus, and NEON on ARM cpus. The
+license is BSD-like.
+
+PFFFT is a fork of [Julien Pommier's library on bitbucket](https://bitbucket.org/jpommier/pffft/)
+with some changes and additions.
+
+
+PFFASTCONV does fast convolution (FIR filtering), of single precision 
+real vectors, utilizing the PFFFT library. The license is BSD-like.
+
+PFDSP contains a few other signal processing functions.
+Currently, mixing and carrier generation functions are contained.
+It is work in progress - also the API!
+The fast convolution from PFFASTCONV might get merged into PFDSP.
+
+
+## Why does it exist:
+
+I (Julien Pommier) was in search of a good performing FFT library,
+preferably very small and with a very liberal license.
+
+When one says "fft library", FFTW ("Fastest Fourier Transform in the
+West") is probably the first name that comes to mind -- I guess that
+99% of open-source projects that need a FFT do use FFTW, and are happy
+with it. However, it is quite a large library, which does everything
+fft related (2d transforms, 3d transforms, other transformations such
+as discrete cosine, or fast hartley). And it is licensed under the
+GNU GPL, which means that it cannot be used in non open-source
+products.
+
+An alternative to FFTW that is really small, is the venerable FFTPACK
+v4, which is available on NETLIB. A more recent version (v5) exists,
+but it is larger as it deals with multi-dimensional transforms. This
+is a library that is written in FORTRAN 77, a language that is now
+considered as a bit antiquated by many. FFTPACKv4 was written in 1985,
+by Dr Paul Swarztrauber of NCAR, more than 25 years ago ! And despite
+its age, benchmarks show that it is still a very good performing FFT
+library, see for example the 1d single precision benchmarks
+[here](http://www.fftw.org/speed/opteron-2.2GHz-32bit/). It is however not
+competitive with the fastest ones, such as FFTW, Intel MKL, AMD ACML,
+Apple vDSP. The reason for that is that those libraries do take
+advantage of the SSE SIMD instructions available on Intel CPUs,
+available since the days of the Pentium III. These instructions deal
+with small vectors of 4 floats at a time, instead of a single float
+for a traditional FPU, so when using these instructions one may expect
+a 4-fold performance improvement.
+
+The idea was to take this fortran fftpack v4 code, translate to C,
+modify it to deal with those SSE instructions, and check that the
+final performance is not completely ridiculous when compared to other
+SIMD FFT libraries. Translation to C was performed with [f2c](
+http://www.netlib.org/f2c/). The resulting file was a bit edited in
+order to remove the thousands of gotos that were introduced by
+f2c. You will find the fftpack.h and fftpack.c sources in the
+repository, this is a complete translation of [fftpack](
+http://www.netlib.org/fftpack/), with the discrete cosine transform
+and the test program. There is no license information in the netlib
+repository, but it was confirmed to me by the fftpack v5 curators that
+the [same terms do apply to fftpack
+v4](http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html). This is a
+"BSD-like" license, it is compatible with proprietary projects.
+
+Adapting fftpack to deal with the SIMD 4-element vectors instead of
+scalar single precision numbers was more complex than I originally
+thought, especially with the real transforms, and I ended up writing
+more code than I planned.
+
+
+## The code:
+
+### Good old C:
+The FFT API is very very simple, just make sure that you read the comments in `pffft.h`.
+
+The Fast convolution's API is also very simple, just make sure that you read the comments 
+in `pffastconv.h`.
+
+### C++:
+A simple C++ wrapper is available in `pffft.hpp`.
+
+### Git:
+This archive's source can be downloaded with git (without the submodules):
+```
+git clone https://github.com/marton78/pffft.git
+```
+
+### Only two files?:
+_"Only two files, in good old C, pffft.c and pffft.h"_
+
+This statement does **NO LONGER** hold!
+
+With new functionality and support for AVX, there was need to restructure the sources.
+But you can compile and link **pffft** as a static library.
+
+
+## CMake:
+There's now CMake support to build the static libraries `libPFFFT.a` 
+and `libPFFASTCONV.a` from the source files, plus the additional 
+`libFFTPACK.a` library. The latter's sources are there anyway for the benchmark.
+
+There are several CMake options to modify library size and optimization.
+You can explore all available options with `cmake-gui` or `ccmake`,
+the console version - after having installed (on Debian/Ubuntu Linux) one of
+```
+sudo apt-get install cmake-qt-gui
+sudo apt-get install cmake-curses-gui
+```
+
+Some of the options:
+* `PFFFT_USE_TYPE_FLOAT` to activate single precision 'float' (default: ON)
+* `PFFFT_USE_TYPE_DOUBLE` to activate 'double' precision float (default: ON)
+* `PFFFT_USE_SIMD` to use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? (default: ON)
+* `DISABLE_SIMD_AVX` to disable AVX CPU features (default: OFF)
+* `PFFFT_USE_SIMD_NEON` to force using NEON on ARM (requires PFFFT_USE_SIMD) (default: OFF)
+* `PFFFT_USE_SCALAR_VECT` to use 4-element vector scalar operations (if no other SIMD) (default: ON)
+
+Options can be passed to `cmake` at command line, e.g.
+```
+cmake -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_TYPE_DOUBLE=ON
+```
+
+My Linux distribution defaults to GCC. With installed CLANG and the bash shell, you can use it with
+```
+mkdir build
+cd build
+CC=/usr/bin/clang CXX=/usr/bin/clang++ cmake -DCMAKE_BUILD_TYPE=Debug ../
+cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=~ ../
+ccmake .                          # or: cmake-gui .
+cmake --build .                   # or simply: make
+ctest                             # to execute some tests - including benchmarks
+cmake --build . --target install  # or simply: [sudo] make install
+```
+
+With MSVC on Windows, you need some different options. Following ones to build a 64-bit Release with Visual Studio 2019:
+```
+mkdir build
+cd build
+cmake -G "Visual Studio 16 2019" -A x64 ..
+cmake --build . --config Release
+ctest -C Release
+```
+
+see [https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators](https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators)
+
+
+## History / Origin / Changes:
+Origin for this code/fork is Julien Pommier's pffft on bitbucket:
+[https://bitbucket.org/jpommier/pffft/](https://bitbucket.org/jpommier/pffft/)
+
+Git history shows following first commits of the major contributors:
+* Julien Pommier: November 19, 2011
+* Marton Danoczy: September 30, 2015
+* Hayati Ayguen: December 22, 2019
+* Dario Mambro: March 24, 2020
+
+There are a few other contributors not listed here.
+
+The main changes include:
+* improved benchmarking, see [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks)
+* double support
+* avx(2) support
+* c++ headers (wrapper)
+* additional API helper functions
+* additional library for fast convolution
+* cmake support
+* ctest
+
+
+## Comparison with other FFTs:
+The idea was not to break speed records, but to get a decently fast
+fft that is at least 50% as fast as the fastest FFT -- especially on
+the slowest computers. I'm more focused on getting the best performance
+on slow cpus (Atom, Intel Core 1, old Athlons, ARM Cortex-A9...) than
+on getting top performance on today's fastest cpus.
+
+It can be used in a real-time context as the fft functions do not
+perform any memory allocation -- that is why they accept a 'work'
+array in their arguments.
+
+It is also a bit focused on performing 1D convolutions, that is why it
+provides "unordered" FFTs , and a fourier domain convolution
+operation.
+
+Very interesting is [https://www.nayuki.io/page/free-small-fft-in-multiple-languages](https://www.nayuki.io/page/free-small-fft-in-multiple-languages).
+It shows how small an FFT can be - including the Bluestein algorithm, but it's anything but fast.
+The whole C++ implementation file is 161 lines, including the Copyright header, see
+[https://github.com/nayuki/Nayuki-web-published-code/blob/master/free-small-fft-in-multiple-languages/FftComplex.cpp](https://github.com/nayuki/Nayuki-web-published-code/blob/master/free-small-fft-in-multiple-languages/FftComplex.cpp)
+
+## Dependencies / Required Linux packages
+
+On Debian/Ubuntu Linux following packages should be installed:
+
+```
+sudo apt-get install build-essential gcc g++ cmake
+```
+
+
+## Benchmarks and results
+
+#### Quicklink
+Find results at [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks).
+
+#### General
+My (Hayati Ayguen) first look at FFT-benchmarks was with [benchFFT](http://www.fftw.org/benchfft/)
+and especially the results of the benchmarks [results](http://www.fftw.org/speed/),
+which demonstrate the performance of the [FFTW](http://www.fftw.org/).
+Looking at the benchmarked computer systems from today's point of view (2021), these are quite outdated.
+
+Having a look into the [benchFFT source code](http://www.fftw.org/benchfft/benchfft-3.1.tar.gz),
+the latest source changes, including competitive fft implementations, are dated November 2003.
+
+In 2019, when pffft got my attention at [bitbucket](https://bitbucket.org/jpommier/pffft/src/master/),
+there were also some benchmark results.
+Unfortunately the results are tables with numbers - without graphical plots.
+Without the plots, I could not get an impression. That is why I started
+[https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks),
+which includes GnuPlot figures.
+
+Today in June 2021, i realized the existence of [https://github.com/FFTW/benchfft](https://github.com/FFTW/benchfft).
+This repository is much more up-to-date with a commit in December 2020.
+Unfortunately, it looks not so simple to get it run - including the generation of plots.
+
+Is there any website showing benchFFT results of more recent computer systems?
+
+Of course, it's very important, that a benchmark can be compared with a bunch
+of different FFT algorithms/implementations.
+This requires having these compiled/built and usable.
+
+
+#### Git submodules for Green-, Kiss- and Pocket-FFT
+Sources for [Green-](https://github.com/hayguen/greenffts),
+[Kiss-](https://github.com/hayguen/kissfft)
+and [Pocket-FFT](https://github.com/hayguen/pocketfft)
+can be downloaded directly with the sources of this repository - using git submodules:
+```
+git clone --recursive https://github.com/marton78/pffft.git
+```
+
+Important is `--recursive`, that does also fetch the submodules directly.
+But you might retrieve the submodules later, too:
+```
+git submodule update --init
+```
+
+#### Fastest Fourier Transform in the West: FFTW
+To allow comparison with FFTW [http://www.fftw.org/](http://www.fftw.org/),
+cmake option `-DPFFFT_USE_BENCH_FFTW=ON` has to be used with following commands.
+The cmake option requires previous setup of following (debian/ubuntu) package:
+```
+sudo apt-get install libfftw3-dev
+```
+
+#### Intel Math Kernel Library: MKL
+Intel's MKL [https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl.html](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl.html)
+currently looks even faster than FFTW.
+
+On Ubuntu-Linux it's easy to setup with the package `intel-mkl`.
+Similar on Debian: `intel-mkl-full`.
+
+There are special repositories for following Linux distributions:
+* Debian/apt: [https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html](https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html)
+* RedHat/yum: [https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-yum-repo.html](https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-yum-repo.html)
+* Gentoo/ebuild: [https://packages.gentoo.org/packages/sci-libs/mkl](https://packages.gentoo.org/packages/sci-libs/mkl)
+
+#### Performing the benchmarks - with CMake
+Benchmarks should be prepared by creating a special build folder
+```
+mkdir build_benches
+cd build_benches
+cmake ../bench
+```
+
+There are several CMake options to parametrize, which fft implementations should be benched.
+You can explore all available options with `cmake-gui` or `ccmake`, see [CMake](#cmake).
+
+Some of the options:
+* `BENCH_ID`         name the benchmark - used in filename
+* `BENCH_ARCH`       target architecture passed to compiler for code optimization
+* `PFFFT_USE_BENCH_FFTW`   use (system-installed) FFTW3 in fft benchmark? (default: OFF)
+* `PFFFT_USE_BENCH_GREEN`  use Green FFT in fft benchmark? (default: ON)
+* `PFFFT_USE_BENCH_KISS`   use KissFFT in fft benchmark? (default: ON)
+* `PFFFT_USE_BENCH_POCKET` use PocketFFT in fft benchmark? (default: ON)
+* `PFFFT_USE_BENCH_MKL`    use Intel MKL in fft benchmark?  (default: OFF)
+
+These options can be passed to `cmake` at command line, e.g.
+```
+cmake -DBENCH_ARCH=native -DPFFFT_USE_BENCH_FFTW=ON -DPFFFT_USE_BENCH_MKL=ON ../bench
+```
+
+The benchmarks are built and executed with
+```
+cmake --build .
+```
+
+You can also specify to use a different compiler/version with the cmake step, e.g.:
+
+```
+CC=/usr/bin/gcc-9 CXX=/usr/bin/g++-9 cmake -DBENCH_ID=gcc9 -DBENCH_ARCH=native -DPFFFT_USE_BENCH_FFTW=ON -DPFFFT_USE_BENCH_MKL=ON ../bench
+```
+
+```
+CC=/usr/bin/clang-11 CXX=/usr/bin/clang++-11 cmake -DBENCH_ID=clang11 -DBENCH_ARCH=native -DPFFFT_USE_BENCH_FFTW=ON -DPFFFT_USE_BENCH_MKL=ON ../bench
+```
+
+For using MSVC/Windows, the cmake command requires/needs the generator and architecture options and to be called from the VS Developer prompt:
+```
+cmake -G "Visual Studio 16 2019" -A x64 ../bench/
+```
+
+see [https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators](https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators)
+
+
+
+For running with different compiler version(s):
+* copy the result file (.tgz), e.g. `cp *.tgz ../`
+* delete the build directory: `rm -rf *`
+* then continue with the cmake step
+
+
+#### Benchmark results and contribution
+You might contribute by providing us the results of your computer(s).
+
+The benchmark results are stored in a separate git-repository:
+See [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks).
+
+This is to keep this repository's sources small.
+
--- a/pffft/bench/CMakeLists.txt
+++ b/pffft/bench/CMakeLists.txt
@@ -0,0 +1,224 @@
+# 2.8 stays the minimum required version. The "...3.15" upper bound declares
+# the policy range, so that CMake >= 4.0 - which rejects policy versions older
+# than 3.5 - still configures this project. CMake releases before 3.12 ignore
+# the upper bound.
+cmake_minimum_required(VERSION 2.8...3.15)
+project(BENCH_PFFFT)
+
+# Identifier of this benchmark run: it is part of the result directory name
+# (bench_<ID>), of the nested build directories (build_<ID>_float/_double)
+# and of the result archive (bench_<ID>.tgz).
+set(BENCH_ID  "default" CACHE STRING "ID: use single word without spaces. gets part of result filename")
+
+# Forwarded to the nested pffft build as USE_FAST_MATH.
+option(BENCH_FAST_MATH  "Build with fast math - non IEEE compliant" ON)
+
+# The default of BENCH_ARCH depends on the C compiler. Being a cache variable,
+# it can be overridden on the command line (-DBENCH_ARCH=...). The value is
+# forwarded to the nested pffft build as ARCH.
+if (CMAKE_C_COMPILER_ID STREQUAL "GNU")
+  set(BENCH_ARCH "native" CACHE STRING "target architecture (-march): native/SSE:core2/AVX:sandybridge/ARM-NEON:armv7-a")
+elseif (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+  set(BENCH_ARCH "native" CACHE STRING "target architecture (-march): native/SSE:core2/AVX:sandybridge")
+elseif (CMAKE_C_COMPILER_ID STREQUAL "MSVC")  # others: "Intel"
+  set(BENCH_ARCH "AVX" CACHE STRING "target architecture (/arch): SSE2/AVX")
+else()
+  set(BENCH_ARCH "" CACHE STRING "target architecture - use full compiler option!")
+endif()
+
+# architecture/optimization options
+# All options below are recorded in info.txt and passed on unchanged to the
+# nested pffft builds (targets build_float / build_double).
+option(PFFFT_USE_SIMD        "use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? - " ON)
+option(DISABLE_SIMD_AVX "disable AVX CPU features? - " OFF)
+option(PFFFT_USE_SIMD_NEON   "force using NEON on ARM? (requires PFFFT_USE_SIMD)" OFF)
+option(PFFFT_USE_SCALAR_VECT "use 4-element vector scalar operations (if no other SIMD)" ON)
+
+# selection of the FFT libraries to compare against in the fft benchmark
+option(PFFFT_USE_BENCH_FFTW   "use (system-installed) FFTW3 in fft benchmark?" OFF)
+option(PFFFT_USE_BENCH_GREEN  "use Green FFT in fft benchmark? - if exists in subdir" ON)
+option(PFFFT_USE_BENCH_KISS   "use KissFFT in fft benchmark? - if exists in subdir" ON)
+option(PFFFT_USE_BENCH_POCKET "use PocketFFT in fft benchmark? - if exists in subdir" ON)
+option(PFFFT_USE_BENCH_MKL    "use Intel MKL in fft benchmark? needs to be installed" OFF)
+
+
+# Coarse OS name for info.txt; UNIX takes precedence when both are set.
+if (UNIX)
+  set(OSSTR "Unix")
+elseif (WIN32)
+  set(OSSTR "Win32")
+else()
+  set(OSSTR "")
+endif()
+
+# Extras for the nested cmake runs. Only MSVC needs them: the executables end
+# up in a "Release/" subdirectory, and platform and make program are handed
+# on to the nested configure step.
+if (MSVC)
+  set(BUILD_DIR_TO_EXE "Release/")
+  set(CMAKE_PLATFORM_OPT "-A \"${CMAKE_GENERATOR_PLATFORM}\"")
+  set(CMAKE_MAKE_OPT "-DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}")
+else()
+  set(BUILD_DIR_TO_EXE "")
+  set(CMAKE_PLATFORM_OPT "")
+  set(CMAKE_MAKE_OPT "")
+endif()
+
+
+# Result directories (packed into the archive) ...
+set(benchdir     "${CMAKE_BINARY_DIR}/bench_${BENCH_ID}")
+set(benchdir_flt "${benchdir}/float")
+set(benchdir_dbl "${benchdir}/double")
+# ... and the directories of the nested pffft builds.
+set(builddir_flt "${CMAKE_BINARY_DIR}/build_${BENCH_ID}_float")
+set(builddir_dbl "${CMAKE_BINARY_DIR}/build_${BENCH_ID}_double")
+
+# Creates ${benchdir}/info.txt: a description of this benchmark run (CMake,
+# OS, compiler, architecture and the option values), one "echo" per line.
+# The first echo creates/truncates the file (">"), all others append (">>").
+# NOTE(review): the redirection tokens are part of each command line, so this
+# rule presumably depends on the build tool running the commands through a
+# shell - confirm before adding VERBATIM.
+add_custom_command(OUTPUT "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E make_directory "${benchdir}"
+  COMMAND ${CMAKE_COMMAND} -E echo "benchmark ${BENCH_ID}"   > "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "CMake major:    ${CMAKE_MAJOR_VERSION}"        >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "CMake minor:    ${CMAKE_MINOR_VERSION}"        >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "OS:             ${OSSTR}"                      >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "System:         ${CMAKE_SYSTEM_NAME}"          >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "System CPU:     ${CMAKE_SYSTEM_PROCESSOR}"     >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "System Version: ${CMAKE_HOST_SYSTEM_VERSION}"  >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "C   Compiler:   ${CMAKE_C_COMPILER_ID}"        >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "C   Version:    ${CMAKE_C_COMPILER_VERSION}"   >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "C++ Compiler:   ${CMAKE_CXX_COMPILER_ID}"      >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "C++ Version:    ${CMAKE_CXX_COMPILER_VERSION}" >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "MSVC Version:   ${MSVC_VERSION}"               >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "MSVC Toolset:   ${MSVC_TOOLSET_VERSION}"       >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "Exe Suffix:     ${CMAKE_EXECUTABLE_SUFFIX}"    >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "C   Byte Order: ${CMAKE_C_BYTE_ORDER}"         >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "C++ Byte Order: ${CMAKE_CXX_BYTE_ORDER}"       >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo ""                                              >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "Architecture:   ${BENCH_ARCH}"                 >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "Fast math:      ${BENCH_FAST_MATH}"            >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_SIMD=${PFFFT_USE_SIMD}"                   >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "config DISABLE_SIMD_AVX=${DISABLE_SIMD_AVX}"   >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_SIMD_NEON=${PFFFT_USE_SIMD_NEON}"         >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_SCALAR_VECT=${PFFFT_USE_SCALAR_VECT}"     >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_FFTW=${PFFFT_USE_BENCH_FFTW}"       >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_GREEN=${PFFFT_USE_BENCH_GREEN}"     >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_KISS=${PFFFT_USE_BENCH_KISS}"       >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_POCKET=${PFFFT_USE_BENCH_POCKET}"   >> "${benchdir}/info.txt"
+  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_MKL=${PFFFT_USE_BENCH_MKL}"         >> "${benchdir}/info.txt"
+)
+
+# Collects additional host information next to info.txt. On Unix the helper
+# script unix_info.sh runs inside ${benchdir} and drops its output files
+# there; elsewhere only the (empty) stamp file unix_info.txt is created.
+if (UNIX)
+  add_custom_command(OUTPUT "${benchdir}/unix_info.txt"
+    COMMAND ${CMAKE_COMMAND} -E touch "${benchdir}/unix_info.txt"
+    # Hand the script to bash as a file: "bash -c <path>" would execute the
+    # path as a command line, which requires the executable bit on the script
+    # and breaks when the source path contains spaces.
+    COMMAND bash "${CMAKE_CURRENT_SOURCE_DIR}/unix_info.sh"
+    DEPENDS "${benchdir}/info.txt"
+    WORKING_DIRECTORY ${benchdir}
+  )
+else()
+  add_custom_command(OUTPUT "${benchdir}/unix_info.txt"
+    COMMAND ${CMAKE_COMMAND} -E touch "${benchdir}/unix_info.txt"
+    DEPENDS "${benchdir}/info.txt"
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  )
+endif()
+
+
+# One rule per working directory: create the directory and drop a stamp file
+# "directory.txt" into it, which the build_*/bench_* targets depend on.
+foreach(stamp_dir
+    "${builddir_flt}"
+    "${builddir_dbl}"
+    "${benchdir_flt}"
+    "${benchdir_dbl}")
+  add_custom_command(OUTPUT "${stamp_dir}/directory.txt"
+    COMMAND ${CMAKE_COMMAND} -E make_directory "${stamp_dir}"
+    COMMAND ${CMAKE_COMMAND} -E touch "${stamp_dir}/directory.txt"
+  )
+endforeach()
+
+
+
+# Configures and builds pffft (the parent directory of this project) for
+# single precision in its own build tree ${builddir_flt}: a nested cmake run
+# with the same generator and the option values of this project, followed by
+# "cmake --build" in Release configuration.
+# CMAKE_PLATFORM_OPT and CMAKE_MAKE_OPT are empty unless MSVC is used.
+# NOTE(review): the architecture is passed as ARCH, while the CI workflow of
+# this repository sets TARGET_C_ARCH / TARGET_CXX_ARCH - confirm that the
+# parent CMakeLists.txt still evaluates ARCH.
+add_custom_target(build_float
+  COMMAND ${CMAKE_COMMAND} -E echo "start cmake for float in ${builddir_flt}"
+  COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" ${CMAKE_PLATFORM_OPT}
+                        "${CMAKE_MAKE_OPT}"
+                        -DCMAKE_BUILD_TYPE=Release
+                        "-DARCH=${BENCH_ARCH}"
+                        -DUSE_FAST_MATH=${BENCH_FAST_MATH}
+                        -DPFFFT_USE_TYPE_FLOAT=ON
+                        -DPFFFT_USE_TYPE_DOUBLE=OFF
+                        -DUSE_FLOAT_PREC=ON
+                        -DPFFFT_USE_SIMD=${PFFFT_USE_SIMD}
+                        -DDISABLE_SIMD_AVX=${DISABLE_SIMD_AVX}
+                        -DPFFFT_USE_SIMD_NEON=${PFFFT_USE_SIMD_NEON}
+                        -DPFFFT_USE_SCALAR_VECT=${PFFFT_USE_SCALAR_VECT}
+                        -DPFFFT_USE_BENCH_FFTW=${PFFFT_USE_BENCH_FFTW}
+                        -DPFFFT_USE_BENCH_GREEN=${PFFFT_USE_BENCH_GREEN}
+                        -DPFFFT_USE_BENCH_KISS=${PFFFT_USE_BENCH_KISS}
+                        -DPFFFT_USE_BENCH_POCKET=${PFFFT_USE_BENCH_POCKET}
+                        -DPFFFT_USE_BENCH_MKL=${PFFFT_USE_BENCH_MKL}
+                        "${CMAKE_SOURCE_DIR}/.."
+  # COMMAND ${CMAKE_COMMAND} -E echo "start cmake --build . for float in ${builddir_flt}"
+  COMMAND ${CMAKE_COMMAND} --build . --config Release
+  DEPENDS "${builddir_flt}/directory.txt"
+  WORKING_DIRECTORY "${builddir_flt}"
+)
+
+# Configures and builds pffft (the parent directory of this project) for
+# double precision in its own build tree ${builddir_dbl}: a nested cmake run
+# with the same generator and the option values of this project, followed by
+# "cmake --build" in Release configuration.
+# CMAKE_PLATFORM_OPT and CMAKE_MAKE_OPT are empty unless MSVC is used.
+# NOTE(review): the architecture is passed as ARCH, while the CI workflow of
+# this repository sets TARGET_C_ARCH / TARGET_CXX_ARCH - confirm that the
+# parent CMakeLists.txt still evaluates ARCH.
+add_custom_target(build_double
+  COMMAND ${CMAKE_COMMAND} -E echo "start cmake for double in ${builddir_dbl}"
+  COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" ${CMAKE_PLATFORM_OPT}
+                        "${CMAKE_MAKE_OPT}"
+                        -DCMAKE_BUILD_TYPE=Release
+                        "-DARCH=${BENCH_ARCH}"
+                        -DUSE_FAST_MATH=${BENCH_FAST_MATH}
+                        -DPFFFT_USE_TYPE_FLOAT=OFF
+                        -DPFFFT_USE_TYPE_DOUBLE=ON
+                        -DUSE_FLOAT_PREC=OFF
+                        -DPFFFT_USE_SIMD=${PFFFT_USE_SIMD}
+                        -DDISABLE_SIMD_AVX=${DISABLE_SIMD_AVX}
+                        -DPFFFT_USE_SIMD_NEON=${PFFFT_USE_SIMD_NEON}
+                        -DPFFFT_USE_SCALAR_VECT=${PFFFT_USE_SCALAR_VECT}
+                        -DPFFFT_USE_BENCH_FFTW=${PFFFT_USE_BENCH_FFTW}
+                        -DPFFFT_USE_BENCH_GREEN=${PFFFT_USE_BENCH_GREEN}
+                        -DPFFFT_USE_BENCH_KISS=${PFFFT_USE_BENCH_KISS}
+                        -DPFFFT_USE_BENCH_POCKET=${PFFFT_USE_BENCH_POCKET}
+                        -DPFFFT_USE_BENCH_MKL=${PFFFT_USE_BENCH_MKL}
+                        "${CMAKE_SOURCE_DIR}/.."
+  COMMAND ${CMAKE_COMMAND} -E echo "start cmake --build . for double in ${builddir_dbl}"
+  COMMAND ${CMAKE_COMMAND} --build . --config Release
+  DEPENDS "${builddir_dbl}/directory.txt"
+  WORKING_DIRECTORY "${builddir_dbl}"
+)
+
+# Runs the single precision benchmark executable of the nested build. The
+# working directory is ${benchdir_flt}, so files the executable writes with
+# relative paths end up in the result directory.
+add_custom_target(bench_float
+  COMMAND ${CMAKE_COMMAND} -E echo "start benchmark for float"
+  COMMAND "${builddir_flt}/${BUILD_DIR_TO_EXE}bench_pffft_float${CMAKE_EXECUTABLE_SUFFIX}"
+  DEPENDS "${benchdir_flt}/directory.txt" build_float
+  WORKING_DIRECTORY "${benchdir_flt}"
+)
+
+# Same for double precision, running in ${benchdir_dbl}.
+add_custom_target(bench_double
+  COMMAND ${CMAKE_COMMAND} -E echo "start benchmark for double"
+  COMMAND "${builddir_dbl}/${BUILD_DIR_TO_EXE}bench_pffft_double${CMAKE_EXECUTABLE_SUFFIX}"
+  DEPENDS "${benchdir_dbl}/directory.txt" build_double
+  WORKING_DIRECTORY "${benchdir_dbl}"
+)
+
+# Default target (ALL): runs the float and the double benchmark, then packs
+# the complete result directory ${benchdir} into bench_<ID>.tgz inside the
+# top-level build directory.
+add_custom_target(bench ALL
+  COMMAND ${CMAKE_COMMAND} -E echo ""
+  COMMAND ${CMAKE_COMMAND} -E tar cvz "bench_${BENCH_ID}.tgz" ${benchdir}
+  COMMAND ${CMAKE_COMMAND} -E echo ""
+  COMMAND ${CMAKE_COMMAND} -E echo "now mail result file bench_${BENCH_ID}.tgz"
+  # DEPENDS "${benchdir}/info.txt" "${benchdir}/unix_info.txt"
+  DEPENDS "${benchdir}/info.txt" bench_float bench_double "${benchdir}/unix_info.txt"
+  WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
+)
+
+# Like "bench", but runs the float benchmark only. Not part of ALL.
+add_custom_target(bench_float_tar
+  COMMAND ${CMAKE_COMMAND} -E echo ""
+  COMMAND ${CMAKE_COMMAND} -E tar cvz "bench_${BENCH_ID}.tgz" ${benchdir}
+  COMMAND ${CMAKE_COMMAND} -E echo ""
+  COMMAND ${CMAKE_COMMAND} -E echo "now mail result file bench_${BENCH_ID}.tgz"
+  DEPENDS "${benchdir}/info.txt" bench_float "${benchdir}/unix_info.txt"
+  WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
+)
+
+# Like "bench", but runs the double benchmark only. Not part of ALL.
+add_custom_target(bench_double_tar
+  COMMAND ${CMAKE_COMMAND} -E echo ""
+  COMMAND ${CMAKE_COMMAND} -E tar cvz "bench_${BENCH_ID}.tgz" ${benchdir}
+  COMMAND ${CMAKE_COMMAND} -E echo ""
+  COMMAND ${CMAKE_COMMAND} -E echo "now mail result file bench_${BENCH_ID}.tgz"
+  DEPENDS "${benchdir}/info.txt" bench_double "${benchdir}/unix_info.txt"
+  WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
+)
+
+# Removes the two nested build trees (float and double). The result directory
+# ${benchdir} and the result archive are not touched by this target.
+add_custom_target(clean_results
+  COMMAND ${CMAKE_COMMAND} -E remove_directory "${builddir_flt}"
+  COMMAND ${CMAKE_COMMAND} -E remove_directory "${builddir_dbl}"
+  WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
+)
+
--- a/pffft/bench/unix_info.sh
+++ b/pffft/bench/unix_info.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# Collects CPU and distribution information of the host into files in the
+# current working directory.
+
+# CPU description
+lscpu > unix_lscpu.txt
+cat /proc/cpuinfo > unix_cpuinfo.txt
+
+# distribution description
+lsb_release -a  > unix_lsb_release.txt
+
+# copy the /etc/*-release files, if there are any
+RELEASE_FILES=$(ls -1 /etc/*-release)
+if [ -n "$RELEASE_FILES" ]; then
+  cp /etc/*-release ./
+fi
--- a/pffft/bench_conv.cpp
+++ b/pffft/bench_conv.cpp
@@ -0,0 +1,345 @@
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include <algorithm>
+#include <random>
+#include <cstdint>
+#include <complex>
+#include <vector>
+
+#include "papi_perf_counter.h"
+
+//#if defined(HAVE_MIPP) && !defined(NO_MIPP)
+#if defined(HAVE_MIPP)
+#include <mipp.h>
+
+#define MIPP_VECTOR  mipp::vector
+#else
+#define MIPP_VECTOR  std::vector
+#endif
+
+#include "pf_conv_dispatcher.h"
+#include "pf_conv.h"
+
+
+#define TEST_WITH_MIN_LEN     0
+
+
+/* Creates a vector of N floats (of M floats, when N < 0): the first M entries
+ * are pseudo-random values of a Mersenne Twister seeded with seed_value,
+ * scaled to roughly [-1, 1]. Any remaining entries are set to zero.
+ */
+MIPP_VECTOR<float> generate_rng_vec(int M, int N = -1, int seed_value = 1)
+{
+    MIPP_VECTOR<float> v(N < 0 ? M : N);
+    std::mt19937 g;
+    g.seed(seed_value);
+    // std::mt19937 delivers 32 random bits per call. Interpret exactly these
+    // 32 bits as signed value: int_fast32_t / INT_FAST32_MAX may be 64 bits
+    // wide, which squeezes all samples into a tiny, purely positive range.
+    constexpr float scale = 1.0F / (1.0F + float(INT32_MAX));
+    // never write behind the end of v - even when called with 0 <= N < M
+    const int num_rand = std::min(M, int(v.size()));
+    for (int k = 0; k < num_rand; ++k)
+        v[k] = float(int32_t(uint32_t(g()))) * scale;
+    for (int k = num_rand; k < N; ++k)
+        v[k] = 0.0F;
+    return v;
+}
+
+
+/* Benchmarks the out-of-place float convolution kernel of conv_arch directly
+ * on the signal, without any intermediate buffer: per iteration state.size is
+ * raised by blockLen and the kernel is called once. Every call receives the
+ * start of y as output pointer.
+ * Returns the sum of the kernel's return values (output sample counts).
+ */
+int bench_oop_core(
+        const conv_f_ptrs & conv_arch,
+        const float * signal, const int sz_signal,
+        const float * filter, const int sz_filter,
+        const int blockLen,
+        float * y
+        )
+{
+    const auto kernel = conv_arch.fp_conv_float_oop;
+    conv_buffer_state state;
+    state.offset = 0;
+    state.size = 0;
+    int total_out = 0;
+    // presumably measures from here until the end of the function - TODO confirm
+    papi_perf_counter perf_counter(1);
+    int pos = 0;
+    while (pos + blockLen <= sz_signal)
+    {
+        state.size += blockLen;
+        total_out += kernel(signal, &state, filter, sz_filter, y);
+        pos += blockLen;
+    }
+    return total_out;
+}
+
+/* Benchmarks the in-place float convolution kernel of conv_arch directly on
+ * the signal, which is overwritten: per iteration state.size is raised by
+ * blockLen and the kernel is called once.
+ * Returns the sum of the kernel's return values (output sample counts).
+ */
+int bench_inplace_core(
+        const conv_f_ptrs & conv_arch,
+        float * signal, const int sz_signal,
+        const float * filter, const int sz_filter,
+        const int blockLen
+        )
+{
+    const auto kernel = conv_arch.fp_conv_float_inplace;
+    conv_buffer_state state;
+    state.offset = 0;
+    state.size = 0;
+    int total_out = 0;
+    // presumably measures from here until the end of the function - TODO confirm
+    papi_perf_counter perf_counter(1);
+    int pos = 0;
+    while (pos + blockLen <= sz_signal)
+    {
+        state.size += blockLen;
+        total_out += kernel(signal, &state, filter, sz_filter);
+        pos += blockLen;
+    }
+    return total_out;
+}
+
+
+/* Benchmarks block-wise out-of-place float convolution through a work buffer.
+ * Per block of blockLen samples: move_rest() is applied to the buffer, the
+ * next block of the signal is appended at buffer[state.size], and the kernel
+ * writes its output behind the samples produced so far in y.
+ * Returns the total number of output samples.
+ */
+int bench_oop(
+        const conv_f_ptrs & conv_arch,
+        float * buffer,
+        const float * signal, const int sz_signal,
+        const float * filter, const int sz_filter,
+        const int blockLen,
+        float * y
+        )
+{
+    const auto kernel = conv_arch.fp_conv_float_oop;
+    const auto keep_rest = conv_arch.fp_conv_float_move_rest;
+    conv_buffer_state state;
+    state.offset = 0;
+    state.size = 0;
+    int total_out = 0;
+    // presumably measures from here until the end of the function - TODO confirm
+    papi_perf_counter perf_counter(1);
+    int pos = 0;
+    while (pos + blockLen <= sz_signal)
+    {
+        keep_rest(buffer, &state);
+        const float * block_begin = signal + pos;
+        std::copy(block_begin, block_begin + blockLen, buffer + state.size);
+        state.size += blockLen;
+        total_out += kernel(buffer, &state, filter, sz_filter, y + total_out);
+        pos += blockLen;
+    }
+    return total_out;
+}
+
+/* Benchmarks block-wise out-of-place convolution of a complex signal with a
+ * real filter through a complex work buffer. signal_re and y_re are
+ * reinterpreted as arrays of complexf, so sz_signal_re floats make up
+ * sz_signal_re / 2 complex samples; blockLen counts complex samples.
+ * Returns the total number of (complex) output samples.
+ */
+int bench_cx_real_oop(
+        const conv_f_ptrs & conv_arch,
+        complexf * buffer,
+        const float * signal_re, const int sz_signal_re,
+        const float * filter, const int sz_filter,
+        const int blockLen,
+        float * y_re
+        )
+{
+    const auto kernel = conv_arch.fp_conv_cplx_float_oop;
+    const auto keep_rest = conv_arch.fp_conv_cplx_move_rest;
+    // view the float arrays as interleaved complex data
+    const complexf * signal = reinterpret_cast<const complexf *>(signal_re);
+    complexf * y = reinterpret_cast<complexf *>(y_re);
+    const int sz_signal = sz_signal_re / 2;
+    conv_buffer_state state;
+    state.offset = 0;
+    state.size = 0;
+    int total_out = 0;
+    // presumably measures from here until the end of the function - TODO confirm
+    papi_perf_counter perf_counter(1);
+    int pos = 0;
+    while (pos + blockLen <= sz_signal)
+    {
+        keep_rest(buffer, &state);
+        const complexf * block_begin = signal + pos;
+        std::copy(block_begin, block_begin + blockLen, buffer + state.size);
+        state.size += blockLen;
+        total_out += kernel(buffer, &state, filter, sz_filter, y + total_out);
+        pos += blockLen;
+    }
+    return total_out;
+}
+
+
+/* Command line benchmark of the convolution kernels of one architecture.
+ *
+ * options:
+ *   -a <arch>   index of the architecture to benchmark (default: 0)
+ *   -n <count>  total number of MSamples to process (default: 64)
+ *   -f <len>    filter length; rounded up to a multiple of the biggest SIMD size
+ *   -b <len>    block length in samples (default: 1024)
+ *   -ss <seed>  random seed for the signal
+ *   -sf <seed>  random seed for the filter coefficients
+ *   -v          verbose output
+ *   -h          print usage and exit
+ *
+ * Without any argument the usage is printed and the benchmark runs with the
+ * defaults. Returns 1, when the selected architecture is not available -
+ * else 0.
+ */
+int main(int argc, char *argv[])
+{
+    // cli defaults:
+    // process 64 MSamples (256 MByte of float input, plus the same for the output)
+    // in blocks of 1 kSamples with filterLen 128
+    int arch = 0, N = 64 * 1024 * 1024;
+    int filterLen = 128, blockLen = 1024;
+    int seed_sig = 1, seed_filter = 2;
+    bool verbose = false, exitFromUsage = false, showUsage = (argc <= 1);
+
+    for (int i = 1; i < argc; ++i)
+    {
+        if (i+1 < argc && !strcmp(argv[i], "-a"))
+            arch = atoi(argv[++i]);
+        else if (i+1 < argc && !strcmp(argv[i], "-n"))
+        {
+            // 2047 MSamples is the biggest count, for which N + 1 still fits into
+            // a 32 bit int. Anything else is mapped to the invalid value 0.
+            const int maxMSamples = 2047;
+            const int numMSamples = atoi(argv[++i]);
+            N = (numMSamples > 0 && numMSamples <= maxMSamples) ? numMSamples * 1024 * 1024 : 0;
+        }
+        else if (i+1 < argc && !strcmp(argv[i], "-f"))
+            filterLen = atoi(argv[++i]);
+        else if (i+1 < argc && !strcmp(argv[i], "-b"))
+            blockLen = atoi(argv[++i]);
+        else if (i+1 < argc && !strcmp(argv[i], "-ss"))
+            seed_sig = atoi(argv[++i]);
+        else if (i+1 < argc && !strcmp(argv[i], "-sf"))
+            seed_filter = atoi(argv[++i]);
+        else if (!strcmp(argv[i], "-v"))
+            verbose = true;
+        else if (!strcmp(argv[i], "-h"))
+            showUsage = exitFromUsage = true;
+        else
+            fprintf(stderr, "warning: ignoring/skipping unknown option '%s'\n", argv[i]);
+    }
+
+    int num_arch = 0;
+    const ptr_to_conv_f_ptrs * conv_arch_ptrs = get_all_conv_arch_ptrs(&num_arch);
+
+    if (verbose)
+    {
+        fprintf(stderr, "num_arch is %d\n", num_arch);
+        for (int a = 0; a < num_arch; ++a)
+            if (conv_arch_ptrs[a])
+                fprintf(stderr, " arch %d is '%s'\n", a, conv_arch_ptrs[a]->id );
+            else
+                fprintf(stderr, " arch %d is nullptr !!!\n", a );
+        fprintf(stderr, "\n");
+    }
+
+    // negative lengths are as unusable as zero: they would make the block loops
+    // of the benchmark functions run away
+    const bool invalidArgs = ( arch < 0 || arch >= num_arch
+                               || blockLen <= 0 || N <= 0 || filterLen <= 0 );
+
+    if ( invalidArgs || showUsage )
+    {
+        fprintf(stderr, "%s [-v] [-a <arch>] [-n <total # of MSamples>] [-f <filter length>] [-b <blockLength in samples>]\n", argv[0]);
+        fprintf(stderr, "    [-ss <random seed for signal>] [-sf <random seed for filter coeffs>]\n");
+        fprintf(stderr, "arch is one of:");
+        for (int a = 0; a < num_arch; ++a)
+            if (conv_arch_ptrs[a])
+                fprintf(stderr, " %d for '%s'%s", a, conv_arch_ptrs[a]->id, (a < num_arch-1 ? ",":"") );
+        fprintf(stderr, "\n");
+        if ( exitFromUsage || invalidArgs )
+            return 0;
+    }
+
+    if (verbose)
+    {
+        #ifdef HAVE_PAPI
+        fprintf(stderr, "PAPI is available\n");
+        #else
+        fprintf(stderr, "PAPI is NOT available!\n");
+        #endif
+    }
+    #if !defined(HAVE_MIPP)
+    fprintf(stderr, "MIPP is NOT available!\n");
+    #endif
+
+    // determine the biggest SIMD size over all available architectures
+    int max_simd_size = -1;
+    for (int a = 0; a < num_arch; ++a)
+    {
+        if (conv_arch_ptrs[a])
+        {
+            const int sz = conv_arch_ptrs[a]->fp_conv_float_simd_size();
+            if (max_simd_size < sz)
+                max_simd_size = sz;
+            if (verbose)
+                fprintf(stderr, "float simd size for '%s': %d\n", conv_arch_ptrs[a]->id, sz);
+        }
+    }
+    if (verbose)
+        fprintf(stderr, "max float simd size: %d\n", max_simd_size);
+
+#if TEST_WITH_MIN_LEN
+    filterLen = 2;
+#endif
+
+    // round up filter length - skipped without a usable SIMD size, which would
+    // produce a division by zero or a negative length
+    if (max_simd_size > 0)
+        filterLen = max_simd_size * ( ( filterLen + max_simd_size -1 ) / max_simd_size );
+
+#if TEST_WITH_MIN_LEN
+    blockLen = 1;
+    N = 2 * (3 + filterLen);    // produce 3+1 samples
+#endif
+
+    if (!conv_arch_ptrs[arch])
+    {
+        fprintf(stderr, "Error: architecture %d is NOT available!\n", arch);
+        return 1;
+    }
+    const conv_f_ptrs & conv_arch =  *conv_arch_ptrs[arch];
+    if (verbose)
+        fprintf(stderr, "arch is using mipp: %d\n", conv_arch.using_mipp);
+
+    fprintf(stderr, "processing N = %d MSamples with block length of %d samples with filter length %d taps on '%s'\n",
+        N / (1024 * 1024), blockLen, filterLen, conv_arch.id );
+
+    // every buffer gets one extra element at its end, holding a marker value:
+    // the asserts below check, that no benchmark overwrote these markers
+    MIPP_VECTOR<float> s = generate_rng_vec(N + 1, N + 1, seed_sig);
+    MIPP_VECTOR<float> y(N + 1, 0.0F);
+    MIPP_VECTOR<float> filter = generate_rng_vec(filterLen, filterLen, seed_filter);
+    MIPP_VECTOR<float> buffer(blockLen + filterLen + 1, 0.0F);
+    MIPP_VECTOR<complexf> buffer_cx(blockLen + filterLen + 1);
+
+#if 1 && TEST_WITH_MIN_LEN
+    for (int k = 0; k < N; ++k)
+        s[k] = (k+1);
+    for (int k = 0; k < filterLen; ++k)
+        filter[k] = (k+1);
+#endif
+
+    s[N] = 123.0F;
+    y[N] = 321.0F;
+    buffer[blockLen + filterLen] = 789.0F;
+    buffer_cx[blockLen + filterLen].i = 987.0F;
+
+    fprintf(stderr, "\nrunning out-of-place convolution core for '%s':\n", conv_arch.id);
+    int n_oop_out = bench_oop_core(conv_arch, s.data(), N, filter.data(), filterLen, blockLen, y.data());
+    fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
+#if TEST_WITH_MIN_LEN
+    for (int k = 0; k < n_oop_out; ++k )
+        fprintf(stderr, "y[%2d] = %g\n", k, y[k]);
+    fprintf(stderr, "\n");
+#endif
+
+    fprintf(stderr, "\nrunning out-of-place convolution for '%s':\n", conv_arch.id);
+    n_oop_out = bench_oop(conv_arch, buffer.data(), s.data(), N, filter.data(), filterLen, blockLen, y.data());
+    fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
+    assert(s[N] == 123.0F);
+    assert(y[N] == 321.0F);
+    assert(buffer[blockLen + filterLen] == 789.0F);
+    assert(buffer_cx[blockLen + filterLen].i == 987.0F);
+#if TEST_WITH_MIN_LEN
+    for (int k = 0; k < n_oop_out; ++k )
+        fprintf(stderr, "y[%2d] = %g\n", k, y[k]);
+    fprintf(stderr, "\n");
+#endif
+
+    fprintf(stderr, "\nrunning out-of-place complex/real convolution for '%s':\n", conv_arch.id);
+    n_oop_out = bench_cx_real_oop(conv_arch, buffer_cx.data(), s.data(), N, filter.data(), filterLen, blockLen, y.data());
+    fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
+    assert(s[N] == 123.0F);
+    assert(y[N] == 321.0F);
+    assert(buffer[blockLen + filterLen] == 789.0F);
+    assert(buffer_cx[blockLen + filterLen].i == 987.0F);
+#if TEST_WITH_MIN_LEN
+    fprintf(stderr, "complex output (%d complex samples):\n", n_oop_out);
+    for (int k = 0; k < n_oop_out; ++k )
+        fprintf(stderr, "y[%2d] = %g  %+g * i\n", k, y[2*k], y[2*k+1]);
+    fprintf(stderr, "\n");
+
+    const std::complex<float> * sc = reinterpret_cast< std::complex<float>* >( s.data() );
+    const int Nc = N /2;
+    fprintf(stderr, "reference with std::complex<float>:\n");
+    for (int off = 0; off +filterLen <= Nc; ++off )
+    {
+        std::complex<float> sum(0.0F, 0.0F);
+        for (int k=0; k < filterLen; ++k)
+            sum += sc[off+k] * filter[k];
+        fprintf(stderr, "yv[%2d] = %g  %+g * i\n", off, sum.real(), sum.imag() );
+    }
+#endif
+
+    // the in-place benchmark overwrites the signal: keep it the last one
+    fprintf(stderr, "\nrunning inplace convolution core for '%s':\n", conv_arch.id);
+    int n_inp_out = bench_inplace_core(conv_arch, s.data(), N, filter.data(), filterLen, blockLen);
+    fprintf(stderr, "inp produced %d output samples\n", n_inp_out);
+    assert(s[N] == 123.0F);
+    assert(y[N] == 321.0F);
+    assert(buffer[blockLen + filterLen] == 789.0F);
+    assert(buffer_cx[blockLen + filterLen].i == 987.0F);
+#if TEST_WITH_MIN_LEN
+    for (int k = 0; k < n_inp_out; ++k )
+        fprintf(stderr, "y[%2d] = %g\n", k, s[k]);
+    fprintf(stderr, "\n");
+#endif
+
+    fprintf(stderr, "\n");
+    return 0;
+}
--- a/pffft/bench_mixers.cpp
+++ b/pffft/bench_mixers.cpp
@@ -0,0 +1,889 @@
+/*
+  Copyright (c) 2020  Hayati Ayguen ( h_ayguen@web.de )
+
+  bench for mixer algorithm/implementations
+
+ */
+
+#include <pf_mixer.h>
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+#include <string.h>
+
+#include "papi_perf_counter.h"
+
+#if defined(__linux__)
+#define HAVE_SYS_TIMES
+#endif
+
+#ifdef HAVE_SYS_TIMES
+#  include <sys/times.h>
+#  include <unistd.h>
+#endif
+
+#ifdef WIN32
+#define WIN32_LEAN_AND_MEAN
+#define VC_EXTRALEAN
+#include <windows.h>
+#endif
+
+#define BENCH_REF_TRIG_FUNC       1
+#define BENCH_OUT_OF_PLACE_ALGOS  0
+#define BENCH_INPLACE_ALGOS       1
+
+#define SAVE_BY_DEFAULT  0
+#define SAVE_LIMIT_MSPS           16
+
+#if 0
+  #define BENCH_FILE_SHIFT_MATH_CC           "/home/ayguen/WindowsDesktop/mixer_test/A_shift_math_cc.bin"
+  #define BENCH_FILE_ADD_FAST_CC             "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_cc.bin"
+  #define BENCH_FILE_ADD_FAST_INP_C          "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_inp_c.bin"
+  #define BENCH_FILE_UNROLL_INP_C            "/home/ayguen/WindowsDesktop/mixer_test/D_shift_unroll_inp_c.bin"
+  #define BENCH_FILE_LTD_UNROLL_INP_C        "/home/ayguen/WindowsDesktop/mixer_test/E_shift_limited_unroll_inp_c.bin"
+  #define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C  "/home/ayguen/WindowsDesktop/mixer_test/F_shift_limited_unroll_A_sse_inp_c.bin"
+  #define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C  "/home/ayguen/WindowsDesktop/mixer_test/G_shift_limited_unroll_B_sse_inp_c.bin"
+  #define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C  "/home/ayguen/WindowsDesktop/mixer_test/H_shift_limited_unroll_C_sse_inp_c.bin"
+  #define BENCH_FILE_REC_OSC_CC              ""
+  #define BENCH_FILE_REC_OSC_INP_C           "/home/ayguen/WindowsDesktop/mixer_test/I_shift_recursive_osc_inp_c.bin"
+  #define BENCH_FILE_REC_OSC_SSE_INP_C       "/home/ayguen/WindowsDesktop/mixer_test/J_shift_recursive_osc_sse_inp_c.bin"
+#else
+  #define BENCH_FILE_SHIFT_MATH_CC           ""
+  #define BENCH_FILE_ADD_FAST_CC             ""
+  #define BENCH_FILE_ADD_FAST_INP_C          ""
+  #define BENCH_FILE_UNROLL_INP_C            ""
+  #define BENCH_FILE_LTD_UNROLL_INP_C        ""
+  #define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C  ""
+  #define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C  ""
+  #define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C  ""
+  #define BENCH_FILE_REC_OSC_CC              ""
+  #define BENCH_FILE_REC_OSC_INP_C           ""
+  #define BENCH_FILE_REC_OSC_SSE_INP_C       ""
+#endif
+
+
+
+/* uclock_sec(): returns a process time stamp in seconds, used to measure the
+ * duration of the benchmarks. One of three implementations is selected at
+ * compile time. Only the times() based variant evaluates find_start.
+ */
+#if defined(HAVE_SYS_TIMES)
+    /* clock ticks per second - queried once with sysconf() on first use */
+    static double ttclk = 0.;
+
+    /* Variant using times(): returns the user CPU time of this process.
+     * With find_start != 0 the function polls times() until the user time
+     * changes, so that the returned value lies right at a change of the tick
+     * counter.
+     */
+    static double uclock_sec(int find_start)
+    {
+        struct tms t0, t;
+        if (ttclk == 0.)
+        {
+            ttclk = sysconf(_SC_CLK_TCK);
+            fprintf(stderr, "sysconf(_SC_CLK_TCK) => %f\n", ttclk);
+        }
+        times(&t);
+        if (find_start)
+        {
+            t0 = t;
+            while (t0.tms_utime == t.tms_utime)
+                times(&t);
+        }
+        /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
+        return ((double)t.tms_utime) / ttclk;
+    }
+
+#elif defined(WIN32)
+    // https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getprocesstimes
+    /* Variant using GetProcessTimes(): combines both 32 bit halves of the
+     * 4th result (the user time) into one 64 bit value and scales it by 1e-7
+     * to seconds. find_start is not used. Returns 0, when the call fails.
+     */
+    double uclock_sec(int find_start)
+    {
+        FILETIME a, b, c, d;
+        if (GetProcessTimes(GetCurrentProcess(), &a, &b, &c, &d) != 0)
+        {
+            //  Returns total user time.
+            //  Can be tweaked to include kernel times as well.
+            return
+                (double)(d.dwLowDateTime |
+                    ((unsigned long long)d.dwHighDateTime << 32)) * 0.0000001;
+        }
+        else {
+            //  Handle error
+            return 0;
+        }
+    }
+
+#else
+    /* Fallback using clock() of the C library. find_start is not used. */
+    double uclock_sec(int find_start)
+    { return (double)clock()/(double)CLOCKS_PER_SEC; }
+#endif
+
+
+/* Writes the first N samples of d - in whole blocks of B samples and limited
+ * to SAVE_LIMIT_MSPS * 1024 * 1024 samples - as raw binary data to file fn.
+ * Without a filename (NULL or empty) nothing is written, unless
+ * SAVE_BY_DEFAULT is set: then "/dev/shm/bench.bin" is used.
+ * Errors are reported on stderr.
+ */
+void save(complexf * d, int B, int N, const char * fn)
+{
+    if (!fn || !fn[0])
+    {
+        if (! SAVE_BY_DEFAULT)
+            return;
+        fn = "/dev/shm/bench.bin";
+    }
+    // a block size of 0 would never advance the loop below
+    if (B <= 0)
+        return;
+    FILE * f = fopen(fn, "wb");
+    if (!f) {
+        fprintf(stderr, "error writing result to %s\n", fn);
+        return;
+    }
+    if ( N >= SAVE_LIMIT_MSPS * 1024 * 1024 )
+        N = SAVE_LIMIT_MSPS * 1024 * 1024;
+    for (int off = 0; off + B <= N; off += B)
+    {
+        // stop at the first incomplete write (e.g. disk full), instead of
+        // silently producing a truncated file
+        if (fwrite(d+off, sizeof(complexf), B, f) != (size_t)B)
+        {
+            fprintf(stderr, "error writing result to %s\n", fn);
+            break;
+        }
+    }
+    fclose(f);
+}
+
+
+/* Timed core loop for shift_math_cc(): processes the input block by block
+ * (B samples each) into output, until the next block would reach the end of
+ * the N samples or - unless ignore_time is set - 500 ms have passed.
+ * The first block is processed unconditionally, so B must not exceed N.
+ * Delivers the number of processed blocks in iters_out and the number of
+ * processed samples in off_out; returns the measured duration in seconds.
+ */
+double bench_core_shift_math_cc(
+        const int B, const int N, const bool ignore_time,
+        const complexf *input,
+        complexf *output,
+        int &iters_out, int &off_out
+        )
+{
+    const double t_begin = uclock_sec(1);
+    const double t_limit = t_begin + 0.5;  /* benchmark duration: 500 ms */
+    double t_now;
+    float phase = 0.0F;
+    int pos = 0, num_blocks = 0;
+    papi_perf_counter perf_counter(1);
+
+    for (;;)
+    {
+        // work
+        phase = shift_math_cc(input+pos, output+pos, B, -0.0009F, phase);
+        pos += B;
+        ++num_blocks;
+        t_now = uclock_sec(0);
+        if ( !(pos + B < N) )
+            break;      // no room for another full block
+        if ( !ignore_time && !(t_now < t_limit) )
+            break;      // time is up
+    }
+
+    iters_out = num_blocks;
+    off_out = pos;
+    return t_now - t_begin;
+}
+
+/* Benchmarks shift_math_cc() on N samples of a generated oscillator signal,
+ * processed in blocks of B samples. The result is handed to save().
+ * Returns the number of processed samples per second - or 0, when the
+ * buffers could not be allocated.
+ */
+double bench_shift_math_cc(const int B, const int N, const bool ignore_time) {
+    int iter, off;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    complexf *output = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+
+    // report a failed allocation, instead of crashing in the signal generator
+    if (!input || !output)
+    {
+        fprintf(stderr, "error: unable to allocate buffers for %d samples\n", N);
+        free(input);
+        free(output);
+        return 0.0;
+    }
+
+    // fill the input with the test signal
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    double T = bench_core_shift_math_cc(B, N, ignore_time, input, output,  iter, off);
+
+    save(output, B, off, BENCH_FILE_SHIFT_MATH_CC);
+
+    free(input);
+    free(output);
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    double nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+/* Benchmarks shift_table_cc() with a table of 65536 entries on N samples of
+ * a generated oscillator signal, processed in blocks of B samples for up to
+ * 500 ms. The first block is processed unconditionally, so B must not
+ * exceed N.
+ * Returns the number of processed samples per second - or 0, when the
+ * buffers could not be allocated.
+ */
+double bench_shift_table_cc(int B, int N) {
+    double t0, t1, tstop, T, nI;
+    int iter, off;
+    int table_size=65536;
+    float phase = 0.0F;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    complexf *output = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+
+    // report a failed allocation, instead of crashing in the signal generator
+    if (!input || !output)
+    {
+        fprintf(stderr, "error: unable to allocate buffers for %d samples\n", N);
+        free(input);
+        free(output);
+        return 0.0;
+    }
+
+    // NOTE(review): table_data is never released in this function - check,
+    // whether pf_mixer.h offers a matching deinit function
+    shift_table_data_t table_data = shift_table_init(table_size);
+
+    // fill the input with the test signal
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    iter = 0;
+    off = 0;
+    t0 = uclock_sec(1);
+    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
+    do {
+        // work
+        phase = shift_table_cc(input+off, output+off, B, -0.0009F, table_data, phase);
+
+        off += B;
+        ++iter;
+        t1 = uclock_sec(0);
+    } while ( t1 < tstop && off + B < N );
+
+    save(output, B, off, NULL);
+    free(input);
+    free(output);
+    T = ( t1 - t0 );  /* total duration of the benchmark loop */
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+/* Benchmarks shift_addfast_cc() on N samples of a generated oscillator
+ * signal, processed in blocks of B samples for up to 500 ms. The first block
+ * is processed unconditionally, so B must not exceed N. The result is handed
+ * to save().
+ * Returns the number of processed samples per second - or 0, when the
+ * buffers could not be allocated.
+ */
+double bench_shift_addfast(int B, int N) {
+    double t0, t1, tstop, T, nI;
+    int iter, off;
+    float phase = 0.0F;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    complexf *output = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+
+    // report a failed allocation, instead of crashing in the signal generator
+    if (!input || !output)
+    {
+        fprintf(stderr, "error: unable to allocate buffers for %d samples\n", N);
+        free(input);
+        free(output);
+        return 0.0;
+    }
+
+    shift_addfast_data_t state = shift_addfast_init(-0.0009F);
+
+    // fill the input with the test signal
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    iter = 0;
+    off = 0;
+    t0 = uclock_sec(1);
+    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
+    do {
+        // work
+        phase = shift_addfast_cc(input+off, output+off, B, &state, phase);
+
+        off += B;
+        ++iter;
+        t1 = uclock_sec(0);
+    } while ( t1 < tstop && off + B < N );
+
+    save(output, B, off, BENCH_FILE_ADD_FAST_CC);
+
+    free(input);
+    free(output);
+    T = ( t1 - t0 );  /* total duration of the benchmark loop */
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+/*
+ * Timed inner loop for shift_addfast_inp_c(): shifts `data` in place, one
+ * block of B samples per call, walking forward through the buffer.
+ *
+ * At least one block is always processed. The loop ends as soon as the
+ * next block would not end strictly before N, or - unless ignore_time is
+ * set - once the 500 ms budget is used up.
+ *
+ * state      mixer state passed to every call
+ * iters_out  receives the number of processed blocks
+ * off_out    receives the number of processed samples
+ * returns    difference between the last and the first uclock_sec() reading
+ */
+double bench_core_shift_addfast_inplace(
+        const int B, const int N, const bool ignore_time,
+        complexf *data,
+        shift_addfast_data_t &state,
+        int &iters_out, int &off_out
+        )
+{
+    const double t_begin = uclock_sec(1);
+    const double t_limit = t_begin + 0.5;  /* time budget: 500 ms */
+    double t_now;
+    float phase = 0.0F;
+    int pos = 0, blocks = 0;
+    /* NOTE(review): presumably a scoped PAPI measurement covering the loop - confirm */
+    papi_perf_counter perf_counter(1);
+
+    for (;;) {
+        /* the phase returned by one block is fed into the next one */
+        phase = shift_addfast_inp_c(&data[pos], B, &state, phase);
+        pos += B;
+        blocks++;
+        t_now = uclock_sec(0);
+        if ( pos + B >= N )
+            break;  /* buffer limit reached */
+        if ( !ignore_time && !(t_now < t_limit) )
+            break;  /* time budget used up */
+    }
+
+    iters_out = blocks;
+    off_out = pos;
+    return t_now - t_begin;
+}
+
+/*
+ * Benchmark shift_addfast_inp_c() (in-place).
+ *
+ * Fills a buffer of N samples with a test signal from the recursive
+ * oscillator, runs bench_core_shift_addfast_inplace() over it in blocks of
+ * B samples and hands the processed part to save() with
+ * BENCH_FILE_ADD_FAST_INP_C. The core loop processes its first block
+ * unconditionally, so the caller has to guarantee 0 < B <= N.
+ *
+ * Returns the number of processed samples per second, or 0.0 when the
+ * buffer cannot be allocated.
+ */
+double bench_shift_addfast_inp(int B, int N, const bool ignore_time) {
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    if (!input) {
+        fprintf(stderr, "error: cannot allocate buffer for %d samples\n", N);
+        return 0.0;
+    }
+
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+    shift_addfast_data_t state = shift_addfast_init(-0.0009F);
+    int iter, off;
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    double T = bench_core_shift_addfast_inplace(
+                B, N, ignore_time, input, state,
+                iter, off
+                );
+
+    save(input, B, off, BENCH_FILE_ADD_FAST_INP_C);
+
+    free(input);
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    double nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+/*
+ * Benchmark shift_unroll_cc() (out-of-place).
+ *
+ * Fills an input buffer of N samples with a test signal from the recursive
+ * oscillator, then shifts it block by block (B samples per call) into the
+ * output buffer until 500 ms have elapsed or the next block would not end
+ * strictly before N. The first block is processed unconditionally, so the
+ * caller has to guarantee 0 < B <= N.
+ *
+ * Returns the number of processed samples per second, or 0.0 when the
+ * buffers cannot be allocated.
+ */
+double bench_shift_unroll_oop(int B, int N) {
+    double t0, t1, tstop, T, nI;
+    int iter, off;
+    float phase = 0.0F;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    complexf *output = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+
+    if (!input || !output) {
+        fprintf(stderr, "error: cannot allocate buffers for %d samples\n", N);
+        free(input);    /* free(NULL) is a no-op */
+        free(output);
+        return 0.0;
+    }
+
+    /* initialized after the allocation check, so the early return cannot leak it */
+    shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    iter = 0;
+    off = 0;
+    t0 = uclock_sec(1);
+    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
+    do {
+        // work
+        phase = shift_unroll_cc(input+off, output+off, B, &state, phase);
+
+        off += B;
+        ++iter;
+        t1 = uclock_sec(0);
+    } while ( t1 < tstop && off + B < N );
+
+    save(output, B, off, NULL);
+    free(input);
+    free(output);
+    /* release the unroll state, as the in-place variant of this benchmark does */
+    shift_unroll_deinit(&state);
+    T = ( t1 - t0 );  /* elapsed benchmark time in seconds */
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+/*
+ * Timed inner loop for shift_unroll_inp_c(): shifts `data` in place, one
+ * block of B samples per call, walking forward through the buffer.
+ *
+ * At least one block is always processed. The loop ends as soon as the
+ * next block would not end strictly before N, or - unless ignore_time is
+ * set - once the 500 ms budget is used up.
+ *
+ * state      mixer state passed to every call
+ * iters_out  receives the number of processed blocks
+ * off_out    receives the number of processed samples
+ * returns    difference between the last and the first uclock_sec() reading
+ */
+double bench_core_shift_unroll_inplace(
+        const int B, const int N, const bool ignore_time,
+        complexf *data,
+        shift_unroll_data_t &state,
+        int &iters_out, int &off_out
+        )
+{
+    const double t_begin = uclock_sec(1);
+    const double t_limit = t_begin + 0.5;  /* time budget: 500 ms */
+    double t_now;
+    float phase = 0.0F;
+    int pos = 0, blocks = 0;
+    /* NOTE(review): presumably a scoped PAPI measurement covering the loop - confirm */
+    papi_perf_counter perf_counter(1);
+
+    for (;;) {
+        /* the phase returned by one block is fed into the next one */
+        phase = shift_unroll_inp_c(&data[pos], B, &state, phase);
+        pos += B;
+        blocks++;
+        t_now = uclock_sec(0);
+        if ( pos + B >= N )
+            break;  /* buffer limit reached */
+        if ( !ignore_time && !(t_now < t_limit) )
+            break;  /* time budget used up */
+    }
+
+    iters_out = blocks;
+    off_out = pos;
+    return t_now - t_begin;
+}
+
+/*
+ * Benchmark shift_unroll_inp_c() (in-place).
+ *
+ * Fills a buffer of N samples with a test signal from the recursive
+ * oscillator, runs bench_core_shift_unroll_inplace() over it in blocks of
+ * B samples and hands the processed part to save() with
+ * BENCH_FILE_UNROLL_INP_C. The core loop processes its first block
+ * unconditionally, so the caller has to guarantee 0 < B <= N.
+ *
+ * Returns the number of processed samples per second, or 0.0 when the
+ * buffer cannot be allocated.
+ */
+double bench_shift_unroll_inp(const int B, const int N, const bool ignore_time) {
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    if (!input) {
+        fprintf(stderr, "error: cannot allocate buffer for %d samples\n", N);
+        return 0.0;
+    }
+
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+    /* initialized after the allocation check, so the early return cannot leak it */
+    shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);
+    int iter, off;
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    double T = bench_core_shift_unroll_inplace(
+                B, N, ignore_time, input, state,
+                iter, off
+                );
+
+    save(input, B, off, BENCH_FILE_UNROLL_INP_C);
+
+    free(input);
+    shift_unroll_deinit(&state);
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    double nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+
+/*
+ * Benchmark shift_limited_unroll_cc() (out-of-place).
+ *
+ * Fills an input buffer of N samples with a test signal from the recursive
+ * oscillator, then shifts it block by block (B samples per call) into the
+ * output buffer until 500 ms have elapsed or the next block would not end
+ * strictly before N. The first block is processed unconditionally, so the
+ * caller has to guarantee 0 < B <= N.
+ *
+ * Returns the number of processed samples per second, or 0.0 when the
+ * buffers cannot be allocated.
+ */
+double bench_shift_limited_unroll_oop(int B, int N) {
+    double t0, t1, tstop, T, nI;
+    int iter, off;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    complexf *output = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+
+    if (!input || !output) {
+        fprintf(stderr, "error: cannot allocate buffers for %d samples\n", N);
+        free(input);    /* free(NULL) is a no-op */
+        free(output);
+        return 0.0;
+    }
+
+    shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    iter = 0;
+    off = 0;
+    t0 = uclock_sec(1);
+    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
+    do {
+        // work
+        shift_limited_unroll_cc(input+off, output+off, B, &state);
+
+        off += B;
+        ++iter;
+        t1 = uclock_sec(0);
+    } while ( t1 < tstop && off + B < N );
+
+    save(output, B, off, NULL);
+    free(input);
+    free(output);
+    T = ( t1 - t0 );  /* elapsed benchmark time in seconds */
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+/*
+ * Timed inner loop for shift_limited_unroll_inp_c(): shifts `data` in
+ * place, one block of B samples per call, walking forward through the
+ * buffer.
+ *
+ * At least one block is always processed. The loop ends as soon as the
+ * next block would not end strictly before N, or - unless ignore_time is
+ * set - once the 500 ms budget is used up.
+ *
+ * state      mixer state passed to every call
+ * iters_out  receives the number of processed blocks
+ * off_out    receives the number of processed samples
+ * returns    difference between the last and the first uclock_sec() reading
+ */
+double bench_core_shift_limited_unroll_inplace(
+        const int B, const int N, const bool ignore_time,
+        complexf *data,
+        shift_limited_unroll_data_t &state,
+        int &iters_out, int &off_out
+        )
+{
+    const double t_begin = uclock_sec(1);
+    const double t_limit = t_begin + 0.5;  /* time budget: 500 ms */
+    double t_now;
+    int pos = 0, blocks = 0;
+    /* NOTE(review): presumably a scoped PAPI measurement covering the loop - confirm */
+    papi_perf_counter perf_counter(1);
+
+    for (;;) {
+        shift_limited_unroll_inp_c(&data[pos], B, &state);
+        pos += B;
+        blocks++;
+        t_now = uclock_sec(0);
+        if ( pos + B >= N )
+            break;  /* buffer limit reached */
+        if ( !ignore_time && !(t_now < t_limit) )
+            break;  /* time budget used up */
+    }
+
+    iters_out = blocks;
+    off_out = pos;
+    return t_now - t_begin;
+}
+
+/*
+ * Benchmark shift_limited_unroll_inp_c() (in-place).
+ *
+ * Fills a buffer of N samples with a test signal from the recursive
+ * oscillator, runs bench_core_shift_limited_unroll_inplace() over it in
+ * blocks of B samples and hands the processed part to save() with
+ * BENCH_FILE_LTD_UNROLL_INP_C. The core loop processes its first block
+ * unconditionally, so the caller has to guarantee 0 < B <= N.
+ *
+ * Returns the number of processed samples per second, or 0.0 when the
+ * buffer cannot be allocated.
+ */
+double bench_shift_limited_unroll_inp(const int B, const int N, const bool ignore_time) {
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    if (!input) {
+        fprintf(stderr, "error: cannot allocate buffer for %d samples\n", N);
+        return 0.0;
+    }
+
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+    shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);
+    int iter, off;
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    double T = bench_core_shift_limited_unroll_inplace(
+                B, N, ignore_time, input, state,
+                iter, off
+                );
+
+    save(input, B, off, BENCH_FILE_LTD_UNROLL_INP_C);
+
+    free(input);
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    double nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+/*
+ * Timed inner loop for shift_limited_unroll_A_sse_inp_c(): shifts `data`
+ * in place, one block of B samples per call, walking forward through the
+ * buffer.
+ *
+ * At least one block is always processed. The loop ends as soon as the
+ * next block would not end strictly before N, or - unless ignore_time is
+ * set - once the 500 ms budget is used up.
+ *
+ * state      mixer state passed to every call
+ * iters_out  receives the number of processed blocks
+ * off_out    receives the number of processed samples
+ * returns    difference between the last and the first uclock_sec() reading
+ */
+double bench_core_shift_limited_unroll_A_sse_inplace(
+        const int B, const int N, const bool ignore_time,
+        complexf *data,
+        shift_limited_unroll_A_sse_data_t &state,
+        int &iters_out, int &off_out
+        )
+{
+    const double t_begin = uclock_sec(1);
+    const double t_limit = t_begin + 0.5;  /* time budget: 500 ms */
+    double t_now;
+    int pos = 0, blocks = 0;
+    /* NOTE(review): presumably a scoped PAPI measurement covering the loop - confirm */
+    papi_perf_counter perf_counter(1);
+
+    for (;;) {
+        shift_limited_unroll_A_sse_inp_c(&data[pos], B, &state);
+        pos += B;
+        blocks++;
+        t_now = uclock_sec(0);
+        if ( pos + B >= N )
+            break;  /* buffer limit reached */
+        if ( !ignore_time && !(t_now < t_limit) )
+            break;  /* time budget used up */
+    }
+
+    iters_out = blocks;
+    off_out = pos;
+    return t_now - t_begin;
+}
+
+/*
+ * Benchmark shift_limited_unroll_A_sse_inp_c() (in-place).
+ *
+ * Fills a buffer of N samples with a test signal from the recursive
+ * oscillator, runs bench_core_shift_limited_unroll_A_sse_inplace() over it
+ * in blocks of B samples and hands the processed part to save() with
+ * BENCH_FILE_LTD_UNROLL_A_SSE_INP_C. The mixer state lives on the heap.
+ * The core loop processes its first block unconditionally, so the caller
+ * has to guarantee 0 < B <= N.
+ *
+ * Returns the number of processed samples per second, or 0.0 when an
+ * allocation fails.
+ */
+double bench_shift_limited_unroll_A_sse_inp(const int B, const int N, const bool ignore_time) {
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+    shift_limited_unroll_A_sse_data_t *state = (shift_limited_unroll_A_sse_data_t*)malloc(sizeof(shift_limited_unroll_A_sse_data_t));
+    int iter, off;
+
+    if (!input || !state) {
+        fprintf(stderr, "error: cannot allocate buffer for %d samples or mixer state\n", N);
+        free(input);    /* free(NULL) is a no-op */
+        free(state);
+        return 0.0;
+    }
+
+    *state = shift_limited_unroll_A_sse_init(-0.0009F, 0.0F);
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    double T = bench_core_shift_limited_unroll_A_sse_inplace(
+                B, N, ignore_time, input, *state,
+                iter, off
+                );
+
+    save(input, B, off, BENCH_FILE_LTD_UNROLL_A_SSE_INP_C);
+
+    free(input);
+    free(state);
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    double nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+/*
+ * Timed inner loop for shift_limited_unroll_B_sse_inp_c(): shifts `data`
+ * in place, one block of B samples per call, walking forward through the
+ * buffer.
+ *
+ * At least one block is always processed. The loop ends as soon as the
+ * next block would not end strictly before N, or - unless ignore_time is
+ * set - once the 500 ms budget is used up.
+ *
+ * state      mixer state passed to every call
+ * iters_out  receives the number of processed blocks
+ * off_out    receives the number of processed samples
+ * returns    difference between the last and the first uclock_sec() reading
+ */
+double bench_core_shift_limited_unroll_B_sse_inplace(
+        const int B, const int N, const bool ignore_time,
+        complexf *data,
+        shift_limited_unroll_B_sse_data_t &state,
+        int &iters_out, int &off_out
+        )
+{
+    const double t_begin = uclock_sec(1);
+    const double t_limit = t_begin + 0.5;  /* time budget: 500 ms */
+    double t_now;
+    int pos = 0, blocks = 0;
+    /* NOTE(review): presumably a scoped PAPI measurement covering the loop - confirm */
+    papi_perf_counter perf_counter(1);
+
+    for (;;) {
+        shift_limited_unroll_B_sse_inp_c(&data[pos], B, &state);
+        pos += B;
+        blocks++;
+        t_now = uclock_sec(0);
+        if ( pos + B >= N )
+            break;  /* buffer limit reached */
+        if ( !ignore_time && !(t_now < t_limit) )
+            break;  /* time budget used up */
+    }
+
+    iters_out = blocks;
+    off_out = pos;
+    return t_now - t_begin;
+}
+
+/*
+ * Benchmark shift_limited_unroll_B_sse_inp_c() (in-place).
+ *
+ * Fills a buffer of N samples with a test signal from the recursive
+ * oscillator, runs bench_core_shift_limited_unroll_B_sse_inplace() over it
+ * in blocks of B samples and hands the processed part to save() with
+ * BENCH_FILE_LTD_UNROLL_B_SSE_INP_C. The mixer state lives on the heap.
+ * The core loop processes its first block unconditionally, so the caller
+ * has to guarantee 0 < B <= N.
+ *
+ * Returns the number of processed samples per second, or 0.0 when an
+ * allocation fails.
+ */
+double bench_shift_limited_unroll_B_sse_inp(const int B, const int N, const bool ignore_time) {
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+    shift_limited_unroll_B_sse_data_t *state = (shift_limited_unroll_B_sse_data_t*)malloc(sizeof(shift_limited_unroll_B_sse_data_t));
+    int iter, off;
+
+    if (!input || !state) {
+        fprintf(stderr, "error: cannot allocate buffer for %d samples or mixer state\n", N);
+        free(input);    /* free(NULL) is a no-op */
+        free(state);
+        return 0.0;
+    }
+
+    *state = shift_limited_unroll_B_sse_init(-0.0009F, 0.0F);
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    double T = bench_core_shift_limited_unroll_B_sse_inplace(
+                B, N, ignore_time, input, *state,
+                iter, off
+                );
+
+    save(input, B, off, BENCH_FILE_LTD_UNROLL_B_SSE_INP_C);
+
+    free(input);
+    free(state);
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    double nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+/*
+ * Timed inner loop for shift_limited_unroll_C_sse_inp_c(): shifts `data`
+ * in place, one block of B samples per call, walking forward through the
+ * buffer.
+ *
+ * At least one block is always processed. The loop ends as soon as the
+ * next block would not end strictly before N, or - unless ignore_time is
+ * set - once the 500 ms budget is used up.
+ *
+ * state      mixer state passed to every call
+ * iters_out  receives the number of processed blocks
+ * off_out    receives the number of processed samples
+ * returns    difference between the last and the first uclock_sec() reading
+ */
+double bench_core_shift_limited_unroll_C_sse_inplace(
+        const int B, const int N, const bool ignore_time,
+        complexf *data,
+        shift_limited_unroll_C_sse_data_t &state,
+        int &iters_out, int &off_out
+        )
+{
+    const double t_begin = uclock_sec(1);
+    const double t_limit = t_begin + 0.5;  /* time budget: 500 ms */
+    double t_now;
+    int pos = 0, blocks = 0;
+    /* NOTE(review): presumably a scoped PAPI measurement covering the loop - confirm */
+    papi_perf_counter perf_counter(1);
+
+    for (;;) {
+        shift_limited_unroll_C_sse_inp_c(&data[pos], B, &state);
+        pos += B;
+        blocks++;
+        t_now = uclock_sec(0);
+        if ( pos + B >= N )
+            break;  /* buffer limit reached */
+        if ( !ignore_time && !(t_now < t_limit) )
+            break;  /* time budget used up */
+    }
+
+    iters_out = blocks;
+    off_out = pos;
+    return t_now - t_begin;
+}
+
+/*
+ * Benchmark shift_limited_unroll_C_sse_inp_c() (in-place).
+ *
+ * Fills a buffer of N samples with a test signal from the recursive
+ * oscillator, runs bench_core_shift_limited_unroll_C_sse_inplace() over it
+ * in blocks of B samples and hands the processed part to save() with
+ * BENCH_FILE_LTD_UNROLL_C_SSE_INP_C. The mixer state lives on the heap.
+ * The core loop processes its first block unconditionally, so the caller
+ * has to guarantee 0 < B <= N.
+ *
+ * Returns the number of processed samples per second, or 0.0 when an
+ * allocation fails.
+ */
+double bench_shift_limited_unroll_C_sse_inp(const int B, const int N, const bool ignore_time) {
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+    shift_limited_unroll_C_sse_data_t *state = (shift_limited_unroll_C_sse_data_t*)malloc(sizeof(shift_limited_unroll_C_sse_data_t));
+    int iter, off;
+
+    if (!input || !state) {
+        fprintf(stderr, "error: cannot allocate buffer for %d samples or mixer state\n", N);
+        free(input);    /* free(NULL) is a no-op */
+        free(state);
+        return 0.0;
+    }
+
+    *state = shift_limited_unroll_C_sse_init(-0.0009F, 0.0F);
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    double T = bench_core_shift_limited_unroll_C_sse_inplace(
+                B, N, ignore_time, input, *state,
+                iter, off
+                );
+
+    save(input, B, off, BENCH_FILE_LTD_UNROLL_C_SSE_INP_C);
+
+    free(input);
+    free(state);
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    double nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+/*
+ * Benchmark shift_recursive_osc_cc() (out-of-place).
+ *
+ * Fills an input buffer of N samples with a test signal from the recursive
+ * oscillator, then shifts it block by block (B samples per call) into the
+ * output buffer until 500 ms have elapsed or the next block would not end
+ * strictly before N. The first block is processed unconditionally, so the
+ * caller has to guarantee 0 < B <= N.
+ *
+ * Returns the number of processed samples per second, or 0.0 when the
+ * buffers cannot be allocated.
+ */
+double bench_shift_rec_osc_cc_oop(int B, int N) {
+    double t0, t1, tstop, T, nI;
+    int iter, off;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    complexf *output = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state, shift_state;
+    shift_recursive_osc_conf_t gen_conf, shift_conf;
+
+    if (!input || !output) {
+        fprintf(stderr, "error: cannot allocate buffers for %d samples\n", N);
+        free(input);    /* free(NULL) is a no-op */
+        free(output);
+        return 0.0;
+    }
+
+    shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    iter = 0;
+    off = 0;
+    t0 = uclock_sec(1);
+    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
+    do {
+        // work
+        shift_recursive_osc_cc(input+off, output+off, B, &shift_conf, &shift_state);
+
+        off += B;
+        ++iter;
+        t1 = uclock_sec(0);
+    } while ( t1 < tstop && off + B < N );
+
+    /* the shifted samples are in `output`; `input` still holds the unshifted test signal */
+    save(output, B, off, BENCH_FILE_REC_OSC_CC);
+
+    save(output, B, off, NULL);
+    free(input);
+    free(output);
+    T = ( t1 - t0 );  /* elapsed benchmark time in seconds */
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+/*
+ * Timed inner loop for shift_recursive_osc_inp_c(): shifts `data` in
+ * place, one block of B samples per call, walking forward through the
+ * buffer.
+ *
+ * At least one block is always processed. The loop ends as soon as the
+ * next block would not end strictly before N, or - unless ignore_time is
+ * set - once the 500 ms budget is used up.
+ *
+ * conf/state oscillator configuration and state passed to every call
+ * iters_out  receives the number of processed blocks
+ * off_out    receives the number of processed samples
+ * returns    difference between the last and the first uclock_sec() reading
+ */
+double bench_core_shift_rec_osc_cc_inplace(
+        const int B, const int N, const bool ignore_time,
+        complexf *data,
+        shift_recursive_osc_conf_t &conf, shift_recursive_osc_t &state,
+        int &iters_out, int &off_out
+        )
+{
+    const double t_begin = uclock_sec(1);
+    const double t_limit = t_begin + 0.5;  /* time budget: 500 ms */
+    double t_now;
+    int pos = 0, blocks = 0;
+    /* NOTE(review): presumably a scoped PAPI measurement covering the loop - confirm */
+    papi_perf_counter perf_counter(1);
+
+    for (;;) {
+        shift_recursive_osc_inp_c(&data[pos], B, &conf, &state);
+        pos += B;
+        blocks++;
+        t_now = uclock_sec(0);
+        if ( pos + B >= N )
+            break;  /* buffer limit reached */
+        if ( !ignore_time && !(t_now < t_limit) )
+            break;  /* time budget used up */
+    }
+
+    iters_out = blocks;
+    off_out = pos;
+    return t_now - t_begin;
+}
+
+/*
+ * Benchmark shift_recursive_osc_inp_c() (in-place).
+ *
+ * Fills a buffer of N samples with a test signal from the recursive
+ * oscillator, runs bench_core_shift_rec_osc_cc_inplace() over it in blocks
+ * of B samples and hands the processed part to save() with
+ * BENCH_FILE_REC_OSC_INP_C. The core loop processes its first block
+ * unconditionally, so the caller has to guarantee 0 < B <= N.
+ *
+ * Returns the number of processed samples per second, or 0.0 when the
+ * buffer cannot be allocated.
+ */
+double bench_shift_rec_osc_cc_inp(const int B, const int N, const bool ignore_time) {
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state, shift_state;
+    shift_recursive_osc_conf_t gen_conf, shift_conf;
+    int iter, off;
+
+    if (!input) {
+        fprintf(stderr, "error: cannot allocate buffer for %d samples\n", N);
+        return 0.0;
+    }
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+    shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);
+
+    double T = bench_core_shift_rec_osc_cc_inplace(
+                B, N, ignore_time, input, shift_conf, shift_state,
+                iter, off
+                );
+
+    save(input, B, off, BENCH_FILE_REC_OSC_INP_C);
+    free(input);
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    double nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+/*
+ * Timed inner loop for shift_recursive_osc_sse_inp_c(): shifts `data` in
+ * place, one block of B samples per call, walking forward through the
+ * buffer.
+ *
+ * At least one block is always processed. The loop ends as soon as the
+ * next block would not end strictly before N, or - unless ignore_time is
+ * set - once the 500 ms budget is used up.
+ *
+ * conf/state oscillator configuration and state passed to every call
+ * iters_out  receives the number of processed blocks
+ * off_out    receives the number of processed samples
+ * returns    difference between the last and the first uclock_sec() reading
+ */
+double bench_core_shift_rec_osc_sse_c_inplace(
+        const int B, const int N, const bool ignore_time,
+        complexf *data,
+        shift_recursive_osc_sse_conf_t &conf, shift_recursive_osc_sse_t &state,
+        int &iters_out, int &off_out
+        )
+{
+    const double t_begin = uclock_sec(1);
+    const double t_limit = t_begin + 0.5;  /* time budget: 500 ms */
+    double t_now;
+    int pos = 0, blocks = 0;
+    /* NOTE(review): presumably a scoped PAPI measurement covering the loop - confirm */
+    papi_perf_counter perf_counter(1);
+
+    for (;;) {
+        shift_recursive_osc_sse_inp_c(&data[pos], B, &conf, &state);
+        pos += B;
+        blocks++;
+        t_now = uclock_sec(0);
+        if ( pos + B >= N )
+            break;  /* buffer limit reached */
+        if ( !ignore_time && !(t_now < t_limit) )
+            break;  /* time budget used up */
+    }
+
+    iters_out = blocks;
+    off_out = pos;
+    return t_now - t_begin;
+}
+
+/*
+ * Benchmark shift_recursive_osc_sse_inp_c() (in-place).
+ *
+ * Fills a buffer of N samples with a test signal from the recursive
+ * oscillator, runs bench_core_shift_rec_osc_sse_c_inplace() over it in
+ * blocks of B samples and hands the processed part to save() with
+ * BENCH_FILE_REC_OSC_SSE_INP_C. The SSE oscillator state lives on the
+ * heap. The core loop processes its first block unconditionally, so the
+ * caller has to guarantee 0 < B <= N.
+ *
+ * Returns the number of processed samples per second, or 0.0 when an
+ * allocation fails.
+ */
+double bench_shift_rec_osc_sse_c_inp(const int B, const int N, const bool ignore_time) {
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+
+    shift_recursive_osc_sse_t *shift_state = (shift_recursive_osc_sse_t*)malloc(sizeof(shift_recursive_osc_sse_t));
+    shift_recursive_osc_sse_conf_t shift_conf;
+    int iter, off;
+
+    if (!input || !shift_state) {
+        fprintf(stderr, "error: cannot allocate buffer for %d samples or mixer state\n", N);
+        free(input);    /* free(NULL) is a no-op */
+        free(shift_state);
+        return 0.0;
+    }
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    shift_recursive_osc_sse_init(-0.0009F, 0.0F, &shift_conf, shift_state);
+
+    double T = bench_core_shift_rec_osc_sse_c_inplace(
+                B, N, ignore_time, input, shift_conf, *shift_state,
+                iter, off
+                );
+
+    save(input, B, off, BENCH_FILE_REC_OSC_SSE_INP_C);
+    free(input);
+    free(shift_state);
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    double nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+
+/*
+ * Command line: <program> [<blockLength in samples> [<total # of MSamples>]]
+ *
+ * Runs every benchmark enabled through BENCH_REF_TRIG_FUNC,
+ * BENCH_OUT_OF_PLACE_ALGOS and BENCH_INPLACE_ALGOS and prints the
+ * throughput of each in MSamples/sec.
+ *
+ * Without arguments the usage line is printed and the defaults are used.
+ * A block length or sample count of 0 prints the usage and returns 0.
+ * Values that would make the benchmarks run outside their buffers
+ * (negative numbers, block longer than the total, sample count not
+ * representable as int) are rejected with return value 1.
+ */
+int main(int argc, char **argv)
+{
+    double rt;
+
+    // process up to 64 MSample (512 MByte) in blocks of 8 kSamples (=64 kByte)
+    int B = 8 * 1024;
+    int N = 64 * 1024 * 1024;
+    int showUsage = 0;
+    bool ignore_time = true;
+    /* largest MSample count whose sample count still fits into an int */
+    const int maxMSamples = (int)((~0u >> 1) / (1024u * 1024u));
+
+    if (argc == 1)
+        showUsage = 1;
+
+    if (1 < argc)
+        B = atoi(argv[1]);
+    if (2 < argc)
+    {
+        const int nMSamples = atoi(argv[2]);
+        /* check before multiplying: the product would overflow int otherwise */
+        if ( nMSamples < 0 || nMSamples > maxMSamples )
+        {
+            fprintf(stderr, "error: total # of MSamples must be in range 1 .. %d\n", maxMSamples);
+            return 1;
+        }
+        N = nMSamples * 1024 * 1024;
+    }
+
+    if ( !B || !N || showUsage )
+    {
+        fprintf(stderr, "%s [<blockLength in samples> [<total # of MSamples>] ]\n", argv[0]);
+        if ( !B || !N )
+            return 0;
+    }
+
+    /* every benchmark processes its first block unconditionally: it must fit into the buffer */
+    if ( B < 0 || B > N )
+    {
+        fprintf(stderr, "error: block length must be in range 1 .. %d samples\n", N);
+        return 1;
+    }
+
+    fprintf(stderr, "processing up to N = %d MSamples with block length of %d samples\n",
+        N / (1024 * 1024), B );
+
+
+#if BENCH_REF_TRIG_FUNC
+    printf("\nstarting bench of shift_math_cc (out-of-place) with trig functions ..\n");
+    rt = bench_shift_math_cc(B, N, ignore_time);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+#endif
+
+#if BENCH_OUT_OF_PLACE_ALGOS
+    printf("starting bench of shift_table_cc (out-of-place) ..\n");
+    rt = bench_shift_table_cc(B, N);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+    printf("starting bench of shift_addfast_cc (out-of-place) ..\n");
+    rt = bench_shift_addfast(B, N);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+    printf("\nstarting bench of shift_unroll_cc (out-of-place) ..\n");
+    rt = bench_shift_unroll_oop(B, N);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+    printf("\nstarting bench of shift_limited_unroll_cc (out-of-place) ..\n");
+    rt = bench_shift_limited_unroll_oop(B, N);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+    printf("\nstarting bench of shift_recursive_osc_cc (out-of-place) ..\n");
+    rt = bench_shift_rec_osc_cc_oop(B, N);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+#endif
+
+#if BENCH_INPLACE_ALGOS
+
+    printf("starting bench of shift_addfast_inp_c in-place ..\n");
+    rt = bench_shift_addfast_inp(B, N, ignore_time);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+    printf("starting bench of shift_unroll_inp_c in-place ..\n");
+    rt = bench_shift_unroll_inp(B, N, ignore_time);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+    printf("starting bench of shift_limited_unroll_inp_c in-place ..\n");
+    rt = bench_shift_limited_unroll_inp(B, N, ignore_time);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+    /* the SSE variants are only run when have_sse_shift_mixer_impl() reports them */
+    if ( have_sse_shift_mixer_impl() )
+    {
+        printf("starting bench of shift_limited_unroll_A_sse_inp_c in-place ..\n");
+        rt = bench_shift_limited_unroll_A_sse_inp(B, N, ignore_time);
+        printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+        printf("starting bench of shift_limited_unroll_B_sse_inp_c in-place ..\n");
+        rt = bench_shift_limited_unroll_B_sse_inp(B, N, ignore_time);
+        printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+        printf("starting bench of shift_limited_unroll_C_sse_inp_c in-place ..\n");
+        rt = bench_shift_limited_unroll_C_sse_inp(B, N, ignore_time);
+        printf("  %f MSamples/sec\n\n", rt * 1E-6);
+    }
+
+    printf("starting bench of shift_recursive_osc_cc in-place ..\n");
+    rt = bench_shift_rec_osc_cc_inp(B, N, ignore_time);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+    if ( have_sse_shift_mixer_impl() )
+    {
+        printf("starting bench of shift_recursive_osc_sse_c in-place ..\n");
+        rt = bench_shift_rec_osc_sse_c_inp(B, N, ignore_time);
+        printf("  %f MSamples/sec\n\n", rt * 1E-6);
+    }
+#endif
+
+    return 0;
+}
+
--- a/pffft/bench_pffft.c
+++ b/pffft/bench_pffft.c
--- a/pffft/cmake/FindMIPP.cmake
+++ b/pffft/cmake/FindMIPP.cmake
@@ -0,0 +1,26 @@
+
+# FindMIPP: locate the MIPP headers (mipp.h).
+#
+# Search hints: ${MIPP_ROOT} and $ENV{HOME}/.local, each with the path
+# suffix include/mipp.
+# Result variables:
+#   MIPP_FOUND         - TRUE when mipp.h was found
+#   MIPP_INCLUDE_DIRS  - directory containing mipp.h
+# On success the INTERFACE library target MIPP is created once; it carries
+# the include directory, the definition HAVE_MIPP=1 and the cxx_std_11
+# compile feature.
+
+# an already known include directory was reported before: stay quiet
+if(MIPP_INCLUDE_DIRS)
+  set(MIPP_FIND_QUIETLY TRUE)
+endif()
+
+find_path(MIPP_INCLUDE_DIRS NAMES mipp.h
+    HINTS
+        ${MIPP_ROOT}
+        $ENV{HOME}/.local
+    PATH_SUFFIXES include/mipp
+)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(MIPP DEFAULT_MSG MIPP_INCLUDE_DIRS)
+
+if(MIPP_FOUND)
+    # create the target only once: a repeated find_package(MIPP) must neither
+    # redefine it nor be reported as "not found"
+    if(NOT TARGET MIPP)
+        message(STATUS "MIPP_FOUND -> creating interface library MIPP at ${MIPP_INCLUDE_DIRS}")
+        add_library(MIPP INTERFACE)
+        target_compile_definitions(MIPP INTERFACE HAVE_MIPP=1)
+        target_include_directories(MIPP INTERFACE "${MIPP_INCLUDE_DIRS}")
+        target_compile_features(MIPP INTERFACE cxx_std_11)
+    endif()
+elseif(NOT MIPP_FIND_QUIETLY)
+    message(WARNING "MIPP not found.")
+endif()
+
+mark_as_advanced(MIPP_INCLUDE_DIRS)
--- a/pffft/cmake/FindPAPI.cmake
+++ b/pffft/cmake/FindPAPI.cmake
@@ -0,0 +1,25 @@
+# Find PAPI libraries
+# Once done this will define
+#  PAPI_FOUND - System has PAPI
+#  PAPI_INCLUDE_DIRS - The PAPI include directories
+#  PAPI_LIBRARIES - The libraries needed to use PAPI
+
+# Additionally defines the imported target PAPI::PAPI (once), which carries
+# the include directory and links the found library plus `rt`.
+# Search hint for header and library: ${PAPI_ROOT}
+
+# both results already known: they were reported before, stay quiet
+if(PAPI_INCLUDE_DIRS AND PAPI_LIBRARIES)
+  set(PAPI_FIND_QUIETLY TRUE)
+endif()
+
+find_path(PAPI_INCLUDE_DIRS NAMES papi.h HINTS ${PAPI_ROOT} PATH_SUFFIXES include)
+find_library(PAPI_LIBRARIES NAMES papi HINTS ${PAPI_ROOT} PATH_SUFFIXES lib lib64)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(PAPI DEFAULT_MSG PAPI_LIBRARIES PAPI_INCLUDE_DIRS)
+if(PAPI_FOUND AND NOT TARGET PAPI::PAPI)
+    # IMPORTED_LOCATION has to be the single library file: remember the path
+    # found by find_library() before `rt` is appended to the legacy list
+    set(PAPI_LIBRARY_LOCATION "${PAPI_LIBRARIES}")
+    set(PAPI_LIBRARIES ${PAPI_LIBRARIES} rt)
+
+    # UNKNOWN: find_library() may have located a shared or a static library
+    add_library(PAPI::PAPI UNKNOWN IMPORTED)
+    set_target_properties(PAPI::PAPI PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES "${PAPI_INCLUDE_DIRS}"
+        IMPORTED_LOCATION "${PAPI_LIBRARY_LOCATION}"
+        INTERFACE_LINK_LIBRARIES rt)
+    unset(PAPI_LIBRARY_LOCATION)
+endif()
+
+mark_as_advanced(PAPI_INCLUDE_DIRS PAPI_LIBRARIES)
--- a/pffft/cmake/compiler_warnings.cmake
+++ b/pffft/cmake/compiler_warnings.cmake
@@ -0,0 +1,11 @@
+
+# target_activate_cxx_compiler_warnings(<target>)
+#
+# Adds -Wall -Wextra -pedantic as PRIVATE compile options of <target> when
+# the C++ compiler is GNU, Clang or AppleClang. Other compilers get no
+# additional options.
+function(target_activate_cxx_compiler_warnings target)
+    # one quoted generator expression: the ;-separated options stay inside it
+    target_compile_options(${target} PRIVATE
+        "$<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>>:-Wall;-Wextra;-pedantic>")
+endfunction()
+
+# target_activate_c_compiler_warnings(<target>)
+#
+# Adds -Wall -Wextra -pedantic as PRIVATE compile options of <target> when
+# the C compiler is GNU, Clang or AppleClang. Other compilers get no
+# additional options.
+function(target_activate_c_compiler_warnings target)
+    # one quoted generator expression: the ;-separated options stay inside it
+    target_compile_options(${target} PRIVATE
+        "$<$<OR:$<C_COMPILER_ID:GNU>,$<C_COMPILER_ID:Clang>,$<C_COMPILER_ID:AppleClang>>:-Wall;-Wextra;-pedantic>")
+endfunction()
+
--- a/pffft/cmake/target_optimizations.cmake
+++ b/pffft/cmake/target_optimizations.cmake
@@ -0,0 +1,197 @@
+
+# cmake options: TARGET_C_ARCH / TARGET_CXX_ARCH:
+#   and optionally:  TARGET_C_EXTRA TARGET_CXX_EXTRA
+#
+# provided:
+#   - function: target_set_c_arch_flags(<target>)    # uses options TARGET_C_ARCH and TARGET_C_EXTRA
+#   - function: target_set_cxx_arch_flags(<target>)  # uses options TARGET_CXX_ARCH and TARGET_CXX_EXTRA
+#   - macro:    target_set_cxx_arch_option(<target> <gcc/clang_march> <gcc/clang_extra> <msvc_arch>)
+#
+# see https://en.wikichip.org/wiki/x86/extensions
+# and https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html
+#   for gcc specific architecture options
+# and https://docs.microsoft.com/en-us/cpp/build/reference/arch-x64
+# or  https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86
+#   for msvc specific architecture options
+
+# https://en.wikichip.org/wiki/arm/versions
+# https://en.wikipedia.org/wiki/Raspberry_Pi
+# https://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html#ARM-Options
+# https://en.wikipedia.org/wiki/Comparison_of_ARMv7-A_cores
+# https://en.wikipedia.org/wiki/Comparison_of_ARMv8-A_cores
+
+# arm32_rpi1 untested
+#   -mcpu=arm1176jzf-s -mfloat-abi=hard -mfpu=vfp         -mtune=arm1176jzf-s
+# arm32_rpi2 untested
+#   "-march=armv7-a"   "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
+#   "-march=armv8-a"   "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
+# arm32_rpi3 with "armv7-a" tested on Raspbian GNU/Linux 10 (buster), 32-bit  => MIPP test reports: NEONv1, 128 bits
+#   "-march=armv7-a"   "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
+# arm32_rpi3 with "armv8-a" tested on Raspbian GNU/Linux 10 (buster), 32-bit  => MIPP test reports: NEONv1, 128 bits
+#   "-march=armv8-a"   "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
+# arm32_rpi3 with "armv8-a" tested on Raspbian GNU/Linux 10 (buster), 32-bit  => MIPP test reports: NEONv1, 128 bits
+#   "-march=armv8-a"   "-mfloat-abi=hard" "-mfpu=neon-vfpv4" "-mtune=cortex-a53"
+# arm32_rpi4 untested
+#   RPi 4 Model B:    Cortex-A72  =>  "-mtune=cortex-a72"  ?
+#   "-mcpu=cortex-a72 -mfloat-abi=hard -mfpu=neon-fp-armv8 -mneon-for-64bits  -mtune=cortex-a72"
+
+# Named sets of additional compiler options. The name selected through
+# TARGET_C_EXTRA is resolved as GCC_EXTRA_OPT_<name>; "none" stands for no
+# additional option. Each value is a ;-separated list of options.
+set(MSVC_EXTRA_OPT_none "")
+set(GCC_EXTRA_OPT_none "")
+set(GCC_EXTRA_OPT_neon_vfpv4    "-mfloat-abi=hard;-mfpu=neon-vfpv4")
+set(GCC_EXTRA_OPT_neon_rpi3_a53 "-mfloat-abi=hard;-mfpu=neon-vfpv4;-mtune=cortex-a53")
+set(GCC_EXTRA_OPT_neon_rpi4_a72 "-mfloat-abi=hard;-mfpu=neon-fp-armv8;-mtune=cortex-a72")
+
+# Select the gcc/clang -march choices (GCC_MARCH_VALUES), their description
+# (GCC_MARCH_DESC) and the available extra option sets (GCC_EXTRA_VALUES)
+# for the target processor. An empty GCC_EXTRA_VALUES means that no extra
+# option is offered for this processor.
+# "AMD64" is what Windows reports for 64-bit x86.
+if ( (CMAKE_SYSTEM_PROCESSOR STREQUAL "i686") OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64") )
+    set(GCC_MARCH_DESC "native/SSE2:pentium4/SSE3:core2/SSE4:nehalem/AVX:sandybridge/AVX2:haswell")
+    set(GCC_MARCH_VALUES "none;native;pentium4;core2;nehalem;sandybridge;haswell" CACHE INTERNAL "List of possible architectures")
+    set(GCC_EXTRA_VALUES "" CACHE INTERNAL "List of possible EXTRA options")
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+    set(GCC_MARCH_DESC "native/ARMwNEON:armv8-a")
+    set(GCC_MARCH_VALUES "none;native;armv8-a" CACHE INTERNAL "List of possible architectures")
+    set(GCC_EXTRA_VALUES "" CACHE INTERNAL "List of possible additional options")
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "armv7l")
+    set(GCC_MARCH_DESC "native/ARMwNEON:armv7-a")
+    set(GCC_MARCH_VALUES "none;native;armv7-a" CACHE INTERNAL "List of possible architectures")
+    set(GCC_EXTRA_VALUES "none;neon_vfpv4;neon_rpi3_a53;neon_rpi4_a72" CACHE INTERNAL "List of possible additional options")
+else()
+    message(WARNING "unsupported CMAKE_SYSTEM_PROCESSOR '${CMAKE_SYSTEM_PROCESSOR}'")
+    # other PROCESSORs could be "ppc", "ppc64",  "arm" - or something else?!
+    set(GCC_MARCH_DESC "native")
+    set(GCC_MARCH_VALUES "none;native" CACHE INTERNAL "List of possible architectures")
+    set(GCC_EXTRA_VALUES "" CACHE INTERNAL "List of possible additional options")
+endif()
+
+# cmake cache options - depending on the C/C++ compiler.
+# C and C++ compiler are configured separately, in case they come from
+# different vendors.
+#
+# TARGET_C_ARCH:  architecture for C sources (-march value or /arch value)
+# TARGET_C_EXTRA: name of an additional option set; for gcc/clang it is only
+#                 offered when GCC_EXTRA_VALUES is not empty
+if ( (CMAKE_C_COMPILER_ID STREQUAL "GNU") OR (CMAKE_C_COMPILER_ID STREQUAL "Clang") )
+    # gcc and clang share the -march values; only the vendor name in the help text differs
+    if (CMAKE_C_COMPILER_ID STREQUAL "GNU")
+        set(c_vendor_label "gcc")
+    else()
+        set(c_vendor_label "clang")
+    endif()
+    set(TARGET_C_ARCH "none" CACHE STRING "${c_vendor_label} target C architecture (-march): ${GCC_MARCH_DESC}")
+    set_property(CACHE TARGET_C_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES})
+    if ( NOT ("${GCC_EXTRA_VALUES}" STREQUAL "") )
+        set(TARGET_C_EXTRA "none" CACHE STRING "${c_vendor_label} additional options for C")
+        set_property(CACHE TARGET_C_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES})
+    endif()
+    unset(c_vendor_label)
+elseif (CMAKE_C_COMPILER_ID MATCHES "MSVC")
+    set(TARGET_C_ARCH "none" CACHE STRING "msvc target C architecture (/arch): SSE2/AVX/AVX2/AVX512")
+    set(TARGET_C_EXTRA "none" CACHE STRING "msvc additional options")
+else()
+    message(WARNING "unsupported C compiler '${CMAKE_C_COMPILER_ID}', see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
+endif()
+
+# TARGET_CXX_ARCH:  architecture for C++ sources (-march value or /arch value)
+# TARGET_CXX_EXTRA: name of an additional option set; for gcc/clang it is only
+#                   offered when GCC_EXTRA_VALUES is not empty
+if ( (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") )
+    # gcc and clang share the -march values; only the vendor name in the help text differs
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+        set(cxx_vendor_label "gcc")
+    else()
+        set(cxx_vendor_label "clang")
+    endif()
+    set(TARGET_CXX_ARCH "none" CACHE STRING "${cxx_vendor_label} target C++ architecture (-march): ${GCC_MARCH_DESC}")
+    set_property(CACHE TARGET_CXX_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES})
+    if ( NOT (GCC_EXTRA_VALUES STREQUAL "") )
+        set(TARGET_CXX_EXTRA "none" CACHE STRING "${cxx_vendor_label} additional options for C++")
+        set_property(CACHE TARGET_CXX_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES})
+    endif()
+    unset(cxx_vendor_label)
+elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+    set(TARGET_CXX_ARCH "none" CACHE STRING "msvc target C++ architecture (/arch): SSE2/AVX/AVX2/AVX512")
+    set(TARGET_CXX_EXTRA "none" CACHE STRING "msvc additional options")
+else()
+    message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}', see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
+endif()
+
+######################################################
+
+# target_set_c_arch_flags(<target>)
+#
+# Applies the cache options TARGET_C_ARCH and TARGET_C_EXTRA to <target> as
+# PRIVATE compile options:
+#   - TARGET_C_ARCH becomes -march=<value> (gcc/clang) or /arch:<value> (msvc)
+#   - TARGET_C_EXTRA selects the option list GCC_EXTRA_OPT_<value> (gcc/clang only)
+# An empty value or "none" leaves the respective option unset.
+# PFFFT_ENABLE_NEON=1 is defined for <target> when a neon_* extra option is
+# selected or when the target processor is aarch64.
+function(target_set_c_arch_flags target)
+    if ( ("${TARGET_C_ARCH}" STREQUAL "") OR ("${TARGET_C_ARCH}" STREQUAL "none") )
+        message(STATUS "C ARCH for target ${target} is not set!")
+    else()
+        if ( (CMAKE_C_COMPILER_ID STREQUAL "GNU") OR (CMAKE_C_COMPILER_ID STREQUAL "Clang") )
+            target_compile_options(${target} PRIVATE "-march=${TARGET_C_ARCH}")
+            message(STATUS "C ARCH for target ${target} set: ${TARGET_C_ARCH}")
+        elseif (CMAKE_C_COMPILER_ID MATCHES "MSVC")
+            target_compile_options(${target} PRIVATE "/arch:${TARGET_C_ARCH}")
+            message(STATUS "C ARCH for target ${target} set: ${TARGET_C_ARCH}")
+        else()
+            message(WARNING "unsupported C compiler '${CMAKE_C_COMPILER_ID}' for target_set_c_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
+        endif()
+    endif()
+    if ( ("${TARGET_C_EXTRA}" STREQUAL "") OR ("${TARGET_C_EXTRA}" STREQUAL "none") )
+        message(STATUS "C additional options for target ${target} is not set!")
+    else()
+        if ( (CMAKE_C_COMPILER_ID STREQUAL "GNU") OR (CMAKE_C_COMPILER_ID STREQUAL "Clang") )
+            target_compile_options(${target} PRIVATE "${GCC_EXTRA_OPT_${TARGET_C_EXTRA}}")
+            message(STATUS "C additional options for target ${target} set: ${GCC_EXTRA_OPT_${TARGET_C_EXTRA}}")
+        elseif (CMAKE_C_COMPILER_ID MATCHES "MSVC")
+            message(STATUS "C additional options for target ${target} not usable with MSVC")
+        else()
+            message(WARNING "unsupported C compiler '${CMAKE_C_COMPILER_ID}' for target_set_c_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
+        endif()
+    endif()
+    # Checked independently of the extra option: no extra option is offered on
+    # aarch64, so nested inside the branch above the aarch64 case could never match.
+    if ( ("${TARGET_C_EXTRA}" MATCHES "^neon_.*") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") )
+        message(STATUS "neon extra option or aarch64: setting PFFFT_ENABLE_NEON for C target ${target}")
+        target_compile_definitions(${target} PRIVATE PFFFT_ENABLE_NEON=1)
+    endif()
+endfunction()
+
+function(target_set_cxx_arch_flags target)
+    # Applies the cached TARGET_CXX_ARCH and TARGET_CXX_EXTRA selections to ${target} as PRIVATE settings.
+    # Every compiler test uses CMAKE_CXX_COMPILER_ID: the C and C++ compilers of a build may differ.
+    if ( ("${TARGET_CXX_ARCH}" STREQUAL "") OR ("${TARGET_CXX_ARCH}" STREQUAL "none") )
+        message(STATUS "C++ ARCH for target ${target} is not set!")
+    elseif ( (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") )
+        target_compile_options(${target} PRIVATE "-march=${TARGET_CXX_ARCH}")
+        message(STATUS "C++ ARCH for target ${target} set: ${TARGET_CXX_ARCH}")
+    elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+        target_compile_options(${target} PRIVATE "/arch:${TARGET_CXX_ARCH}")
+        message(STATUS "C++ ARCH for target ${target} set: ${TARGET_CXX_ARCH}")
+    else()
+        message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}' for target_set_cxx_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
+    endif()
+    if ( ("${TARGET_CXX_EXTRA}" STREQUAL "") OR ("${TARGET_CXX_EXTRA}" STREQUAL "none") )
+        message(STATUS "C++ additional options for target ${target} is not set!")
+    else()
+        if ( (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") )
+            target_compile_options(${target} PRIVATE "${GCC_EXTRA_OPT_${TARGET_CXX_EXTRA}}")
+            message(STATUS "C++ additional options for target ${target} set: ${GCC_EXTRA_OPT_${TARGET_CXX_EXTRA}}")
+        elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+            # the named extra-option sets have no MSVC counterpart: report only, add nothing
+            message(STATUS "C++ additional options for target ${target} not usable with MSVC")
+        else()
+            message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}' for target_set_cxx_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
+        endif()
+        if ( ("${TARGET_CXX_EXTRA}" MATCHES "^neon_.*") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") )
+            message(STATUS "additional option contains 'neon': setting PFFFT_ENABLE_NEON for C++ target ${target}")
+            target_compile_definitions(${target} PRIVATE PFFFT_ENABLE_NEON=1)
+        endif()
+    endif()
+endfunction()
+
+
+macro(target_set_cxx_arch_option target gcc_clang_arch gcc_clang_extra msvc_arch )
+    if ( (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") )
+        # GCC/Clang: optional -march value, then an optional named extra-option set; empty or "none" skips each
+        if ( (NOT ("${gcc_clang_arch}" STREQUAL "")) AND (NOT ("${gcc_clang_arch}" STREQUAL "none")) )
+            target_compile_options(${target} PRIVATE "-march=${gcc_clang_arch}")
+            message(STATUS "C++ ARCH for target ${target}: ${gcc_clang_arch}")
+        endif()
+        if ( (NOT ("${gcc_clang_extra}" STREQUAL "")) AND (NOT ("${gcc_clang_extra}" STREQUAL "none")) )
+            target_compile_options(${target} PRIVATE "${GCC_EXTRA_OPT_${gcc_clang_extra}}")
+            message(STATUS "C++ additional options for target ${target}: ${GCC_EXTRA_OPT_${gcc_clang_extra}}")
+        endif()
+    elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+        if ( (NOT ("${msvc_arch}" STREQUAL "")) AND (NOT ("${msvc_arch}" STREQUAL "none")) )
+            target_compile_options(${target} PRIVATE "/arch:${msvc_arch}")
+            message(STATUS "C++ ARCH for target ${target} set: ${msvc_arch}")
+        endif()
+    else()
+        message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}' for target_set_cxx_arch_option(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
+    endif()
+endmacro()
+
--- a/pffft/cross_build_mingw32.sh
+++ b/pffft/cross_build_mingw32.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Configure and build pffft with the MinGW toolchain file named below; <zip-post> is the directory suffix.
+# requires debian/ubuntu packages: zip gcc-mingw-w64
+
+if [ -z "$1" ]; then
+  echo "usage: $0 <zip-post> <any other cmake options>"
+  exit 1
+fi
+
+ZIP_POST="$1"
+shift
+
+# CROSS="i686-w64-mingw32"
+WN="w32"
+TOOLCHAIN="mingw-w32-i686.cmake"
+
+rm -rf "build_${WN}_${ZIP_POST}"
+echo -e "\n\n********************************************************"
+echo "start build of pffft_${WN}_${ZIP_POST}"
+mkdir "build_${WN}_${ZIP_POST}" && \
+cmake -S . -B "build_${WN}_${ZIP_POST}" \
+  -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN}" \
+  -DCMAKE_INSTALL_PREFIX="pffft_bin-${WN}_${ZIP_POST}" \
+  "$@" && \
+cmake --build "build_${WN}_${ZIP_POST}"
--- a/pffft/cross_build_mingw64.sh
+++ b/pffft/cross_build_mingw64.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Configure and build pffft with the MinGW toolchain file named below; <zip-post> is the directory suffix.
+# requires debian/ubuntu packages: zip gcc-mingw-w64
+
+if [ -z "$1" ]; then
+  echo "usage: $0 <zip-post> <any other cmake options>"
+  exit 1
+fi
+
+ZIP_POST="$1"
+shift
+
+# CROSS="x86_64-w64-mingw32"
+WN="w64"
+TOOLCHAIN="mingw-w64-x64_64.cmake"
+
+rm -rf "build_${WN}_${ZIP_POST}"
+echo -e "\n\n********************************************************"
+echo "start build of pffft_${WN}_${ZIP_POST}"
+mkdir "build_${WN}_${ZIP_POST}" && \
+cmake -S . -B "build_${WN}_${ZIP_POST}" \
+  -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN}" \
+  -DCMAKE_INSTALL_PREFIX="pffft_bin-${WN}_${ZIP_POST}" \
+  "$@" && \
+cmake --build "build_${WN}_${ZIP_POST}"
--- a/pffft/examples/CMakeLists.txt
+++ b/pffft/examples/CMakeLists.txt
@@ -0,0 +1,63 @@
+cmake_minimum_required(VERSION 3.1)
+project(examples)
+
+# Math library to link: none for MSVC, otherwise "m" unless PFFFT_DISABLE_LINK_WITH_M is set.
+if ( CMAKE_C_COMPILER_ID MATCHES "MSVC" )
+  # using Visual Studio C++
+  message(STATUS "INFO: detected MSVC: will not link math lib m")
+  set(MATHLIB "")
+  add_definitions("/D_CRT_SECURE_NO_WARNINGS")
+  set(MSVC_DISABLED_WARNINGS_LIST "C4996")
+elseif ( NOT PFFFT_DISABLE_LINK_WITH_M )
+  message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m")
+  set(MATHLIB "m")
+endif()
+
+# MinGW builds name the C++ runtime explicitly on the link line of the C++ examples.
+if (MINGW)
+  set(STDCXXLIB "stdc++")
+else()
+  set(STDCXXLIB "")
+endif()
+
+
+set(CMAKE_CXX_EXTENSIONS OFF)  # no compiler-specific language extensions for the C++ examples
+
+
+if (PFFFT_USE_TYPE_DOUBLE)
+  # C++11 example: forward FFT of real double input
+  add_executable(example_cpp11_real_dbl_fwd example_cpp11_real_dbl_fwd.cpp)
+  target_compile_definitions(example_cpp11_real_dbl_fwd PRIVATE PFFFT_ENABLE_DOUBLE)
+  target_link_libraries(example_cpp11_real_dbl_fwd PFFFT ${STDCXXLIB} ${MATHLIB})
+  set_target_properties(example_cpp11_real_dbl_fwd PROPERTIES CXX_STANDARD 11 CXX_STANDARD_REQUIRED ON)
+
+  # C++11 example: forward FFT of complex double input
+  add_executable(example_cpp11_cplx_dbl_fwd example_cpp11_cplx_dbl_fwd.cpp)
+  target_compile_definitions(example_cpp11_cplx_dbl_fwd PRIVATE PFFFT_ENABLE_DOUBLE)
+  target_link_libraries(example_cpp11_cplx_dbl_fwd PFFFT ${STDCXXLIB} ${MATHLIB})
+  set_target_properties(example_cpp11_cplx_dbl_fwd PROPERTIES CXX_STANDARD 11 CXX_STANDARD_REQUIRED ON)
+
+  add_executable(example_c_cplx_dbl_fwd example_c_cplx_dbl_fwd.c)
+  target_compile_definitions(example_c_cplx_dbl_fwd PRIVATE PFFFT_ENABLE_DOUBLE)  # double-precision C example: same define as the C++ ones above
+  target_link_libraries(example_c_cplx_dbl_fwd PFFFT ${MATHLIB})
+endif()
+
+
+if (PFFFT_USE_TYPE_FLOAT)
+  # C++98 example: forward FFT of real float input
+  add_executable(example_cpp98_real_flt_fwd example_cpp98_real_flt_fwd.cpp)
+  target_compile_definitions(example_cpp98_real_flt_fwd PRIVATE PFFFT_ENABLE_FLOAT)
+  target_link_libraries(example_cpp98_real_flt_fwd PFFFT ${STDCXXLIB} ${MATHLIB})
+  set_target_properties(example_cpp98_real_flt_fwd PROPERTIES CXX_STANDARD 98 CXX_STANDARD_REQUIRED ON)
+
+  # C++98 example: forward FFT of complex float input
+  add_executable(example_cpp98_cplx_flt_fwd example_cpp98_cplx_flt_fwd.cpp)
+  target_compile_definitions(example_cpp98_cplx_flt_fwd PRIVATE PFFFT_ENABLE_FLOAT)
+  target_link_libraries(example_cpp98_cplx_flt_fwd PFFFT ${STDCXXLIB} ${MATHLIB})
+  set_target_properties(example_cpp98_cplx_flt_fwd PROPERTIES CXX_STANDARD 98 CXX_STANDARD_REQUIRED ON)
+
+  add_executable(example_c_real_flt_fwd example_c_real_flt_fwd.c)  # plain C example: no C++ runtime on the link line
+  target_compile_definitions(example_c_real_flt_fwd PRIVATE PFFFT_ENABLE_FLOAT)
+  target_link_libraries(example_c_real_flt_fwd PFFFT ${MATHLIB})
+endif()
+
--- a/pffft/examples/example_c_cplx_dbl_fwd.c
+++ b/pffft/examples/example_c_cplx_dbl_fwd.c
@@ -0,0 +1,69 @@
+
+#include "pffft_double.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+
+void c_forward_complex_double(const int transformLen)
+{
+  /* Example: forward FFT of transformLen complex double samples; prints every spectrum bin to stdout. */
+  printf("running %s()\n", __FUNCTION__);
+  /* first check - might be skipped */
+  if (transformLen < pffftd_min_fft_size(PFFFT_COMPLEX))
+  {
+    fprintf(stderr, "Error: minimum FFT transformation length is %d\n", pffftd_min_fft_size(PFFFT_COMPLEX));
+    return;
+  }
+
+  /* instantiate FFT and prepare transformation for length N */
+  PFFFTD_Setup *ffts = pffftd_new_setup(transformLen, PFFFT_COMPLEX);
+
+  /* one more check */
+  if (!ffts)
+  {
+    fprintf(stderr,
+            "Error: transformation length %d is not decomposable into small prime factors. "
+            "Next valid transform size is: %d ; next power of 2 is: %d\n",
+            transformLen,
+            pffftd_nearest_transform_size(transformLen, PFFFT_COMPLEX, 1),
+            pffftd_next_power_of_two(transformLen) );
+    return;
+  }
+
+  /* allocate aligned vectors for input X and output Y */
+  double *X = (double*)pffftd_aligned_malloc(transformLen * 2 * sizeof(double));  /* complex: re/im interleaved */
+  double *Y = (double*)pffftd_aligned_malloc(transformLen * 2 * sizeof(double));  /* complex: re/im interleaved */
+  double *W = (double*)pffftd_aligned_malloc(transformLen * 2 * sizeof(double));
+
+  /* prepare some input data: two complex samples (four doubles) per pass */
+  for (int k = 0; k < 2 * transformLen; k += 4)
+  {
+    X[k] = k / 2;  /* real */
+    X[k+1] = (k / 2) & 1;  /* imag */
+    if (k + 2 >= 2 * transformLen) break;  /* odd length: the last pass holds one sample only, stay inside X */
+    X[k+2] = -1 - k / 2;  /* real */
+    X[k+3] = (k / 2) & 1;  /* imag */
+  }
+
+  /* do the forward transform; write complex spectrum result into Y */
+  pffftd_transform_ordered(ffts, X, Y, W, PFFFT_FORWARD);
+
+  /* print spectral output */
+  printf("output should be complex spectrum with %d complex bins\n", transformLen);
+  for (int k = 0; k < 2 * transformLen; k += 2)
+    printf("Y[%d] = %f + i * %f\n", k/2, Y[k], Y[k+1]);
+
+  pffftd_aligned_free(W);
+  pffftd_aligned_free(Y);
+  pffftd_aligned_free(X);
+  pffftd_destroy_setup(ffts);
+}
+
+
+int main(int argc, char *argv[])
+{
+  const int len = (argc > 1) ? atoi(argv[1]) : 16;  /* optional first argument: transform length */
+  c_forward_complex_double(len);
+  return 0;
+}
--- a/pffft/examples/example_c_real_flt_fwd.c
+++ b/pffft/examples/example_c_real_flt_fwd.c
@@ -0,0 +1,66 @@
+
+#include "pffft.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+
+void c_forward_real_float(const int transformLen)
+{
+  printf("running %s()\n", __FUNCTION__);
+
+  /* optional early rejection of lengths below the library minimum */
+  if (transformLen < pffft_min_fft_size(PFFFT_REAL))
+  {
+    fprintf(stderr, "Error: minimum FFT transformation length is %d\n", pffft_min_fft_size(PFFFT_REAL));
+    return;
+  }
+
+  /* create the transform setup for this length */
+  PFFFT_Setup *setup = pffft_new_setup(transformLen, PFFFT_REAL);
+
+  /* a null setup means the length was rejected: report alternatives and stop */
+  if (setup == NULL)
+  {
+    fprintf(stderr,
+            "Error: transformation length %d is not decomposable into small prime factors. "
+            "Next valid transform size is: %d ; next power of 2 is: %d\n",
+            transformLen,
+            pffft_nearest_transform_size(transformLen, PFFFT_REAL, 1),
+            pffft_next_power_of_two(transformLen) );
+    return;
+  }
+
+  /* aligned buffers: real input, interleaved re/im output, and a work area */
+  float *input    = (float*)pffft_aligned_malloc(transformLen * sizeof(float));
+  float *spectrum = (float*)pffft_aligned_malloc(transformLen * sizeof(float));
+  float *work     = (float*)pffft_aligned_malloc(transformLen * sizeof(float));
+
+  /* fill the input: even slots count up, odd slots count down */
+  for (int k = 0; k < transformLen; k += 2)
+  {
+    input[k] = (float)k;
+    input[k + 1] = (float)(-(k + 1));
+  }
+
+  /* forward transform, result written to spectrum in canonical order */
+  pffft_transform_ordered(setup, input, spectrum, work, PFFFT_FORWARD);
+
+  /* one output line per complex bin */
+  printf("output should be complex spectrum with %d complex bins\n", transformLen /2);
+  for (int bin = 0; 2 * bin < transformLen; ++bin)
+    printf("Y[%d] = %f + i * %f\n", bin, spectrum[2 * bin], spectrum[2 * bin + 1]);
+
+  pffft_aligned_free(work);
+  pffft_aligned_free(spectrum);
+  pffft_aligned_free(input);
+  pffft_destroy_setup(setup);
+}
+
+
+int main(int argc, char *argv[])
+{
+  const int len = (argc > 1) ? atoi(argv[1]) : 32;  /* optional first argument: transform length */
+  c_forward_real_float(len);
+  return 0;
+}
--- a/pffft/examples/example_cpp11_cplx_dbl_fwd.cpp
+++ b/pffft/examples/example_cpp11_cplx_dbl_fwd.cpp
@@ -0,0 +1,66 @@
+
+#include "pffft.hpp"
+
+#include <complex>
+#include <iostream>
+
+
+void cxx11_forward_complex_double(const int transformLen)
+{
+  // Example: forward FFT of transformLen complex<double> samples via pffft::Fft; prints every output bin.
+  std::cout << "running " << __FUNCTION__ << "()" << std::endl;
+  // first check - might be skipped
+  using FFT_T = pffft::Fft< std::complex<double> >;
+  if (transformLen < FFT_T::minFFtsize())
+  {
+    std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
+    return;
+  }
+
+  // instantiate FFT and prepare transformation for length N
+  pffft::Fft< std::complex<double> > fft(transformLen);
+
+  // one more check
+  if (!fft.isValid())
+  {
+    std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
+              << "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
+              << "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
+    return;
+  }
+
+  // allocate aligned vectors for input X and output Y
+  auto X = fft.valueVector();
+  auto Y = fft.spectrumVector();
+
+  // alternative access: get raw pointers to aligned vectors
+  std::complex<double> *Xs = X.data();
+  std::complex<double> *Ys = Y.data();
+
+  // prepare some input data, two samples per pass (an odd length has no second sample in its last pass)
+  for (int k = 0; k < transformLen; k += 2)
+  {
+    X[k] = std::complex<double>(k, k&1);        // access through the aligned vector
+    if (k+1 < transformLen) Xs[k+1] = std::complex<double>(-1-k, k&1);  // access through raw pointer
+  }
+
+  // do the forward transform; write complex spectrum result into Y
+  fft.forward(X, Y);
+
+  // print spectral output, two bins per pass; the second print is skipped when Y has an odd number of bins
+  std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
+  std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
+  for (unsigned k = 0; k < Y.size(); k += 2)
+  {
+    std::cout << "Y[" << k << "] = " << Y[k] << std::endl;
+    if (k+1 < Y.size()) std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl;
+  }
+}
+
+
+int main(int argc, char *argv[])
+{
+  const int len = (argc > 1) ? atoi(argv[1]) : 16;  // optional first argument: transform length
+  cxx11_forward_complex_double(len);
+  return 0;
+}
--- a/pffft/examples/example_cpp11_real_dbl_fwd.cpp
+++ b/pffft/examples/example_cpp11_real_dbl_fwd.cpp
@@ -0,0 +1,66 @@
+
+#include "pffft.hpp"
+
+#include <complex>
+#include <iostream>
+
+
+void cxx11_forward_real_double(const int transformLen)
+{
+  // Example: forward FFT of transformLen real double samples via pffft::Fft; prints every output bin.
+  std::cout << "running " << __FUNCTION__ << "()" << std::endl;
+  // first check - might be skipped
+  using FFT_T = pffft::Fft<double>;
+  if (transformLen < FFT_T::minFFtsize())
+  {
+    std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
+    return;
+  }
+
+  // instantiate FFT and prepare transformation for length N
+  pffft::Fft<double> fft { transformLen };
+
+  // one more check
+  if (!fft.isValid())
+  {
+    std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
+              << "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
+              << "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
+    return;
+  }
+
+  // allocate aligned vectors for (real) input X and (complex) output Y
+  auto X = fft.valueVector();     // input vector;  type is AlignedVector<double>
+  auto Y = fft.spectrumVector();  // output vector; type is AlignedVector< std::complex<double> >
+
+  // alternative access: get raw pointers to aligned vectors
+  double *Xs = X.data();
+  std::complex<double> *Ys = Y.data();
+
+  // prepare some input data
+  for (int k = 0; k < transformLen; k += 2)
+  {
+    X[k] = k;        // access through AlignedVector<double>
+    Xs[k+1] = -1-k;  // access through raw pointer
+  }
+
+  // do the forward transform; write complex spectrum result into Y
+  fft.forward(X, Y);
+
+  // print spectral output, two bins per pass; the second print is skipped when Y has an odd number of bins
+  std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
+  std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
+  for (unsigned k = 0; k < Y.size(); k += 2)
+  {
+    std::cout << "Y[" << k << "] = " << Y[k] << std::endl;
+    if (k+1 < Y.size()) std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl;
+  }
+}
+
+
+int main(int argc, char *argv[])
+{
+  const int len = (argc > 1) ? atoi(argv[1]) : 32;  // optional first argument: transform length
+  cxx11_forward_real_double(len);
+  return 0;
+}
--- a/pffft/examples/example_cpp98_cplx_flt_fwd.cpp
+++ b/pffft/examples/example_cpp98_cplx_flt_fwd.cpp
@@ -0,0 +1,66 @@
+
+#include "pffft.hpp"
+
+#include <complex>
+#include <iostream>
+
+
+void cxx98_forward_complex_float(const int transformLen)
+{
+  // Example: forward FFT of transformLen complex<float> samples via pffft::Fft; prints every output bin.
+  std::cout << "running " << __FUNCTION__ << "()" << std::endl;
+  // first check - might be skipped
+  typedef pffft::Fft< std::complex<float> > FFT_T;
+  if (transformLen < FFT_T::minFFtsize())
+  {
+    std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
+    return;
+  }
+
+  // instantiate FFT and prepare transformation for length N
+  pffft::Fft< std::complex<float> > fft(transformLen);
+
+  // one more check
+  if (!fft.isValid())
+  {
+    std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
+              << "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
+              << "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
+    return;
+  }
+
+  // allocate aligned vectors for input X and output Y
+  pffft::AlignedVector< std::complex<float> > X = fft.valueVector();
+  pffft::AlignedVector< std::complex<float> > Y = fft.spectrumVector();
+
+  // alternative access: get raw pointers to aligned vectors
+  std::complex<float> *Xs = X.data();
+  std::complex<float> *Ys = Y.data();
+
+  // prepare some input data, two samples per pass (an odd length has no second sample in its last pass)
+  for (int k = 0; k < transformLen; k += 2)
+  {
+    X[k] = std::complex<float>(k, k&1);        // access through the aligned vector
+    if (k+1 < transformLen) Xs[k+1] = std::complex<float>(-1-k, k&1);  // access through raw pointer
+  }
+
+  // do the forward transform; write complex spectrum result into Y
+  fft.forward(X, Y);
+
+  // print spectral output, two bins per pass; the second print is skipped when Y has an odd number of bins
+  std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
+  std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
+  for (unsigned k = 0; k < Y.size(); k += 2)
+  {
+    std::cout << "Y[" << k << "] = " << Y[k] << std::endl;
+    if (k+1 < Y.size()) std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl;
+  }
+}
+
+
+int main(int argc, char *argv[])
+{
+  const int len = (argc > 1) ? atoi(argv[1]) : 16;  // optional first argument: transform length
+  cxx98_forward_complex_float(len);
+  return 0;
+}
--- a/pffft/examples/example_cpp98_real_flt_fwd.cpp
+++ b/pffft/examples/example_cpp98_real_flt_fwd.cpp
@@ -0,0 +1,66 @@
+
+#include "pffft.hpp"
+
+#include <complex>
+#include <iostream>
+
+
+void cxx98_forward_real_float(const int transformLen)
+{
+  // Example: forward FFT of transformLen real float samples via pffft::Fft; prints every output bin.
+  std::cout << "running " << __FUNCTION__ << "()" << std::endl;
+  // first check - might be skipped
+  typedef pffft::Fft<float> FFT_T;
+  if (transformLen < FFT_T::minFFtsize())
+  {
+    std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
+    return;
+  }
+
+  // instantiate FFT and prepare transformation for length N
+  pffft::Fft<float> fft(transformLen);
+
+  // one more check
+  if (!fft.isValid())
+  {
+    std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
+              << "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
+              << "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
+    return;
+  }
+
+  // allocate aligned vectors for input X and output Y
+  pffft::AlignedVector<float> X = fft.valueVector();
+  pffft::AlignedVector< std::complex<float> > Y = fft.spectrumVector();
+
+  // alternative access: get raw pointers to aligned vectors
+  float *Xs = X.data();
+  std::complex<float> *Ys = Y.data();
+
+  // prepare some input data
+  for (int k = 0; k < transformLen; k += 2)
+  {
+    X[k] = k;        // access through AlignedVector<float>
+    Xs[k+1] = -1-k;  // access through raw pointer
+  }
+
+  // do the forward transform; write complex spectrum result into Y
+  fft.forward(X, Y);
+
+  // print spectral output, two bins per pass; the second print is skipped when Y has an odd number of bins
+  std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
+  std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
+  for (unsigned k = 0; k < Y.size(); k += 2)
+  {
+    std::cout << "Y[" << k << "] = " << Y[k] << std::endl;
+    if (k+1 < Y.size()) std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl;
+  }
+}
+
+
+int main(int argc, char *argv[])
+{
+  const int len = (argc > 1) ? atoi(argv[1]) : 32;  // optional first argument: transform length
+  cxx98_forward_real_float(len);
+  return 0;
+}
--- a/pffft/fftpack.c
+++ b/pffft/fftpack.c
--- a/pffft/fftpack.h
+++ b/pffft/fftpack.h
@@ -0,0 +1,799 @@
+/*
+  Interface for the f2c translation of fftpack as found on http://www.netlib.org/fftpack/
+  
+   FFTPACK license:
+
+   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
+
+   Copyright (c) 2004 the University Corporation for Atmospheric
+   Research ("UCAR"). All rights reserved. Developed by NCAR's
+   Computational and Information Systems Laboratory, UCAR,
+   www.cisl.ucar.edu.
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.  
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+
+   ChangeLog:
+   2011/10/02: this is my first release of this file.
+*/
+
+#ifndef FFTPACK_H
+#define FFTPACK_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* just define FFTPACK_DOUBLE_PRECISION if you want to build it as a double precision fft */
+
+#ifndef FFTPACK_DOUBLE_PRECISION
+  typedef float fftpack_real;   /* default: single precision */
+  typedef int   fftpack_int;
+#else
+  typedef double fftpack_real;  /* FFTPACK_DOUBLE_PRECISION defined: double precision */
+  typedef int    fftpack_int;
+#endif
+
+  void cffti(fftpack_int n, fftpack_real *wsave);  /* initialize cfftf and cfftb */
+
+  void cfftf(fftpack_int n, fftpack_real *c, fftpack_real *wsave);  /* forward transform of a complex periodic sequence */
+
+  void cfftb(fftpack_int n, fftpack_real *c, fftpack_real *wsave);  /* unnormalized inverse of cfftf */
+
+  void rffti(fftpack_int n, fftpack_real *wsave);  /* initialize rfftf and rfftb */
+  void rfftf(fftpack_int n, fftpack_real *r, fftpack_real *wsave);  /* forward transform of a real periodic sequence */
+  void rfftb(fftpack_int n, fftpack_real *r, fftpack_real *wsave);  /* backward transform of a real coefficient array */
+
+  void cosqi(fftpack_int n, fftpack_real *wsave);  /* initialize cosqf and cosqb */
+  void cosqf(fftpack_int n, fftpack_real *x, fftpack_real *wsave);  /* forward cosine transform with odd wave numbers */
+  void cosqb(fftpack_int n, fftpack_real *x, fftpack_real *wsave);  /* unnormalized inverse of cosqf */
+
+  void costi(fftpack_int n, fftpack_real *wsave);  /* initialize cost */
+  void cost(fftpack_int n, fftpack_real *x, fftpack_real *wsave);  /* cosine transform of a real even sequence */
+
+  void sinqi(fftpack_int n, fftpack_real *wsave);  /* initialize sinqf and sinqb */
+  void sinqb(fftpack_int n, fftpack_real *x, fftpack_real *wsave);  /* unnormalized inverse of sinqf */
+  void sinqf(fftpack_int n, fftpack_real *x, fftpack_real *wsave);  /* forward sine transform with odd wave numbers */
+
+  void sinti(fftpack_int n, fftpack_real *wsave);  /* initialize sint */
+  void sint(fftpack_int n, fftpack_real *x, fftpack_real *wsave);  /* sine transform of a real odd sequence */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* FFTPACK_H */
+
+/*
+
+                      FFTPACK
+
+* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+
+                  version 4  april 1985
+
+     a package of fortran subprograms for the fast fourier
+      transform of periodic and other symmetric sequences
+
+                         by
+
+                  paul n swarztrauber
+
+  national center for atmospheric research  boulder,colorado 80307
+
+   which is sponsored by the national science foundation
+
+* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+
+
+this package consists of programs which perform fast fourier
+transforms for both complex and real periodic sequences and
+certain other symmetric sequences that are listed below.
+
+1.   rffti     initialize  rfftf and rfftb
+2.   rfftf     forward transform of a real periodic sequence
+3.   rfftb     backward transform of a real coefficient array
+
+4.   ezffti    initialize ezfftf and ezfftb
+5.   ezfftf    a simplified real periodic forward transform
+6.   ezfftb    a simplified real periodic backward transform
+
+7.   sinti     initialize sint
+8.   sint      sine transform of a real odd sequence
+
+9.   costi     initialize cost
+10.  cost      cosine transform of a real even sequence
+
+11.  sinqi     initialize sinqf and sinqb
+12.  sinqf     forward sine transform with odd wave numbers
+13.  sinqb     unnormalized inverse of sinqf
+
+14.  cosqi     initialize cosqf and cosqb
+15.  cosqf     forward cosine transform with odd wave numbers
+16.  cosqb     unnormalized inverse of cosqf
+
+17.  cffti     initialize cfftf and cfftb
+18.  cfftf     forward transform of a complex periodic sequence
+19.  cfftb     unnormalized inverse of cfftf
+
+
+******************************************************************
+
+subroutine rffti(n,wsave)
+
+  ****************************************************************
+
+subroutine rffti initializes the array wsave which is used in
+both rfftf and rfftb. the prime factorization of n together with
+a tabulation of the trigonometric functions are computed and
+stored in wsave.
+
+input parameter
+
+n       the length of the sequence to be transformed.
+
+output parameter
+
+wsave   a work array which must be dimensioned at least 2*n+15.
+        the same work array can be used for both rfftf and rfftb
+        as long as n remains unchanged. different wsave arrays
+        are required for different values of n. the contents of
+        wsave must not be changed between calls of rfftf or rfftb.
+
+******************************************************************
+
+subroutine rfftf(n,r,wsave)
+
+******************************************************************
+
+subroutine rfftf computes the fourier coefficients of a real
+periodic sequence (fourier analysis). the transform is defined
+below at output parameter r.
+
+input parameters
+
+n       the length of the array r to be transformed.  the method
+        is most efficient when n is a product of small primes.
+        n may change so long as different work arrays are provided
+
+r       a real array of length n which contains the sequence
+        to be transformed
+
+wsave   a work array which must be dimensioned at least 2*n+15
+        in the program that calls rfftf. the wsave array must be
+        initialized by calling subroutine rffti(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+        the same wsave array can be used by rfftf and rfftb.
+
+
+output parameters
+
+r       r(1) = the sum from i=1 to i=n of r(i)
+
+        if n is even set l =n/2   , if n is odd set l = (n+1)/2
+
+          then for k = 2,...,l
+
+             r(2*k-2) = the sum from i = 1 to i = n of
+
+                  r(i)*cos((k-1)*(i-1)*2*pi/n)
+
+             r(2*k-1) = the sum from i = 1 to i = n of
+
+                 -r(i)*sin((k-1)*(i-1)*2*pi/n)
+
+        if n is even
+
+             r(n) = the sum from i = 1 to i = n of
+
+                  (-1)**(i-1)*r(i)
+
+ *****  note
+             this transform is unnormalized since a call of rfftf
+             followed by a call of rfftb will multiply the input
+             sequence by n.
+
+wsave   contains results which must not be destroyed between
+        calls of rfftf or rfftb.
+
+
+******************************************************************
+
+subroutine rfftb(n,r,wsave)
+
+******************************************************************
+
+subroutine rfftb computes the real periodic sequence from its
+fourier coefficients (fourier synthesis). the transform is defined
+below at output parameter r.
+
+input parameters
+
+n       the length of the array r to be transformed.  the method
+        is most efficient when n is a product of small primes.
+        n may change so long as different work arrays are provided
+
+r       a real array of length n which contains the sequence
+        to be transformed
+
+wsave   a work array which must be dimensioned at least 2*n+15
+        in the program that calls rfftb. the wsave array must be
+        initialized by calling subroutine rffti(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+        the same wsave array can be used by rfftf and rfftb.
+
+
+output parameters
+
+r       for n even and for i = 1,...,n
+
+             r(i) = r(1)+(-1)**(i-1)*r(n)
+
+                  plus the sum from k=2 to k=n/2 of
+
+                   2.*r(2*k-2)*cos((k-1)*(i-1)*2*pi/n)
+
+                  -2.*r(2*k-1)*sin((k-1)*(i-1)*2*pi/n)
+
+        for n odd and for i = 1,...,n
+
+             r(i) = r(1) plus the sum from k=2 to k=(n+1)/2 of
+
+                  2.*r(2*k-2)*cos((k-1)*(i-1)*2*pi/n)
+
+                 -2.*r(2*k-1)*sin((k-1)*(i-1)*2*pi/n)
+
+ *****  note
+             this transform is unnormalized since a call of rfftf
+             followed by a call of rfftb will multiply the input
+             sequence by n.
+
+wsave   contains results which must not be destroyed between
+        calls of rfftb or rfftf.
+
+******************************************************************
+
+subroutine sinti(n,wsave)
+
+******************************************************************
+
+subroutine sinti initializes the array wsave which is used in
+subroutine sint. the prime factorization of n together with
+a tabulation of the trigonometric functions are computed and
+stored in wsave.
+
+input parameter
+
+n       the length of the sequence to be transformed.  the method
+        is most efficient when n+1 is a product of small primes.
+
+output parameter
+
+wsave   a work array with at least int(2.5*n+15) locations.
+        different wsave arrays are required for different values
+        of n. the contents of wsave must not be changed between
+        calls of sint.
+
+******************************************************************
+
+subroutine sint(n,x,wsave)
+
+******************************************************************
+
+subroutine sint computes the discrete fourier sine transform
+of an odd sequence x(i). the transform is defined below at
+output parameter x.
+
+sint is the unnormalized inverse of itself since a call of sint
+followed by another call of sint will multiply the input sequence
+x by 2*(n+1).
+
+the array wsave which is used by subroutine sint must be
+initialized by calling subroutine sinti(n,wsave).
+
+input parameters
+
+n       the length of the sequence to be transformed.  the method
+        is most efficient when n+1 is the product of small primes.
+
+x       an array which contains the sequence to be transformed
+
+
+wsave   a work array with dimension at least int(2.5*n+15)
+        in the program that calls sint. the wsave array must be
+        initialized by calling subroutine sinti(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+
+output parameters
+
+x       for i=1,...,n
+
+             x(i)= the sum from k=1 to k=n
+
+                  2*x(k)*sin(k*i*pi/(n+1))
+
+             a call of sint followed by another call of
+             sint will multiply the sequence x by 2*(n+1).
+             hence sint is the unnormalized inverse
+             of itself.
+
+wsave   contains initialization calculations which must not be
+        destroyed between calls of sint.
+
+******************************************************************
+
+subroutine costi(n,wsave)
+
+******************************************************************
+
+subroutine costi initializes the array wsave which is used in
+subroutine cost. the prime factorization of n together with
+a tabulation of the trigonometric functions are computed and
+stored in wsave.
+
+input parameter
+
+n       the length of the sequence to be transformed.  the method
+        is most efficient when n-1 is a product of small primes.
+
+output parameter
+
+wsave   a work array which must be dimensioned at least 3*n+15.
+        different wsave arrays are required for different values
+        of n. the contents of wsave must not be changed between
+        calls of cost.
+
+******************************************************************
+
+subroutine cost(n,x,wsave)
+
+******************************************************************
+
+subroutine cost computes the discrete fourier cosine transform
+of an even sequence x(i). the transform is defined below at output
+parameter x.
+
+cost is the unnormalized inverse of itself since a call of cost
+followed by another call of cost will multiply the input sequence
+x by 2*(n-1). the transform is defined below at output parameter x
+
+the array wsave which is used by subroutine cost must be
+initialized by calling subroutine costi(n,wsave).
+
+input parameters
+
+n       the length of the sequence x. n must be greater than 1.
+        the method is most efficient when n-1 is a product of
+        small primes.
+
+x       an array which contains the sequence to be transformed
+
+wsave   a work array which must be dimensioned at least 3*n+15
+        in the program that calls cost. the wsave array must be
+        initialized by calling subroutine costi(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+
+output parameters
+
+x       for i=1,...,n
+
+            x(i) = x(1)+(-1)**(i-1)*x(n)
+
+             + the sum from k=2 to k=n-1
+
+                 2*x(k)*cos((k-1)*(i-1)*pi/(n-1))
+
+             a call of cost followed by another call of
+             cost will multiply the sequence x by 2*(n-1)
+             hence cost is the unnormalized inverse
+             of itself.
+
+wsave   contains initialization calculations which must not be
+        destroyed between calls of cost.
+
+******************************************************************
+
+subroutine sinqi(n,wsave)
+
+******************************************************************
+
+subroutine sinqi initializes the array wsave which is used in
+both sinqf and sinqb. the prime factorization of n together with
+a tabulation of the trigonometric functions are computed and
+stored in wsave.
+
+input parameter
+
+n       the length of the sequence to be transformed. the method
+        is most efficient when n is a product of small primes.
+
+output parameter
+
+wsave   a work array which must be dimensioned at least 3*n+15.
+        the same work array can be used for both sinqf and sinqb
+        as long as n remains unchanged. different wsave arrays
+        are required for different values of n. the contents of
+        wsave must not be changed between calls of sinqf or sinqb.
+
+******************************************************************
+
+subroutine sinqf(n,x,wsave)
+
+******************************************************************
+
+subroutine sinqf computes the fast fourier transform of quarter
+wave data. that is , sinqf computes the coefficients in a sine
+series representation with only odd wave numbers. the transform
+is defined below at output parameter x.
+
+sinqb is the unnormalized inverse of sinqf since a call of sinqf
+followed by a call of sinqb will multiply the input sequence x
+by 4*n.
+
+the array wsave which is used by subroutine sinqf must be
+initialized by calling subroutine sinqi(n,wsave).
+
+
+input parameters
+
+n       the length of the array x to be transformed.  the method
+        is most efficient when n is a product of small primes.
+
+x       an array which contains the sequence to be transformed
+
+wsave   a work array which must be dimensioned at least 3*n+15
+        in the program that calls sinqf. the wsave array must be
+        initialized by calling subroutine sinqi(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+
+output parameters
+
+x       for i=1,...,n
+
+             x(i) = (-1)**(i-1)*x(n)
+
+                + the sum from k=1 to k=n-1 of
+
+                2*x(k)*sin((2*i-1)*k*pi/(2*n))
+
+             a call of sinqf followed by a call of
+             sinqb will multiply the sequence x by 4*n.
+             therefore sinqb is the unnormalized inverse
+             of sinqf.
+
+wsave   contains initialization calculations which must not
+        be destroyed between calls of sinqf or sinqb.
+
+******************************************************************
+
+subroutine sinqb(n,x,wsave)
+
+******************************************************************
+
+subroutine sinqb computes the fast fourier transform of quarter
+wave data. that is , sinqb computes a sequence from its
+representation in terms of a sine series with odd wave numbers.
+the transform is defined below at output parameter x.
+
+sinqf is the unnormalized inverse of sinqb since a call of sinqb
+followed by a call of sinqf will multiply the input sequence x
+by 4*n.
+
+the array wsave which is used by subroutine sinqb must be
+initialized by calling subroutine sinqi(n,wsave).
+
+
+input parameters
+
+n       the length of the array x to be transformed.  the method
+        is most efficient when n is a product of small primes.
+
+x       an array which contains the sequence to be transformed
+
+wsave   a work array which must be dimensioned at least 3*n+15
+        in the program that calls sinqb. the wsave array must be
+        initialized by calling subroutine sinqi(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+
+output parameters
+
+x       for i=1,...,n
+
+             x(i)= the sum from k=1 to k=n of
+
+               4*x(k)*sin((2*k-1)*i*pi/(2*n))
+
+             a call of sinqb followed by a call of
+             sinqf will multiply the sequence x by 4*n.
+             therefore sinqf is the unnormalized inverse
+             of sinqb.
+
+wsave   contains initialization calculations which must not
+        be destroyed between calls of sinqb or sinqf.
+
+******************************************************************
+
+subroutine cosqi(n,wsave)
+
+******************************************************************
+
+subroutine cosqi initializes the array wsave which is used in
+both cosqf and cosqb. the prime factorization of n together with
+a tabulation of the trigonometric functions are computed and
+stored in wsave.
+
+input parameter
+
+n       the length of the array to be transformed.  the method
+        is most efficient when n is a product of small primes.
+
+output parameter
+
+wsave   a work array which must be dimensioned at least 3*n+15.
+        the same work array can be used for both cosqf and cosqb
+        as long as n remains unchanged. different wsave arrays
+        are required for different values of n. the contents of
+        wsave must not be changed between calls of cosqf or cosqb.
+
+******************************************************************
+
+subroutine cosqf(n,x,wsave)
+
+******************************************************************
+
+subroutine cosqf computes the fast fourier transform of quarter
+wave data. that is , cosqf computes the coefficients in a cosine
+series representation with only odd wave numbers. the transform
+is defined below at output parameter x
+
+cosqf is the unnormalized inverse of cosqb since a call of cosqf
+followed by a call of cosqb will multiply the input sequence x
+by 4*n.
+
+the array wsave which is used by subroutine cosqf must be
+initialized by calling subroutine cosqi(n,wsave).
+
+
+input parameters
+
+n       the length of the array x to be transformed.  the method
+        is most efficient when n is a product of small primes.
+
+x       an array which contains the sequence to be transformed
+
+wsave   a work array which must be dimensioned at least 3*n+15
+        in the program that calls cosqf. the wsave array must be
+        initialized by calling subroutine cosqi(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+
+output parameters
+
+x       for i=1,...,n
+
+             x(i) = x(1) plus the sum from k=2 to k=n of
+
+                2*x(k)*cos((2*i-1)*(k-1)*pi/(2*n))
+
+             a call of cosqf followed by a call of
+             cosqb will multiply the sequence x by 4*n.
+             therefore cosqb is the unnormalized inverse
+             of cosqf.
+
+wsave   contains initialization calculations which must not
+        be destroyed between calls of cosqf or cosqb.
+
+******************************************************************
+
+subroutine cosqb(n,x,wsave)
+
+******************************************************************
+
+subroutine cosqb computes the fast fourier transform of quarter
+wave data. that is , cosqb computes a sequence from its
+representation in terms of a cosine series with odd wave numbers.
+the transform is defined below at output parameter x.
+
+cosqb is the unnormalized inverse of cosqf since a call of cosqb
+followed by a call of cosqf will multiply the input sequence x
+by 4*n.
+
+the array wsave which is used by subroutine cosqb must be
+initialized by calling subroutine cosqi(n,wsave).
+
+
+input parameters
+
+n       the length of the array x to be transformed.  the method
+        is most efficient when n is a product of small primes.
+
+x       an array which contains the sequence to be transformed
+
+wsave   a work array that must be dimensioned at least 3*n+15
+        in the program that calls cosqb. the wsave array must be
+        initialized by calling subroutine cosqi(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+
+output parameters
+
+x       for i=1,...,n
+
+             x(i)= the sum from k=1 to k=n of
+
+               4*x(k)*cos((2*k-1)*(i-1)*pi/(2*n))
+
+             a call of cosqb followed by a call of
+             cosqf will multiply the sequence x by 4*n.
+             therefore cosqf is the unnormalized inverse
+             of cosqb.
+
+wsave   contains initialization calculations which must not
+        be destroyed between calls of cosqb or cosqf.
+
+******************************************************************
+
+subroutine cffti(n,wsave)
+
+******************************************************************
+
+subroutine cffti initializes the array wsave which is used in
+both cfftf and cfftb. the prime factorization of n together with
+a tabulation of the trigonometric functions are computed and
+stored in wsave.
+
+input parameter
+
+n       the length of the sequence to be transformed
+
+output parameter
+
+wsave   a work array which must be dimensioned at least 4*n+15
+        the same work array can be used for both cfftf and cfftb
+        as long as n remains unchanged. different wsave arrays
+        are required for different values of n. the contents of
+        wsave must not be changed between calls of cfftf or cfftb.
+
+******************************************************************
+
+subroutine cfftf(n,c,wsave)
+
+******************************************************************
+
+subroutine cfftf computes the forward complex discrete fourier
+transform (the fourier analysis). equivalently , cfftf computes
+the fourier coefficients of a complex periodic sequence.
+the transform is defined below at output parameter c.
+
+the transform is not normalized. to obtain a normalized transform
+the output must be divided by n. otherwise a call of cfftf
+followed by a call of cfftb will multiply the sequence by n.
+
+the array wsave which is used by subroutine cfftf must be
+initialized by calling subroutine cffti(n,wsave).
+
+input parameters
+
+
+n      the length of the complex sequence c. the method is
+       more efficient when n is the product of small primes.
+
+c      a complex array of length n which contains the sequence
+
+wsave   a real work array which must be dimensioned at least 4*n+15
+        in the program that calls cfftf. the wsave array must be
+        initialized by calling subroutine cffti(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+        the same wsave array can be used by cfftf and cfftb.
+
+output parameters
+
+c      for j=1,...,n
+
+           c(j)=the sum from k=1,...,n of
+
+                 c(k)*exp(-i*(j-1)*(k-1)*2*pi/n)
+
+                       where i=sqrt(-1)
+
+wsave   contains initialization calculations which must not be
+        destroyed between calls of subroutine cfftf or cfftb
+
+******************************************************************
+
+subroutine cfftb(n,c,wsave)
+
+******************************************************************
+
+subroutine cfftb computes the backward complex discrete fourier
+transform (the fourier synthesis). equivalently , cfftb computes
+a complex periodic sequence from its fourier coefficients.
+the transform is defined below at output parameter c.
+
+a call of cfftf followed by a call of cfftb will multiply the
+sequence by n.
+
+the array wsave which is used by subroutine cfftb must be
+initialized by calling subroutine cffti(n,wsave).
+
+input parameters
+
+
+n      the length of the complex sequence c. the method is
+       more efficient when n is the product of small primes.
+
+c      a complex array of length n which contains the sequence
+
+wsave   a real work array which must be dimensioned at least 4*n+15
+        in the program that calls cfftb. the wsave array must be
+        initialized by calling subroutine cffti(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+        the same wsave array can be used by cfftf and cfftb.
+
+output parameters
+
+c      for j=1,...,n
+
+           c(j)=the sum from k=1,...,n of
+
+                 c(k)*exp(i*(j-1)*(k-1)*2*pi/n)
+
+                       where i=sqrt(-1)
+
+wsave   contains initialization calculations which must not be
+        destroyed between calls of subroutine cfftf or cfftb
+
+*/
--- a/pffft/fmv.h
+++ b/pffft/fmv.h
@@ -0,0 +1,20 @@
+#ifndef FMV_H
+#define FMV_H
+
+/* Function multi-versioning (FMV) helper.
+ *
+ * Provides PF_TARGET_CLONES, a prefix to put in front of a function
+ * definition. It expands to GCC's target_clones attribute only when all of
+ * the following hold:
+ *   - HAVE_FUNC_ATTRIBUTE_IFUNC evaluates to non-zero
+ *     (presumably set by the build system - confirm in the CMake files),
+ *   - the compiler offers __has_attribute and reports target_clones,
+ *   - __x86_64 is defined.
+ * In that case HAVE_PF_TARGET_CLONES is defined to 1 as well.
+ * In every other case PF_TARGET_CLONES expands to nothing.
+ */
+
+#if HAVE_FUNC_ATTRIBUTE_IFUNC
+#if defined(__has_attribute)
+#if __has_attribute(target_clones)
+#if defined(__x86_64)
+
+// see https://gcc.gnu.org/wiki/FunctionMultiVersioning
+#define PF_TARGET_CLONES __attribute__((target_clones("avx","sse4.2","sse3","sse2","sse","default")))
+#define HAVE_PF_TARGET_CLONES  1
+#endif /* __x86_64 */
+#endif /* __has_attribute(target_clones) */
+#endif /* defined(__has_attribute) */
+#endif /* HAVE_FUNC_ATTRIBUTE_IFUNC */
+
+/* fallback: no multi-versioning, the prefix vanishes */
+#ifndef PF_TARGET_CLONES
+#define PF_TARGET_CLONES
+#endif
+
+#endif /* FMV_H */
--- a/pffft/mingw-w32-i686.cmake
+++ b/pffft/mingw-w32-i686.cmake
@@ -0,0 +1,25 @@
+# Sample toolchain file for building for Windows from an Ubuntu Linux system.
+#
+# Typical usage:
+#    *) install cross compiler: `sudo apt-get install mingw-w64`
+#    *) cd build
+#    *) cmake -DCMAKE_TOOLCHAIN_FILE=~/mingw-w32-i686.cmake ..
+#
+# build for Windows' 32 bit architecture
+
+set(CMAKE_SYSTEM_NAME Windows)
+# target processor of the 32 bit build: has to agree with the i686 compilers
+# selected through TOOLCHAIN_PREFIX below
+set(CMAKE_SYSTEM_PROCESSOR i686)
+set(TOOLCHAIN_PREFIX i686-w64-mingw32)
+
+# cross compilers to use for C and C++, plus the Windows resource compiler
+set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}-gcc)
+set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}-g++)
+set(CMAKE_RC_COMPILER ${TOOLCHAIN_PREFIX}-windres)
+
+# target environment on the build host system
+set(CMAKE_FIND_ROOT_PATH /usr/${TOOLCHAIN_PREFIX})
+
+# modify default behavior of FIND_XXX() commands:
+# programs are taken from the build host only, libraries and headers
+# from the target environment (CMAKE_FIND_ROOT_PATH) only
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
--- a/pffft/mingw-w64-x64_64.cmake
+++ b/pffft/mingw-w64-x64_64.cmake
@@ -0,0 +1,25 @@
+# Sample toolchain file for building for Windows from an Ubuntu Linux system.
+#
+# Typical usage:
+#    *) install cross compiler: `sudo apt-get install mingw-w64`
+#    *) cd build
+#    *) cmake -DCMAKE_TOOLCHAIN_FILE=~/mingw-w64-x64_64.cmake ..
+#
+# build for Windows' 64 bit architecture
+
+set(CMAKE_SYSTEM_NAME Windows)
+set(CMAKE_SYSTEM_PROCESSOR x86_64)
+set(TOOLCHAIN_PREFIX x86_64-w64-mingw32)
+
+# cross compilers to use for C and C++, plus the Windows resource compiler
+set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}-gcc)
+set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}-g++)
+set(CMAKE_RC_COMPILER ${TOOLCHAIN_PREFIX}-windres)
+
+# target environment on the build host system
+set(CMAKE_FIND_ROOT_PATH /usr/${TOOLCHAIN_PREFIX})
+
+# modify default behavior of FIND_XXX() commands:
+# programs are taken from the build host only, libraries and headers
+# from the target environment (CMAKE_FIND_ROOT_PATH) only
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
--- a/pffft/papi_perf_counter.h
+++ b/pffft/papi_perf_counter.h
@@ -0,0 +1,97 @@
+#pragma once
+
+/* for measurement of CPU cycles ..
+ *
+ * requires
+ *   sudo apt-get install libpapi-dev papi-tools
+ * on debian/ubuntu linux distributions
+ *
+ */
+
+#ifdef HAVE_PAPI
+#include <papi.h>
+#endif
+
+#include <stdio.h>
+
+
+/**
+ * Snapshot/difference helper around PAPI_ipc().
+ *
+ * start() stores the values reported by PAPI_ipc(); finish() takes a second
+ * snapshot and replaces the stored values by the differences; print() writes
+ * one summary line. Without HAVE_PAPI every start() fails, so finish()
+ * returns false and print() writes nothing.
+ *
+ * NOTE(review): the subtraction in finish() assumes the values reported by
+ * PAPI_ipc() are cumulative between two calls - confirm against the PAPI docs.
+ */
+struct papi_perf_counter
+{
+    /** Idle counter: all fields zeroed, not started, silent at destruction. */
+    papi_perf_counter()
+        : realTime(0.0F), processTime(0.0F), instructions(0LL), ipc(0.0F)
+        , started(false), finished(false), print_at_destruction(false)
+    { }
+
+    /**
+     * Creates the counter and immediately calls start().
+     *
+     * @param _start                 ignored; only selects this overload
+     * @param print_at_destruction_  when true, the destructor calls print(stderr)
+     */
+    papi_perf_counter(int _start, bool print_at_destruction_ = true)
+        /* zero every field first: start() does not write the measurement
+         * fields when PAPI is unavailable, and may not when PAPI_ipc() fails */
+        : realTime(0.0F), processTime(0.0F), instructions(0LL), ipc(0.0F)
+        , started(false), finished(false), print_at_destruction(print_at_destruction_)
+    {
+        (void)_start;
+        start();
+    }
+
+    /** Prints the result to stderr if print_at_destruction is set. */
+    ~papi_perf_counter()
+    {
+        if (print_at_destruction)
+            print(stderr);
+    }
+
+    /**
+     * Takes the start snapshot.
+     *
+     * An error (PAPI_ipc() failure or missing HAVE_PAPI) is reported on
+     * stderr only once per program run.
+     *
+     * @return true if the snapshot was taken, i.e. PAPI_ipc() returned 0
+     */
+    bool start()
+    {
+        static bool reported_start_error = false;
+#ifdef HAVE_PAPI
+        int ret = PAPI_ipc(&realTime, &processTime, &instructions, &ipc);
+        if (ret && !reported_start_error)
+        {
+            reported_start_error = true;
+            fprintf(stderr, "papi_perf_counter::start(): PAPI_ipc() returned error %d\n", ret);
+        }
+#else
+        if (!reported_start_error)
+        {
+            reported_start_error = true;
+            fprintf(stderr, "papi_perf_counter::start(): no HAVE_PAPI\n");
+        }
+        int ret = 1;
+#endif
+        started = (!ret);
+        finished = false;
+        return started;
+    }
+
+    /**
+     * Takes the end snapshot and converts the stored values into differences.
+     *
+     * @return true if this counter was started, not yet finished and the end
+     *         snapshot succeeded; false otherwise (fields stay untouched)
+     */
+    bool finish()
+    {
+        /* temporary counter = end snapshot; must not print when it goes away */
+        papi_perf_counter end(1, false);
+        if (started && !finished && end.started)
+        {
+            realTime = end.realTime - realTime;
+            processTime = end.processTime - processTime;
+            instructions = end.instructions - instructions;
+            ipc = end.ipc;
+            finished = true;
+            return true;
+        }
+        return false;
+    }
+
+    /**
+     * Writes one summary line to f; calls finish() first if still running.
+     * Nothing is written unless the counter was started and finished.
+     * Clears `started` afterwards, so a later print() stays silent.
+     */
+    void print(FILE *f = stdout)
+    {
+        if (started && !finished)
+            finish();
+        if (!started || !finished)
+            return;
+        double cycles = instructions / ipc;
+        fprintf(f, "real %g, process %g, instructions %lld, ins/cycle %f => cycles %g\n"
+                , realTime, processTime, instructions, ipc, cycles
+                );
+        started = false;
+    }
+
+    float realTime;             /* value from PAPI_ipc(); difference after finish() */
+    float processTime;          /* value from PAPI_ipc(); difference after finish() */
+    long long instructions;     /* value from PAPI_ipc(); difference after finish() */
+    float ipc;                  /* instructions per cycle as reported by PAPI_ipc() */
+    bool started;               /* start() succeeded and print() has not run yet */
+    bool finished;              /* finish() succeeded since the last start() */
+    bool print_at_destruction;  /* destructor calls print(stderr) */
+};
+
--- a/pffft/pf_carrier.cpp
+++ b/pffft/pf_carrier.cpp
@@ -0,0 +1,298 @@
+/*
+This software is part of pffft/pfdsp, a set of simple DSP routines.
+
+Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
+Copyright (c) 2020  Hayati Ayguen <h_ayguen@web.de>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* include own header first, to see missing includes */
+#include "pf_carrier.h"
+#include "fmv.h"
+
+#include <limits.h>
+#include <assert.h>
+
+
+/* Fills output with `size` interleaved complex samples (re, im) of a
+ * constant carrier: re = 127/128, im = 0.
+ * output is written at indices 0 .. 2*size-1. */
+PF_TARGET_CLONES
+void generate_dc_f(float* output, int size)
+{
+    const float amplitude = 127.0F / 128.0F;
+    float* dst = output;
+    for (int n = 0; n < size; ++n)
+    {
+        /* exp(i*0) = 1+i*0 */
+        *dst++ = amplitude;
+        *dst++ = 0.0F;
+    }
+}
+
+/* Fills output with `size` interleaved complex samples (re, im) of a
+ * constant carrier: re = SHRT_MAX, im = 0.
+ * output is written at indices 0 .. 2*size-1. */
+PF_TARGET_CLONES
+void generate_dc_s16(short* output, int size)
+{
+    short* dst = output;
+    for (int n = 0; n < size; ++n)
+    {
+        /* exp(i*0) = 1+i*0 */
+        *dst++ = SHRT_MAX;
+        *dst++ = 0;
+    }
+}
+
+/* Fills output with interleaved complex samples (re, im) that repeat the
+ * 4-sample sequence 1, +i, -1, -i scaled by 127/128, i.e. exp(+i*n*pi/2).
+ * Samples are written in groups of 4 (8 floats) while the write index is
+ * below 2*size. */
+PF_TARGET_CLONES
+void generate_pos_fs4_f(float* output, int size)
+{
+    const float a = 127.0F / 128.0F;
+    const float period[8] = {
+           a, 0.0F,   /* exp(i*  0   ) =  1+i* 0 */
+        0.0F,    a,   /* exp(i* +pi/2) =  0+i* 1 */
+          -a, 0.0F,   /* exp(i* +pi  ) = -1+i* 0 */
+        0.0F,   -a    /* exp(i* -pi/2) =  0+i*-1 */
+    };
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for (int base = 0; base < 2*size; base += 8)
+        for (int k = 0; k < 8; ++k)
+            output[base + k] = period[k];
+}
+
+/* Fills output with interleaved complex samples (re, im) that repeat the
+ * 4-sample sequence 1, +i, -1, -i scaled by SHRT_MAX, i.e. exp(+i*n*pi/2).
+ * Samples are written in groups of 4 (8 shorts) while the write index is
+ * below 2*size. */
+PF_TARGET_CLONES
+void generate_pos_fs4_s16(short* output, int size)
+{
+    const short period[8] = {
+         SHRT_MAX, 0,          /* exp(i*  0   ) =  1+i* 0 */
+         0,  SHRT_MAX,         /* exp(i* +pi/2) =  0+i* 1 */
+        -SHRT_MAX, 0,          /* exp(i* +pi  ) = -1+i* 0 */
+         0, -SHRT_MAX          /* exp(i* -pi/2) =  0+i*-1 */
+    };
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for (int base = 0; base < 2*size; base += 8)
+        for (int k = 0; k < 8; ++k)
+            output[base + k] = period[k];
+}
+
+/* Fills output with interleaved complex samples (re, im) that repeat the
+ * 4-sample sequence 1, -i, -1, +i scaled by 127/128, i.e. exp(-i*n*pi/2).
+ * Samples are written in groups of 4 (8 floats) while the write index is
+ * below 2*size. */
+PF_TARGET_CLONES
+void generate_neg_fs4_f(float* output, int size)
+{
+    const float a = 127.0F / 128.0F;
+    const float period[8] = {
+           a, 0.0F,   /* exp(i*  0   ) =  1+i* 0 */
+        0.0F,   -a,   /* exp(i* -pi/2) =  0+i*-1 */
+          -a, 0.0F,   /* exp(i* +pi  ) = -1+i* 0 */
+        0.0F,    a    /* exp(i* +pi/2) =  0+i* 1 */
+    };
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for (int base = 0; base < 2*size; base += 8)
+        for (int k = 0; k < 8; ++k)
+            output[base + k] = period[k];
+}
+
+/* Fills output with interleaved complex samples (re, im) that repeat the
+ * 4-sample sequence 1, -i, -1, +i scaled by SHRT_MAX, i.e. exp(-i*n*pi/2).
+ * Samples are written in groups of 4 (8 shorts) while the write index is
+ * below 2*size. */
+PF_TARGET_CLONES
+void generate_neg_fs4_s16(short* output, int size)
+{
+    const short period[8] = {
+         SHRT_MAX, 0,          /* exp(i*  0   ) =  1+i* 0 */
+         0, -SHRT_MAX,         /* exp(i* -pi/2) =  0+i*-1 */
+        -SHRT_MAX, 0,          /* exp(i* +pi  ) = -1+i* 0 */
+         0,  SHRT_MAX          /* exp(i* +pi/2) =  0+i* 1 */
+    };
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for (int base = 0; base < 2*size; base += 8)
+        for (int k = 0; k < 8; ++k)
+            output[base + k] = period[k];
+}
+
+/****************************************************/
+
+/* Fills output with interleaved complex samples (re, im) of the sum
+ * dc + exp(+i*n*pi/2), each term scaled by m = SHRT_MAX / 2.
+ * One period is (2m,0), (m,m), (0,0), (m,-m); periods are written
+ * (8 shorts each) while the write index is below 2*size. */
+PF_TARGET_CLONES
+void generate_dc_pos_fs4_s16(short* output, int size)
+{
+    const int m = SHRT_MAX / 2;
+    const short period[8] = {
+        m+m,  0,   /* 1 + exp(i*  0   ) = 1+1 +i* 0 */
+        m+0,  m,   /* 1 + exp(i* +pi/2) = 1+0 +i* 1 */
+        m-m,  0,   /* 1 + exp(i* +pi  ) = 1-1 +i* 0 */
+        m,   -m    /* 1 + exp(i* -pi/2) = 1+0 +i*-1 */
+    };
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for (int base = 0; base < 2*size; base += 8)
+        for (int k = 0; k < 8; ++k)
+            output[base + k] = period[k];
+}
+
+/* Fills output with interleaved complex samples (re, im) of the sum
+ * dc + exp(-i*n*pi/2), each term scaled by m = SHRT_MAX / 2.
+ * One period is (2m,0), (m,-m), (0,0), (m,m); periods are written
+ * (8 shorts each) while the write index is below 2*size. */
+PF_TARGET_CLONES
+void generate_dc_neg_fs4_s16(short* output, int size)
+{
+    const int m = SHRT_MAX / 2;
+    const short period[8] = {
+        m+m,  0,   /* 1 + exp(i*  0   ) = 1+1 +i* 0 */
+        m+0, -m,   /* 1 + exp(i* -pi/2) = 1+0 +i*-1 */
+        m-m,  0,   /* 1 + exp(i* +pi  ) = 1-1 +i* 0 */
+        m+0,  m    /* 1 + exp(i* +pi/2) = 1+0 +i* 1 */
+    };
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for (int base = 0; base < 2*size; base += 8)
+        for (int k = 0; k < 8; ++k)
+            output[base + k] = period[k];
+}
+
+/* Fills output with interleaved complex samples (re, im) repeating the
+ * 4-sample period (m,-m), (-m,m), (-m,m), (m,-m) with m = SHRT_MAX / 2.
+ * Periods are written (8 shorts each) while the write index is below 2*size.
+ *
+ * NOTE(review): these values are NOT the plain sum
+ * exp(+i*n*pi/2) + exp(-i*n*pi/2), which would give 2, 0, -2, 0.
+ * The real part follows the sign pattern +,-,-,+ and the imaginary part is
+ * its negation - presumably a phase-shifted mix of the +fs/4 and -fs/4
+ * carriers; confirm the intended signal with the callers. */
+PF_TARGET_CLONES
+void generate_pos_neg_fs4_s16(short* output, int size)
+{
+    const int m = SHRT_MAX / 2;
+    const short period[8] = {
+         m, -m,   /* sample 0 */
+        -m,  m,   /* sample 1 */
+        -m,  m,   /* sample 2 */
+         m, -m    /* sample 3 */
+    };
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for (int base = 0; base < 2*size; base += 8)
+        for (int k = 0; k < 8; ++k)
+            output[base + k] = period[k];
+}
+
+/* Fills output with interleaved complex samples (re, im) repeating the
+ * 4-sample period (2m,-m), (0,m), (0,m), (2m,-m) with m = SHRT_MAX / 2.
+ * That is the period written by generate_pos_neg_fs4_s16() with m added to
+ * every real part (the dc term).
+ * Periods are written (8 shorts each) while the write index is below 2*size.
+ *
+ * NOTE(review): these values are NOT dc plus the plain sum
+ * exp(+i*n*pi/2) + exp(-i*n*pi/2); confirm the intended signal with the
+ * callers. */
+PF_TARGET_CLONES
+void generate_dc_pos_neg_fs4_s16(short* output, int size)
+{
+    const int m = SHRT_MAX / 2;
+    const short period[8] = {
+        m+m, -m,   /* sample 0: dc + ( m, -m) */
+        0,    m,   /* sample 1: dc + (-m,  m) */
+        0,    m,   /* sample 2: dc + (-m,  m) */
+        m+m, -m    /* sample 3: dc + ( m, -m) */
+    };
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for (int base = 0; base < 2*size; base += 8)
+        for (int k = 0; k < 8; ++k)
+            output[base + k] = period[k];
+}
+
+
+/* Fills output with interleaved complex samples (re, im) alternating
+ * between (m,0) and (-m,0) with m = SHRT_MAX / 2, i.e. m * exp(i*n*pi).
+ * No dc term is added. Samples are written in groups of 4 (8 shorts)
+ * while the write index is below 2*size. */
+PF_TARGET_CLONES
+void generate_pos_neg_fs2_s16(short* output, int size)
+{
+    const int m = SHRT_MAX / 2;
+    const short group[8] = {
+         m, 0,   /* exp(i* 0 ) = +1 */
+        -m, 0,   /* exp(i* pi) = -1 */
+         m, 0,   /* exp(i* 0 ) = +1 */
+        -m, 0    /* exp(i* pi) = -1 */
+    };
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for (int base = 0; base < 2*size; base += 8)
+        for (int k = 0; k < 8; ++k)
+            output[base + k] = group[k];
+}
+
+/* Fills output with interleaved complex samples (re, im) alternating
+ * between (m,m) and (-m,m) with m = SHRT_MAX / 2, i.e. the fs/2 carrier
+ * m * exp(i*n*pi) plus a dc term of i*m. Samples are written in groups of 4
+ * (8 shorts) while the write index is below 2*size. */
+PF_TARGET_CLONES
+void generate_dc_pos_neg_fs2_s16(short* output, int size)
+{
+    const int m = SHRT_MAX / 2;
+    /* with dc = i*1 */
+    const short group[8] = {
+         m, m,   /* dc + exp(i* 0 ) = i*1 +1 */
+        -m, m,   /* dc + exp(i* pi) = i*1 -1 */
+         m, m,   /* dc + exp(i* 0 ) = i*1 +1 */
+        -m, m    /* dc + exp(i* pi) = i*1 -1 */
+    };
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for (int base = 0; base < 2*size; base += 8)
+        for (int k = 0; k < 8; ++k)
+            output[base + k] = group[k];
+}
+
+
--- a/pffft/pf_carrier.h
+++ b/pffft/pf_carrier.h
@@ -0,0 +1,75 @@
+/*
+This software is part of pffft/pfdsp, a set of simple DSP routines.
+
+Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
+Copyright (c) 2020  Hayati Ayguen <h_ayguen@web.de>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <stdio.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*
+   _____                      _
+  / ____|                    | |
+ | |     ___  _ __ ___  _ __ | | _____  __
+ | |    / _ \| '_ ` _ \| '_ \| |/ _ \ \/ /
+ | |___| (_) | | | | | | |_) | |  __/>  <
+  \_____\___/|_| |_| |_| .__/|_|\___/_/\_\
+                       | |
+                       |_|
+*/
+
+/* Pair of floats named i and q.
+ * NOTE(review): pf_cic.h and pf_cplx.h contain this identical typedef; a
+ * translation unit including more than one of these headers would see
+ * struct complexf_s defined twice - consider including pf_cplx.h instead. */
+typedef struct complexf_s { float i; float q; } complexf;
+
+
+/* generation functions */
+/* Each generator fills the caller-provided output buffer; the _f variants
+ * take a float buffer, the _s16 variants a short buffer.
+ * The three s16 generators generate_dc_pos_neg_fs4_s16(),
+ * generate_pos_neg_fs2_s16() and generate_dc_pos_neg_fs2_s16() write
+ * 2*size values (pairs) and require size to be a multiple of 4.
+ * NOTE(review): presumably the remaining generators follow the same
+ * buffer convention - confirm against pf_carrier.cpp. */
+void generate_dc_f(float* output, int size);
+void generate_dc_s16(short* output, int size);
+void generate_pos_fs4_f(float* output, int size);
+void generate_pos_fs4_s16(short* output, int size);
+void generate_neg_fs4_f(float* output, int size);
+void generate_neg_fs4_s16(short* output, int size);
+
+void generate_dc_pos_fs4_s16(short* output, int size);
+void generate_dc_neg_fs4_s16(short* output, int size);
+void generate_pos_neg_fs4_s16(short* output, int size);
+void generate_dc_pos_neg_fs4_s16(short* output, int size);
+
+void generate_pos_neg_fs2_s16(short* output, int size);
+void generate_dc_pos_neg_fs2_s16(short* output, int size);
+
+
+#ifdef __cplusplus
+}
+#endif
+
--- a/pffft/pf_cic.cpp
+++ b/pffft/pf_cic.cpp
@@ -0,0 +1,255 @@
+/*
+This software is part of pffft/pfdsp, a set of simple DSP routines.
+
+Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
+Copyright (c) 2020  Hayati Ayguen <h_ayguen@web.de>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* gcc requires this for M_PI !? */
+#undef __STRICT_ANSI__
+
+/* include own header first, to see missing includes */
+#include "pf_cic.h"
+#include "fmv.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+
+/*
+   ____ ___ ____   ____  ____   ____
+  / ___|_ _/ ___| |  _ \|  _ \ / ___|
+ | |    | | |     | | | | | | | |
+ | |___ | | |___  | |_| | |_| | |___
+  \____|___\____| |____/|____/ \____|
+*/
+
+/* The oscillator table covers one period with 2^SINESHIFT entries. */
+#define SINESHIFT 12
+#define SINESIZE (1 << SINESHIFT)
+
+/* data type used for integrators and combs */
+typedef int64_t cic_dt;
+
+/* Converter state kept between cicddc_*_c() calls.
+ * The "a" and "b" members are the two output channels, written to
+ * complexf::i and complexf::q respectively. */
+typedef struct {
+    int factor;          /* input samples consumed per output sample */
+    uint64_t phase;      /* oscillator phase; its top SINESHIFT bits index the table */
+    float gain;          /* scale applied to every output value */
+    /* integrator stages 0 and 1 */
+    cic_dt ig0a;
+    cic_dt ig0b;
+    cic_dt ig1a;
+    cic_dt ig1b;
+    /* comb stages 0 and 1 */
+    cic_dt comb0a;
+    cic_dt comb0b;
+    cic_dt comb1a;
+    cic_dt comb1b;
+    int16_t *sinetable;  /* heap table allocated by cicddc_init() */
+} cicddc_t;
+
+/* Allocates and initialises the state used by the cicddc_*_c() converters.
+ *
+ * factor: number of input samples consumed per output sample; the output
+ *         gain is divided by factor three times.
+ * Returns an opaque state pointer (release it with cicddc_free()), or NULL
+ * when a memory allocation fails.
+ */
+void *cicddc_init(int factor) {
+    int i;
+    int sinesize2 = SINESIZE * 5/4; // 25% extra to get cosine from the same table
+    cicddc_t *s;
+
+    // calloc zero-fills the state, exactly like the former malloc + memset,
+    // but the result is checked before it is touched
+    s = (cicddc_t *)calloc(1, sizeof(cicddc_t));
+    if (!s)
+        return NULL;
+
+    float sineamp = 32767.0f;
+    s->factor = factor;
+    s->gain = 1.0f / SHRT_MAX / sineamp / factor / factor / factor; // compensate for gain of 3 integrators
+
+    s->sinetable = (int16_t *)malloc(sinesize2 * sizeof(*s->sinetable));
+    if (!s->sinetable) {
+        // do not leak the state when the table cannot be allocated
+        free(s);
+        return NULL;
+    }
+    double f = 2.0 * M_PI / (double)SINESIZE;
+    for(i = 0; i < sinesize2; i++) {
+        s->sinetable[i] = sineamp * cos(f * i);
+    }
+    return s;
+}
+
+/* Releases a state created by cicddc_init(): the oscillator table first,
+ * then the state itself. A NULL state is accepted and ignored, matching
+ * the behaviour of free(). */
+void cicddc_free(void *state) {
+    cicddc_t *s = (cicddc_t *)state;
+    if (!s)
+        return;
+    free(s->sinetable);
+    free(s);
+}
+
+
+/* Mixes real 16-bit input with the table oscillator, then decimates by
+ * s->factor through the integrator/comb chain.
+ *
+ * state:   pointer returned by cicddc_init()
+ * input:   outsize * factor samples are read
+ * output:  outsize complexf values are written
+ * rate:    oscillator frequency in cycles per input sample; only its
+ *          fractional part is relevant (it is wrapped into [-0.5, 0.5))
+ * Integrator, comb and phase values are stored back into the state, so
+ * consecutive calls continue the same stream.
+ */
+PF_TARGET_CLONES
+void cicddc_s16_c(void *state, int16_t *input, complexf *output, int outsize, float rate) {
+    cicddc_t *s = (cicddc_t *)state;
+    int k;
+    int factor = s->factor;
+    cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
+    cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
+    uint64_t phase = s->phase, freq;
+    int16_t *sinetable = s->sinetable;
+    float gain = s->gain;
+
+    /* Phase increment = rate * 2^64, taken modulo 2^64.
+       Converting a negative (or >= 1.0) floating point value directly to
+       uint64_t is undefined behaviour, so the rate is first wrapped into
+       [-0.5, 0.5) and converted through int64_t; the signed-to-unsigned
+       cast then wraps in a well-defined way. */
+    {
+        double cycles = (double)rate;
+        cycles -= floor(cycles + 0.5);
+        if (!(cycles >= -0.5 && cycles < 0.5))   /* NaN or infinite rate */
+            cycles = 0.0;
+        freq = (uint64_t)(int64_t)ldexp(cycles, 64);
+    }
+
+    int16_t *inp = input;
+    for(k = 0; k < outsize; k++) {
+        int i;
+        cic_dt out0a, out0b, out1a, out1b;
+        cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
+        for(i = 0; i < factor; i++) {
+            cic_dt in_a, in_b;
+            int sinep = phase >> (64-SINESHIFT);   // top SINESHIFT bits select the table entry
+            in_a = (int32_t)inp[i] * (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];
+            in_b = (int32_t)inp[i] * (int32_t)sinetable[sinep];
+            phase += freq;
+            /* integrators:
+            The calculations are ordered so that each integrator
+            takes a result from previous loop iteration
+            to make the code more "pipeline-friendly". */
+            ig2a += ig1a; ig2b += ig1b;
+            ig1a += ig0a; ig1b += ig0b;
+            ig0a += in_a; ig0b += in_b;
+        }
+        inp += factor;
+        // comb filters:
+        out0a  = ig2a - comb0a;  out0b  = ig2b - comb0b;
+        comb0a = ig2a;           comb0b = ig2b;
+        out1a  = out0a - comb1a; out1b  = out0b - comb1b;
+        comb1a = out0a;          comb1b = out0b;
+
+        output[k].i = (float)out1a * gain;
+        output[k].q = (float)out1b * gain;
+    }
+
+    s->ig0a = ig0a; s->ig0b = ig0b;
+    s->ig1a = ig1a; s->ig1b = ig1b;
+    s->comb0a = comb0a; s->comb0b = comb0b;
+    s->comb1a = comb1a; s->comb1b = comb1b;
+    s->phase = phase;
+}
+
+/* Mixes complex 16-bit input (value pairs) with the table oscillator, then
+ * decimates by s->factor through the integrator/comb chain.
+ *
+ * state:   pointer returned by cicddc_init()
+ * input:   2 * outsize * factor int16 values are read (two per sample)
+ * output:  outsize complexf values are written
+ * rate:    oscillator frequency in cycles per input sample; only its
+ *          fractional part is relevant (it is wrapped into [-0.5, 0.5))
+ * Integrator, comb and phase values are stored back into the state, so
+ * consecutive calls continue the same stream.
+ */
+PF_TARGET_CLONES
+void cicddc_cs16_c(void *state, int16_t *input, complexf *output, int outsize, float rate) {
+    cicddc_t *s = (cicddc_t *)state;
+    int k;
+    int factor = s->factor;
+    cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
+    cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
+    uint64_t phase = s->phase, freq;
+    int16_t *sinetable = s->sinetable;
+    float gain = s->gain;
+
+    /* Phase increment = rate * 2^64, taken modulo 2^64.
+       Converting a negative (or >= 1.0) floating point value directly to
+       uint64_t is undefined behaviour, so the rate is first wrapped into
+       [-0.5, 0.5) and converted through int64_t; the signed-to-unsigned
+       cast then wraps in a well-defined way. */
+    {
+        double cycles = (double)rate;
+        cycles -= floor(cycles + 0.5);
+        if (!(cycles >= -0.5 && cycles < 0.5))   /* NaN or infinite rate */
+            cycles = 0.0;
+        freq = (uint64_t)(int64_t)ldexp(cycles, 64);
+    }
+
+    int16_t *inp = input;
+    for(k = 0; k < outsize; k++) {
+        int i;
+        cic_dt out0a, out0b, out1a, out1b;
+        cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
+        for(i = 0; i < factor; i++) {
+            cic_dt in_a, in_b;
+            int32_t m_a, m_b, m_c, m_d;
+            int sinep = phase >> (64-SINESHIFT);   // top SINESHIFT bits select the table entry
+            m_a = inp[2*i];
+            m_b = inp[2*i+1];
+            m_c = (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];
+            m_d = (int32_t)sinetable[sinep];
+            // complex multiplication:
+            in_a = m_a*m_c - m_b*m_d;
+            in_b = m_a*m_d + m_b*m_c;
+            phase += freq;
+            /* integrators:
+            The calculations are ordered so that each integrator
+            takes a result from previous loop iteration
+            to make the code more "pipeline-friendly". */
+            ig2a += ig1a; ig2b += ig1b;
+            ig1a += ig0a; ig1b += ig0b;
+            ig0a += in_a; ig0b += in_b;
+        }
+        inp += 2*factor;
+        // comb filters:
+        out0a  = ig2a - comb0a;  out0b  = ig2b - comb0b;
+        comb0a = ig2a;           comb0b = ig2b;
+        out1a  = out0a - comb1a; out1b  = out0b - comb1b;
+        comb1a = out0a;          comb1b = out0b;
+
+        output[k].i = (float)out1a * gain;
+        output[k].q = (float)out1b * gain;
+    }
+
+    s->ig0a = ig0a; s->ig0b = ig0b;
+    s->ig1a = ig1a; s->ig1b = ig1b;
+    s->comb0a = comb0a; s->comb0b = comb0b;
+    s->comb1a = comb1a; s->comb1b = comb1b;
+    s->phase = phase;
+}
+
+
+/* This is almost copy paste from cicddc_cs16_c.
+   I'm afraid this is going to be annoying to maintain... */
+/* Mixes complex unsigned 8-bit input (value pairs) with the table
+ * oscillator, then decimates by s->factor through the integrator/comb chain.
+ * Every input byte is shifted left by 8 and reduced by 32614 before mixing.
+ *
+ * state:   pointer returned by cicddc_init()
+ * input:   2 * outsize * factor bytes are read (two per sample)
+ * output:  outsize complexf values are written
+ * rate:    oscillator frequency in cycles per input sample; only its
+ *          fractional part is relevant (it is wrapped into [-0.5, 0.5))
+ * Integrator, comb and phase values are stored back into the state, so
+ * consecutive calls continue the same stream.
+ */
+PF_TARGET_CLONES
+void cicddc_cu8_c(void *state, uint8_t *input, complexf *output, int outsize, float rate) {
+    cicddc_t *s = (cicddc_t *)state;
+    int k;
+    int factor = s->factor;
+    cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
+    cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
+    uint64_t phase = s->phase, freq;
+    int16_t *sinetable = s->sinetable;
+    float gain = s->gain;
+
+    /* Phase increment = rate * 2^64, taken modulo 2^64.
+       Converting a negative (or >= 1.0) floating point value directly to
+       uint64_t is undefined behaviour, so the rate is first wrapped into
+       [-0.5, 0.5) and converted through int64_t; the signed-to-unsigned
+       cast then wraps in a well-defined way. */
+    {
+        double cycles = (double)rate;
+        cycles -= floor(cycles + 0.5);
+        if (!(cycles >= -0.5 && cycles < 0.5))   /* NaN or infinite rate */
+            cycles = 0.0;
+        freq = (uint64_t)(int64_t)ldexp(cycles, 64);
+    }
+
+    uint8_t *inp = input;
+    for(k = 0; k < outsize; k++) {
+        int i;
+        cic_dt out0a, out0b, out1a, out1b;
+        cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
+        for(i = 0; i < factor; i++) {
+            cic_dt in_a, in_b;
+            int32_t m_a, m_b, m_c, m_d;
+            int sinep = phase >> (64-SINESHIFT);   // top SINESHIFT bits select the table entry
+            // subtract 127.4 (good for rtl-sdr)
+            m_a = (((int32_t)inp[2*i])   << 8) - 32614;
+            m_b = (((int32_t)inp[2*i+1]) << 8) - 32614;
+            m_c = (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];
+            m_d = (int32_t)sinetable[sinep];
+            // complex multiplication:
+            in_a = m_a*m_c - m_b*m_d;
+            in_b = m_a*m_d + m_b*m_c;
+            phase += freq;
+            /* integrators:
+            The calculations are ordered so that each integrator
+            takes a result from previous loop iteration
+            to make the code more "pipeline-friendly". */
+            ig2a += ig1a; ig2b += ig1b;
+            ig1a += ig0a; ig1b += ig0b;
+            ig0a += in_a; ig0b += in_b;
+        }
+        inp += 2*factor;
+        // comb filters:
+        out0a  = ig2a - comb0a;  out0b  = ig2b - comb0b;
+        comb0a = ig2a;           comb0b = ig2b;
+        out1a  = out0a - comb1a; out1b  = out0b - comb1b;
+        comb1a = out0a;          comb1b = out0b;
+
+        output[k].i = (float)out1a * gain;
+        output[k].q = (float)out1b * gain;
+    }
+
+    s->ig0a = ig0a; s->ig0b = ig0b;
+    s->ig1a = ig1a; s->ig1b = ig1b;
+    s->comb0a = comb0a; s->comb0b = comb0b;
+    s->comb1a = comb1a; s->comb1b = comb1b;
+    s->phase = phase;
+}
+
--- a/pffft/pf_cic.h
+++ b/pffft/pf_cic.h
@@ -0,0 +1,58 @@
+/*
+This software is part of pffft/pfdsp, a set of simple DSP routines.
+
+Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
+Copyright (c) 2020  Hayati Ayguen <h_ayguen@web.de>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+   ____ ___ ____   ____  ____   ____
+  / ___|_ _/ ___| |  _ \|  _ \ / ___|
+ | |    | | |     | | | | | | | |
+ | |___ | | |___  | |_| | |_| | |___
+  \____|___\____| |____/|____/ \____|
+*/
+
+/* Pair of floats named i and q; element type of the converter output.
+ * NOTE(review): pf_carrier.h and pf_cplx.h contain this identical typedef; a
+ * translation unit including more than one of these headers would see
+ * struct complexf_s defined twice - consider including pf_cplx.h instead. */
+typedef struct complexf_s { float i; float q; } complexf;
+
+/* Creates the converter state; factor is the number of input samples
+ * consumed per output sample. Release the result with cicddc_free(). */
+void *cicddc_init(int factor);
+/* Frees a state created by cicddc_init(), including its oscillator table. */
+void cicddc_free(void *state);
+/* Real int16 input: reads outsize*factor samples, writes outsize outputs. */
+void cicddc_s16_c(void *state, int16_t *input, complexf *output, int outsize, float rate);
+/* Complex int16 input: reads 2*outsize*factor values, writes outsize outputs. */
+void cicddc_cs16_c(void *state, int16_t *input, complexf *output, int outsize, float rate);
+/* Complex uint8 input: reads 2*outsize*factor bytes, writes outsize outputs. */
+void cicddc_cu8_c(void *state, uint8_t *input, complexf *output, int outsize, float rate);
+
+#ifdef __cplusplus
+}
+#endif
+
--- a/pffft/pf_conv.cpp
+++ b/pffft/pf_conv.cpp
@@ -0,0 +1,322 @@
+
+#include "pf_conv.h"
+
+#include <string.h>
+#include <assert.h>
+
+#include <algorithm>
+
+// Debug trace switch: change "#if 0" to "#if 1" to send DPRINT() output to
+// stderr. In the disabled form DPRINT() expands to an empty statement and
+// its arguments are dropped by the preprocessor (never evaluated).
+#if 0
+#include <stdio.h>
+
+#define DPRINT(...) fprintf(stderr, __VA_ARGS__)
+
+#else
+#define DPRINT(...) do { } while (0)
+#endif
+
+
+#ifdef HAVE_MIPP
+#include <mipp.h>
+#endif
+
+
+// CONV_ARCH_POST is the architecture suffix of this translation unit; it must
+// be supplied from outside this file, otherwise compilation stops here.
+#ifndef CONV_ARCH_POST
+#error CONV_ARCH_POST not defined
+#endif
+
+// two-level helpers, so that macro arguments are expanded before being
+// stringified / token-pasted
+#define PP_STRINGIFY(X) #X
+#define PP_TOSTRING(X)  PP_STRINGIFY(X)
+#define PP_CONCAT_IMPL(x, y) x##y
+#define PP_CONCAT(x, y) PP_CONCAT_IMPL( x, y )
+
+// ARCHFUNCNAME(foo) yields foo_<CONV_ARCH_POST>, e.g. conv_ptrs_none
+#define ARCHFUNCNAME(X) PP_CONCAT(X##_,CONV_ARCH_POST)
+
+
+// Returns the architecture suffix (CONV_ARCH_POST) of this build as text.
+const char * ARCHFUNCNAME(id)()
+{
+    const char * const arch_name = PP_TOSTRING(CONV_ARCH_POST);
+    return arch_name;
+}
+
+
+// Number of floats handled per vector register: mipp::N<float>() when MIPP
+// is available with intrinsics, otherwise 1 (scalar code).
+int ARCHFUNCNAME(conv_float_simd_size)()
+{
+#if defined(HAVE_MIPP) && !defined(MIPP_NO_INTRINSICS)
+    return mipp::N<float>();
+#else
+    // have a completely MIPP independent implementation
+    return 1;
+#endif
+}
+
+
+// Moves the samples that a previous conv_float_*() call left unprocessed,
+// i.e. s[state->offset .. state->size-1], to the start of the buffer.
+// Afterwards state->offset is 0 and state->size is the number of samples
+// kept (0 when nothing was left).
+void ARCHFUNCNAME(conv_float_move_rest)(float * RESTRICT s, conv_buffer_state * RESTRICT state)
+{
+    int R = state->size - state->offset;    // this many samples from prev conv_float were not processed
+    if (R > 0)
+    {
+        // With offset == 0 the data already sits at the begin; std::copy must
+        // not be called then, because its destination would lie inside the
+        // source range. For offset > 0 the destination precedes the source,
+        // which a forward copy handles correctly.
+        if (state->offset != 0)
+            std::copy(&s[state->offset], &s[state->size], s);
+    }
+    else
+        R = 0;
+    state->offset = 0;      // data - to be processed - is at begin
+    state->size = R;        // this many unprocessed samples
+}
+
+
+// Moves the complex samples that a previous conv_cplx_*() call left
+// unprocessed, i.e. s[state->offset .. state->size-1], to the start of the
+// buffer. Afterwards state->offset is 0 and state->size is the number of
+// samples kept (0 when nothing was left).
+void ARCHFUNCNAME(conv_cplx_move_rest)(complexf * RESTRICT s, conv_buffer_state * RESTRICT state)
+{
+    int R = state->size - state->offset;    // this many samples from prev conv_cplx were not processed
+    if (R > 0)
+    {
+        // With offset == 0 the data already sits at the begin; std::copy must
+        // not be called then, because its destination would lie inside the
+        // source range. For offset > 0 the destination precedes the source,
+        // which a forward copy handles correctly.
+        if (state->offset != 0)
+            std::copy(&s[state->offset], &s[state->size], s);
+    }
+    else
+        R = 0;
+    state->offset = 0;      // data - to be processed - is at begin
+    state->size = R;        // this many unprocessed samples
+}
+
+
+#if defined(MIPP_NO_INTRINSICS)
+// have a completely MIPP independent implementation
+// #error missing HAVE_MIPP: there is no MIPP-independent implementation
+
+// Scalar in-place convolution: for every position whose window of sz_filter
+// samples still fits below state->size, s[pos] is replaced by the dot product
+// of that window with filter. state->offset is advanced past the processed
+// positions; the number of produced samples is returned.
+int ARCHFUNCNAME(conv_float_inplace)(
+        float * RESTRICT s, conv_buffer_state * RESTRICT state,
+        const float * RESTRICT filter, const int sz_filter
+        )
+{
+    const int first = state->offset;
+    const int n_samples = state->size;
+    int pos = first;
+
+    while (pos + sz_filter <= n_samples)
+    {
+        float sum = 0.0F;
+        for (int tap = 0; tap < sz_filter; ++tap)
+            sum += s[pos + tap] * filter[tap];
+        // safe to overwrite: later windows start behind pos
+        s[pos] = sum;
+        ++pos;
+    }
+
+    state->offset = pos;
+    return pos - first;
+}
+
+
+// Scalar out-of-place convolution: for every position whose window of
+// sz_filter samples still fits below state->size, y[pos] receives the dot
+// product of that window of s with filter. state->offset is advanced past the
+// processed positions; the number of produced samples is returned.
+int ARCHFUNCNAME(conv_float_oop)(
+        const float * RESTRICT s, conv_buffer_state * RESTRICT state,
+        const float * RESTRICT filter, const int sz_filter,
+        float * RESTRICT y
+        )
+{
+    const int first = state->offset;
+    const int n_samples = state->size;
+    int pos = first;
+
+    while (pos + sz_filter <= n_samples)
+    {
+        float sum = 0.0F;
+        for (int tap = 0; tap < sz_filter; ++tap)
+            sum += s[pos + tap] * filter[tap];
+        y[pos] = sum;
+        ++pos;
+    }
+
+    state->offset = pos;
+    return pos - first;
+}
+
+
+// Scalar out-of-place convolution of complex samples with a real filter:
+// real and imaginary parts are filtered independently with the same taps.
+// For every position whose window of sz_filter samples still fits below
+// state->size, y_cplx[pos] receives the two dot products. state->offset is
+// advanced past the processed positions; the number of produced samples is
+// returned.
+int ARCHFUNCNAME(conv_cplx_float_oop)(
+        const complexf * RESTRICT s_cplx, conv_buffer_state * RESTRICT state,
+        const float * RESTRICT filter, const int sz_filter,
+        complexf * RESTRICT y_cplx
+        )
+{
+    const int off0 = state->offset;
+    const int sz_s = state->size;
+    const int sz_f = sz_filter;
+    int offset;
+
+    for ( offset = off0; offset + sz_f <= sz_s; ++offset)
+    {
+        float accu_re = 0.0F;
+        float accu_im = 0.0F;
+        for (int k = 0; k < sz_filter; ++k)
+        {
+            // accumulate over all taps (plain assignment kept only the last tap)
+            accu_re += s_cplx[offset+k].i * filter[k];   // accu += rS * rH;
+            accu_im += s_cplx[offset+k].q * filter[k];   // accu += rS * rH;
+        }
+        y_cplx[offset].i = accu_re;  // == hadd() == sum of real parts
+        y_cplx[offset].q = accu_im;  // == hadd() == sum of imag parts
+    }
+
+    state->offset = offset;
+    return offset - off0;
+}
+
+
+#elif defined(HAVE_MIPP)
+
+
+// MIPP in-place convolution: for every position whose window of sz_filter
+// samples still fits below state->size, s[offset] is replaced by the dot
+// product of that window with filter, computed mipp::N<float>() lanes at a
+// time. sz_filter must be a multiple of the lane count. state->offset is
+// advanced past the processed positions; the number of produced samples is
+// returned.
+int ARCHFUNCNAME(conv_float_inplace)(
+        float * RESTRICT s, conv_buffer_state * RESTRICT state,
+        const float * RESTRICT filter, const int sz_filter
+        )
+{
+    assert( (sz_filter % mipp::N<float>()) == 0 );  // size of filter must be divisible by conv_float_simd_size()
+
+    mipp::Reg<float> accu, rS, rH;
+    const int off0 = state->offset;
+    const int sz_s = state->size;
+    int offset;
+
+    for ( offset = off0; offset + sz_filter <= sz_s; ++offset)
+    {
+        accu.set0();
+        for (int k = 0; k < sz_filter; k += mipp::N<float>())
+        {
+            // the window start moves by one float per output, so its address
+            // is not vector aligned in general: use the unaligned load, as
+            // conv_float_oop does
+            rS.loadu(&s[offset+k]);
+            rH.load(&filter[k]);
+            accu = mipp::fmadd(rS, rH, accu);   // accu += rS * rH;
+        }
+        s[offset] = accu.sum();    // == hadd()
+    }
+
+    state->offset = offset;
+    return offset - off0;
+}
+
+
+// MIPP out-of-place convolution: for every position whose window of
+// sz_filter samples still fits below state->size, y[pos] receives the dot
+// product of that window of s with filter, computed one vector register at a
+// time. sz_filter must be a multiple of mipp::N<float>(). state->offset is
+// advanced past the processed positions; the number of produced samples is
+// returned.
+int ARCHFUNCNAME(conv_float_oop)(
+        const float * RESTRICT s, conv_buffer_state * RESTRICT state,
+        const float * RESTRICT filter, const int sz_filter,
+        float * RESTRICT y
+        )
+{
+    assert( (sz_filter % mipp::N<float>()) == 0 );  // size of filter must be divisible by conv_float_simd_size()
+
+    mipp::Reg<float> acc, samples, taps;
+    const int lanes = mipp::N<float>();
+    const int first = state->offset;
+    const int n_samples = state->size;
+    int pos = first;
+
+    while (pos + sz_filter <= n_samples)
+    {
+        acc.set0();
+        for (int tap = 0; tap < sz_filter; tap += lanes)
+        {
+            samples.loadu(&s[pos + tap]);           // window start is not aligned in general
+            taps.load(&filter[tap]);
+            acc = mipp::fmadd(samples, taps, acc);  // acc += samples * taps
+        }
+        y[pos] = acc.sum();    // horizontal add of all lanes
+        ++pos;
+    }
+
+    state->offset = pos;
+    return pos - first;
+}
+
+
+// MIPP out-of-place convolution of complex samples with a real filter.
+// The complexf arrays are accessed as flat float arrays (i, q, i, q, ...),
+// so all local offsets and sizes are counted in floats (2 per sample), while
+// state->offset / state->size and the return value are counted in samples.
+// sz_filter must be a multiple of mipp::N<float>().
+int ARCHFUNCNAME(conv_cplx_float_oop)(
+        const complexf * RESTRICT s_cplx, conv_buffer_state * RESTRICT state,
+        const float * RESTRICT filter, const int sz_filter,
+        complexf * RESTRICT y_cplx
+        )
+{
+    assert( (sz_filter % mipp::N<float>()) == 0 );  // size of filter must be divisible by conv_float_simd_size()
+    // float views onto the complex buffers
+    const float * RESTRICT s = &(s_cplx[0].i);
+    float * RESTRICT y = &(y_cplx[0].i);
+
+    mipp::Regx2<float> accu_x2, rS_x2, H_x2;
+    const int off0 = 2 * state->offset;
+    const int sz_s = 2 * state->size;
+    const int sz_f2 = 2 * sz_filter;
+    int offset;
+
+    // one complex output per pass: advance by 2 floats
+    for ( offset = off0; offset + sz_f2 <= sz_s; offset += 2)
+    {
+        accu_x2.val[0].set0();
+        accu_x2.val[1].set0();
+        for (int k = 0; k < sz_filter; k += mipp::N<float>())
+        {
+            mipp::Reg<float> rH;
+            // two registers worth of interleaved i/q floats
+            rS_x2.loadu(&s[offset+2*k]);
+            rH.load(&filter[k]);
+            // interleaving rH with itself - presumably duplicates every tap so
+            // that it lines up with both the i and the q float of a sample;
+            // confirm against the MIPP documentation
+            H_x2 = mipp::interleave<float>(rH, rH);
+            accu_x2.val[0] = mipp::fmadd(rS_x2.val[0], H_x2.val[0], accu_x2.val[0]);   // accu += rS * rH;
+            accu_x2.val[1] = mipp::fmadd(rS_x2.val[1], H_x2.val[1], accu_x2.val[1]);   // accu += rS * rH;
+        }
+        // H_x2 is reused as scratch for the separated real / imag partial sums
+        H_x2 = mipp::deinterleave(accu_x2);
+        y[offset]   = H_x2.val[0].sum();  // == hadd() == sum of real parts
+        y[offset+1] = H_x2.val[1].sum();  // == hadd() == sum of imag parts
+    }
+
+    // convert the float index back to a sample index
+    state->offset = offset /2;
+    return (offset - off0) / 2;
+}
+
+#endif
+
+
+// Function pointer table of this architecture build; the initializer order
+// follows the member order of struct conv_f_ptrs.
+static const conv_f_ptrs conv_ptrs =
+{
+    // id
+    PP_TOSTRING(CONV_ARCH_POST),
+    // using_mipp
+    // NOTE(review): this is 1 whenever MIPP_NO_INTRINSICS is undefined, also
+    // when HAVE_MIPP is undefined and mipp.h was never included - confirm
+    // that this is intended.
+#ifndef MIPP_NO_INTRINSICS
+    1,
+#else
+    0,
+#endif
+
+    ARCHFUNCNAME(id),
+    ARCHFUNCNAME(conv_float_simd_size),
+
+    // the kernels exist only when one of the two implementations above was
+    // compiled; otherwise the table carries null pointers
+#if defined(MIPP_NO_INTRINSICS) || defined(HAVE_MIPP)
+    ARCHFUNCNAME(conv_float_move_rest),
+    ARCHFUNCNAME(conv_float_inplace),
+    ARCHFUNCNAME(conv_float_oop),
+
+    ARCHFUNCNAME(conv_cplx_move_rest),
+    ARCHFUNCNAME(conv_cplx_float_oop)
+#else
+    nullptr,
+    nullptr,
+    nullptr,
+
+    nullptr,
+    nullptr
+#endif
+};
+
+
+// Returns the function pointer table of this architecture build, or nullptr
+// when the build offers no usable implementation:
+//  - id "none": the table is always returned
+//  - MIPP_NO_INTRINSICS defined: the table (scalar kernels) is returned
+//  - HAVE_MIPP defined: the table is returned only if using_mipp is set and
+//    the SIMD width is greater than 1
+//  - otherwise: nullptr
+const conv_f_ptrs* ARCHFUNCNAME(conv_ptrs)()
+{
+    DPRINT("arch pointer for '%s':\n", conv_ptrs.id);
+    if (!strcmp(conv_ptrs.id, "none"))
+        return &conv_ptrs;
+
+#if defined(MIPP_NO_INTRINSICS)
+    DPRINT("arch pointer for '%s' - BUT defined(MIPP_NO_INTRINSICS)\n", conv_ptrs.id);
+    return &conv_ptrs;
+#elif defined(HAVE_MIPP)
+    DPRINT("arch pointer for '%s' - defined(HAVE_MIPP)\n", conv_ptrs.id);
+    DPRINT("'%s': conv_ptrs.using_mipp %d\n", conv_ptrs.id, conv_ptrs.using_mipp);
+    DPRINT("'%s': simd_size() %d\n", conv_ptrs.id, conv_ptrs.fp_conv_float_simd_size());
+    if (conv_ptrs.using_mipp && conv_ptrs.fp_conv_float_simd_size() > 1)
+        return &conv_ptrs;
+    else
+        DPRINT("arch pointer for '%s': HAVE_MIPP BUT using_mipp %d, float_simd_size %d\n", conv_ptrs.id, conv_ptrs.using_mipp, conv_ptrs.fp_conv_float_simd_size());
+#else
+    DPRINT("arch pointer for '%s': neither MIPP_NO_INTRINSICS nor HAVE_MIPP\n", conv_ptrs.id);
+#endif
+    // reached when no branch above returned the table
+    DPRINT("arch pointer for '%s' => nullptr\n", conv_ptrs.id);
+    return nullptr;
+}
+
+// Compile-time check only: assigning the accessor to an f_conv_ptrs variable
+// fails to compile if its signature drifts from the typedef in pf_conv.h.
+// The variable is otherwise unused, hence [[maybe_unused]] where available.
+#if defined(__cplusplus) && (__cplusplus >= 201703L)
+[[maybe_unused]]
+#endif
+static f_conv_ptrs test_f_ptrs = ARCHFUNCNAME(conv_ptrs);
+
--- a/pffft/pf_conv.h
+++ b/pffft/pf_conv.h
@@ -0,0 +1,109 @@
+#pragma once
+
+/* pf_conv.h/.cpp implements linear "slow" convolution.
+ * this code is primarily for test/demonstration of runtime dispatching.
+ * each "kernel" is compiled with different compiler/architecture options,
+ * that activates different implementations in the MIPP headers.
+ *
+ * the dispatcher library 'pf_conv_dispatcher' collects (links against)
+ * all the pf_conv_arch_<opt> libraries ..
+ * and provides the  get_all_conv_arch_ptrs() function,
+ * which delivers an array of pointers to the struct (conv_f_ptrs)
+ * containing the function pointers for the different implementations.
+ *
+ * requirement(s):
+ * - installed MIPP headers
+ * - compiler definitions for the different architecture types:
+ *   see CMakeLists.txt CONV_ARCH_MSVC_AMD64, CONV_ARCH_GCC_ARM32NEON, ..
+ * - one cmake library target pf_conv_arch_<opt> for each architecture option.
+ *   each one gets its specific architecture/compiler options
+ *    utilizing the target_set_cxx_arch_option() macro in the CMakeLists.txt
+ */
+
+#include "pf_cplx.h"
+
+// RESTRICT: the compiler's no-alias pointer qualifier where one is known
+// (MSVC and GNU-compatible compilers), otherwise nothing.
+#if defined(_MSC_VER) || defined(__GNUC__)
+#  define RESTRICT __restrict
+#else
+#  define RESTRICT
+#endif
+
+
+// Fill state of a sample buffer shared between the convolution calls.
+// The conv_*() kernels advance offset; the *_move_rest() helpers move the
+// unprocessed tail to the buffer start and reset offset to 0.
+struct conv_buffer_state
+{
+    int offset; // sample index where data (to process) starts
+    int size;   // actual - or previous - size in amount of samples from buffer start (NOT offset)
+};
+
+// declare provided function pointer types
+
+// returns the architecture name of an implementation
+typedef const char * (*f_conv_id)();
+
+// returns the number of floats processed per SIMD register (1 for scalar code)
+typedef int  (*f_conv_float_simd_size)();
+
+// move the unprocessed samples s[offset .. size-1] to the buffer start and
+// update state accordingly
+typedef void (*f_conv_float_move_rest)(float * RESTRICT s, conv_buffer_state * RESTRICT state);
+typedef void (*f_conv_cplx_move_rest)(complexf * RESTRICT s, conv_buffer_state * RESTRICT state);
+
+// convolve in place: results overwrite s[] starting at state->offset;
+// returns the number of produced samples and advances state->offset
+typedef int  (*f_conv_float_inplace)(
+        float * RESTRICT s, conv_buffer_state * RESTRICT state,
+        const float * RESTRICT filter, const int sz_filter
+        );
+
+// convolve out of place: results are written to y[] at the same indices as
+// the window starts in s[]; returns the number of produced samples
+typedef int  (*f_conv_float_oop)(
+        const float * RESTRICT s, conv_buffer_state * RESTRICT state,
+        const float * RESTRICT filter, const int sz_filter,
+        float * RESTRICT y
+        );
+
+// as f_conv_float_oop, but for complex samples with a real-valued filter
+typedef int  (*f_conv_cplx_float_oop)(
+        const complexf * RESTRICT s, conv_buffer_state * RESTRICT state,
+        const float * RESTRICT filter, const int sz_filter,
+        complexf * RESTRICT y
+        );
+
+
+// struct with the provided function pointers
+// One instance exists per compiled architecture variant. The kernel pointers
+// may be null when a variant was built without any implementation.
+struct conv_f_ptrs
+{
+    const char * id;            // architecture name, e.g. "none"
+    const int using_mipp;       // non-zero unless built with MIPP_NO_INTRINSICS
+    f_conv_id               fp_id;
+    f_conv_float_simd_size  fp_conv_float_simd_size;
+
+    f_conv_float_move_rest  fp_conv_float_move_rest;
+    f_conv_float_inplace    fp_conv_float_inplace;
+    f_conv_float_oop        fp_conv_float_oop;
+
+    f_conv_cplx_move_rest   fp_conv_cplx_move_rest;
+    f_conv_cplx_float_oop   fp_conv_cplx_float_oop;
+};
+
+// element type of the array delivered by get_all_conv_arch_ptrs()
+typedef const conv_f_ptrs * ptr_to_conv_f_ptrs;
+
+// function pointer type, delivering the struct with the function pointers
+typedef const conv_f_ptrs* (*f_conv_ptrs)();
+
+
+// helper for systematic function names
+// CONV_FN_ARCH(conv_ptrs, sse3) expands to conv_ptrs_sse3
+#define CONV_FN_ARCH(FN, ARCH) FN##_##ARCH
+
+// declare all functions - returning the structs with the function pointers
+// Each accessor may return nullptr when its variant has no usable
+// implementation. Only the accessors of the variants that are actually built
+// and linked may be referenced.
+extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, none)();  // = conv_ptrs_none()
+extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, dflt)();  // simd / mipp is activated
+
+// variants used for CONV_ARCH_GCC_AMD64
+extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, sse3)();  // = conv_ptrs_sse3()
+extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, sse4)();
+extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx)();
+extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx2)();
+
+// variants used for CONV_ARCH_MSVC_AMD64
+extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, sse2)();
+//extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx)();  // already declared
+//extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx2)(); // already declared
+
+// variants used for CONV_ARCH_GCC_ARM32NEON
+extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, neon_vfpv4)();    // for armv7l / 32-bit ARM
+extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, neon_rpi3_a53)();
+extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, neon_rpi4_a72)();
+
+// variant used for CONV_ARCH_GCC_AARCH64
+extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, armv8a)();  // for aarch64
--- a/pffft/pf_conv_dispatcher.cpp
+++ b/pffft/pf_conv_dispatcher.cpp
@@ -0,0 +1,61 @@
+
+#include "pf_conv_dispatcher.h"
+
+// Debug trace switch: change "#if 0" to "#if 1" to send DPRINT() output to
+// stderr. In the disabled form DPRINT() expands to an empty statement and
+// its arguments are dropped by the preprocessor (never evaluated).
+#if 0
+#include <stdio.h>
+
+#define DPRINT(...) fprintf(stderr, __VA_ARGS__)
+
+#else
+#define DPRINT(...) do { } while (0)
+#endif
+
+
+#define N_DEFAULT_ARCHES  2
+// 0 is "none"
+// 1 "dflt"
+
+// Returns the array of function pointer tables of all architecture variants
+// compiled into this library and, through p_num_arch (optional, may be
+// null), the number of array entries.
+// Entry 0 is always "none", entry 1 "dflt"; the architecture specific
+// variants selected by the CONV_ARCH_* definition follow.
+// An entry is nullptr when the accessor of that variant returned nullptr,
+// so callers have to check every entry before use.
+// The array is built on the first call and reused afterwards.
+ptr_to_conv_f_ptrs * get_all_conv_arch_ptrs(int * p_num_arch)
+{
+    static ptr_to_conv_f_ptrs * all_arches = nullptr;
+    static int n_arch = 0;
+    if (!all_arches)
+    {
+        n_arch = N_DEFAULT_ARCHES;
+        // @TODO: runtime check if actual CPU supports specific architecture
+#if defined(CONV_ARCH_GCC_AMD64)
+        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+4] = {0};
+        DPRINT("CONV_ARCH_GCC_AMD64: sse3, sse4, avx, avx2\n");
+        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, sse3)();
+        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, sse4)();
+        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx) ();
+        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx2)();
+#elif defined(CONV_ARCH_MSVC_AMD64)
+        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+3] = {0};
+        DPRINT("CONV_ARCH_MSVC_AMD64: sse2, avx, avx2\n");
+        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, sse2)();
+        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx) ();
+        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx2)();
+#elif defined(CONV_ARCH_GCC_ARM32NEON)
+        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+3] = {0};
+        // the message now names all three variants registered below
+        DPRINT("CONV_ARCH_GCC_ARM32NEON: neon_vfpv4, neon_rpi3_a53, neon_rpi4_a72\n");
+        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, neon_vfpv4)();
+        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, neon_rpi3_a53)();
+        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, neon_rpi4_a72)();
+#elif defined(CONV_ARCH_GCC_AARCH64)
+        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+1] = {0};
+        DPRINT("CONV_ARCH_GCC_AARCH64: -\n");
+        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, armv8a)();
+#else
+        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES] = {0};
+        DPRINT("unknown CONV_ARCH: -\n");
+#endif
+        // the two default variants exist for every CONV_ARCH_* selection
+        conv_arch_ptrs[0] = CONV_FN_ARCH(conv_ptrs, none)();
+        conv_arch_ptrs[1] = CONV_FN_ARCH(conv_ptrs, dflt)();
+        all_arches = conv_arch_ptrs;
+    }
+    if (p_num_arch)
+        *p_num_arch = n_arch;
+    return all_arches;
+}
+
--- a/pffft/pf_conv_dispatcher.h
+++ b/pffft/pf_conv_dispatcher.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include "pf_conv.h"
+
+// Returns the array of function pointer tables of all compiled-in
+// architecture variants; the entry count is stored in *p_num_arch when
+// p_num_arch is not null. Individual entries may be nullptr.
+ptr_to_conv_f_ptrs * get_all_conv_arch_ptrs(int * p_num_arch);
+
--- a/pffft/pf_cplx.h
+++ b/pffft/pf_cplx.h
@@ -0,0 +1,44 @@
+/*
+This software is part of pffft/pfdsp, a set of simple DSP routines.
+
+Copyright (c) 2020  Hayati Ayguen <h_ayguen@web.de>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+/*
+   _____                      _
+  / ____|                    | |
+ | |     ___  _ __ ___  _ __ | | _____  __
+ | |    / _ \| '_ ` _ \| '_ \| |/ _ \ \/ /
+ | |___| (_) | | | | | | |_) | |  __/>  <
+  \_____\___/|_| |_| |_| .__/|_|\___/_/\_\
+                       | |
+                       |_|
+*/
+
+/* Single-precision complex sample stored as two consecutive floats.
+ * NOTE(review): i/q presumably mean in-phase (real) and quadrature (imaginary) - confirm.
+ */
+typedef struct complexf_s { float i; float q; } complexf;
+
--- a/pffft/pf_mixer.cpp
+++ b/pffft/pf_mixer.cpp
--- a/pffft/pf_mixer.h
+++ b/pffft/pf_mixer.h
@@ -0,0 +1,270 @@
+/*
+This software is part of pffft/pfdsp, a set of simple DSP routines.
+
+Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
+Copyright (c) 2020  Hayati Ayguen <h_ayguen@web.de>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <stdio.h>
+#include <stdint.h>
+
+#include "pf_cplx.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+// =================================================================================
+
+/* NOTE(review): presumably returns non-zero when the *_sse_* mixer variants
+ * declared below are actually implemented in this build - confirm in pf_mixer.cpp.
+ */
+int have_sse_shift_mixer_impl();
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO A ***/
+/**************/
+
+/* Frequency shift from input[] to output[], input_size complex samples.
+ * NOTE(review): the float return value is presumably the phase to pass as
+ * starting_phase to the next call - confirm in pf_mixer.cpp.
+ */
+float shift_math_cc(const complexf *input, complexf* output, int input_size, float rate, float starting_phase);
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO B ***/
+/**************/
+
+/* State of the table based shifter: a float table and its element count. */
+typedef struct shift_table_data_s
+{
+    float* table;     /* NOTE(review): presumably allocated by shift_table_init() and released by shift_table_deinit() - confirm */
+    int table_size;   /* number of entries in table[] */
+} shift_table_data_t;
+
+/* The state is passed and returned by value; copies share the same table pointer. */
+void shift_table_deinit(shift_table_data_t table_data);
+shift_table_data_t shift_table_init(int table_size);
+/* NOTE(review): return value presumably is the phase for the next call - confirm. */
+float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase);
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO C ***/
+/**************/
+
+/* State of the "addfast" shifter: four sine and four cosine values plus the
+ * phase increment. The struct owns no heap memory (fixed-size arrays only).
+ */
+typedef struct shift_addfast_data_s
+{
+    float dsin[4];
+    float dcos[4];
+    float phase_increment;
+} shift_addfast_data_t;
+
+shift_addfast_data_t shift_addfast_init(float rate);
+/* _cc: separate input/output buffers; _inp_c: operates on in_out[] in place.
+ * NOTE(review): return value presumably is the phase for the next call - confirm.
+ */
+float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase);
+float shift_addfast_inp_c(complexf *in_out, int N_cplx, shift_addfast_data_t* d, float starting_phase);
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO D ***/
+/**************/
+
+/* State of the "unroll" shifter: sine/cosine tables referenced by pointer,
+ * the phase increment and a size.
+ */
+typedef struct shift_unroll_data_s
+{
+    float* dsin;   /* NOTE(review): presumably 'size' entries, owned by this state - confirm */
+    float* dcos;   /* NOTE(review): presumably 'size' entries, owned by this state - confirm */
+    float phase_increment;
+    int size;
+} shift_unroll_data_t;
+
+shift_unroll_data_t shift_unroll_init(float rate, int size);
+/* NOTE(review): presumably releases dsin/dcos of *d - confirm in pf_mixer.cpp. */
+void shift_unroll_deinit(shift_unroll_data_t* d);
+/* _cc: separate input/output buffers; _inp_c: operates on in_out[] in place. */
+float shift_unroll_cc(complexf *input, complexf* output, int size, shift_unroll_data_t* d, float starting_phase);
+float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase);
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO E ***/
+/**************/
+
+/* similar to shift_unroll_cc() - but with a fixed and limited precalc size
+ * idea: smaller cache usage by the table
+ * size must be a multiple of PF_SHIFT_LIMITED_SIMD_SZ (= 4)
+ */
+#define PF_SHIFT_LIMITED_UNROLL_SIZE  128
+#define PF_SHIFT_LIMITED_SIMD_SZ  4
+
+/* State of the limited-unroll shifter: fixed-size cosine/sine tables, the
+ * current complex phase and the phase increment. Owns no heap memory.
+ */
+typedef struct shift_limited_unroll_data_s
+{
+    float dcos[PF_SHIFT_LIMITED_UNROLL_SIZE];
+    float dsin[PF_SHIFT_LIMITED_UNROLL_SIZE];
+    complexf complex_phase;
+    float phase_increment;
+} shift_limited_unroll_data_t;
+
+shift_limited_unroll_data_t shift_limited_unroll_init(float rate);
+/* size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ */
+/* starting_phase for next call is kept internal in state */
+void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d);
+void shift_limited_unroll_inp_c(complexf* in_out, int size, shift_limited_unroll_data_t* d);
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO F ***/
+/**************/
+
+/* State of SSE variant A: separate cosine and sine tables, each with
+ * PF_SHIFT_LIMITED_SIMD_SZ entries more than PF_SHIFT_LIMITED_UNROLL_SIZE.
+ * NOTE(review): the SSE code presumably needs this struct 16-byte aligned - confirm.
+ */
+typedef struct shift_limited_unroll_A_sse_data_s
+{
+    /* small/limited trig table */
+    float dcos[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
+    float dsin[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
+    /* 4 times complex phase */
+    float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
+    float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
+    /* N_cplx_per_block times increment - for future parallel variants */
+    float dcos_blk;
+    float dsin_blk;
+    /* */
+    float phase_increment;
+} shift_limited_unroll_A_sse_data_t;
+
+/* init returns the state by value; processing works in place on in_out[]. */
+shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad);
+void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d);
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO G ***/
+/**************/
+
+/* State of SSE variant B: like variant A, but with one single trig table
+ * (dtrig) instead of separate dcos/dsin tables.
+ */
+typedef struct shift_limited_unroll_B_sse_data_s
+{
+    /* small/limited trig table */
+    float dtrig[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
+    /* 4 times complex phase */
+    float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
+    float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
+    /* N_cplx_per_block times increment - for future parallel variants */
+    float dcos_blk;
+    float dsin_blk;
+    /* */
+    float phase_increment;
+} shift_limited_unroll_B_sse_data_t;
+
+/* init returns the state by value; processing works in place on in_out[]. */
+shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad);
+void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d);
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO H ***/
+/**************/
+
+/* State of SSE variant C: cosine and sine values share one interleaved table
+ * of twice the per-table length used by variant A.
+ */
+typedef struct shift_limited_unroll_C_sse_data_s
+{
+    /* small/limited trig table - interleaved: 4 cos, 4 sin, 4 cos, .. */
+    float dinterl_trig[2*(PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ)];
+    /* 4 times complex phase */
+    float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
+    float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
+    /* N_cplx_per_block times increment - for future parallel variants */
+    float dcos_blk;
+    float dsin_blk;
+    /* */
+    float phase_increment;
+} shift_limited_unroll_C_sse_data_t;
+
+/* init returns the state by value; processing works in place on in_out[]. */
+shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad);
+void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d);
+
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO I ***/
+/**************/
+
+/* Recursive Quadrature Oscillator functions "recursive_osc"
+ * see https://www.vicanek.de/articles/QuadOsc.pdf
+ */
+#define PF_SHIFT_RECURSIVE_SIMD_SZ  8
+/* Oscillator state: PF_SHIFT_RECURSIVE_SIMD_SZ cosine/sine value pairs. */
+typedef struct shift_recursive_osc_s
+{
+    float u_cos[PF_SHIFT_RECURSIVE_SIMD_SZ];
+    float v_sin[PF_SHIFT_RECURSIVE_SIMD_SZ];
+} shift_recursive_osc_t;
+
+/* Oscillator configuration: the two coefficients k1 and k2.
+ * NOTE(review): presumably derived from 'rate' by the init/update_rate
+ * functions below - confirm in pf_mixer.cpp.
+ */
+typedef struct shift_recursive_osc_conf_s
+{
+    float k1;
+    float k2;
+} shift_recursive_osc_conf_t;
+
+void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state);
+void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
+
+/* size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ */
+/* NOTE(review): the state arrays are sized by PF_SHIFT_RECURSIVE_SIMD_SZ (8),
+ * not PF_SHIFT_LIMITED_SIMD_SZ (4) - confirm which constant the size
+ * constraint above really refers to. */
+/* starting_phase for next call is kept internal in state */
+void shift_recursive_osc_cc(const complexf *input, complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
+/* NOTE(review): the _inp_c suffix is used for in-place variants elsewhere in
+ * this header, although the buffer parameter is named 'output' here - confirm. */
+void shift_recursive_osc_inp_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
+void gen_recursive_osc_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO J ***/
+/**************/
+
+/* SSE flavour of the recursive oscillator: same layout as
+ * shift_recursive_osc_t, but with 4 instead of 8 value pairs.
+ */
+#define PF_SHIFT_RECURSIVE_SIMD_SSE_SZ  4
+typedef struct shift_recursive_osc_sse_s
+{
+    float u_cos[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ];
+    float v_sin[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ];
+} shift_recursive_osc_sse_t;
+
+/* Configuration of the SSE recursive oscillator: coefficients k1 and k2. */
+typedef struct shift_recursive_osc_sse_conf_s
+{
+    float k1;
+    float k2;
+} shift_recursive_osc_sse_conf_t;
+
+void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state);
+void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state);
+/* Operates in place on in_out[] (N_cplx complex samples). */
+void shift_recursive_osc_sse_inp_c(complexf* in_out, int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext);
+
+
+#ifdef __cplusplus
+}
+#endif
+
--- a/pffft/pffastconv.c
+++ b/pffft/pffastconv.c
@@ -0,0 +1,264 @@
+/*
+  Copyright (c) 2019  Hayati Ayguen ( h_ayguen@web.de )
+ */
+
+#include "pffastconv.h"
+#include "pffft.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <math.h>
+#include <assert.h>
+#include <string.h>
+
+#define FASTCONV_DBG_OUT  0
+
+
+/* detect compiler flavour and pick the matching 'restrict' spelling */
+#if defined(_MSC_VER)
+#  define RESTRICT __restrict
+#pragma warning( disable : 4244 4305 4204 4456 )
+#elif defined(__GNUC__)
+#  define RESTRICT __restrict
+#elif !defined(RESTRICT)
+/* unknown compiler: RESTRICT is only an optimization hint, so drop it
+ * instead of leaving the macro undefined (it is used in pffastconv_apply) */
+#  define RESTRICT
+#endif
+
+
+/* Allocate nb_bytes through pffft_aligned_malloc(); pair with pffastconv_free(). */
+void *pffastconv_malloc(size_t nb_bytes)
+{
+  void * const block = pffft_aligned_malloc(nb_bytes);
+  return block;
+}
+
+/* Release a buffer obtained from pffastconv_malloc(); forwards to pffft_aligned_free(). */
+void pffastconv_free(void *p)
+{
+  pffft_aligned_free(p);
+}
+
+/* Number of floats per SIMD vector, as reported by pffft_simd_size().
+ * Declared with (void) so the definition provides a real prototype in C;
+ * this stays compatible with the '()' declaration in pffastconv.h.
+ */
+int pffastconv_simd_size(void)
+{
+  return pffft_simd_size();
+}
+
+
+
+/* Internal state of one fast convolution: the transformed filter, the pffft
+ * setup and the work buffers reused by every pffastconv_apply() call.
+ * All buffers hold Nfft floats.
+ */
+struct PFFASTCONV_Setup
+{
+  float * Xt;      /* input == x in time domain - copy for alignment; NULL with DIRECT_INP and without CPLX_INP_OUT */
+  float * Xf;      /* input == X in freq domain; also receives the time domain result of the backward FFT */
+  float * Hf;      /* filterCoeffs == H in freq domain */
+  float * Mf;      /* input * filterCoeffs in freq domain; also used as FFT work buffer */
+  PFFFT_Setup *st; /* real FFT setup of length Nfft */
+  int filterLen;   /* convolution length; 2 * filterLen - 1 floats in single-FFT complex mode */
+  int Nfft;        /* FFT/block length */
+  int flags;       /* bitmask of pffastconv_flags_t, as given to pffastconv_new_setup() */
+  float scale;     /* 1 / Nfft, handed to pffft_zconvolve_no_accu() */
+};
+
+
+/*
+ * Build the state for fast convolution (or correlation) with a real filter.
+ *
+ * filterCoeffs  filterLen real coefficients; they are stored time-reversed
+ *               unless PFFASTCONV_CORRELATION is set
+ * filterLen     number of coefficients, must be >= 1
+ * blockLen      in: requested block length; out: block length actually used,
+ *               counted in (complex) samples
+ * flags         bitmask of pffastconv_flags_t; PFFASTCONV_CPLX_FILTER is not
+ *               implemented and yields NULL
+ *
+ * Returns the new setup, or NULL on invalid arguments or when an allocation
+ * fails. Release the result with pffastconv_destroy_setup().
+ */
+PFFASTCONV_Setup * pffastconv_new_setup( const float * filterCoeffs, int filterLen, int * blockLen, int flags )
+{
+  PFFASTCONV_Setup * s = NULL;
+  float * Ht = NULL;  /* zero padded filter in time domain */
+  const int cplxFactor = ( (flags & PFFASTCONV_CPLX_INP_OUT) && (flags & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 2 : 1;
+  /* the setup keeps no input copy buffer for real input with DIRECT_INP */
+  const int keepXt = ( (flags & PFFASTCONV_DIRECT_INP) && !(flags & PFFASTCONV_CPLX_INP_OUT) ) ? 0 : 1;
+  const int minFftLen = 2*pffft_simd_size()*pffft_simd_size();
+  int i, Nfft;
+#if FASTCONV_DBG_OUT
+  int iOldBlkLen;
+#endif
+
+  if ( !filterCoeffs || !blockLen || filterLen < 1 )
+    return NULL;
+
+  if ( flags & PFFASTCONV_CPLX_FILTER )
+    return NULL;
+
+#if FASTCONV_DBG_OUT
+  iOldBlkLen = *blockLen;
+#endif
+
+  Nfft = 2 * pffft_next_power_of_two(filterLen -1);
+  if ( Nfft < minFftLen )
+    Nfft = minFftLen;
+
+  if ( *blockLen > Nfft ) {
+    Nfft = *blockLen;
+    Nfft = pffft_next_power_of_two(Nfft);
+  }
+  *blockLen = Nfft;  /* this is in (complex) samples */
+
+  Nfft *= cplxFactor;
+
+  s = pffastconv_malloc( sizeof(struct PFFASTCONV_Setup) );
+  if ( !s )
+    return NULL;
+
+  /* The time domain filter always needs a buffer, even when the setup keeps
+   * no Xt: the former code wrote through s->Xt == NULL in that case. */
+  Ht = pffastconv_malloc((unsigned)Nfft * sizeof(float));
+  s->Xt = NULL;
+  s->Xf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
+  s->Hf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
+  s->Mf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
+  s->st = pffft_new_setup(Nfft, PFFFT_REAL);  /* with complex: we do 2 x fft() */
+
+  if ( !Ht || !s->Xf || !s->Hf || !s->Mf || !s->st ) {
+    /* undo the partial construction instead of crashing later */
+    if ( s->st )
+      pffft_destroy_setup(s->st);
+    if ( s->Mf )
+      pffastconv_free(s->Mf);
+    if ( s->Hf )
+      pffastconv_free(s->Hf);
+    if ( s->Xf )
+      pffastconv_free(s->Xf);
+    if ( Ht )
+      pffastconv_free(Ht);
+    pffastconv_free(s);
+    return NULL;
+  }
+
+  s->filterLen = filterLen;        /* filterLen == convolution length == length of impulse response */
+  if ( cplxFactor == 2 )
+    s->filterLen = 2 * filterLen - 1;
+  s->Nfft = Nfft;  /* FFT/block length */
+  s->flags = flags;
+  s->scale = (float)( 1.0 / Nfft );
+
+  /* Nfft is a power of two here, so '& (Nfft -1)' wraps index Nfft to 0 */
+  memset( Ht, 0, (unsigned)Nfft * sizeof(float) );
+  if ( flags & PFFASTCONV_CORRELATION ) {
+    for ( i = 0; i < filterLen; ++i )
+      Ht[ ( Nfft - cplxFactor * i ) & (Nfft -1) ] = filterCoeffs[ i ];
+  } else {
+    for ( i = 0; i < filterLen; ++i )
+      Ht[ ( Nfft - cplxFactor * i ) & (Nfft -1) ] = filterCoeffs[ filterLen - 1 - i ];
+  }
+
+  pffft_transform(s->st, Ht, s->Hf, /* tmp = */ s->Mf, PFFFT_FORWARD);
+
+  /* hand the buffer over as input copy buffer - or drop it when unused */
+  if ( keepXt )
+    s->Xt = Ht;
+  else
+    pffastconv_free(Ht);
+
+#if FASTCONV_DBG_OUT
+  printf("\n  fastConvSetup(filterLen = %d, blockLen %d) --> blockLen %d, OutLen = %d\n"
+    , filterLen, iOldBlkLen, *blockLen, Nfft - filterLen +1 );
+#endif
+
+  return s;
+}
+
+
+/* Free a setup created by pffastconv_new_setup(): the FFT setup, the work
+ * buffers and the struct itself. Passing NULL is a no-op.
+ */
+void pffastconv_destroy_setup( PFFASTCONV_Setup * s )
+{
+  if ( s != NULL )
+  {
+    float * const inputCopy = s->Xt;  /* absent (NULL) for some flag combinations */
+
+    pffft_destroy_setup(s->st);
+    pffastconv_free(s->Mf);
+    pffastconv_free(s->Hf);
+    pffastconv_free(s->Xf);
+    if ( inputCopy != NULL )
+      pffastconv_free(inputCopy);
+    pffastconv_free(s);
+  }
+}
+
+
+/*
+ * Run the fast convolution prepared in 's' over input_ and write to output_.
+ *
+ * The input is consumed in blocks of at most Nfft floats. Each block is
+ * transformed, multiplied with the filter spectrum Hf, transformed back and
+ * yields numOut = procLen - filterLen + 1 output values; the input offset
+ * then advances by numOut.
+ *
+ * s             setup from pffastconv_new_setup(); its buffers are modified
+ * input_        cplxInputLen samples (interleaved re/im with CPLX_INP_OUT)
+ * cplxInputLen  number of (complex) input samples
+ * output_       receives the produced samples
+ * applyFlush    non-zero: also process a last block shorter than Nfft;
+ *               zero: process full blocks of Nfft floats only
+ *
+ * Returns the number of (complex) samples produced.
+ */
+int pffastconv_apply(PFFASTCONV_Setup * s, const float *input_, int cplxInputLen, float *output_, int applyFlush)
+{
+  const float * RESTRICT X = input_;
+  float * RESTRICT Y = output_;
+  const int Nfft = s->Nfft;
+  const int filterLen = s->filterLen;
+  const int flags = s->flags;
+  /* 2: interleaved complex data is pushed through one real FFT; 1: otherwise */
+  const int cplxFactor = ( (flags & PFFASTCONV_CPLX_INP_OUT) && (flags & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 2 : 1;
+  const int inputLen = cplxFactor * cplxInputLen;
+  int inpOff, procLen, numOut = 0, j, part, cplxOff;
+
+  /* applyFlush != 0:
+   *     inputLen - inpOff -filterLen + 1 > 0
+   * <=> inputLen -filterLen + 1 > inpOff
+   * <=> inpOff < inputLen -filterLen + 1
+   *
+   * applyFlush == 0:
+   *     inputLen - inpOff >= Nfft
+   * <=> inputLen - Nfft >= inpOff
+   * <=> inpOff <= inputLen - Nfft
+   * <=> inpOff < inputLen - Nfft + 1
+   */
+
+  if ( cplxFactor == 2 )
+  {
+    /* single FFT mode: all offsets and lengths below count floats, not complex samples */
+    const int maxOff = applyFlush ? (inputLen -filterLen + 1) : (inputLen - Nfft + 1);
+#if 0
+    printf( "*** inputLen %d, filterLen %d, Nfft %d => maxOff %d\n", inputLen, filterLen, Nfft, maxOff);
+#endif
+    for ( inpOff = 0; inpOff < maxOff; inpOff += numOut )
+    {
+      procLen = ( (inputLen - inpOff) >= Nfft ) ? Nfft : (inputLen - inpOff);
+      /* round down to an even count, so that only whole complex samples are produced */
+      numOut = ( procLen - filterLen + 1 ) & ( ~1 );
+      if (!numOut)
+        break;
+#if 0
+      if (!inpOff)
+        printf("*** inpOff = %d, numOut = %d\n", inpOff, numOut);
+      if (inpOff + filterLen + 2 >= maxOff )
+        printf("*** inpOff = %d, inpOff + numOut = %d\n", inpOff, inpOff + numOut);
+#endif
+
+      if ( flags & PFFASTCONV_DIRECT_INP )
+      {
+        /* transform straight from the caller's buffer: always reads Nfft floats */
+        pffft_transform(s->st, X + inpOff, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
+      }
+      else
+      {
+        /* copy into Xt and zero pad a short (flush) block up to Nfft */
+        memcpy( s->Xt, X + inpOff, (unsigned)procLen * sizeof(float) );
+        if ( procLen < Nfft )
+          memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
+
+        pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
+      }
+
+      /* Mf = Xf * Hf * scale */
+      pffft_zconvolve_no_accu(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale);
+
+      if ( flags & PFFASTCONV_DIRECT_OUT )
+      {
+        /* transform straight into the caller's buffer: always writes Nfft floats */
+        pffft_transform(s->st, s->Mf, Y + inpOff, s->Xf, PFFFT_BACKWARD);
+      }
+      else
+      {
+        /* Xf is reused for the time domain result; only numOut values are copied out */
+        pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
+        memcpy( Y + inpOff, s->Xf, (unsigned)numOut * sizeof(float) );
+      }
+    }
+    return inpOff / cplxFactor;
+  }
+  else
+  {
+    const int maxOff = applyFlush ? (inputLen -filterLen + 1) : (inputLen - Nfft + 1);
+    /* complex data without SINGLE_FFT: one pass for the real and one for the imag parts */
+    const int numParts = (flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1;
+
+    for ( inpOff = 0; inpOff < maxOff; inpOff += numOut )
+    {
+      procLen = ( (inputLen - inpOff) >= Nfft ) ? Nfft : (inputLen - inpOff);
+      numOut = procLen - filterLen + 1;
+
+      for ( part = 0; part < numParts; ++part )  /* iterate per real/imag component */
+      {
+
+        if ( flags & PFFASTCONV_CPLX_INP_OUT )
+        {
+          /* de-interleave: gather every 2nd float of the current block into Xt */
+          cplxOff = 2 * inpOff + part;
+          for ( j = 0; j < procLen; ++j )
+            s->Xt[j] = X[cplxOff + 2 * j];
+          if ( procLen < Nfft )
+            memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
+
+          pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
+        }
+        else if ( flags & PFFASTCONV_DIRECT_INP )
+        {
+          /* transform straight from the caller's buffer: always reads Nfft floats */
+          pffft_transform(s->st, X + inpOff, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
+        }
+        else
+        {
+          memcpy( s->Xt, X + inpOff, (unsigned)procLen * sizeof(float) );
+          if ( procLen < Nfft )
+            memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
+
+          pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
+        }
+
+        /* Mf = Xf * Hf * scale */
+        pffft_zconvolve_no_accu(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale);
+
+        if ( flags & PFFASTCONV_CPLX_INP_OUT )
+        {
+          pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
+
+          /* re-interleave: scatter the results to every 2nd float of Y */
+          cplxOff = 2 * inpOff + part;
+          for ( j = 0; j < numOut; ++j )
+            Y[ cplxOff + 2 * j ] = s->Xf[j];
+        }
+        else if ( flags & PFFASTCONV_DIRECT_OUT )
+        {
+          /* transform straight into the caller's buffer: always writes Nfft floats */
+          pffft_transform(s->st, s->Mf, Y + inpOff, s->Xf, PFFFT_BACKWARD);
+        }
+        else
+        {
+          /* NOTE(review): s->Xt is NULL here when DIRECT_INP is set without
+           * DIRECT_OUT - confirm that pffft_transform() accepts a NULL work buffer */
+          pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
+          memcpy( Y + inpOff, s->Xf, (unsigned)numOut * sizeof(float) );
+        }
+
+      }
+    }
+
+    return inpOff;
+  }
+}
+
--- a/pffft/pffastconv.h
+++ b/pffft/pffastconv.h
@@ -0,0 +1,171 @@
+/* Copyright (c) 2019  Hayati Ayguen ( h_ayguen@web.de )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of PFFFT, PFFASTCONV, nor the names of its
+   sponsors or contributors may be used to endorse or promote products
+   derived from this Software without specific prior written permission.  
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+/*
+   PFFASTCONV : a Pretty Fast Fast Convolution
+
+   This is basically the implementation of fast convolution,
+   utilizing the FFT (pffft).
+
+   Restrictions: 
+
+   - 1D transforms only, with 32-bit single precision.
+
+   - all (float*) pointers in the functions below are expected to
+   have a "simd-compatible" alignment, that is 16 bytes on x86 and
+   powerpc CPUs.
+  
+   You can allocate such buffers with the functions
+   pffft_aligned_malloc / pffft_aligned_free (or with stuff like
+   posix_memalign..)
+
+*/
+
+#ifndef PFFASTCONV_H
+#define PFFASTCONV_H
+
+#include <stddef.h> /* for size_t */
+#include "pffft.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  /* opaque struct holding internal stuff
+     this struct can't be shared by many threads as it contains
+     temporary data, computed within the convolution
+  */
+  typedef struct PFFASTCONV_Setup PFFASTCONV_Setup;
+
+  /* option bits for the 'flags' argument of pffastconv_new_setup();
+   * the values are distinct bits and may be OR-ed together */
+  typedef enum {
+    PFFASTCONV_CPLX_INP_OUT = 1,
+    /* set when input and output is complex,
+     * with real and imag part interleaved in both vectors.
+     * input[] has inputLen complex values: 2 * inputLen floats,
+     * output[] is also written with complex values.
+     * without this flag, the input is interpreted as real vector
+     */
+
+    PFFASTCONV_CPLX_FILTER = 2,
+    /* set when filterCoeffs is complex,
+     * with real and imag part interleaved.
+     * filterCoeffs[] has filterLen complex values: 2 * filterLen floats
+     * without this flag, the filter is interpreted as real vector
+     * ATTENTION: this is not implemented yet!
+     *   pffastconv_new_setup() returns NULL when this flag is set
+     */
+
+    PFFASTCONV_DIRECT_INP = 4,
+    /* set PFFASTCONV_DIRECT_INP only, when following conditions are met:
+     * 1- input vector X must be aligned
+     * 2- (all) inputLen <= output blockLen
+     * 3- X must have minimum length of output BlockLen
+     * 4- the additional samples from inputLen .. BlockLen-1
+     *   must contain valid small and non-NAN samples (ideally zero)
+     *
+     * this option is ignored when PFFASTCONV_CPLX_INP_OUT is set
+     * without PFFASTCONV_CPLX_SINGLE_FFT; with both of them set,
+     * pffastconv_apply() does honor it
+     */
+
+    PFFASTCONV_DIRECT_OUT = 8,
+    /* set PFFASTCONV_DIRECT_OUT only when following conditions are met:
+     * 1- output vector Y must be aligned
+     * 2- (all) inputLen <= output blockLen
+     * 3- Y must have minimum length of output blockLen
+     *
+     * this option is ignored when PFFASTCONV_CPLX_INP_OUT is set
+     * without PFFASTCONV_CPLX_SINGLE_FFT; with both of them set,
+     * pffastconv_apply() does honor it
+     */
+
+    PFFASTCONV_CPLX_SINGLE_FFT = 16,
+    /* hint to process complex data with one single FFT;
+     * default is to use 2 FFTs: one for real part, one for imag part
+     * */
+
+
+    PFFASTCONV_SYMMETRIC = 32,
+    /* informational only: the filter is symmetric .. and filterLen is multiple of 8 */
+
+    PFFASTCONV_CORRELATION = 64,
+    /* filterCoeffs[] of pffastconv_new_setup are for correlation;
+     * thus, do not flip them for the internal fft calculation
+     * - as necessary for the fast convolution */
+
+  } pffastconv_flags_t;
+
+  /*
+    prepare for performing fast convolution(s) of 'filterLen' with input 'blockLen'.
+    The output 'blockLen' might be bigger to allow the fast convolution.
+    
+    'flags' are bitmask over the 'pffastconv_flags_t' enum.
+
+    PFFASTCONV_Setup structure can't be shared across multiple filters
+    or concurrent threads.
+  */
+  PFFASTCONV_Setup * pffastconv_new_setup( const float * filterCoeffs, int filterLen, int * blockLen, int flags );
+
+  void pffastconv_destroy_setup(PFFASTCONV_Setup *);
+
+  /* 
+     Perform the fast convolution.
+
+     'input' and 'output' don't need to be aligned - unless any of
+     PFFASTCONV_DIRECT_INP or PFFASTCONV_DIRECT_OUT is set in 'flags'.
+
+     inputLen > output 'blockLen' (from pffastconv_new_setup()) is allowed.
+     in this case, multiple FFTs are called internally, to process the
+     input[].
+
+     'output' vector must have size >= (inputLen - filterLen + 1)
+
+     set bool option 'applyFlush' to process the full input[].
+     with this option, 'tail samples' of input are also processed.
+     This might be inefficient, because the FFT is called to produce
+     few(er) output samples, than possible.
+     This option is useful to process the last samples of an input (file)
+     or to reduce latency.
+
+     return value is the number of produced samples in output[].
+     the same amount of samples is processed from input[]. to continue
+     processing, the caller must save/move the remaining samples of
+     input[].
+
+  */
+  int pffastconv_apply(PFFASTCONV_Setup * s, const float *input, int inputLen, float *output, int applyFlush);
+
+  void *pffastconv_malloc(size_t nb_bytes);
+  void pffastconv_free(void *);
+
+  /* returns 4 or 1, depending on whether support for SSE/Altivec instructions was enabled when building pffft.c */
+  int pffastconv_simd_size();
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* PFFASTCONV_H */
--- a/pffft/pffft.c
+++ b/pffft/pffft.c
@@ -0,0 +1,134 @@
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+   Copyright (c) 2020  Hayati Ayguen ( h_ayguen@web.de )
+
+   Based on original fortran 77 code from FFTPACKv4 from NETLIB
+   (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
+   of NCAR, in 1985.
+
+   As confirmed by the NCAR fftpack software curators, the following
+   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
+   released under the same terms.
+
+   FFTPACK license:
+
+   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
+
+   Copyright (c) 2004 the University Corporation for Atmospheric
+   Research ("UCAR"). All rights reserved. Developed by NCAR's
+   Computational and Information Systems Laboratory, UCAR,
+   www.cisl.ucar.edu.
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.  
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+
+
+   PFFFT : a Pretty Fast FFT.
+
+   This file is largely based on the original FFTPACK implementation, modified in
+   order to take advantage of SIMD instructions of modern CPUs.
+*/
+
+/*
+  ChangeLog: 
+  - 2011/10/02, version 1: This is the very first release of this file.
+*/
+
+#include "pffft.h"
+
+/* detect compiler flavour */
+#if defined(_MSC_VER)
+#  define COMPILER_MSVC
+#elif defined(__GNUC__)
+#  define COMPILER_GCC
+#endif
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <math.h>
+#include <assert.h>
+
+/* Compiler specific helpers:
+ *   ALWAYS_INLINE / NEVER_INLINE  wrap a function's return type with the inlining attribute
+ *   RESTRICT                      the compiler's 'restrict' qualifier
+ *   VLA_ARRAY_ON_STACK            declares 'varname__' as stack storage for 'size__' elements
+ *                                 (GCC: variable length array, expansion already ends with ';'
+ *                                  MSVC: pointer to _alloca() memory)
+ */
+#if defined(COMPILER_GCC)
+#  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
+#  define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
+#  define RESTRICT __restrict
+#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
+#elif defined(COMPILER_MSVC)
+#  define ALWAYS_INLINE(return_type) __forceinline return_type
+#  define NEVER_INLINE(return_type) __declspec(noinline) return_type
+#  define RESTRICT __restrict
+/* size__ is parenthesized, so that an expression like 'a + b' is multiplied as a whole */
+#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca((size__) * sizeof(type__))
+#endif
+
+
+#ifdef COMPILER_MSVC
+#pragma warning( disable : 4244 4305 4204 4456 )
+#endif
+
+/* 
+   vector support macros: the rest of the code is independent of
+   SSE/Altivec/NEON -- adding support for other platforms with 4-element
+   vectors should be limited to these macros 
+*/
+#include "simd/pf_float.h"
+
+/* map the generic names, presumably used by pffft_priv_impl.h (included below), to the single precision (float) API */
+#define SETUP_STRUCT               PFFFT_Setup
+#define FUNC_NEW_SETUP             pffft_new_setup
+#define FUNC_DESTROY               pffft_destroy_setup
+#define FUNC_TRANSFORM_UNORDRD     pffft_transform
+#define FUNC_TRANSFORM_ORDERED     pffft_transform_ordered
+#define FUNC_ZREORDER              pffft_zreorder
+#define FUNC_ZCONVOLVE_ACCUMULATE  pffft_zconvolve_accumulate
+#define FUNC_ZCONVOLVE_NO_ACCU     pffft_zconvolve_no_accu
+
+#define FUNC_ALIGNED_MALLOC        pffft_aligned_malloc
+#define FUNC_ALIGNED_FREE          pffft_aligned_free
+#define FUNC_SIMD_SIZE             pffft_simd_size
+#define FUNC_MIN_FFT_SIZE          pffft_min_fft_size
+#define FUNC_IS_VALID_SIZE         pffft_is_valid_size
+#define FUNC_NEAREST_SIZE          pffft_nearest_transform_size
+#define FUNC_SIMD_ARCH             pffft_simd_arch
+#define FUNC_VALIDATE_SIMD_A       validate_pffft_simd
+#define FUNC_VALIDATE_SIMD_EX      validate_pffft_simd_ex
+
+#define FUNC_CPLX_FINALIZE         pffft_cplx_finalize
+#define FUNC_CPLX_PREPROCESS       pffft_cplx_preprocess
+#define FUNC_REAL_PREPROCESS_4X4   pffft_real_preprocess_4x4
+#define FUNC_REAL_PREPROCESS       pffft_real_preprocess
+#define FUNC_REAL_FINALIZE_4X4     pffft_real_finalize_4x4
+#define FUNC_REAL_FINALIZE         pffft_real_finalize
+#define FUNC_TRANSFORM_INTERNAL    pffft_transform_internal
+
+#define FUNC_COS  cosf
+#define FUNC_SIN  sinf
+
+
+#include "pffft_priv_impl.h"
+
+
--- a/pffft/pffft.h
+++ b/pffft/pffft.h
@@ -0,0 +1,241 @@
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com ) 
+
+   Based on original fortran 77 code from FFTPACKv4 from NETLIB,
+   authored by Dr Paul Swarztrauber of NCAR, in 1985.
+
+   As confirmed by the NCAR fftpack software curators, the following
+   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
+   released under the same terms.
+
+   FFTPACK license:
+
+   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
+
+   Copyright (c) 2004 the University Corporation for Atmospheric
+   Research ("UCAR"). All rights reserved. Developed by NCAR's
+   Computational and Information Systems Laboratory, UCAR,
+   www.cisl.ucar.edu.
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.  
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+   
+/*
+   PFFFT : a Pretty Fast FFT.
+
+   This is basically an adaptation of the single precision fftpack
+   (v4) as found on netlib taking advantage of SIMD instruction found
+   on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
+   
+   For architectures where no SIMD instruction is available, the code
+   falls back to a scalar version.  
+
+   Restrictions: 
+
+   - 1D transforms only, with 32-bit single precision.
+
+   - supports only transforms for inputs of length N of the form
+   N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
+   144, 160, etc are all acceptable lengths). Performance is best for
+   128<=N<=8192.
+
+   - all (float*) pointers in the functions below are expected to
+   have a "simd-compatible" alignment, that is 16 bytes on x86 and
+   powerpc CPUs.
+  
+   You can allocate such buffers with the functions
+   pffft_aligned_malloc / pffft_aligned_free (or with stuff like
+   posix_memalign..)
+
+*/
+
+#ifndef PFFFT_H
+#define PFFFT_H
+
+#include <stddef.h> /* for size_t */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  /* opaque struct holding internal stuff (precomputed twiddle factors)
+     this struct can be shared by many threads as it contains only
+     read-only data.  
+  */
+  typedef struct PFFFT_Setup PFFFT_Setup;
+
+#ifndef PFFFT_COMMON_ENUMS
+#define PFFFT_COMMON_ENUMS
+
+  /* direction of the transform */
+  typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
+  
+  /* type of transform */
+  typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
+
+#endif
+
+  /*
+    prepare for performing transforms of size N -- the returned
+    PFFFT_Setup structure is read-only so it can safely be shared by
+    multiple concurrent threads. 
+  */
+  PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
+  void pffft_destroy_setup(PFFFT_Setup *);
+  /* 
+     Perform a Fourier transform , The z-domain data is stored in the
+     most efficient order for transforming it back, or using it for
+     convolution. If you need to have its content sorted in the
+     "usual" way, that is as an array of interleaved complex numbers,
+     either use pffft_transform_ordered , or call pffft_zreorder after
+     the forward fft, and before the backward fft.
+
+     Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
+     Typically you will want to scale the backward transform by 1/N.
+
+     The 'work' pointer should point to an area of N (2*N for complex
+     fft) floats, properly aligned. If 'work' is NULL, then stack will
+     be used instead (this is probably the best strategy for small
+     FFTs, say for N < 16384). Threads usually have a small stack, so
+     there may not be enough stack memory, which usually leads to a crash!
+     Use the heap with pffft_aligned_malloc() in this case.
+
+     For a real forward transform (PFFFT_REAL | PFFFT_FORWARD) with real
+     input with input(=transformation) length N, the output array is
+     'mostly' complex:
+       index k in 1 .. N/2 -1  corresponds to frequency k * Samplerate / N
+       index k == 0 is a special case:
+         the real() part contains the result for the DC frequency 0,
+         the imag() part contains the result for the Nyquist frequency Samplerate/2
+     Both the 0-frequency and half-frequency components, which are real,
+     are assembled in the first entry as  F(0)+i*F(N/2).
+     With an output size of N/2 complex values (= N real/imag values), the
+     results for negative frequencies are not output, because they follow
+     from symmetry.
+
+     input and output may alias.
+  */
+  void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+
+  /* 
+     Similar to pffft_transform, but makes sure that the output is
+     ordered as expected (interleaved complex numbers).  This is
+     similar to calling pffft_transform and then pffft_zreorder.
+     
+     input and output may alias.
+  */
+  void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+
+  /* 
+     call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
+     PFFFT_FORWARD) if you want to have the frequency components in
+     the correct "canonical" order, as interleaved complex numbers.
+     
+     (for real transforms, both 0-frequency and half frequency
+     components, which are real, are assembled in the first entry as
+     F(0)+i*F(n/2+1). Note that the original fftpack did place
+     F(n/2+1) at the end of the arrays).
+     
+     input and output should not alias.
+  */
+  void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
+
+  /* 
+     Perform a multiplication of the frequency components of dft_a and
+     dft_b and accumulate them into dft_ab. The arrays should have
+     been obtained with pffft_transform(.., PFFFT_FORWARD) and should
+     *not* have been reordered with pffft_zreorder (otherwise just
+     perform the operation yourself as the dft coefs are stored as
+     interleaved complex numbers).
+     
+     the operation performed is: dft_ab += (dft_a * dft_b)*scaling
+     
+     The dft_a, dft_b and dft_ab pointers may alias.
+  */
+  void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
+
+  /* 
+     Perform a multiplication of the frequency components of dft_a and
+     dft_b and put result in dft_ab. The arrays should have
+     been obtained with pffft_transform(.., PFFFT_FORWARD) and should
+     *not* have been reordered with pffft_zreorder (otherwise just
+     perform the operation yourself as the dft coefs are stored as
+     interleaved complex numbers).
+
+     the operation performed is: dft_ab = (dft_a * dft_b)*scaling
+
+     The dft_a, dft_b and dft_ab pointers may alias.
+  */
+  void pffft_zconvolve_no_accu(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
+
+  /* return 4 or 1 depending on whether SSE/NEON/Altivec support was enabled when building pffft.c */
+  int pffft_simd_size(void); /* (void): a real prototype in C as well, not only in C++ */
+
+  /* return string identifier of used architecture (SSE/NEON/Altivec/..) */
+  const char * pffft_simd_arch(void);
+
+
+  /* following functions are identical to the pffftd_ functions */
+
+  /* simple helper to get minimum possible fft size */
+  int pffft_min_fft_size(pffft_transform_t transform);
+
+  /* simple helper to determine next power of 2
+     - without inexact/rounding floating point operations
+  */
+  int pffft_next_power_of_two(int N);
+
+  /* simple helper to determine if power of 2 - returns bool */
+  int pffft_is_power_of_two(int N);
+
+  /* simple helper to determine size N is valid
+     - factorizable to pffft_min_fft_size() with factors 2, 3, 5
+     returns bool
+  */
+  int pffft_is_valid_size(int N, pffft_transform_t cplx);
+
+  /* determine nearest valid transform size  (by brute-force testing)
+     - factorizable to pffft_min_fft_size() with factors 2, 3, 5.
+     higher: bool-flag to find nearest higher value; else lower.
+  */
+  int pffft_nearest_transform_size(int N, pffft_transform_t cplx, int higher);
+
+  /*
+    the float buffers must have the correct alignment (16-byte boundary
+    on intel and powerpc). This function may be used to obtain such
+    correctly aligned buffers.  
+  */
+  void *pffft_aligned_malloc(size_t nb_bytes);
+  void pffft_aligned_free(void *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* PFFFT_H */
+
--- a/pffft/pffft.hpp
+++ b/pffft/pffft.hpp
--- a/pffft/pffft_common.c
+++ b/pffft/pffft_common.c
@@ -0,0 +1,53 @@
+
+#include "pffft.h"
+
+#include <stdlib.h>
+
+/* SSE and co like 16-byte aligned pointers; with a 64-byte alignment we are even aligned on L2 cache lines... */
+#define MALLOC_V4SF_ALIGNMENT 64
+/* malloc() wrapper returning MALLOC_V4SF_ALIGNMENT-aligned memory (NULL on failure); release with Valigned_free() */
+static void * Valigned_malloc(size_t nb_bytes) {
+  void *p, *p0 = (void *) 0;
+  if (nb_bytes <= (size_t) -1 - MALLOC_V4SF_ALIGNMENT) p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT); /* else: the padded size would wrap around */
+  if (!p0) return (void *) 0;
+  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
+  *((void **) p - 1) = p0; /* keep malloc()'s own pointer just below the aligned address */
+  return p;
+}
+/* counterpart of Valigned_malloc(): frees the malloc() pointer stored in the slot just below p; a NULL p is ignored */
+static void Valigned_free(void *p) {
+  if (p != (void *) 0) free(((void **) p)[-1]);
+}
+
+
+static int next_power_of_two(int N) {
+  /* Round-up-to-power-of-2 bit hack, see
+     https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+     Decrement, copy the highest set bit into every lower position, then
+     increment again. The shifts stop at 16, i.e. this assumes an unsigned
+     of at most 32 bits; the arithmetic is unsigned and the result is
+     converted back to int on return. */
+  unsigned bits = (unsigned) N - 1u;
+  unsigned shift;
+  for (shift = 1u; shift <= 16u; shift <<= 1)
+    bits |= bits >> shift;
+  return bits + 1u;
+}
+
+static int is_power_of_two(int N) {
+  /* https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2 */
+  /* returns 1 if N is a power of two, else 0; N > 0 also keeps INT_MIN away from the overflowing N - 1 */
+  return N > 0 && !(N & (N - 1));
+}
+
+
+/* exported pffft_ names: each one simply forwards to the static helper defined above */
+void *pffft_aligned_malloc(size_t nb_bytes) { return Valigned_malloc(nb_bytes); }
+void pffft_aligned_free(void *p) { Valigned_free(p); }
+int pffft_next_power_of_two(int N) { return next_power_of_two(N); }
+int pffft_is_power_of_two(int N) { return is_power_of_two(N); }
+/* exported pffftd_ names: forward to the very same helpers */
+void *pffftd_aligned_malloc(size_t nb_bytes) { return Valigned_malloc(nb_bytes); }
+void pffftd_aligned_free(void *p) { Valigned_free(p); }
+int pffftd_next_power_of_two(int N) { return next_power_of_two(N); }
+int pffftd_is_power_of_two(int N) { return is_power_of_two(N); }
--- a/pffft/pffft_double.c
+++ b/pffft/pffft_double.c
@@ -0,0 +1,147 @@
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+   Copyright (c) 2020  Hayati Ayguen ( h_ayguen@web.de )
+   Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
+
+   Based on original fortran 77 code from FFTPACKv4 from NETLIB
+   (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
+   of NCAR, in 1985.
+
+   As confirmed by the NCAR fftpack software curators, the following
+   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
+   released under the same terms.
+
+   FFTPACK license:
+
+   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
+
+   Copyright (c) 2004 the University Corporation for Atmospheric
+   Research ("UCAR"). All rights reserved. Developed by NCAR's
+   Computational and Information Systems Laboratory, UCAR,
+   www.cisl.ucar.edu.
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.  
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+
+
+   PFFFT : a Pretty Fast FFT.
+
+   This file is largely based on the original FFTPACK implementation, modified in
+   order to take advantage of SIMD instructions of modern CPUs.
+*/
+
+/*
+   NOTE: This file is adapted from Julien Pommier's original PFFFT,
+   which works on 32 bit floating point precision using SSE instructions,
+   to work with 64 bit floating point precision using AVX instructions.
+   Author: Dario Mambro @ https://github.com/unevens/pffft
+*/
+
+#include "pffft_double.h"
+
+/* detect compiler flavour */
+#if defined(_MSC_VER)
+#  define COMPILER_MSVC
+#elif defined(__GNUC__)
+#  define COMPILER_GCC
+#endif
+
+#ifdef COMPILER_MSVC
+#  define _USE_MATH_DEFINES
+#  include <malloc.h>
+#elif defined(__MINGW32__) || defined(__MINGW64__)
+#  include <malloc.h>
+#else
+#  include <alloca.h>
+#endif
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <math.h>
+#include <assert.h>
+/* compiler-specific spellings for forced / suppressed inlining, restrict, and a stack-allocated scratch array */
+#if defined(COMPILER_GCC)  /* NOTE(review): only in this branch does VLA_ARRAY_ON_STACK expand with a trailing ';' */
+#  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
+#  define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
+#  define RESTRICT __restrict
+#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
+#elif defined(COMPILER_MSVC)  /* here the array is a pointer obtained from _alloca() */
+#  define ALWAYS_INLINE(return_type) __forceinline return_type
+#  define NEVER_INLINE(return_type) __declspec(noinline) return_type
+#  define RESTRICT __restrict
+#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
+#endif /* NOTE(review): no #else branch -- with any other compiler these macros stay undefined */
+
+
+#ifdef COMPILER_MSVC
+#pragma warning( disable : 4244 4305 4204 4456 )
+#endif
+
+/* 
+   vector support macros: the rest of the code is independent of
+   AVX -- adding support for other platforms with 4-element
+   vectors should be limited to these macros 
+*/
+#include "simd/pf_double.h"
+
+/* double build: 'float' is redefined to double and the generic SETUP_STRUCT / FUNC_* names are bound to the pffftd_* symbols; NOTE(review): presumably consumed by pffft_priv_impl.h, included below -- confirm */
+#define float double
+#define SETUP_STRUCT               PFFFTD_Setup
+#define FUNC_NEW_SETUP             pffftd_new_setup
+#define FUNC_DESTROY               pffftd_destroy_setup
+#define FUNC_TRANSFORM_UNORDRD     pffftd_transform
+#define FUNC_TRANSFORM_ORDERED     pffftd_transform_ordered
+#define FUNC_ZREORDER              pffftd_zreorder
+#define FUNC_ZCONVOLVE_ACCUMULATE  pffftd_zconvolve_accumulate
+#define FUNC_ZCONVOLVE_NO_ACCU     pffftd_zconvolve_no_accu
+/* allocation, size and SIMD query functions, plus the validate_* routines */
+#define FUNC_ALIGNED_MALLOC        pffftd_aligned_malloc
+#define FUNC_ALIGNED_FREE          pffftd_aligned_free
+#define FUNC_SIMD_SIZE             pffftd_simd_size
+#define FUNC_MIN_FFT_SIZE          pffftd_min_fft_size
+#define FUNC_IS_VALID_SIZE         pffftd_is_valid_size
+#define FUNC_NEAREST_SIZE          pffftd_nearest_transform_size
+#define FUNC_SIMD_ARCH             pffftd_simd_arch
+#define FUNC_VALIDATE_SIMD_A       validate_pffftd_simd
+#define FUNC_VALIDATE_SIMD_EX      validate_pffftd_simd_ex
+/* names that do not appear in pffft_double.h -- NOTE(review): presumably internal transform stages, confirm in pffft_priv_impl.h */
+#define FUNC_CPLX_FINALIZE         pffftd_cplx_finalize
+#define FUNC_CPLX_PREPROCESS       pffftd_cplx_preprocess
+#define FUNC_REAL_PREPROCESS_4X4   pffftd_real_preprocess_4x4
+#define FUNC_REAL_PREPROCESS       pffftd_real_preprocess
+#define FUNC_REAL_FINALIZE_4X4     pffftd_real_finalize_4x4
+#define FUNC_REAL_FINALIZE         pffftd_real_finalize
+#define FUNC_TRANSFORM_INTERNAL    pffftd_transform_internal
+/* trigonometric functions: the double variants cos / sin */
+#define FUNC_COS  cos
+#define FUNC_SIN  sin
+
+
+#include "pffft_priv_impl.h"
+
+
--- a/pffft/pffft_double.h
+++ b/pffft/pffft_double.h
@@ -0,0 +1,236 @@
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com ) 
+
+   Based on original fortran 77 code from FFTPACKv4 from NETLIB,
+   authored by Dr Paul Swarztrauber of NCAR, in 1985.
+
+   As confirmed by the NCAR fftpack software curators, the following
+   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
+   released under the same terms.
+
+   FFTPACK license:
+
+   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
+
+   Copyright (c) 2004 the University Corporation for Atmospheric
+   Research ("UCAR"). All rights reserved. Developed by NCAR's
+   Computational and Information Systems Laboratory, UCAR,
+   www.cisl.ucar.edu.
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.  
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+/*
+   NOTE: This file is adapted from Julien Pommier's original PFFFT,
+   which works on 32 bit floating point precision using SSE instructions,
+   to work with 64 bit floating point precision using AVX instructions.
+   Author: Dario Mambro @ https://github.com/unevens/pffft
+*/
+/*
+   PFFFT : a Pretty Fast FFT.
+
+   This is basically an adaptation of the single precision fftpack
+   (v4) as found on netlib taking advantage of SIMD instruction found
+   on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
+   
+   For architectures where no SIMD instruction is available, the code
+   falls back to a scalar version.  
+
+   Restrictions: 
+
+   - 1D transforms only, with 64-bit double precision.
+
+   - supports only transforms for inputs of length N of the form
+   N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
+   144, 160, etc are all acceptable lengths). Performance is best for
+   128<=N<=8192.
+
+   - all (double*) pointers in the functions below are expected to
+   have a "simd-compatible" alignment, that is 32 bytes on x86 and
+   powerpc CPUs.
+  
+   You can allocate such buffers with the functions
+   pffft_aligned_malloc / pffft_aligned_free (or with stuff like
+   posix_memalign..)
+
+*/
+
+#ifndef PFFFT_DOUBLE_H
+#define PFFFT_DOUBLE_H
+
+#include <stddef.h> /* for size_t */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  /* opaque struct holding internal stuff (precomputed twiddle factors)
+     this struct can be shared by many threads as it contains only
+     read-only data.  
+  */
+  typedef struct PFFFTD_Setup PFFFTD_Setup;
+
+#ifndef PFFFT_COMMON_ENUMS
+#define PFFFT_COMMON_ENUMS
+
+  /* direction of the transform */
+  typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
+  
+  /* type of transform */
+  typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
+
+#endif
+
+  /*
+    prepare for performing transforms of size N -- the returned
+    PFFFTD_Setup structure is read-only so it can safely be shared by
+    multiple concurrent threads. 
+  */
+  PFFFTD_Setup *pffftd_new_setup(int N, pffft_transform_t transform);
+  void pffftd_destroy_setup(PFFFTD_Setup *);
+  /* 
+     Perform a Fourier transform , The z-domain data is stored in the
+     most efficient order for transforming it back, or using it for
+     convolution. If you need to have its content sorted in the
+     "usual" way, that is as an array of interleaved complex numbers,
+     either use pffft_transform_ordered , or call pffft_zreorder after
+     the forward fft, and before the backward fft.
+
+     Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
+     Typically you will want to scale the backward transform by 1/N.
+     
+     The 'work' pointer should point to an area of N (2*N for complex
+     fft) doubles, properly aligned. If 'work' is NULL, then stack will
+     be used instead (this is probably the best strategy for small
+     FFTs, say for N < 16384). Threads usually have a small stack, so
+     there may not be enough stack memory, which usually leads to a crash!
+     Use the heap with pffft_aligned_malloc() in this case.
+
+     input and output may alias.
+  */
+  void pffftd_transform(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction);
+
+  /* 
+     Similar to pffft_transform, but makes sure that the output is
+     ordered as expected (interleaved complex numbers).  This is
+     similar to calling pffft_transform and then pffft_zreorder.
+     
+     input and output may alias.
+  */
+  void pffftd_transform_ordered(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction);
+
+  /* 
+     call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
+     PFFFT_FORWARD) if you want to have the frequency components in
+     the correct "canonical" order, as interleaved complex numbers.
+     
+     (for real transforms, both 0-frequency and half frequency
+     components, which are real, are assembled in the first entry as
+     F(0)+i*F(n/2+1). Note that the original fftpack did place
+     F(n/2+1) at the end of the arrays).
+     
+     input and output should not alias.
+  */
+  void pffftd_zreorder(PFFFTD_Setup *setup, const double *input, double *output, pffft_direction_t direction);
+
+  /* 
+     Perform a multiplication of the frequency components of dft_a and
+     dft_b and accumulate them into dft_ab. The arrays should have
+     been obtained with pffft_transform(.., PFFFT_FORWARD) and should
+     *not* have been reordered with pffft_zreorder (otherwise just
+     perform the operation yourself as the dft coefs are stored as
+     interleaved complex numbers).
+     
+     the operation performed is: dft_ab += (dft_a * dft_b)*scaling
+     
+     The dft_a, dft_b and dft_ab pointers may alias.
+  */
+  void pffftd_zconvolve_accumulate(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double *dft_ab, double scaling);
+
+  /* 
+     Perform a multiplication of the frequency components of dft_a and
+     dft_b and put result in dft_ab. The arrays should have
+     been obtained with pffft_transform(.., PFFFT_FORWARD) and should
+     *not* have been reordered with pffft_zreorder (otherwise just
+     perform the operation yourself as the dft coefs are stored as
+     interleaved complex numbers).
+
+     the operation performed is: dft_ab = (dft_a * dft_b)*scaling
+
+     The dft_a, dft_b and dft_ab pointers may alias.
+  */
+  void pffftd_zconvolve_no_accu(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double*dft_ab, double scaling);
+
+  /* return 4 or 1 depending on whether AVX support was enabled when building pffft-double.c */
+  int pffftd_simd_size(void); /* (void): a real prototype in C as well, not only in C++ */
+
+  /* return string identifier of used architecture (AVX/..) */
+  const char * pffftd_simd_arch(void);
+
+  /* simple helper to get minimum possible fft size */
+  int pffftd_min_fft_size(pffft_transform_t transform);
+
+  /* simple helper to determine size N is valid
+     - factorizable to pffft_min_fft_size() with factors 2, 3, 5
+  */
+  int pffftd_is_valid_size(int N, pffft_transform_t cplx);
+
+  /* determine nearest valid transform size  (by brute-force testing)
+     - factorizable to pffft_min_fft_size() with factors 2, 3, 5.
+     higher: bool-flag to find nearest higher value; else lower.
+  */
+  int pffftd_nearest_transform_size(int N, pffft_transform_t cplx, int higher);
+
+
+  /* following functions are identical to the pffft_ functions - both declared */
+
+  /* simple helper to determine next power of 2
+     - without inexact/rounding floating point operations
+  */
+  int pffftd_next_power_of_two(int N);
+  int pffft_next_power_of_two(int N);
+
+  /* simple helper to determine if power of 2 - returns bool */
+  int pffftd_is_power_of_two(int N);
+  int pffft_is_power_of_two(int N);
+
+  /*
+    the double buffers must have the correct alignment (32-byte boundary
+    on intel and powerpc). This function may be used to obtain such
+    correctly aligned buffers.  
+  */
+  void *pffftd_aligned_malloc(size_t nb_bytes);
+  void *pffft_aligned_malloc(size_t nb_bytes);
+  void pffftd_aligned_free(void *);
+  void pffft_aligned_free(void *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* PFFFT_DOUBLE_H */
+
--- a/pffft/pffft_priv_impl.h
+++ b/pffft/pffft_priv_impl.h
--- a/pffft/plots.sh
+++ b/pffft/plots.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# Plot every benchmark CSV (*-4-*.csv, *-6-*.csv) found in the current directory with gnuplot.
+OUTPNG="1"   # "1": render to '<basename>.png'; anything else: run gnuplot with --persist
+W="1024"     # PNG width passed to 'set terminal png size'
+H="768"      # PNG height passed to 'set terminal png size'
+PTS="20"     # only referenced by the disabled line-style loop further down
+LWS="20"     # only referenced by the disabled line-style loop further down
+
+# Expand the globs directly instead of parsing `ls`, so file names with whitespace stay intact.
+for f in *-4-*.csv *-6-*.csv; do
+  [ -e "$f" ] || continue   # a pattern without matches is left unexpanded: skip it
+  b=$(basename "$f" ".csv")
+  # LASTCOL = number of commas in the header row
+  LASTCOL="$(head -n 1 "$f" |sed 's/,/,\n/g' |grep -c ',')"
+  echo "${b}: last column is $LASTCOL"
+  case "$b" in
+    *-1-*) YL="duration in ms; less is better" ;;
+    *-4-*) YL="duration relative to pffft; less is better" ;;
+    *)     YL="" ;;
+  esac
+
+  E=""
+  if [ "${OUTPNG}" = "1" ]; then
+    E="set terminal png size $W,$H"
+    E="${E} ; set output '${b}.png'"
+  fi
+  if [ -z "${E}" ]; then
+    E="set key outside"
+  else
+    E="${E} ; set key outside"
+  fi
+  E="${E} ; set datafile separator ','"
+  E="${E} ; set title '${b}'"
+  E="${E} ; set xlabel 'fft order: fft size N = 2\\^order'"
+  if [ -n "${YL}" ]; then
+    E="${E} ; set ylabel '${YL}'"
+  fi
+  # disabled: unfortunately this had no effect
+  #for LNO in $(seq 1 ${LASTCOL}) ; do
+  #  E="${E} ; set style line ${LNO} ps ${PTS} lw ${LWS}"
+  #done
+  # x axis: column 2; one curve per column 3..LASTCOL, titled from the header row
+  E="${E} ; plot for [col=3:${LASTCOL}] '${f}' using 2:col with lines title columnhead"
+
+  if [ "${OUTPNG}" = "1" ]; then
+    gnuplot -e "${E}"
+  else
+    gnuplot -e "${E}" --persist
+  fi
+done
--- a/pffft/simd/pf_altivec_float.h
+++ b/pffft/simd/pf_altivec_float.h
@@ -0,0 +1,81 @@
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_ALTIVEC_FLT_H
+#define PF_ALTIVEC_FLT_H
+
+/*
+   Altivec support macros
+*/
+#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
+#pragma message( __FILE__ ": ALTIVEC float macros are defined" )
+typedef vector float v4sf;
+
+#  define SIMD_SZ 4
+
+typedef union v4sf_union {
+  v4sf  v;
+  float f[SIMD_SZ];
+} v4sf_union;
+
+#  define VREQUIRES_ALIGN 1  /* not sure, if really required */
+#  define VARCH "ALTIVEC"
+#  define VZERO() ((vector float) vec_splat_u8(0))
+#  define VMUL(a,b) vec_madd(a,b, VZERO())
+#  define VADD(a,b) vec_add(a,b)
+#  define VMADD(a,b,c) vec_madd(a,b,c)
+#  define VSUB(a,b) vec_sub(a,b)
+static inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); } /* splats the float at p; static, because a plain C99 'inline' definition in a header provides no external definition to link against */
+#  define LD_PS1(p) ld_ps1(&p)
+#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
+#  define UNINTERLEAVE2(in1, in2, out1, out2) {                           \
+    vector unsigned char vperm1 =  (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
+    vector unsigned char vperm2 =  (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
+    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
+  }
+#  define VTRANSPOSE4(x0,x1,x2,x3) {              \
+    v4sf y0 = vec_mergeh(x0, x2);               \
+    v4sf y1 = vec_mergel(x0, x2);               \
+    v4sf y2 = vec_mergeh(x1, x3);               \
+    v4sf y3 = vec_mergel(x1, x3);               \
+    x0 = vec_mergeh(y0, y2);                    \
+    x1 = vec_mergel(y0, y2);                    \
+    x2 = vec_mergeh(y1, y3);                    \
+    x3 = vec_mergel(y1, y3);                    \
+  }
+#  define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
+
+#endif
+
+#endif /* PF_ALTIVEC_FLT_H */
+
--- a/pffft/simd/pf_avx_double.h
+++ b/pffft/simd/pf_avx_double.h
@@ -0,0 +1,145 @@
+/*
+   Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
+*/
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_AVX_DBL_H
+#define PF_AVX_DBL_H
+
+/*
+   vector support macros: the rest of the code is independent of
+   AVX -- adding support for other platforms with 4-element
+   vectors should be limited to these macros
+*/
+
+
+/*
+  AVX support macros
+*/
+#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && defined(__AVX__)
+#pragma message( __FILE__ ": AVX macros are defined" )
+
+#include <immintrin.h>
+/* the vector type: one 256-bit AVX register of doubles */
+typedef __m256d v4sf;
+
+/* 4 doubles per simd vector */
+#  define SIMD_SZ 4
+
+/* overlay giving per-lane access to a v4sf */
+typedef union v4sf_union {
+  v4sf  v;
+  double f[SIMD_SZ];
+} v4sf_union;
+
+#  define VREQUIRES_ALIGN 1
+#  define VARCH "AVX"
+
+/* loads from a pointer to double */
+#  define VLOAD_ALIGNED(ptr)    _mm256_load_pd(ptr)
+#  define VLOAD_UNALIGNED(ptr)  _mm256_loadu_pd(ptr)
+/* broadcast the scalar p into all lanes */
+#  define LD_PS1(p) _mm256_set1_pd(p)
+
+/* lane-wise arithmetic */
+#  define VZERO() _mm256_setzero_pd()
+#  define VADD(a,b) _mm256_add_pd(a,b)
+#  define VSUB(a,b) _mm256_sub_pd(a,b)
+#  define VMUL(a,b) _mm256_mul_pd(a,b)
+/* a*b + c as a separate multiply and add */
+#  define VMADD(a,b,c) VADD(VMUL(a,b), c)
+
+/* INTERLEAVE2(in1, in2, out1, out2):
+     out1 = [ in1[0], in2[0], in1[1], in2[1] ]
+     out2 = [ in1[2], in2[2], in1[3], in2[3] ]
+   Every read of in1/in2 happens before the first write, so the outputs
+   may name the same variables as the inputs.
+*/
+#  define INTERLEAVE2(in1, in2, out1, out2) {                           \
+    const __m128d a_lo__ = _mm256_castpd256_pd128(in1);                 \
+    const __m128d b_lo__ = _mm256_castpd256_pd128(in2);                 \
+    const __m128d a_hi__ = _mm256_extractf128_pd(in1, 1);               \
+    const __m128d b_hi__ = _mm256_extractf128_pd(in2, 1);               \
+    const __m128d q0__ = _mm_shuffle_pd(a_lo__, b_lo__, 0);             \
+    const __m128d q1__ = _mm_shuffle_pd(a_lo__, b_lo__, 3);             \
+    const __m128d q2__ = _mm_shuffle_pd(a_hi__, b_hi__, 0);             \
+    const __m128d q3__ = _mm_shuffle_pd(a_hi__, b_hi__, 3);             \
+    out2 = _mm256_insertf128_pd(_mm256_castpd128_pd256(q2__), q3__, 1); \
+    out1 = _mm256_insertf128_pd(_mm256_castpd128_pd256(q0__), q1__, 1); \
+}
+
+/* UNINTERLEAVE2(in1, in2, out1, out2):
+     out1 = [ in1[0], in1[2], in2[0], in2[2] ]
+     out2 = [ in1[1], in1[3], in2[1], in2[3] ]
+   Every read of in1/in2 happens before the first write, so the outputs
+   may name the same variables as the inputs.
+*/
+#  define UNINTERLEAVE2(in1, in2, out1, out2) {                                  \
+    const __m128d a_lo__ = _mm256_castpd256_pd128(in1);                          \
+    const __m128d b_lo__ = _mm256_castpd256_pd128(in2);                          \
+    const __m128d a_hi__ = _mm256_extractf128_pd(in1, 1);                        \
+    const __m128d b_hi__ = _mm256_extractf128_pd(in2, 1);                        \
+    const __m128d even_a__ = _mm_shuffle_pd(a_lo__, a_hi__, 0);                  \
+    const __m128d even_b__ = _mm_shuffle_pd(b_lo__, b_hi__, 0);                  \
+    const __m128d odd_a__ = _mm_shuffle_pd(a_lo__, a_hi__, 3);                   \
+    const __m128d odd_b__ = _mm_shuffle_pd(b_lo__, b_hi__, 3);                   \
+    out2 = _mm256_insertf128_pd(_mm256_castpd128_pd256(odd_a__), odd_b__, 1);    \
+    out1 = _mm256_insertf128_pd(_mm256_castpd128_pd256(even_a__), even_b__, 1);  \
+}
+
+/* In-place 4x4 transpose: afterwards rowK holds element K of the four
+   original rows. Step 1 pairs up the even / odd elements of two rows,
+   step 2 recombines the 128-bit halves. */
+#  define VTRANSPOSE4(row0, row1, row2, row3) {                            \
+        const __m256d even01__ = _mm256_shuffle_pd((row0),(row1), 0x0);    \
+        const __m256d odd01__  = _mm256_shuffle_pd((row0),(row1), 0xF);    \
+        const __m256d even23__ = _mm256_shuffle_pd((row2),(row3), 0x0);    \
+        const __m256d odd23__  = _mm256_shuffle_pd((row2),(row3), 0xF);    \
+                                                                           \
+        (row0) = _mm256_permute2f128_pd(even01__, even23__, 0x20);         \
+        (row1) = _mm256_permute2f128_pd(odd01__,  odd23__,  0x20);         \
+        (row2) = _mm256_permute2f128_pd(even01__, even23__, 0x31);         \
+        (row3) = _mm256_permute2f128_pd(odd01__,  odd23__,  0x31);         \
+    }
+
+/*VSWAPHL(a, b) pseudo code:
+return [ b[0], b[1], a[2], a[3] ]
+i.e. the low 128-bit half of b combined with the high 128-bit half of a
+*/
+#  define VSWAPHL(a,b)	\
+   _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1)
+
+/* reverse/flip all doubles: [ a[3], a[2], a[1], a[0] ]
+   (the 128-bit halves are exchanged and each half is swapped internally) */
+#  define VREV_S(a)    _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_permute_pd(_mm256_extractf128_pd(a, 1),1)), _mm_permute_pd(_mm256_castpd256_pd128(a), 1), 1)
+
+/* reverse/flip complex values, i.e. pairs of doubles: [ a[2], a[3], a[0], a[1] ] */
+#  define VREV_C(a)    _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1)
+
+/* true when ptr is aligned to 32 bytes (low 5 address bits are zero) */
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
+
+#endif
+
+#endif /* PF_AVX_DBL_H */
+
--- a/pffft/simd/pf_double.h
+++ b/pffft/simd/pf_double.h
@@ -0,0 +1,84 @@
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_DBL_H
+#define PF_DBL_H
+
+#include <assert.h>
+#include <string.h>
+#include <stdint.h>
+
+
+/*
+ *  SIMD reference material:
+ *
+ * general SIMD introduction:
+ * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
+ *
+ * SSE 1:
+ * https://software.intel.com/sites/landingpage/IntrinsicsGuide/
+ *
+ * ARM NEON:
+ * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
+ *
+ * Altivec:
+ * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
+ * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
+ * better one?
+ *
+ */
+
+/* scalar element type of this (double precision) variant */
+typedef double vsfscalar;
+
+/* candidate SIMD back-ends; keep the include order unchanged */
+#include "pf_avx_double.h"
+#include "pf_sse2_double.h"
+#include "pf_neon_double.h"
+
+/* SIMD_SZ is still undefined here when no back-end above was selected:
+   report it once and switch to the scalar implementation */
+#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE)
+#  pragma message( "building double with simd disabled !" )
+#  define PFFFT_SIMD_DISABLE /* fallback to scalar code */
+#endif
+
+#include "pf_scalar_double.h"
+
+/* shortcuts for complex multiplications */
+
+/* in place: (ar + i*ai) = (ar + i*ai) * (br + i*bi); br and bi are only read */
+#define VCPLXMUL(ar,ai,br,bi) { \
+    v4sf tmp; \
+    tmp=VMUL(ar,bi); \
+    ar=VMUL(ar,br); \
+    ar=VSUB(ar,VMUL(ai,bi)); \
+    ai=VMUL(ai,br); \
+    ai=VADD(ai,tmp); \
+  }
+
+/* in place: (ar + i*ai) = (ar + i*ai) * (br - i*bi); br and bi are only read */
+#define VCPLXMULCONJ(ar,ai,br,bi) { \
+    v4sf tmp; \
+    tmp=VMUL(ar,bi); \
+    ar=VMUL(ar,br); \
+    ar=VADD(ar,VMUL(ai,bi)); \
+    ai=VMUL(ai,br); \
+    ai=VSUB(ai,tmp); \
+  }
+
+#if !defined(SVMUL)
+/* scalar times vector; used unless a back-end already supplied SVMUL */
+#define SVMUL(f,v) VMUL(LD_PS1(f),v)
+#endif
+
+#endif /* PF_DBL_H */
+
--- a/pffft/simd/pf_float.h
+++ b/pffft/simd/pf_float.h
@@ -0,0 +1,84 @@
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_FLT_H
+#define PF_FLT_H
+
+#include <assert.h>
+#include <string.h>
+#include <stdint.h>
+
+
+/*
+ *  SIMD reference material:
+ *
+ * general SIMD introduction:
+ * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
+ *
+ * SSE 1:
+ * https://software.intel.com/sites/landingpage/IntrinsicsGuide/
+ *
+ * ARM NEON:
+ * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
+ *
+ * Altivec:
+ * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
+ * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
+ * better one?
+ *
+ */
+
+/* scalar element type of this (single precision) variant */
+typedef float vsfscalar;
+
+/* candidate SIMD back-ends; keep the include order unchanged */
+#include "pf_sse1_float.h"
+#include "pf_neon_float.h"
+#include "pf_altivec_float.h"
+
+/* SIMD_SZ is still undefined here when no back-end above was selected:
+   report it once and switch to the scalar implementation */
+#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE)
+#  pragma message( "building float with simd disabled !" )
+#  define PFFFT_SIMD_DISABLE /* fallback to scalar code */
+#endif
+
+#include "pf_scalar_float.h"
+
+/* shortcuts for complex multiplications */
+
+/* in place: (ar + i*ai) = (ar + i*ai) * (br + i*bi); br and bi are only read */
+#define VCPLXMUL(ar,ai,br,bi) { \
+    v4sf tmp; \
+    tmp=VMUL(ar,bi); \
+    ar=VMUL(ar,br); \
+    ar=VSUB(ar,VMUL(ai,bi)); \
+    ai=VMUL(ai,br); \
+    ai=VADD(ai,tmp); \
+  }
+
+/* in place: (ar + i*ai) = (ar + i*ai) * (br - i*bi); br and bi are only read */
+#define VCPLXMULCONJ(ar,ai,br,bi) { \
+    v4sf tmp; \
+    tmp=VMUL(ar,bi); \
+    ar=VMUL(ar,br); \
+    ar=VADD(ar,VMUL(ai,bi)); \
+    ai=VMUL(ai,br); \
+    ai=VSUB(ai,tmp); \
+  }
+
+#if !defined(SVMUL)
+/* scalar times vector; used unless a back-end already supplied SVMUL */
+#define SVMUL(f,v) VMUL(LD_PS1(f),v)
+#endif
+
+#endif /* PF_FLT_H */
+
--- a/pffft/simd/pf_neon_double.h
+++ b/pffft/simd/pf_neon_double.h
@@ -0,0 +1,203 @@
+/*
+   Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
+*/
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_NEON_DBL_H
+#define PF_NEON_DBL_H
+
+/*
+  NEON 64bit support macros
+*/
+#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__))
+
+#pragma message (__FILE__ ": NEON (from AVX) macros are defined" )
+
+#include "pf_neon_double_from_avx.h"
+/* the vector type: the emulated 256-bit register from pf_neon_double_from_avx.h */
+typedef __m256d v4sf;
+
+/* 4 doubles per simd vector */
+#  define SIMD_SZ 4
+
+/* overlay giving per-lane access to a v4sf */
+typedef union v4sf_union {
+  v4sf  v;
+  double f[SIMD_SZ];
+} v4sf_union;
+
+#  define VREQUIRES_ALIGN 1
+#  define VARCH "NEON"
+
+/* loads from a pointer to double */
+#  define VLOAD_ALIGNED(ptr)    _mm256_load_pd(ptr)
+#  define VLOAD_UNALIGNED(ptr)  _mm256_loadu_pd(ptr)
+/* broadcast the scalar p into all lanes */
+#  define LD_PS1(p) _mm256_set1_pd(p)
+
+/* lane-wise arithmetic */
+#  define VZERO() _mm256_setzero_pd()
+#  define VADD(a,b) _mm256_add_pd(a,b)
+#  define VSUB(a,b) _mm256_sub_pd(a,b)
+#  define VMUL(a,b) _mm256_mul_pd(a,b)
+/* a*b + c as a separate multiply and add */
+#  define VMADD(a,b,c) VADD(VMUL(a,b), c)
+
+/* Returns a with its upper 128-bit half replaced by b (insert position 1).
+   Only the lower half of a is read. */
+FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b)
+{
+    __m256d joined = {{ a.vect_f64[0], b }};
+    return joined;
+}
+
+/* Returns [ a[0], b[0] ], i.e. the low lanes of both inputs. */
+FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
+{
+    /* ZIP1 on two-lane vectors pairs up the low lanes */
+    return vzip1q_f64(a, b);
+}
+
+/* Returns [ a[1], b[1] ], i.e. the high lanes of both inputs. */
+FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b)
+{
+    /* ZIP2 on two-lane vectors pairs up the high lanes */
+    return vzip2q_f64(a, b);
+}
+
+/* Applies _mm_shuffle_pd_00 to each 128-bit half:
+   returns [ a[0], b[0], a[2], b[2] ]. */
+FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b)
+{
+    __m256d out;
+    int half;
+    for (half = 0; half < 2; ++half)
+        out.vect_f64[half] = _mm_shuffle_pd_00(a.vect_f64[half], b.vect_f64[half]);
+    return out;
+}
+
+/* Applies _mm_shuffle_pd_11 to each 128-bit half:
+   returns [ a[1], b[1], a[3], b[3] ]. */
+FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b)
+{
+    __m256d out;
+    int half;
+    for (half = 0; half < 2; ++half)
+        out.vect_f64[half] = _mm_shuffle_pd_11(a.vect_f64[half], b.vect_f64[half]);
+    return out;
+}
+
+/* Combines the lower halves: returns [ a[0], a[1], b[0], b[1] ]. */
+FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b)
+{
+    __m256d lows = {{ a.vect_f64[0], b.vect_f64[0] }};
+    return lows;
+}
+
+
+/* Combines the upper halves: returns [ a[2], a[3], b[2], b[3] ]. */
+FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b)
+{
+    __m256d highs = {{ a.vect_f64[1], b.vect_f64[1] }};
+    return highs;
+}
+
+/* Reverses the lane order: returns [ x[3], x[2], x[1], x[0] ]. */
+FORCE_INLINE __m256d _mm256_reverse(__m256d x)
+{
+    __m256d flipped;
+    /* vextq_f64(v, v, 1) yields [ v[1], v[0] ]; the halves also trade places */
+    flipped.vect_f64[0] = vextq_f64(x.vect_f64[1], x.vect_f64[1], 1);
+    flipped.vect_f64[1] = vextq_f64(x.vect_f64[0], x.vect_f64[0], 1);
+    return flipped;
+}
+
+/* INTERLEAVE2(in1, in2, out1, out2):
+     out1 = [ in1[0], in2[0], in1[1], in2[1] ]
+     out2 = [ in1[2], in2[2], in1[3], in2[3] ]
+   Every read of in1/in2 happens before the first write, so the outputs
+   may name the same variables as the inputs.
+*/
+#  define INTERLEAVE2(in1, in2, out1, out2) {                             \
+    const __m128d a_lo__ = _mm256_castpd256_pd128(in1);                   \
+    const __m128d b_lo__ = _mm256_castpd256_pd128(in2);                   \
+    const __m128d a_hi__ = _mm256_extractf128_pd(in1, 1);                 \
+    const __m128d b_hi__ = _mm256_extractf128_pd(in2, 1);                 \
+    const __m128d q0__ = _mm_shuffle_pd_00(a_lo__, b_lo__);               \
+    const __m128d q1__ = _mm_shuffle_pd_11(a_lo__, b_lo__);               \
+    const __m128d q2__ = _mm_shuffle_pd_00(a_hi__, b_hi__);               \
+    const __m128d q3__ = _mm_shuffle_pd_11(a_hi__, b_hi__);               \
+    out2 = _mm256_insertf128_pd_1(_mm256_castpd128_pd256(q2__), q3__);    \
+    out1 = _mm256_insertf128_pd_1(_mm256_castpd128_pd256(q0__), q1__);    \
+}
+
+/* UNINTERLEAVE2(in1, in2, out1, out2):
+     out1 = [ in1[0], in1[2], in2[0], in2[2] ]
+     out2 = [ in1[1], in1[3], in2[1], in2[3] ]
+   Every read of in1/in2 happens before the first write, so the outputs
+   may name the same variables as the inputs.
+*/
+#  define UNINTERLEAVE2(in1, in2, out1, out2) {                                 \
+    const __m128d a_lo__ = _mm256_castpd256_pd128(in1);                         \
+    const __m128d b_lo__ = _mm256_castpd256_pd128(in2);                         \
+    const __m128d a_hi__ = _mm256_extractf128_pd(in1, 1);                       \
+    const __m128d b_hi__ = _mm256_extractf128_pd(in2, 1);                       \
+    const __m128d even_a__ = _mm_shuffle_pd_00(a_lo__, a_hi__);                 \
+    const __m128d even_b__ = _mm_shuffle_pd_00(b_lo__, b_hi__);                 \
+    const __m128d odd_a__ = _mm_shuffle_pd_11(a_lo__, a_hi__);                  \
+    const __m128d odd_b__ = _mm_shuffle_pd_11(b_lo__, b_hi__);                  \
+    out2 = _mm256_insertf128_pd_1(_mm256_castpd128_pd256(odd_a__), odd_b__);    \
+    out1 = _mm256_insertf128_pd_1(_mm256_castpd128_pd256(even_a__), even_b__);  \
+}
+
+/* In-place 4x4 transpose: afterwards rowK holds element K of the four
+   original rows. Step 1 pairs up the even / odd elements of two rows,
+   step 2 recombines the 128-bit halves. */
+#  define VTRANSPOSE4(row0, row1, row2, row3) {                            \
+        const __m256d even01__ = _mm256_shuffle_pd_00((row0),(row1));      \
+        const __m256d odd01__  = _mm256_shuffle_pd_11((row0),(row1));      \
+        const __m256d even23__ = _mm256_shuffle_pd_00((row2),(row3));      \
+        const __m256d odd23__  = _mm256_shuffle_pd_11((row2),(row3));      \
+                                                                           \
+        (row0) = _mm256_permute2f128_pd_0x20(even01__, even23__);          \
+        (row1) = _mm256_permute2f128_pd_0x20(odd01__,  odd23__);           \
+        (row2) = _mm256_permute2f128_pd_0x31(even01__, even23__);          \
+        (row3) = _mm256_permute2f128_pd_0x31(odd01__,  odd23__);           \
+    }
+
+/*VSWAPHL(a, b) pseudo code:
+return [ b[0], b[1], a[2], a[3] ]
+i.e. the low 128-bit half of b combined with the high 128-bit half of a
+*/
+#  define VSWAPHL(a,b)	\
+   _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1))
+
+/* reverse/flip all doubles: [ a[3], a[2], a[1], a[0] ] */
+#  define VREV_S(a)   _mm256_reverse(a)
+
+/* reverse/flip complex values, i.e. pairs of doubles: [ a[2], a[3], a[0], a[1] ] */
+#  define VREV_C(a)    _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a))
+
+/* true when ptr is aligned to 32 bytes (low 5 address bits are zero) */
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
+
+#endif
+
+#endif /* PF_NEON_DBL_H */
+
--- a/pffft/simd/pf_neon_double_from_avx.h
+++ b/pffft/simd/pf_neon_double_from_avx.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+
+ * http://www.apache.org/licenses/LICENSE-2.0
+
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+
+ */
+
+//see https://github.com/kunpengcompute/AvxToNeon
+
+#ifndef PF_NEON_DBL_FROM_AVX_H
+#define PF_NEON_DBL_FROM_AVX_H
+#include <arm_neon.h>
+#include <assert.h>
+
+
+/* FORCE_INLINE: storage class / inlining attributes for the helpers below. */
+#if defined(__GNUC__) || defined(__clang__)
+
+/* Save whatever FORCE_INLINE the includer may already have, then remove it
+   so that the definition below cannot trigger a macro-redefinition
+   diagnostic (#undef of an undefined name is harmless).
+   NOTE(review): no matching pop_macro is visible in this header; headers
+   including this one still use FORCE_INLINE afterwards - confirm where,
+   if anywhere, the saved definition is meant to be restored. */
+#pragma push_macro("FORCE_INLINE")
+#undef FORCE_INLINE
+#define FORCE_INLINE static inline __attribute__((always_inline))
+
+#else
+
+/* unknown compiler: compilation stops at the #error below, the remaining
+   lines of this branch only document the intended fallback */
+#error "Macro name collisions may happens with unknown compiler"
+#ifdef FORCE_INLINE
+#undef FORCE_INLINE
+#endif
+
+#define FORCE_INLINE static inline
+
+#endif
+
+/* 128-bit vector of two doubles: maps directly onto a NEON register type */
+typedef float64x2_t __m128d;
+
+/* 256-bit vector of four doubles, emulated by two 128-bit halves;
+   vect_f64[0] holds elements 0..1, vect_f64[1] holds elements 2..3 */
+typedef struct {
+    float64x2_t vect_f64[2];
+} __m256d;
+
+/* 256-bit vector of eight floats, emulated by two 128-bit halves */
+typedef struct {
+    float32x4_t vect_f32[2];
+} __m256;
+
+/* Returns a vector with all four doubles set to 0.0. */
+FORCE_INLINE __m256d _mm256_setzero_pd(void)
+{
+    const float64x2_t zero = vdupq_n_f64(0.0);
+    __m256d ret = {{ zero, zero }};
+    return ret;
+}
+
+/* Lane-wise product a[i] * b[i] of the four doubles. */
+FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
+{
+    __m256d prod = {{
+        vmulq_f64(a.vect_f64[0], b.vect_f64[0]),
+        vmulq_f64(a.vect_f64[1], b.vect_f64[1])
+    }};
+    return prod;
+}
+
+/* Lane-wise sum a[i] + b[i] of the four doubles. */
+FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
+{
+    __m256d sum = {{
+        vaddq_f64(a.vect_f64[0], b.vect_f64[0]),
+        vaddq_f64(a.vect_f64[1], b.vect_f64[1])
+    }};
+    return sum;
+}
+
+/* Lane-wise difference a[i] - b[i] of the four doubles. */
+FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
+{
+    __m256d diff = {{
+        vsubq_f64(a.vect_f64[0], b.vect_f64[0]),
+        vsubq_f64(a.vect_f64[1], b.vect_f64[1])
+    }};
+    return diff;
+}
+
+/* Returns a vector with all four doubles set to a. */
+FORCE_INLINE __m256d _mm256_set1_pd(double a)
+{
+    const float64x2_t splat = vdupq_n_f64(a);
+    __m256d ret = {{ splat, splat }};
+    return ret;
+}
+
+/* Loads four consecutive doubles starting at mem_addr.
+   Implemented with the same two vld1q_f64 loads as _mm256_loadu_pd. */
+FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr)
+{
+    const double *src = mem_addr;
+    __m256d res;
+    res.vect_f64[0] = vld1q_f64(src);       /* elements 0..1 */
+    res.vect_f64[1] = vld1q_f64(src + 2);   /* elements 2..3 */
+    return res;
+}
+/* Loads four consecutive doubles starting at mem_addr.
+   Implemented with the same two vld1q_f64 loads as _mm256_load_pd. */
+FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr)
+{
+    const double *src = mem_addr;
+    __m256d res;
+    res.vect_f64[0] = vld1q_f64(src);       /* elements 0..1 */
+    res.vect_f64[1] = vld1q_f64(src + 2);   /* elements 2..3 */
+    return res;
+}
+
+/* Returns the lower 128-bit half (elements 0..1) of a. */
+FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
+{
+    const __m128d lower = a.vect_f64[0];
+    return lower;
+}
+
+/* Returns the 128-bit half of a selected by imm8:
+   0 = lower half (elements 0..1), 1 = upper half (elements 2..3).
+   Any other value fails the assertion (when assertions are enabled). */
+FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
+{
+    const float64x2_t *halves = a.vect_f64;
+    assert(imm8 >= 0 && imm8 <= 1);
+    return halves[imm8];
+}
+
+/* Widens a to 256 bits: the lower half of the result is a.
+   The upper half is explicitly zeroed instead of being left uninitialized:
+   callers must not rely on its content, but returning a struct with an
+   indeterminate member provokes "used uninitialized" diagnostics and
+   non-deterministic lanes. The extra store is removed by the optimizer
+   whenever the upper half is overwritten right away. */
+FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
+{
+    __m256d res;
+    res.vect_f64[0] = a;
+    res.vect_f64[1] = vdupq_n_f64(0.0);
+    return res;
+}
+
+#endif /* PF_NEON_DBL_FROM_AVX_H */
+
--- a/pffft/simd/pf_neon_float.h
+++ b/pffft/simd/pf_neon_float.h
@@ -0,0 +1,87 @@
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_NEON_FLT_H
+#define PF_NEON_FLT_H
+
+/*
+  ARM NEON support macros
+*/
+#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
+#pragma message( __FILE__ ": ARM NEON macros are defined" )
+
+#  include <arm_neon.h>
+/* the vector type: one 128-bit NEON register of four floats */
+typedef float32x4_t v4sf;
+
+/* number of floats per SIMD vector */
+#  define SIMD_SZ 4
+
+/* overlay giving per-lane access to a v4sf */
+typedef union v4sf_union {
+  v4sf  v;
+  float f[SIMD_SZ];
+} v4sf_union;
+
+#  define VARCH "NEON"
+#  define VREQUIRES_ALIGN 0  /* usually no alignment required */
+#  define VZERO() vdupq_n_f32(0)
+#  define VMUL(a,b) vmulq_f32(a,b)
+#  define VADD(a,b) vaddq_f32(a,b)
+/* a*b + c; note that vmlaq_f32 takes the addend first */
+#  define VMADD(a,b,c) vmlaq_f32(c,a,b)
+#  define VSUB(a,b) vsubq_f32(a,b)
+/* broadcast the scalar p into all lanes; p must be an lvalue (its address is taken) */
+#  define LD_PS1(p) vld1q_dup_f32(&(p))
+/* Unaligned load through the load intrinsic: dereferencing a v4sf pointer
+   lets the compiler assume the alignment of the vector type, which a pointer
+   that is only float-aligned does not guarantee. */
+#  define VLOAD_UNALIGNED(ptr)  vld1q_f32((const float*)(ptr))
+/* aligned load: plain dereference of the pointer reinterpreted as v4sf* */
+#  define VLOAD_ALIGNED(ptr)    (*((v4sf*)(ptr)))
+/* INTERLEAVE2: out1 = [ in1[0], in2[0], in1[1], in2[1] ]
+                out2 = [ in1[2], in2[2], in1[3], in2[3] ]
+   both results are computed before either output is written */
+#  define INTERLEAVE2(in1, in2, out1, out2) {                  \
+    const float32x4x2_t zipped__ = vzipq_f32(in1,in2);         \
+    out1 = zipped__.val[0];                                    \
+    out2 = zipped__.val[1];                                    \
+  }
+/* UNINTERLEAVE2: out1 = [ in1[0], in1[2], in2[0], in2[2] ]
+                  out2 = [ in1[1], in1[3], in2[1], in2[3] ]
+   both results are computed before either output is written */
+#  define UNINTERLEAVE2(in1, in2, out1, out2) {                \
+    const float32x4x2_t unzipped__ = vuzpq_f32(in1,in2);       \
+    out1 = unzipped__.val[0];                                  \
+    out2 = unzipped__.val[1];                                  \
+  }
+/* In-place 4x4 transpose of the rows x0..x3 by two rounds of zips:
+   first rows (0,2) and (1,3), then the partial results with each other. */
+#  define VTRANSPOSE4(x0,x1,x2,x3) {                                    \
+    const float32x4x2_t z02_ = vzipq_f32(x0, x2);                       \
+    const float32x4x2_t z13_ = vzipq_f32(x1, x3);                       \
+    const float32x4x2_t lo_ = vzipq_f32(z02_.val[0], z13_.val[0]);      \
+    const float32x4x2_t hi_ = vzipq_f32(z02_.val[1], z13_.val[1]);      \
+    x0 = lo_.val[0];                                                    \
+    x1 = lo_.val[1];                                                    \
+    x2 = hi_.val[0];                                                    \
+    x3 = hi_.val[1];                                                    \
+  }
+// marginally faster version
+//#  define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
+/* VSWAPHL(a,b) = [ b[0], b[1], a[2], a[3] ]: low half of b, high half of a */
+#  define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
+
+/* reverse/flip all floats: [ a[3], a[2], a[1], a[0] ]
+   (each half is reversed and the halves are exchanged) */
+#  define VREV_S(a)    vcombine_f32(vrev64_f32(vget_high_f32(a)), vrev64_f32(vget_low_f32(a)))
+/* reverse/flip complex floats, i.e. pairs of floats: [ a[2], a[3], a[0], a[1] ] */
+#  define VREV_C(a)    vextq_f32(a, a, 2)
+
+/* true when ptr is aligned to 4 bytes (low 2 address bits are zero) */
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
+
+#else
+/* #pragma message( __FILE__ ": ARM NEON macros are not defined" ) */
+#endif
+
+#endif /* PF_NEON_FLT_H */
+
--- a/pffft/simd/pf_scalar_double.h
+++ b/pffft/simd/pf_scalar_double.h
@@ -0,0 +1,185 @@
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+   Copyright (c) 2020  Hayati Ayguen ( h_ayguen@web.de )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_SCAL_DBL_H
+#define PF_SCAL_DBL_H
+
+/*
+  fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead
+*/
+
+#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
+#pragma message( __FILE__ ": double SCALAR4 macros are defined" )
+
+/* emulated 4-element vector: four scalars a..d stand in for lanes 0..3 */
+typedef struct {
+  vsfscalar a, b, c, d;
+} v4sf;
+
+/* number of scalars per emulated vector */
+#  define SIMD_SZ 4
+
+/* overlay giving indexed access to the four scalars of a v4sf */
+typedef union v4sf_union {
+  v4sf  v;
+  vsfscalar f[SIMD_SZ];
+} v4sf_union;
+
+#  define VREQUIRES_ALIGN 0
+#  define VARCH "4xScalar"
+
+  /* returns a vector with all four elements zero */
+  static ALWAYS_INLINE(v4sf) VZERO() {
+    v4sf zero;
+    zero.a = zero.b = zero.c = zero.d = 0.f;
+    return zero;
+  }
+
+  /* element-wise product A * B */
+  static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) {
+    v4sf prod;
+    prod.a = A.a * B.a;
+    prod.b = A.b * B.b;
+    prod.c = A.c * B.c;
+    prod.d = A.d * B.d;
+    return prod;
+  }
+
+  /* element-wise sum A + B */
+  static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) {
+    v4sf sum;
+    sum.a = A.a + B.a;
+    sum.b = A.b + B.b;
+    sum.c = A.c + B.c;
+    sum.d = A.d + B.d;
+    return sum;
+  }
+
+  /* element-wise multiply-add A * B + C */
+  static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) {
+    v4sf acc;
+    acc.a = A.a * B.a + C.a;
+    acc.b = A.b * B.b + C.b;
+    acc.c = A.c * B.c + C.c;
+    acc.d = A.d * B.d + C.d;
+    return acc;
+  }
+
+  /* element-wise difference A - B */
+  static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) {
+    v4sf diff;
+    diff.a = A.a - B.a;
+    diff.b = A.b - B.b;
+    diff.c = A.c - B.c;
+    diff.d = A.d - B.d;
+    return diff;
+  }
+
+  /* returns a vector with all four elements set to v */
+  static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) {
+    v4sf splat;
+    splat.a = splat.b = splat.c = splat.d = v;
+    return splat;
+  }
+
+/* load: ptr is reinterpreted as v4sf* and dereferenced
+   (same definition as VLOAD_ALIGNED in this scalar fallback) */
+#  define VLOAD_UNALIGNED(ptr)  (*((v4sf*)(ptr)))
+
+/* load: ptr is reinterpreted as v4sf* and dereferenced */
+#  define VLOAD_ALIGNED(ptr)    (*((v4sf*)(ptr)))
+
+/* true when the address is a multiple of sizeof(v4sf);
+   the mask test relies on sizeof(v4sf) being a power of two */
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0)
+
+
+  /* INTERLEAVE2(A, B, C, D):
+       C = [ A[0], B[0], A[1], B[1] ]
+       D = [ A[2], B[2], A[3], B[3] ]
+     Both results are built in temporaries first, so C/D may alias A/B.
+     Arguments are parenthesized so that expressions such as *p expand
+     correctly; the temporaries carry a macro-specific name to avoid
+     capturing a caller variable called Cr or Dr. */
+  #define INTERLEAVE2( A, B, C, D) \
+  do { \
+    v4sf il2_c__ = { (A).a, (B).a, (A).b, (B).b }; \
+    v4sf il2_d__ = { (A).c, (B).c, (A).d, (B).d }; \
+    (C) = il2_c__; \
+    (D) = il2_d__; \
+  } while (0)
+
+
+  /* UNINTERLEAVE2(A, B, C, D):
+       C = [ A[0], A[2], B[0], B[2] ]
+       D = [ A[1], A[3], B[1], B[3] ]
+     Both results are built in temporaries first, so C/D may alias A/B.
+     Arguments are parenthesized so that expressions such as *p expand
+     correctly; the temporaries carry a macro-specific name to avoid
+     capturing a caller variable called Cr or Dr. */
+  #define UNINTERLEAVE2(A, B, C, D) \
+  do { \
+    v4sf ul2_c__ = { (A).a, (A).c, (B).a, (B).c }; \
+    v4sf ul2_d__ = { (A).b, (A).d, (B).b, (B).d }; \
+    (C) = ul2_c__; \
+    (D) = ul2_d__; \
+  } while (0)
+
+
+  /* VTRANSPOSE4(A, B, C, D): in-place 4x4 transpose, i.e. afterwards
+       A = [ A[0], B[0], C[0], D[0] ]   ...   D = [ A[3], B[3], C[3], D[3] ]
+     All four results are built in temporaries before anything is written.
+     Arguments are parenthesized so that expressions such as *p expand
+     correctly; the temporaries carry a macro-specific name to avoid
+     capturing a caller variable called Ar, Br, Cr or Dr. */
+  #define VTRANSPOSE4(A, B, C, D) \
+  do { \
+    v4sf tr4_a__ = { (A).a, (B).a, (C).a, (D).a }; \
+    v4sf tr4_b__ = { (A).b, (B).b, (C).b, (D).b }; \
+    v4sf tr4_c__ = { (A).c, (B).c, (C).c, (D).c }; \
+    v4sf tr4_d__ = { (A).d, (B).d, (C).d, (D).d }; \
+    (A) = tr4_a__; \
+    (B) = tr4_b__; \
+    (C) = tr4_c__; \
+    (D) = tr4_d__; \
+  } while (0)
+
+
+  /* VSWAPHL(A, B) = [ B[0], B[1], A[2], A[3] ]: low half of B, high half of A */
+  static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) {
+    v4sf mixed;
+    mixed.a = B.a;
+    mixed.b = B.b;
+    mixed.c = A.c;
+    mixed.d = A.d;
+    return mixed;
+  }
+
+
+  /* reverse/flip all four elements: [ A[3], A[2], A[1], A[0] ] */
+  static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) {
+    v4sf rev;
+    rev.a = A.d;
+    rev.b = A.c;
+    rev.c = A.b;
+    rev.d = A.a;
+    return rev;
+  }
+
+  /* reverse/flip complex values, i.e. pairs of elements: [ A[2], A[3], A[0], A[1] ] */
+  static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) {
+    v4sf rev;
+    rev.a = A.c;
+    rev.b = A.d;
+    rev.c = A.a;
+    rev.d = A.b;
+    return rev;
+  }
+
+#else
+/* #pragma message( __FILE__ ": double SCALAR4 macros are not defined" ) */
+#endif
+
+
+/* last resort: no vector implementation (real or emulated) was selected
+   above, so a "vector" is just one scalar */
+#if !defined(SIMD_SZ)
+#pragma message( __FILE__ ": double SCALAR1 macros are defined" )
+typedef vsfscalar v4sf;
+
+/* one scalar per "vector" */
+#  define SIMD_SZ 1
+
+/* kept for interface parity with the SIMD back-ends */
+typedef union v4sf_union {
+  v4sf  v;
+  vsfscalar f[SIMD_SZ];
+} v4sf_union;
+
+#  define VARCH "Scalar"
+#  define VREQUIRES_ALIGN 0
+/* plain scalar arithmetic; arguments are parenthesized inside the macros */
+#  define VZERO() 0.0
+#  define VMUL(a,b) ((a)*(b))
+#  define VADD(a,b) ((a)+(b))
+#  define VMADD(a,b,c) ((a)*(b)+(c))
+#  define VSUB(a,b) ((a)-(b))
+#  define LD_PS1(p) (p)
+/* loads are plain dereferences of the scalar pointer */
+#  define VLOAD_UNALIGNED(ptr)  (*(ptr))
+#  define VLOAD_ALIGNED(ptr)    (*(ptr))
+/* true when the address is a multiple of sizeof(vsfscalar) */
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)
+
+#else
+/* #pragma message( __FILE__ ": double SCALAR1 macros are not defined" ) */
+#endif
+
+
+#endif /* PF_SCAL_DBL_H */
+
--- a/pffft/simd/pf_scalar_float.h
+++ b/pffft/simd/pf_scalar_float.h
@@ -0,0 +1,185 @@
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+   Copyright (c) 2020  Hayati Ayguen ( h_ayguen@web.de )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_SCAL_FLT_H
+#define PF_SCAL_FLT_H
+
+/*
+  fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead
+*/
+
+/* 4-wide "vector" emulated with plain scalars: only used when no other
+   implementation has defined SIMD_SZ yet and PFFFT_SCALVEC_ENABLED is set. */
+#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
+#pragma message( __FILE__ ": float SCALAR4 macros are defined" )
+
+/* the emulated vector: four named scalar lanes a, b, c, d */
+typedef struct {
+  vsfscalar a;
+  vsfscalar b;
+  vsfscalar c;
+  vsfscalar d;
+} v4sf;
+
+/* number of scalars per vector */
+#  define SIMD_SZ 4
+
+/* the same storage, viewed as vector or as array of SIMD_SZ scalars */
+typedef union v4sf_union {
+  v4sf  v;
+  vsfscalar f[SIMD_SZ];
+} v4sf_union;
+
+/* implementation name string */
+#  define VARCH "4xScalar"
+/* 0: this implementation does not ask for aligned buffers */
+#  define VREQUIRES_ALIGN 0
+
+  /* VZERO(): vector with all four lanes set to zero */
+  static ALWAYS_INLINE(v4sf) VZERO() {
+    v4sf zero;
+    zero.a = 0.f;
+    zero.b = 0.f;
+    zero.c = 0.f;
+    zero.d = 0.f;
+    return zero;
+  }
+
+  /* VMUL(A, B): lane-wise product A * B */
+  static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) {
+    v4sf prod;
+    prod.a = A.a * B.a;
+    prod.b = A.b * B.b;
+    prod.c = A.c * B.c;
+    prod.d = A.d * B.d;
+    return prod;
+  }
+
+  /* VADD(A, B): lane-wise sum A + B */
+  static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) {
+    v4sf sum;
+    sum.a = A.a + B.a;
+    sum.b = A.b + B.b;
+    sum.c = A.c + B.c;
+    sum.d = A.d + B.d;
+    return sum;
+  }
+
+  /* VMADD(A, B, C): lane-wise multiply-add A * B + C */
+  static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) {
+    v4sf acc;
+    /* each lane keeps the single expression "x * y + z" of the original */
+    acc.a = A.a * B.a + C.a;
+    acc.b = A.b * B.b + C.b;
+    acc.c = A.c * B.c + C.c;
+    acc.d = A.d * B.d + C.d;
+    return acc;
+  }
+
+  /* VSUB(A, B): lane-wise difference A - B */
+  static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) {
+    v4sf diff;
+    diff.a = A.a - B.a;
+    diff.b = A.b - B.b;
+    diff.c = A.c - B.c;
+    diff.d = A.d - B.d;
+    return diff;
+  }
+
+  /* LD_PS1(v): broadcasts the scalar v into all four lanes */
+  static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) {
+    v4sf splat;
+    splat.a = v;
+    splat.b = v;
+    splat.c = v;
+    splat.d = v;
+    return splat;
+  }
+
+/* Both loads just reinterpret ptr as a v4sf pointer and dereference it;
+   aligned and unaligned variants are identical in this implementation. */
+#  define VLOAD_UNALIGNED(ptr)  (*((v4sf*)(ptr)))
+
+#  define VLOAD_ALIGNED(ptr)    (*((v4sf*)(ptr)))
+
+/* true when ptr is a multiple of sizeof(v4sf); note VREQUIRES_ALIGN is 0 here */
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0)
+
+
+  /* INTERLEAVE2(): C = { A.a, B.a, A.b, B.b }, D = { A.c, B.c, A.d, B.d }.
+     Arguments are parenthesized so that expression arguments (e.g. *p)
+     expand correctly; results are built in temporaries first, so C/D may
+     alias A/B. */
+  #define INTERLEAVE2( A, B, C, D) \
+  do { \
+    v4sf Cr = { (A).a, (B).a, (A).b, (B).b }; \
+    v4sf Dr = { (A).c, (B).c, (A).d, (B).d }; \
+    (C) = Cr; \
+    (D) = Dr; \
+  } while (0)
+
+
+  /* UNINTERLEAVE2(): C = { A.a, A.c, B.a, B.c }, D = { A.b, A.d, B.b, B.d }.
+     Arguments are parenthesized so that expression arguments (e.g. *p)
+     expand correctly; results are built in temporaries first, so C/D may
+     alias A/B. */
+  #define UNINTERLEAVE2(A, B, C, D) \
+  do { \
+    v4sf Cr = { (A).a, (A).c, (B).a, (B).c }; \
+    v4sf Dr = { (A).b, (A).d, (B).b, (B).d }; \
+    (C) = Cr; \
+    (D) = Dr; \
+  } while (0)
+
+
+  /* VTRANSPOSE4(): in-place transpose of the 4x4 matrix whose rows are
+     A, B, C and D. All four result rows are computed before any argument
+     is overwritten. Arguments are parenthesized so that expression
+     arguments (e.g. *p) expand correctly. */
+  #define VTRANSPOSE4(A, B, C, D) \
+  do { \
+    v4sf Ar = { (A).a, (B).a, (C).a, (D).a }; \
+    v4sf Br = { (A).b, (B).b, (C).b, (D).b }; \
+    v4sf Cr = { (A).c, (B).c, (C).c, (D).c }; \
+    v4sf Dr = { (A).d, (B).d, (C).d, (D).d }; \
+    (A) = Ar; \
+    (B) = Br; \
+    (C) = Cr; \
+    (D) = Dr; \
+  } while (0)
+
+
+  /* VSWAPHL(A, B): lower half (a, b) comes from B, upper half (c, d) from A */
+  static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) {
+    /* B is a by-value copy: keep its lower half, patch in A's upper half */
+    B.c = A.c;
+    B.d = A.d;
+    return B;
+  }
+
+
+  /* VREV_S(A): the four floats of A in reversed order: { d, c, b, a } */
+  static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) {
+    v4sf flipped;
+    flipped.a = A.d;
+    flipped.b = A.c;
+    flipped.c = A.b;
+    flipped.d = A.a;
+    return flipped;
+  }
+
+  /* VREV_C(A): swaps the two complex (re, im) pairs of A: { c, d, a, b } */
+  static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) {
+    v4sf swapped;
+    swapped.a = A.c;
+    swapped.b = A.d;
+    swapped.c = A.a;
+    swapped.d = A.b;
+    return swapped;
+  }
+
+#else
+/* #pragma message( __FILE__ ": float SCALAR4 macros are not defined" ) */
+#endif
+
+
+/* Last fallback: nothing above (nor any earlier header) defined SIMD_SZ, so a
+   "vector" is a single vsfscalar and the V* macros are plain scalar
+   arithmetic. */
+#if !defined(SIMD_SZ)
+#pragma message( __FILE__ ": float SCALAR1 macros are defined" )
+typedef vsfscalar v4sf;
+
+/* number of scalars per vector */
+#  define SIMD_SZ 1
+
+/* the same value, viewed as vector or as array of SIMD_SZ scalars */
+typedef union v4sf_union {
+  v4sf  v;
+  vsfscalar f[SIMD_SZ];
+} v4sf_union;
+
+#  define VARCH "Scalar"
+#  define VREQUIRES_ALIGN 0
+#  define VZERO() 0.f
+#  define VMUL(a,b) ((a)*(b))
+#  define VADD(a,b) ((a)+(b))
+#  define VMADD(a,b,c) ((a)*(b)+(c))
+#  define VSUB(a,b) ((a)-(b))
+/* broadcasting a scalar into a 1-wide vector is the identity */
+#  define LD_PS1(p) (p)
+#  define VLOAD_UNALIGNED(ptr)  (*(ptr))
+#  define VLOAD_ALIGNED(ptr)    (*(ptr))
+/* true when ptr is a multiple of sizeof(vsfscalar) */
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)
+
+#else
+/* #pragma message( __FILE__ ": float SCALAR1 macros are not defined" ) */
+#endif
+
+
+#endif /* PF_SCAL_FLT_H */
+
--- a/pffft/simd/pf_sse1_float.h
+++ b/pffft/simd/pf_sse1_float.h
@@ -0,0 +1,82 @@
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_SSE1_FLT_H
+#define PF_SSE1_FLT_H
+
+/*
+  SSE1 support macros
+*/
+/* Selected on x86 / x86-64 targets unless an earlier header already defined
+   SIMD_SZ or SIMD support was switched off with PFFFT_SIMD_DISABLE. */
+#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86))
+#pragma message( __FILE__ ": SSE1 float macros are defined" )
+
+#include <xmmintrin.h>
+typedef __m128 v4sf;
+
+/* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions
+ *  anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */
+#  define SIMD_SZ 4
+
+/* the same register contents, viewed as vector or as array of SIMD_SZ floats */
+typedef union v4sf_union {
+  v4sf  v;
+  float f[SIMD_SZ];
+} v4sf_union;
+
+#  define VARCH "SSE1"
+#  define VREQUIRES_ALIGN 1
+#  define VZERO() _mm_setzero_ps()
+#  define VMUL(a,b) _mm_mul_ps(a,b)
+#  define VADD(a,b) _mm_add_ps(a,b)
+/* a*b + c as separate multiply and add instructions */
+#  define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
+#  define VSUB(a,b) _mm_sub_ps(a,b)
+/* broadcast one float into all four lanes */
+#  define LD_PS1(p) _mm_set1_ps(p)
+#  define VLOAD_UNALIGNED(ptr)  _mm_loadu_ps(ptr)
+#  define VLOAD_ALIGNED(ptr)    _mm_load_ps(ptr)
+
+/* out1 = [ in1[0], in2[0], in1[1], in2[1] ], out2 = [ in1[2], in2[2], in1[3], in2[3] ];
+   out1 is written last through tmp__, so the outputs may alias the inputs */
+#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
+/* out1 = [ in1[0], in1[2], in2[0], in2[2] ], out2 = [ in1[1], in1[3], in2[1], in2[3] ] */
+#  define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
+/* in-place 4x4 transpose of the rows x0..x3 */
+#  define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
+/* result = [ b[0], b[1], a[2], a[3] ] */
+#  define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
+
+/* reverse/flip all floats */
+#  define VREV_S(a)    _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3))
+/* reverse/flip complex floats */
+#  define VREV_C(a)    _mm_shuffle_ps(a, a, _MM_SHUFFLE(1,0,3,2))
+
+/* true when ptr is 16-byte aligned */
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
+
+#else
+/* #pragma message( __FILE__ ": SSE1 float macros are not defined" ) */
+#endif
+
+#endif /* PF_SSE1_FLT_H */
+
--- a/pffft/simd/pf_sse2_double.h
+++ b/pffft/simd/pf_sse2_double.h
@@ -0,0 +1,281 @@
+/*
+   Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
+*/
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_SSE2_DBL_H
+#define PF_SSE2_DBL_H
+
+/* Detect SSE2 support under MSVC: when _M_IX86_FP is defined with the value 2
+   and __SSE2__ is not yet defined, define __SSE2__ so that the feature test
+   below can rely on it. */
+#if defined ( _M_IX86_FP )
+#  if _M_IX86_FP == 2
+#    if !defined(__SSE2__)
+#      define __SSE2__
+#    endif
+#  endif
+#endif
+
+/*
+  SSE2 64bit support macros
+
+  Selected when SSE2 (or a later SSE level) or a 64-bit x86 target is
+  detected, unless an earlier header already defined SIMD_SZ or SIMD support
+  was switched off with PFFFT_SIMD_DISABLE.
+*/
+#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined( __SSE4_2__ ) || defined( __SSE4_1__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || defined ( __x86_64__ ) || defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 ))
+#pragma message (__FILE__ ": SSE2 double macros are defined" )
+
+#include <assert.h>     /* assert() is used by mm256_extractf128_pd() below */
+#include <emmintrin.h>
+
+/* a vector of four doubles, emulated with two 128-bit SSE2 registers:
+   d128[0] holds elements 0..1, d128[1] holds elements 2..3 */
+typedef struct {
+    __m128d d128[2];
+} m256d;
+
+typedef m256d v4sf;
+
+/* number of doubles per vector */
+#  define SIMD_SZ 4
+
+/* the same storage, viewed as vector or as array of SIMD_SZ doubles */
+typedef union v4sf_union {
+  v4sf  v;
+  double f[SIMD_SZ];
+} v4sf_union;
+
+
+/* FORCE_INLINE: per-compiler spelling of "static, always inlined", used for
+   the mm256_* helper functions below. */
+#if defined(__GNUC__) || defined(__clang__)
+
+/* saves any FORCE_INLINE definition of the including code.
+   NOTE(review): no matching pop_macro appears in this header - confirm that
+   leaving FORCE_INLINE redefined afterwards is intended. */
+#pragma push_macro("FORCE_INLINE")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+
+#elif defined (_MSC_VER)
+#define FORCE_INLINE static __forceinline
+
+#else
+/* unknown compiler: the #error stops the build; the definitions after it only
+   matter if the #error is removed */
+#error "Macro name collisions may happens with unknown compiler"
+#ifdef FORCE_INLINE
+#undef FORCE_INLINE
+#endif
+#define FORCE_INLINE static inline
+#endif
+
+/* returns a vector with all four doubles set to zero */
+FORCE_INLINE m256d mm256_setzero_pd(void)
+{
+    const __m128d zero = _mm_setzero_pd();
+    m256d out;
+    out.d128[1] = zero;
+    out.d128[0] = zero;
+    return out;
+}
+
+/* element-wise product a * b of two 4-double vectors */
+FORCE_INLINE m256d mm256_mul_pd(m256d a, m256d b)
+{
+    /* a is a by-value copy, so it is reused as the result */
+    a.d128[0] = _mm_mul_pd(a.d128[0], b.d128[0]);
+    a.d128[1] = _mm_mul_pd(a.d128[1], b.d128[1]);
+    return a;
+}
+
+/* element-wise sum a + b of two 4-double vectors */
+FORCE_INLINE m256d mm256_add_pd(m256d a, m256d b)
+{
+    /* a is a by-value copy, so it is reused as the result */
+    a.d128[0] = _mm_add_pd(a.d128[0], b.d128[0]);
+    a.d128[1] = _mm_add_pd(a.d128[1], b.d128[1]);
+    return a;
+}
+
+/* element-wise difference a - b of two 4-double vectors */
+FORCE_INLINE m256d mm256_sub_pd(m256d a, m256d b)
+{
+    /* a is a by-value copy, so it is reused as the result */
+    a.d128[0] = _mm_sub_pd(a.d128[0], b.d128[0]);
+    a.d128[1] = _mm_sub_pd(a.d128[1], b.d128[1]);
+    return a;
+}
+
+/* broadcasts the double a into all four elements */
+FORCE_INLINE m256d mm256_set1_pd(double a)
+{
+    const __m128d pair = _mm_set1_pd(a);
+    m256d out;
+    out.d128[1] = pair;
+    out.d128[0] = pair;
+    return out;
+}
+
+/* loads four consecutive doubles with two aligned 128-bit loads */
+FORCE_INLINE m256d mm256_load_pd (double const * mem_addr)
+{
+    m256d loaded;
+    loaded.d128[0] = _mm_load_pd(&mem_addr[0]);
+    loaded.d128[1] = _mm_load_pd(&mem_addr[2]);
+    return loaded;
+}
+/* loads four consecutive doubles with two unaligned 128-bit loads */
+FORCE_INLINE m256d mm256_loadu_pd (double const * mem_addr)
+{
+    m256d loaded;
+    loaded.d128[0] = _mm_loadu_pd(&mem_addr[0]);
+    loaded.d128[1] = _mm_loadu_pd(&mem_addr[2]);
+    return loaded;
+}
+
+
+/* generic vector macros, mapped onto the emulated 4-double helpers above */
+#  define VARCH "SSE2"
+#  define VREQUIRES_ALIGN 1
+#  define VZERO() mm256_setzero_pd()
+#  define VMUL(a,b) mm256_mul_pd(a,b)
+#  define VADD(a,b) mm256_add_pd(a,b)
+/* a*b + c as separate multiply and add */
+#  define VMADD(a,b,c) mm256_add_pd(mm256_mul_pd(a,b), c)
+#  define VSUB(a,b) mm256_sub_pd(a,b)
+/* broadcast one double into all four elements */
+#  define LD_PS1(p) mm256_set1_pd(p)
+#  define VLOAD_UNALIGNED(ptr)  mm256_loadu_pd(ptr)
+#  define VLOAD_ALIGNED(ptr)    mm256_load_pd(ptr)
+
+
+/* returns the lower half (elements 0..1) of a */
+FORCE_INLINE __m128d mm256_castpd256_pd128(m256d a)
+{
+    return a.d128[0];
+}
+
+/* returns half number imm8 of a: 0 = elements 0..1, 1 = elements 2..3 */
+FORCE_INLINE __m128d mm256_extractf128_pd (m256d a, const int imm8)
+{
+    assert(!(imm8 < 0 || imm8 > 1));
+    return *(a.d128 + imm8);
+}
+/* returns a with its upper half (elements 2..3) replaced by b */
+FORCE_INLINE m256d mm256_insertf128_pd_1(m256d a, __m128d b)
+{
+    /* a is a by-value copy: its lower half is already the wanted one */
+    a.d128[1] = b;
+    return a;
+}
+/* Widens a 128-bit pair to a 4-double vector: the lower half is a.
+   The upper half is set to zero instead of being left uninitialized, so the
+   returned struct never carries an indeterminate value (and does not trigger
+   uninitialized-variable warnings); callers that overwrite the upper half
+   right away are unaffected. */
+FORCE_INLINE m256d mm256_castpd128_pd256(__m128d a)
+{
+    m256d res;
+    res.d128[0] = a;
+    res.d128[1] = _mm_setzero_pd();
+    return res;
+}
+
+/* per 128-bit half: [ a[even], b[even] ], i.e. [ a[0], b[0], a[2], b[2] ] */
+FORCE_INLINE m256d mm256_shuffle_pd_00(m256d a, m256d b)
+{
+    /* a is a by-value copy, so it is reused as the result */
+    a.d128[0] = _mm_shuffle_pd(a.d128[0], b.d128[0], 0);
+    a.d128[1] = _mm_shuffle_pd(a.d128[1], b.d128[1], 0);
+    return a;
+}
+
+/* per 128-bit half: [ a[odd], b[odd] ], i.e. [ a[1], b[1], a[3], b[3] ] */
+FORCE_INLINE m256d mm256_shuffle_pd_11(m256d a, m256d b)
+{
+    /* a is a by-value copy, so it is reused as the result */
+    a.d128[0] = _mm_shuffle_pd(a.d128[0], b.d128[0], 3);
+    a.d128[1] = _mm_shuffle_pd(a.d128[1], b.d128[1], 3);
+    return a;
+}
+
+/* combines the lower halves: [ a[0], a[1], b[0], b[1] ] */
+FORCE_INLINE m256d mm256_permute2f128_pd_0x20(m256d a, m256d b) {
+    /* a already holds the wanted lower half; only the upper one changes */
+    a.d128[1] = b.d128[0];
+    return a;
+}
+
+
+/* combines the upper halves: [ a[2], a[3], b[2], b[3] ] */
+FORCE_INLINE m256d mm256_permute2f128_pd_0x31(m256d a, m256d b)
+{
+    /* b already holds the wanted upper half; only the lower one changes */
+    b.d128[0] = a.d128[1];
+    return b;
+}
+
+/* reverses the element order: [ x[3], x[2], x[1], x[0] ] */
+FORCE_INLINE m256d mm256_reverse(m256d x)
+{
+    const __m128d lower = x.d128[0];
+    const __m128d upper = x.d128[1];
+    /* shuffle with selector 1 swaps the two doubles of a pair */
+    x.d128[0] = _mm_shuffle_pd(upper, upper, 1);
+    x.d128[1] = _mm_shuffle_pd(lower, lower, 1);
+    return x;
+}
+
+/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
+out1 = [ in1[0], in2[0], in1[1], in2[1] ]
+out2 = [ in1[2], in2[2], in1[3], in2[3] ]
+All input halves are read before any output is written, and out1 is
+written last, so the outputs may alias the inputs.
+*/
+#  define INTERLEAVE2(in1, in2, out1, out2) {                          \
+    __m128d a_lo__ = mm256_castpd256_pd128(in1);                       \
+    __m128d b_lo__ = mm256_castpd256_pd128(in2);                       \
+    __m128d a_hi__ = mm256_extractf128_pd(in1, 1);                     \
+    __m128d b_hi__ = mm256_extractf128_pd(in2, 1);                     \
+    m256d first__, second__;                                           \
+    first__.d128[0]  = _mm_shuffle_pd(a_lo__, b_lo__, 0);              \
+    first__.d128[1]  = _mm_shuffle_pd(a_lo__, b_lo__, 3);              \
+    second__.d128[0] = _mm_shuffle_pd(a_hi__, b_hi__, 0);              \
+    second__.d128[1] = _mm_shuffle_pd(a_hi__, b_hi__, 3);              \
+    out2 = second__;                                                   \
+    out1 = first__;                                                    \
+}
+
+/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
+out1 = [ in1[0], in1[2], in2[0], in2[2] ]
+out2 = [ in1[1], in1[3], in2[1], in2[3] ]
+All input halves are read before any output is written, and out1 is
+written last, so the outputs may alias the inputs.
+*/
+#  define UNINTERLEAVE2(in1, in2, out1, out2) {                        \
+    __m128d a_lo__ = mm256_castpd256_pd128(in1);                       \
+    __m128d b_lo__ = mm256_castpd256_pd128(in2);                       \
+    __m128d a_hi__ = mm256_extractf128_pd(in1, 1);                     \
+    __m128d b_hi__ = mm256_extractf128_pd(in2, 1);                     \
+    m256d evens__, odds__;                                             \
+    evens__.d128[0] = _mm_shuffle_pd(a_lo__, a_hi__, 0);               \
+    evens__.d128[1] = _mm_shuffle_pd(b_lo__, b_hi__, 0);               \
+    odds__.d128[0]  = _mm_shuffle_pd(a_lo__, a_hi__, 3);               \
+    odds__.d128[1]  = _mm_shuffle_pd(b_lo__, b_hi__, 3);               \
+    out2 = odds__;                                                     \
+    out1 = evens__;                                                    \
+}
+
+/* VTRANSPOSE4(row0, row1, row2, row3): in-place transpose of the 4x4 matrix
+   of doubles whose rows are row0..row3.
+   The temporaries carry the same "__" suffix as in INTERLEAVE2 and
+   UNINTERLEAVE2, so that arguments named tmp0..tmp3 in the calling code are
+   not captured by the macro's own locals. */
+#  define VTRANSPOSE4(row0, row1, row2, row3) {                        \
+        m256d tmp3__, tmp2__, tmp1__, tmp0__;                          \
+                                                                       \
+        tmp0__ = mm256_shuffle_pd_00((row0),(row1));                   \
+        tmp2__ = mm256_shuffle_pd_11((row0),(row1));                   \
+        tmp1__ = mm256_shuffle_pd_00((row2),(row3));                   \
+        tmp3__ = mm256_shuffle_pd_11((row2),(row3));                   \
+                                                                       \
+        (row0) = mm256_permute2f128_pd_0x20(tmp0__, tmp1__);           \
+        (row1) = mm256_permute2f128_pd_0x20(tmp2__, tmp3__);           \
+        (row2) = mm256_permute2f128_pd_0x31(tmp0__, tmp1__);           \
+        (row3) = mm256_permute2f128_pd_0x31(tmp2__, tmp3__);           \
+    }
+
+/*VSWAPHL(a, b) pseudo code:
+return [ b[0], b[1], a[2], a[3] ]
+*/
+#  define VSWAPHL(a,b)	\
+   mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_castpd256_pd128(b)), mm256_extractf128_pd(a, 1))
+
+/* reverse/flip all doubles: [ a[3], a[2], a[1], a[0] ] */
+#  define VREV_S(a)   mm256_reverse(a)
+
+/* reverse/flip complex doubles, i.e. swap the two 128-bit halves: [ a[2], a[3], a[0], a[1] ] */
+#  define VREV_C(a)    mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_extractf128_pd(a, 1)), mm256_castpd256_pd128(a))
+
+/* true when ptr is 32-byte aligned */
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
+
+#endif /* SSE2 double macros */
+#endif /* PF_SSE2_DBL_H */
--- a/pffft/sse2neon.h
+++ b/pffft/sse2neon.h
--- a/pffft/test_fft_factors.c
+++ b/pffft/test_fft_factors.c
@@ -0,0 +1,142 @@
+
+#ifdef PFFFT_ENABLE_FLOAT
+#include "pffft.h"
+#endif
+
+
+#ifdef PFFFT_ENABLE_DOUBLE
+#include "pffft_double.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+
+
+#ifdef PFFFT_ENABLE_FLOAT
+/*
+  Consistency test of the single precision size functions.
+
+  For real and complex transforms (and for both values of the third argument
+  of pffft_nearest_transform_size(), which only affects the informational
+  output), every candidate length N from N_min/2 up to 12*N_min is checked
+  three ways that have to agree:
+    - pffft_is_valid_size(N, cplx),
+    - a local factorization of N into N_min * 2^f2 * 3^f3 * 5^f5,
+    - whether pffft_new_setup(N, cplx) returns a setup.
+
+  TL is the length used for the "nearest transform" output.
+  Returns 0 when everything agrees, 1 on the first mismatch.
+*/
+int test_float(int TL)
+{
+  PFFFT_Setup * S;
+
+  for (int dir_i = 0; dir_i <= 1; ++dir_i)
+  {
+    for (int cplx_i = 0; cplx_i <= 1; ++cplx_i)
+    {
+      const pffft_transform_t cplx = (!cplx_i) ? PFFFT_REAL : PFFFT_COMPLEX;
+      const int N_min = pffft_min_fft_size(cplx);
+      const int N_max = N_min * 11 + N_min;
+      /* N_min/2 is 0 for N_min < 2; a step of 0 would never leave the loop below */
+      const int N_step = (N_min / 2 > 0) ? (N_min / 2) : 1;
+      int NTL = pffft_nearest_transform_size(TL, cplx, (!dir_i));
+      double near_off = (NTL - TL) * 100.0 / (double)TL;
+
+      fprintf(stderr, "testing float, %s, %s ..\tminimum transform %d; nearest transform for %d is %d (%.2f%% off)\n",
+          (!dir_i) ? "FORWARD" : "BACKWARD", (!cplx_i) ? "REAL" : "COMPLEX", N_min, TL, NTL, near_off );
+
+      for (int N = (N_min/2); N <= N_max; N += N_step)
+      {
+        int R = N, f2 = 0, f3 = 0, f5 = 0, tmp_f;
+        const int factorizable = pffft_is_valid_size(N, cplx);
+        /* strip factors 5, 3 and 2 as long as at least N_min remains */
+        while (R >= 5*N_min && (R % 5) == 0) {  R /= 5; ++f5; }
+        while (R >= 3*N_min && (R % 3) == 0) {  R /= 3; ++f3; }
+        while (R >= 2*N_min && (R % 2) == 0) {  R /= 2; ++f2; }
+        tmp_f = (R == N_min) ? 1 : 0;
+        assert( factorizable == tmp_f );
+        (void)tmp_f;  /* only read by the assert, which vanishes with NDEBUG */
+
+        S = pffft_new_setup(N, cplx);
+
+        if ( S && !factorizable )
+        {
+          fprintf(stderr, "fft setup successful, but NOT factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
+          pffft_destroy_setup(S);  /* do not leak the setup on the error path */
+          return 1;
+        }
+        else if ( !S && factorizable)
+        {
+          fprintf(stderr, "fft setup UNsuccessful, but factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
+          return 1;
+        }
+
+        if (S)
+          pffft_destroy_setup(S);
+      }
+
+    }
+  }
+  return 0;
+}
+
+#endif
+
+
+#ifdef PFFFT_ENABLE_DOUBLE
+/*
+  Consistency test of the double precision size functions.
+
+  For real and complex transforms (and for both values of the third argument
+  of pffftd_nearest_transform_size(), which only affects the informational
+  output), every candidate length N from N_min/2 up to 12*N_min is checked
+  three ways that have to agree:
+    - pffftd_is_valid_size(N, cplx),
+    - a local factorization of N into N_min * 2^f2 * 3^f3 * 5^f5,
+    - whether pffftd_new_setup(N, cplx) returns a setup.
+
+  TL is the length used for the "nearest transform" output.
+  Returns 0 when everything agrees, 1 on the first mismatch.
+*/
+int test_double(int TL)
+{
+  PFFFTD_Setup * S;
+  for (int dir_i = 0; dir_i <= 1; ++dir_i)
+  {
+    for (int cplx_i = 0; cplx_i <= 1; ++cplx_i)
+    {
+      const pffft_transform_t cplx = (!cplx_i) ? PFFFT_REAL : PFFFT_COMPLEX;
+      const int N_min = pffftd_min_fft_size(cplx);
+      const int N_max = N_min * 11 + N_min;
+      /* N_min/2 is 0 for N_min < 2; a step of 0 would never leave the loop below */
+      const int N_step = (N_min / 2 > 0) ? (N_min / 2) : 1;
+      int NTL = pffftd_nearest_transform_size(TL, cplx, (!dir_i));
+      double near_off = (NTL - TL) * 100.0 / (double)TL;
+
+      fprintf(stderr, "testing double, %s, %s ..\tminimum transform %d; nearest transform for %d is %d (%.2f%% off)\n",
+          (!dir_i) ? "FORWARD" : "BACKWARD", (!cplx_i) ? "REAL" : "COMPLEX", N_min, TL, NTL, near_off );
+
+      for (int N = (N_min/2); N <= N_max; N += N_step)
+      {
+        int R = N, f2 = 0, f3 = 0, f5 = 0, tmp_f;
+        const int factorizable = pffftd_is_valid_size(N, cplx);
+        /* strip factors 5, 3 and 2 as long as at least N_min remains */
+        while (R >= 5*N_min && (R % 5) == 0) {  R /= 5; ++f5; }
+        while (R >= 3*N_min && (R % 3) == 0) {  R /= 3; ++f3; }
+        while (R >= 2*N_min && (R % 2) == 0) {  R /= 2; ++f2; }
+        tmp_f = (R == N_min) ? 1 : 0;
+        assert( factorizable == tmp_f );
+        (void)tmp_f;  /* only read by the assert, which vanishes with NDEBUG */
+
+        S = pffftd_new_setup(N, cplx);
+
+        if ( S && !factorizable )
+        {
+          fprintf(stderr, "fft setup successful, but NOT factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
+          pffftd_destroy_setup(S);  /* do not leak the setup on the error path */
+          return 1;
+        }
+        else if ( !S && factorizable)
+        {
+          fprintf(stderr, "fft setup UNsuccessful, but factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
+          return 1;
+        }
+
+        if (S)
+          pffftd_destroy_setup(S);
+      }
+
+    }
+  }
+  return 0;
+}
+
+#endif
+
+
+
+/*
+  Entry point: the optional first command line argument is the length passed
+  to the enabled test functions (2 when absent). The float test runs first;
+  the double test only runs when the float test passed. The exit code is the
+  result of the last test that ran.
+*/
+int main(int argc, char *argv[])
+{
+  int status = 0;
+  int N = 2;
+
+  if (argc > 1)
+    N = atoi(argv[1]);
+
+#ifdef PFFFT_ENABLE_FLOAT
+  status = test_float(N);
+  if (status != 0)
+    return status;
+#endif
+
+#ifdef PFFFT_ENABLE_DOUBLE
+  status = test_double(N);
+#endif
+
+  return status;
+}
+
--- a/pffft/test_pffastconv.c
+++ b/pffft/test_pffastconv.c
@@ -0,0 +1,991 @@
+/*
+  Copyright (c) 2013 Julien Pommier.
+  Copyright (c) 2019  Hayati Ayguen ( h_ayguen@web.de )
+ */
+
+#define _WANT_SNAN  1
+
+#include "pffft.h"
+#include "pffastconv.h"
+
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+#include <string.h>
+
+#ifdef HAVE_SYS_TIMES
+#  include <sys/times.h>
+#  include <unistd.h>
+#endif
+
+/* benchmark duration: 500 ms */
+#define BENCH_TEST_DURATION_IN_SEC      0.5
+
+/* 
+   vector support macros: the rest of the code is independent of
+   SSE/Altivec/NEON -- adding support for other platforms with 4-element
+   vectors should be limited to these macros 
+*/
+#if 0
+#include "simd/pf_float.h"
+#endif
+
+/* RESTRICT: no-alias pointer qualifier. MSVC and GNU-compatible compilers
+   both spell it __restrict; any other compiler gets an empty definition. */
+#if defined(_MSC_VER) || defined(__GNUC__)
+#  define RESTRICT __restrict
+#else
+#  define RESTRICT
+#endif
+
+
+/* MSVC only: switch off warning C4244 (conversion with possible loss of data) */
+#if defined(_MSC_VER)
+#pragma warning( disable : 4244 )
+#endif
+
+
+/* INVALID_FLOAT_VAL: marker value written behind the filter coefficients by
+   convSetupRev() for "simpler detection of overruns". The first macro
+   available from the C library is used, in this order of preference:
+   SNANF, SNAN, NAN, INFINITY and finally FLT_MAX. */
+#ifdef SNANF
+  #define INVALID_FLOAT_VAL  SNANF
+#elif defined(SNAN)
+  #define INVALID_FLOAT_VAL  SNAN
+#elif defined(NAN)
+  #define INVALID_FLOAT_VAL  NAN
+#elif defined(INFINITY)
+  #define INVALID_FLOAT_VAL  INFINITY
+#else
+  #define INVALID_FLOAT_VAL  FLT_MAX
+#endif
+
+
+/*
+  uclock_sec(): time stamp in seconds for benchmarking.
+
+  With HAVE_SYS_TIMES the user CPU time of this process is returned
+  (times() ticks divided by the ticks per second from sysconf()); otherwise
+  the processor time reported by clock().
+
+  Both variants are ordinary external functions: a plain "inline" definition
+  does not provide an external definition in C99 and can fail to link.
+*/
+#if defined(HAVE_SYS_TIMES)
+  double uclock_sec(void) {
+    static double ttclk = 0.;  /* clock ticks per second, queried once */
+    struct tms t;
+    if (ttclk == 0.)
+      ttclk = (double)sysconf(_SC_CLK_TCK);
+    times(&t);
+    /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
+    return ((double)t.tms_utime) / ttclk;
+  }
+#else
+  double uclock_sec(void)
+  { return (double)clock()/(double)CLOCKS_PER_SEC; }
+#endif
+
+
+
+/* Common function pointer signatures shared by the convolution variants.
+   pfnConvolution : runs one convolution over X[0..len) into Y; the
+                    implementations in this file return the number of produced
+                    output samples. Yref and applyFlush are ignored by the
+                    slow_conv_*() variants.
+   pfnConvSetup   : creates an opaque setup from filter coefficients Hfwd[0..Nf),
+                    a block length (in/out pointer) and PFFASTCONV_* flags.
+   pfnGetConvFnPtr: returns the convolution function stored in a setup.
+   pfnConvDestroy : releases a setup. */
+typedef int            (*pfnConvolution)  (void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush);
+typedef void*          (*pfnConvSetup)    (float *Hfwd, int Nf, int * BlkLen, int flags);
+typedef pfnConvolution (*pfnGetConvFnPtr) (void * setup);
+typedef void           (*pfnConvDestroy)  (void * setup);
+
+
+/* Setup of the slow (time domain) reference convolutions, see convSetupRev(). */
+struct ConvSetup
+{
+  pfnConvolution pfn;  /* convolution function; convSetupRev() leaves it NULL */
+  int N;               /* number of filter coefficients */
+  int B;               /* block length, copied from *BlkLen */
+  float * H;           /* reversed coefficients followed by 4 INVALID_FLOAT_VAL entries */
+  int flags;           /* PFFASTCONV_* flags the setup was created with */
+};
+
+
+/*
+  Creates the setup for the slow_conv_*() reference implementations.
+
+  The N coefficients of H are stored in reversed order. With
+  PFFASTCONV_CPLX_INP_OUT each coefficient is stored twice, once for the real
+  and once for the imaginary part of the interleaved samples. Four
+  INVALID_FLOAT_VAL entries follow the coefficients to make reads beyond the
+  filter end show up in the results.
+
+  H      : filter coefficients, N entries
+  N      : number of coefficients
+  BlkLen : block length, only copied into the setup
+  flags  : PFFASTCONV_* flags
+
+  Returns the new setup (to be released with convDestroyRev()), or NULL when
+  an allocation failed.
+*/
+void * convSetupRev( float * H, int N, int * BlkLen, int flags )
+{
+  const int cplx = (flags & PFFASTCONV_CPLX_INP_OUT) ? 1 : 0;
+  const int numTaps = cplx ? 2 * N : N;  /* floats occupied by coefficients */
+  const int Nr = numTaps + 4;            /* plus the 4 guard entries */
+  struct ConvSetup * s = pffastconv_malloc( sizeof(struct ConvSetup) );
+  int i;
+  if (!s)
+    return NULL;
+  s->pfn = NULL;
+  s->N = N;
+  s->B = *BlkLen;
+  s->flags = flags;
+  s->H = pffastconv_malloc((unsigned)Nr * sizeof(float));
+  if (!s->H)
+  {
+    /* do not hand out a half-built setup */
+    pffastconv_free(s);
+    return NULL;
+  }
+  memset(s->H, 0, (unsigned)Nr * sizeof(float));
+  if (cplx)
+  {
+    for ( i = 0; i < N; ++i ) {
+      s->H[2*(N-1 -i)  ] = H[i];
+      s->H[2*(N-1 -i)+1] = H[i];
+    }
+  }
+  else
+  {
+    for ( i = 0; i < N; ++i )
+      s->H[ N-1 -i ] = H[i];
+  }
+  /* simpler detection of overruns */
+  for ( i = 0; i < 4; ++i )
+    s->H[ numTaps + i ] = INVALID_FLOAT_VAL;
+  return s;
+}
+
+/*
+  Releases a setup created by convSetupRev(): first the coefficient buffer,
+  then the setup itself. A NULL setup is accepted and ignored, in line with
+  ConvGetFnPtrRev(), which also tolerates NULL.
+*/
+void convDestroyRev( void * setup )
+{
+  struct ConvSetup * s = (struct ConvSetup*)setup;
+  if (!s)
+    return;
+  pffastconv_free(s->H);
+  pffastconv_free(setup);
+}
+
+
+/* Returns the convolution function stored in the setup, or NULL when the
+   setup itself is NULL. */
+pfnConvolution ConvGetFnPtrRev( void * setup )
+{
+  const struct ConvSetup * cs = (const struct ConvSetup*)setup;
+  return (cs != NULL) ? cs->pfn : NULL;
+}
+
+
+/* Releases a setup by forwarding to convDestroyRev(). */
+void convSimdDestroy( void * setup )
+{
+  convDestroyRev(setup);
+}
+
+
+/* Creates a setup with pffastconv_new_setup(); when that fails, the
+   parameters are printed to stdout and NULL is returned. */
+void * fastConvSetup( float * H, int N, int * BlkLen, int flags )
+{
+  void * setup = pffastconv_new_setup( H, N, BlkLen, flags );
+  if (setup != NULL)
+    return setup;
+  printf("fastConvSetup(N = %d, *BlkLen = %d, flags = %d) = NULL\n", N, *BlkLen, flags);
+  return NULL;
+}
+
+
+/* Releases a setup created by fastConvSetup() through pffastconv_destroy_setup(). */
+void fastConvDestroy( void * setup )
+{
+  pffastconv_destroy_setup( (PFFASTCONV_Setup*)setup );
+}
+
+
+
+/*
+  Straightforward time domain reference convolution.
+
+  setup  : struct ConvSetup with the reversed filter coefficients
+  input  : len samples (interleaved re/im with PFFASTCONV_CPLX_INP_OUT)
+  output : receives one result per input position at which the whole filter
+           still fits into the input
+  Yref, applyFlush : unused
+
+  Returns the number of produced output samples (complex samples in complex
+  mode).
+*/
+int slow_conv_R(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
+{
+  const struct ConvSetup * cs = (const struct ConvSetup*)setup;
+  const int isCplx = (cs->flags & PFFASTCONV_CPLX_INP_OUT) != 0;
+  const int floatsPerSample = isCplx ? 2 : 1;
+  const int numTaps = floatsPerSample * cs->N;           /* filter length in floats */
+  const int lastStart = floatsPerSample * (len - cs->N); /* last valid start offset */
+  const float * RESTRICT src = input;
+  const float * RESTRICT taps = cs->H;
+  float * RESTRICT dst = output;
+  int pos, k;
+  (void)Yref;
+  (void)applyFlush;
+
+  if (!isCplx)
+  {
+    for ( pos = 0; pos <= lastStart; ++pos )
+    {
+      float acc = 0.0F;
+      for ( k = 0; k < numTaps; ++k )
+        acc += src[pos+k] * taps[k];
+      dst[pos] = acc;
+    }
+    return pos;
+  }
+
+  /* complex samples: real and imaginary parts are filtered independently */
+  for ( pos = 0; pos <= lastStart; pos += 2 )
+  {
+    float accRe = 0.0F, accIm = 0.0F;
+    for ( k = 0; k < numTaps; k += 2 )
+    {
+      accRe += src[pos+k  ] * taps[k];
+      accIm += src[pos+k+1] * taps[k+1];
+    }
+    dst[pos  ] = accRe;
+    dst[pos+1] = accIm;
+  }
+  return pos/2;
+}
+
+
+
+/*
+  Time domain reference convolution, variant A: same parameters and return
+  value as slow_conv_R(), but the inner dot product is unrolled by 4 into
+  four partial sums. When the filter length in floats (Nr) is not a multiple
+  of 4, the remaining taps are accumulated in a separate tail loop.
+  Yref and applyFlush are unused.
+*/
+int slow_conv_A(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
+{
+  float sum[4];
+  struct ConvSetup * p = (struct ConvSetup*)setup;
+  const float * RESTRICT X = input;
+  const float * RESTRICT Hrev = p->H;
+  float * RESTRICT Y = output;
+  /* filter length and last valid start offset, both counted in floats */
+  const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
+  const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
+  int i, j;
+  (void)Yref;
+  (void)applyFlush;
+
+  if (p->flags & PFFASTCONV_CPLX_INP_OUT)
+  {
+    /* interleaved complex data: sum[0], sum[2] collect real parts, sum[1], sum[3] imaginary parts */
+    if ( (Nr & 3) == 0 )
+    {
+      for ( i = 0; i <= lenNr; i += 2 )
+      {
+        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+        for (j = 0; j < Nr; j += 4 )
+        {
+          sum[0] += X[i+j]   * Hrev[j];
+          sum[1] += X[i+j+1] * Hrev[j+1];
+          sum[2] += X[i+j+2] * Hrev[j+2];
+          sum[3] += X[i+j+3] * Hrev[j+3];
+        }
+        Y[i  ] = sum[0] + sum[2];
+        Y[i+1] = sum[1] + sum[3];
+      }
+    }
+    else
+    {
+      /* M: largest multiple of 4 not above Nr; one complex tap remains for the tail */
+      const int M = Nr & (~3);
+      for ( i = 0; i <= lenNr; i += 2 )
+      {
+        float tailSumRe = 0.0F, tailSumIm = 0.0F;
+        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+        for (j = 0; j < M; j += 4 )
+        {
+          sum[0] += X[i+j  ] * Hrev[j  ];
+          sum[1] += X[i+j+1] * Hrev[j+1];
+          sum[2] += X[i+j+2] * Hrev[j+2];
+          sum[3] += X[i+j+3] * Hrev[j+3];
+        }
+        for ( ; j < Nr; j += 2 ) {
+          tailSumRe += X[i+j  ] * Hrev[j  ];
+          tailSumIm += X[i+j+1] * Hrev[j+1];
+        }
+        Y[i  ] = ( sum[0] + sum[2] ) + tailSumRe;
+        Y[i+1] = ( sum[1] + sum[3] ) + tailSumIm;
+      }
+    }
+    /* i counts floats; two floats per complex output sample */
+    return i/2;
+  }
+  else
+  {
+    if ( (Nr & 3) == 0 )
+    {
+      for ( i = 0; i <= lenNr; ++i )
+      {
+        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+        for (j = 0; j < Nr; j += 4 )
+        {
+          sum[0] += X[i+j]   * Hrev[j];
+          sum[1] += X[i+j+1] * Hrev[j+1];
+          sum[2] += X[i+j+2] * Hrev[j+2];
+          sum[3] += X[i+j+3] * Hrev[j+3];
+        }
+        Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
+      }
+      return i;
+    }
+    else
+    {
+      /* M: largest multiple of 4 not above Nr; up to 3 taps remain for the tail */
+      const int M = Nr & (~3);
+      /* printf("A: Nr = %d, M = %d, H[M] = %f, H[M+1] = %f, H[M+2] = %f, H[M+3] = %f\n", Nr, M, Hrev[M], Hrev[M+1], Hrev[M+2], Hrev[M+3] ); */
+      for ( i = 0; i <= lenNr; ++i )
+      {
+        float tailSum = 0.0;
+        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+        for (j = 0; j < M; j += 4 )
+        {
+          sum[0] += X[i+j]   * Hrev[j];
+          sum[1] += X[i+j+1] * Hrev[j+1];
+          sum[2] += X[i+j+2] * Hrev[j+2];
+          sum[3] += X[i+j+3] * Hrev[j+3];
+        }
+        for ( ; j < Nr; ++j )
+          tailSum += X[i+j] * Hrev[j];
+        Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
+      }
+      return i;
+    }
+  }
+}
+
+
+/*
+  Time domain reference convolution, variant B: same parameters and return
+  value as slow_conv_R(), unrolled by 4 like slow_conv_A(). In addition, with
+  PFFASTCONV_SYMMETRIC only the first half of the coefficients is read and
+  each coefficient is multiplied with the sum of the two input samples it
+  applies to. Yref and applyFlush are unused.
+*/
+int slow_conv_B(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
+{
+  float sum[4];
+  struct ConvSetup * p = (struct ConvSetup*)setup;
+  (void)Yref;
+  (void)applyFlush;
+  if (p->flags & PFFASTCONV_SYMMETRIC)
+  {
+    const float * RESTRICT X = input;
+    const float * RESTRICT Hrev = p->H;
+    float * RESTRICT Y = output;
+    /* filter length and last valid start offset, both counted in floats */
+    const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
+    const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
+    /* h: start of the last group of 4 in the first filter half,
+       E: start of the last group of 4 of the whole filter.
+       NOTE(review): there is no tail loop here, so this looks like it expects
+       Nr to be a multiple of 8 - confirm against the callers. */
+    const int h = Nr / 2 -4;
+    const int E = Nr -4;
+    int i, j;
+
+    if (p->flags & PFFASTCONV_CPLX_INP_OUT)
+    {
+      for ( i = 0; i <= lenNr; i += 2 )
+      {
+        const int k = i + E;
+        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+        for (j = 0; j <= h; j += 4 )
+        {
+          /* mirrored partner of a complex sample keeps the re/im order */
+          sum[0] += Hrev[j  ] * ( X[i+j  ] + X[k-j+2] );
+          sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+3] );
+          sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j  ] );
+          sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j+1] );
+        }
+        Y[i  ] = sum[0] + sum[2];
+        Y[i+1] = sum[1] + sum[3];
+      }
+      return i/2;
+    }
+    else
+    {
+      for ( i = 0; i <= lenNr; ++i )
+      {
+        const int k = i + E;
+        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+        for (j = 0; j <= h; j += 4 )
+        {
+          /* X[k-j+3] is the sample at the mirrored tap position Nr-1-j */
+          sum[0] += Hrev[j  ] * ( X[i+j  ] + X[k-j+3] );
+          sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+2] );
+          sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j+1] );
+          sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j  ] );
+        }
+        Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
+      }
+      return i;
+    }
+  }
+  else
+  {
+    const float * RESTRICT X = input;
+    const float * RESTRICT Hrev = p->H;
+    float * RESTRICT Y = output;
+    /* filter length and last valid start offset, both counted in floats */
+    const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
+    const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
+    int i, j;
+
+    if (p->flags & PFFASTCONV_CPLX_INP_OUT)
+    {
+      /* NOTE(review): unlike slow_conv_A() there is no tail handling in this
+         branch; for an Nr that is not a multiple of 4 the last iteration reads
+         Hrev[] and X[] beyond Nr - confirm that callers never pass such sizes. */
+      for ( i = 0; i <= lenNr; i += 2 )
+      {
+        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+        for (j = 0; j < Nr; j += 4 )
+        {
+          sum[0] += X[i+j]   * Hrev[j];
+          sum[1] += X[i+j+1] * Hrev[j+1];
+          sum[2] += X[i+j+2] * Hrev[j+2];
+          sum[3] += X[i+j+3] * Hrev[j+3];
+        }
+        Y[i  ] = sum[0] + sum[2];
+        Y[i+1] = sum[1] + sum[3];
+      }
+      return i/2;
+    }
+    else
+    {
+      if ( (Nr & 3) == 0 )
+      {
+        for ( i = 0; i <= lenNr; ++i )
+        {
+          sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+          for (j = 0; j < Nr; j += 4 )
+          {
+            sum[0] += X[i+j]   * Hrev[j];
+            sum[1] += X[i+j+1] * Hrev[j+1];
+            sum[2] += X[i+j+2] * Hrev[j+2];
+            sum[3] += X[i+j+3] * Hrev[j+3];
+          }
+          Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]);
+        }
+        return i;
+      }
+      else
+      {
+        /* M: largest multiple of 4 not above Nr; up to 3 taps remain for the tail */
+        const int M = Nr & (~3);
+        /* printf("B: Nr = %d\n", Nr ); */
+        for ( i = 0; i <= lenNr; ++i )
+        {
+          float tailSum = 0.0;
+          sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+          for (j = 0; j < M; j += 4 )
+          {
+            sum[0] += X[i+j]   * Hrev[j];
+            sum[1] += X[i+j+1] * Hrev[j+1];
+            sum[2] += X[i+j+2] * Hrev[j+2];
+            sum[3] += X[i+j+3] * Hrev[j+3];
+          }
+          for ( ; j < Nr; ++j )
+            tailSum += X[i+j] * Hrev[j];
+          Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
+        }
+        return i;
+      }
+    }
+  }
+
+}
+
+
+/* Adapter that gives pffastconv_apply() the generic convolution signature used
+ * by the algorithm table in test(): 'setup' is cast to PFFASTCONV_Setup*,
+ * 'Yref' is ignored and the value returned by pffastconv_apply() is passed
+ * through unchanged. */
+int fast_conv(void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush)
+{
+  PFFASTCONV_Setup * const conv = (PFFASTCONV_Setup*)setup;
+  const int numOut = pffastconv_apply( conv, X, len, Y, applyFlush );
+  (void)Yref;  /* unused - only present to match the common signature */
+  return numOut;
+}
+
+
+
+/* Debug helper: dumps the first N values of V, 'perLine' values per row, each
+ * row starting with the label 'st' and the index of its first value.
+ * The dump is compiled out (#if 0), so the function currently does nothing. */
+void printFirst( const float * V, const char * st, const int N, const int perLine )
+{
+#if 0
+  int idx;
+  for ( idx = 0; idx < N; ++idx )
+  {
+    if ( (idx % perLine) == 0 )
+      printf("\n%s[%d]", st, idx);
+    printf("\t%.1f", V[idx]);
+  }
+  printf("\n");
+#else
+  /* silence unused-parameter warnings while the dump is disabled */
+  (void)V;
+  (void)st;
+  (void)N;
+  (void)perLine;
+#endif
+}
+
+
+
+/* Number of algorithm/result slots in the tables of test().
+ * NOTE(review): those tables list only 14 initializers (columns 0..13), so
+ * slot 14 is zero-initialized and never run - confirm whether 14 was meant. */
+#define NUMY       15
+
+
+/* Runs all enabled convolution algorithms for one filter length and compares
+ * their output against the reference algorithm in slot 0.
+ *
+ *   FILTERLEN          number of filter coefficients (floats allocated for H)
+ *   convFlags          PFFASTCONV_* flags handed to every setup function;
+ *                      PFFASTCONV_CPLX_INP_OUT selects interleaved complex data,
+ *                      PFFASTCONV_SYMMETRIC selects mirrored coefficients
+ *   testOutLen         != 0: apply each algorithm once to the whole input (with
+ *                      flush) and compare the output lengths with the reference;
+ *                      == 0: benchmark each algorithm block-wise for
+ *                      BENCH_TEST_DURATION_IN_SEC
+ *   printDbg           print timing/size details per algorithm
+ *   printSpeed         print the speed summary (or benchmark table)
+ *   abortFirstFastAlgo run only the first enabled fast algorithm
+ *   printErrValues     print up to 6 samples per algorithm deviating >= yErrLimit
+ *   printAsCSV         print benchmark results as CSV rows
+ *   pIsFirstFilterLen  in/out: while non-zero the CSV column header is printed;
+ *                      cleared after printing it
+ *
+ * Returns 1 if (in testOutLen mode) an algorithm's output length differs from
+ * the reference length, else 0.
+ */
+int test(int FILTERLEN, int convFlags, const int testOutLen, int printDbg, int printSpeed, int abortFirstFastAlgo, int printErrValues, int printAsCSV, int *pIsFirstFilterLen) {
+  /* td/tdref start defined: they are read even when a setup function fails */
+  double t0, t1, tstop, td = 0.0, tdref = 0.0;
+  float *X, *H;
+  float *Y[NUMY];
+  int64_t outN[NUMY];
+  /* 256 KFloats or 16 MFloats data */
+#if 1
+  const int len = testOutLen ? (1 << 18) : (1 << 24);
+#elif 0
+  const int len = testOutLen ? (1 << 18) : (1 << 13);
+#else
+  const int len = testOutLen ? (1 << 18) : (1024);
+#endif
+  const int cplxFactor = ( convFlags & PFFASTCONV_CPLX_INP_OUT ) ? 2 : 1;
+  const int lenC = len / cplxFactor;
+
+  int yi, yc, posMaxErr;
+  float yRangeMin, yRangeMax, yErrLimit, maxErr = 0.0;
+  int i, j, numErrOverLimit, iter;
+  int retErr = 0;
+
+  /*                                  0               1               2               3                   4                   5                   6                   7                   8                      9,                   10,                  11,                   12,                   13                     */
+  pfnConvSetup   aSetup[NUMY]     = { convSetupRev,   convSetupRev,   convSetupRev,   fastConvSetup,      fastConvSetup,      fastConvSetup,      fastConvSetup,      fastConvSetup,      fastConvSetup,         fastConvSetup,       fastConvSetup,       fastConvSetup,        fastConvSetup,        fastConvSetup,         };
+  pfnConvDestroy aDestroy[NUMY]   = { convDestroyRev, convDestroyRev, convDestroyRev, fastConvDestroy,    fastConvDestroy,    fastConvDestroy,    fastConvDestroy,    fastConvDestroy,    fastConvDestroy,       fastConvDestroy,     fastConvDestroy,     fastConvDestroy,      fastConvDestroy,      fastConvDestroy,       };
+  pfnGetConvFnPtr aGetFnPtr[NUMY] = { NULL,           NULL,           NULL,           NULL,               NULL,               NULL,               NULL,               NULL,               NULL,                  NULL,                NULL,                NULL,                 NULL,                 NULL,                  };
+  pfnConvolution aConv[NUMY]      = { slow_conv_R,    slow_conv_A,    slow_conv_B,    fast_conv,          fast_conv,          fast_conv,          fast_conv,          fast_conv,          fast_conv,             fast_conv,           fast_conv,           fast_conv,            fast_conv,            fast_conv,             };
+  const char * convText[NUMY]     = { "R(non-simd)",  "A(non-simd)",  "B(non-simd)",  "fast_conv_64",     "fast_conv_128",    "fast_conv_256",    "fast_conv_512",    "fast_conv_1K",     "fast_conv_2K",        "fast_conv_4K",      "fast_conv_8K",      "fast_conv_16K",      "fast_conv_32K",      "fast_conv_64K",       };
+  int    aFastAlgo[NUMY]          = { 0,              0,              0,              1,                  1,                  1,                  1,                  1,                  1,                     1,                   1,                   1,                    1,                    1,                     };
+  void * aSetupCfg[NUMY]          = { NULL,           NULL,           NULL,           NULL,               NULL,               NULL,               NULL,               NULL,               NULL,                  NULL,                NULL,                NULL,                 NULL,                 NULL,                  };
+//int    aBlkLen[NUMY]            = { 1024,           1024,           1024,           64,                 128,                256,                512,                1024,               2048,                  4096,                8192,                16384,                32768,                65536,                 };
+  int    aBlkLen[NUMY]            = { 8192,           8192,           8192,           64,                 128,                256,                512,                1024,               2048,                  4096,                8192,                16384,                32768,                65536,                 };
+  /* a fast algorithm is only enabled when the filter is shorter than its block length */
+#if 1
+  int    aRunAlgo[NUMY]           = { 1,              1,              1,              FILTERLEN<64,       FILTERLEN<128,      FILTERLEN<256,      FILTERLEN<512,      FILTERLEN<1024,     FILTERLEN<2048,        FILTERLEN<4096,      FILTERLEN<8192,      FILTERLEN<16384,      FILTERLEN<32768,      FILTERLEN<65536,       };
+#elif 0
+  int    aRunAlgo[NUMY]           = { 1,              0,              0,              0 && FILTERLEN<64,  1 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048,  0 && FILTERLEN<4096, 0 && FILTERLEN<8192, 0 && FILTERLEN<16384, 0 && FILTERLEN<32768, 0 && FILTERLEN<65536,  };
+#else
+  int    aRunAlgo[NUMY]           = { 1,              1,              1,              0 && FILTERLEN<64,  0 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048,  0 && FILTERLEN<4096, 0 && FILTERLEN<8192, 0 && FILTERLEN<16384, 0 && FILTERLEN<32768, 0 && FILTERLEN<65536,  };
+#endif
+  double aSpeedFactor[NUMY], aDuration[NUMY], procSmpPerSec[NUMY];
+  int aNumIters[NUMY], aNumLoops[NUMY];
+
+  /* 4 extra floats hold the INVALID_FLOAT_VAL sentinels written behind the input below */
+  X = pffastconv_malloc( (unsigned)(len+4) * sizeof(float) );
+  for ( i=0; i < NUMY; ++i)
+  {
+    if ( 1 || i < 2 )
+      Y[i] = pffastconv_malloc( (unsigned)len * sizeof(float) );
+    else
+      Y[i] = Y[1];
+
+    Y[i][0] = 123.F;  /* test for pffft_zconvolve_no_accu() */
+    aSpeedFactor[i] = -1.0;
+    aDuration[i] = -1.0;
+    procSmpPerSec[i] = -1.0;
+    aNumIters[i] = 0;
+    aNumLoops[i] = 0;
+  }
+
+  H = pffastconv_malloc((unsigned)FILTERLEN * sizeof(float));
+
+  /* initialize input */
+  if ( convFlags & PFFASTCONV_CPLX_INP_OUT )
+  {
+    for ( i = 0; i < lenC; ++i )
+    {
+      X[2*i  ] = (float)(i % 4093);  /* 4093 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
+      X[2*i+1] = (float)((i+2048) % 4093);
+    }
+  }
+  else
+  {
+    for ( i = 0; i < len; ++i )
+      X[i] = (float)(i % 4093);  /* 4093 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
+  }
+  /* sentinels directly behind the valid input (presumably to expose reads past its end) */
+  X[ len    ] = INVALID_FLOAT_VAL;
+  X[ len +1 ] = INVALID_FLOAT_VAL;
+  X[ len +2 ] = INVALID_FLOAT_VAL;
+  X[ len +3 ] = INVALID_FLOAT_VAL;
+
+  if (!testOutLen)
+    printFirst( X, "X", 64, 8 );
+
+  /* filter coeffs */
+  memset( H, 0, FILTERLEN * sizeof(float) );
+#if 1
+  if ( convFlags & PFFASTCONV_SYMMETRIC )
+  {
+    /* mirrored coefficients; for odd FILTERLEN the center tap stays 0 from the memset */
+    const int half = FILTERLEN / 2;
+    for ( j = 0; j < half; ++j ) {
+      switch (j % 3) {
+        case 0: H[j] = H[FILTERLEN-1-j] = -1.0F;  break;
+        case 1: H[j] = H[FILTERLEN-1-j] =  1.0F;  break;
+        case 2: H[j] = H[FILTERLEN-1-j] =  0.5F;  break;
+      }
+    }
+  }
+  else
+  {
+    for ( j = 0; j < FILTERLEN; ++j ) {
+      switch (j % 3) {
+        case 0: H[j] = -1.0F;  break;
+        case 1: H[j] = 1.0F;   break;
+        case 2: H[j] = 0.5F;   break;
+      }
+    }
+  }
+#else
+  H[0] = 1.0F;
+  H[FILTERLEN -1] = 1.0F;
+#endif
+  if (!testOutLen)
+    printFirst( H, "H", FILTERLEN, 8 );
+
+  if (!printAsCSV)
+  {
+    printf("\n");
+    printf("filterLen = %d\t%s%s\t%s:\n", FILTERLEN,
+      ((convFlags & PFFASTCONV_CPLX_INP_OUT)?"cplx":"real"),
+      (convFlags & PFFASTCONV_CPLX_INP_OUT)?((convFlags & PFFASTCONV_CPLX_SINGLE_FFT)?" single":" 2x") : "",
+      ((convFlags & PFFASTCONV_SYMMETRIC)?"symmetric":"non-sym") );
+  }
+
+  int hadFastAlgo = 0;
+
+  /* single pass - the loop only exists so that 'break' can leave the block */
+  while (1)
+  {
+
+    for ( yi = 0; yi < NUMY; ++yi )
+    {
+      if (!aRunAlgo[yi])
+        continue;
+
+      /* with abortFirstFastAlgo every fast algorithm after the first one is disabled */
+      if ( aFastAlgo[yi] && abortFirstFastAlgo && hadFastAlgo )
+      {
+        aRunAlgo[yi] = 0;
+        continue;
+      }
+
+      hadFastAlgo = hadFastAlgo | aFastAlgo[yi];
+
+      aSetupCfg[yi] = aSetup[yi]( H, FILTERLEN, &aBlkLen[yi], convFlags );
+
+      /* get effective apply function ptr */
+      if ( aSetupCfg[yi] && aGetFnPtr[yi] )
+        aConv[yi] = aGetFnPtr[yi]( aSetupCfg[yi] );
+
+      if ( aSetupCfg[yi] && aConv[yi] )
+      {
+        if (testOutLen)
+        {
+          t0 = uclock_sec();
+          outN[yi] = aConv[yi]( aSetupCfg[yi], X, lenC, Y[yi], Y[0], 1 /* applyFlush */ );
+          t1 = uclock_sec();
+          td = t1 - t0;
+          /* the speed summary of this mode prints aDuration[] - keep it valid */
+          aDuration[yi] = td;
+        }
+        else
+        {
+          //const int blkLen = 4096;  /* required for 'fast_conv_4K' */
+          const int blkLen = aBlkLen[yi];
+          int64_t offC = 0, offS, Nout;
+          int k;
+          iter = 0;
+          outN[yi] = 0;
+          aNumLoops[yi] = 1;
+          t0 = uclock_sec();
+          tstop = t0 + BENCH_TEST_DURATION_IN_SEC;
+          do
+          {
+            const int prev_iter = iter;
+            /* process up to 128 blocks between two clock reads */
+            for ( k = 0; k < 128 && offC +blkLen < lenC; ++k )
+            {
+              offS = cplxFactor * offC;
+              Nout = aConv[yi]( aSetupCfg[yi], X +offS, blkLen, Y[yi] +offS, Y[0], 0 /* applyFlush */ );
+              offC += Nout;
+              ++iter;
+              if ( !Nout )
+                break;
+            }
+            //if ( !Nout )
+            //  break;
+            t1 = uclock_sec();
+            if ( prev_iter == iter )    // restart from begin of input?
+            {
+                offC = 0;
+                ++aNumLoops[yi];
+            }
+          } while ( t1 < tstop );
+          outN[yi] = offC;
+          td = t1 - t0;
+          procSmpPerSec[yi] = cplxFactor * (double)outN[yi] * (1.0 / td);
+          aNumIters[yi] = iter;
+          aDuration[yi] = td;
+
+          //printf("algo '%s':\t%.2f MSmp\tin\t%.1f ms\t= %g kSmpPerSec\t%d iters\t%.1f ms\n",
+          //  convText[yi], (double)outN[yi]/(1000.0 * 1000.0), 1000.0 * aDuration[yi], procSmpPerSec[yi] * 0.001, aNumIters[yi], 1000.0 * td );
+        }
+      }
+      else
+      {
+        outN[yi] = 0;
+      }
+      if ( yi == 0 ) {
+        /* slot 0 is the reference: remember its duration and derive the error limit from its output range */
+        const float * Yvals = Y[0];
+        const int64_t refOutLen = cplxFactor * outN[0];
+        tdref = td;
+        if (printDbg) {
+          printf("convolution '%s' took: %f ms\n", convText[yi], td*1000.0);
+          printf("  convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
+        }
+        aSpeedFactor[yi] = 1.0;
+        /* start the maximum at the lowest finite float: FLT_MIN is the smallest
+         * POSITIVE value and would hide an all-negative output range */
+        yRangeMin = FLT_MAX;
+        yRangeMax = -FLT_MAX;
+        for ( i = 0; i < refOutLen; ++i )
+        {
+          if ( yRangeMax < Yvals[i] )  yRangeMax = Yvals[i];
+          if ( yRangeMin > Yvals[i] )  yRangeMin = Yvals[i];
+        }
+        /* allowed deviation: 1/100000 of the reference output range */
+        yErrLimit = fabsf(yRangeMax - yRangeMin) / ( 100.0F * 1000.0F );
+        /* yErrLimit = 0.01F; */
+        if (testOutLen) {
+          if (1) {
+            printf("reference output len = %" PRId64 " smp\n", outN[0]);
+            printf("reference output range |%.1f ..%.1f| = %.1f ==> err limit = %f\n", yRangeMin, yRangeMax, yRangeMax - yRangeMin, yErrLimit);
+          }
+          printFirst( Yvals, "Yref", 64, 8 );
+        }
+      }
+      else
+      {
+        aSpeedFactor[yi] = tdref / td;
+        if (printDbg) {
+          printf("\nconvolution '%s' took: %f ms == %f %% == %f X\n", convText[yi], td*1000.0, td * 100 / tdref, tdref / td);
+          printf("  convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
+        }
+      }
+    }
+
+    int iMaxSpeedSlowAlgo = -1;
+    int iFirstFastAlgo = -1;
+    int iMaxSpeedFastAlgo = -1;
+    int iPrintedRefOutLen = 0;
+    {
+      /* find the fastest slow algorithm (excluding reference slot 0), the first and the fastest fast algorithm */
+      for ( yc = 1; yc < NUMY; ++yc )
+      {
+        if (!aRunAlgo[yc])
+          continue;
+        if (aFastAlgo[yc]) {
+          if ( iMaxSpeedFastAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedFastAlgo] )
+            iMaxSpeedFastAlgo = yc;
+
+          if (iFirstFastAlgo < 0)
+            iFirstFastAlgo = yc;
+        }
+        else
+        {
+          if ( iMaxSpeedSlowAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedSlowAlgo] )
+            iMaxSpeedSlowAlgo = yc;
+        }
+      }
+
+      if (printSpeed)
+      {
+        if (testOutLen)
+        {
+          if (iMaxSpeedSlowAlgo >= 0 )
+            printf("fastest slow algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedSlowAlgo], aSpeedFactor[iMaxSpeedSlowAlgo], 1000.0 * aDuration[iMaxSpeedSlowAlgo]);
+          if (0 != iMaxSpeedSlowAlgo && aRunAlgo[0])
+            printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[0], aSpeedFactor[0], 1000.0 * aDuration[0]);
+          if (1 != iMaxSpeedSlowAlgo && aRunAlgo[1])
+            printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[1], aSpeedFactor[1], 1000.0 * aDuration[1]);
+
+          if (iFirstFastAlgo >= 0 && iFirstFastAlgo != iMaxSpeedFastAlgo && aRunAlgo[iFirstFastAlgo])
+            printf("first   fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo],    aSpeedFactor[iFirstFastAlgo],    1000.0 * aDuration[iFirstFastAlgo]);
+          if (iFirstFastAlgo >= 0 && iFirstFastAlgo+1 != iMaxSpeedFastAlgo && iFirstFastAlgo+1 < NUMY && aRunAlgo[iFirstFastAlgo+1])
+            printf("2nd     fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo+1],  aSpeedFactor[iFirstFastAlgo+1],  1000.0 * aDuration[iFirstFastAlgo+1]);
+
+          if ( 0 <= iMaxSpeedFastAlgo && iMaxSpeedFastAlgo < NUMY && aRunAlgo[iMaxSpeedFastAlgo] )
+          {
+            printf("fastest fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedFastAlgo], aSpeedFactor[iMaxSpeedFastAlgo], 1000.0 * aDuration[iMaxSpeedFastAlgo]);
+            if ( 0 <= iMaxSpeedSlowAlgo && iMaxSpeedSlowAlgo < NUMY && aRunAlgo[iMaxSpeedSlowAlgo] )
+              printf("fast / slow ratio: %f X\n", aSpeedFactor[iMaxSpeedFastAlgo] / aSpeedFactor[iMaxSpeedSlowAlgo] );
+          }
+          printf("\n");
+        }
+        else
+        {
+          // print columns in 1st line
+          if (printAsCSV && *pIsFirstFilterLen)
+          {
+            printf("\n# filterLen, filterOrder, Re/Cx, type, sym, ");
+            for ( yc = 0; yc < NUMY; ++yc )
+            {
+              if (!aRunAlgo[yc] || procSmpPerSec[yc] <= 0.0)
+                continue;
+              if (printAsCSV)
+                printf("%s, ", convText[yc]);
+            }
+            *pIsFirstFilterLen = 0;
+          }
+
+          for ( yc = 0; yc < NUMY; ++yc )
+          {
+            if (!yc)
+            {
+              /* row prefix, printed once before the first algorithm column */
+              double filterExp = log10((double)FILTERLEN) / log10(2.0);
+              printf("\n%5d, %5.1f, %s, %s, %s, ", FILTERLEN, filterExp,
+                     ((convFlags & PFFASTCONV_CPLX_INP_OUT)?"cplx":"real"),
+                     (convFlags & PFFASTCONV_CPLX_INP_OUT)?((convFlags & PFFASTCONV_CPLX_SINGLE_FFT)?" single":" 2x") : "",
+                     ((convFlags & PFFASTCONV_SYMMETRIC)?"symmetric":"non-sym")
+                     );
+            }
+            if (!aRunAlgo[yc] || procSmpPerSec[yc] <= 0.0)
+              continue;
+            if (printAsCSV)
+              printf("%.0f, ", procSmpPerSec[yc] * 0.001);
+            else
+              printf("algo '%s':\t%.2f MSmp\tin\t%.1f ms\t= %g kSmpPerSec\t%d iters\t%d loops\n",
+                     convText[yc], (double)outN[yc]/(1000.0 * 1000.0), 1000.0 * aDuration[yc], procSmpPerSec[yc] * 0.001, aNumIters[yc], aNumLoops[yc] );
+          }
+        }
+
+      }
+    }
+
+
+    /* compare every algorithm that ran against the reference output */
+    for ( yc = 1; yc < NUMY; ++yc )
+    {
+      const float * Yref;
+      const float * Ycurr;
+      int outMin;
+
+      if (!aRunAlgo[yc])
+        continue;
+
+      if (printDbg)
+        printf("\n");
+
+      if ( outN[yc] == 0 )
+      {
+        if (!printAsCSV)
+          printf("output size 0: '%s' not implemented\n", convText[yc]);
+      }
+      else if ( outN[0] != outN[yc] /* && aFastAlgo[yc] */ && testOutLen )
+      {
+        /* a length mismatch is the only condition that makes test() fail */
+        if (!iPrintedRefOutLen)
+        {
+          printf("reference output size = %" PRId64 ", delta to (cplx) input length = %" PRId64 " smp\n", outN[0], (len / cplxFactor) - outN[0]);
+          iPrintedRefOutLen = 1;
+        }
+        printf("output size doesn't match!: ref (FILTERLEN %d) returned %" PRId64 " smp, '%s' returned %" PRId64 " smp : delta = %" PRId64 " smp\n",
+          FILTERLEN, outN[0], convText[yc], outN[yc], outN[yc] - outN[0] );
+        retErr = 1;
+      }
+
+      posMaxErr = 0;
+      maxErr = -1.0;
+      Yref = Y[0];
+      Ycurr = Y[yc];
+      /* NOTE(review): outMin is not scaled by cplxFactor, so for complex data only
+       * the first outMin floats are compared - confirm whether that is intended */
+      outMin = ( outN[yc] < outN[0] ) ? outN[yc] : outN[0];
+      numErrOverLimit = 0;
+      for ( i = 0; i < outMin; ++i )
+      {
+        if ( numErrOverLimit < 6 && fabs(Ycurr[i] - Yref[i]) >= yErrLimit && printErrValues )
+        {
+          printf("algo '%s': at %d: ***ERROR*** = %f, errLimit = %f, ref = %f, actual = %f\n",
+            convText[yc], i, fabs(Ycurr[i] - Yref[i]), yErrLimit, Yref[i], Ycurr[i] );
+          ++numErrOverLimit;
+        }
+
+        if ( fabs(Ycurr[i] - Yref[i]) > maxErr )
+        {
+          maxErr = fabsf(Ycurr[i] - Yref[i]);
+          posMaxErr = i;
+        }
+      }
+
+      /* report the fastest slow and the fastest fast algorithm: compare with the
+       * algorithm index yc (not the exhausted sample index i); suppressed in CSV
+       * mode so that the table stays machine readable */
+      if ( printDbg || ( !printAsCSV && ( (iMaxSpeedSlowAlgo == yc) || (iMaxSpeedFastAlgo == yc) ) ) )
+        printf("max difference for '%s' is %g at sample idx %d of max inp 4093-1 == %f %%\n", convText[yc], maxErr, posMaxErr, maxErr * 100.0 / 4092.0 );
+    }
+
+    break;
+  }
+
+  pffastconv_free(X);
+  for ( i=0; i < NUMY; ++i)
+  {
+    if ( 1 || i < 2 )
+      pffastconv_free( Y[i] );
+    /* setups were only created for algorithms that are still enabled */
+    if (!aRunAlgo[i])
+      continue;
+    aDestroy[i]( aSetupCfg[i] );
+  }
+
+  pffastconv_free(H);
+
+  return retErr;
+}
+
+/* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions.
+ * main() below calls validate_pffft_simd_ex(stdout) for '--test-simd' and treats
+ * the returned value as the number of detected errors. */
+void validate_pffft_simd();
+int  validate_pffft_simd_ex(FILE * DbgOut);
+
+
+/* Command line driver for the fast convolution tests and benchmarks.
+ *
+ * Options:
+ *   --test-simd  run validate_pffft_simd_ex() only and exit (status 1 on errors)
+ *   --no-len     skip the output length tests
+ *   --no-bench   skip the benchmark
+ *   --quick      fewer filter lengths in both parts
+ *   --slow       benchmark: skip the filter lengths up to 128
+ *   --real       real input only (disables the complex runs)
+ *   --cplx       complex input only (disables the real run)
+ *   --sym        use symmetric filter coefficients
+ * Any other argument prints the usage line and exits with status 1.
+ *
+ * Both parts iterate k = 0..2: real, complex with 2 FFTs ("2x") and complex
+ * with a single FFT ("single").
+ * Returns the OR of all test() results, i.e. non-zero if any test failed.
+ */
+int main(int argc, char **argv)
+{
+  int result = 0;
+  int i, k, M, flagsA, flagsB, flagsC, testOutLen, printDbg, printSpeed;
+  int testOutLens = 1, benchConv = 1, quickTest = 0, slowTest = 0;
+  int testReal = 1, testCplx = 1, testSymetric = 0, abortFirstFastAlgo = 1, printErrValues = 0, printAsCSV = 1;
+  int isFirstFilterLen = 1;
+
+  for ( i = 1; i < argc; ++i ) {
+
+    if (!strcmp(argv[i], "--test-simd")) {
+      int numErrs = validate_pffft_simd_ex(stdout);
+      fprintf( ( numErrs != 0 ? stderr : stdout ), "validate_pffft_simd_ex() returned %d errors!\n", numErrs);
+      return ( numErrs > 0 ? 1 : 0 );
+    }
+
+    if (!strcmp(argv[i], "--no-len")) {
+      testOutLens = 0;
+    }
+    else if (!strcmp(argv[i], "--no-bench")) {
+      benchConv = 0;
+    }
+    else if (!strcmp(argv[i], "--quick")) {
+      quickTest = 1;
+    }
+    else if (!strcmp(argv[i], "--slow")) {
+      slowTest = 1;
+    }
+    else if (!strcmp(argv[i], "--real")) {
+      testCplx = 0;
+    }
+    else if (!strcmp(argv[i], "--cplx")) {
+      testReal = 0;
+    }
+    else if (!strcmp(argv[i], "--sym")) {
+      testSymetric = 1;
+    }
+    else /* if (!strcmp(argv[i], "--help")) */ {
+      printf("usage: %s [--test-simd] [--no-len] [--no-bench] [--quick|--slow] [--real|--cplx] [--sym]\n", argv[0]);
+      exit(1);
+    }
+  }
+
+
+  if (testOutLens)
+  {
+    for ( k = 0; k < 3; ++k )
+    {
+      if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
+        continue;
+      printf("\n\n==========\n");
+      printf("testing %s %s output lengths ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
+      printf("==========\n");
+      flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
+      flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
+      /* only the 3rd pass (k == 2) uses the single-FFT variant - same rule as in
+       * the benchmark below; passing flagsB here would repeat the "2x" run while
+       * announcing "single" */
+      flagsC = flagsB | ( k == 2 ? PFFASTCONV_CPLX_SINGLE_FFT : 0 );
+      testOutLen = 1;
+      printDbg = 0;
+      printSpeed = 0;
+      for ( M = 128 - 4; M <= (quickTest ? 128+16 : 256); ++M )
+      {
+        /* symmetric filters are only tested for lengths that are multiples of 16 */
+        if ( (M % 16) != 0 && testSymetric )
+          continue;
+        result |= test(M, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, 0, &isFirstFilterLen);
+      }
+    }
+  }
+
+  if (benchConv)
+  {
+      printf("quickTest is %d\n", quickTest);
+      printf("slowTest is %d\n", slowTest);
+
+    for ( k = 0; k < 3; ++k )
+    {
+      if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
+        continue;
+      if (!printAsCSV)
+      {
+        printf("\n\n==========\n");
+        printf("starting %s %s benchmark against linear convolutions ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
+        printf("==========\n");
+      }
+      flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
+      flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
+      flagsC = flagsB | ( k == 2 ? PFFASTCONV_CPLX_SINGLE_FFT : 0 );
+      testOutLen = 0;
+      printDbg = 0;
+      printSpeed = 1;
+      /* short filters (<= 128) are skipped with --slow; --quick keeps only length 64 */
+      if (!slowTest) {
+        if (!quickTest) {
+          result |= test(32, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
+          result |= test(32 + 16, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
+        }
+        result |= test(64, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
+        if (!quickTest) {
+          result |= test(64 + 32, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
+          result |= test(128, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
+        }
+      }
+      if (!quickTest) {
+        result |= test(128+ 64, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
+        result |= test(256,     flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
+        result |= test(256+128, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
+        result |= test(512,     flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
+        result |= test(1024,    flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
+
+        result |= test(2048,    flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
+        result |= test(4096,    flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
+        result |= test(8192,    flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
+        result |= test(16384,   flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
+        result |= test(32768,   flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
+      }
+      if (printAsCSV)
+        printf("\n");
+    }
+  }
+
+  return result;
+}
+
--- a/pffft/test_pffft.c
+++ b/pffft/test_pffft.c
@@ -0,0 +1,371 @@
+/*
+  Copyright (c) 2013 Julien Pommier.
+
+  Small test for PFFFT
+
+  How to build: 
+
+  on linux, with fftw3:
+  gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
+
+  on macos, without fftw3:
+  clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
+
+  on macos, with fftw3:
+  clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate
+
+  as alternative: replace clang by gcc.
+
+  on windows, with visual c++:
+  cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
+  
+  build without SIMD instructions:
+  gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm
+
+ */
+
+#ifdef PFFFT_ENABLE_FLOAT
+#include "pffft.h"
+
+typedef float pffft_scalar;
+#else
+/*
+Note: adapted for double precision dynamic range version.
+*/
+#include "pffft_double.h"
+
+typedef double pffft_scalar;
+#endif
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+#include <string.h>
+
+/* define own constants required to turn off g++ extensions .. */
+#ifndef M_PI
+  #define M_PI    3.14159265358979323846  /* pi */
+#endif
+
+/* EXPECTED_DYN_RANGE in dB:
+ * single precision float has 24 bits mantissa
+ * => 24 Bits * 6 dB = 144 dB
+ * allow a few dB tolerance (even 144 dB looks good on my PC)
+ */
+#ifdef PFFFT_ENABLE_FLOAT
+#define EXPECTED_DYN_RANGE  140.0
+#else
+#define EXPECTED_DYN_RANGE  215.0
+#endif
+
+/* maximum allowed phase error in degree */
+#define DEG_ERR_LIMIT   1E-4
+
+/* maximum allowed magnitude error in amplitude (of 1.0 or 1.1) */
+#define MAG_ERR_LIMIT  1E-6
+
+
+/* set to 1 to make test() print the power of every spectrum bin on each run */
+#define PRINT_SPEC  0
+
+/* power -> dB (10*log10); powers below 1E-30 are clamped to 10*log10(1E-30) = -300 dB */
+#define PWR2LOG(PWR)  ( (PWR) < 1E-30 ? 10.0*log10(1E-30) : 10.0*log10(PWR) )
+
+
+
+/* Checks the forward and backward transform of size N with single-carrier
+ * test signals.
+ *
+ *   N           transform size; asserted to be a power of two
+ *   cplx        != 0: complex transform (interleaved re/im), else real transform
+ *   useOrdered  != 0: use the *_transform_ordered() functions, else
+ *               *_transform() followed by *_zreorder() for the analysis
+ *
+ * For every tested carrier bin k (spacing N/16, at least 1) a cosine (real) or
+ * complex exponential (cplx) with amplitude 1.0 or 1.1 and a defined start
+ * phase is generated and transformed. The function then verifies
+ *   - the dynamic range between carrier bin and strongest other bin,
+ *   - the carrier phase (skipped for bin 0 and bin N/2),
+ *   - the carrier magnitude,
+ *   - that the backward transform, scaled by 1/N, reproduces the input.
+ * If the dynamic range check fails, the bin is repeated once with verbose
+ * per-bin output.
+ *
+ * Returns 1 if any check failed (or the setup could not be created), else 0.
+ */
+int test(int N, int cplx, int useOrdered) {
+  int Nfloat = (cplx ? N*2 : N);
+#ifdef PFFFT_ENABLE_FLOAT
+  pffft_scalar *X = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
+  pffft_scalar *Y = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
+  pffft_scalar *R = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
+  pffft_scalar *Z = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
+  pffft_scalar *W = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
+#else
+  pffft_scalar *X = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
+  pffft_scalar *Y = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
+  pffft_scalar *R = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
+  pffft_scalar *Z = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
+  pffft_scalar *W = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
+#endif
+  pffft_scalar amp = (pffft_scalar)1.0;
+  double freq, dPhi, phi, phi0;
+  double pwr, pwrCar, pwrOther, err, errSum, mag, expextedMag;
+  /* kmaxOther starts defined: it is printed in the dynamic range report */
+  int k, j, m, iter, kmaxOther = -1, retError = 0;
+  /* carrier bin spacing; at least 1 so that the bin loop also advances for N < 16 */
+  const int kStep = ( N >= 16 ) ? (N / 16) : 1;
+
+#ifdef PFFFT_ENABLE_FLOAT
+  assert( pffft_is_power_of_two(N) );
+  PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
+#else
+  assert( pffftd_is_power_of_two(N) );
+  PFFFTD_Setup *s = pffftd_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
+#endif
+  assert(s);
+  if (!s) {
+    printf("Error setting up PFFFT!\n");
+    /* reachable when assert() is compiled out: release the work buffers */
+#ifdef PFFFT_ENABLE_FLOAT
+    pffft_aligned_free(X);
+    pffft_aligned_free(Y);
+    pffft_aligned_free(Z);
+    pffft_aligned_free(R);
+    pffft_aligned_free(W);
+#else
+    pffftd_aligned_free(X);
+    pffftd_aligned_free(Y);
+    pffftd_aligned_free(Z);
+    pffftd_aligned_free(R);
+    pffftd_aligned_free(W);
+#endif
+    return 1;
+  }
+
+  for ( k = m = 0; k < (cplx? N : (1 + N/2) ); k += kStep, ++m )
+  {
+    amp = (pffft_scalar)( ( (m % 3) == 0 ) ? 1.0 : 1.1 );
+    /* bins from N/2 upwards represent negative frequencies */
+    freq = (k < N/2) ? ((double)k / N) : ((double)(k-N) / N);
+    dPhi = 2.0 * M_PI * freq;
+    if ( dPhi < 0.0 )
+      dPhi += 2.0 * M_PI;
+
+    /* iter 0 is the silent run; a failed dynamic range check triggers one verbose rerun (iter 1) */
+    iter = -1;
+    while (1)
+    {
+      ++iter;
+
+      if (iter)
+        printf("bin %d: dphi = %f for freq %f\n", k, dPhi, freq);
+
+      /* generate cosine carrier as time signal - start at defined phase phi0 */
+      phi = phi0 = (m % 4) * 0.125 * M_PI;  /* have phi0 < 90 deg to be normalized */
+      for ( j = 0; j < N; ++j )
+      {
+        if (cplx) {
+          X[2*j] = amp * (pffft_scalar)cos(phi);  /* real part */
+          X[2*j+1] = amp * (pffft_scalar)sin(phi);  /* imag part */
+        }
+        else
+          X[j] = amp * (pffft_scalar)cos(phi);  /* only real part */
+
+        /* phase increment .. stay normalized - cos()/sin() might degrade! */
+        phi += dPhi;
+        if ( phi >= M_PI )
+          phi -= 2.0 * M_PI;
+      }
+
+      /* forward transform from X --> Y  .. using work buffer W */
+#ifdef PFFFT_ENABLE_FLOAT
+      if ( useOrdered )
+        pffft_transform_ordered(s, X, Y, W, PFFFT_FORWARD );
+      else
+      {
+        pffft_transform(s, X, R, W, PFFFT_FORWARD );  /* use R for reordering */
+        pffft_zreorder(s, R, Y, PFFFT_FORWARD ); /* reorder into Y[] for power calculations */
+      }
+#else
+      if ( useOrdered )
+        pffftd_transform_ordered(s, X, Y, W, PFFFT_FORWARD );
+      else
+      {
+        pffftd_transform(s, X, R, W, PFFFT_FORWARD );  /* use R for reordering */
+        pffftd_zreorder(s, R, Y, PFFFT_FORWARD ); /* reorder into Y[] for power calculations */
+      }
+#endif
+
+      /* -1 guarantees that the first bin other than the carrier becomes the maximum */
+      pwrOther = -1.0;
+      pwrCar = 0;
+
+
+      /* for positive frequencies: 0 to 0.5 * samplerate */
+      /* and also for negative frequencies: -0.5 * samplerate to 0 */
+      for ( j = 0; j < ( cplx ? N : (1 + N/2) ); ++j )
+      {
+        if (!cplx && !j)  /* special treatment for DC for real input */
+          pwr = Y[j]*Y[j];
+        else if (!cplx && j == N/2)  /* treat 0.5 * samplerate */
+          pwr = Y[1] * Y[1];  /* despite j (for freq calculation) we have index 1 */
+        else
+          pwr = Y[2*j] * Y[2*j] + Y[2*j+1] * Y[2*j+1];
+        if (iter || PRINT_SPEC)
+          printf("%s fft %d:  pwr[j = %d] = %g == %f dB\n", (cplx ? "cplx":"real"), N, j, pwr, PWR2LOG(pwr) );
+        if (k == j)
+          pwrCar = pwr;
+        else if ( pwr > pwrOther ) {
+          pwrOther = pwr;
+          kmaxOther = j;
+        }
+      }
+
+      if ( PWR2LOG(pwrCar) - PWR2LOG(pwrOther) < EXPECTED_DYN_RANGE ) {
+        printf("%s fft %d amp %f iter %d:\n", (cplx ? "cplx":"real"), N, amp, iter);
+        printf("  carrier power  at bin %d: %g == %f dB\n", k, pwrCar, PWR2LOG(pwrCar) );
+        printf("  carrier mag || at bin %d: %g\n", k, sqrt(pwrCar) );
+        printf("  max other pwr  at bin %d: %g == %f dB\n", kmaxOther, pwrOther, PWR2LOG(pwrOther) );
+        printf("  dynamic range: %f dB\n\n", PWR2LOG(pwrCar) - PWR2LOG(pwrOther) );
+        retError = 1;
+        /* repeat this bin once with per-bin output enabled */
+        if ( iter == 0 )
+          continue;
+      }
+
+      /* phase check - skipped for bin 0 and bin N/2 */
+      if ( k > 0 && k != N/2 )
+      {
+        phi = atan2( Y[2*k+1], Y[2*k] );
+        if ( fabs( phi - phi0) > DEG_ERR_LIMIT * M_PI / 180.0 )
+        {
+        retError = 1;
+        printf("%s fft %d  bin %d amp %f : phase mismatch! phase = %f deg   expected = %f deg\n",
+            (cplx ? "cplx":"real"), N, k, amp, phi * 180.0 / M_PI, phi0 * 180.0 / M_PI );
+        }
+      }
+
+      /* a real cosine shows half its amplitude in the carrier bin, except at bin 0 and N/2 */
+      expextedMag = cplx ? amp : ( (k == 0 || k == N/2) ? amp : (amp/2) );
+      mag = sqrt(pwrCar) / N;
+      if ( fabs(mag - expextedMag) > MAG_ERR_LIMIT )
+      {
+        retError = 1;
+        printf("%s fft %d  bin %d amp %f : mag = %g   expected = %g\n", (cplx ? "cplx":"real"), N, k, amp, mag, expextedMag );
+      }
+
+
+      /* now convert spectrum back */
+#ifdef PFFFT_ENABLE_FLOAT
+      if (useOrdered)
+        pffft_transform_ordered(s, Y, Z, W, PFFFT_BACKWARD);
+      else
+        pffft_transform(s, R, Z, W, PFFFT_BACKWARD);
+#else
+      if (useOrdered)
+        pffftd_transform_ordered(s, Y, Z, W, PFFFT_BACKWARD);
+      else
+        pffftd_transform(s, R, Z, W, PFFFT_BACKWARD);
+#endif
+
+      errSum = 0.0;
+      for ( j = 0; j < (cplx ? (2*N) : N); ++j )
+      {
+        /* scale back */
+        Z[j] /= N;
+        /* square sum errors over real (and imag parts) */
+        err = (X[j]-Z[j]) * (X[j]-Z[j]);
+        errSum += err;
+      }
+
+      if ( errSum > N * 1E-7 )
+      {
+        retError = 1;
+        printf("%s fft %d  bin %d : inverse FFT doesn't match original signal! errSum = %g ; mean err = %g\n", (cplx ? "cplx":"real"), N, k, errSum, errSum / N);
+      }
+
+      break;
+    }
+
+  }
+#ifdef PFFFT_ENABLE_FLOAT
+  pffft_destroy_setup(s);
+  pffft_aligned_free(X);
+  pffft_aligned_free(Y);
+  pffft_aligned_free(Z);
+  pffft_aligned_free(R);
+  pffft_aligned_free(W);
+#else
+  pffftd_destroy_setup(s);
+  pffftd_aligned_free(X);
+  pffftd_aligned_free(Y);
+  pffftd_aligned_free(Z);
+  pffftd_aligned_free(R);
+  pffftd_aligned_free(W);
+#endif
+
+  return retError;
+}
+
+/* small functions inside pffft.c that detect (compiler) bugs with respect to SIMD instructions; "(void)" makes these real prototypes in C */
+void validate_pffft_simd(void);
+int  validate_pffft_simd_ex(FILE * DbgOut);
+void validate_pffftd_simd(void);
+int  validate_pffftd_simd_ex(FILE * DbgOut);
+
+
+
+/* Test driver: "--test-simd" runs only the SIMD self-check; otherwise the power-of-two helpers and all FFT sizes 32..65536 (powers of two) are tested. Returns 0 on success, non-zero on failure. */
+int main(int argc, char **argv)
+{
+  int N, result, resN, resAll, i, k, resNextPw2, resIsPw2, resFFT;
+
+  int inp_power_of_two[] = { 1, 2, 3, 4, 5, 6, 7, 8,  9, 511, 512,  513 };
+  int ref_power_of_two[] = { 1, 2, 4, 4, 8, 8, 8, 8, 16, 512, 512, 1024 };
+
+  for ( i = 1; i < argc; ++i ) {
+    if (!strcmp(argv[i], "--test-simd")) {
+#ifdef PFFFT_ENABLE_FLOAT
+      int numErrs = validate_pffft_simd_ex(stdout);
+#else
+      int numErrs = validate_pffftd_simd_ex(stdout);
+#endif
+      fprintf( ( numErrs != 0 ? stderr : stdout ), "validate_pffft_simd_ex() returned %d errors!\n", numErrs);
+      return ( numErrs != 0 ? 1 : 0 );  /* same condition as the stream selection: any non-zero count is a failure */
+    }
+  }
+
+  resNextPw2 = 0;
+  resIsPw2 = 0;
+  for ( k = 0; k < (int)(sizeof(inp_power_of_two)/sizeof(inp_power_of_two[0])); ++k) {  /* cast: k is a signed int */
+#ifdef PFFFT_ENABLE_FLOAT
+    N = pffft_next_power_of_two(inp_power_of_two[k]);
+#else
+    N = pffftd_next_power_of_two(inp_power_of_two[k]);
+#endif
+    if (N != ref_power_of_two[k]) {
+      resNextPw2 = 1;
+      printf("pffft_next_power_of_two(%d) does deliver %d, which is not reference result %d!\n",
+        inp_power_of_two[k], N, ref_power_of_two[k] );
+    }
+
+#ifdef PFFFT_ENABLE_FLOAT
+    result = pffft_is_power_of_two(inp_power_of_two[k]);
+#else
+    result = pffftd_is_power_of_two(inp_power_of_two[k]);
+#endif
+    if (inp_power_of_two[k] == ref_power_of_two[k]) {  /* input equals its own "next power of two" => it is a power of two */
+      if (!result) {
+        resIsPw2 = 1;
+        printf("pffft_is_power_of_two(%d) delivers false; expected true!\n", inp_power_of_two[k]);
+      }
+    } else {
+      if (result) {
+        resIsPw2 = 1;
+        printf("pffft_is_power_of_two(%d) delivers true; expected false!\n", inp_power_of_two[k]);
+      }
+    }
+  }
+  if (!resNextPw2)
+    printf("tests for pffft_next_power_of_two() succeeded successfully.\n");
+  if (!resIsPw2)
+    printf("tests for pffft_is_power_of_two() succeeded successfully.\n");
+
+  resFFT = 0;
+  for ( N = 32; N <= 65536; N *= 2 )
+  {
+    result = test(N, 1 /* cplx fft */, 1 /* useOrdered */);
+    resN = result;
+    resFFT |= result;
+
+    result = test(N, 0 /* real fft */, 1 /* useOrdered */);
+    resN |= result;
+    resFFT |= result;
+
+    result = test(N, 1 /* cplx fft */, 0 /* useOrdered */);
+    resN |= result;
+    resFFT |= result;
+
+    result = test(N, 0 /* real fft */, 0 /* useOrdered */);
+    resN |= result;
+    resFFT |= result;
+
+    if (!resN)
+      printf("tests for size %d succeeded successfully.\n", N);
+  }
+
+  if (!resFFT) {
+#ifdef PFFFT_ENABLE_FLOAT
+    printf("all pffft transform tests (FORWARD/BACKWARD, REAL/COMPLEX, float) succeeded successfully.\n");
+#else
+    printf("all pffft transform tests (FORWARD/BACKWARD, REAL/COMPLEX, double) succeeded successfully.\n");
+#endif
+  }
+
+  resAll = resNextPw2 | resIsPw2 | resFFT;  /* non-zero as soon as one test group failed */
+  if (!resAll)
+    printf("all tests succeeded successfully.\n");
+  else
+    printf("there are failed tests!\n");
+
+  return resAll;
+}
+
--- a/pffft/test_pffft.cpp
+++ b/pffft/test_pffft.cpp
@@ -0,0 +1,377 @@
+/*
+  Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+  Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
+  Copyright (c) 2020  Hayati Ayguen ( h_ayguen@web.de )
+
+  Small test & bench for PFFFT, comparing its performance with the scalar
+  FFTPACK, FFTW, and Apple vDSP
+
+  How to build:
+
+  on linux, with fftw3:
+  gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c
+  test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
+
+  on macos, without fftw3:
+  clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c
+  -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
+
+  on macos, with fftw3:
+  clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c
+  test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f
+  -framework Accelerate
+
+  as alternative: replace clang by gcc.
+
+  on windows, with visual c++:
+  cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
+
+  build without SIMD instructions:
+  gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c
+  fftpack.c -lm
+
+ */
+
+#include "pffft.hpp"
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+/* define own constants required to turn off g++ extensions .. */
+#ifndef M_PI
+  #define M_PI    3.14159265358979323846  /* pi */
+#endif
+
+/* maximum allowed phase error in degrees */
+#define DEG_ERR_LIMIT 1E-4
+
+/* maximum allowed magnitude error in amplitude (of 1.0 or 1.1) */
+#define MAG_ERR_LIMIT 1E-6
+
+#define PRINT_SPEC 0
+
+#define PWR2LOG(PWR) ((PWR) < 1E-30 ? 10.0 * log10(1E-30) : 10.0 * log10(PWR))
+
+// Ttest<T>(): for carrier bins k = 0, N/16, 2*N/16, ... a cosine (real T) or a complex
+// exponential (complex T) is transformed; dynamic range, phase, magnitude and the
+// inverse round trip are checked. Returns true if any check failed.
+template<typename T>
+bool
+Ttest(int N, bool useOrdered)
+{
+  typedef pffft::Fft<T> Fft;
+  typedef typename pffft::Fft<T>::Scalar  FftScalar;
+  typedef typename Fft::Complex FftComplex;
+
+  const bool cplx = pffft::Fft<T>::isComplexTransform();
+  const double EXPECTED_DYN_RANGE = Fft::isDoubleScalar() ? 215.0 : 140.0;  // minimum carrier-to-other-bin ratio in dB
+
+  assert(Fft::isPowerOfTwo(N));
+
+  Fft fft = Fft(N);  // instantiate and prepareLength() for length N
+
+#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900)
+  // possible ways to declare/instantiate aligned vectors with C++11
+  //   some lines rely on the typedefs above
+  auto X = fft.valueVector();                    // for X = input vector
+  pffft::AlignedVector<typename Fft::Complex> Y = fft.spectrumVector();  // for Y = forward(X)
+  pffft::AlignedVector<FftScalar> R = fft.internalLayoutVector(); // for R = forwardInternalLayout(X)
+  pffft::AlignedVector<T> Z = fft.valueVector(); // for Z = inverse(Y) = inverse( forward(X) )
+                                                 //  or Z = inverseInternalLayout(R)
+#else
+  // possible ways to declare/instantiate aligned vectors with C++98
+  pffft::AlignedVector<T> X = fft.valueVector();     // for X = input vector
+  pffft::AlignedVector<FftComplex>   Y = fft.spectrumVector();  // for Y = forward(X)
+  pffft::AlignedVector<typename Fft::Scalar>  R = fft.internalLayoutVector(); // for R = forwardInternalLayout(X)
+  pffft::AlignedVector<T> Z = fft.valueVector();     // for Z = inverse(Y) = inverse( forward(X) )
+                                                     //  or Z = inverseInternalLayout(R)
+#endif
+
+  // access the (possibly complex) vectors as flat scalar arrays - works without the capabilities of a higher c++ standard
+  FftScalar* Xs = reinterpret_cast<FftScalar*>(X.data()); // for X = input vector
+  FftScalar* Ys = reinterpret_cast<FftScalar*>(Y.data()); // for Y = forward(X)
+  FftScalar* Zs = reinterpret_cast<FftScalar*>(Z.data()); // for Z = inverse(Y) = inverse( forward(X) )
+
+  int k, j, m, iter, kmaxOther;
+  bool retError = false;
+  double freq, dPhi, phi, phi0;
+  double pwr, pwrCar, pwrOther, err, errSum, mag, expextedMag;
+  double amp = 1.0;
+
+  for (k = m = 0; k < (cplx ? N : (1 + N / 2)); k += N / 16, ++m) {
+    amp = ((m % 3) == 0) ? 1.0F : 1.1F;
+    freq = (k < N / 2) ? ((double)k / N) : ((double)(k - N) / N);
+    dPhi = 2.0 * M_PI * freq;
+    if (dPhi < 0.0)
+      dPhi += 2.0 * M_PI;
+
+    iter = -1;
+    while (1) {
+      ++iter;
+
+      if (iter)
+        printf("bin %d: dphi = %f for freq %f\n", k, dPhi, freq);
+
+      /* generate cosine carrier as time signal - start at defined phase phi0 */
+      phi = phi0 =
+        (m % 4) * 0.125 * M_PI; /* have phi0 < 90 deg to be normalized */
+      for (j = 0; j < N; ++j) {
+        if (cplx) {
+          Xs[2 * j] = (FftScalar)( amp * cos(phi) );     /* real part */
+          Xs[2 * j + 1] = (FftScalar)( amp * sin(phi) ); /* imag part */
+        } else
+          Xs[j] = (FftScalar)( amp * cos(phi) ); /* only real part */
+
+        /* phase increment .. stay normalized - cos()/sin() might degrade! */
+        phi += dPhi;
+        if (phi >= M_PI)
+          phi -= 2.0 * M_PI;
+      }
+
+      /* forward transform from X --> Y (canonical order), directly or via internal layout R */
+      if (useOrdered)
+        fft.forward(X, Y);
+      else {
+        fft.forwardToInternalLayout(X, R); /* use R for reordering */
+        fft.reorderSpectrum(R, Y); /* have canonical order in Y[] for power calculations */
+      }
+
+      pwrOther = -1.0; /* below any possible power, so the first non-carrier bin is always taken */
+      pwrCar = 0;
+
+      /* for positive frequencies: 0 to 0.5 * samplerate */
+      /* and also for negative frequencies: -0.5 * samplerate to 0 */
+      for (j = 0; j < (cplx ? N : (1 + N / 2)); ++j) {
+        if (!cplx && !j) /* special treatment for DC for real input */
+          pwr = Ys[j] * Ys[j];
+        else if (!cplx && j == N / 2) /* treat 0.5 * samplerate */
+          pwr = Ys[1] *
+                Ys[1]; /* despite j (for freq calculation) we have index 1 */
+        else
+          pwr = Ys[2 * j] * Ys[2 * j] + Ys[2 * j + 1] * Ys[2 * j + 1];
+        if (iter || PRINT_SPEC)
+          printf("%s fft %d:  pwr[j = %d] = %g == %f dB\n",
+                 (cplx ? "cplx" : "real"),
+                 N,
+                 j,
+                 pwr,
+                 PWR2LOG(pwr));
+        if (k == j)
+          pwrCar = pwr;
+        else if (pwr > pwrOther) {
+          pwrOther = pwr;
+          kmaxOther = j;
+        }
+      }
+
+      if (PWR2LOG(pwrCar) - PWR2LOG(pwrOther) < EXPECTED_DYN_RANGE) {
+        printf("%s fft %d amp %f iter %d:\n",
+               (cplx ? "cplx" : "real"),
+               N,
+               amp,
+               iter);
+        printf("  carrier power  at bin %d: %g == %f dB\n",
+               k,
+               pwrCar,
+               PWR2LOG(pwrCar));
+        printf("  carrier mag || at bin %d: %g\n", k, sqrt(pwrCar));
+        printf("  max other pwr  at bin %d: %g == %f dB\n",
+               kmaxOther,
+               pwrOther,
+               PWR2LOG(pwrOther));
+        printf("  dynamic range: %f dB\n\n",
+               PWR2LOG(pwrCar) - PWR2LOG(pwrOther));
+        retError = true;
+        if (iter == 0)
+          continue; /* repeat this bin once; with iter == 1 the whole spectrum gets printed */
+      }
+
+      if (k > 0 && k != N / 2) {
+        phi = atan2(Ys[2 * k + 1], Ys[2 * k]);
+        if (fabs(phi - phi0) > DEG_ERR_LIMIT * M_PI / 180.0) {
+          retError = true;
+          printf("%s fft %d  bin %d amp %f : phase mismatch! phase = %f deg   "
+                 "expected = %f deg\n",
+                 (cplx ? "cplx" : "real"),
+                 N,
+                 k,
+                 amp,
+                 phi * 180.0 / M_PI,
+                 phi0 * 180.0 / M_PI);
+        }
+      }
+
+      expextedMag = cplx ? amp : ((k == 0 || k == N / 2) ? amp : (amp / 2));
+      mag = sqrt(pwrCar) / N;
+      if (fabs(mag - expextedMag) > MAG_ERR_LIMIT) {
+        retError = true;
+        printf("%s fft %d  bin %d amp %f : mag = %g   expected = %g\n",
+               (cplx ? "cplx" : "real"),
+               N,
+               k,
+               amp,
+               mag,
+               expextedMag);
+      }
+
+      /* now convert spectrum back */
+      if (useOrdered)
+        fft.inverse(Y, Z);
+      else
+        fft.inverseFromInternalLayout(R, Z); /* inverse() from internal Layout */
+
+      errSum = 0.0;
+      for (j = 0; j < (cplx ? (2 * N) : N); ++j) {
+        /* scale back */
+        Zs[j] /= N;
+        /* square sum errors over real (and imag parts) */
+        err = (Xs[j] - Zs[j]) * (Xs[j] - Zs[j]);
+        errSum += err;
+      }
+
+      if (errSum > N * 1E-7) {
+        retError = true;
+        printf("%s fft %d  bin %d : inverse FFT doesn't match original signal! "
+               "errSum = %g ; mean err = %g\n",
+               (cplx ? "cplx" : "real"),
+               N,
+               k,
+               errSum,
+               errSum / N);
+      }
+
+      break; /* leave the retry loop: carrier bin k is done */
+    }
+  }
+
+  // using the std::vector<> base classes .. no need for alignedFree() for X, Y, Z and R
+  return retError;
+}
+
+// Runs Ttest() for every enabled scalar type (PFFFT_ENABLE_FLOAT / PFFFT_ENABLE_DOUBLE)
+// on a transform of length N: complex when useComplex is true, real otherwise.
+// useOrdered is forwarded unchanged to Ttest().
+// Returns true if ANY run reported an error. The results are OR-ed (not AND-ed),
+// so a failure seen with only one precision is never masked by the other one.
+bool
+test(int N, bool useComplex, bool useOrdered)
+{
+  bool retError = false;
+
+  if (useComplex) {
+#ifdef PFFFT_ENABLE_FLOAT
+    retError |= Ttest< std::complex<float> >(N, useOrdered);
+#endif
+#ifdef PFFFT_ENABLE_DOUBLE
+    retError |= Ttest< std::complex<double> >(N, useOrdered);
+#endif
+  } else {
+#ifdef PFFFT_ENABLE_FLOAT
+    retError |= Ttest<float>(N, useOrdered);
+#endif
+#ifdef PFFFT_ENABLE_DOUBLE
+    retError |= Ttest<double>(N, useOrdered);
+#endif
+  }
+
+  // '|=' does not short-circuit: every enabled precision has been run here
+  return retError;
+}
+
+// Test driver: checks nextPowerOfTwo()/isPowerOfTwo() against a reference table, then runs test() for N = 32..65536 (powers of two). Returns 0 on success, non-zero otherwise.
+int
+main(int argc, char** argv)
+{
+  int N, result, resN, resAll, k, resNextPw2, resIsPw2, resFFT;
+  int inp_power_of_two[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 511, 512, 513 };
+  int ref_power_of_two[] = { 1, 2, 4, 4, 8, 8, 8, 8, 16, 512, 512, 1024 };
+
+  resNextPw2 = 0;
+  resIsPw2 = 0;
+  for (k = 0; k < (int)(sizeof(inp_power_of_two) / sizeof(inp_power_of_two[0]));
+       ++k) {  // cast: k is a signed int, sizeof yields an unsigned type
+#ifdef PFFFT_ENABLE_FLOAT
+    N = pffft::Fft<float>::nextPowerOfTwo(inp_power_of_two[k]);
+#else
+    N = pffft::Fft<double>::nextPowerOfTwo(inp_power_of_two[k]);
+#endif
+    if (N != ref_power_of_two[k]) {
+      resNextPw2 = 1;
+      printf("pffft_next_power_of_two(%d) does deliver %d, which is not "
+             "reference result %d!\n",
+             inp_power_of_two[k],
+             N,
+             ref_power_of_two[k]);
+    }
+
+#ifdef PFFFT_ENABLE_FLOAT
+    result = pffft::Fft<float>::isPowerOfTwo(inp_power_of_two[k]);
+#else
+    result = pffft::Fft<double>::isPowerOfTwo(inp_power_of_two[k]);
+#endif
+    if (inp_power_of_two[k] == ref_power_of_two[k]) {  // input equals its own "next power of two" => it is a power of two
+      if (!result) {
+        resIsPw2 = 1;
+        printf("pffft_is_power_of_two(%d) delivers false; expected true!\n",
+               inp_power_of_two[k]);
+      }
+    } else {
+      if (result) {
+        resIsPw2 = 1;
+        printf("pffft_is_power_of_two(%d) delivers true; expected false!\n",
+               inp_power_of_two[k]);
+      }
+    }
+  }
+  if (!resNextPw2)
+    printf("tests for pffft_next_power_of_two() succeeded successfully.\n");
+  if (!resIsPw2)
+    printf("tests for pffft_is_power_of_two() succeeded successfully.\n");
+
+  resFFT = 0;
+  for (N = 32; N <= 65536; N *= 2) {
+    result = test(N, 1 /* cplx fft */, 1 /* useOrdered */);
+    resN = result;
+    resFFT |= result;
+
+    result = test(N, 0 /* real fft */, 1 /* useOrdered */);
+    resN |= result;
+    resFFT |= result;
+
+    result = test(N, 1 /* cplx fft */, 0 /* useOrdered */);
+    resN |= result;
+    resFFT |= result;
+
+    result = test(N, 0 /* real fft */, 0 /* useOrdered */);
+    resN |= result;
+    resFFT |= result;
+
+    if (!resN)
+      printf("tests for size %d succeeded successfully.\n", N);
+  }
+
+  if (!resFFT)
+    printf("all pffft transform tests (FORWARD/BACKWARD, REAL/COMPLEX, "
+#ifdef PFFFT_ENABLE_FLOAT
+           "float"
+#endif
+#if defined(PFFFT_ENABLE_FLOAT) && defined(PFFFT_ENABLE_DOUBLE)
+            "/"
+#endif
+#ifdef PFFFT_ENABLE_DOUBLE
+           "double"
+#endif
+           ") succeeded successfully.\n");
+
+  resAll = resNextPw2 | resIsPw2 | resFFT;  // non-zero as soon as one test group failed
+  if (!resAll)
+    printf("all tests succeeded successfully.\n");
+  else
+    printf("there are failed tests!\n");
+
+  return resAll;
+}
--- a/pffft/uninstall.cmake
+++ b/pffft/uninstall.cmake
@@ -0,0 +1,24 @@
+# Uninstall helper: removes every file recorded in the install manifest.
+set(MANIFEST "${CMAKE_CURRENT_BINARY_DIR}/install_manifest.txt")
+if(NOT EXISTS "${MANIFEST}")
+    message(FATAL_ERROR "Cannot find install manifest: '${MANIFEST}'")
+endif()
+
+file(STRINGS "${MANIFEST}" files)
+foreach(file IN LISTS files)
+    # IS_SYMLINK also matches dangling symlinks, for which EXISTS is false
+    if(EXISTS "${file}" OR IS_SYMLINK "${file}")
+        message(STATUS "Removing file: '${file}'")
+        # execute_process() supersedes the deprecated exec_program() and passes
+        # the path as a single argument, so paths containing spaces work
+        execute_process(
+            COMMAND "${CMAKE_COMMAND}" -E remove "${file}"
+            RESULT_VARIABLE result
+        )
+        if(NOT "${result}" STREQUAL "0")
+            message(FATAL_ERROR "Failed to remove file: '${file}'.")
+        endif()
+    else()
+        message(STATUS "File '${file}' does not exist.")
+    endif()
+endforeach()
--- a/pffft/use_gcc8.inc
+++ b/pffft/use_gcc8.inc
@@ -0,0 +1,2 @@
+export GCC_WITH_CMAKE="$(command -v gcc-8)"  # path of gcc-8; empty when it is not found
+export GPP_WITH_CMAKE="$(command -v g++-8)"  # path of g++-8; empty when it is not found