add pffft

2024-11-09 14:57:18 -06:00
parent 78a00f71cc
commit a1790b8977
69 changed files with 25719 additions and 0 deletions
--- a/pffft/.github/workflows/c-cpp.yml
+++ b/pffft/.github/workflows/c-cpp.yml
@@ -0,0 +1,279 @@
 # Workflow metadata: display name, trigger events and shared environment.
 name: C/C++ CI
 # Runs for pushes to, and pull requests against, the listed branches.
 on:
  push:
    branches: [master, github_actions]
  pull_request:
    branches: [master, github_actions]
 env:
  # CMake build type (Release, Debug, RelWithDebInfo, ...) that the pffft
  # build steps read as $BUILD_TYPE.
  BUILD_TYPE: Release
 jobs:
  # Ubuntu job: installs the MIPP headers to $HOME/.local, then configures and
  # builds five pffft configurations and uploads them as one tarball.
  build_w_mipp_ubuntu-amd64:
    runs-on: ubuntu-latest
    steps:
    - name: check out MIPP
      # Pinned to a released major version instead of the moving "master" ref.
      uses: actions/checkout@v4
      with:
        repository: hayguen/MIPP
        path: ./MIPP
    - name: cmake configure MIPP
      run: cmake -S MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$HOME/.local
    - name: cmake install MIPP headers
      # The two "ls" calls only list the install result in the job log.
      run: cmake --build MIPP_build --target install && ls -alh $HOME/.local/ && ls -alh $HOME/.local/include/
    # Check out the repository that owns this workflow.
    - uses: actions/checkout@v4
    - name: cmake_make_simd_float_double
      run: mkdir build_simd_full && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_full
    - name: cmake_make_simd_float
      run: mkdir build_simd_float && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_DOUBLE=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_float
    - name: cmake_make_simd_double
      run: mkdir build_simd_double && cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_double
    - name: cmake_make_no-simd_float_double
      run: mkdir build_no-simd_full && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_no-simd_full
    - name: cmake_make_no-simd_scalar_float_double
      run: mkdir build_no-simd_scalar_full && cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_scalar_full
    - name: compress
      # Pack the five build trees, leaving out CMake bookkeeping files.
      run: tar zcvf pffft_w_mipp_ubuntu-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_simd_full build_simd_float build_simd_double build_no-simd_full build_no-simd_scalar_full
    - name: 'Upload Artifact'
      # upload-artifact v1/v2 are no longer accepted by GitHub. v4 requires a
      # unique artifact name per workflow run, so this job no longer shares
      # "pffft_ubuntu_builds" with build_ubuntu-amd64.
      uses: actions/upload-artifact@v4
      with:
        name: pffft_w_mipp_ubuntu_builds
        path: pffft_w_mipp_ubuntu-amd64.tar.gz
  # Ubuntu job without a MIPP checkout: configures and builds five pffft
  # configurations and uploads them as one tarball.
  build_ubuntu-amd64:
    runs-on: ubuntu-latest
    steps:
    # checkout@v2 is outdated; v4 is the maintained major version.
    - uses: actions/checkout@v4
    - name: cmake_make_simd_float_double
      run: mkdir build_simd_full && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_full
    - name: cmake_make_simd_float
      run: mkdir build_simd_float && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_DOUBLE=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_float
    - name: cmake_make_simd_double
      run: mkdir build_simd_double && cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_double
    - name: cmake_make_no-simd_float_double
      run: mkdir build_no-simd_full && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_no-simd_full
    - name: cmake_make_no-simd_scalar_float_double
      run: mkdir build_no-simd_scalar_full && cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_scalar_full
    - name: compress
      # Pack the five build trees, leaving out CMake bookkeeping files.
      run: tar zcvf pffft_ubuntu-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_simd_full build_simd_float build_simd_double build_no-simd_full build_no-simd_scalar_full
    - name: 'Upload Artifact'
      # upload-artifact v1/v2 are no longer accepted by GitHub; v4 is the
      # maintained major version. The artifact name is unchanged.
      uses: actions/upload-artifact@v4
      with:
        name: pffft_ubuntu_builds
        path: pffft_ubuntu-amd64.tar.gz
  # Linux job that cross-builds Windows binaries with the MinGW-w64 toolchains
  # through the repository's cross_build_mingw32.sh / cross_build_mingw64.sh.
  cross_build_win_from_linux:
    # The ubuntu-20.04 hosted image has been retired; 22.04 is the closest
    # image still offered.
    runs-on: ubuntu-22.04
    steps:
    - name: prerequisites
      run: sudo apt -qq update && sudo apt -yqq install gcc-mingw-w64 g++-mingw-w64
    - name: check out MIPP
      # Pinned to a released major version instead of the moving "master" ref.
      uses: actions/checkout@v4
      with:
        repository: hayguen/MIPP
        path: ./MIPP
    - name: cmake configure MIPP
      working-directory: ${{runner.workspace}}
      # MIPP was checked out below $GITHUB_WORKSPACE; addressing it through
      # that variable avoids hard-coding the repository name ("pffft/MIPP").
      # The install prefix is the directory this step runs in.
      run: cmake -S $GITHUB_WORKSPACE/MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$(pwd)
    - name: cmake install MIPP headers
      working-directory: ${{runner.workspace}}
      run: cmake --build MIPP_build --target install
    # Check out the repository that owns this workflow.
    - uses: actions/checkout@v4
    - name: build_w32_no-simd
      working-directory: ${{runner.workspace}}
      run: cd $GITHUB_WORKSPACE && bash ./cross_build_mingw32.sh no-simd -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
    - name: build_w32_simd_full
      working-directory: ${{runner.workspace}}
      # X remembers the step's start directory (the MIPP install prefix)
      # before changing into the checkout.
      run: X=$(pwd) && cd $GITHUB_WORKSPACE && bash ./cross_build_mingw32.sh simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=pentium4 -DTARGET_C_ARCH=pentium4 -DMIPP_INCLUDE_DIRS=$X/include/mipp
    - name: build_w64_no-simd
      working-directory: ${{runner.workspace}}
      run: cd $GITHUB_WORKSPACE && bash ./cross_build_mingw64.sh no-simd -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
    - name: build_w64_simd_full
      working-directory: ${{runner.workspace}}
      run: X=$(pwd) && cd $GITHUB_WORKSPACE && bash ./cross_build_mingw64.sh simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=sandybridge -DTARGET_C_ARCH=sandybridge -DMIPP_INCLUDE_DIRS=$X/include/mipp
    - name: compress
      # NOTE(review): expects the build_w* directories in the default working
      # directory - presumably created there by the cross_build scripts.
      run: tar zcvf pffft_cross-build-windows-from-linux-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt  build_w32_no-simd build_w32_simd_full build_w64_no-simd build_w64_simd_full
    - name: 'Upload Artifact'
      # upload-artifact v1/v2 are no longer accepted by GitHub; v4 is the
      # maintained major version. The artifact name is unchanged.
      uses: actions/upload-artifact@v4
      with:
        name: pffft_windows_from_cross_builds
        path: pffft_cross-build-windows-from-linux-amd64.tar.gz
  # Windows job using the Visual Studio generator: installs the MIPP headers,
  # then configures and builds four pffft variants (no SIMD, SSE2, AVX, AVX2).
  build_win_msvc:
    # The CMake configure and build commands are platform agnostic and should work equally
    # well on Windows or Mac.  You can convert this to a matrix build if you need
    # cross-platform coverage.
    # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
    #
    # The windows-2019 hosted image has been retired; windows-2022 ships
    # Visual Studio 2022, hence the "Visual Studio 17 2022" generator below.
    runs-on: windows-2022
    steps:
    - name: check out MIPP
      # Pinned to a released major version instead of the moving "master" ref.
      uses: actions/checkout@v4
      with:
        repository: hayguen/MIPP
        path: ./MIPP
    - name: cmake configure MIPP
      shell: bash
      working-directory: ${{runner.workspace}}
      # MIPP was checked out below $GITHUB_WORKSPACE; addressing it through
      # that variable avoids hard-coding the repository name ("pffft/MIPP").
      # The install prefix is the directory this step runs in.
      run: cmake -S $GITHUB_WORKSPACE/MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$(pwd)
    - name: cmake install MIPP headers
      working-directory: ${{runner.workspace}}
      run: cmake --build MIPP_build --target install
    # Check out the repository that owns this workflow.
    - uses: actions/checkout@v4
    - name: Configure CMake No-SIMD
      shell: bash
      working-directory: ${{runner.workspace}}
      run: cmake -S $GITHUB_WORKSPACE -B build_no-simd -G "Visual Studio 17 2022" -A x64 -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DENABLE_PFDSP=ON -DPFFFT_USE_SIMD=OFF -DTARGET_CXX_ARCH=none -DTARGET_C_ARCH=none
    - name: Build No-SIMD
      shell: bash
      working-directory: ${{runner.workspace}}
      # Execute the build.  You can specify a specific target with "--target <NAME>"
      run: cmake --build build_no-simd --config $BUILD_TYPE
    - name: Configure CMake SSE2
      shell: bash
      working-directory: ${{runner.workspace}}
      run: cmake -S $GITHUB_WORKSPACE -B build_sse2 -G "Visual Studio 17 2022" -A x64 -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DENABLE_PFDSP=ON -DTARGET_CXX_ARCH=SSE2 -DTARGET_C_ARCH=SSE2 -DMIPP_INCLUDE_DIRS=$(pwd)/include/mipp
    - name: Build SSE2
      shell: bash
      working-directory: ${{runner.workspace}}
      # Execute the build.  You can specify a specific target with "--target <NAME>"
      run: cmake --build build_sse2 --config $BUILD_TYPE
    - name: Configure CMake AVX
      # Use a bash shell so we can use the same syntax for environment variable
      # access regardless of the host operating system
      shell: bash
      working-directory: ${{runner.workspace}}
      run: cmake -S $GITHUB_WORKSPACE -B build_avx -G "Visual Studio 17 2022" -A x64 -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DENABLE_PFDSP=ON -DTARGET_CXX_ARCH=AVX -DTARGET_C_ARCH=AVX -DMIPP_INCLUDE_DIRS=$(pwd)/include/mipp
    - name: Build AVX
      working-directory: ${{runner.workspace}}
      shell: bash
      # Execute the build.  You can specify a specific target with "--target <NAME>"
      run: cmake --build build_avx --config $BUILD_TYPE
    - name: Configure CMake AVX2
      # Use a bash shell so we can use the same syntax for environment variable
      # access regardless of the host operating system
      shell: bash
      working-directory: ${{runner.workspace}}
      run: cmake -S $GITHUB_WORKSPACE -B build_avx2 -G "Visual Studio 17 2022" -A x64 -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DENABLE_PFDSP=ON -DTARGET_CXX_ARCH=AVX2 -DTARGET_C_ARCH=AVX2 -DMIPP_INCLUDE_DIRS=$(pwd)/include/mipp
    - name: Build AVX2
      working-directory: ${{runner.workspace}}
      shell: bash
      # Execute the build.  You can specify a specific target with "--target <NAME>"
      run: cmake --build build_avx2 --config $BUILD_TYPE
    - name: compress
      working-directory: ${{runner.workspace}}
      # Pack the four build trees, leaving out CMake bookkeeping files.
      run: tar zcvf pffft_windows-msvc-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt  build_no-simd build_sse2 build_avx build_avx2
    - name: 'Upload Artifact'
      # upload-artifact v1/v2 are no longer accepted by GitHub; v4 is the
      # maintained major version. The artifact name is unchanged.
      uses: actions/upload-artifact@v4
      with:
        name: pffft_windows_msvc_builds
        path: ${{runner.workspace}}/pffft_windows-msvc-amd64.tar.gz
  # Windows job that builds pffft inside an MSYS2 shell.
  build_win_mingw:
    # The windows-2019 hosted image has been retired.
    runs-on: windows-2022
    strategy:
      matrix:
        compiler: [gcc]
        msystem: [MINGW64]
    defaults:
      run:
        # Every "run" step of this job executes in the MSYS2 shell.
        shell: msys2 {0}
    steps:
    # checkout@v2 is outdated; v4 is the maintained major version.
    - uses: actions/checkout@v4
    - uses: msys2/setup-msys2@v2
      with:
        # Taken from the matrix so that adding an entry there takes effect.
        msystem: ${{ matrix.msystem }}
        install: gcc cmake make
    - name: Configure cmake
      # CC comes from the matrix (previously hard-coded to the same value).
      run: CC=${{ matrix.compiler }} cmake -DMINGW=ON -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native -S . -B build_mgw64
    - name: Build
      run: cmake --build build_mgw64
    - name: compress
      # Pack the build tree, leaving out CMake bookkeeping files.
      run: tar zcvf pffft_windows-mingw-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt  build_mgw64
    - name: 'Upload Artifact'
      # upload-artifact v1/v2 are no longer accepted by GitHub; v4 is the
      # maintained major version. The artifact name is unchanged.
      uses: actions/upload-artifact@v4
      with:
        name: pffft_windows_mingw_builds
        path: pffft_windows-mingw-amd64.tar.gz
  # macOS job without a MIPP checkout: configures and builds five pffft
  # configurations and uploads them as one tarball.
  build_macos11:
    # copied from build_ubuntu-amd64 with minor renaming
    #
    # The macos-11 hosted image no longer exists, so the job would never be
    # scheduled. macos-13 is an Intel image, which keeps "native" on x86_64.
    # NOTE(review): job id and tarball name still say "11" to keep existing
    # references working - rename if nothing depends on them.
    runs-on: macos-13
    steps:
    # checkout@v2 is outdated; v4 is the maintained major version.
    - uses: actions/checkout@v4
    - name: cmake_make_simd_float_double
      run: mkdir build_simd_full && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_full
    - name: cmake_make_simd_float
      run: mkdir build_simd_float && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_DOUBLE=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_float
    - name: cmake_make_simd_double
      run: mkdir build_simd_double && cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_double
    - name: cmake_make_no-simd_float_double
      run: mkdir build_no-simd_full && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_no-simd_full
    - name: cmake_make_no-simd_scalar_float_double
      run: mkdir build_no-simd_scalar_full && cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_scalar_full
    - name: compress
      # Pack the five build trees, leaving out CMake bookkeeping files.
      run: tar zcvf pffft_macos-11.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_simd_full build_simd_float build_simd_double build_no-simd_full build_no-simd_scalar_full
    - name: 'Upload Artifact'
      # upload-artifact v1/v2 are no longer accepted by GitHub; v4 is the
      # maintained major version. The artifact name is unchanged.
      uses: actions/upload-artifact@v4
      with:
        name: pffft_macos_builds
        path: pffft_macos-11.tar.gz
  # macOS job: installs the MIPP headers to $HOME/.local, then configures and
  # builds five pffft configurations and uploads them as one tarball.
  build_w_mipp_macos11:
    # copied from build_w_mipp_ubuntu-amd64 with minor renaming
    #
    # The macos-11 hosted image no longer exists, so the job would never be
    # scheduled. macos-13 is an Intel image, which keeps "native" on x86_64.
    # NOTE(review): job id and tarball name still say "11" to keep existing
    # references working - rename if nothing depends on them.
    runs-on: macos-13
    steps:
    - name: check out MIPP
      # Pinned to a released major version instead of the moving "master" ref.
      uses: actions/checkout@v4
      with:
        repository: hayguen/MIPP
        path: ./MIPP
    - name: cmake configure MIPP
      run: cmake -S MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$HOME/.local
    - name: cmake install MIPP headers
      # The two "ls" calls only list the install result in the job log.
      run: cmake --build MIPP_build --target install && ls -alh $HOME/.local/ && ls -alh $HOME/.local/include/
    # Check out the repository that owns this workflow.
    - uses: actions/checkout@v4
    - name: cmake_make_simd_float_double
      run: mkdir build_simd_full && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_full
    - name: cmake_make_simd_float
      run: mkdir build_simd_float && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_DOUBLE=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_float
    - name: cmake_make_simd_double
      run: mkdir build_simd_double && cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_simd_double
    - name: cmake_make_no-simd_float_double
      run: mkdir build_no-simd_full && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native  && cmake --build build_no-simd_full
    - name: cmake_make_no-simd_scalar_float_double
      run: mkdir build_no-simd_scalar_full && cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_scalar_full
    - name: compress
      # Pack the five build trees, leaving out CMake bookkeeping files.
      run: tar zcvf pffft_w_mipp_macos-11.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_simd_full build_simd_float build_simd_double build_no-simd_full build_no-simd_scalar_full
    - name: 'Upload Artifact'
      # upload-artifact v1/v2 are no longer accepted by GitHub. v4 requires a
      # unique artifact name per workflow run, so this job no longer shares
      # "pffft_macos_builds" with build_macos11.
      uses: actions/upload-artifact@v4
      with:
        name: pffft_w_mipp_macos_builds
        path: pffft_w_mipp_macos-11.tar.gz
--- a/pffft/.gitignore
+++ b/pffft/.gitignore
@@ -0,0 +1,4 @@
 build
 build_benches
 build_*
 .vscode
--- a/pffft/.gitmodules
+++ b/pffft/.gitmodules
@@ -0,0 +1,9 @@
 [submodule "greenffts"]
 	path = greenffts
 	url = https://github.com/hayguen/greenffts.git
 [submodule "kissfft"]
 	path = kissfft
 	url = https://github.com/hayguen/kissfft.git
 [submodule "pocketfft"]
 	path = pocketfft
 	url = https://github.com/hayguen/pocketfft.git
--- a/pffft/CMakeLists.txt
+++ b/pffft/CMakeLists.txt
@@ -0,0 +1,663 @@
 # Minimum stays at 2.8. The "...3.10" policy maximum is ignored by CMake older
 # than 3.12, while CMake 4.x rejects projects whose policy version is below
 # 3.5 - a bare "VERSION 2.8" no longer configures there.
 # NOTE(review): on newer CMake this switches the policies up to 3.10 to NEW;
 # confirm the included cmake/*.cmake helpers do not rely on OLD behaviour.
 cmake_minimum_required(VERSION 2.8...3.10)
 project(PRETTY_FAST_FFT)
 # smaller library size?
 option(PFFFT_USE_TYPE_FLOAT  "activate single precision 'float'?" ON)
 option(PFFFT_USE_TYPE_DOUBLE "activate 'double' precision float?" ON)
 # architecture/optimization options
 option(PFFFT_USE_SIMD        "use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? - " ON)
 option(PFFFT_USE_SCALAR_VECT "use 4-element vector scalar operations (if no other SIMD)" ON)
 # what to install?
 option(INSTALL_PFFFT      "install pffft to CMAKE_INSTALL_PREFIX?" ON)
 option(INSTALL_PFDSP      "install pfdsp to CMAKE_INSTALL_PREFIX?" OFF)
 option(INSTALL_PFFASTCONV "install pffastconv to CMAKE_INSTALL_PREFIX?" OFF)
 # test options
 option(PFFFT_USE_BENCH_FFTW   "use (system-installed) FFTW3 in fft benchmark?" OFF)
 option(PFFFT_USE_BENCH_GREEN  "use Green FFT in fft benchmark? - if exists in subdir" ON)
 option(PFFFT_USE_BENCH_KISS   "use KissFFT in fft benchmark? - if exists in subdir" ON)
 option(PFFFT_USE_BENCH_POCKET "use PocketFFT in fft benchmark? - if exists in subdir" ON)
 option(PFFFT_USE_BENCH_MKL    "use Intel MKL in fft benchmark? needs to be installed" OFF)
 option(PFFFT_USE_FFTPACK      "compile and use FFTPACK in fft benchmark & validation?" ON)
 # debugging / linking options
 option(PFFFT_USE_DEBUG_ASAN  "use GCC's address sanitizer?" OFF)
 option(PFFFT_DISABLE_LINK_WITH_M "Disables linking with m library to build with clangCL from MSVC" OFF)
 # Language levels: C99 and C++98, both without compiler extensions.
 # With C90 the function attributes (e.g. always_inline) would need the gcc
 # extensions; C99 provides them without.
 set(CMAKE_C_STANDARD   99)
 set(CMAKE_CXX_STANDARD 98)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_C_EXTENSIONS   OFF)
 set(CMAKE_CXX_EXTENSIONS OFF)
 # Lists filled further down with the targets and headers to install.
 set(INSTALL_HEADERS "")
 set(INSTALL_TARGETS "")
 # At least one precision must remain enabled.
 if (NOT (PFFFT_USE_TYPE_FLOAT OR PFFFT_USE_TYPE_DOUBLE))
  message(FATAL_ERROR "activate at least one of PFFFT_USE_TYPE_FLOAT or PFFFT_USE_TYPE_DOUBLE")
 endif()
 # Make the bundled cmake/ directory visible to include() / find_package().
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
 include(cmake/target_optimizations.cmake)
 include(cmake/compiler_warnings.cmake)
 # Optional packages - neither lookup is REQUIRED.
 find_package(PAPI)
 find_package(MIPP)
 # (alternative check that was left disabled: if (TARGET MIPP))
 if (NOT MIPP_FOUND)
    message(STATUS "NOT found MIPP")
 else()
    message(STATUS "found MIPP")
 endif()
 # Library name linked into targets when the address sanitizer is requested.
 set(ASANLIB "")
 if (PFFFT_USE_DEBUG_ASAN)
  set(ASANLIB "asan")
 endif()
 # Purely informational output about compiler and platform.
 message(STATUS "INFO: CMAKE_C_COMPILER_ID is ${CMAKE_C_COMPILER_ID}")
 message(STATUS "INFO: CMAKE_CXX_COMPILER_ID is ${CMAKE_CXX_COMPILER_ID}")
 if (NOT WIN32)
  message(STATUS "INFO: NOT WIN32")
 else()
  message(STATUS "INFO: detected WIN32")
 endif()
 if (NOT MINGW)
  message(STATUS "INFO: NOT MINGW")
 else()
  message(STATUS "INFO: detected MINGW with compiler ${CMAKE_C_COMPILER_ID}")
 endif()
 if (CMAKE_C_COMPILER_ID MATCHES "MSVC")
  message(STATUS "INFO: detected MSVC with compiler ${CMAKE_C_COMPILER_ID}")
 endif()
 # Optional third-party FFT libraries for the benchmark. Each one is added
 # only when its option is ON and the probe file exists in the subdirectory;
 # PATH_<LIB> is set only in that case and is tested again by the benchmark
 # targets.

 # Green FFT
 if (PFFFT_USE_BENCH_GREEN)
  if (NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/greenffts/CMakeLists.txt")
    message(WARNING "GreenFFT not found in subdir greenffts")
  else()
    message(STATUS "found subdir greenffts")
    set(PATH_GREEN "${CMAKE_CURRENT_LIST_DIR}/greenffts")
    add_subdirectory( "${PATH_GREEN}" )
  endif()
 endif()
 # KissFFT
 # git submodule add https://github.com/hayguen/kissfft.git
 if (PFFFT_USE_BENCH_KISS)
  if (NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/kissfft/CMakeLists.txt")
    message(WARNING "KissFFT not found in subdir kissfft")
  else()
    message(STATUS "found subdir kissfft")
    set(PATH_KISS "${CMAKE_CURRENT_LIST_DIR}/kissfft")
    add_subdirectory( "${PATH_KISS}" )
  endif()
 endif()
 # PocketFFT (probed through one of its source files)
 # git submodule add https://github.com/hayguen/pocketfft.git
 if (PFFFT_USE_BENCH_POCKET)
  if (NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/pocketfft/pocketfft_double.c")
    message(WARNING "PocketFFT not found in subdir pocketfft")
  else()
    message(STATUS "found subdir pocketfft")
    set(PATH_POCKET "${CMAKE_CURRENT_LIST_DIR}/pocketfft")
    add_subdirectory( "${PATH_POCKET}" )
  endif()
 endif()
 ########################################################################
 # fall back to a Release build when no build type was given,
 # so that optimization flags are used
 ########################################################################
 if (NOT CMAKE_BUILD_TYPE)
  message(STATUS "Build type not specified: defaulting to release.")
  set(CMAKE_BUILD_TYPE "Release")
 endif()
 # Decide which math library name (MATHLIB) the targets link against.
 if (CMAKE_C_COMPILER_ID MATCHES "MSVC")
  # Visual Studio C++: no separate math library; also define
  # _CRT_SECURE_NO_WARNINGS and record warning C4996 in the disabled list.
  message(STATUS "INFO: detected MSVC: will not link math lib m")
  set(MATHLIB "")
  add_definitions("/D_CRT_SECURE_NO_WARNINGS")
  set(MSVC_DISABLED_WARNINGS_LIST "C4996")
 elseif (NOT PFFFT_DISABLE_LINK_WITH_M)
  # Any other compiler links libm unless this was explicitly disabled
  # (MATHLIB is left untouched in that case).
  message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m")
  set(MATHLIB "m")
 endif()
 # C++ runtime library name added to the C++ test programs (MinGW only).
 if (MINGW)
  set(STDCXXLIB "stdc++")
 else()
  set(STDCXXLIB "")
 endif()
 # SIMD abstraction headers, per precision.
 set( SIMD_FLOAT_HDRS
  simd/pf_float.h
  simd/pf_sse1_float.h
  simd/pf_altivec_float.h
  simd/pf_neon_float.h
  simd/pf_scalar_float.h
 )
 set( SIMD_DOUBLE_HDRS
  simd/pf_double.h
  simd/pf_avx_double.h
  simd/pf_scalar_double.h
 )
 # Single-precision sources; pffft.h is installed together with the library.
 set( FLOAT_SOURCES )
 if (PFFFT_USE_TYPE_FLOAT)
  set( FLOAT_SOURCES pffft.c pffft.h ${SIMD_FLOAT_HDRS} )
  if (INSTALL_PFFFT)
    list(APPEND INSTALL_HEADERS pffft.h)
  endif()
 endif()
 # Double-precision sources; pffft_double.h is installed with the library.
 set( DOUBLE_SOURCES )
 if (PFFFT_USE_TYPE_DOUBLE)
  set( DOUBLE_SOURCES pffft_double.c pffft_double.h ${SIMD_DOUBLE_HDRS} )
  if (INSTALL_PFFFT)
    list(APPEND INSTALL_HEADERS pffft_double.h)
  endif()
 endif()
 ######################################################
 # PFFFT: the static core library (output name "pffft").
 add_library(PFFFT STATIC
  ${FLOAT_SOURCES}
  ${DOUBLE_SOURCES}
  pffft_common.c
  pffft_priv_impl.h
  pffft.hpp
 )
 set_property(TARGET PFFFT PROPERTY OUTPUT_NAME "pffft")
 target_compile_definitions(PFFFT PRIVATE _USE_MATH_DEFINES)
 target_activate_c_compiler_warnings(PFFFT)
 if (PFFFT_USE_SCALAR_VECT)
  target_compile_definitions(PFFFT PRIVATE PFFFT_SCALVEC_ENABLED=1)
 endif()
 if (PFFFT_USE_DEBUG_ASAN)
  target_compile_options(PFFFT PRIVATE "-fsanitize=address")
 endif()
 # Architecture flags are applied even when SIMD is disabled.
 target_set_c_arch_flags(PFFFT)
 if (NOT PFFFT_USE_SIMD)
  target_compile_definitions(PFFFT PRIVATE PFFFT_SIMD_DISABLE=1)
 endif()
 target_link_libraries( PFFFT ${ASANLIB} ${MATHLIB} )
 # Targets linking PFFFT from the build tree get this directory as include path.
 target_include_directories(PFFFT INTERFACE
  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
 )
 if (INSTALL_PFFFT)
  list(APPEND INSTALL_TARGETS PFFFT)
  list(APPEND INSTALL_HEADERS pffft.hpp)
 endif()
 ######################################################
 # PFDSP: static C++11 library (output name "pfdsp"); only created when the
 # float precision is enabled.
 if (PFFFT_USE_TYPE_FLOAT)
  add_library(PFDSP STATIC
      pf_mixer.cpp   pf_mixer.h
      pf_cplx.h
      pf_carrier.cpp pf_carrier.h
      pf_cic.cpp     pf_cic.h
      fmv.h
  )
  set_target_properties(PFDSP PROPERTIES
      CXX_STANDARD          11
      CXX_STANDARD_REQUIRED ON
      OUTPUT_NAME           "pfdsp"
  )
  target_compile_definitions(PFDSP PRIVATE _USE_MATH_DEFINES)
  target_activate_cxx_compiler_warnings(PFDSP)
  if (PFFFT_USE_DEBUG_ASAN)
      target_compile_options(PFDSP PRIVATE "-fsanitize=address")
  endif()
  # Either architecture flags (SIMD on) or the disable macro (SIMD off).
  if (NOT PFFFT_USE_SIMD)
      target_compile_definitions(PFDSP PRIVATE PFFFT_SIMD_DISABLE=1)
  else()
      target_set_cxx_arch_flags(PFDSP)
  endif()
  target_link_libraries( PFDSP ${MATHLIB} )
  target_include_directories(PFDSP INTERFACE
      $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
  )
  if (INSTALL_PFDSP)
      list(APPEND INSTALL_TARGETS PFDSP)
      list(APPEND INSTALL_HEADERS pf_mixer.h pf_cplx.h pf_carrier.h pf_cic.h)
  endif()
 endif()
 ######################################################
 # FFTPACK: fftpack.c is compiled into a float and a double library plus one
 # built-in test program for each precision.
 if (PFFFT_USE_FFTPACK)
  foreach(fftpack_lib FFTPACK_FLOAT FFTPACK_DOUBLE)
    add_library(${fftpack_lib} STATIC fftpack.c fftpack.h)
    target_compile_definitions(${fftpack_lib} PRIVATE _USE_MATH_DEFINES)
    if (fftpack_lib STREQUAL "FFTPACK_DOUBLE")
      # PUBLIC: targets linking the double library get the macro as well.
      target_compile_definitions(${fftpack_lib} PUBLIC FFTPACK_DOUBLE_PRECISION)
    endif()
    target_activate_c_compiler_warnings(${fftpack_lib})
    target_link_libraries( ${fftpack_lib} ${MATHLIB} )
    target_include_directories(${fftpack_lib} INTERFACE
      $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
    )
  endforeach()
  # Built-in test programs of fftpack (TESTING_FFTPACK is defined for them).
  add_executable(test_fftpack_float fftpack.c fftpack.h)
  target_compile_definitions(test_fftpack_float PRIVATE _USE_MATH_DEFINES TESTING_FFTPACK)
  target_link_libraries(test_fftpack_float ${MATHLIB})
  add_executable(test_fftpack_double fftpack.c fftpack.h)
  target_compile_definitions(test_fftpack_double PRIVATE _USE_MATH_DEFINES FFTPACK_DOUBLE_PRECISION TESTING_FFTPACK)
  target_link_libraries(test_fftpack_double ${MATHLIB})
 endif()
 ######################################################
 # PFFASTCONV: static library (output name "pffastconv") on top of PFFFT.
 # Only 'float' is supported in PFFASTCONV.
 if (PFFFT_USE_TYPE_FLOAT)
  add_library(PFFASTCONV STATIC pffastconv.c pffastconv.h pffft.h )
  set_property(TARGET PFFASTCONV PROPERTY OUTPUT_NAME "pffastconv")
  target_compile_definitions(PFFASTCONV PRIVATE _USE_MATH_DEFINES)
  target_activate_c_compiler_warnings(PFFASTCONV)
  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(PFFASTCONV PRIVATE "-fsanitize=address")
  endif()
  target_link_libraries( PFFASTCONV PFFFT ${ASANLIB} ${MATHLIB} )
  target_include_directories(PFFASTCONV INTERFACE
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
  )
  if (INSTALL_PFFASTCONV)
    list(APPEND INSTALL_TARGETS PFFASTCONV)
    list(APPEND INSTALL_HEADERS pffastconv.h)
  endif()
 endif()
 ######################################################
 # Install the collected libraries and headers below the install prefix.
 install( TARGETS ${INSTALL_TARGETS}  DESTINATION lib)
 install( FILES  ${INSTALL_HEADERS}  DESTINATION include)
 # "uninstall" target running the uninstall.cmake script.
 # CMAKE_CURRENT_SOURCE_DIR (instead of CMAKE_SOURCE_DIR) keeps the script
 # path pointing at this directory when the project is added through
 # add_subdirectory(); for a stand-alone build both variables are identical.
 # The guard avoids a clash with an "uninstall" target of a parent project.
 if (NOT TARGET uninstall)
  add_custom_target(uninstall
     "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_SOURCE_DIR}/uninstall.cmake"
  )
 endif()
 #######################################################
 # test_pffft.c is built once per enabled precision; the precision is chosen
 # through the PFFFT_ENABLE_FLOAT / PFFFT_ENABLE_DOUBLE macro.
 if (PFFFT_USE_TYPE_FLOAT)
  add_executable( test_pffft_float  test_pffft.c )
  target_compile_definitions(test_pffft_float PRIVATE
    _USE_MATH_DEFINES
    PFFFT_ENABLE_FLOAT
  )
  target_link_libraries( test_pffft_float  PFFFT ${ASANLIB} )
 endif()
 ######################################################
 if (PFFFT_USE_TYPE_DOUBLE)
  add_executable( test_pffft_double  test_pffft.c )
  target_compile_definitions(test_pffft_double PRIVATE
    _USE_MATH_DEFINES
    PFFFT_ENABLE_DOUBLE
  )
  target_link_libraries( test_pffft_double  PFFFT ${ASANLIB} )
 endif()
 ######################################################
 # test_fft_factors: one program; each enabled precision adds its macro.
 add_executable( test_fft_factors  test_fft_factors.c )
 foreach(precision FLOAT DOUBLE)
  if (PFFFT_USE_TYPE_${precision})
    target_compile_definitions(test_fft_factors PRIVATE PFFFT_ENABLE_${precision})
  endif()
 endforeach()
 target_link_libraries(test_fft_factors PFFFT ${ASANLIB} ${MATHLIB})
 ######################################################
 # test_pffft.cpp is built twice from identical settings: test_pffft_cpp uses
 # the project-wide C++ standard, test_pffft_cpp_11 is switched to C++11.
 foreach(cpp_test test_pffft_cpp test_pffft_cpp_11)
  add_executable( ${cpp_test} test_pffft.cpp )
  target_compile_definitions(${cpp_test} PRIVATE _USE_MATH_DEFINES)
  if (PFFFT_USE_TYPE_FLOAT)
    target_compile_definitions(${cpp_test} PRIVATE PFFFT_ENABLE_FLOAT)
  endif()
  if (PFFFT_USE_TYPE_DOUBLE)
    target_compile_definitions(${cpp_test} PRIVATE PFFFT_ENABLE_DOUBLE)
  endif()
  target_link_libraries( ${cpp_test}  PFFFT ${STDCXXLIB} ${ASANLIB} )
 endforeach()
 ######################################################
 set_target_properties(test_pffft_cpp_11 PROPERTIES
  CXX_STANDARD          11
  CXX_STANDARD_REQUIRED ON
 )
 ######################################################
 # test_pffastconv: test program for PFFASTCONV (float only). The SIMD
 # headers are listed as sources of the executable.
 if (PFFFT_USE_TYPE_FLOAT)
  add_executable(test_pffastconv
    test_pffastconv.c
    ${SIMD_FLOAT_HDRS}
    ${SIMD_DOUBLE_HDRS}
  )
  target_compile_definitions(test_pffastconv PRIVATE _USE_MATH_DEFINES)
  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(test_pffastconv PRIVATE "-fsanitize=address")
  endif()
  # Same architecture handling as for the PFFFT library.
  target_set_c_arch_flags(test_pffastconv)
  if (NOT PFFFT_USE_SIMD)
    target_compile_definitions(test_pffastconv PRIVATE PFFFT_SIMD_DISABLE=1)
  endif()
  target_link_libraries( test_pffastconv  PFFASTCONV ${ASANLIB} ${MATHLIB} )
 endif()
 ######################################################
 # bench_pffft_float: single-precision benchmark. Every optional FFT library
 # that is enabled adds its HAVE_* macro and its link library.
 if (PFFFT_USE_TYPE_FLOAT)
  add_executable(bench_pffft_float   bench_pffft.c pffft.h)
  target_compile_definitions(bench_pffft_float PRIVATE
    _USE_MATH_DEFINES
    PFFFT_ENABLE_FLOAT
  )
  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(bench_pffft_float PRIVATE "-fsanitize=address")
  endif()
  target_link_libraries( bench_pffft_float  PFFFT ${ASANLIB} )
  # FFTPACK (built by this project)
  if (PFFFT_USE_FFTPACK)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_FFTPACK=1)
    target_link_libraries(bench_pffft_float  FFTPACK_FLOAT)
  endif()
  # FFTW3, single-precision library
  if (PFFFT_USE_BENCH_FFTW)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_FFTW=1)
    target_link_libraries(bench_pffft_float  fftw3f)
  endif()
  # Libraries from subdirectories: option ON and subdirectory found.
  if (PFFFT_USE_BENCH_GREEN AND PATH_GREEN)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_GREEN_FFTS=1)
    target_link_libraries(bench_pffft_float  GreenFFT)
  endif()
  if (PFFFT_USE_BENCH_KISS AND PATH_KISS)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_KISS_FFT=1)
    target_link_libraries(bench_pffft_float  KissFFT)
  endif()
  if (PFFFT_USE_BENCH_POCKET AND PATH_POCKET)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_POCKET_FFT=1)
    target_link_libraries(bench_pffft_float  PocketFFT)
  endif()
  # Intel MKL
  if (PFFFT_USE_BENCH_MKL)
    # "i686" and "x86_64" have chances to work; other PROCESSORs could be
    # "ppc", "ppc64", "arm", "aarch64", "armv7l" - or something else?!
    if (NOT ( (CMAKE_SYSTEM_PROCESSOR STREQUAL "i686") OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") ))
      message(WARNING "using Intel MKL on '${CMAKE_SYSTEM_PROCESSOR}' might fail.")
    endif()
    message(STATUS "In case compiling/linking with Intel MKL fails, check CMakeLists.txt or deactivate PFFFT_USE_BENCH_MKL")
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_MKL=1)
    target_link_libraries(bench_pffft_float  mkl_intel_lp64 mkl_sequential -lmkl_core)
  endif()
 endif()
 # bench_pffft_double: double-precision build of the FFT benchmark (bench_pffft.c).
 # Only FFTPACK, FFTW, PocketFFT and MKL are wired up as references for double.
 if (PFFFT_USE_TYPE_DOUBLE)
  add_executable(bench_pffft_double bench_pffft.c pffft.h)
  target_compile_definitions(bench_pffft_double PRIVATE
    _USE_MATH_DEFINES
    PFFFT_ENABLE_DOUBLE
  )
  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(bench_pffft_double PRIVATE -fsanitize=address)
  endif()
  target_link_libraries(bench_pffft_double PFFFT ${ASANLIB})
  if (PFFFT_USE_FFTPACK)
    target_compile_definitions(bench_pffft_double PRIVATE HAVE_FFTPACK=1)
    target_link_libraries(bench_pffft_double FFTPACK_DOUBLE)
  endif()
  if (PFFFT_USE_BENCH_FFTW)
    target_compile_definitions(bench_pffft_double PRIVATE HAVE_FFTW=1)
    # fftw3 is the double-precision FFTW library
    target_link_libraries(bench_pffft_double fftw3)
  endif()
  if (PATH_POCKET AND PFFFT_USE_BENCH_POCKET)
    target_compile_definitions(bench_pffft_double PRIVATE HAVE_POCKET_FFT=1)
    target_link_libraries(bench_pffft_double PocketFFT)
  endif()
  if (PFFFT_USE_BENCH_MKL)
    # i686 and x86_64 have chances to work; warn on every other processor,
    # which could be "ppc", "ppc64", "arm", "aarch64", "armv7l" - or something else
    if (NOT ((CMAKE_SYSTEM_PROCESSOR STREQUAL "i686") OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")))
      message(WARNING "using Intel MKL on '${CMAKE_SYSTEM_PROCESSOR}' might fail.")
    endif()
    message(STATUS "In case compiling/linking with Intel MKL fails, check CMakeLists.txt or deactivate PFFFT_USE_BENCH_MKL")
    target_compile_definitions(bench_pffft_double PRIVATE HAVE_MKL=1)
    target_link_libraries(bench_pffft_double mkl_intel_lp64 mkl_sequential -lmkl_core)
  endif()
 endif()
 ######################################################
 if (PFFFT_USE_TYPE_FLOAT)
    # bench_pf_mixer_float: benchmark built from bench_mixers.cpp, linked against PFDSP.
    add_executable(bench_pf_mixer_float bench_mixers.cpp papi_perf_counter.h)
    target_compile_definitions(bench_pf_mixer_float PRIVATE
      _USE_MATH_DEFINES
      PFFFT_ENABLE_FLOAT
    )
    target_link_libraries(bench_pf_mixer_float ${ASANLIB})
    if (PFFFT_USE_DEBUG_ASAN)
      target_compile_options(bench_pf_mixer_float PRIVATE -fsanitize=address)
    endif()
    # optional PAPI support, enabled via HAVE_PAPI
    if (PAPI_FOUND)
        target_compile_definitions(bench_pf_mixer_float PRIVATE HAVE_PAPI=1)
        target_link_libraries(bench_pf_mixer_float ${PAPI_LIBRARIES})
    endif()
    # the generator expression adds stdc++ only when the C++ compiler is GNU
    target_link_libraries(bench_pf_mixer_float PFDSP $<$<CXX_COMPILER_ID:GNU>:stdc++>)
  ############################################################################
  # pf_conv.cpp is compiled several times; CONV_ARCH_POST tags each variant.
  # "none" variant: built with MIPP_NO_INTRINSICS=1 and without architecture flags.
  add_library(pf_conv_arch_none pf_conv.cpp pf_conv.h pf_cplx.h)
  target_compile_definitions(pf_conv_arch_none PRIVATE
    CONV_ARCH_POST=none
    MIPP_NO_INTRINSICS=1
  )
  set_target_properties(pf_conv_arch_none PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED ON
  )
  target_activate_cxx_compiler_warnings(pf_conv_arch_none)
  # dispatcher library; every pf_conv_arch_* variant gets linked into it
  add_library(pf_conv_dispatcher
    pf_conv_dispatcher.cpp pf_conv_dispatcher.h pf_conv.h pf_cplx.h
  )
  set_target_properties(pf_conv_dispatcher PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED ON
  )
  target_activate_cxx_compiler_warnings(pf_conv_dispatcher)
  # "dflt" variant: uses the project's default C++ architecture flags
  add_library(pf_conv_arch_dflt pf_conv.cpp pf_conv.h pf_cplx.h)
  target_compile_definitions(pf_conv_arch_dflt PRIVATE CONV_ARCH_POST=dflt)
  set_target_properties(pf_conv_arch_dflt PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED ON
  )
  target_activate_cxx_compiler_warnings(pf_conv_arch_dflt)
  target_set_cxx_arch_flags(pf_conv_arch_dflt)
  target_link_libraries(pf_conv_dispatcher pf_conv_arch_none pf_conv_arch_dflt)
  # Select the list of architecture-specific pf_conv variants (PF_CONV_ARCHES)
  # for the target processor/compiler pair. PF_CONV_OPT_<arch> and
  # PF_CONV_EXTRA_<arch> emulate maps from variant name to compiler option.
  # Every fallback branch clears PF_CONV_ARCHES, so no per-arch library is built.
  if ((CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64"))
    if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
        set(PF_CONV_ARCHES "sse3;sse4;avx;avx2")
        set(PF_CONV_OPT_sse3 "core2")  # emulate a map
        set(PF_CONV_OPT_sse4 "nehalem")
        set(PF_CONV_OPT_avx  "sandybridge")
        set(PF_CONV_OPT_avx2 "haswell")
        target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_GCC_AMD64)
    elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
        set(PF_CONV_ARCHES "sse2;avx;avx2")
        set(PF_CONV_OPT_sse2 "SSE2")  # emulate a map
        set(PF_CONV_OPT_avx  "AVX")
        set(PF_CONV_OPT_avx2 "AVX2")
        target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_MSVC_AMD64)
    else()
        set(PF_CONV_ARCHES "")
        message(WARNING "unknown compiler ${CMAKE_CXX_COMPILER_ID} on CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
    endif()
  elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
      if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
          set(PF_CONV_ARCHES "armv8a")
          set(PF_CONV_OPT_armv8a   "armv8-a")  # emulate a map for arch
          target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_GCC_AARCH64)
      else()
          set(PF_CONV_ARCHES "")
          message(WARNING "unknown compiler ${CMAKE_CXX_COMPILER_ID} on CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
      endif()
  elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "armv7l")
    if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
        set(PF_CONV_ARCHES "neon_vfpv4;neon_rpi3_a53;neon_rpi4_a72")
        set(PF_CONV_OPT_neon_vfpv4        "armv7-a")    # emulate a map for arch
        set(PF_CONV_EXTRA_neon_vfpv4      "neon_vfpv4") # emulate a map for additional options (EXTRA)
        set(PF_CONV_OPT_neon_rpi3_a53     "armv7-a")
        set(PF_CONV_EXTRA_neon_rpi3_a53   "neon_rpi3_a53")
        set(PF_CONV_OPT_neon_rpi4_a72     "armv7-a")
        set(PF_CONV_EXTRA_neon_rpi4_a72   "neon_rpi4_a72")
        target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_GCC_ARM32NEON)
    else()
        set(PF_CONV_ARCHES "")
        message(WARNING "unknown compiler ${CMAKE_CXX_COMPILER_ID} on CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
    endif()
  else()
      # clear the list here as well, so a PF_CONV_ARCHES value inherited from the
      # cache or an enclosing scope cannot trigger per-arch builds without options
      set(PF_CONV_ARCHES "")
      message(WARNING "this is unforseen CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
  endif()
  # Build one pf_conv library per entry of PF_CONV_ARCHES and link each
  # of them into the dispatcher.
  foreach (arch_opt ${PF_CONV_ARCHES})
      add_library(pf_conv_arch_${arch_opt} pf_conv.cpp pf_conv.h pf_cplx.h)
      set_target_properties(pf_conv_arch_${arch_opt} PROPERTIES
        CXX_STANDARD 11
        CXX_STANDARD_REQUIRED ON
      )
      target_activate_cxx_compiler_warnings(pf_conv_arch_${arch_opt})
      target_compile_definitions(pf_conv_arch_${arch_opt} PRIVATE CONV_ARCH_POST=${arch_opt})
      # project helper defined elsewhere; receives the OPT entry, the EXTRA entry
      # (empty when not set) and the OPT entry again - TODO confirm argument meaning
      target_set_cxx_arch_option(pf_conv_arch_${arch_opt}
        "${PF_CONV_OPT_${arch_opt}}"
        "${PF_CONV_EXTRA_${arch_opt}}"
        "${PF_CONV_OPT_${arch_opt}}"
      )
      target_link_libraries(pf_conv_dispatcher pf_conv_arch_${arch_opt})
      message(STATUS "added library pf_conv_arch_${arch_opt}  with CONV_ARCH_POST=${arch_opt}")
  endforeach()
  # AddressSanitizer: instrument every pf_conv library - the per-arch variants,
  # the "none" and "dflt" variants and the dispatcher.
  if (PFFFT_USE_DEBUG_ASAN)
      foreach (arch_opt ${PF_CONV_ARCHES})
          target_compile_options(pf_conv_arch_${arch_opt} PRIVATE "-fsanitize=address")
          target_link_libraries( pf_conv_arch_${arch_opt} ${ASANLIB})
      endforeach()
      target_compile_options(pf_conv_arch_none  PRIVATE "-fsanitize=address")
      target_link_libraries( pf_conv_arch_none  ${ASANLIB})
      # pf_conv_arch_dflt is built from the same pf_conv.cpp and must be instrumented too
      target_compile_options(pf_conv_arch_dflt  PRIVATE "-fsanitize=address")
      target_link_libraries( pf_conv_arch_dflt  ${ASANLIB})
      target_compile_options(pf_conv_dispatcher  PRIVATE "-fsanitize=address")
      target_link_libraries(pf_conv_dispatcher ${ASANLIB})
  endif()
  # When MIPP was found, link the per-arch pf_conv variants and the "none"
  # variant against the MIPP target.
  # NOTE(review): pf_conv_arch_dflt is not linked against MIPP here, unlike the
  # other pf_conv_arch_* libraries - confirm whether that is intentional.
  if(MIPP_FOUND)
      foreach (arch_opt ${PF_CONV_ARCHES})
          message(STATUS "link pf_conv_arch_${arch_opt} against MIPP")
          target_link_libraries(pf_conv_arch_${arch_opt} MIPP)
      endforeach()
      message(STATUS "link pf_conv_arch_none against MIPP")
      target_link_libraries(pf_conv_arch_none MIPP)
  endif()
  ############################################################################
  # bench_pf_conv_float: benchmark built from bench_conv.cpp, linked against
  # pf_conv_dispatcher and PFDSP.
  add_executable(bench_pf_conv_float bench_conv.cpp papi_perf_counter.h)
  set_target_properties(bench_pf_conv_float PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED ON
  )
  target_compile_definitions(bench_pf_conv_float PRIVATE
    _USE_MATH_DEFINES
    PFFFT_ENABLE_FLOAT
  )
  if (PFFFT_USE_DEBUG_ASAN)
      target_compile_options(bench_pf_conv_float PRIVATE -fsanitize=address)
  endif()
  target_link_libraries(bench_pf_conv_float ${ASANLIB})
  # optional PAPI support, enabled via HAVE_PAPI
  if (PAPI_FOUND)
      target_compile_definitions(bench_pf_conv_float PRIVATE HAVE_PAPI=1)
      target_link_libraries(bench_pf_conv_float ${PAPI_LIBRARIES})
  endif()
  if(MIPP_FOUND)
      target_link_libraries(bench_pf_conv_float MIPP)
  endif()
  # the generator expression adds stdc++ only when the C++ compiler is GNU
  target_link_libraries(bench_pf_conv_float pf_conv_dispatcher PFDSP $<$<CXX_COMPILER_ID:GNU>:stdc++>)
 endif()
 ######################################################
 add_subdirectory(examples)
 ######################################################
 # Register the CTest tests; all of them run from the build directory.
 enable_testing()
 add_test(
  NAME test_fft_factors
  COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_fft_factors"
  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
 )
 # FFTPACK tests: one per precision, float first
 if (PFFFT_USE_FFTPACK)
  foreach (fftpack_prec float double)
    add_test(
      NAME test_fftpack_${fftpack_prec}
      COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_fftpack_${fftpack_prec}"
      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
    )
  endforeach()
 endif()
 # Tests that need the single-precision executables.
 # COMMAND names the executable targets (bench_pffft_float, test_pffastconv)
 # instead of hard-coded paths: add_test() replaces a target name with the
 # location of the built binary, which also holds for multi-config generators
 # (e.g. Visual Studio) where binaries live in a per-configuration subdirectory.
 if (PFFFT_USE_TYPE_FLOAT)
  # quick FFT benchmark runs: power-of-two lengths and non-power-of-two lengths
  add_test(NAME bench_pffft_pow2
    COMMAND bench_pffft_float "--max-len" "128" "--quick"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )
  add_test(NAME bench_pffft_non2
    COMMAND bench_pffft_float "--non-pow2" "--max-len" "192" "--quick"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )
  # add_test(NAME bench_plots
  #   COMMAND bash "-c" "${CMAKE_CURRENT_SOURCE_DIR}/plots.sh"
  #   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  # )
  # fast convolution: the "lens" tests pass --no-bench, the "bench" tests pass --no-len
  add_test(NAME test_pfconv_lens_symetric
    COMMAND test_pffastconv "--no-bench" "--quick" "--sym"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )
  add_test(NAME test_pfconv_lens_non_sym
    COMMAND test_pffastconv "--no-bench" "--quick"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )
  add_test(NAME bench_pfconv_symetric
    COMMAND test_pffastconv "--no-len" "--quick" "--sym"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )
  add_test(NAME bench_pfconv_non_sym
    COMMAND test_pffastconv "--no-len" "--quick"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )
 endif()
--- a/pffft/LICENSE.txt
+++ b/pffft/LICENSE.txt
@@ -0,0 +1,38 @@
 Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
 Copyright (c) 2019  Hayati Ayguen ( h_ayguen@web.de )
 Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
 Copyright (c) 2004 the University Corporation for Atmospheric
 Research ("UCAR"). All rights reserved. Developed by NCAR's
 Computational and Information Systems Laboratory, UCAR,
 www.cisl.ucar.edu.
 Redistribution and use of the Software in source and binary forms,
 with or without modification, is permitted provided that the
 following conditions are met:
 - Neither the names of NCAR's Computational and Information Systems
 Laboratory, the University Corporation for Atmospheric Research,
 nor the names of its sponsors or contributors may be used to
 endorse or promote products derived from this Software without
 specific prior written permission.  
 - Redistributions of source code must retain the above copyright
 notices, this list of conditions, and the disclaimer below.
 - Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions, and the disclaimer below in the
 documentation and/or other materials provided with the
 distribution.
 THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
 HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
 EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
 SOFTWARE.
--- a/pffft/README.md
+++ b/pffft/README.md
@@ -0,0 +1,352 @@
 ---
 # PFFFT: a pretty fast FFT and fast convolution with PFFASTCONV
 ---
 <!-- toc -->
 - [Brief Description](#brief-description)
 - [Why does it exist?](#why-does-it-exist)
 - [CMake](#cmake)
 - [History / Origin / Changes](#history--origin--changes)
 - [Comparison with other FFTs](#comparison-with-other-ffts)
 - [Dependencies / Required Linux packages](#dependencies--required-linux-packages)
 - [Benchmarks and results](#benchmarks-and-results)
 <!-- tocstop -->
 ---
 ## Brief description:
 PFFFT does 1D Fast Fourier Transforms, of single precision real and
 complex vectors. It tries to do it fast, it tries to be correct, and it
 tries to be small. Computations do take advantage of SSE1 instructions
 on x86 cpus, Altivec on powerpc cpus, and NEON on ARM cpus. The
 license is BSD-like.
 PFFFT is a fork of [Julien Pommier's library on bitbucket](https://bitbucket.org/jpommier/pffft/)
 with some changes and additions.
 PFFASTCONV does fast convolution (FIR filtering), of single precision 
 real vectors, utilizing the PFFFT library. The license is BSD-like.
 PFDSP contains a few other signal processing functions.
 Currently, mixing and carrier generation functions are contained.
 It is work in progress - also the API!
 The fast convolution from PFFASTCONV might get merged into PFDSP.
 ## Why does it exist:
 I (Julien Pommier) was in search of a good performing FFT library ,
 preferably very small and with a very liberal license.
 When one says "fft library", FFTW ("Fastest Fourier Transform in the
 West") is probably the first name that comes to mind -- I guess that
 99% of open-source projects that need a FFT do use FFTW, and are happy
 with it. However, it is quite a large library , which does everything
 fft related (2d transforms, 3d transforms, other transformations such
 as discrete cosine , or fast hartley). And it is licensed under the
 GNU GPL , which means that it cannot be used in non open-source
 products.
 An alternative to FFTW that is really small, is the venerable FFTPACK
 v4, which is available on NETLIB. A more recent version (v5) exists,
 but it is larger as it deals with multi-dimensional transforms. This
 is a library that is written in FORTRAN 77, a language that is now
 considered as a bit antiquated by many. FFTPACKv4 was written in 1985,
 by Dr Paul Swarztrauber of NCAR, more than 25 years ago! And despite
 its age, benchmarks show that it is still a very good performing FFT
 library, see for example the 1d single precision benchmarks
 [here](http://www.fftw.org/speed/opteron-2.2GHz-32bit/). It is however not
 competitive with the fastest ones, such as FFTW, Intel MKL, AMD ACML,
 Apple vDSP. The reason for that is that those libraries do take
 advantage of the SSE SIMD instructions available on Intel CPUs,
 available since the days of the Pentium III. These instructions deal
 with small vectors of 4 floats at a time, instead of a single float
 for a traditional FPU, so when using these instructions one may expect
 a 4-fold performance improvement.
 The idea was to take this fortran fftpack v4 code, translate to C,
 modify it to deal with those SSE instructions, and check that the
 final performance is not completely ridiculous when compared to other
 SIMD FFT libraries. Translation to C was performed with [f2c](
 http://www.netlib.org/f2c/). The resulting file was a bit edited in
 order to remove the thousands of gotos that were introduced by
 f2c. You will find the fftpack.h and fftpack.c sources in the
 repository, this is a complete translation of [fftpack](
 http://www.netlib.org/fftpack/), with the discrete cosine transform
 and the test program. There is no license information in the netlib
 repository, but it was confirmed to me by the fftpack v5 curators that
 the [same terms do apply to fftpack
 v4](http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html). This is a
 "BSD-like" license, it is compatible with proprietary projects.
 Adapting fftpack to deal with the SIMD 4-element vectors instead of
 scalar single precision numbers was more complex than I originally
 thought, especially with the real transforms, and I ended up writing
 more code than I planned..
 ## The code:
 ### Good old C:
 The FFT API is very very simple, just make sure that you read the comments in `pffft.h`.
 The Fast convolution's API is also very simple, just make sure that you read the comments 
 in `pffastconv.h`.
 ### C++:
 A simple C++ wrapper is available in `pffft.hpp`.
 ### Git:
 This archive's source can be downloaded with git (without the submodules):
 ```
 git clone https://github.com/marton78/pffft.git
 ```
 ### Only two files?:
 _"Only two files, in good old C, pffft.c and pffft.h"_
 This statement does **NO LONGER** hold!
 With new functionality and support for AVX, there was need to restructure the sources.
 But you can compile and link **pffft** as a static library.
 ## CMake:
 There's now CMake support to build the static libraries `libPFFFT.a` 
 and `libPFFASTCONV.a` from the source files, plus the additional 
 `libFFTPACK.a` library. The latter's sources are included anyway for the benchmark.
 There are several CMake options to modify library size and optimization.
 You can explore all available options with `cmake-gui` or `ccmake`,
 the console version - after having installed (on Debian/Ubuntu Linux) one of
 ```
 sudo apt-get install cmake-qt-gui
 sudo apt-get install cmake-curses-gui
 ```
 Some of the options:
 * `PFFFT_USE_TYPE_FLOAT` to activate single precision 'float' (default: ON)
 * `PFFFT_USE_TYPE_DOUBLE` to activate 'double' precision float (default: ON)
 * `PFFFT_USE_SIMD` to use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? (default: ON)
 * `DISABLE_SIMD_AVX` to disable AVX CPU features (default: OFF)
 * `PFFFT_USE_SIMD_NEON` to force using NEON on ARM (requires PFFFT_USE_SIMD) (default: OFF)
 * `PFFFT_USE_SCALAR_VECT` to use 4-element vector scalar operations (if no other SIMD) (default: ON)
 Options can be passed to `cmake` at command line, e.g.
 ```
 cmake -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_TYPE_DOUBLE=ON
 ```
 My Linux distribution defaults to GCC. With installed CLANG and the bash shell, you can use it with
 ```
 mkdir build
 cd build
 CC=/usr/bin/clang CXX=/usr/bin/clang++ cmake -DCMAKE_BUILD_TYPE=Debug ../
 cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=~ ../
 ccmake .                          # or: cmake-gui .
 cmake --build .                   # or simply: make
 ctest                             # to execute some tests - including benchmarks
 cmake --build . --target install  # or simply: [sudo] make install
 ```
 With MSVC on Windows, you need some different options. The following ones build a 64-bit Release with Visual Studio 2019:
 ```
 mkdir build
 cd build
 cmake -G "Visual Studio 16 2019" -A x64 ..
 cmake --build . --config Release
 ctest -C Release
 ```
 see [https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators](https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators)
 ## History / Origin / Changes:
 Origin for this code/fork is Julien Pommier's pffft on bitbucket:
 [https://bitbucket.org/jpommier/pffft/](https://bitbucket.org/jpommier/pffft/)
 Git history shows following first commits of the major contributors:
 * Julien Pommier: November 19, 2011
 * Marton Danoczy: September 30, 2015
 * Hayati Ayguen: December 22, 2019
 * Dario Mambro: March 24, 2020
 There are a few other contributors not listed here.
 The main changes include:
 * improved benchmarking, see [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks)
 * double support
 * avx(2) support
 * c++ headers (wrapper)
 * additional API helper functions
 * additional library for fast convolution
 * cmake support
 * ctest
 ## Comparison with other FFTs:
 The idea was not to break speed records, but to get a decently fast
 fft that is at least 50% as fast as the fastest FFT -- especially on
 slowest computers . I'm more focused on getting the best performance
 on slow cpus (Atom, Intel Core 1, old Athlons, ARM Cortex-A9...), than
 on getting top performance on today's fastest cpus.
 It can be used in a real-time context as the fft functions do not
 perform any memory allocation -- that is why they accept a 'work'
 array in their arguments.
 It is also a bit focused on performing 1D convolutions, that is why it
 provides "unordered" FFTs , and a fourier domain convolution
 operation.
 Very interesting is [https://www.nayuki.io/page/free-small-fft-in-multiple-languages](https://www.nayuki.io/page/free-small-fft-in-multiple-languages).
 It shows how small an FFT can be - including the Bluestein algorithm, but it's anything but fast.
 The whole C++ implementation file is 161 lines, including the Copyright header, see
 [https://github.com/nayuki/Nayuki-web-published-code/blob/master/free-small-fft-in-multiple-languages/FftComplex.cpp](https://github.com/nayuki/Nayuki-web-published-code/blob/master/free-small-fft-in-multiple-languages/FftComplex.cpp)
 ## Dependencies / Required Linux packages
 On Debian/Ubuntu Linux following packages should be installed:
 ```
 sudo apt-get install build-essential gcc g++ cmake
 ```
 ## Benchmarks and results
 #### Quicklink
 Find results at [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks).
 #### General
 My (Hayati Ayguen) first look at FFT-benchmarks was with [benchFFT](http://www.fftw.org/benchfft/)
 and especially the results of the benchmarks [results](http://www.fftw.org/speed/),
 which demonstrate the performance of the [FFTW](http://www.fftw.org/).
 Looking at the benchmarked computer systems from today's point of view (2021), these are quite outdated.
 Having a look into the [benchFFT source code](http://www.fftw.org/benchfft/benchfft-3.1.tar.gz),
 the latest source changes, including competitive fft implementations, are dated November 2003.
 In 2019, when pffft got my attention at [bitbucket](https://bitbucket.org/jpommier/pffft/src/master/),
 there were also some benchmark results.
 Unfortunately the results are tables with numbers - without graphical plots.
 Without the plots, I could not get an impression. That is why I started
 [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks),
 which includes GnuPlot figures.
 Today in June 2021, I realized the existence of [https://github.com/FFTW/benchfft](https://github.com/FFTW/benchfft).
 This repository is much more up-to-date with a commit in December 2020.
 Unfortunately, it does not look so simple to get it running - including the generation of plots.
 Is there any website showing benchFFT results of more recent computer systems?
 Of course, it's very important, that a benchmark can be compared with a bunch
 of different FFT algorithms/implementations.
 This requires to have these compiled/built and utilizable.
 #### Git submodules for Green-, Kiss- and Pocket-FFT
 Sources for [Green-](https://github.com/hayguen/greenffts),
 [Kiss-](https://github.com/hayguen/kissfft)
 and [Pocket-FFT](https://github.com/hayguen/pocketfft)
 can be downloaded directly with the sources of this repository - using git submodules:
 ```
 git clone --recursive https://github.com/marton78/pffft.git
 ```
 The `--recursive` option is important: it also fetches the submodules directly.
 But you can also retrieve the submodules later:
 ```
 git submodule update --init
 ```
 #### Fastest Fourier Transform in the West: FFTW
 To allow comparison with FFTW [http://www.fftw.org/](http://www.fftw.org/),
 cmake option `-DPFFFT_USE_BENCH_FFTW=ON` has to be used with following commands.
 The cmake option requires previous setup of following (debian/ubuntu) package:
 ```
 sudo apt-get install libfftw3-dev
 ```
 #### Intel Math Kernel Library: MKL
 Intel's MKL [https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl.html](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl.html)
 currently looks even faster than FFTW.
 On Ubuntu-Linux it's easy to setup with the package `intel-mkl`.
 Similar on Debian: `intel-mkl-full`.
 There are special repositories for following Linux distributions:
 * Debian/apt: [https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html](https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html)
 * RedHat/yum: [https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-yum-repo.html](https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-yum-repo.html)
 * Gentoo/ebuild: [https://packages.gentoo.org/packages/sci-libs/mkl](https://packages.gentoo.org/packages/sci-libs/mkl)
 #### Performing the benchmarks - with CMake
 Benchmarks should be prepared by creating a special build folder
 ```
 mkdir build_benches
 cd build_benches
 cmake ../bench
 ```
 There are several CMake options to parametrize, which fft implementations should be benched.
 You can explore all available options with `cmake-gui` or `ccmake`, see [CMake](#cmake).
 Some of the options:
 * `BENCH_ID`         name the benchmark - used in filename
 * `BENCH_ARCH`       target architecture passed to compiler for code optimization
 * `PFFFT_USE_BENCH_FFTW`   use (system-installed) FFTW3 in fft benchmark? (default: OFF)
 * `PFFFT_USE_BENCH_GREEN`  use Green FFT in fft benchmark? (default: ON)
 * `PFFFT_USE_BENCH_KISS`   use KissFFT in fft benchmark? (default: ON)
 * `PFFFT_USE_BENCH_POCKET` use PocketFFT in fft benchmark? (default: ON)
 * `PFFFT_USE_BENCH_MKL`    use Intel MKL in fft benchmark?  (default: OFF)
 These options can be passed to `cmake` at command line, e.g.
 ```
 cmake -DBENCH_ARCH=native -DPFFFT_USE_BENCH_FFTW=ON -DPFFFT_USE_BENCH_MKL=ON ../bench
 ```
 The benchmarks are built and executed with
 ```
 cmake --build .
 ```
 You can also specify to use a different compiler/version with the cmake step, e.g.:
 ```
 CC=/usr/bin/gcc-9 CXX=/usr/bin/g++-9 cmake -DBENCH_ID=gcc9 -DBENCH_ARCH=native -DPFFFT_USE_BENCH_FFTW=ON -DPFFFT_USE_BENCH_MKL=ON ../bench
 ```
 ```
 CC=/usr/bin/clang-11 CXX=/usr/bin/clang++-11 cmake -DBENCH_ID=clang11 -DBENCH_ARCH=native -DPFFFT_USE_BENCH_FFTW=ON -DPFFFT_USE_BENCH_MKL=ON ../bench
 ```
 For using MSVC/Windows, the cmake command requires/needs the generator and architecture options and to be called from the VS Developer prompt:
 ```
 cmake -G "Visual Studio 16 2019" -A x64 ../bench/
 ```
 see [https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators](https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators)
 For running with different compiler version(s):
 * copy the result file (.tgz), e.g. `cp *.tgz ../`
 * delete the build directory: `rm -rf *`
 * then continue with the cmake step
 #### Benchmark results and contribution
 You might contribute by providing us the results of your computer(s).
 The benchmark results are stored in a separate git-repository:
 See [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks).
 This is to keep this repository's sources small.
--- a/pffft/bench/CMakeLists.txt
+++ b/pffft/bench/CMakeLists.txt
@@ -0,0 +1,224 @@
 # Benchmark driver project: it compiles nothing itself, it only defines the
 # options and helper variables used by the custom targets further below.
 #
 # The policy range keeps 2.8 as the oldest accepted CMake while enabling the
 # newer policies on newer versions. This matters for the compiler checks below:
 # without policy CMP0054 a quoted "MSVC" is dereferenced as the variable MSVC,
 # so the MSVC branch could never match. CMake 4 additionally rejects projects
 # that only request compatibility with versions older than 3.5.
 cmake_minimum_required(VERSION 2.8...3.10)
 project(BENCH_PFFFT)
 set(BENCH_ID  "default" CACHE STRING "ID: use single word without spaces. gets part of result filename")
 option(BENCH_FAST_MATH  "Build with fast math - non IEEE compliant" ON)
 # default target architecture depends on the compiler family
 if (CMAKE_C_COMPILER_ID STREQUAL "GNU")
  set(BENCH_ARCH "native" CACHE STRING "target architecture (-march): native/SSE:core2/AVX:sandybridge/ARM-NEON:armv7-a")
 elseif (CMAKE_C_COMPILER_ID STREQUAL "Clang")
  set(BENCH_ARCH "native" CACHE STRING "target architecture (-march): native/SSE:core2/AVX:sandybridge")
 elseif (CMAKE_C_COMPILER_ID STREQUAL "MSVC")  # others: "Intel"
  set(BENCH_ARCH "AVX" CACHE STRING "target architecture (/arch): SSE2/AVX")
 else()
  set(BENCH_ARCH "" CACHE STRING "target architecture - use full compiler option!")
 endif()
 # architecture/optimization options
 option(PFFFT_USE_SIMD        "use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? - " ON)
 option(DISABLE_SIMD_AVX "disable AVX CPU features? - " OFF)
 option(PFFFT_USE_SIMD_NEON   "force using NEON on ARM? (requires PFFFT_USE_SIMD)" OFF)
 option(PFFFT_USE_SCALAR_VECT "use 4-element vector scalar operations (if no other SIMD)" ON)
 # optional third-party FFT libraries to compare against
 option(PFFFT_USE_BENCH_FFTW   "use (system-installed) FFTW3 in fft benchmark?" OFF)
 option(PFFFT_USE_BENCH_GREEN  "use Green FFT in fft benchmark? - if exists in subdir" ON)
 option(PFFFT_USE_BENCH_KISS   "use KissFFT in fft benchmark? - if exists in subdir" ON)
 option(PFFFT_USE_BENCH_POCKET "use PocketFFT in fft benchmark? - if exists in subdir" ON)
 option(PFFFT_USE_BENCH_MKL    "use Intel MKL in fft benchmark? needs to be installed" OFF)
 # coarse OS name, written to info.txt
 set(OSSTR "")
 if (WIN32)
  set(OSSTR "Win32")
 endif()
 if (UNIX)
  set(OSSTR "Unix")
 endif()
 # MSVC places executables in a per-configuration subdirectory and the nested
 # cmake run needs the platform and make program handed over explicitly
 set(BUILD_DIR_TO_EXE "")
 set(CMAKE_PLATFORM_OPT "")
 set(CMAKE_MAKE_OPT "")
 if (MSVC)
  set(BUILD_DIR_TO_EXE "Release/")
  set(CMAKE_PLATFORM_OPT "-A \"${CMAKE_GENERATOR_PLATFORM}\"")
  set(CMAKE_MAKE_OPT "-DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}")
 endif()
 # Result files go to bench_<id>/ (with one subdirectory per precision);
 # the nested pffft builds get one build_<id>_<precision>/ directory each.
 set(benchdir     "${CMAKE_BINARY_DIR}/bench_${BENCH_ID}")
 set(benchdir_flt "${benchdir}/float")
 set(benchdir_dbl "${benchdir}/double")
 set(builddir_flt "${CMAKE_BINARY_DIR}/build_${BENCH_ID}_float")
 set(builddir_dbl "${CMAKE_BINARY_DIR}/build_${BENCH_ID}_double")
 # Write a plain-text description of this benchmark run to ${benchdir}/info.txt:
 # CMake version, OS/system, compilers, byte order and the selected benchmark
 # options. The ${...} values are expanded when CMake processes this file.
 # The first echo creates/truncates the file with '>', all following ones append
 # with '>>'; this relies on the build tool running the commands through a shell.
 add_custom_command(OUTPUT "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E make_directory "${benchdir}"
  COMMAND ${CMAKE_COMMAND} -E echo "benchmark ${BENCH_ID}"   > "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "CMake major:    ${CMAKE_MAJOR_VERSION}"        >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "CMake minor:    ${CMAKE_MINOR_VERSION}"        >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "OS:             ${OSSTR}"                      >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "System:         ${CMAKE_SYSTEM_NAME}"          >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "System CPU:     ${CMAKE_SYSTEM_PROCESSOR}"     >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "System Version: ${CMAKE_HOST_SYSTEM_VERSION}"  >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "C   Compiler:   ${CMAKE_C_COMPILER_ID}"        >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "C   Version:    ${CMAKE_C_COMPILER_VERSION}"   >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "C++ Compiler:   ${CMAKE_CXX_COMPILER_ID}"      >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "C++ Version:    ${CMAKE_CXX_COMPILER_VERSION}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "MSVC Version:   ${MSVC_VERSION}"               >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "MSVC Toolset:   ${MSVC_TOOLSET_VERSION}"       >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "Exe Suffix:     ${CMAKE_EXECUTABLE_SUFFIX}"    >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "C   Byte Order: ${CMAKE_C_BYTE_ORDER}"         >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "C++ Byte Order: ${CMAKE_CXX_BYTE_ORDER}"       >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo ""                                              >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "Architecture:   ${BENCH_ARCH}"                 >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "Fast math:      ${BENCH_FAST_MATH}"            >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_SIMD=${PFFFT_USE_SIMD}"                   >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "config DISABLE_SIMD_AVX=${DISABLE_SIMD_AVX}"   >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_SIMD_NEON=${PFFFT_USE_SIMD_NEON}"         >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_SCALAR_VECT=${PFFFT_USE_SCALAR_VECT}"     >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_FFTW=${PFFFT_USE_BENCH_FFTW}"       >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_GREEN=${PFFFT_USE_BENCH_GREEN}"     >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_KISS=${PFFFT_USE_BENCH_KISS}"       >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_POCKET=${PFFFT_USE_BENCH_POCKET}"   >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_MKL=${PFFFT_USE_BENCH_MKL}"         >> "${benchdir}/info.txt"
 )
 # ${benchdir}/unix_info.txt is a marker file produced after info.txt.
 # On Unix, unix_info.sh is additionally run through bash with ${benchdir} as
 # working directory, so the files written by that script end up next to
 # info.txt. Elsewhere only the empty marker is created, which lets the
 # packaging targets depend on unix_info.txt on every platform.
 if (UNIX)
  add_custom_command(OUTPUT "${benchdir}/unix_info.txt"
    COMMAND ${CMAKE_COMMAND} -E touch "${benchdir}/unix_info.txt"
    COMMAND bash "-c" "${CMAKE_CURRENT_SOURCE_DIR}/unix_info.sh"
    DEPENDS "${benchdir}/info.txt"
    WORKING_DIRECTORY ${benchdir}
  )
 else()
  add_custom_command(OUTPUT "${benchdir}/unix_info.txt"
    COMMAND ${CMAKE_COMMAND} -E touch "${benchdir}/unix_info.txt"
    DEPENDS "${benchdir}/info.txt"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )
 endif()
 # Every build and result directory is created on demand: the rule for its
 # "directory.txt" marker makes the directory and then touches the marker,
 # so targets can simply depend on the marker file.
 foreach(marker_dir "${builddir_flt}" "${builddir_dbl}" "${benchdir_flt}" "${benchdir_dbl}")
  add_custom_command(OUTPUT "${marker_dir}/directory.txt"
   COMMAND ${CMAKE_COMMAND} -E make_directory "${marker_dir}"
   COMMAND ${CMAKE_COMMAND} -E touch "${marker_dir}/directory.txt"
  )
 endforeach()
 # build_float: configure the project one directory above this one
 # ("${CMAKE_SOURCE_DIR}/..") inside ${builddir_flt} with the same generator,
 # single precision enabled and double precision disabled, forwarding the
 # benchmark options chosen here; then build it in Release configuration.
 # NOTE(review): ARCH, USE_FAST_MATH and USE_FLOAT_PREC must be understood by
 # the parent CMakeLists.txt to have any effect - the CI workflow passes
 # TARGET_C_ARCH/TARGET_CXX_ARCH instead, so confirm these names are current.
 add_custom_target(build_float
  COMMAND ${CMAKE_COMMAND} -E echo "start cmake for float in ${builddir_flt}"
  COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" ${CMAKE_PLATFORM_OPT}
                        "${CMAKE_MAKE_OPT}"
                        -DCMAKE_BUILD_TYPE=Release
                        "-DARCH=${BENCH_ARCH}"
                        -DUSE_FAST_MATH=${BENCH_FAST_MATH}
                        -DPFFFT_USE_TYPE_FLOAT=ON
                        -DPFFFT_USE_TYPE_DOUBLE=OFF
                        -DUSE_FLOAT_PREC=ON
                        -DPFFFT_USE_SIMD=${PFFFT_USE_SIMD}
                        -DDISABLE_SIMD_AVX=${DISABLE_SIMD_AVX}
                        -DPFFFT_USE_SIMD_NEON=${PFFFT_USE_SIMD_NEON}
                        -DPFFFT_USE_SCALAR_VECT=${PFFFT_USE_SCALAR_VECT}
                        -DPFFFT_USE_BENCH_FFTW=${PFFFT_USE_BENCH_FFTW}
                        -DPFFFT_USE_BENCH_GREEN=${PFFFT_USE_BENCH_GREEN}
                        -DPFFFT_USE_BENCH_KISS=${PFFFT_USE_BENCH_KISS}
                        -DPFFFT_USE_BENCH_POCKET=${PFFFT_USE_BENCH_POCKET}
                        -DPFFFT_USE_BENCH_MKL=${PFFFT_USE_BENCH_MKL}
                        "${CMAKE_SOURCE_DIR}/.."
  # COMMAND ${CMAKE_COMMAND} -E echo "start cmake --build . for float in ${builddir_flt}"
  COMMAND ${CMAKE_COMMAND} --build . --config Release
  DEPENDS "${builddir_flt}/directory.txt"
  WORKING_DIRECTORY "${builddir_flt}"
 )
 # build_double: configure the project one directory above this one
 # ("${CMAKE_SOURCE_DIR}/..") inside ${builddir_dbl} with the same generator,
 # double precision enabled and single precision disabled, forwarding the
 # benchmark options chosen here; then build it in Release configuration.
 # NOTE(review): ARCH, USE_FAST_MATH and USE_FLOAT_PREC must be understood by
 # the parent CMakeLists.txt to have any effect - the CI workflow passes
 # TARGET_C_ARCH/TARGET_CXX_ARCH instead, so confirm these names are current.
 add_custom_target(build_double
  COMMAND ${CMAKE_COMMAND} -E echo "start cmake for double in ${builddir_dbl}"
  COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" ${CMAKE_PLATFORM_OPT}
                        "${CMAKE_MAKE_OPT}"
                        -DCMAKE_BUILD_TYPE=Release
                        "-DARCH=${BENCH_ARCH}"
                        -DUSE_FAST_MATH=${BENCH_FAST_MATH}
                        -DPFFFT_USE_TYPE_FLOAT=OFF
                        -DPFFFT_USE_TYPE_DOUBLE=ON
                        -DUSE_FLOAT_PREC=OFF
                        -DPFFFT_USE_SIMD=${PFFFT_USE_SIMD}
                        -DDISABLE_SIMD_AVX=${DISABLE_SIMD_AVX}
                        -DPFFFT_USE_SIMD_NEON=${PFFFT_USE_SIMD_NEON}
                        -DPFFFT_USE_SCALAR_VECT=${PFFFT_USE_SCALAR_VECT}
                        -DPFFFT_USE_BENCH_FFTW=${PFFFT_USE_BENCH_FFTW}
                        -DPFFFT_USE_BENCH_GREEN=${PFFFT_USE_BENCH_GREEN}
                        -DPFFFT_USE_BENCH_KISS=${PFFFT_USE_BENCH_KISS}
                        -DPFFFT_USE_BENCH_POCKET=${PFFFT_USE_BENCH_POCKET}
                        -DPFFFT_USE_BENCH_MKL=${PFFFT_USE_BENCH_MKL}
                        "${CMAKE_SOURCE_DIR}/.."
  COMMAND ${CMAKE_COMMAND} -E echo "start cmake --build . for double in ${builddir_dbl}"
  COMMAND ${CMAKE_COMMAND} --build . --config Release
  DEPENDS "${builddir_dbl}/directory.txt"
  WORKING_DIRECTORY "${builddir_dbl}"
 )
 # bench_float / bench_double: run the benchmark executable produced by the
 # matching build_* target. The working directory is the per-precision result
 # directory; BUILD_DIR_TO_EXE adds the "Release/" subdirectory used by MSVC.
 add_custom_target(bench_float
  COMMAND ${CMAKE_COMMAND} -E echo "start benchmark for float"
  COMMAND "${builddir_flt}/${BUILD_DIR_TO_EXE}bench_pffft_float${CMAKE_EXECUTABLE_SUFFIX}"
  DEPENDS "${benchdir_flt}/directory.txt" build_float
  WORKING_DIRECTORY "${benchdir_flt}"
 )
 add_custom_target(bench_double
  COMMAND ${CMAKE_COMMAND} -E echo "start benchmark for double"
  COMMAND "${builddir_dbl}/${BUILD_DIR_TO_EXE}bench_pffft_double${CMAKE_EXECUTABLE_SUFFIX}"
  DEPENDS "${benchdir_dbl}/directory.txt" build_double
  WORKING_DIRECTORY "${benchdir_dbl}"
 )
 # Packaging targets: each one packs ${benchdir} into bench_${BENCH_ID}.tgz
 # inside ${CMAKE_BINARY_DIR} and prints a hint to mail the archive.
 #  bench            - part of the default build (ALL); depends on both benchmarks
 #  bench_float_tar  - depends on the float benchmark only
 #  bench_double_tar - depends on the double benchmark only
 # All three also depend on info.txt and unix_info.txt.
 add_custom_target(bench ALL
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E tar cvz "bench_${BENCH_ID}.tgz" ${benchdir}
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E echo "now mail result file bench_${BENCH_ID}.tgz"
  # DEPENDS "${benchdir}/info.txt" "${benchdir}/unix_info.txt"
  DEPENDS "${benchdir}/info.txt" bench_float bench_double "${benchdir}/unix_info.txt"
  WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
 )
 add_custom_target(bench_float_tar
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E tar cvz "bench_${BENCH_ID}.tgz" ${benchdir}
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E echo "now mail result file bench_${BENCH_ID}.tgz"
  DEPENDS "${benchdir}/info.txt" bench_float "${benchdir}/unix_info.txt"
  WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
 )
 add_custom_target(bench_double_tar
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E tar cvz "bench_${BENCH_ID}.tgz" ${benchdir}
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E echo "now mail result file bench_${BENCH_ID}.tgz"
  DEPENDS "${benchdir}/info.txt" bench_double "${benchdir}/unix_info.txt"
  WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
 )
 # clean_results: delete the two nested build directories.
 # NOTE(review): despite the name, the result directory ${benchdir} and the
 # .tgz archive are left untouched - confirm this is intended.
 add_custom_target(clean_results
  COMMAND ${CMAKE_COMMAND} -E remove_directory "${builddir_flt}"
  COMMAND ${CMAKE_COMMAND} -E remove_directory "${builddir_dbl}"
  WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
 )
--- a/pffft/bench/unix_info.sh
+++ b/pffft/bench/unix_info.sh
@@ -0,0 +1,9 @@
 #!/bin/bash
 # Collect host details into the current working directory:
 # CPU description, raw /proc/cpuinfo and the distribution identification.
 lscpu > unix_lscpu.txt
 cat /proc/cpuinfo > unix_cpuinfo.txt
 lsb_release -a  > unix_lsb_release.txt
 # copy the distribution's /etc/*-release files, if there are any
 RELEASE_FILES=$(ls -1 /etc/*-release)
 if [ -n "$RELEASE_FILES" ]; then
  cp /etc/*-release ./
 fi
--- a/pffft/bench_conv.cpp
+++ b/pffft/bench_conv.cpp
@@ -0,0 +1,345 @@
 #include <math.h>
 #include <stdio.h>
 #include <string.h>
 #include <assert.h>
 #include <algorithm>
 #include <random>
 #include <cstdint>
 #include <complex>
 #include "papi_perf_counter.h"
 //#if defined(HAVE_MIPP) && !defined(NO_MIPP)
 #if defined(HAVE_MIPP)
 #include <mipp.h>
 #define MIPP_VECTOR  mipp::vector
 #else
 #define MIPP_VECTOR  std::vector
 #endif
 #include "pf_conv_dispatcher.h"
 #include "pf_conv.h"
 #define TEST_WITH_MIN_LEN     0
 /*
  * Create a vector holding M pseudo-random samples in [-1, 1), optionally
  * zero-padded up to a total length of N.
  *
  *  M           number of random samples to generate
  *  N           total vector length; a negative value (default) means N = M.
  *              If N is smaller than M, only N random samples are generated.
  *  seed_value  seed for the Mersenne Twister, making the data reproducible
  *
  * Returns the generated vector.
  */
 MIPP_VECTOR<float> generate_rng_vec(int M, int N = -1, int seed_value = 1)
 {
    const int total = (N < 0) ? M : N;
    MIPP_VECTOR<float> v(total);
    std::mt19937 g;
    g.seed(seed_value);
    // std::mt19937 delivers 32 random bits per draw. Reinterpret them as a
    // signed 32 bit value and scale by 2^-31. The exact-width type is required:
    // int_fast32_t / INT_FAST32_MAX may be 64 bit wide, which would shrink all
    // samples to tiny non-negative values.
    constexpr float scale = 1.0F / (1.0F + float(INT32_MAX));
    // never write beyond the vector, even if M exceeds N or is negative
    const int n_rand = std::max(0, std::min(M, total));
    for (int k = 0; k < n_rand; ++k)
        v[k] = float(std::int32_t(g())) * scale;
    for (int k = n_rand; k < total; ++k)
        v[k] = 0.0F;
    return v;
 }
 int bench_oop_core(
        const conv_f_ptrs & conv_arch,
        const float * signal, const int sz_signal,
        const float * filter, const int sz_filter,
        const int blockLen,
        float * y
        )
 {
    conv_buffer_state state;
    const auto conv_oop = conv_arch.fp_conv_float_oop;
    int n_out_sum = 0;
    state.offset = 0;
    state.size = 0;
    papi_perf_counter perf_counter(1);
    for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
    {
        state.size += blockLen;
        int n_out = conv_oop(signal, &state, filter, sz_filter, y);
        n_out_sum += n_out;
    }
    return n_out_sum;
 }
 int bench_inplace_core(
        const conv_f_ptrs & conv_arch,
        float * signal, const int sz_signal,
        const float * filter, const int sz_filter,
        const int blockLen
        )
 {
    conv_buffer_state state;
    const auto conv_inplace = conv_arch.fp_conv_float_inplace;
    int n_out_sum = 0;
    state.offset = 0;
    state.size = 0;
    papi_perf_counter perf_counter(1);
    for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
    {
        state.size += blockLen;
        int n_out = conv_inplace(signal, &state, filter, sz_filter);
        n_out_sum += n_out;
    }
    return n_out_sum;
 }
 int bench_oop(
        const conv_f_ptrs & conv_arch,
        float * buffer,
        const float * signal, const int sz_signal,
        const float * filter, const int sz_filter,
        const int blockLen,
        float * y
        )
 {
    conv_buffer_state state;
    const auto conv_oop = conv_arch.fp_conv_float_oop;
    const auto move_rest = conv_arch.fp_conv_float_move_rest;
    int n_out_sum = 0;
    state.offset = 0;
    state.size = 0;
    papi_perf_counter perf_counter(1);
    for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
    {
        move_rest(buffer, &state);
        //memcpy(buffer+state.size, &s[off], B * sizeof(s[0]));
        std::copy(&signal[off], &signal[off+blockLen], buffer+state.size);
        state.size += blockLen;
        int n_out = conv_oop(buffer, &state, filter, sz_filter, &y[n_out_sum]);
        n_out_sum += n_out;
    }
    return n_out_sum;
 }
 int bench_cx_real_oop(
        const conv_f_ptrs & conv_arch,
        complexf * buffer,
        const float * signal_re, const int sz_signal_re,
        const float * filter, const int sz_filter,
        const int blockLen,
        float * y_re
        )
 {
    conv_buffer_state state;
    const auto conv_oop = conv_arch.fp_conv_cplx_float_oop;
    const auto move_rest = conv_arch.fp_conv_cplx_move_rest;
    // interpret buffer, signal and output vector y  as complex data
    complexf * y = reinterpret_cast<complexf *>(y_re);
    const complexf * signal = reinterpret_cast<const complexf *>(signal_re);
    const int sz_signal = sz_signal_re / 2;
    int n_out_sum = 0;
    state.offset = 0;
    state.size = 0;
    papi_perf_counter perf_counter(1);
    for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
    {
        move_rest(buffer, &state);
        //memcpy(buffer+state.size, &s[off], B * sizeof(s[0]));
        std::copy(&signal[off], &signal[off+blockLen], &buffer[state.size]);
        state.size += blockLen;
        int n_out = conv_oop(buffer, &state, filter, sz_filter, &y[n_out_sum]);
        n_out_sum += n_out;
    }
    return n_out_sum;
 }
 /*
  * Command line driver of the convolution benchmark.
  * Parses the options, generates random signal and filter data, runs the
  * out-of-place core, out-of-place, complex/real out-of-place and in-place
  * core benchmarks for the selected architecture, and asserts that the guard
  * values placed behind the buffers were not overwritten.
  *
  * Returns 0 on success or when only the usage was requested with -h;
  * returns 1 for invalid parameters or an unavailable architecture.
  */
 int main(int argc, char *argv[])
 {
    // cli defaults:
    // process up to 64 MSample (512 MByte) in blocks of 1 kSamples (=64 kByte) with filterLen 128
    int arch = 0, N = 64 * 1024 * 1024;
    int filterLen = 128, blockLen = 1024;
    int seed_sig = 1, seed_filter = 2;
    bool verbose = false, exitFromUsage = false, showUsage = (argc <= 1);
    for (int i = 1; i < argc; ++i)
    {
        if (i+1 < argc && !strcmp(argv[i], "-a"))
            arch = atoi(argv[++i]);
        else if (i+1 < argc && !strcmp(argv[i], "-n"))
        {
            // scale in 64 bit: 2048 MSamples and more do not fit into an int.
            // Such values (and non-positive ones) become 0, which is rejected below.
            const long long n_samples = (long long)atoi(argv[++i]) * 1024 * 1024;
            N = (n_samples > 0 && n_samples < INT32_MAX) ? int(n_samples) : 0;
        }
        else if (i+1 < argc && !strcmp(argv[i], "-f"))
            filterLen = atoi(argv[++i]);
        else if (i+1 < argc && !strcmp(argv[i], "-b"))
            blockLen = atoi(argv[++i]);
        else if (i+1 < argc && !strcmp(argv[i], "-ss"))
            seed_sig = atoi(argv[++i]);
        else if (i+1 < argc && !strcmp(argv[i], "-sf"))
            seed_filter = atoi(argv[++i]);
        else if (!strcmp(argv[i], "-v"))
            verbose = true;
        else if (!strcmp(argv[i], "-h"))
            showUsage = exitFromUsage = true;
        else
            fprintf(stderr, "warning: ignoring/skipping unknown option '%s'\n", argv[i]);
    }
    int num_arch = 0;
    const ptr_to_conv_f_ptrs * conv_arch_ptrs = get_all_conv_arch_ptrs(&num_arch);
    if (verbose)
    {
        fprintf(stderr, "num_arch is %d\n", num_arch);
        for (int a = 0; a < num_arch; ++a)
            if (conv_arch_ptrs[a])
                fprintf(stderr, " arch %d is '%s'\n", a, conv_arch_ptrs[a]->id );
            else
                fprintf(stderr, " arch %d is nullptr !!!\n", a );
        fprintf(stderr, "\n");
    }
    // lengths must be strictly positive: a negative block length would never
    // terminate the benchmark loops, negative sizes break the allocations
    const bool badParams = ( arch < 0 || arch >= num_arch || blockLen <= 0 || N <= 0 || filterLen <= 0 );
    if ( badParams || showUsage )
    {
        fprintf(stderr, "%s [-v] [-a <arch>] [-n <total # of MSamples>] [-f <filter length>] [-b <blockLength in samples>]\n", argv[0]);
        fprintf(stderr, "    [-ss <random seed for signal>] [-sf <random seed for filter coeffs>]\n");
        fprintf(stderr, "arch is one of:");
        for (int a = 0; a < num_arch; ++a)
            if (conv_arch_ptrs[a])
                fprintf(stderr, " %d for '%s'%s", a, conv_arch_ptrs[a]->id, (a < num_arch-1 ? ",":"") );
        fprintf(stderr, "\n");
        if ( badParams )
            return 1;   // same failure code as for an unavailable architecture
        if ( exitFromUsage )
            return 0;
    }
    if (verbose)
    {
        #ifdef HAVE_PAPI
        fprintf(stderr, "PAPI is available\n");
        #else
        fprintf(stderr, "PAPI is NOT available!\n");
        #endif
    }
    #if !defined(HAVE_MIPP)
    fprintf(stderr, "MIPP is NOT available!\n");
    #endif
    // determine the largest float simd size over all available architectures
    int max_simd_size = -1;
    for (int a = 0; a < num_arch; ++a)
    {
        if (conv_arch_ptrs[a])
        {
            const int sz = conv_arch_ptrs[a]->fp_conv_float_simd_size();
            if (max_simd_size < sz)
                max_simd_size = sz;
            if (verbose)
                fprintf(stderr, "float simd size for '%s': %d\n", conv_arch_ptrs[a]->id, sz);
        }
    }
    if (verbose)
        fprintf(stderr, "max float simd size: %d\n", max_simd_size);
 #if TEST_WITH_MIN_LEN
    filterLen = 2;
 #endif
    if (!conv_arch_ptrs[arch])
    {
        fprintf(stderr, "Error: architecture %d is NOT available!\n", arch);
        return 1;
    }
    // the rounding below divides by max_simd_size
    if (max_simd_size <= 0)
    {
        fprintf(stderr, "Error: invalid max float simd size %d!\n", max_simd_size);
        return 1;
    }
    // round up filter length
    filterLen = max_simd_size * ( ( filterLen + max_simd_size -1 ) / max_simd_size );
 #if TEST_WITH_MIN_LEN
    blockLen = 1;
    N = 2 * (3 + filterLen);    // produce 3+1 samples
 #endif
    const conv_f_ptrs & conv_arch =  *conv_arch_ptrs[arch];
    if (verbose)
        fprintf(stderr, "arch is using mipp: %d\n", conv_arch.using_mipp);
    fprintf(stderr, "processing N = %d MSamples with block length of %d samples with filter length %d taps on '%s'\n",
        N / (1024 * 1024), blockLen, filterLen, conv_arch.id );
    // every buffer gets one extra element holding a guard value
    MIPP_VECTOR<float> s = generate_rng_vec(N + 1, N + 1, seed_sig);
    MIPP_VECTOR<float> y(N + 1, 0.0F);
    MIPP_VECTOR<float> filter = generate_rng_vec(filterLen, filterLen, seed_filter);
    MIPP_VECTOR<float> buffer(blockLen + filterLen + 1, 0.0F);
    MIPP_VECTOR<complexf> buffer_cx(blockLen + filterLen + 1);
 #if 1 && TEST_WITH_MIN_LEN
    for (int k = 0; k < N; ++k)
        s[k] = (k+1);
    for (int k = 0; k < filterLen; ++k)
        filter[k] = (k+1);
 #endif
    // guard values: checked by the asserts after the benchmark runs
    s[N] = 123.0F;
    y[N] = 321.0F;
    buffer[blockLen + filterLen] = 789.0F;
    buffer_cx[blockLen + filterLen].i = 987.0F;
    fprintf(stderr, "\nrunning out-of-place convolution core for '%s':\n", conv_arch.id);
    int n_oop_out = bench_oop_core(conv_arch, s.data(), N, filter.data(), filterLen, blockLen, y.data());
    fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
 #if TEST_WITH_MIN_LEN
    for (int k = 0; k < n_oop_out; ++k )
        fprintf(stderr, "y[%2d] = %g\n", k, y[k]);
    fprintf(stderr, "\n");
 #endif
    fprintf(stderr, "\nrunning out-of-place convolution for '%s':\n", conv_arch.id);
    n_oop_out = bench_oop(conv_arch, buffer.data(), s.data(), N, filter.data(), filterLen, blockLen, y.data());
    fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
    assert(s[N] == 123.0F);
    assert(y[N] == 321.0F);
    assert(buffer[blockLen + filterLen] == 789.0F);
    assert(buffer_cx[blockLen + filterLen].i == 987.0F);
 #if TEST_WITH_MIN_LEN
    for (int k = 0; k < n_oop_out; ++k )
        fprintf(stderr, "y[%2d] = %g\n", k, y[k]);
    fprintf(stderr, "\n");
 #endif
    fprintf(stderr, "\nrunning out-of-place complex/real convolution for '%s':\n", conv_arch.id);
    n_oop_out = bench_cx_real_oop(conv_arch, buffer_cx.data(), s.data(), N, filter.data(), filterLen, blockLen, y.data());
    fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
    assert(s[N] == 123.0F);
    assert(y[N] == 321.0F);
    assert(buffer[blockLen + filterLen] == 789.0F);
    assert(buffer_cx[blockLen + filterLen].i == 987.0F);
 #if TEST_WITH_MIN_LEN
    fprintf(stderr, "complex output (%d complex samples):\n", n_oop_out);
    for (int k = 0; k < n_oop_out; ++k )
        fprintf(stderr, "y[%2d] = %g  %+g * i\n", k, y[2*k], y[2*k+1]);
    fprintf(stderr, "\n");
    // reference result computed with std::complex for manual comparison
    const std::complex<float> * sc = reinterpret_cast< std::complex<float>* >( s.data() );
    const int Nc = N /2;
    fprintf(stderr, "reference with std::complex<float>:\n");
    for (int off = 0; off +filterLen <= Nc; ++off )
    {
        std::complex<float> sum(0.0F, 0.0F);
        for (int k=0; k < filterLen; ++k)
            sum += sc[off+k] * filter[k];
        fprintf(stderr, "yv[%2d] = %g  %+g * i\n", off, sum.real(), sum.imag() );
    }
 #endif
    fprintf(stderr, "\nrunning inplace convolution core for '%s':\n", conv_arch.id);
    int n_inp_out = bench_inplace_core(conv_arch, s.data(), N, filter.data(), filterLen, blockLen);
    fprintf(stderr, "inp produced %d output samples\n", n_inp_out);
    assert(s[N] == 123.0F);
    assert(y[N] == 321.0F);
    assert(buffer[blockLen + filterLen] == 789.0F);
    assert(buffer_cx[blockLen + filterLen].i == 987.0F);
 #if TEST_WITH_MIN_LEN
    for (int k = 0; k < n_inp_out; ++k )
        fprintf(stderr, "y[%2d] = %g\n", k, s[k]);
    fprintf(stderr, "\n");
 #endif
    fprintf(stderr, "\n");
    return 0;
 }
--- a/pffft/bench_mixers.cpp
+++ b/pffft/bench_mixers.cpp
@@ -0,0 +1,889 @@
 /*
  Copyright (c) 2020  Hayati Ayguen ( h_ayguen@web.de )
  bench for mixer algorithm/implementations
 */
 #include <pf_mixer.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 #include <assert.h>
 #include <string.h>
 #include "papi_perf_counter.h"
 #if defined(__linux__)
 #define HAVE_SYS_TIMES
 #endif
 #ifdef HAVE_SYS_TIMES
 #  include <sys/times.h>
 #  include <unistd.h>
 #endif
 #ifdef WIN32
 #define WIN32_LEAN_AND_MEAN
 #define VC_EXTRALEAN
 #include <windows.h>
 #endif
 #define BENCH_REF_TRIG_FUNC       1
 #define BENCH_OUT_OF_PLACE_ALGOS  0
 #define BENCH_INPLACE_ALGOS       1
 #define SAVE_BY_DEFAULT  0
 #define SAVE_LIMIT_MSPS           16
 #if 0
  #define BENCH_FILE_SHIFT_MATH_CC           "/home/ayguen/WindowsDesktop/mixer_test/A_shift_math_cc.bin"
  #define BENCH_FILE_ADD_FAST_CC             "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_cc.bin"
  #define BENCH_FILE_ADD_FAST_INP_C          "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_inp_c.bin"
  #define BENCH_FILE_UNROLL_INP_C            "/home/ayguen/WindowsDesktop/mixer_test/D_shift_unroll_inp_c.bin"
  #define BENCH_FILE_LTD_UNROLL_INP_C        "/home/ayguen/WindowsDesktop/mixer_test/E_shift_limited_unroll_inp_c.bin"
  #define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C  "/home/ayguen/WindowsDesktop/mixer_test/F_shift_limited_unroll_A_sse_inp_c.bin"
  #define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C  "/home/ayguen/WindowsDesktop/mixer_test/G_shift_limited_unroll_B_sse_inp_c.bin"
  #define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C  "/home/ayguen/WindowsDesktop/mixer_test/H_shift_limited_unroll_C_sse_inp_c.bin"
  #define BENCH_FILE_REC_OSC_CC              ""
  #define BENCH_FILE_REC_OSC_INP_C           "/home/ayguen/WindowsDesktop/mixer_test/I_shift_recursive_osc_inp_c.bin"
  #define BENCH_FILE_REC_OSC_SSE_INP_C       "/home/ayguen/WindowsDesktop/mixer_test/J_shift_recursive_osc_sse_inp_c.bin"
 #else
  #define BENCH_FILE_SHIFT_MATH_CC           ""
  #define BENCH_FILE_ADD_FAST_CC             ""
  #define BENCH_FILE_ADD_FAST_INP_C          ""
  #define BENCH_FILE_UNROLL_INP_C            ""
  #define BENCH_FILE_LTD_UNROLL_INP_C        ""
  #define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C  ""
  #define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C  ""
  #define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C  ""
  #define BENCH_FILE_REC_OSC_CC              ""
  #define BENCH_FILE_REC_OSC_INP_C           ""
  #define BENCH_FILE_REC_OSC_SSE_INP_C       ""
 #endif
 #if defined(HAVE_SYS_TIMES)
    /* clock ticks per second as reported by sysconf(); 0 until first use */
    static double ttclk = 0.;
    /*
     * Return the user CPU time of this process in seconds, based on times().
     * With find_start != 0 the function first spins until the user-time tick
     * counter advances, so a measurement starts right at a tick boundary.
     */
    static double uclock_sec(int find_start)
    {
        struct tms now;
        if (ttclk == 0.)
        {
            ttclk = sysconf(_SC_CLK_TCK);
            fprintf(stderr, "sysconf(_SC_CLK_TCK) => %f\n", ttclk);
        }
        times(&now);
        if (find_start)
        {
            const struct tms first = now;
            while (first.tms_utime == now.tms_utime)
                times(&now);
        }
        /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
        return ((double)now.tms_utime) / ttclk;
    }
 #elif defined(WIN32)
    // https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getprocesstimes
    /*
     * Return the total user time of this process in seconds, taken from
     * GetProcessTimes(); returns 0 if that call fails. find_start is unused.
     */
    double uclock_sec(int find_start)
    {
        FILETIME t_creation, t_exit, t_kernel, t_user;
        if (GetProcessTimes(GetCurrentProcess(), &t_creation, &t_exit, &t_kernel, &t_user) == 0)
        {
            //  Handle error
            return 0;
        }
        //  Can be tweaked to include kernel times as well.
        const unsigned long long ticks =
            t_user.dwLowDateTime | ((unsigned long long)t_user.dwHighDateTime << 32);
        return (double)ticks * 0.0000001;
    }
 #else
    /* Fallback: processor time from clock() in seconds. find_start is unused. */
    double uclock_sec(int find_start)
    {
        const double ticks = (double)clock();
        return ticks / (double)CLOCKS_PER_SEC;
    }
 #endif
 /*
  * Write benchmark data to a binary file, one block of B samples per fwrite().
  *
  *  d   samples to write
  *  B   block length in samples
  *  N   number of valid samples; capped at SAVE_LIMIT_MSPS * 1024 * 1024.
  *      Only complete blocks are written.
  *  fn  output file name. If NULL or empty, nothing is saved unless
  *      SAVE_BY_DEFAULT is set, in which case "/dev/shm/bench.bin" is used.
  *
  * A file that cannot be opened is reported on stderr and otherwise ignored.
  */
 void save(complexf * d, int B, int N, const char * fn)
 {
    const bool no_name = (!fn || !fn[0]);
    if (no_name)
    {
        if (! SAVE_BY_DEFAULT)
            return;
        fn = "/dev/shm/bench.bin";
    }
    FILE * out = fopen(fn, "wb");
    if (!out) {
        fprintf(stderr, "error writing result to %s\n", fn);
        return;
    }
    const int max_samples = SAVE_LIMIT_MSPS * 1024 * 1024;
    if ( N >= max_samples )
        N = max_samples;
    int pos = 0;
    while (pos + B <= N)
    {
        fwrite(d+pos, sizeof(complexf), B, out);
        pos += B;
    }
    fclose(out);
 }
 /*
  * Timed core loop for shift_math_cc(): processes the input block by block
  * (B samples each) into output, carrying the phase from call to call.
  * At least one block is processed. The loop ends when no further block fits
  * below N or - unless ignore_time is set - when 500 ms have elapsed.
  * A papi_perf_counter object lives for the duration of the loop.
  *
  *  iters_out  receives the number of processed blocks
  *  off_out    receives the number of processed samples
  *
  * Returns the elapsed time in seconds as measured by uclock_sec().
  */
 double bench_core_shift_math_cc(
        const int B, const int N, const bool ignore_time,
        const complexf *input,
        complexf *output,
        int &iters_out, int &off_out
        )
 {
    const double t_begin = uclock_sec(1);
    const double t_limit = t_begin + 0.5;  /* benchmark duration: 500 ms */
    double t_now;
    float phase = 0.0F;
    int n_done = 0, n_blocks = 0;
    papi_perf_counter perf_counter(1);
    for (;;) {
        // work
        phase = shift_math_cc(input+n_done, output+n_done, B, -0.0009F, phase);
        n_done += B;
        ++n_blocks;
        t_now = uclock_sec(0);
        if ( !(n_done + B < N) )
            break;
        if ( !ignore_time && !(t_now < t_limit) )
            break;
    }
    iters_out = n_blocks;
    off_out = n_done;
    return t_now - t_begin;
 }
 /*
  * Benchmark shift_math_cc(): fill an input buffer of N samples with the
  * recursive oscillator, run the timed core loop, optionally save the output
  * and print the processed amount and duration.
  * Returns the processed samples (blocks * B) per second.
  */
 double bench_shift_math_cc(const int B, const int N, const bool ignore_time) {
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    complexf *output = (complexf *)malloc(N * sizeof(complexf));
    // generate the test signal
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    int n_blocks, n_done;
    const double elapsed = bench_core_shift_math_cc(B, N, ignore_time, input, output,  n_blocks, n_done);
    save(output, B, n_done, BENCH_FILE_SHIFT_MATH_CC);
    free(input);
    free(output);
    printf("processed %f Msamples in %f ms\n", n_done * 1E-6, elapsed*1E3);
    /* number of iterations "normalized" to O(N) = N */
    const double normalized_iters = ((double)n_blocks) * B;
    return (normalized_iters / elapsed);    /* normalized iterations per second */
 }
 /*
  * Benchmark shift_table_cc() with a table of 65536 entries: fill an input
  * buffer of N samples with the recursive oscillator, then process it block
  * by block (B samples each) for up to 500 ms or until no further block fits
  * below N. At least one block is processed.
  * Returns the processed samples (blocks * B) per second.
  * NOTE(review): table_data is not released in this function - check whether
  * pf_mixer.h provides a matching cleanup routine.
  */
 double bench_shift_table_cc(int B, int N) {
    const int table_size=65536;
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    complexf *output = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_table_data_t table_data = shift_table_init(table_size);
    // generate the test signal
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    float phase = 0.0F;
    int n_blocks = 0;
    int n_done = 0;
    const double t_begin = uclock_sec(1);
    const double t_limit = t_begin + 0.5;  /* benchmark duration: 500 ms */
    double t_now;
    for (;;) {
        // work
        phase = shift_table_cc(input+n_done, output+n_done, B, -0.0009F, table_data, phase);
        n_done += B;
        ++n_blocks;
        t_now = uclock_sec(0);
        if ( !(t_now < t_limit && n_done + B < N) )
            break;
    }
    save(output, B, n_done, NULL);
    free(input);
    free(output);
    const double elapsed = ( t_now - t_begin );  /* total duration of the loop */
    printf("processed %f Msamples in %f ms\n", n_done * 1E-6, elapsed*1E3);
    /* number of iterations "normalized" to O(N) = N */
    const double normalized_iters = ((double)n_blocks) * B;
    return (normalized_iters / elapsed);    /* normalized iterations per second */
 }
 /*
  * Benchmark the out-of-place shift_addfast_cc(): fill an input buffer of N
  * samples with the recursive oscillator, then process it block by block
  * (B samples each) for up to 500 ms or until no further block fits below N.
  * At least one block is processed.
  * Returns the processed samples (blocks * B) per second.
  */
 double bench_shift_addfast(int B, int N) {
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    complexf *output = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_addfast_data_t state = shift_addfast_init(-0.0009F);
    // generate the test signal
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    float phase = 0.0F;
    int n_blocks = 0;
    int n_done = 0;
    const double t_begin = uclock_sec(1);
    const double t_limit = t_begin + 0.5;  /* benchmark duration: 500 ms */
    double t_now;
    for (;;) {
        // work
        phase = shift_addfast_cc(input+n_done, output+n_done, B, &state, phase);
        n_done += B;
        ++n_blocks;
        t_now = uclock_sec(0);
        if ( !(t_now < t_limit && n_done + B < N) )
            break;
    }
    save(output, B, n_done, BENCH_FILE_ADD_FAST_CC);
    free(input);
    free(output);
    const double elapsed = ( t_now - t_begin );  /* total duration of the loop */
    printf("processed %f Msamples in %f ms\n", n_done * 1E-6, elapsed*1E3);
    /* number of iterations "normalized" to O(N) = N */
    const double normalized_iters = ((double)n_blocks) * B;
    return (normalized_iters / elapsed);    /* normalized iterations per second */
 }
 /*
  * Timed inner loop of the in-place shift_addfast_inp_c() benchmark.
  *
  * Processes 'data' block by block (B samples per call) and always runs at
  * least one block. Looping stops as soon as off + B >= N or - unless
  * ignore_time is set - once the 500 ms time budget has elapsed.
  *
  * iters_out  receives the number of processed blocks
  * off_out    receives the sample offset reached (blocks * B)
  *
  * Returns the elapsed time between the first and the last uclock_sec() reading.
  */
 double bench_core_shift_addfast_inplace(
        const int B, const int N, const bool ignore_time,
        complexf *data,
        shift_addfast_data_t &state,
        int &iters_out, int &off_out
        )
 {
    const double t_start = uclock_sec(1);
    const double t_limit = t_start + 0.5;  /* benchmark duration: 500 ms */
    double t_now;
    float phase = 0.0F;
    int pos = 0, blocks = 0;
    /* presumably measures for as long as it is in scope - confirm in papi_perf_counter */
    papi_perf_counter perf_counter(1);
    for (;;) {
        // work
        phase = shift_addfast_inp_c(data+pos, B, &state, phase);
        pos += B;
        ++blocks;
        t_now = uclock_sec(0);
        if ( !(pos + B < N) )
            break;  /* end of buffer reached */
        if ( !ignore_time && !(t_now < t_limit) )
            break;  /* time budget used up */
    }
    iters_out = blocks;
    off_out = pos;
    return t_now - t_start;
 }
 /*
  * Benchmark the in-place mixer shift_addfast_inp_c().
  *
  * Generates N complex test samples with the recursive oscillator, runs the
  * timed core loop over them in blocks of B samples and hands the processed
  * buffer to save().
  *
  * B            block length in samples
  * N            number of samples in the test buffer
  * ignore_time  forwarded to the core loop
  *
  * Returns the throughput in samples per second (iter * B / T), or 0.0 when
  * the test buffer cannot be allocated.
  */
 double bench_shift_addfast_inp(int B, int N, const bool ignore_time) {
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    if (!input) {
        /* the buffer can be hundreds of MByte: fail cleanly instead of crashing */
        fprintf(stderr, "error: cannot allocate benchmark buffer for %d samples\n", N);
        return 0.0;
    }
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_addfast_data_t state = shift_addfast_init(-0.0009F);
    int iter, off;
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    double T = bench_core_shift_addfast_inplace(
                B, N, ignore_time, input, state,
                iter, off
                );
    save(input, B, off, BENCH_FILE_ADD_FAST_INP_C);
    free(input);
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    double nI = ((double)iter) * B;  /* total number of processed samples */
    return (nI / T);    /* samples per second */
 }
 double bench_shift_unroll_oop(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    float phase = 0.0F;
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    complexf *output = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
    do {
        // work
        phase = shift_unroll_cc(input+off, output+off, B, &state, phase);
        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );
    save(output, B, off, NULL);
    free(input);
    free(output);
    T = ( t1 - t0 );  /* duration per fft() */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
    return (nI / T);    /* normalized iterations per second */
 }
 /*
  * Timed inner loop of the in-place shift_unroll_inp_c() benchmark.
  *
  * Processes 'data' block by block (B samples per call) and always runs at
  * least one block. Looping stops as soon as off + B >= N or - unless
  * ignore_time is set - once the 500 ms time budget has elapsed.
  *
  * iters_out  receives the number of processed blocks
  * off_out    receives the sample offset reached (blocks * B)
  *
  * Returns the elapsed time between the first and the last uclock_sec() reading.
  */
 double bench_core_shift_unroll_inplace(
        const int B, const int N, const bool ignore_time,
        complexf *data,
        shift_unroll_data_t &state,
        int &iters_out, int &off_out
        )
 {
    const double t_start = uclock_sec(1);
    const double t_limit = t_start + 0.5;  /* benchmark duration: 500 ms */
    double t_now;
    float phase = 0.0F;
    int pos = 0, blocks = 0;
    /* presumably measures for as long as it is in scope - confirm in papi_perf_counter */
    papi_perf_counter perf_counter(1);
    for (;;) {
        // work
        phase = shift_unroll_inp_c(data+pos, B, &state, phase);
        pos += B;
        ++blocks;
        t_now = uclock_sec(0);
        if ( !(pos + B < N) )
            break;  /* end of buffer reached */
        if ( !ignore_time && !(t_now < t_limit) )
            break;  /* time budget used up */
    }
    iters_out = blocks;
    off_out = pos;
    return t_now - t_start;
 }
 /*
  * Benchmark the in-place mixer shift_unroll_inp_c().
  *
  * Generates N complex test samples with the recursive oscillator, runs the
  * timed core loop over them in blocks of B samples, hands the processed
  * buffer to save() and releases the mixer state again.
  *
  * B            block length in samples (also passed to shift_unroll_init())
  * N            number of samples in the test buffer
  * ignore_time  forwarded to the core loop
  *
  * Returns the throughput in samples per second (iter * B / T), or 0.0 when
  * the test buffer cannot be allocated.
  */
 double bench_shift_unroll_inp(const int B, const int N, const bool ignore_time) {
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    if (!input) {
        /* checked before shift_unroll_init() so that nothing else needs cleanup */
        fprintf(stderr, "error: cannot allocate benchmark buffer for %d samples\n", N);
        return 0.0;
    }
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);
    int iter, off;
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    double T = bench_core_shift_unroll_inplace(
                B, N, ignore_time, input, state,
                iter, off
                );
    save(input, B, off, BENCH_FILE_UNROLL_INP_C);
    free(input);
    shift_unroll_deinit(&state);
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    double nI = ((double)iter) * B;  /* total number of processed samples */
    return (nI / T);    /* samples per second */
 }
 /*
  * Benchmark the out-of-place mixer shift_limited_unroll_cc().
  *
  * A test signal of N complex samples is generated with the recursive
  * oscillator and then shifted block by block (B samples per call). At least
  * one block is always processed; the loop ends when the 500 ms time budget
  * is spent or when off + B >= N.
  *
  * B  block length in samples
  * N  number of samples in the input and output buffers
  *
  * Returns the throughput in samples per second (iter * B / T), or 0.0 when
  * the buffers cannot be allocated.
  */
 double bench_shift_limited_unroll_oop(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    complexf *output = (complexf *)malloc(N * sizeof(complexf));
    if (!input || !output) {
        /* the buffers can be hundreds of MByte: fail cleanly instead of crashing */
        fprintf(stderr, "error: cannot allocate benchmark buffers for %d samples\n", N);
        free(input);   /* free(NULL) is a no-op */
        free(output);
        return 0.0;
    }
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
    do {
        // work
        shift_limited_unroll_cc(input+off, output+off, B, &state);
        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );
    save(output, B, off, NULL);
    free(input);
    free(output);
    T = ( t1 - t0 );  /* total duration of the timed loop */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B;  /* total number of processed samples */
    return (nI / T);    /* samples per second */
 }
 /*
  * Timed inner loop of the in-place shift_limited_unroll_inp_c() benchmark.
  *
  * Processes 'data' block by block (B samples per call) and always runs at
  * least one block. Looping stops as soon as off + B >= N or - unless
  * ignore_time is set - once the 500 ms time budget has elapsed.
  *
  * iters_out  receives the number of processed blocks
  * off_out    receives the sample offset reached (blocks * B)
  *
  * Returns the elapsed time between the first and the last uclock_sec() reading.
  */
 double bench_core_shift_limited_unroll_inplace(
        const int B, const int N, const bool ignore_time,
        complexf *data,
        shift_limited_unroll_data_t &state,
        int &iters_out, int &off_out
        )
 {
    const double t_start = uclock_sec(1);
    const double t_limit = t_start + 0.5;  /* benchmark duration: 500 ms */
    double t_now;
    int pos = 0, blocks = 0;
    /* presumably measures for as long as it is in scope - confirm in papi_perf_counter */
    papi_perf_counter perf_counter(1);
    for (;;) {
        // work
        shift_limited_unroll_inp_c(data+pos, B, &state);
        pos += B;
        ++blocks;
        t_now = uclock_sec(0);
        if ( !(pos + B < N) )
            break;  /* end of buffer reached */
        if ( !ignore_time && !(t_now < t_limit) )
            break;  /* time budget used up */
    }
    iters_out = blocks;
    off_out = pos;
    return t_now - t_start;
 }
 /*
  * Benchmark the in-place mixer shift_limited_unroll_inp_c().
  *
  * Generates N complex test samples with the recursive oscillator, runs the
  * timed core loop over them in blocks of B samples and hands the processed
  * buffer to save().
  *
  * B            block length in samples
  * N            number of samples in the test buffer
  * ignore_time  forwarded to the core loop
  *
  * Returns the throughput in samples per second (iter * B / T), or 0.0 when
  * the test buffer cannot be allocated.
  */
 double bench_shift_limited_unroll_inp(const int B, const int N, const bool ignore_time) {
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    if (!input) {
        /* the buffer can be hundreds of MByte: fail cleanly instead of crashing */
        fprintf(stderr, "error: cannot allocate benchmark buffer for %d samples\n", N);
        return 0.0;
    }
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);
    int iter, off;
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    double T = bench_core_shift_limited_unroll_inplace(
                B, N, ignore_time, input, state,
                iter, off
                );
    save(input, B, off, BENCH_FILE_LTD_UNROLL_INP_C);
    free(input);
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    double nI = ((double)iter) * B;  /* total number of processed samples */
    return (nI / T);    /* samples per second */
 }
 /*
  * Timed inner loop of the in-place shift_limited_unroll_A_sse_inp_c() benchmark.
  *
  * Processes 'data' block by block (B samples per call) and always runs at
  * least one block. Looping stops as soon as off + B >= N or - unless
  * ignore_time is set - once the 500 ms time budget has elapsed.
  *
  * iters_out  receives the number of processed blocks
  * off_out    receives the sample offset reached (blocks * B)
  *
  * Returns the elapsed time between the first and the last uclock_sec() reading.
  */
 double bench_core_shift_limited_unroll_A_sse_inplace(
        const int B, const int N, const bool ignore_time,
        complexf *data,
        shift_limited_unroll_A_sse_data_t &state,
        int &iters_out, int &off_out
        )
 {
    const double t_start = uclock_sec(1);
    const double t_limit = t_start + 0.5;  /* benchmark duration: 500 ms */
    double t_now;
    int pos = 0, blocks = 0;
    /* presumably measures for as long as it is in scope - confirm in papi_perf_counter */
    papi_perf_counter perf_counter(1);
    for (;;) {
        // work
        shift_limited_unroll_A_sse_inp_c(data+pos, B, &state);
        pos += B;
        ++blocks;
        t_now = uclock_sec(0);
        if ( !(pos + B < N) )
            break;  /* end of buffer reached */
        if ( !ignore_time && !(t_now < t_limit) )
            break;  /* time budget used up */
    }
    iters_out = blocks;
    off_out = pos;
    return t_now - t_start;
 }
 /*
  * Benchmark the in-place mixer shift_limited_unroll_A_sse_inp_c().
  *
  * Generates N complex test samples with the recursive oscillator, runs the
  * timed core loop over them in blocks of B samples and hands the processed
  * buffer to save(). The mixer state lives on the heap.
  *
  * B            block length in samples
  * N            number of samples in the test buffer
  * ignore_time  forwarded to the core loop
  *
  * Returns the throughput in samples per second (iter * B / T), or 0.0 when
  * the test buffer or the mixer state cannot be allocated.
  */
 double bench_shift_limited_unroll_A_sse_inp(const int B, const int N, const bool ignore_time) {
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    shift_limited_unroll_A_sse_data_t *state = (shift_limited_unroll_A_sse_data_t*)malloc(sizeof(shift_limited_unroll_A_sse_data_t));
    if (!input || !state) {
        /* both pointers are dereferenced below: fail cleanly instead of crashing */
        fprintf(stderr, "error: cannot allocate benchmark buffers for %d samples\n", N);
        free(input);   /* free(NULL) is a no-op */
        free(state);
        return 0.0;
    }
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    int iter, off;
    *state = shift_limited_unroll_A_sse_init(-0.0009F, 0.0F);
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    double T = bench_core_shift_limited_unroll_A_sse_inplace(
                B, N, ignore_time, input, *state,
                iter, off
                );
    save(input, B, off, BENCH_FILE_LTD_UNROLL_A_SSE_INP_C);
    free(input);
    free(state);
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    double nI = ((double)iter) * B;  /* total number of processed samples */
    return (nI / T);    /* samples per second */
 }
 /*
  * Timed inner loop of the in-place shift_limited_unroll_B_sse_inp_c() benchmark.
  *
  * Processes 'data' block by block (B samples per call) and always runs at
  * least one block. Looping stops as soon as off + B >= N or - unless
  * ignore_time is set - once the 500 ms time budget has elapsed.
  *
  * iters_out  receives the number of processed blocks
  * off_out    receives the sample offset reached (blocks * B)
  *
  * Returns the elapsed time between the first and the last uclock_sec() reading.
  */
 double bench_core_shift_limited_unroll_B_sse_inplace(
        const int B, const int N, const bool ignore_time,
        complexf *data,
        shift_limited_unroll_B_sse_data_t &state,
        int &iters_out, int &off_out
        )
 {
    const double t_start = uclock_sec(1);
    const double t_limit = t_start + 0.5;  /* benchmark duration: 500 ms */
    double t_now;
    int pos = 0, blocks = 0;
    /* presumably measures for as long as it is in scope - confirm in papi_perf_counter */
    papi_perf_counter perf_counter(1);
    for (;;) {
        // work
        shift_limited_unroll_B_sse_inp_c(data+pos, B, &state);
        pos += B;
        ++blocks;
        t_now = uclock_sec(0);
        if ( !(pos + B < N) )
            break;  /* end of buffer reached */
        if ( !ignore_time && !(t_now < t_limit) )
            break;  /* time budget used up */
    }
    iters_out = blocks;
    off_out = pos;
    return t_now - t_start;
 }
 /*
  * Benchmark the in-place mixer shift_limited_unroll_B_sse_inp_c().
  *
  * Generates N complex test samples with the recursive oscillator, runs the
  * timed core loop over them in blocks of B samples and hands the processed
  * buffer to save(). The mixer state lives on the heap.
  *
  * B            block length in samples
  * N            number of samples in the test buffer
  * ignore_time  forwarded to the core loop
  *
  * Returns the throughput in samples per second (iter * B / T), or 0.0 when
  * the test buffer or the mixer state cannot be allocated.
  */
 double bench_shift_limited_unroll_B_sse_inp(const int B, const int N, const bool ignore_time) {
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    shift_limited_unroll_B_sse_data_t *state = (shift_limited_unroll_B_sse_data_t*)malloc(sizeof(shift_limited_unroll_B_sse_data_t));
    if (!input || !state) {
        /* both pointers are dereferenced below: fail cleanly instead of crashing */
        fprintf(stderr, "error: cannot allocate benchmark buffers for %d samples\n", N);
        free(input);   /* free(NULL) is a no-op */
        free(state);
        return 0.0;
    }
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    int iter, off;
    *state = shift_limited_unroll_B_sse_init(-0.0009F, 0.0F);
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    //shift_recursive_osc_init(0.0F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    double T = bench_core_shift_limited_unroll_B_sse_inplace(
                B, N, ignore_time, input, *state,
                iter, off
                );
    save(input, B, off, BENCH_FILE_LTD_UNROLL_B_SSE_INP_C);
    free(input);
    free(state);
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    double nI = ((double)iter) * B;  /* total number of processed samples */
    return (nI / T);    /* samples per second */
 }
 /*
  * Timed inner loop of the in-place shift_limited_unroll_C_sse_inp_c() benchmark.
  *
  * Processes 'data' block by block (B samples per call) and always runs at
  * least one block. Looping stops as soon as off + B >= N or - unless
  * ignore_time is set - once the 500 ms time budget has elapsed.
  *
  * iters_out  receives the number of processed blocks
  * off_out    receives the sample offset reached (blocks * B)
  *
  * Returns the elapsed time between the first and the last uclock_sec() reading.
  */
 double bench_core_shift_limited_unroll_C_sse_inplace(
        const int B, const int N, const bool ignore_time,
        complexf *data,
        shift_limited_unroll_C_sse_data_t &state,
        int &iters_out, int &off_out
        )
 {
    const double t_start = uclock_sec(1);
    const double t_limit = t_start + 0.5;  /* benchmark duration: 500 ms */
    double t_now;
    int pos = 0, blocks = 0;
    /* presumably measures for as long as it is in scope - confirm in papi_perf_counter */
    papi_perf_counter perf_counter(1);
    for (;;) {
        // work
        shift_limited_unroll_C_sse_inp_c(data+pos, B, &state);
        pos += B;
        ++blocks;
        t_now = uclock_sec(0);
        if ( !(pos + B < N) )
            break;  /* end of buffer reached */
        if ( !ignore_time && !(t_now < t_limit) )
            break;  /* time budget used up */
    }
    iters_out = blocks;
    off_out = pos;
    return t_now - t_start;
 }
 /*
  * Benchmark the in-place mixer shift_limited_unroll_C_sse_inp_c().
  *
  * Generates N complex test samples with the recursive oscillator, runs the
  * timed core loop over them in blocks of B samples and hands the processed
  * buffer to save(). The mixer state lives on the heap.
  *
  * B            block length in samples
  * N            number of samples in the test buffer
  * ignore_time  forwarded to the core loop
  *
  * Returns the throughput in samples per second (iter * B / T), or 0.0 when
  * the test buffer or the mixer state cannot be allocated.
  */
 double bench_shift_limited_unroll_C_sse_inp(const int B, const int N, const bool ignore_time) {
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    shift_limited_unroll_C_sse_data_t *state = (shift_limited_unroll_C_sse_data_t*)malloc(sizeof(shift_limited_unroll_C_sse_data_t));
    if (!input || !state) {
        /* both pointers are dereferenced below: fail cleanly instead of crashing */
        fprintf(stderr, "error: cannot allocate benchmark buffers for %d samples\n", N);
        free(input);   /* free(NULL) is a no-op */
        free(state);
        return 0.0;
    }
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    int iter, off;
    *state = shift_limited_unroll_C_sse_init(-0.0009F, 0.0F);
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    double T = bench_core_shift_limited_unroll_C_sse_inplace(
                B, N, ignore_time, input, *state,
                iter, off
                );
    save(input, B, off, BENCH_FILE_LTD_UNROLL_C_SSE_INP_C);
    free(input);
    free(state);
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    double nI = ((double)iter) * B;  /* total number of processed samples */
    return (nI / T);    /* samples per second */
 }
 /*
  * Benchmark the out-of-place mixer shift_recursive_osc_cc().
  *
  * One recursive oscillator generates the N-sample test signal, a second one
  * (shift_conf / shift_state) performs the shift block by block (B samples
  * per call). At least one block is always processed; the loop ends when the
  * 500 ms time budget is spent or when off + B >= N.
  *
  * B  block length in samples
  * N  number of samples in the input and output buffers
  *
  * Returns the throughput in samples per second (iter * B / T), or 0.0 when
  * the buffers cannot be allocated.
  */
 double bench_shift_rec_osc_cc_oop(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    complexf *output = (complexf *)malloc(N * sizeof(complexf));
    if (!input || !output) {
        /* the buffers can be hundreds of MByte: fail cleanly instead of crashing */
        fprintf(stderr, "error: cannot allocate benchmark buffers for %d samples\n", N);
        free(input);   /* free(NULL) is a no-op */
        free(output);
        return 0.0;
    }
    shift_recursive_osc_t gen_state, shift_state;
    shift_recursive_osc_conf_t gen_conf, shift_conf;
    shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
    do {
        // work
        shift_recursive_osc_cc(input+off, output+off, B, &shift_conf, &shift_state);
        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );
    /* NOTE(review): the *input* goes to BENCH_FILE_REC_OSC_CC and the output to
     * the NULL target, unlike the other out-of-place benchmarks, which pass
     * their output - confirm that this is intended */
    save(input, B, off, BENCH_FILE_REC_OSC_CC);
    save(output, B, off, NULL);
    free(input);
    free(output);
    T = ( t1 - t0 );  /* total duration of the timed loop */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B;  /* total number of processed samples */
    return (nI / T);    /* samples per second */
 }
 /*
  * Timed inner loop of the in-place shift_recursive_osc_inp_c() benchmark.
  *
  * Processes 'data' block by block (B samples per call) and always runs at
  * least one block. Looping stops as soon as off + B >= N or - unless
  * ignore_time is set - once the 500 ms time budget has elapsed.
  *
  * conf, state  oscillator configuration and running state handed to the mixer
  * iters_out    receives the number of processed blocks
  * off_out      receives the sample offset reached (blocks * B)
  *
  * Returns the elapsed time between the first and the last uclock_sec() reading.
  */
 double bench_core_shift_rec_osc_cc_inplace(
        const int B, const int N, const bool ignore_time,
        complexf *data,
        shift_recursive_osc_conf_t &conf, shift_recursive_osc_t &state,
        int &iters_out, int &off_out
        )
 {
    const double t_start = uclock_sec(1);
    const double t_limit = t_start + 0.5;  /* benchmark duration: 500 ms */
    double t_now;
    int pos = 0, blocks = 0;
    /* presumably measures for as long as it is in scope - confirm in papi_perf_counter */
    papi_perf_counter perf_counter(1);
    for (;;) {
        // work
        shift_recursive_osc_inp_c(data+pos, B, &conf, &state);
        pos += B;
        ++blocks;
        t_now = uclock_sec(0);
        if ( !(pos + B < N) )
            break;  /* end of buffer reached */
        if ( !ignore_time && !(t_now < t_limit) )
            break;  /* time budget used up */
    }
    iters_out = blocks;
    off_out = pos;
    return t_now - t_start;
 }
 /*
  * Benchmark the in-place mixer shift_recursive_osc_inp_c().
  *
  * One recursive oscillator generates the N-sample test signal, a second one
  * (shift_conf / shift_state) is used by the timed core loop, which processes
  * the buffer in blocks of B samples. The processed buffer is handed to save().
  *
  * B            block length in samples
  * N            number of samples in the test buffer
  * ignore_time  forwarded to the core loop
  *
  * Returns the throughput in samples per second (iter * B / T), or 0.0 when
  * the test buffer cannot be allocated.
  */
 double bench_shift_rec_osc_cc_inp(const int B, const int N, const bool ignore_time) {
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    if (!input) {
        /* the buffer can be hundreds of MByte: fail cleanly instead of crashing */
        fprintf(stderr, "error: cannot allocate benchmark buffer for %d samples\n", N);
        return 0.0;
    }
    shift_recursive_osc_t gen_state, shift_state;
    shift_recursive_osc_conf_t gen_conf, shift_conf;
    int iter, off;
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);
    double T = bench_core_shift_rec_osc_cc_inplace(
                B, N, ignore_time, input, shift_conf, shift_state,
                iter, off
                );
    save(input, B, off, BENCH_FILE_REC_OSC_INP_C);
    free(input);
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    double nI = ((double)iter) * B;  /* total number of processed samples */
    return (nI / T);    /* samples per second */
 }
 /*
  * Timed inner loop of the in-place shift_recursive_osc_sse_inp_c() benchmark.
  *
  * Processes 'data' block by block (B samples per call) and always runs at
  * least one block. Looping stops as soon as off + B >= N or - unless
  * ignore_time is set - once the 500 ms time budget has elapsed.
  *
  * conf, state  oscillator configuration and running state handed to the mixer
  * iters_out    receives the number of processed blocks
  * off_out      receives the sample offset reached (blocks * B)
  *
  * Returns the elapsed time between the first and the last uclock_sec() reading.
  */
 double bench_core_shift_rec_osc_sse_c_inplace(
        const int B, const int N, const bool ignore_time,
        complexf *data,
        shift_recursive_osc_sse_conf_t &conf, shift_recursive_osc_sse_t &state,
        int &iters_out, int &off_out
        )
 {
    const double t_start = uclock_sec(1);
    const double t_limit = t_start + 0.5;  /* benchmark duration: 500 ms */
    double t_now;
    int pos = 0, blocks = 0;
    /* presumably measures for as long as it is in scope - confirm in papi_perf_counter */
    papi_perf_counter perf_counter(1);
    for (;;) {
        // work
        shift_recursive_osc_sse_inp_c(data+pos, B, &conf, &state);
        pos += B;
        ++blocks;
        t_now = uclock_sec(0);
        if ( !(pos + B < N) )
            break;  /* end of buffer reached */
        if ( !ignore_time && !(t_now < t_limit) )
            break;  /* time budget used up */
    }
    iters_out = blocks;
    off_out = pos;
    return t_now - t_start;
 }
 /*
  * Benchmark the in-place mixer shift_recursive_osc_sse_inp_c().
  *
  * The scalar recursive oscillator generates the N-sample test signal; the
  * SSE oscillator (shift_conf / heap-allocated shift_state) is used by the
  * timed core loop, which processes the buffer in blocks of B samples. The
  * processed buffer is handed to save().
  *
  * B            block length in samples
  * N            number of samples in the test buffer
  * ignore_time  forwarded to the core loop
  *
  * Returns the throughput in samples per second (iter * B / T), or 0.0 when
  * the test buffer or the oscillator state cannot be allocated.
  */
 double bench_shift_rec_osc_sse_c_inp(const int B, const int N, const bool ignore_time) {
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_sse_t *shift_state = (shift_recursive_osc_sse_t*)malloc(sizeof(shift_recursive_osc_sse_t));
    if (!input || !shift_state) {
        /* both pointers are used below: fail cleanly instead of crashing */
        fprintf(stderr, "error: cannot allocate benchmark buffers for %d samples\n", N);
        free(input);   /* free(NULL) is a no-op */
        free(shift_state);
        return 0.0;
    }
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_recursive_osc_sse_conf_t shift_conf;
    int iter, off;
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    shift_recursive_osc_sse_init(-0.0009F, 0.0F, &shift_conf, shift_state);
    double T = bench_core_shift_rec_osc_sse_c_inplace(
                B, N, ignore_time, input, shift_conf, *shift_state,
                iter, off
                );
    save(input, B, off, BENCH_FILE_REC_OSC_SSE_INP_C);
    free(input);
    free(shift_state);
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    double nI = ((double)iter) * B;  /* total number of processed samples */
    return (nI / T);    /* samples per second */
 }
 /*
  * Entry point of the mixer benchmark.
  *
  * Usage: <program> [<blockLength in samples> [<total # of MSamples>]]
  *
  * Without arguments the usage line is printed and the benchmarks run with
  * the defaults (8 kSamples per block, 64 MSamples in total). A block length
  * or sample count of zero prints the usage line and exits with status 0.
  * Arguments that would make the benchmarks index outside their buffers
  * (negative values, block length larger than the sample count, sample count
  * overflowing an int) are rejected with exit status 1.
  *
  * Which benchmark groups run is selected at compile time through
  * BENCH_REF_TRIG_FUNC, BENCH_OUT_OF_PLACE_ALGOS and BENCH_INPLACE_ALGOS.
  */
 int main(int argc, char **argv)
 {
    double rt;
    // process up to 64 MSample (512 MByte) in blocks of 8 kSamples (=64 kByte)
    int B = 8 * 1024;
    int N = 64 * 1024 * 1024;
    int showUsage = 0;
    bool ignore_time = true;
    /* largest MSample count whose sample count (x 2^20) still fits into a
     * 32-bit int; assumes int has (at least) 32 bits */
    const int maxMSamples = 2047;
    if (argc == 1)
        showUsage = 1;
    if (1 < argc)
        B = atoi(argv[1]);
    if (2 < argc)
    {
        const int mSamples = atoi(argv[2]);
        if ( mSamples < 0 || mSamples > maxMSamples )
        {
            /* the multiplication below would yield a negative or overflowed N */
            fprintf(stderr, "error: total # of MSamples must be in the range 1 .. %d\n", maxMSamples);
            return 1;
        }
        N = mSamples * 1024 * 1024;
    }
    if ( !B || !N || showUsage )
    {
        fprintf(stderr, "%s [<blockLength in samples> [<total # of MSamples>] ]\n", argv[0]);
        if ( !B || !N )
            return 0;
    }
    if ( B < 0 || B > N )
    {
        /* every benchmark processes one block of B samples unconditionally,
         * so B must fit into the N-sample buffers */
        fprintf(stderr, "error: block length must be in the range 1 .. %d (total number of samples)\n", N);
        return 1;
    }
    fprintf(stderr, "processing up to N = %d MSamples with block length of %d samples\n",
        N / (1024 * 1024), B );
 #if BENCH_REF_TRIG_FUNC
    printf("\nstarting bench of shift_math_cc (out-of-place) with trig functions ..\n");
    rt = bench_shift_math_cc(B, N, ignore_time);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);
 #endif
 #if BENCH_OUT_OF_PLACE_ALGOS
    printf("starting bench of shift_table_cc (out-of-place) ..\n");
    rt = bench_shift_table_cc(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);
    printf("starting bench of shift_addfast_cc (out-of-place) ..\n");
    rt = bench_shift_addfast(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);
    printf("\nstarting bench of shift_unroll_cc (out-of-place) ..\n");
    rt = bench_shift_unroll_oop(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);
    printf("\nstarting bench of shift_limited_unroll_cc (out-of-place) ..\n");
    rt = bench_shift_limited_unroll_oop(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);
    printf("\nstarting bench of shift_recursive_osc_cc (out-of-place) ..\n");
    rt = bench_shift_rec_osc_cc_oop(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);
 #endif
 #if BENCH_INPLACE_ALGOS
    printf("starting bench of shift_addfast_inp_c in-place ..\n");
    rt = bench_shift_addfast_inp(B, N, ignore_time);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);
    printf("starting bench of shift_unroll_inp_c in-place ..\n");
    rt = bench_shift_unroll_inp(B, N, ignore_time);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);
    printf("starting bench of shift_limited_unroll_inp_c in-place ..\n");
    rt = bench_shift_limited_unroll_inp(B, N, ignore_time);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);
    /* the SSE variants only run when an SSE mixer implementation is available */
    if ( have_sse_shift_mixer_impl() )
    {
        printf("starting bench of shift_limited_unroll_A_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_A_sse_inp(B, N, ignore_time);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);
        printf("starting bench of shift_limited_unroll_B_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_B_sse_inp(B, N, ignore_time);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);
        printf("starting bench of shift_limited_unroll_C_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_C_sse_inp(B, N, ignore_time);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);
    }
    printf("starting bench of shift_recursive_osc_cc in-place ..\n");
    rt = bench_shift_rec_osc_cc_inp(B, N, ignore_time);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);
    if ( have_sse_shift_mixer_impl() )
    {
        printf("starting bench of shift_recursive_osc_sse_c in-place ..\n");
        rt = bench_shift_rec_osc_sse_c_inp(B, N, ignore_time);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);
    }
 #endif
    return 0;
 }
--- a/pffft/bench_pffft.c
+++ b/pffft/bench_pffft.c
--- a/pffft/cmake/FindMIPP.cmake
+++ b/pffft/cmake/FindMIPP.cmake
@@ -0,0 +1,26 @@
 # Find the header-only MIPP library
 # Once done this will define
 #  MIPP_FOUND - System has MIPP
 #  MIPP_INCLUDE_DIRS - The directory containing mipp.h
 # and create the interface library target MIPP (defines HAVE_MIPP=1).
 # be quiet when the include directory is already known (e.g. from the cache)
 if(MIPP_INCLUDE_DIRS)
  set(MIPP_FIND_QUIETLY TRUE)
 endif()
 find_path(MIPP_INCLUDE_DIRS NAMES mipp.h
    HINTS
        ${MIPP_ROOT}
        $ENV{HOME}/.local
    PATH_SUFFIXES include/mipp
 )
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(MIPP DEFAULT_MSG MIPP_INCLUDE_DIRS)
 if(MIPP_FOUND AND NOT TARGET MIPP)
    message(STATUS "MIPP_FOUND -> creating interface library MIPP at ${MIPP_INCLUDE_DIRS}")
    add_library(MIPP INTERFACE)
    target_compile_definitions(MIPP INTERFACE HAVE_MIPP=1)
    target_include_directories(MIPP INTERFACE ${MIPP_INCLUDE_DIRS})
    target_compile_features(MIPP INTERFACE cxx_std_11)
 elseif(NOT MIPP_FOUND)
    # only warn when MIPP is really missing - not when it was found and the
    # target already exists from an earlier find_package(MIPP) call
    message(WARNING "MIPP not found.")
 endif()
 mark_as_advanced(MIPP_INCLUDE_DIRS)
--- a/pffft/cmake/FindPAPI.cmake
+++ b/pffft/cmake/FindPAPI.cmake
@@ -0,0 +1,25 @@
 # Find PAPI libraries
 # Once done this will define
 #  PAPI_FOUND - System has PAPI
 #  PAPI_INCLUDE_DIRS - The PAPI include directories
 #  PAPI_LIBRARIES - The libraries needed to use PAPI
 # and create the imported target PAPI::PAPI.
 # be quiet when both locations are already known (e.g. from the cache)
 if(PAPI_INCLUDE_DIRS AND PAPI_LIBRARIES)
  set(PAPI_FIND_QUIETLY TRUE)
 endif()
 find_path(PAPI_INCLUDE_DIRS NAMES papi.h HINTS ${PAPI_ROOT} PATH_SUFFIXES include)
 find_library(PAPI_LIBRARIES NAMES papi HINTS ${PAPI_ROOT} PATH_SUFFIXES lib lib64)
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(PAPI DEFAULT_MSG PAPI_LIBRARIES PAPI_INCLUDE_DIRS)
 if(PAPI_FOUND AND NOT TARGET PAPI::PAPI)
    # remember the bare library file: IMPORTED_LOCATION must name exactly one
    # file and must not be the "<lib>;rt" list that PAPI_LIBRARIES becomes below
    set(_papi_library_file "${PAPI_LIBRARIES}")
    set(PAPI_LIBRARIES ${PAPI_LIBRARIES} rt)
    add_library(PAPI::PAPI SHARED IMPORTED)
    set_target_properties(PAPI::PAPI PROPERTIES
        INTERFACE_INCLUDE_DIRECTORIES "${PAPI_INCLUDE_DIRS}"
        IMPORTED_LOCATION "${_papi_library_file}"
        INTERFACE_LINK_LIBRARIES "rt")
    unset(_papi_library_file)
 endif()
 mark_as_advanced(PAPI_INCLUDE_DIRS PAPI_LIBRARIES)
--- a/pffft/cmake/compiler_warnings.cmake
+++ b/pffft/cmake/compiler_warnings.cmake
@@ -0,0 +1,11 @@
 # target_activate_cxx_compiler_warnings(<target>)
 #   adds -Wall -Wextra -pedantic as PRIVATE compile options of <target>
 #   when the C++ compiler is GNU or Clang; does nothing for other compilers
 function(target_activate_cxx_compiler_warnings target)
    # a generator expression holding several options must be one quoted
    # argument with ';' separators, otherwise it is split at the spaces
    target_compile_options(${target} PRIVATE "$<$<CXX_COMPILER_ID:GNU>:-Wall;-Wextra;-pedantic>")
    target_compile_options(${target} PRIVATE "$<$<CXX_COMPILER_ID:Clang>:-Wall;-Wextra;-pedantic>")
 endfunction()
 # target_activate_c_compiler_warnings(<target>)
 #   adds -Wall -Wextra -pedantic as PRIVATE compile options of <target>
 #   when the C compiler is GNU or Clang; does nothing for other compilers
 function(target_activate_c_compiler_warnings target)
    # a generator expression holding several options must be one quoted
    # argument with ';' separators, otherwise it is split at the spaces
    target_compile_options(${target} PRIVATE "$<$<C_COMPILER_ID:GNU>:-Wall;-Wextra;-pedantic>")
    target_compile_options(${target} PRIVATE "$<$<C_COMPILER_ID:Clang>:-Wall;-Wextra;-pedantic>")
 endfunction()
--- a/pffft/cmake/target_optimizations.cmake
+++ b/pffft/cmake/target_optimizations.cmake
@@ -0,0 +1,197 @@
 # cmake options: TARGET_C_ARCH / TARGET_CXX_ARCH:
 #   and optionally:  TARGET_C_EXTRA TARGET_CXX_EXTRA
 #
 # provided:
 #   - function: target_set_c_arch_flags(<target>)    # uses options TARGET_C_ARCH and TARGET_C_EXTRA
 #   - function: target_set_cxx_arch_flags(<target>)  # uses options TARGET_CXX_ARCH and TARGET_CXX_EXTRA
 #   - macro:    target_set_cxx_arch_option(<target> <gcc/clang_march> <gcc/clang_extra> <msvc_arch>)
 #
 # see https://en.wikichip.org/wiki/x86/extensions
 # and https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html
 #   for gcc specific architecture options
 # and https://docs.microsoft.com/en-us/cpp/build/reference/arch-x64
 # or  https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86
 #   for msvc specific architecture options
 # https://en.wikichip.org/wiki/arm/versions
 # https://en.wikipedia.org/wiki/Raspberry_Pi
 # https://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html#ARM-Options
 # https://en.wikipedia.org/wiki/Comparison_of_ARMv7-A_cores
 # https://en.wikipedia.org/wiki/Comparison_of_ARMv8-A_cores
 # arm32_rpi1 untested
 #   -mcpu=arm1176jzf-s -mfloat-abi=hard -mfpu=vfp         -mtune=arm1176jzf-s
 # arm32_rpi2 untested
 #   "-march=armv7-a"   "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
 #   "-march=armv8-a"   "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
 # arm32_rpi3 with "armv7-a" tested on Raspbian GNU/Linux 10 (buster), 32-bit  => MIPP test reports: NEONv1, 128 bits
 #   "-march=armv7-a"   "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
 # arm32_rpi3 with "armv8-a" tested on Raspbian GNU/Linux 10 (buster), 32-bit  => MIPP test reports: NEONv1, 128 bits
 #   "-march=armv8-a"   "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
 # arm32_rpi3 with "armv8-a" tested on Raspbian GNU/Linux 10 (buster), 32-bit  => MIPP test reports: NEONv1, 128 bits
 #   "-march=armv8-a"   "-mfloat-abi=hard" "-mfpu=neon-vfpv4" "-mtune=cortex-a53"
 # arm32_rpi4 untested
 #   RPi 4 Model B:    Cortex-A72  =>  "-mtune=cortex-a72"  ?
 #   "-mcpu=cortex-a72 -mfloat-abi=hard -mfpu=neon-fp-armv8 -mneon-for-64bits  -mtune=cortex-a72"
 # Named sets of additional compiler options. The name chosen through
 # TARGET_C_EXTRA / TARGET_CXX_EXTRA is expanded as GCC_EXTRA_OPT_<name>
 # by the functions further below.
 set(MSVC_EXTRA_OPT_none "")
 set(GCC_EXTRA_OPT_none "")
 set(GCC_EXTRA_OPT_neon_vfpv4    "-mfloat-abi=hard" "-mfpu=neon-vfpv4")
 set(GCC_EXTRA_OPT_neon_rpi3_a53 "-mfloat-abi=hard" "-mfpu=neon-vfpv4" "-mtune=cortex-a53")
 set(GCC_EXTRA_OPT_neon_rpi4_a72 "-mfloat-abi=hard" "-mfpu=neon-fp-armv8" "-mtune=cortex-a72")
 # Per target processor: the description text (GCC_MARCH_DESC), the selectable
 # -march values (GCC_MARCH_VALUES) and the selectable extra option sets
 # (GCC_EXTRA_VALUES; empty when there are none).
 if ( (CMAKE_SYSTEM_PROCESSOR STREQUAL "i686") OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") )
    set(GCC_MARCH_DESC "native/SSE2:pentium4/SSE3:core2/SSE4:nehalem/AVX:sandybridge/AVX2:haswell")
    set(GCC_MARCH_VALUES "none;native;pentium4;core2;nehalem;sandybridge;haswell" CACHE INTERNAL "List of possible architectures")
    set(GCC_EXTRA_VALUES "" CACHE INTERNAL "List of possible EXTRA options")
 elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    set(GCC_MARCH_DESC "native/ARMwNEON:armv8-a")
    set(GCC_MARCH_VALUES "none;native;armv8-a" CACHE INTERNAL "List of possible architectures")
    set(GCC_EXTRA_VALUES "" CACHE INTERNAL "List of possible additional options")
 elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "armv7l")
    # only 32-bit ARM offers the neon_* extra option sets defined above
    set(GCC_MARCH_DESC "native/ARMwNEON:armv7-a")
    set(GCC_MARCH_VALUES "none;native;armv7-a" CACHE INTERNAL "List of possible architectures")
    set(GCC_EXTRA_VALUES "none;neon_vfpv4;neon_rpi3_a53;neon_rpi4_a72" CACHE INTERNAL "List of possible additional options")
 else()
    message(WARNING "unsupported CMAKE_SYSTEM_PROCESSOR '${CMAKE_SYSTEM_PROCESSOR}'")
    # other PROCESSORs could be "ppc", "ppc64",  "arm" - or something else?!
    set(GCC_MARCH_DESC "native")
    set(GCC_MARCH_VALUES "none;native" CACHE INTERNAL "List of possible architectures")
    set(GCC_EXTRA_VALUES "" CACHE INTERNAL "List of possible additional options")
 endif()
 # cmake options - depending on the C/C++ compiler
 # (the C and C++ option groups are kept separate: the two compilers could,
 #  at least in theory, come from different vendors)
 # C compiler: defines the cache option TARGET_C_ARCH and - when extra option
 # sets exist for the target processor - TARGET_C_EXTRA
 if (CMAKE_C_COMPILER_ID STREQUAL "GNU")
    set(TARGET_C_ARCH "none" CACHE STRING "gcc target C architecture (-march): ${GCC_MARCH_DESC}")
    set_property(CACHE TARGET_C_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES})
    if ( NOT (GCC_EXTRA_VALUES STREQUAL "") )
        set(TARGET_C_EXTRA "none" CACHE STRING "gcc additional options for C")
        set_property(CACHE TARGET_C_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES})
    endif()
 elseif (CMAKE_C_COMPILER_ID STREQUAL "Clang")
    set(TARGET_C_ARCH "none" CACHE STRING "clang target C architecture (-march): ${GCC_MARCH_DESC}")
    set_property(CACHE TARGET_C_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES})
    if ( NOT (GCC_EXTRA_VALUES STREQUAL "") )
        # help text names clang, matching the C++ option of the Clang branch
        set(TARGET_C_EXTRA "none" CACHE STRING "clang additional options for C")
        set_property(CACHE TARGET_C_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES})
    endif()
 elseif (CMAKE_C_COMPILER_ID MATCHES "MSVC")
    set(TARGET_C_ARCH "none" CACHE STRING "msvc target C architecture (/arch): SSE2/AVX/AVX2/AVX512")
    set(TARGET_C_EXTRA "none" CACHE STRING "msvc additional options")
 else()
    message(WARNING "unsupported C compiler '${CMAKE_C_COMPILER_ID}', see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
 endif()
 # C++ compiler: defines the cache option TARGET_CXX_ARCH and - when extra
 # option sets exist for the target processor - TARGET_CXX_EXTRA
 if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
    set(TARGET_CXX_ARCH "none" CACHE STRING "gcc target C++ architecture (-march): ${GCC_MARCH_DESC}")
    set_property(CACHE TARGET_CXX_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES})
    if ( NOT (GCC_EXTRA_VALUES STREQUAL "") )
        set(TARGET_CXX_EXTRA "none" CACHE STRING "gcc additional options for C++")
        set_property(CACHE TARGET_CXX_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES})
    endif()
 elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
    # clang is offered the same -march values and extra option sets as gcc
    set(TARGET_CXX_ARCH "none" CACHE STRING "clang target C++ architecture (-march): ${GCC_MARCH_DESC}")
    set_property(CACHE TARGET_CXX_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES})
    if ( NOT (GCC_EXTRA_VALUES STREQUAL "") )
        set(TARGET_CXX_EXTRA "none" CACHE STRING "clang additional options for C++")
        set_property(CACHE TARGET_CXX_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES})
    endif()
 elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
    # no STRINGS property here: the value is free text, passed on as /arch:<value>
    set(TARGET_CXX_ARCH "none" CACHE STRING "msvc target C++ architecture (/arch): SSE2/AVX/AVX2/AVX512")
    set(TARGET_CXX_EXTRA "none" CACHE STRING "msvc additional options")
 else()
    message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}', see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
 endif()
 ######################################################
 # target_set_c_arch_flags(<target>)
 #   Applies the options TARGET_C_ARCH (-march=<value> for GNU/Clang,
 #   /arch:<value> for MSVC) and TARGET_C_EXTRA (option set
 #   GCC_EXTRA_OPT_<value>, GNU/Clang only) as PRIVATE compile options of
 #   <target>. An empty value or "none" leaves the respective group unset.
 #   PFFFT_ENABLE_NEON=1 is defined only when an extra option set is selected
 #   and it is a neon_* set or the target processor is aarch64.
 function(target_set_c_arch_flags target)
    # classify the C compiler once; both option groups dispatch on the result
    if ( (CMAKE_C_COMPILER_ID STREQUAL "GNU") OR (CMAKE_C_COMPILER_ID STREQUAL "Clang") )
        set(_c_flavor "gnu_like")
    elseif (CMAKE_C_COMPILER_ID MATCHES "MSVC")
        set(_c_flavor "msvc")
    else()
        set(_c_flavor "other")
    endif()
    # group 1: target architecture
    if ( (NOT ("${TARGET_C_ARCH}" STREQUAL "")) AND (NOT ("${TARGET_C_ARCH}" STREQUAL "none")) )
        if ("${_c_flavor}" STREQUAL "gnu_like")
            target_compile_options(${target} PRIVATE "-march=${TARGET_C_ARCH}")
            message(STATUS "C ARCH for target ${target} set: ${TARGET_C_ARCH}")
        elseif ("${_c_flavor}" STREQUAL "msvc")
            target_compile_options(${target} PRIVATE "/arch:${TARGET_C_ARCH}")
            message(STATUS "C ARCH for target ${target} set: ${TARGET_C_ARCH}")
        else()
            message(WARNING "unsupported C compiler '${CMAKE_C_COMPILER_ID}' for target_set_c_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
        endif()
    else()
        message(STATUS "C ARCH for target ${target} is not set!")
    endif()
    # group 2: additional option set
    if ( (NOT ("${TARGET_C_EXTRA}" STREQUAL "")) AND (NOT ("${TARGET_C_EXTRA}" STREQUAL "none")) )
        if ("${_c_flavor}" STREQUAL "gnu_like")
            target_compile_options(${target} PRIVATE "${GCC_EXTRA_OPT_${TARGET_C_EXTRA}}")
            message(STATUS "C additional options for target ${target} set: ${GCC_EXTRA_OPT_${TARGET_C_EXTRA}}")
        elseif ("${_c_flavor}" STREQUAL "msvc")
            # target_compile_options(${target} PRIVATE "${MSVC_EXTRA_OPT_${TARGET_C_EXTRA}}")
            message(STATUS "C additional options for target ${target} not usable with MSVC")
        else()
            message(WARNING "unsupported C compiler '${CMAKE_C_COMPILER_ID}' for target_set_c_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
        endif()
        # NEON is only considered while an extra option set is selected
        if ( ("${TARGET_C_EXTRA}" MATCHES "^neon_.*") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") )
            message(STATUS "additional option contains neon: setting PFFFT_ENABLE_NEON for C target ${target}")
            target_compile_definitions(${target} PRIVATE PFFFT_ENABLE_NEON=1)
        endif()
    else()
        message(STATUS "C additional options for target ${target} is not set!")
    endif()
 endfunction()
 # target_set_cxx_arch_flags(<target>)
 #   Applies the options TARGET_CXX_ARCH (-march=<value> for GNU/Clang,
 #   /arch:<value> for MSVC) and TARGET_CXX_EXTRA (option set
 #   GCC_EXTRA_OPT_<value>, GNU/Clang only) as PRIVATE compile options of
 #   <target>. An empty value or "none" leaves the respective group unset.
 function(target_set_cxx_arch_flags target)
    if ( ("${TARGET_CXX_ARCH}" STREQUAL "") OR ("${TARGET_CXX_ARCH}" STREQUAL "none") )
        message(STATUS "C++ ARCH for target ${target} is not set!")
    else()
        if ( (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") )
            target_compile_options(${target} PRIVATE "-march=${TARGET_CXX_ARCH}")
            message(STATUS "C++ ARCH for target ${target} set: ${TARGET_CXX_ARCH}")
        elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
            target_compile_options(${target} PRIVATE "/arch:${TARGET_CXX_ARCH}")
            message(STATUS "C++ ARCH for target ${target} set: ${TARGET_CXX_ARCH}")
        else()
            message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}' for target_set_cxx_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
        endif()
    endif()
    if ( ("${TARGET_CXX_EXTRA}" STREQUAL "") OR ("${TARGET_CXX_EXTRA}" STREQUAL "none") )
        message(STATUS "C++ additional options for target ${target} is not set!")
    else()
        # the C++ options must be chosen by the C++ compiler id (not the C one),
        # exactly as in the architecture section above
        if ( (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") )
            target_compile_options(${target} PRIVATE "${GCC_EXTRA_OPT_${TARGET_CXX_EXTRA}}")
            message(STATUS "C++ additional options for target ${target} set: ${GCC_EXTRA_OPT_${TARGET_CXX_EXTRA}}")
        elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
            # target_compile_options(${target} PRIVATE "${MSVC_EXTRA_OPT_${TARGET_CXX_EXTRA}}")
            message(STATUS "C++ additional options for target ${target} not usable with MSVC")
        else()
            message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}' for target_set_cxx_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
        endif()
        # NOTE(review): this check is only reached while an extra option set is
        # selected, so the aarch64 alternative never fires with the default
        # (empty) TARGET_CXX_EXTRA - confirm that this is intended
        if ( ("${TARGET_CXX_EXTRA}" MATCHES "^neon_.*") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") )
            message(STATUS "additional option contains 'neon': setting PFFFT_ENABLE_NEON for C++ target ${target}")
            target_compile_definitions(${target} PRIVATE PFFFT_ENABLE_NEON=1)
        endif()
    endif()
 endfunction()
 # target_set_cxx_arch_option(<target> <gcc/clang_march> <gcc/clang_extra> <msvc_arch>)
 #   Sets architecture options for <target> from explicit arguments instead of
 #   the TARGET_CXX_ARCH / TARGET_CXX_EXTRA options:
 #     gcc_clang_arch   value for -march= (GNU/Clang)
 #     gcc_clang_extra  name of an option set, expanded as GCC_EXTRA_OPT_<name> (GNU/Clang)
 #     msvc_arch        value for /arch: (MSVC)
 #   An empty value or "none" skips the respective option.
 macro(target_set_cxx_arch_option target gcc_clang_arch gcc_clang_extra msvc_arch )
    if ( (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") )
        if ( NOT (("${gcc_clang_arch}" STREQUAL "") OR ("${gcc_clang_arch}" STREQUAL "none") ) )
            target_compile_options(${target} PRIVATE "-march=${gcc_clang_arch}")
            message(STATUS "C++ ARCH for target ${target}: ${gcc_clang_arch}")
        endif()
        if (NOT ( ("${gcc_clang_extra}" STREQUAL "") OR ("${gcc_clang_extra}" STREQUAL "none") ) )
            target_compile_options(${target} PRIVATE "${GCC_EXTRA_OPT_${gcc_clang_extra}}")
            message(STATUS "C++ additional options for target ${target}: ${GCC_EXTRA_OPT_${gcc_clang_extra}}")
        endif()
    elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
        # MSVC: only the /arch: value is used, gcc_clang_extra is ignored
        if ( NOT (("${msvc_arch}" STREQUAL "") OR ("${msvc_arch}" STREQUAL "none") ) )
            target_compile_options(${target} PRIVATE "/arch:${msvc_arch}")
            message(STATUS "C++ ARCH for target ${target} set: ${msvc_arch}")
        endif()
    else()
        message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}' for target_set_cxx_arch_option(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
    endif()
 endmacro()
--- a/pffft/cross_build_mingw32.sh
+++ b/pffft/cross_build_mingw32.sh
@@ -0,0 +1,25 @@
 #!/bin/bash
 # Cross-builds pffft for 32-bit Windows with the MinGW-w64 toolchain file.
 # requires debian/ubuntu packages: zip gcc-mingw-w64
 #
 # usage: cross_build_mingw32.sh <zip-post> <any other cmake options>
 #   <zip-post>  suffix used in the build and install directory names
 #   remaining arguments are forwarded unchanged to the cmake configure step
 if [ -z "$1" ]; then
  echo "usage: $0 <zip-post> <any other cmake options>"
  exit 1
 fi
 ZIP_POST="$1"
 shift
 # CROSS is not referenced anywhere below in this script
 CROSS="i686-w64-mingw32"
 WN="w32"
 TOOLCHAIN="mingw-w32-i686.cmake"
 # All paths derived from the user-supplied suffix are quoted, so a suffix
 # containing spaces or glob characters cannot make 'rm -rf' hit other paths.
 BUILD_DIR="build_${WN}_${ZIP_POST}"
 INSTALL_DIR="pffft_bin-${WN}_${ZIP_POST}"
 # always start from a clean build directory
 rm -rf "${BUILD_DIR}"
 echo -e "\n\n********************************************************"
 echo "start build of pffft_${WN}_${ZIP_POST}"
 # configure and build; each step only runs if the previous one succeeded
 mkdir "${BUILD_DIR}" && \
 cmake -S . -B "${BUILD_DIR}" \
  -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN}" \
  -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \
  "$@" && \
 cmake --build "${BUILD_DIR}"
--- a/pffft/cross_build_mingw64.sh
+++ b/pffft/cross_build_mingw64.sh
@@ -0,0 +1,25 @@
 #!/bin/bash
 # Cross-builds pffft for 64-bit Windows with the MinGW-w64 toolchain file.
 # requires debian/ubuntu packages: zip gcc-mingw-w64
 #
 # usage: cross_build_mingw64.sh <zip-post> <any other cmake options>
 #   <zip-post>  suffix used in the build and install directory names
 #   remaining arguments are forwarded unchanged to the cmake configure step
 if [ -z "$1" ]; then
  echo "usage: $0 <zip-post> <any other cmake options>"
  exit 1
 fi
 ZIP_POST="$1"
 shift
 # CROSS="x86_64-w64-mingw32"
 WN="w64"
 TOOLCHAIN="mingw-w64-x64_64.cmake"
 # All paths derived from the user-supplied suffix are quoted, so a suffix
 # containing spaces or glob characters cannot make 'rm -rf' hit other paths.
 BUILD_DIR="build_${WN}_${ZIP_POST}"
 INSTALL_DIR="pffft_bin-${WN}_${ZIP_POST}"
 # always start from a clean build directory
 rm -rf "${BUILD_DIR}"
 echo -e "\n\n********************************************************"
 echo "start build of pffft_${WN}_${ZIP_POST}"
 # configure and build; each step only runs if the previous one succeeded
 mkdir "${BUILD_DIR}" && \
 cmake -S . -B "${BUILD_DIR}" \
  -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN}" \
  -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \
  "$@" && \
 cmake --build "${BUILD_DIR}"
--- a/pffft/examples/CMakeLists.txt
+++ b/pffft/examples/CMakeLists.txt
@@ -0,0 +1,63 @@
 cmake_minimum_required(VERSION 3.1)
 project(examples)
 # Pick the math library to link the examples against:
 #  - MSVC: none (MATHLIB is set to an empty string)
 #  - other compilers: "m", unless PFFFT_DISABLE_LINK_WITH_M is set,
 #    in which case MATHLIB is left untouched
 if ( CMAKE_C_COMPILER_ID MATCHES "MSVC" )
  # using Visual Studio C++
  message(STATUS "INFO: detected MSVC: will not link math lib m")
  set(MATHLIB "")
  add_definitions("/D_CRT_SECURE_NO_WARNINGS")
  set(MSVC_DISABLED_WARNINGS_LIST "C4996")
 elseif( NOT PFFFT_DISABLE_LINK_WITH_M )
  message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m")
  set(MATHLIB "m")
 endif()
 # MinGW builds additionally link the C++ examples against stdc++
 if (MINGW)
  set(STDCXXLIB "stdc++")
 else()
  set(STDCXXLIB "")
 endif()
 # request the plain language standard, without compiler extensions
 set(CMAKE_CXX_EXTENSIONS OFF)
 # Double precision examples: only built when PFFFT_USE_TYPE_DOUBLE is enabled.
 if (PFFFT_USE_TYPE_DOUBLE)
  # C++11 example: real input, double precision
  add_executable(example_cpp11_real_dbl_fwd example_cpp11_real_dbl_fwd.cpp)
  target_compile_definitions(example_cpp11_real_dbl_fwd PRIVATE PFFFT_ENABLE_DOUBLE)
  target_link_libraries(example_cpp11_real_dbl_fwd PFFFT ${STDCXXLIB} ${MATHLIB})
  set_property(TARGET example_cpp11_real_dbl_fwd PROPERTY CXX_STANDARD 11)
  set_property(TARGET example_cpp11_real_dbl_fwd PROPERTY CXX_STANDARD_REQUIRED ON)
  # C++11 example: complex input, double precision
  add_executable(example_cpp11_cplx_dbl_fwd example_cpp11_cplx_dbl_fwd.cpp)
  target_compile_definitions(example_cpp11_cplx_dbl_fwd PRIVATE PFFFT_ENABLE_DOUBLE)
  target_link_libraries(example_cpp11_cplx_dbl_fwd PFFFT ${STDCXXLIB} ${MATHLIB})
  set_property(TARGET example_cpp11_cplx_dbl_fwd PROPERTY CXX_STANDARD 11)
  set_property(TARGET example_cpp11_cplx_dbl_fwd PROPERTY CXX_STANDARD_REQUIRED ON)
  # C example: complex input, double precision.
  # Uses the double define like the other targets of this branch
  # (was PFFFT_ENABLE_FLOAT, which did not match a double precision example).
  add_executable(example_c_cplx_dbl_fwd example_c_cplx_dbl_fwd.c)
  target_compile_definitions(example_c_cplx_dbl_fwd PRIVATE PFFFT_ENABLE_DOUBLE)
  target_link_libraries(example_c_cplx_dbl_fwd PFFFT ${MATHLIB})
 endif()
 # Single precision examples: only built when PFFFT_USE_TYPE_FLOAT is enabled.
 if (PFFFT_USE_TYPE_FLOAT)
  # C++98 example: real input, single precision
  add_executable(example_cpp98_real_flt_fwd example_cpp98_real_flt_fwd.cpp)
  target_compile_definitions(example_cpp98_real_flt_fwd PRIVATE PFFFT_ENABLE_FLOAT)
  target_link_libraries(example_cpp98_real_flt_fwd PFFFT ${STDCXXLIB} ${MATHLIB})
  set_target_properties(example_cpp98_real_flt_fwd PROPERTIES
    CXX_STANDARD 98
    CXX_STANDARD_REQUIRED ON)
  # C++98 example: complex input, single precision
  add_executable(example_cpp98_cplx_flt_fwd example_cpp98_cplx_flt_fwd.cpp)
  target_compile_definitions(example_cpp98_cplx_flt_fwd PRIVATE PFFFT_ENABLE_FLOAT)
  target_link_libraries(example_cpp98_cplx_flt_fwd PFFFT ${STDCXXLIB} ${MATHLIB})
  set_target_properties(example_cpp98_cplx_flt_fwd PROPERTIES
    CXX_STANDARD 98
    CXX_STANDARD_REQUIRED ON)
  # C example: real input, single precision (no C++ runtime needed)
  add_executable(example_c_real_flt_fwd example_c_real_flt_fwd.c)
  target_compile_definitions(example_c_real_flt_fwd PRIVATE PFFFT_ENABLE_FLOAT)
  target_link_libraries(example_c_real_flt_fwd PFFFT ${MATHLIB})
 endif()
--- a/pffft/examples/example_c_cplx_dbl_fwd.c
+++ b/pffft/examples/example_c_cplx_dbl_fwd.c
@@ -0,0 +1,69 @@
 #include "pffft_double.h"
 #include <stdio.h>
 #include <stdlib.h>
 /*
  * Runs one forward complex FFT in double precision on generated test data
  * and prints the resulting spectrum to stdout.
  *
  * transformLen: number of complex samples to transform. It is checked
  *               against pffftd_min_fft_size() and by pffftd_new_setup().
  *
  * On any error (length too small, length rejected by the setup, allocation
  * failure) a message is written to stderr and the function returns early.
  */
 void c_forward_complex_double(const int transformLen)
 {
  printf("running %s()\n", __FUNCTION__);
  /* first check - might be skipped */
  if (transformLen < pffftd_min_fft_size(PFFFT_COMPLEX))
  {
    fprintf(stderr, "Error: minimum FFT transformation length is %d\n", pffftd_min_fft_size(PFFFT_COMPLEX));
    return;
  }
  /* instantiate FFT and prepare transformation for length N */
  PFFFTD_Setup *ffts = pffftd_new_setup(transformLen, PFFFT_COMPLEX);
  /* one more check */
  if (!ffts)
  {
    fprintf(stderr,
            "Error: transformation length %d is not decomposable into small prime factors. "
            "Next valid transform size is: %d ; next power of 2 is: %d\n",
            transformLen,
            pffftd_nearest_transform_size(transformLen, PFFFT_COMPLEX, 1),
            pffftd_next_power_of_two(transformLen) );
    return;
  }
  /* every buffer holds transformLen complex values: re/im interleaved */
  const size_t numBytes = (size_t)transformLen * 2 * sizeof(double);
  /* allocate aligned vectors for input X, output Y and work area W */
  double *X = (double*)pffftd_aligned_malloc(numBytes);
  double *Y = (double*)pffftd_aligned_malloc(numBytes);
  double *W = (double*)pffftd_aligned_malloc(numBytes);
  if (!X || !Y || !W)
  {
    fprintf(stderr, "Error: failed to allocate aligned buffers for transformation length %d\n", transformLen);
    /* release whatever was allocated before bailing out */
    if (W) pffftd_aligned_free(W);
    if (Y) pffftd_aligned_free(Y);
    if (X) pffftd_aligned_free(X);
    pffftd_destroy_setup(ffts);
    return;
  }
  /* prepare some input data: two complex samples per iteration */
  for (int k = 0; k < 2 * transformLen; k += 4)
  {
    X[k] = k / 2;  /* real */
    X[k+1] = (k / 2) & 1;  /* imag */
    /* an odd transformLen leaves no second sample in the last pair:
       skip it instead of writing behind the end of X */
    if (k + 2 < 2 * transformLen)
    {
      X[k+2] = -1 - k / 2;  /* real */
      X[k+3] = (k / 2) & 1;  /* imag */
    }
  }
  /* do the forward transform; write complex spectrum result into Y */
  pffftd_transform_ordered(ffts, X, Y, W, PFFFT_FORWARD);
  /* print spectral output */
  printf("output should be complex spectrum with %d complex bins\n", transformLen);
  for (int k = 0; k < 2 * transformLen; k += 2)
    printf("Y[%d] = %f + i * %f\n", k/2, Y[k], Y[k+1]);
  pffftd_aligned_free(W);
  pffftd_aligned_free(Y);
  pffftd_aligned_free(X);
  pffftd_destroy_setup(ffts);
 }
 /* Entry point: the optional first argument is the transform length (default: 16). */
 int main(int argc, char *argv[])
 {
  int N = 16;
  if (argc > 1)
    N = atoi(argv[1]);
  c_forward_complex_double(N);
  return 0;
 }
--- a/pffft/examples/example_c_real_flt_fwd.c
+++ b/pffft/examples/example_c_real_flt_fwd.c
@@ -0,0 +1,66 @@
 #include "pffft.h"
 #include <stdio.h>
 #include <stdlib.h>
 /*
  * Runs one forward real FFT in single precision on generated test data
  * and prints the resulting spectrum to stdout.
  *
  * transformLen: number of real input samples. It is checked against
  *               pffft_min_fft_size() and by pffft_new_setup().
  *
  * On any error (length too small, length rejected by the setup, allocation
  * failure) a message is written to stderr and the function returns early.
  */
 void c_forward_real_float(const int transformLen)
 {
  printf("running %s()\n", __FUNCTION__);
  /* first check - might be skipped */
  if (transformLen < pffft_min_fft_size(PFFFT_REAL))
  {
    fprintf(stderr, "Error: minimum FFT transformation length is %d\n", pffft_min_fft_size(PFFFT_REAL));
    return;
  }
  /* instantiate FFT and prepare transformation for length N */
  PFFFT_Setup *ffts = pffft_new_setup(transformLen, PFFFT_REAL);
  /* one more check */
  if (!ffts)
  {
    fprintf(stderr,
            "Error: transformation length %d is not decomposable into small prime factors. "
            "Next valid transform size is: %d ; next power of 2 is: %d\n",
            transformLen,
            pffft_nearest_transform_size(transformLen, PFFFT_REAL, 1),
            pffft_next_power_of_two(transformLen) );
    return;
  }
  /* all three buffers hold transformLen floats */
  const size_t numBytes = (size_t)transformLen * sizeof(float);
  /* allocate aligned vectors for input X, output Y and work area W */
  float *X = (float*)pffft_aligned_malloc(numBytes);
  float *Y = (float*)pffft_aligned_malloc(numBytes);  /* complex: re/im interleaved */
  float *W = (float*)pffft_aligned_malloc(numBytes);
  if (!X || !Y || !W)
  {
    fprintf(stderr, "Error: failed to allocate aligned buffers for transformation length %d\n", transformLen);
    /* release whatever was allocated before bailing out */
    if (W) pffft_aligned_free(W);
    if (Y) pffft_aligned_free(Y);
    if (X) pffft_aligned_free(X);
    pffft_destroy_setup(ffts);
    return;
  }
  /* prepare some input data: two samples per iteration */
  for (int k = 0; k < transformLen; k += 2)
  {
    X[k] = k;
    /* never write behind the end of X, should transformLen be odd */
    if (k + 1 < transformLen)
      X[k+1] = -1-k;
  }
  /* do the forward transform; write complex spectrum result into Y */
  pffft_transform_ordered(ffts, X, Y, W, PFFFT_FORWARD);
  /* print spectral output */
  printf("output should be complex spectrum with %d complex bins\n", transformLen /2);
  /* only complete re/im pairs are printed, so Y[k+1] stays in bounds */
  for (int k = 0; k + 1 < transformLen; k += 2)
    printf("Y[%d] = %f + i * %f\n", k/2, Y[k], Y[k+1]);
  pffft_aligned_free(W);
  pffft_aligned_free(Y);
  pffft_aligned_free(X);
  pffft_destroy_setup(ffts);
 }
 /* Entry point: the optional first argument is the transform length (default: 32). */
 int main(int argc, char *argv[])
 {
  int N = 32;
  if (argc > 1)
    N = atoi(argv[1]);
  c_forward_real_float(N);
  return 0;
 }
--- a/pffft/examples/example_cpp11_cplx_dbl_fwd.cpp
+++ b/pffft/examples/example_cpp11_cplx_dbl_fwd.cpp
@@ -0,0 +1,66 @@
 #include "pffft.hpp"
 #include <complex>
 #include <iostream>
 // Runs one forward complex FFT in double precision on generated test data
 // and prints the resulting spectrum to std::cout.
 //
 // transformLen: number of complex samples to transform; checked against
 //               minFFtsize() and by the isValid() test after construction.
 // Errors are reported on std::cerr and end the function early.
 void cxx11_forward_complex_double(const int transformLen)
 {
  std::cout << "running " << __FUNCTION__ << "()" << std::endl;
  // first check - might be skipped
  using FFT_T = pffft::Fft< std::complex<double> >;
  if (transformLen < FFT_T::minFFtsize())
  {
    std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
    return;
  }
  // instantiate FFT and prepare transformation for length N
  pffft::Fft< std::complex<double> > fft(transformLen);
  // one more check
  if (!fft.isValid())
  {
    std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
              << "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
              << "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
    return;
  }
  // allocate aligned vectors for input X and output Y
  auto X = fft.valueVector();
  auto Y = fft.spectrumVector();
  // alternative access: get raw pointers to aligned vectors
  std::complex<double> *Xs = X.data();
  std::complex<double> *Ys = Y.data();
  // prepare some input data: two samples per iteration
  for (int k = 0; k < transformLen; k += 2)
  {
    X[k] = std::complex<double>(k, k&1);        // access through the aligned vector
    // an odd transformLen has no partner sample at the end: do not write past the buffer
    if (k + 1 < transformLen)
      Xs[k+1] = std::complex<double>(-1-k, k&1);  // access through raw pointer
  }
  // do the forward transform; write complex spectrum result into Y
  fft.forward(X, Y);
  // print spectral output
  std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
  std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
  for (unsigned k = 0; k < Y.size(); k += 2)
  {
    std::cout << "Y[" << k << "] = " << Y[k] << std::endl;
    // guard the second bin of the pair: Y.size() may be odd
    if (k + 1 < Y.size())
      std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl;
  }
 }
 // Entry point: the optional first argument is the transform length (default: 16).
 int main(int argc, char *argv[])
 {
  int N = 16;
  if (argc > 1)
    N = atoi(argv[1]);
  cxx11_forward_complex_double(N);
  return 0;
 }
--- a/pffft/examples/example_cpp11_real_dbl_fwd.cpp
+++ b/pffft/examples/example_cpp11_real_dbl_fwd.cpp
@@ -0,0 +1,66 @@
 #include "pffft.hpp"
 #include <complex>
 #include <iostream>
 // Runs one forward real FFT in double precision on generated test data
 // and prints the resulting complex spectrum to std::cout.
 //
 // transformLen: number of real input samples; checked against
 //               minFFtsize() and by the isValid() test after construction.
 // Errors are reported on std::cerr and end the function early.
 void cxx11_forward_real_double(const int transformLen)
 {
  std::cout << "running " << __FUNCTION__ << "()" << std::endl;
  // first check - might be skipped
  using FFT_T = pffft::Fft<double>;
  if (transformLen < FFT_T::minFFtsize())
  {
    std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
    return;
  }
  // instantiate FFT and prepare transformation for length N
  pffft::Fft<double> fft { transformLen };
  // one more check
  if (!fft.isValid())
  {
    std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
              << "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
              << "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
    return;
  }
  // allocate aligned vectors for (real) input X and (complex) output Y
  auto X = fft.valueVector();     // input vector;  type is AlignedVector<double>
  auto Y = fft.spectrumVector();  // output vector; type is AlignedVector< std::complex<double> >
  // alternative access: get raw pointers to aligned vectors
  double *Xs = X.data();
  std::complex<double> *Ys = Y.data();
  // prepare some input data: two samples per iteration
  for (int k = 0; k < transformLen; k += 2)
  {
    X[k] = k;        // access through AlignedVector<double>
    // never write behind the end of the buffer, should transformLen be odd
    if (k + 1 < transformLen)
      Xs[k+1] = -1-k;  // access through raw pointer
  }
  // do the forward transform; write complex spectrum result into Y
  fft.forward(X, Y);
  // print spectral output
  std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
  std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
  for (unsigned k = 0; k < Y.size(); k += 2)
  {
    std::cout << "Y[" << k << "] = " << Y[k] << std::endl;
    // guard the second bin of the pair: Y.size() may be odd
    if (k + 1 < Y.size())
      std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl;
  }
 }
 // Entry point: the optional first argument is the transform length (default: 32).
 int main(int argc, char *argv[])
 {
  int N = 32;
  if (argc > 1)
    N = atoi(argv[1]);
  cxx11_forward_real_double(N);
  return 0;
 }
--- a/pffft/examples/example_cpp98_cplx_flt_fwd.cpp
+++ b/pffft/examples/example_cpp98_cplx_flt_fwd.cpp
@@ -0,0 +1,66 @@
 #include "pffft.hpp"
 #include <complex>
 #include <iostream>
 // Runs one forward complex FFT in single precision on generated test data
 // and prints the resulting spectrum to std::cout (C++98 compatible).
 //
 // transformLen: number of complex samples to transform; checked against
 //               minFFtsize() and by the isValid() test after construction.
 // Errors are reported on std::cerr and end the function early.
 void cxx98_forward_complex_float(const int transformLen)
 {
  std::cout << "running " << __FUNCTION__ << "()" << std::endl;
  // first check - might be skipped
  typedef pffft::Fft< std::complex<float> > FFT_T;
  if (transformLen < FFT_T::minFFtsize())
  {
    std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
    return;
  }
  // instantiate FFT and prepare transformation for length N
  pffft::Fft< std::complex<float> > fft(transformLen);
  // one more check
  if (!fft.isValid())
  {
    std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
              << "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
              << "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
    return;
  }
  // allocate aligned vectors for input X and output Y
  pffft::AlignedVector< std::complex<float> > X = fft.valueVector();
  pffft::AlignedVector< std::complex<float> > Y = fft.spectrumVector();
  // alternative access: get raw pointers to aligned vectors
  std::complex<float> *Xs = X.data();
  std::complex<float> *Ys = Y.data();
  // prepare some input data: two samples per iteration
  for (int k = 0; k < transformLen; k += 2)
  {
    X[k] = std::complex<float>(k, k&1);        // access through AlignedVector< std::complex<float> >
    // an odd transformLen has no partner sample at the end: do not write past the buffer
    if (k + 1 < transformLen)
      Xs[k+1] = std::complex<float>(-1-k, k&1);  // access through raw pointer
  }
  // do the forward transform; write complex spectrum result into Y
  fft.forward(X, Y);
  // print spectral output
  std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
  std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
  for (unsigned k = 0; k < Y.size(); k += 2)
  {
    std::cout << "Y[" << k << "] = " << Y[k] << std::endl;
    // guard the second bin of the pair: Y.size() may be odd
    if (k + 1 < Y.size())
      std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl;
  }
 }
 // Entry point: the optional first argument is the transform length (default: 16).
 int main(int argc, char *argv[])
 {
  int N = 16;
  if (argc > 1)
    N = atoi(argv[1]);
  cxx98_forward_complex_float(N);
  return 0;
 }
--- a/pffft/examples/example_cpp98_real_flt_fwd.cpp
+++ b/pffft/examples/example_cpp98_real_flt_fwd.cpp
@@ -0,0 +1,66 @@
 #include "pffft.hpp"
 #include <complex>
 #include <iostream>
 // Runs one forward real FFT in single precision on generated test data
 // and prints the resulting complex spectrum to std::cout (C++98 compatible).
 //
 // transformLen: number of real input samples; checked against
 //               minFFtsize() and by the isValid() test after construction.
 // Errors are reported on std::cerr and end the function early.
 void cxx98_forward_real_float(const int transformLen)
 {
  std::cout << "running " << __FUNCTION__ << "()" << std::endl;
  // first check - might be skipped
  typedef pffft::Fft<float> FFT_T;
  if (transformLen < FFT_T::minFFtsize())
  {
    std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
    return;
  }
  // instantiate FFT and prepare transformation for length N
  pffft::Fft<float> fft(transformLen);
  // one more check
  if (!fft.isValid())
  {
    std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
              << "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
              << "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
    return;
  }
  // allocate aligned vectors for (real) input X and (complex) output Y
  pffft::AlignedVector<float> X = fft.valueVector();
  pffft::AlignedVector< std::complex<float> > Y = fft.spectrumVector();
  // alternative access: get raw pointers to aligned vectors
  float *Xs = X.data();
  std::complex<float> *Ys = Y.data();
  // prepare some input data: two samples per iteration
  for (int k = 0; k < transformLen; k += 2)
  {
    X[k] = k;        // access through AlignedVector<float>
    // never write behind the end of the buffer, should transformLen be odd
    if (k + 1 < transformLen)
      Xs[k+1] = -1-k;  // access through raw pointer
  }
  // do the forward transform; write complex spectrum result into Y
  fft.forward(X, Y);
  // print spectral output
  std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
  std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
  for (unsigned k = 0; k < Y.size(); k += 2)
  {
    std::cout << "Y[" << k << "] = " << Y[k] << std::endl;
    // guard the second bin of the pair: Y.size() may be odd
    if (k + 1 < Y.size())
      std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl;
  }
 }
 // Entry point: the optional first argument is the transform length (default: 32).
 int main(int argc, char *argv[])
 {
  int N = 32;
  if (argc > 1)
    N = atoi(argv[1]);
  cxx98_forward_real_float(N);
  return 0;
 }
--- a/pffft/fftpack.c
+++ b/pffft/fftpack.c
--- a/pffft/fftpack.h
+++ b/pffft/fftpack.h
@@ -0,0 +1,799 @@
 /*
  Interface for the f2c translation of fftpack as found on http://www.netlib.org/fftpack/
   FFTPACK license:
   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
   Copyright (c) 2004 the University Corporation for Atmospheric
   Research ("UCAR"). All rights reserved. Developed by NCAR's
   Computational and Information Systems Laboratory, UCAR,
   www.cisl.ucar.edu.
   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:
   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.  
   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.
   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
   ChangeLog:
   2011/10/02: this is my first release of this file.
 */
 #ifndef FFTPACK_H
 #define FFTPACK_H
 /* C linkage, so the declarations below can be used from C++ as well */
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* just define FFTPACK_DOUBLE_PRECISION if you want to build it as a double precision fft */
 /* fftpack_real is the sample/work-array type, fftpack_int the length type;
    fftpack_int is int in both configurations */
 #ifndef FFTPACK_DOUBLE_PRECISION
  typedef float fftpack_real;
  typedef int   fftpack_int;
 #else
  typedef double fftpack_real;
  typedef int    fftpack_int;
 #endif
  /* Each group has an initializer (name ending in 'i') that fills the work
     array wsave for length n, followed by the transform routines using it.
     Required wsave sizes and the exact transform definitions are given in
     the FFTPACK documentation comment further below. */
  /* complex periodic sequences: cffti initializes cfftf (forward) and
     cfftb (unnormalized inverse of cfftf) */
  void cffti(fftpack_int n, fftpack_real *wsave);
  void cfftf(fftpack_int n, fftpack_real *c, fftpack_real *wsave);
  void cfftb(fftpack_int n, fftpack_real *c, fftpack_real *wsave);
  /* real periodic sequences: rffti initializes rfftf (forward) and
     rfftb (backward) */
  void rffti(fftpack_int n, fftpack_real *wsave);
  void rfftf(fftpack_int n, fftpack_real *r, fftpack_real *wsave);
  void rfftb(fftpack_int n, fftpack_real *r, fftpack_real *wsave);
  /* cosine transform with odd wave numbers: cosqi initializes cosqf (forward)
     and cosqb (unnormalized inverse of cosqf) */
  void cosqi(fftpack_int n, fftpack_real *wsave);
  void cosqf(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
  void cosqb(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
  /* cosine transform of a real even sequence: costi initializes cost */
  void costi(fftpack_int n, fftpack_real *wsave);
  void cost(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
  /* sine transform with odd wave numbers: sinqi initializes sinqf (forward)
     and sinqb (unnormalized inverse of sinqf) */
  void sinqi(fftpack_int n, fftpack_real *wsave);
  void sinqb(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
  void sinqf(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
  /* sine transform of a real odd sequence: sinti initializes sint */
  void sinti(fftpack_int n, fftpack_real *wsave);
  void sint(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
 #ifdef __cplusplus
 }
 #endif
 #endif /* FFTPACK_H */
 /*
                      FFTPACK
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
                  version 4  april 1985
     a package of fortran subprograms for the fast fourier
      transform of periodic and other symmetric sequences
                         by
                  paul n swarztrauber
  national center for atmospheric research  boulder,colorado 80307
   which is sponsored by the national science foundation
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 this package consists of programs which perform fast fourier
 transforms for both complex and real periodic sequences and
 certain other symmetric sequences that are listed below.
 1.   rffti     initialize  rfftf and rfftb
 2.   rfftf     forward transform of a real periodic sequence
 3.   rfftb     backward transform of a real coefficient array
 4.   ezffti    initialize ezfftf and ezfftb
 5.   ezfftf    a simplified real periodic forward transform
 6.   ezfftb    a simplified real periodic backward transform
 7.   sinti     initialize sint
 8.   sint      sine transform of a real odd sequence
 9.   costi     initialize cost
 10.  cost      cosine transform of a real even sequence
 11.  sinqi     initialize sinqf and sinqb
 12.  sinqf     forward sine transform with odd wave numbers
 13.  sinqb     unnormalized inverse of sinqf
 14.  cosqi     initialize cosqf and cosqb
 15.  cosqf     forward cosine transform with odd wave numbers
 16.  cosqb     unnormalized inverse of cosqf
 17.  cffti     initialize cfftf and cfftb
 18.  cfftf     forward transform of a complex periodic sequence
 19.  cfftb     unnormalized inverse of cfftf
 ******************************************************************
 subroutine rffti(n,wsave)
  ****************************************************************
 subroutine rffti initializes the array wsave which is used in
 both rfftf and rfftb. the prime factorization of n together with
 a tabulation of the trigonometric functions are computed and
 stored in wsave.
 input parameter
 n       the length of the sequence to be transformed.
 output parameter
 wsave   a work array which must be dimensioned at least 2*n+15.
        the same work array can be used for both rfftf and rfftb
        as long as n remains unchanged. different wsave arrays
        are required for different values of n. the contents of
        wsave must not be changed between calls of rfftf or rfftb.
 ******************************************************************
 subroutine rfftf(n,r,wsave)
 ******************************************************************
 subroutine rfftf computes the fourier coefficients of a real
 periodic sequence (fourier analysis). the transform is defined
 below at output parameter r.
 input parameters
 n       the length of the array r to be transformed.  the method
        is most efficient when n is a product of small primes.
        n may change so long as different work arrays are provided
 r       a real array of length n which contains the sequence
        to be transformed
 wsave   a work array which must be dimensioned at least 2*n+15.
        in the program that calls rfftf. the wsave array must be
        initialized by calling subroutine rffti(n,wsave) and a
        different wsave array must be used for each different
        value of n. this initialization does not have to be
        repeated so long as n remains unchanged thus subsequent
        transforms can be obtained faster than the first.
        the same wsave array can be used by rfftf and rfftb.
 output parameters
 r       r(1) = the sum from i=1 to i=n of r(i)
        if n is even set l =n/2   , if n is odd set l = (n+1)/2
          then for k = 2,...,l
             r(2*k-2) = the sum from i = 1 to i = n of
                  r(i)*cos((k-1)*(i-1)*2*pi/n)
             r(2*k-1) = the sum from i = 1 to i = n of
                 -r(i)*sin((k-1)*(i-1)*2*pi/n)
        if n is even
             r(n) = the sum from i = 1 to i = n of
                  (-1)**(i-1)*r(i)
 *****  note
             this transform is unnormalized since a call of rfftf
             followed by a call of rfftb will multiply the input
             sequence by n.
 wsave   contains results which must not be destroyed between
        calls of rfftf or rfftb.
 ******************************************************************
 subroutine rfftb(n,r,wsave)
 ******************************************************************
 subroutine rfftb computes the real periodic sequence from its
 fourier coefficients (fourier synthesis). the transform is defined
 below at output parameter r.
 input parameters
 n       the length of the array r to be transformed.  the method
        is most efficient when n is a product of small primes.
        n may change so long as different work arrays are provided
 r       a real array of length n which contains the sequence
        to be transformed
 wsave   a work array which must be dimensioned at least 2*n+15.
        in the program that calls rfftb. the wsave array must be
        initialized by calling subroutine rffti(n,wsave) and a
        different wsave array must be used for each different
        value of n. this initialization does not have to be
        repeated so long as n remains unchanged thus subsequent
        transforms can be obtained faster than the first.
        the same wsave array can be used by rfftf and rfftb.
 output parameters
 r       for n even and for i = 1,...,n
             r(i) = r(1)+(-1)**(i-1)*r(n)
                  plus the sum from k=2 to k=n/2 of
                   2.*r(2*k-2)*cos((k-1)*(i-1)*2*pi/n)
                  -2.*r(2*k-1)*sin((k-1)*(i-1)*2*pi/n)
        for n odd and for i = 1,...,n
             r(i) = r(1) plus the sum from k=2 to k=(n+1)/2 of
                  2.*r(2*k-2)*cos((k-1)*(i-1)*2*pi/n)
                 -2.*r(2*k-1)*sin((k-1)*(i-1)*2*pi/n)
 *****  note
             this transform is unnormalized since a call of rfftf
             followed by a call of rfftb will multiply the input
             sequence by n.
 wsave   contains results which must not be destroyed between
        calls of rfftb or rfftf.
 ******************************************************************
 subroutine sinti(n,wsave)
 ******************************************************************
 subroutine sinti initializes the array wsave which is used in
 subroutine sint. the prime factorization of n together with
 a tabulation of the trigonometric functions are computed and
 stored in wsave.
 input parameter
 n       the length of the sequence to be transformed.  the method
        is most efficient when n+1 is a product of small primes.
 output parameter
 wsave   a work array with at least int(2.5*n+15) locations.
        different wsave arrays are required for different values
        of n. the contents of wsave must not be changed between
        calls of sint.
 ******************************************************************
 subroutine sint(n,x,wsave)
 ******************************************************************
 subroutine sint computes the discrete fourier sine transform
 of an odd sequence x(i). the transform is defined below at
 output parameter x.
 sint is the unnormalized inverse of itself since a call of sint
 followed by another call of sint will multiply the input sequence
 x by 2*(n+1).
 the array wsave which is used by subroutine sint must be
 initialized by calling subroutine sinti(n,wsave).
 input parameters
 n       the length of the sequence to be transformed.  the method
        is most efficient when n+1 is the product of small primes.
 x       an array which contains the sequence to be transformed
 wsave   a work array with dimension at least int(2.5*n+15)
        in the program that calls sint. the wsave array must be
        initialized by calling subroutine sinti(n,wsave) and a
        different wsave array must be used for each different
        value of n. this initialization does not have to be
        repeated so long as n remains unchanged thus subsequent
        transforms can be obtained faster than the first.
 output parameters
 x       for i=1,...,n
             x(i)= the sum from k=1 to k=n
                  2*x(k)*sin(k*i*pi/(n+1))
             a call of sint followed by another call of
             sint will multiply the sequence x by 2*(n+1).
             hence sint is the unnormalized inverse
             of itself.
 wsave   contains initialization calculations which must not be
        destroyed between calls of sint.
 ******************************************************************
 subroutine costi(n,wsave)
 ******************************************************************
 subroutine costi initializes the array wsave which is used in
 subroutine cost. the prime factorization of n together with
 a tabulation of the trigonometric functions are computed and
 stored in wsave.
 input parameter
 n       the length of the sequence to be transformed.  the method
        is most efficient when n-1 is a product of small primes.
 output parameter
 wsave   a work array which must be dimensioned at least 3*n+15.
        different wsave arrays are required for different values
        of n. the contents of wsave must not be changed between
        calls of cost.
 ******************************************************************
 subroutine cost(n,x,wsave)
 ******************************************************************
 subroutine cost computes the discrete fourier cosine transform
 of an even sequence x(i). the transform is defined below at output
 parameter x.
 cost is the unnormalized inverse of itself since a call of cost
 followed by another call of cost will multiply the input sequence
 x by 2*(n-1). the transform is defined below at output parameter x
 the array wsave which is used by subroutine cost must be
 initialized by calling subroutine costi(n,wsave).
 input parameters
 n       the length of the sequence x. n must be greater than 1.
        the method is most efficient when n-1 is a product of
        small primes.
 x       an array which contains the sequence to be transformed
 wsave   a work array which must be dimensioned at least 3*n+15
        in the program that calls cost. the wsave array must be
        initialized by calling subroutine costi(n,wsave) and a
        different wsave array must be used for each different
        value of n. this initialization does not have to be
        repeated so long as n remains unchanged thus subsequent
        transforms can be obtained faster than the first.
 output parameters
 x       for i=1,...,n
            x(i) = x(1)+(-1)**(i-1)*x(n)
             + the sum from k=2 to k=n-1
                 2*x(k)*cos((k-1)*(i-1)*pi/(n-1))
             a call of cost followed by another call of
             cost will multiply the sequence x by 2*(n-1)
             hence cost is the unnormalized inverse
             of itself.
 wsave   contains initialization calculations which must not be
        destroyed between calls of cost.
 ******************************************************************
 subroutine sinqi(n,wsave)
 ******************************************************************
 subroutine sinqi initializes the array wsave which is used in
 both sinqf and sinqb. the prime factorization of n together with
 a tabulation of the trigonometric functions are computed and
 stored in wsave.
 input parameter
 n       the length of the sequence to be transformed. the method
        is most efficient when n is a product of small primes.
 output parameter
 wsave   a work array which must be dimensioned at least 3*n+15.
        the same work array can be used for both sinqf and sinqb
        as long as n remains unchanged. different wsave arrays
        are required for different values of n. the contents of
        wsave must not be changed between calls of sinqf or sinqb.
 ******************************************************************
 subroutine sinqf(n,x,wsave)
 ******************************************************************
 subroutine sinqf computes the fast fourier transform of quarter
 wave data. that is , sinqf computes the coefficients in a sine
 series representation with only odd wave numbers. the transform
 is defined below at output parameter x.
 sinqb is the unnormalized inverse of sinqf since a call of sinqf
 followed by a call of sinqb will multiply the input sequence x
 by 4*n.
 the array wsave which is used by subroutine sinqf must be
 initialized by calling subroutine sinqi(n,wsave).
 input parameters
 n       the length of the array x to be transformed.  the method
        is most efficient when n is a product of small primes.
 x       an array which contains the sequence to be transformed
 wsave   a work array which must be dimensioned at least 3*n+15
        in the program that calls sinqf. the wsave array must be
        initialized by calling subroutine sinqi(n,wsave) and a
        different wsave array must be used for each different
        value of n. this initialization does not have to be
        repeated so long as n remains unchanged thus subsequent
        transforms can be obtained faster than the first.
 output parameters
 x       for i=1,...,n
             x(i) = (-1)**(i-1)*x(n)
                + the sum from k=1 to k=n-1 of
                2*x(k)*sin((2*i-1)*k*pi/(2*n))
             a call of sinqf followed by a call of
             sinqb will multiply the sequence x by 4*n.
             therefore sinqb is the unnormalized inverse
             of sinqf.
 wsave   contains initialization calculations which must not
        be destroyed between calls of sinqf or sinqb.
 ******************************************************************
 subroutine sinqb(n,x,wsave)
 ******************************************************************
 subroutine sinqb computes the fast fourier transform of quarter
 wave data. that is , sinqb computes a sequence from its
 representation in terms of a sine series with odd wave numbers.
 the transform is defined below at output parameter x.
 sinqf is the unnormalized inverse of sinqb since a call of sinqb
 followed by a call of sinqf will multiply the input sequence x
 by 4*n.
 the array wsave which is used by subroutine sinqb must be
 initialized by calling subroutine sinqi(n,wsave).
 input parameters
 n       the length of the array x to be transformed.  the method
        is most efficient when n is a product of small primes.
 x       an array which contains the sequence to be transformed
 wsave   a work array which must be dimensioned at least 3*n+15
        in the program that calls sinqb. the wsave array must be
        initialized by calling subroutine sinqi(n,wsave) and a
        different wsave array must be used for each different
        value of n. this initialization does not have to be
        repeated so long as n remains unchanged thus subsequent
        transforms can be obtained faster than the first.
 output parameters
 x       for i=1,...,n
             x(i)= the sum from k=1 to k=n of
               4*x(k)*sin((2*k-1)*i*pi/(2*n))
             a call of sinqb followed by a call of
             sinqf will multiply the sequence x by 4*n.
             therefore sinqf is the unnormalized inverse
             of sinqb.
 wsave   contains initialization calculations which must not
        be destroyed between calls of sinqb or sinqf.
 ******************************************************************
 subroutine cosqi(n,wsave)
 ******************************************************************
 subroutine cosqi initializes the array wsave which is used in
 both cosqf and cosqb. the prime factorization of n together with
 a tabulation of the trigonometric functions are computed and
 stored in wsave.
 input parameter
 n       the length of the array to be transformed.  the method
        is most efficient when n is a product of small primes.
 output parameter
 wsave   a work array which must be dimensioned at least 3*n+15.
        the same work array can be used for both cosqf and cosqb
        as long as n remains unchanged. different wsave arrays
        are required for different values of n. the contents of
        wsave must not be changed between calls of cosqf or cosqb.
 ******************************************************************
 subroutine cosqf(n,x,wsave)
 ******************************************************************
 subroutine cosqf computes the fast fourier transform of quarter
 wave data. that is , cosqf computes the coefficients in a cosine
 series representation with only odd wave numbers. the transform
 is defined below at output parameter x.
 cosqf is the unnormalized inverse of cosqb since a call of cosqf
 followed by a call of cosqb will multiply the input sequence x
 by 4*n.
 the array wsave which is used by subroutine cosqf must be
 initialized by calling subroutine cosqi(n,wsave).
 input parameters
 n       the length of the array x to be transformed.  the method
        is most efficient when n is a product of small primes.
 x       an array which contains the sequence to be transformed
 wsave   a work array which must be dimensioned at least 3*n+15
        in the program that calls cosqf. the wsave array must be
        initialized by calling subroutine cosqi(n,wsave) and a
        different wsave array must be used for each different
        value of n. this initialization does not have to be
        repeated so long as n remains unchanged thus subsequent
        transforms can be obtained faster than the first.
 output parameters
 x       for i=1,...,n
             x(i) = x(1) plus the sum from k=2 to k=n of
                2*x(k)*cos((2*i-1)*(k-1)*pi/(2*n))
             a call of cosqf followed by a call of
             cosqb will multiply the sequence x by 4*n.
             therefore cosqb is the unnormalized inverse
             of cosqf.
 wsave   contains initialization calculations which must not
        be destroyed between calls of cosqf or cosqb.
 ******************************************************************
 subroutine cosqb(n,x,wsave)
 ******************************************************************
 subroutine cosqb computes the fast fourier transform of quarter
 wave data. that is , cosqb computes a sequence from its
 representation in terms of a cosine series with odd wave numbers.
 the transform is defined below at output parameter x.
 cosqb is the unnormalized inverse of cosqf since a call of cosqb
 followed by a call of cosqf will multiply the input sequence x
 by 4*n.
 the array wsave which is used by subroutine cosqb must be
 initialized by calling subroutine cosqi(n,wsave).
 input parameters
 n       the length of the array x to be transformed.  the method
        is most efficient when n is a product of small primes.
 x       an array which contains the sequence to be transformed
 wsave   a work array that must be dimensioned at least 3*n+15
        in the program that calls cosqb. the wsave array must be
        initialized by calling subroutine cosqi(n,wsave) and a
        different wsave array must be used for each different
        value of n. this initialization does not have to be
        repeated so long as n remains unchanged thus subsequent
        transforms can be obtained faster than the first.
 output parameters
 x       for i=1,...,n
             x(i)= the sum from k=1 to k=n of
               4*x(k)*cos((2*k-1)*(i-1)*pi/(2*n))
             a call of cosqb followed by a call of
             cosqf will multiply the sequence x by 4*n.
             therefore cosqf is the unnormalized inverse
             of cosqb.
 wsave   contains initialization calculations which must not
        be destroyed between calls of cosqb or cosqf.
 ******************************************************************
 subroutine cffti(n,wsave)
 ******************************************************************
 subroutine cffti initializes the array wsave which is used in
 both cfftf and cfftb. the prime factorization of n together with
 a tabulation of the trigonometric functions are computed and
 stored in wsave.
 input parameter
 n       the length of the sequence to be transformed
 output parameter
 wsave   a work array which must be dimensioned at least 4*n+15
        the same work array can be used for both cfftf and cfftb
        as long as n remains unchanged. different wsave arrays
        are required for different values of n. the contents of
        wsave must not be changed between calls of cfftf or cfftb.
 ******************************************************************
 subroutine cfftf(n,c,wsave)
 ******************************************************************
 subroutine cfftf computes the forward complex discrete fourier
 transform (the fourier analysis). equivalently , cfftf computes
 the fourier coefficients of a complex periodic sequence.
 the transform is defined below at output parameter c.
 the transform is not normalized. to obtain a normalized transform
 the output must be divided by n. otherwise a call of cfftf
 followed by a call of cfftb will multiply the sequence by n.
 the array wsave which is used by subroutine cfftf must be
 initialized by calling subroutine cffti(n,wsave).
 input parameters
 n      the length of the complex sequence c. the method is
       more efficient when n is the product of small primes.
 c      a complex array of length n which contains the sequence
 wsave   a real work array which must be dimensioned at least 4n+15
        in the program that calls cfftf. the wsave array must be
        initialized by calling subroutine cffti(n,wsave) and a
        different wsave array must be used for each different
        value of n. this initialization does not have to be
        repeated so long as n remains unchanged thus subsequent
        transforms can be obtained faster than the first.
        the same wsave array can be used by cfftf and cfftb.
 output parameters
 c      for j=1,...,n
           c(j)=the sum from k=1,...,n of
                 c(k)*exp(-i*(j-1)*(k-1)*2*pi/n)
                       where i=sqrt(-1)
 wsave   contains initialization calculations which must not be
        destroyed between calls of subroutine cfftf or cfftb
 ******************************************************************
 subroutine cfftb(n,c,wsave)
 ******************************************************************
 subroutine cfftb computes the backward complex discrete fourier
 transform (the fourier synthesis). equivalently , cfftb computes
 a complex periodic sequence from its fourier coefficients.
 the transform is defined below at output parameter c.
 a call of cfftf followed by a call of cfftb will multiply the
 sequence by n.
 the array wsave which is used by subroutine cfftb must be
 initialized by calling subroutine cffti(n,wsave).
 input parameters
 n      the length of the complex sequence c. the method is
       more efficient when n is the product of small primes.
 c      a complex array of length n which contains the sequence
 wsave   a real work array which must be dimensioned at least 4n+15
        in the program that calls cfftb. the wsave array must be
        initialized by calling subroutine cffti(n,wsave) and a
        different wsave array must be used for each different
        value of n. this initialization does not have to be
        repeated so long as n remains unchanged thus subsequent
        transforms can be obtained faster than the first.
        the same wsave array can be used by cfftf and cfftb.
 output parameters
 c      for j=1,...,n
           c(j)=the sum from k=1,...,n of
                 c(k)*exp(i*(j-1)*(k-1)*2*pi/n)
                       where i=sqrt(-1)
 wsave   contains initialization calculations which must not be
        destroyed between calls of subroutine cfftf or cfftb
 */
--- a/pffft/fmv.h
+++ b/pffft/fmv.h
@@ -0,0 +1,20 @@
 #ifndef FMV_H
 #define FMV_H
 /*
  * Optional function multi-versioning (FMV) support.
  *
  * This header always defines PF_TARGET_CLONES, which is meant to be placed in
  * front of a function definition:
  *   - when HAVE_FUNC_ATTRIBUTE_IFUNC is non-zero, the compiler provides
  *     __has_attribute and reports support for target_clones, and the target
  *     is x86-64 (__x86_64), it expands to a target_clones attribute and
  *     HAVE_PF_TARGET_CLONES is defined to 1;
  *   - in every other case it expands to nothing.
  *
  * The include guard macro FMV_H is defined right after it is tested, so a
  * repeated inclusion of this header is skipped entirely.
  */
 #if HAVE_FUNC_ATTRIBUTE_IFUNC
 #if defined(__has_attribute)
 #if __has_attribute(target_clones)
 #if defined(__x86_64)
 // see https://gcc.gnu.org/wiki/FunctionMultiVersioning
 #define PF_TARGET_CLONES __attribute__((target_clones("avx","sse4.2","sse3","sse2","sse","default")))
 #define HAVE_PF_TARGET_CLONES  1
 #endif
 #endif
 #endif
 #endif
 /* fallback: no multi-versioning available, make the prefix a no-op */
 #ifndef PF_TARGET_CLONES
 #define PF_TARGET_CLONES
 #endif
 #endif /* FMV_H */
--- a/pffft/mingw-w32-i686.cmake
+++ b/pffft/mingw-w32-i686.cmake
@@ -0,0 +1,25 @@
 # Sample toolchain file for building for Windows from an Ubuntu Linux system.
 #
 # Typical usage:
 #    *) install cross compiler: `sudo apt-get install mingw-w64`
 #    *) cd build
 #    *) cmake -DCMAKE_TOOLCHAIN_FILE=~/mingw-w32-i686.cmake ..
 #
 # build for Windows' 32 bit architecture
 set(CMAKE_SYSTEM_NAME Windows)
 # target CPU is 32 bit x86, matching the i686 cross toolchain selected below
 set(CMAKE_SYSTEM_PROCESSOR i686)
 set(TOOLCHAIN_PREFIX i686-w64-mingw32)
 # cross compilers to use for C and C++, plus the Windows resource compiler
 set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}-gcc)
 set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}-g++)
 set(CMAKE_RC_COMPILER ${TOOLCHAIN_PREFIX}-windres)
 # target environment on the build host system
 set(CMAKE_FIND_ROOT_PATH /usr/${TOOLCHAIN_PREFIX})
 # modify default behavior of FIND_XXX() commands:
 # programs are never searched in the target root, libraries and headers
 # are searched only there
 set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
 set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
--- a/pffft/mingw-w64-x64_64.cmake
+++ b/pffft/mingw-w64-x64_64.cmake
@@ -0,0 +1,25 @@
 # Sample toolchain file for building for Windows from an Ubuntu Linux system.
 #
 # Typical usage:
 #    *) install cross compiler: `sudo apt-get install mingw-w64`
 #    *) cd build
 #    *) cmake -DCMAKE_TOOLCHAIN_FILE=~/mingw-w64-x64_64.cmake ..
 #       (use the path and name under which this file is actually stored)
 #
 # build for Windows' 64 bit architecture
 set(CMAKE_SYSTEM_NAME Windows)
 set(CMAKE_SYSTEM_PROCESSOR x86_64)
 set(TOOLCHAIN_PREFIX x86_64-w64-mingw32)
 # cross compilers to use for C and C++, plus the Windows resource compiler
 set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}-gcc)
 set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}-g++)
 set(CMAKE_RC_COMPILER ${TOOLCHAIN_PREFIX}-windres)
 # target environment on the build host system
 set(CMAKE_FIND_ROOT_PATH /usr/${TOOLCHAIN_PREFIX})
 # modify default behavior of FIND_XXX() commands:
 # programs are never searched in the target root, libraries and headers
 # are searched only there
 set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
 set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
--- a/pffft/papi_perf_counter.h
+++ b/pffft/papi_perf_counter.h
@@ -0,0 +1,97 @@
 #pragma once
 /* for measurement of CPU cycles ..
 *
 * requires
 *   sudo apt-get install libpapi-dev papi-tools
 * on debian/ubuntu linux distributions
 *
 */
 #ifdef HAVE_PAPI
 #include <papi.h>
 #endif
 #include <stdio.h>
 /*
  * Simple performance counter built on PAPI_ipc().
  *
  * start() takes a first reading, finish() takes a second one and stores the
  * differences (real time, process time, instructions) together with the
  * instructions-per-cycle value of the second reading. print() writes one
  * summary line. When compiled without HAVE_PAPI, start() always fails (after
  * reporting this once on stderr), so finish() returns false and print()
  * writes nothing.
  */
 struct papi_perf_counter
 {
    /* Idle counter: not started, and nothing is printed at destruction. */
    papi_perf_counter()
        : realTime(0.0F), processTime(0.0F), instructions(0LL), ipc(0.0F)
        , started(false), finished(false), print_at_destruction(false)
    { }
    /*
     * Counter that starts measuring immediately.
     *   _start                 only selects this overload; its value is ignored
     *   print_at_destruction_  when true, the destructor prints to stderr
     * All members are initialised before start() runs, so they hold defined
     * values even when start() fails and never writes them.
     */
    papi_perf_counter(int _start, bool print_at_destruction_ = true)
        : realTime(0.0F), processTime(0.0F), instructions(0LL), ipc(0.0F)
        , started(false), finished(false)
        , print_at_destruction(print_at_destruction_)
    {
        (void)_start;
        start();
    }
    /* Prints the summary to stderr if requested at construction. */
    ~papi_perf_counter()
    {
        if (print_at_destruction)
            print(stderr);
    }
    /*
     * Takes the initial reading.
     * Returns true when the reading succeeded; a failure is reported on
     * stderr only once per program run (function-local static flag).
     */
    bool start()
    {
        static bool reported_start_error = false;
 #ifdef HAVE_PAPI
        int ret = PAPI_ipc(&realTime, &processTime, &instructions, &ipc);
        if (ret && !reported_start_error)
        {
            reported_start_error = true;
            fprintf(stderr, "papi_perf_counter::start(): PAPI_ipc() returned error %d\n", ret);
        }
 #else
        if (!reported_start_error)
        {
            reported_start_error = true;
            fprintf(stderr, "papi_perf_counter::start(): no HAVE_PAPI\n");
        }
        int ret = 1;
 #endif
        started = (!ret);
        finished = false;
        return started;
    }
    /*
     * Takes the final reading with a temporary counter and converts the
     * stored start values into differences.
     * Returns true only when this counter was started, has not been finished
     * yet, and the final reading succeeded.
     */
    bool finish()
    {
        /* second argument false: the temporary must not print when destroyed */
        papi_perf_counter end(1, false);
        if (started && !finished && end.started)
        {
            realTime = end.realTime - realTime;
            processTime = end.processTime - processTime;
            instructions = end.instructions - instructions;
            ipc = end.ipc;
            finished = true;
            return true;
        }
        return false;
    }
    /*
     * Writes one summary line to f, finishing a running measurement first.
     * Nothing is written unless the counter was started and finished.
     * Afterwards started is cleared, so a later print() (including the one
     * issued by the destructor) stays silent.
     */
    void print(FILE *f = stdout)
    {
        if (started && !finished)
            finish();
        if (!started || !finished)
            return;
        /* cycles are derived from the instruction count and ins/cycle */
        double cycles = instructions / ipc;
        fprintf(f, "real %g, process %g, instructions %lld, ins/cycle %f => cycles %g\n"
                , realTime, processTime, instructions, ipc, cycles
                );
        started = false;
    }
    float realTime;             /* start reading, or difference after finish() */
    float processTime;          /* start reading, or difference after finish() */
    long long instructions;     /* start reading, or difference after finish() */
    float ipc;                  /* instructions per cycle as reported by PAPI_ipc() */
    bool started;               /* start() succeeded and print() has not run yet */
    bool finished;              /* finish() succeeded since the last start() */
    bool print_at_destruction;  /* destructor calls print(stderr) */
 };
--- a/pffft/pf_carrier.cpp
+++ b/pffft/pf_carrier.cpp
@@ -0,0 +1,298 @@
 /*
 This software is part of pffft/pfdsp, a set of simple DSP routines.
 Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
 Copyright (c) 2020  Hayati Ayguen <h_ayguen@web.de>
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the copyright holder nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 /* include own header first, to see missing includes */
 #include "pf_carrier.h"
 #include "fmv.h"
 #include <limits.h>
 #include <assert.h>
 PF_TARGET_CLONES
 void generate_dc_f(float* output, int size)
 {
    /* DC carrier: each of the `size` complex samples is exp(i*0) = 1+i*0,
     * stored as (real, imaginary) with the real part scaled to 127/128. */
    const float amplitude = 127.0F / 128.0F;
    const int total = 2*size;   /* two floats per complex sample */
    for(int idx=0; idx<total; idx+=2)
    {
        output[idx]   = amplitude;
        output[idx+1] = 0.0F;
    }
 }
 PF_TARGET_CLONES
 void generate_dc_s16(short* output, int size)
 {
    /* DC carrier: each of the `size` complex samples is exp(i*0) = 1+i*0,
     * stored as (real, imaginary) with the real part at SHRT_MAX. */
    const int total = 2*size;   /* two shorts per complex sample */
    for(int idx=0; idx<total; idx+=2)
    {
        output[idx]   = SHRT_MAX;
        output[idx+1] = 0;
    }
 }
 PF_TARGET_CLONES
 void generate_pos_fs4_f(float* output, int size)
 {
    /* Carrier at +fs/4: the phase advances by +pi/2 per complex sample, so
     * the sequence repeats every 4 samples (8 floats, real/imaginary
     * interleaved). Amplitude is 127/128. */
    const float a = 127.0F / 128.0F;
    const float period[8] =
    {
         a,    0.0F,    /* exp(i*  0   ) =  1+i* 0 */
         0.0F, a,       /* exp(i* +pi/2) =  0+i* 1 */
        -a,    0.0F,    /* exp(i* +pi  ) = -1+i* 0 */
         0.0F, -a       /* exp(i* -pi/2) =  0+i*-1 */
    };
    /* size must be multiple of 4: whole periods are written */
    assert((size & 3) == 0);
    for(int base=0; base<2*size; base+=8)
    {
        for(int k=0; k<8; ++k)
            output[base+k] = period[k];
    }
 }
 PF_TARGET_CLONES
 void generate_pos_fs4_s16(short* output, int size)
 {
    /* Carrier at +fs/4: the phase advances by +pi/2 per complex sample, so
     * the sequence repeats every 4 samples (8 shorts, real/imaginary
     * interleaved). Amplitude is SHRT_MAX. */
    const short period[8] =
    {
         SHRT_MAX, 0,           /* exp(i*  0   ) =  1+i* 0 */
         0,        SHRT_MAX,    /* exp(i* +pi/2) =  0+i* 1 */
        -SHRT_MAX, 0,           /* exp(i* +pi  ) = -1+i* 0 */
         0,       -SHRT_MAX     /* exp(i* -pi/2) =  0+i*-1 */
    };
    /* size must be multiple of 4: whole periods are written */
    assert((size & 3) == 0);
    for(int base=0; base<2*size; base+=8)
    {
        for(int k=0; k<8; ++k)
            output[base+k] = period[k];
    }
 }
 PF_TARGET_CLONES
 void generate_neg_fs4_f(float* output, int size)
 {
    /* Carrier at -fs/4: the phase advances by -pi/2 per complex sample, so
     * the sequence repeats every 4 samples (8 floats, real/imaginary
     * interleaved). Amplitude is 127/128. */
    const float a = 127.0F / 128.0F;
    const float period[8] =
    {
         a,    0.0F,    /* exp(i*  0   ) =  1+i* 0 */
         0.0F, -a,      /* exp(i* -pi/2) =  0+i*-1 */
        -a,    0.0F,    /* exp(i* +pi  ) = -1+i* 0 */
         0.0F, a        /* exp(i* +pi/2) =  0+i* 1 */
    };
    /* size must be multiple of 4: whole periods are written */
    assert((size & 3) == 0);
    for(int base=0; base<2*size; base+=8)
    {
        for(int k=0; k<8; ++k)
            output[base+k] = period[k];
    }
 }
 PF_TARGET_CLONES
 void generate_neg_fs4_s16(short* output, int size)
 {
    /* Carrier at -fs/4: the phase advances by -pi/2 per complex sample, so
     * the sequence repeats every 4 samples (8 shorts, real/imaginary
     * interleaved). Amplitude is SHRT_MAX. */
    const short period[8] =
    {
         SHRT_MAX, 0,           /* exp(i*  0   ) =  1+i* 0 */
         0,       -SHRT_MAX,    /* exp(i* -pi/2) =  0+i*-1 */
        -SHRT_MAX, 0,           /* exp(i* +pi  ) = -1+i* 0 */
         0,        SHRT_MAX     /* exp(i* +pi/2) =  0+i* 1 */
    };
    /* size must be multiple of 4: whole periods are written */
    assert((size & 3) == 0);
    for(int base=0; base<2*size; base+=8)
    {
        for(int k=0; k<8; ++k)
            output[base+k] = period[k];
    }
 }
 /****************************************************/
 PF_TARGET_CLONES
 void generate_dc_pos_fs4_s16(short* output, int size)
 {
    /* Sum of a DC term (1+i*0) and a +fs/4 carrier, each with amplitude
     * m = SHRT_MAX/2. One period is 4 complex samples (8 shorts,
     * real/imaginary interleaved). */
    const int m = SHRT_MAX / 2;
    const int period[8] =
    {
        m+m, 0,     /* dc + exp(i*  0   ) = 1+1 +i* 0 */
        m,   m,     /* dc + exp(i* +pi/2) = 1+0 +i* 1 */
        0,   0,     /* dc + exp(i* +pi  ) = 1-1 +i* 0 */
        m,  -m      /* dc + exp(i* -pi/2) = 1+0 +i*-1 */
    };
    /* size must be multiple of 4: whole periods are written */
    assert((size & 3) == 0);
    for(int base=0; base<2*size; base+=8)
    {
        for(int k=0; k<8; ++k)
            output[base+k] = period[k];
    }
 }
 PF_TARGET_CLONES
 void generate_dc_neg_fs4_s16(short* output, int size)
 {
    /* Sum of a DC term (1+i*0) and a -fs/4 carrier, each with amplitude
     * m = SHRT_MAX/2. One period is 4 complex samples (8 shorts,
     * real/imaginary interleaved). */
    const int m = SHRT_MAX / 2;
    const int period[8] =
    {
        m+m, 0,     /* dc + exp(i*  0   ) = 1+1 +i* 0 */
        m,  -m,     /* dc + exp(i* -pi/2) = 1+0 +i*-1 */
        0,   0,     /* dc + exp(i* +pi  ) = 1-1 +i* 0 */
        m,   m      /* dc + exp(i* +pi/2) = 1+0 +i* 1 */
    };
    /* size must be multiple of 4: whole periods are written */
    assert((size & 3) == 0);
    for(int base=0; base<2*size; base+=8)
    {
        for(int k=0; k<8; ++k)
            output[base+k] = period[k];
    }
 }
 PF_TARGET_CLONES
 void generate_pos_neg_fs4_s16(short* output, int size)
 {
    /* Test signal with period 4 complex samples (8 shorts, real/imaginary
     * interleaved), m = SHRT_MAX/2. Every sample has im = -re, and the real
     * part follows +m, -m, -m, +m; i.e. the samples are
     * (1 - i) * m * {+1, -1, -1, +1}. The real sequence {+1,-1,-1,+1} has
     * spectral content only at +fs/4 and -fs/4.
     * NOTE(review): this is not the plain sum exp(i*n*pi/2) + exp(-i*n*pi/2),
     * which would give 2, 0, -2, 0 on the real axis; the phase/scale used
     * here is presumably intentional - confirm against the users of this
     * generator. */
    const int m = SHRT_MAX / 2;
    const int period[8] =
    {
         m, -m,     /* sample 0 */
        -m,  m,     /* sample 1 */
        -m,  m,     /* sample 2 */
         m, -m      /* sample 3 */
    };
    /* size must be multiple of 4: whole periods are written */
    assert((size & 3) == 0);
    for(int base=0; base<2*size; base+=8)
    {
        for(int k=0; k<8; ++k)
            output[base+k] = period[k];
    }
 }
 PF_TARGET_CLONES
 void generate_dc_pos_neg_fs4_s16(short* output, int size)
 {
    /* Test signal with period 4 complex samples (8 shorts, real/imaginary
     * interleaved), m = SHRT_MAX/2. The imaginary part follows
     * -m, +m, +m, -m and the real part follows 2m, 0, 0, 2m, i.e. a real
     * offset of m (the DC term) added to +m, -m, -m, +m.
     * NOTE(review): as with the values themselves, this is not the plain
     * sum dc + exp(i*n*pi/2) + exp(-i*n*pi/2); presumably an intentional
     * phase/scale choice - confirm against the users of this generator. */
    const int m = SHRT_MAX / 2;
    const int period[8] =
    {
        m+m, -m,    /* sample 0 */
        0,    m,    /* sample 1 */
        0,    m,    /* sample 2 */
        m+m, -m     /* sample 3 */
    };
    /* size must be multiple of 4: whole periods are written */
    assert((size & 3) == 0);
    for(int base=0; base<2*size; base+=8)
    {
        for(int k=0; k<8; ++k)
            output[base+k] = period[k];
    }
 }
 PF_TARGET_CLONES
 void generate_pos_neg_fs2_s16(short* output, int size)
 {
    /* Carrier at fs/2 with amplitude m = SHRT_MAX/2: the real part
     * alternates +m, -m (exp(i*0) = +1, exp(i*pi) = -1) and the imaginary
     * part is always 0. No DC term is added. Samples are written in groups
     * of 4 complex values (8 shorts, real/imaginary interleaved). */
    const int m = SHRT_MAX / 2;
    const int pair[4] =
    {
         m, 0,      /* exp(i* 0 ) = +1 */
        -m, 0       /* exp(i* pi) = -1 */
    };
    /* size must be multiple of 4: whole groups of 4 samples are written */
    assert((size & 3) == 0);
    for(int base=0; base<2*size; base+=8)
    {
        for(int k=0; k<8; ++k)
            output[base+k] = pair[k & 3];
    }
 }
 PF_TARGET_CLONES
 void generate_dc_pos_neg_fs2_s16(short* output, int size)
 {
    /* Carrier at fs/2 plus a DC term of i*1, both with amplitude
     * m = SHRT_MAX/2: the real part alternates +m, -m and the imaginary
     * part is constantly +m. Samples are written in groups of 4 complex
     * values (8 shorts, real/imaginary interleaved). */
    const int m = SHRT_MAX / 2;
    const int pair[4] =
    {
         m, m,      /* dc + exp(i* 0 ) = i*1 +1 */
        -m, m       /* dc + exp(i* pi) = i*1 -1 */
    };
    /* size must be multiple of 4: whole groups of 4 samples are written */
    assert((size & 3) == 0);
    for(int base=0; base<2*size; base+=8)
    {
        for(int k=0; k<8; ++k)
            output[base+k] = pair[k & 3];
    }
 }
--- a/pffft/pf_carrier.h
+++ b/pffft/pf_carrier.h
@@ -0,0 +1,75 @@
 /*
 This software is part of pffft/pfdsp, a set of simple DSP routines.
 Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
 Copyright (c) 2020  Hayati Ayguen <h_ayguen@web.de>
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the copyright holder nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #pragma once
 #include <stdio.h>
 #include <stdint.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 /*
   _____                      _
  / ____|                    | |
 | |     ___  _ __ ___  _ __ | | _____  __
 | |    / _ \| '_ ` _ \| '_ \| |/ _ \ \/ /
 | |___| (_) | | | | | | |_) | |  __/>  <
  \_____\___/|_| |_| |_| .__/|_|\___/_/\_\
                       | |
                       |_|
 */
 /* complex sample made of two floats, named i and q */
 typedef struct complexf_s { float i; float q; } complexf;
 /* generation functions
  *
  * All generators write `size` complex samples into `output` as interleaved
  * (real, imaginary) pairs, i.e. 2*size values. Every function other than
  * generate_dc_f() and generate_dc_s16() asserts that size is a multiple
  * of 4.
  */
 /* constant carrier (127/128, 0) */
 void generate_dc_f(float* output, int size);
 /* constant carrier (SHRT_MAX, 0) */
 void generate_dc_s16(short* output, int size);
 /* carrier rotating by +pi/2 per sample (+fs/4), amplitude 127/128 */
 void generate_pos_fs4_f(float* output, int size);
 /* carrier rotating by +pi/2 per sample (+fs/4), amplitude SHRT_MAX */
 void generate_pos_fs4_s16(short* output, int size);
 /* carrier rotating by -pi/2 per sample (-fs/4), amplitude 127/128 */
 void generate_neg_fs4_f(float* output, int size);
 /* carrier rotating by -pi/2 per sample (-fs/4), amplitude SHRT_MAX */
 void generate_neg_fs4_s16(short* output, int size);
 /* DC plus +fs/4 carrier, each with amplitude SHRT_MAX/2 */
 void generate_dc_pos_fs4_s16(short* output, int size);
 /* DC plus -fs/4 carrier, each with amplitude SHRT_MAX/2 */
 void generate_dc_neg_fs4_s16(short* output, int size);
 /* 4-sample periodic pattern with im = -re and re = +m,-m,-m,+m (m = SHRT_MAX/2) */
 void generate_pos_neg_fs4_s16(short* output, int size);
 /* same pattern as generate_pos_neg_fs4_s16() with m added to the real part */
 void generate_dc_pos_neg_fs4_s16(short* output, int size);
 /* real part alternating +m,-m (fs/2), imaginary part 0 (m = SHRT_MAX/2) */
 void generate_pos_neg_fs2_s16(short* output, int size);
 /* real part alternating +m,-m (fs/2), imaginary part constantly +m */
 void generate_dc_pos_neg_fs2_s16(short* output, int size);
 #ifdef __cplusplus
 }
 #endif
--- a/pffft/pf_cic.cpp
+++ b/pffft/pf_cic.cpp
@@ -0,0 +1,255 @@
 /*
 This software is part of pffft/pfdsp, a set of simple DSP routines.
 Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
 Copyright (c) 2020  Hayati Ayguen <h_ayguen@web.de>
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the copyright holder nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 /* gcc requires this for M_PI !? */
 #undef __STRICT_ANSI__
 /* include own header first, to see missing includes */
 #include "pf_cic.h"
 #include "fmv.h"
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
 /*
   ____ ___ ____   ____  ____   ____
  / ___|_ _/ ___| |  _ \|  _ \ / ___|
 | |    | | |     | | | | | | | |
 | |___ | | |___  | |_| | |_| | |___
  \____|___\____| |____/|____/ \____|
 */
 /* One oscillator period is stored in SINESIZE = 2^SINESHIFT table entries;
    the top SINESHIFT bits of the 64-bit phase accumulator are used as index. */
 #define SINESHIFT 12
 #define SINESIZE (1<<SINESHIFT)
 typedef int64_t cic_dt; // data type used for integrators and combs
 /* State of one CIC down-converter instance, as set up by cicddc_init(). */
 typedef struct {
    int factor;         // value passed to cicddc_init(); number of input samples consumed per output sample
    uint64_t phase;     // oscillator phase accumulator, wraps modulo 2^64
    float gain;         // output scaling factor computed in cicddc_init()
    cic_dt ig0a, ig0b, ig1a, ig1b;          // integrator states kept between calls (a and b channel)
    cic_dt comb0a, comb0b, comb1a, comb1b;  // comb delay elements kept between calls (a and b channel)
    int16_t *sinetable; // heap-allocated cosine table with SINESIZE * 5/4 entries
 } cicddc_t;
 /* Allocate and initialise the state for the cicddc_*_c() functions.
    factor: number of input samples consumed per output sample; it is also
            used to derive the output gain (divided by factor three times).
    Returns an opaque state pointer that must be released with cicddc_free(),
    or NULL when one of the allocations fails. */
 void *cicddc_init(int factor) {
    int i;
    int sinesize2 = SINESIZE * 5/4; // 25% extra to get cosine from the same table
    cicddc_t *s;
    s = (cicddc_t *)malloc(sizeof(cicddc_t));
    if (!s)
        return NULL;    // out of memory: nothing allocated yet
    memset(s, 0, sizeof(cicddc_t));
    float sineamp = 32767.0f;
    s->factor = factor;
    s->gain = 1.0f / SHRT_MAX / sineamp / factor / factor / factor; // compensate for gain of 3 integrators
    s->sinetable = (int16_t *)malloc(sinesize2 * sizeof(*s->sinetable));
    if (!s->sinetable) {
        // do not leak the state struct when the table cannot be allocated
        free(s);
        return NULL;
    }
    double f = 2.0 * M_PI / (double)SINESIZE;
    // one full period in SINESIZE entries, plus the extra quarter
    for(i = 0; i < sinesize2; i++) {
        s->sinetable[i] = sineamp * cos(f * i);
    }
    return s;
 }
 /* Release a state created by cicddc_init(): frees the sine table and the
    state struct itself. Passing NULL is a no-op, like free(NULL). */
 void cicddc_free(void *state) {
    cicddc_t *s = (cicddc_t *)state;
    if (!s)
        return;     // nothing to free; avoids dereferencing a NULL state
    free(s->sinetable);
    free(s);
 }
 /* Mix real int16 input with the table oscillator and decimate it through the
    integrator/comb chain, producing complex float output.
    state:   pointer returned by cicddc_init(); integrators, combs and phase
             are read at entry and written back at exit, so consecutive calls
             continue the same stream.
    input:   `factor` samples are read per output, i.e. outsize * factor in total.
    output:  `outsize` complex samples are written.
    rate:    multiplied by 2^64 to form the phase increment added per input sample.
    NOTE(review): a negative `rate` is converted from float to uint64_t here -
    confirm that this is intended/portable on all targets. */
 PF_TARGET_CLONES
 void cicddc_s16_c(void *state, int16_t *input, complexf *output, int outsize, float rate) {
    cicddc_t *s = (cicddc_t *)state;
    int k;
    int factor = s->factor;
    // work on local copies of the filter state; stored back after the loop
    cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
    cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
    uint64_t phase = s->phase, freq;
    int16_t *sinetable = s->sinetable;
    float gain = s->gain;
    // (float)(1ULL << 63) * 2 == 2^64: full scale of the phase accumulator
    freq = rate * ((float)(1ULL << 63) * 2);
    int16_t *inp = input;
    for(k = 0; k < outsize; k++) {
        int i;
        cic_dt out0a, out0b, out1a, out1b;
        cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
        for(i = 0; i < factor; i++) {
            cic_dt in_a, in_b;
            // top SINESHIFT bits of the phase select the table entry
            int sinep = phase >> (64-SINESHIFT);
            // channel a uses the entry a quarter period further, channel b the entry itself
            in_a = (int32_t)inp[i] * (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];
            in_b = (int32_t)inp[i] * (int32_t)sinetable[sinep];
            phase += freq;
            /* integrators:
            The calculations are ordered so that each integrator
            takes a result from previous loop iteration
            to make the code more "pipeline-friendly". */
            ig2a += ig1a; ig2b += ig1b;
            ig1a += ig0a; ig1b += ig0b;
            ig0a += in_a; ig0b += in_b;
        }
        inp += factor;
        // comb filters:
        out0a  = ig2a - comb0a;  out0b  = ig2b - comb0b;
        comb0a = ig2a;           comb0b = ig2b;
        out1a  = out0a - comb1a; out1b  = out0b - comb1b;
        comb1a = out0a;          comb1b = out0b;
        // scale the integer result to float with the gain from cicddc_init()
        output[k].i = (float)out1a * gain;
        output[k].q = (float)out1b * gain;
    }
    // persist the state for the next call
    s->ig0a = ig0a; s->ig0b = ig0b;
    s->ig1a = ig1a; s->ig1b = ig1b;
    s->comb0a = comb0a; s->comb0b = comb0b;
    s->comb1a = comb1a; s->comb1b = comb1b;
    s->phase = phase;
 }
 /* Mix complex int16 input with the table oscillator and decimate it through
    the integrator/comb chain, producing complex float output.
    state:   pointer returned by cicddc_init(); integrators, combs and phase
             are read at entry and written back at exit.
    input:   interleaved pairs (inp[2*i], inp[2*i+1]); 2 * factor int16 values
             are read per output, i.e. 2 * factor * outsize in total.
    output:  `outsize` complex samples are written.
    rate:    multiplied by 2^64 to form the phase increment added per input pair.
    NOTE(review): a negative `rate` is converted from float to uint64_t here -
    confirm that this is intended/portable on all targets. */
 PF_TARGET_CLONES
 void cicddc_cs16_c(void *state, int16_t *input, complexf *output, int outsize, float rate) {
    cicddc_t *s = (cicddc_t *)state;
    int k;
    int factor = s->factor;
    // work on local copies of the filter state; stored back after the loop
    cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
    cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
    uint64_t phase = s->phase, freq;
    int16_t *sinetable = s->sinetable;
    float gain = s->gain;
    // (float)(1ULL << 63) * 2 == 2^64: full scale of the phase accumulator
    freq = rate * ((float)(1ULL << 63) * 2);
    int16_t *inp = input;
    for(k = 0; k < outsize; k++) {
        int i;
        cic_dt out0a, out0b, out1a, out1b;
        cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
        for(i = 0; i < factor; i++) {
            cic_dt in_a, in_b;
            int32_t m_a, m_b, m_c, m_d;
            // top SINESHIFT bits of the phase select the table entry
            int sinep = phase >> (64-SINESHIFT);
            // m_a / m_b: first / second value of the input pair
            m_a = inp[2*i];
            m_b = inp[2*i+1];
            // m_c: table entry a quarter period further, m_d: table entry itself
            m_c = (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];
            m_d = (int32_t)sinetable[sinep];
            // complex multiplication:
            in_a = m_a*m_c - m_b*m_d;
            in_b = m_a*m_d + m_b*m_c;
            phase += freq;
            /* integrators:
            The calculations are ordered so that each integrator
            takes a result from previous loop iteration
            to make the code more "pipeline-friendly". */
            ig2a += ig1a; ig2b += ig1b;
            ig1a += ig0a; ig1b += ig0b;
            ig0a += in_a; ig0b += in_b;
        }
        inp += 2*factor;
        // comb filters:
        out0a  = ig2a - comb0a;  out0b  = ig2b - comb0b;
        comb0a = ig2a;           comb0b = ig2b;
        out1a  = out0a - comb1a; out1b  = out0b - comb1b;
        comb1a = out0a;          comb1b = out0b;
        // scale the integer result to float with the gain from cicddc_init()
        output[k].i = (float)out1a * gain;
        output[k].q = (float)out1b * gain;
    }
    // persist the state for the next call
    s->ig0a = ig0a; s->ig0b = ig0b;
    s->ig1a = ig1a; s->ig1b = ig1b;
    s->comb0a = comb0a; s->comb0b = comb0b;
    s->comb1a = comb1a; s->comb1b = comb1b;
    s->phase = phase;
 }
 /* This is almost copy paste from cicddc_cs16_c.
   I'm afraid this is going to be annoying to maintain... */
 /* Mix complex uint8 input with the table oscillator and decimate it through
    the integrator/comb chain, producing complex float output.
    state:   pointer returned by cicddc_init(); integrators, combs and phase
             are read at entry and written back at exit.
    input:   interleaved pairs (inp[2*i], inp[2*i+1]); 2 * factor bytes are
             read per output, i.e. 2 * factor * outsize in total. Each byte is
             shifted left by 8 bits and offset by -32614 before mixing.
    output:  `outsize` complex samples are written.
    rate:    multiplied by 2^64 to form the phase increment added per input pair.
    NOTE(review): a negative `rate` is converted from float to uint64_t here -
    confirm that this is intended/portable on all targets. */
 PF_TARGET_CLONES
 void cicddc_cu8_c(void *state, uint8_t *input, complexf *output, int outsize, float rate) {
    cicddc_t *s = (cicddc_t *)state;
    int k;
    int factor = s->factor;
    // work on local copies of the filter state; stored back after the loop
    cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
    cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
    uint64_t phase = s->phase, freq;
    int16_t *sinetable = s->sinetable;
    float gain = s->gain;
    // (float)(1ULL << 63) * 2 == 2^64: full scale of the phase accumulator
    freq = rate * ((float)(1ULL << 63) * 2);
    uint8_t *inp = input;
    for(k = 0; k < outsize; k++) {
        int i;
        cic_dt out0a, out0b, out1a, out1b;
        cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
        for(i = 0; i < factor; i++) {
            cic_dt in_a, in_b;
            int32_t m_a, m_b, m_c, m_d;
            // top SINESHIFT bits of the phase select the table entry
            int sinep = phase >> (64-SINESHIFT);
            // subtract 127.4 (good for rtl-sdr)
            // (the byte is scaled by 256 first, so the offset is 127.4 * 256 ~= 32614)
            m_a = (((int32_t)inp[2*i])   << 8) - 32614;
            m_b = (((int32_t)inp[2*i+1]) << 8) - 32614;
            // m_c: table entry a quarter period further, m_d: table entry itself
            m_c = (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];
            m_d = (int32_t)sinetable[sinep];
            // complex multiplication:
            in_a = m_a*m_c - m_b*m_d;
            in_b = m_a*m_d + m_b*m_c;
            phase += freq;
            /* integrators:
            The calculations are ordered so that each integrator
            takes a result from previous loop iteration
            to make the code more "pipeline-friendly". */
            ig2a += ig1a; ig2b += ig1b;
            ig1a += ig0a; ig1b += ig0b;
            ig0a += in_a; ig0b += in_b;
        }
        inp += 2*factor;
        // comb filters:
        out0a  = ig2a - comb0a;  out0b  = ig2b - comb0b;
        comb0a = ig2a;           comb0b = ig2b;
        out1a  = out0a - comb1a; out1b  = out0b - comb1b;
        comb1a = out0a;          comb1b = out0b;
        // scale the integer result to float with the gain from cicddc_init()
        output[k].i = (float)out1a * gain;
        output[k].q = (float)out1b * gain;
    }
    // persist the state for the next call
    s->ig0a = ig0a; s->ig0b = ig0b;
    s->ig1a = ig1a; s->ig1b = ig1b;
    s->comb0a = comb0a; s->comb0b = comb0b;
    s->comb1a = comb1a; s->comb1b = comb1b;
    s->phase = phase;
 }
--- a/pffft/pf_cic.h
+++ b/pffft/pf_cic.h
@@ -0,0 +1,58 @@
 /*
 This software is part of pffft/pfdsp, a set of simple DSP routines.
 Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
 Copyright (c) 2020  Hayati Ayguen <h_ayguen@web.de>
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the copyright holder nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #pragma once
 #include <stdint.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 /*
   ____ ___ ____   ____  ____   ____
  / ___|_ _/ ___| |  _ \|  _ \ / ___|
 | |    | | |     | | | | | | | |
 | |___ | | |___  | |_| | |_| | |___
  \____|___\____| |____/|____/ \____|
 */
 /* Complex sample made of two float members named i and q.
    NOTE(review): the same struct tag is also defined in pf_cplx.h; including
    both headers in one translation unit presumably clashes - confirm. */
 typedef struct complexf_s { float i; float q; } complexf;
 /* Create a down-converter state for the given decimation factor;
    the result is the `state` argument of the functions below. */
 void *cicddc_init(int factor);
 /* Release a state obtained from cicddc_init(). */
 void cicddc_free(void *state);
 /* Down-convert and decimate `input`, writing `outsize` complex samples to
    `output`. The three variants differ in input format:
    s16 = real int16 samples, cs16 = int16 pairs, cu8 = uint8 pairs.
    `rate` sets the per-input-sample phase increment of the mixing oscillator. */
 void cicddc_s16_c(void *state, int16_t *input, complexf *output, int outsize, float rate);
 void cicddc_cs16_c(void *state, int16_t *input, complexf *output, int outsize, float rate);
 void cicddc_cu8_c(void *state, uint8_t *input, complexf *output, int outsize, float rate);
 #ifdef __cplusplus
 }
 #endif
--- a/pffft/pf_conv.cpp
+++ b/pffft/pf_conv.cpp
@@ -0,0 +1,322 @@
 #include "pf_conv.h"
 #include <string.h>
 #include <assert.h>
 #include <algorithm>
 #if 0
 #include <stdio.h>
 #define DPRINT(...) fprintf(stderr, __VA_ARGS__)
 #else
 #define DPRINT(...) do { } while (0)
 #endif
 #ifdef HAVE_MIPP
 #include <mipp.h>
 #endif
 // CONV_ARCH_POST has to be defined from outside this file;
 // it becomes the suffix of every function name defined here
 #ifndef CONV_ARCH_POST
 #error CONV_ARCH_POST not defined
 #endif
 // two-level stringification: X is macro-expanded before it is turned into a string
 #define PP_STRINGIFY(X) #X
 #define PP_TOSTRING(X)  PP_STRINGIFY(X)
 // two-level token pasting: both arguments are macro-expanded before concatenation
 #define PP_CONCAT_IMPL(x, y) x##y
 #define PP_CONCAT(x, y) PP_CONCAT_IMPL( x, y )
 // ARCHFUNCNAME(foo) expands to foo_<CONV_ARCH_POST>
 #define ARCHFUNCNAME(X) PP_CONCAT(X##_,CONV_ARCH_POST)
 // Returns the architecture suffix this translation unit was compiled with,
 // i.e. the expansion of CONV_ARCH_POST as a string literal.
 const char * ARCHFUNCNAME(id)()
 {
    return PP_TOSTRING(CONV_ARCH_POST);
 }
 // Returns the number of floats processed per SIMD register:
 // 1 when built without MIPP or with MIPP_NO_INTRINSICS, otherwise mipp::N<float>().
 // The MIPP convolution kernels require the filter size to be a multiple of this value.
 int ARCHFUNCNAME(conv_float_simd_size)()
 {
 #if defined(MIPP_NO_INTRINSICS) || !defined(HAVE_MIPP)
    // have a completely MIPP independent implementation
    return 1;
 #else
    return mipp::N<float>();
 #endif
 }
 // Moves the not yet processed samples s[offset .. size) to the begin of the
 // buffer and updates the state accordingly:
 // afterwards state->offset is 0 and state->size is the number of kept samples
 // (0 when everything was already processed).
 void ARCHFUNCNAME(conv_float_move_rest)(float * RESTRICT s, conv_buffer_state * RESTRICT state)
 {
    int R = state->size - state->offset;    // this many samples from prev conv_float were not processed
    if (R > 0)
    {
        // memmove(s, &s[state->offset], R * sizeof(s[0]));   // move them to the begin
        // std::copy() must not be called with the destination inside [first, last):
        // with offset == 0 the samples already are at the begin, so nothing is moved
        if (state->offset != 0)
            std::copy(&s[state->offset], &s[state->size], s);
    }
    else
        R = 0;
    state->offset = 0;      // data - to be processed - is at begin
    state->size = R;        // this many unprocessed samples
 }
 // Complex variant of conv_float_move_rest:
 // moves the not yet processed samples s[offset .. size) to the begin of the
 // buffer; afterwards state->offset is 0 and state->size is the number of
 // kept samples (0 when everything was already processed).
 void ARCHFUNCNAME(conv_cplx_move_rest)(complexf * RESTRICT s, conv_buffer_state * RESTRICT state)
 {
    int R = state->size - state->offset;    // this many samples from prev conv_float were not processed
    if (R > 0)
    {
        // memmove(s, &s[state->offset], R * sizeof(s[0]));   // move them to the begin
        // std::copy() must not be called with the destination inside [first, last):
        // with offset == 0 the samples already are at the begin, so nothing is moved
        if (state->offset != 0)
            std::copy(&s[state->offset], &s[state->size], s);
    }
    else
        R = 0;
    state->offset = 0;      // data - to be processed - is at begin
    state->size = R;        // this many unprocessed samples
 }
 #if defined(MIPP_NO_INTRINSICS)
 // have a completely MIPP independent implementation
 // #error missing HAVE_MIPP: there is no MIPP-independent implementation
 // Scalar in-place convolution: for every position that still has sz_filter
 // samples available, s[pos] is replaced by sum_k s[pos+k] * filter[k].
 // Starts at state->offset, stores the first unprocessed position back into
 // state->offset and returns the number of produced samples.
 int ARCHFUNCNAME(conv_float_inplace)(
        float * RESTRICT s, conv_buffer_state * RESTRICT state,
        const float * RESTRICT filter, const int sz_filter
        )
 {
    const int first = state->offset;
    const int n_samples = state->size;
    int pos = first;
    while (pos + sz_filter <= n_samples)
    {
        float sum = 0.0F;
        for (int tap = 0; tap < sz_filter; ++tap)
            sum += s[pos+tap] * filter[tap];
        // s[pos] is only read for this output, so it can be overwritten now
        s[pos] = sum;
        ++pos;
    }
    state->offset = pos;
    return pos - first;
 }
 // Scalar out-of-place convolution: for every position that still has
 // sz_filter samples available, y[pos] = sum_k s[pos+k] * filter[k].
 // Starts at state->offset, stores the first unprocessed position back into
 // state->offset and returns the number of produced samples.
 int ARCHFUNCNAME(conv_float_oop)(
        const float * RESTRICT s, conv_buffer_state * RESTRICT state,
        const float * RESTRICT filter, const int sz_filter,
        float * RESTRICT y
        )
 {
    const int first = state->offset;
    const int n_samples = state->size;
    int pos = first;
    while (pos + sz_filter <= n_samples)
    {
        float sum = 0.0F;
        for (int tap = 0; tap < sz_filter; ++tap)
            sum += s[pos+tap] * filter[tap];
        // output index equals input index: y is written at the same position
        y[pos] = sum;
        ++pos;
    }
    state->offset = pos;
    return pos - first;
 }
 // Scalar out-of-place convolution of complex samples with a real filter:
 // y_cplx[pos] = sum_k s_cplx[pos+k] * filter[k], computed separately for the
 // i and q members. Starts at state->offset, stores the first unprocessed
 // position back into state->offset and returns the number of produced samples.
 int ARCHFUNCNAME(conv_cplx_float_oop)(
        const complexf * RESTRICT s_cplx, conv_buffer_state * RESTRICT state,
        const float * RESTRICT filter, const int sz_filter,
        complexf * RESTRICT y_cplx
        )
 {
    const int off0 = state->offset;
    const int sz_s = state->size;
    const int sz_f = sz_filter;
    int offset;
    for ( offset = off0; offset + sz_f <= sz_s; ++offset)
    {
        float accu_re = 0.0F;
        float accu_im = 0.0F;
        for (int k = 0; k < sz_filter; ++k)
        {
            // accumulate over all taps (plain assignment would keep only the last tap)
            accu_re += s_cplx[offset+k].i * filter[k];   // accu += rS * rH;
            accu_im += s_cplx[offset+k].q * filter[k];   // accu += rS * rH;
        }
        y_cplx[offset].i = accu_re;  // == hadd() == sum of real parts
        y_cplx[offset].q = accu_im;  // == hadd() == sum of imag parts
    }
    state->offset = offset;
    return offset - off0;
 }
 #elif defined(HAVE_MIPP)
 // SIMD (MIPP) in-place convolution: for every position that still has
 // sz_filter samples available, s[pos] is replaced by sum_k s[pos+k] * filter[k].
 // sz_filter must be a multiple of mipp::N<float>().
 // Starts at state->offset, stores the first unprocessed position back into
 // state->offset and returns the number of produced samples.
 int ARCHFUNCNAME(conv_float_inplace)(
        float * RESTRICT s, conv_buffer_state * RESTRICT state,
        const float * RESTRICT filter, const int sz_filter
        )
 {
    assert( (sz_filter % mipp::N<float>()) == 0 );  // size of filter must be divisible by conv_float_simd_size()
    mipp::Reg<float> accu, rS, rH;
    const int off0 = state->offset;
    const int sz_s = state->size;
    int offset;
    for ( offset = off0; offset + sz_filter <= sz_s; ++offset)
    {
        accu.set0();
        for (int k = 0; k < sz_filter; k += mipp::N<float>())
        {
            // offset advances by a single float, so &s[offset+k] is in general
            // not SIMD-aligned: use the unaligned load, as conv_float_oop does
            rS.loadu(&s[offset+k]);
            rH.load(&filter[k]);
            accu = mipp::fmadd(rS, rH, accu);   // accu += rS * rH;
        }
        s[offset] = accu.sum();    // == hadd()
    }
    state->offset = offset;
    return offset - off0;
 }
 // SIMD (MIPP) out-of-place convolution: for every position that still has
 // sz_filter samples available, y[pos] = sum_k s[pos+k] * filter[k].
 // sz_filter must be a multiple of mipp::N<float>().
 // Starts at state->offset, stores the first unprocessed position back into
 // state->offset and returns the number of produced samples.
 int ARCHFUNCNAME(conv_float_oop)(
        const float * RESTRICT s, conv_buffer_state * RESTRICT state,
        const float * RESTRICT filter, const int sz_filter,
        float * RESTRICT y
        )
 {
    assert( (sz_filter % mipp::N<float>()) == 0 );  // size of filter must be divisible by conv_float_simd_size()
    const int lanes = mipp::N<float>();
    const int first = state->offset;
    const int n_samples = state->size;
    mipp::Reg<float> sum, samples, taps;
    int pos = first;
    while (pos + sz_filter <= n_samples)
    {
        sum.set0();
        for (int tap = 0; tap < sz_filter; tap += lanes)
        {
            samples.loadu(&s[pos+tap]);     // sample position moves float by float
            taps.load(&filter[tap]);
            sum = mipp::fmadd(samples, taps, sum);   // sum += samples * taps
        }
        y[pos] = sum.sum();    // horizontal add over all lanes
        ++pos;
    }
    state->offset = pos;
    return pos - first;
 }
 // SIMD (MIPP) out-of-place convolution of complex samples with a real filter.
 // The complex buffers are accessed as plain float arrays (i, q, i, q, ..),
 // so all offsets/sizes are doubled internally and halved again on return.
 // sz_filter must be a multiple of mipp::N<float>().
 // Stores the first unprocessed complex position back into state->offset and
 // returns the number of produced complex samples.
 int ARCHFUNCNAME(conv_cplx_float_oop)(
        const complexf * RESTRICT s_cplx, conv_buffer_state * RESTRICT state,
        const float * RESTRICT filter, const int sz_filter,
        complexf * RESTRICT y_cplx
        )
 {
    assert( (sz_filter % mipp::N<float>()) == 0 );  // size of filter must be divisible by conv_float_simd_size()
    // view the complex arrays as interleaved float arrays
    const float * RESTRICT s = &(s_cplx[0].i);
    float * RESTRICT y = &(y_cplx[0].i);
    mipp::Regx2<float> accu_x2, rS_x2, H_x2;
    // float indices: two floats per complex sample
    const int off0 = 2 * state->offset;
    const int sz_s = 2 * state->size;
    const int sz_f2 = 2 * sz_filter;
    int offset;
    for ( offset = off0; offset + sz_f2 <= sz_s; offset += 2)
    {
        accu_x2.val[0].set0();
        accu_x2.val[1].set0();
        for (int k = 0; k < sz_filter; k += mipp::N<float>())
        {
            mipp::Reg<float> rH;
            // two registers of interleaved samples per one register of filter taps
            rS_x2.loadu(&s[offset+2*k]);
            rH.load(&filter[k]);
            // duplicate every tap, so that it multiplies the i and the q value of one sample
            H_x2 = mipp::interleave<float>(rH, rH);
            accu_x2.val[0] = mipp::fmadd(rS_x2.val[0], H_x2.val[0], accu_x2.val[0]);   // accu += rS * rH;
            accu_x2.val[1] = mipp::fmadd(rS_x2.val[1], H_x2.val[1], accu_x2.val[1]);   // accu += rS * rH;
        }
        // separate the accumulated i and q values before summing them up
        H_x2 = mipp::deinterleave(accu_x2);
        y[offset]   = H_x2.val[0].sum();  // == hadd() == sum of real parts
        y[offset+1] = H_x2.val[1].sum();  // == hadd() == sum of imag parts
    }
    // convert the float index back to a complex sample index
    state->offset = offset /2;
    return (offset - off0) / 2;
 }
 #endif
 // Function pointer table of this translation unit / architecture.
 // The initializers follow the member order of struct conv_f_ptrs:
 // id, using_mipp, fp_id, fp_conv_float_simd_size, then the five kernels.
 static const conv_f_ptrs conv_ptrs =
 {
    PP_TOSTRING(CONV_ARCH_POST),
    // using_mipp: 1 unless MIPP_NO_INTRINSICS is defined
 #ifndef MIPP_NO_INTRINSICS
    1,
 #else
    0,
 #endif
    ARCHFUNCNAME(id),
    ARCHFUNCNAME(conv_float_simd_size),
    // the kernels only exist when one of the two implementations above was compiled
 #if defined(MIPP_NO_INTRINSICS) || defined(HAVE_MIPP)
    ARCHFUNCNAME(conv_float_move_rest),
    ARCHFUNCNAME(conv_float_inplace),
    ARCHFUNCNAME(conv_float_oop),
    ARCHFUNCNAME(conv_cplx_move_rest),
    ARCHFUNCNAME(conv_cplx_float_oop)
 #else
    nullptr,
    nullptr,
    nullptr,
    nullptr,
    nullptr
 #endif
 };
 // Returns the function pointer table of this architecture, or nullptr when
 // it is not usable. The table is returned when
 // - the id is "none", or
 // - MIPP_NO_INTRINSICS is defined (scalar implementation), or
 // - HAVE_MIPP is defined, using_mipp is set and the SIMD size is above 1.
 // In every other case the result is nullptr.
 const conv_f_ptrs* ARCHFUNCNAME(conv_ptrs)()
 {
    DPRINT("arch pointer for '%s':\n", conv_ptrs.id);
    if (!strcmp(conv_ptrs.id, "none"))
        return &conv_ptrs;
 #if defined(MIPP_NO_INTRINSICS)
    DPRINT("arch pointer for '%s' - BUT defined(MIPP_NO_INTRINSICS)\n", conv_ptrs.id);
    return &conv_ptrs;
 #elif defined(HAVE_MIPP)
    DPRINT("arch pointer for '%s' - defined(HAVE_MIPP)\n", conv_ptrs.id);
    DPRINT("'%s': conv_ptrs.using_mipp %d\n", conv_ptrs.id, conv_ptrs.using_mipp);
    DPRINT("'%s': simd_size() %d\n", conv_ptrs.id, conv_ptrs.fp_conv_float_simd_size());
    if (conv_ptrs.using_mipp && conv_ptrs.fp_conv_float_simd_size() > 1)
        return &conv_ptrs;
    else
        DPRINT("arch pointer for '%s': HAVE_MIPP BUT using_mipp %d, float_simd_size %d\n", conv_ptrs.id, conv_ptrs.using_mipp, conv_ptrs.fp_conv_float_simd_size());
 #else
    DPRINT("arch pointer for '%s': neither MIPP_NO_INTRINSICS nor HAVE_MIPP\n", conv_ptrs.id);
 #endif
    // reached when no usable implementation was compiled in
    DPRINT("arch pointer for '%s' => nullptr\n", conv_ptrs.id);
    return nullptr;
 }
 // Assigns the accessor above to a variable of type f_conv_ptrs.
 // NOTE(review): presumably only a compile-time check that the accessor's
 // signature matches the typedef; the variable is not used in this file,
 // hence [[maybe_unused]] when compiling as C++17 or later.
 #if defined(__cplusplus) && (__cplusplus >= 201703L)
 [[maybe_unused]]
 #endif
 static f_conv_ptrs test_f_ptrs = ARCHFUNCNAME(conv_ptrs);
--- a/pffft/pf_conv.h
+++ b/pffft/pf_conv.h
@@ -0,0 +1,109 @@
 #pragma once
 /* pf_conv.h/.cpp implements linear "slow" convolution.
 * this code is primarily for test/demonstration of runtime dispatching.
 * each "kernel" is compiled with different compiler/architecture options,
 * that activates different implementations in the MIPP headers.
 *
 * the dispatcher library 'pf_conv_dispatcher' collects (links against)
 * all the pf_conv_arch_<opt> libraries ..
 * and provides the  get_all_conv_arch_ptrs() function,
 * which delivers an array of pointers to the struct (conv_f_ptrs)
 * containing the function pointers for the different implementations.
 *
 * requirement(s):
 * - installed MIPP headers
 * - compiler definitions for the different architecture types:
 *   see CMakeLists.txt CONV_ARCH_MSVC_AMD64, CONV_ARCH_GCC_ARM32NEON, ..
 * - one cmake library target pf_conv_arch_<opt> for each architecture option.
 *   each one gets its specific  architecture/compiler  options
 *    utilizing the target_set_cxx_arch_option() macro in the CMakeLists.txt
 */
 #include "pf_cplx.h"
 // RESTRICT: compiler specific spelling of the restrict pointer qualifier;
 // expands to nothing for compilers other than MSVC and GCC-compatible ones
 #if defined(_MSC_VER)
 #  define RESTRICT __restrict
 #elif defined(__GNUC__)
 #  define RESTRICT __restrict
 #else
 #  define RESTRICT
 #endif
 // Bookkeeping for a sample buffer processed by the conv_* functions:
 // the kernels consume samples from `offset` up to `size` and advance `offset`;
 // the *_move_rest functions move the remainder to the buffer start and reset `offset` to 0.
 struct conv_buffer_state
 {
    int offset; // sample index where data (to process) starts
    int size;   // actual - or previous - size in amount of samples from buffer start (NOT offset)
 };
 // declare provided function pointer types
 // returns the architecture id string of an implementation
 typedef const char * (*f_conv_id)();
 // returns the number of floats per SIMD register (1 for the scalar implementation)
 typedef int  (*f_conv_float_simd_size)();
 // move the unprocessed samples to the begin of the buffer and update the state
 typedef void (*f_conv_float_move_rest)(float * RESTRICT s, conv_buffer_state * RESTRICT state);
 typedef void (*f_conv_cplx_move_rest)(complexf * RESTRICT s, conv_buffer_state * RESTRICT state);
 // convolve s with filter, writing the result back into s;
 // returns the number of produced samples and advances state->offset
 typedef int  (*f_conv_float_inplace)(
        float * RESTRICT s, conv_buffer_state * RESTRICT state,
        const float * RESTRICT filter, const int sz_filter
        );
 // convolve s with filter, writing the result into y (out-of-place);
 // returns the number of produced samples and advances state->offset
 typedef int  (*f_conv_float_oop)(
        const float * RESTRICT s, conv_buffer_state * RESTRICT state,
        const float * RESTRICT filter, const int sz_filter,
        float * RESTRICT y
        );
 // same as f_conv_float_oop, but for complex samples with a real valued filter
 typedef int  (*f_conv_cplx_float_oop)(
        const complexf * RESTRICT s, conv_buffer_state * RESTRICT state,
        const float * RESTRICT filter, const int sz_filter,
        complexf * RESTRICT y
        );
 // struct with the provided function pointers
 // one instance exists per compiled architecture variant (see pf_conv.cpp);
 // the kernel pointers are nullptr when no implementation was compiled in
 struct conv_f_ptrs
 {
    const char * id;        // architecture id string (CONV_ARCH_POST)
    const int using_mipp;   // 1 unless compiled with MIPP_NO_INTRINSICS
    f_conv_id               fp_id;
    f_conv_float_simd_size  fp_conv_float_simd_size;
    f_conv_float_move_rest  fp_conv_float_move_rest;
    f_conv_float_inplace    fp_conv_float_inplace;
    f_conv_float_oop        fp_conv_float_oop;
    f_conv_cplx_move_rest   fp_conv_cplx_move_rest;
    f_conv_cplx_float_oop   fp_conv_cplx_float_oop;
 };
 // element type of the array returned by get_all_conv_arch_ptrs()
 typedef const conv_f_ptrs * ptr_to_conv_f_ptrs;
 // function pointer type, delivering the struct with the function pointers
 typedef const conv_f_ptrs* (*f_conv_ptrs)();
 // helper for systematic function names
 // CONV_FN_ARCH(conv_ptrs, sse3) expands to conv_ptrs_sse3
 #define CONV_FN_ARCH(FN, ARCH) FN##_##ARCH
 // declare all functions - returning the structs with the function pointers
 // each of them is defined by compiling pf_conv.cpp with the matching CONV_ARCH_POST;
 // a function may return nullptr when its implementation is not usable
 extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, none)();  // = conv_ptrs_none()
 extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, dflt)();  // simd / mipp is activated
 extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, sse3)();  // = conv_ptrs_sse3()
 extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, sse4)();
 extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx)();
 extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx2)();
 extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, sse2)();
 //extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx)();  // already declared
 //extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx2)(); // already declared
 extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, neon_vfpv4)();    // for armv7l / 32-bit ARM
 extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, neon_rpi3_a53)();
 extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, neon_rpi4_a72)();
 extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, armv8a)();  // for aarch64
--- a/pffft/pf_conv_dispatcher.cpp
+++ b/pffft/pf_conv_dispatcher.cpp
@@ -0,0 +1,61 @@
 #include "pf_conv_dispatcher.h"
 #if 0
 #include <stdio.h>
 #define DPRINT(...) fprintf(stderr, __VA_ARGS__)
 #else
 #define DPRINT(...) do { } while (0)
 #endif
 #define N_DEFAULT_ARCHES  2
 // 0 is "none"
 // 1 "dflt"
 // Returns an array with one conv_f_ptrs pointer per architecture variant
 // that was compiled in; the array is built on the first call and cached in
 // function-local statics.
 // p_num_arch: optional; when not null it receives the number of array entries.
 // Index 0 is always the "none" variant and index 1 the "dflt" variant,
 // followed by the variants selected through the CONV_ARCH_* definition.
 // Entries are whatever the conv_ptrs_<arch>() accessors return, which can be nullptr.
 ptr_to_conv_f_ptrs * get_all_conv_arch_ptrs(int * p_num_arch)
 {
    static ptr_to_conv_f_ptrs * all_arches = nullptr;
    static int n_arch = 0;
    if (!all_arches)
    {
        // slots 0 and 1 are reserved; architecture specific entries are appended behind them
        n_arch = N_DEFAULT_ARCHES;
        // @TODO: runtime check if actual CPU supports specific architecture
 #if defined(CONV_ARCH_GCC_AMD64)
        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+4] = {0};
        DPRINT("CONV_ARCH_GCC_AMD64: sse3, sse4, avx, avx2\n");
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, sse3)();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, sse4)();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx) ();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx2)();
 #elif defined(CONV_ARCH_MSVC_AMD64)
        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+3] = {0};
        DPRINT("CONV_ARCH_MSVC_AMD64: sse2, avx, avx2\n");
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, sse2)();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx) ();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx2)();
 #elif defined(CONV_ARCH_GCC_ARM32NEON)
        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+3] = {0};
        // note: the debug message below lists two variants, three are registered
        DPRINT("CONV_ARCH_GCC_ARM32NEON: neon_vfpv4, neon_rpi3_a53\n");
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, neon_vfpv4)();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, neon_rpi3_a53)();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, neon_rpi4_a72)();
 #elif defined(CONV_ARCH_GCC_AARCH64)
        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+1] = {0};
        DPRINT("CONV_ARCH_GCC_AARCH64: -\n");
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, armv8a)();
 #else
        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES] = {0};
        DPRINT("unknown CONV_ARCH: -\n");
 #endif
        // the two default variants exist for every build
        conv_arch_ptrs[0] = CONV_FN_ARCH(conv_ptrs, none)();
        conv_arch_ptrs[1] = CONV_FN_ARCH(conv_ptrs, dflt)();
        all_arches = conv_arch_ptrs;
    }
    if (p_num_arch)
        *p_num_arch = n_arch;
    return all_arches;
 }
--- a/pffft/pf_conv_dispatcher.h
+++ b/pffft/pf_conv_dispatcher.h
@@ -0,0 +1,6 @@
 #pragma once
 #include "pf_conv.h"
 // Returns the array of conv_f_ptrs pointers of all compiled-in architecture
 // variants; when p_num_arch is not null it receives the number of entries.
 ptr_to_conv_f_ptrs * get_all_conv_arch_ptrs(int * p_num_arch);
--- a/pffft/pf_cplx.h
+++ b/pffft/pf_cplx.h
@@ -0,0 +1,44 @@
 /*
 This software is part of pffft/pfdsp, a set of simple DSP routines.
 Copyright (c) 2020  Hayati Ayguen <h_ayguen@web.de>
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the copyright holder nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #pragma once
 /*
   _____                      _
  / ____|                    | |
 | |     ___  _ __ ___  _ __ | | _____  __
 | |    / _ \| '_ ` _ \| '_ \| |/ _ \ \/ /
 | |___| (_) | | | | | | |_) | |  __/>  <
  \_____\___/|_| |_| |_| .__/|_|\___/_/\_\
                       | |
                       |_|
 */
 /* Complex sample made of two float members named i and q. */
 typedef struct complexf_s { float i; float q; } complexf;
--- a/pffft/pf_mixer.cpp
+++ b/pffft/pf_mixer.cpp
--- a/pffft/pf_mixer.h
+++ b/pffft/pf_mixer.h
@@ -0,0 +1,270 @@
 /*
 This software is part of pffft/pfdsp, a set of simple DSP routines.
 Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
 Copyright (c) 2020  Hayati Ayguen <h_ayguen@web.de>
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the copyright holder nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #pragma once
 #include <stdio.h>
 #include <stdint.h>
 #include "pf_cplx.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 // =================================================================================
 /* Query for the SSE shift-mixer variants declared further below.
  * NOTE(review): presumably returns non-zero when the *_sse_* functions are
  * backed by real SIMD code - confirm in pf_mixer.cpp. */
 int have_sse_shift_mixer_impl();
 /*********************************************************************/
 /**************/
 /*** ALGO A ***/
 /**************/
 /* Shifter variant taking all parameters per call (no state struct).
  * Reads input_size complex samples from 'input' and writes to 'output'.
  * NOTE(review): 'rate' is presumably the shift frequency relative to the
  * sample rate and the return value the phase to pass as 'starting_phase'
  * on the next call - confirm in pf_mixer.cpp. */
 float shift_math_cc(const complexf *input, complexf* output, int input_size, float rate, float starting_phase);
 /*********************************************************************/
 /**************/
 /*** ALGO B ***/
 /**************/
 /* State of the table based shifter: a float buffer and its size.
  * NOTE(review): presumably a precomputed trig lookup table created by
  * shift_table_init() - confirm in pf_mixer.cpp. */
 typedef struct shift_table_data_s
 {
    float* table;     /* table storage; presumably released by shift_table_deinit() - TODO confirm */
    int table_size;   /* size that belongs to 'table' (see shift_table_init()) */
 } shift_table_data_t;
 /* Note that the state struct is passed by value to deinit and to the
  * processing function, not by pointer. */
 void shift_table_deinit(shift_table_data_t table_data);
 shift_table_data_t shift_table_init(int table_size);
 /* Processes input_size complex samples from 'input' into 'output'.
  * NOTE(review): return value is presumably the phase for the next call. */
 float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase);
 /*********************************************************************/
 /**************/
 /*** ALGO C ***/
 /**************/
 /* State of the "addfast" shifter: two fixed arrays of 4 floats each plus a
  * phase increment; filled by shift_addfast_init(rate).
  * NOTE(review): dsin/dcos presumably hold sine/cosine of multiples of
  * phase_increment - confirm in pf_mixer.cpp. */
 typedef struct shift_addfast_data_s
 {
    float dsin[4];
    float dcos[4];
    float phase_increment;
 } shift_addfast_data_t;
 shift_addfast_data_t shift_addfast_init(float rate);
 /* Out-of-place variant: input_size complex samples, input -> output. */
 float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase);
 /* In-place variant: N_cplx complex samples in in_out.
  * NOTE(review): both presumably return the phase for the next call. */
 float shift_addfast_inp_c(complexf *in_out, int N_cplx, shift_addfast_data_t* d, float starting_phase);
 /*********************************************************************/
 /**************/
 /*** ALGO D ***/
 /**************/
 /* State of the "unroll" shifter: two float buffers, a phase increment and
  * a size; created by shift_unroll_init(rate, size).
  * NOTE(review): dsin/dcos are presumably heap tables of 'size' entries
  * that shift_unroll_deinit() releases - confirm in pf_mixer.cpp. */
 typedef struct shift_unroll_data_s
 {
    float* dsin;
    float* dcos;
    float phase_increment;
    int size;
 } shift_unroll_data_t;
 shift_unroll_data_t shift_unroll_init(float rate, int size);
 void shift_unroll_deinit(shift_unroll_data_t* d);
 /* Out-of-place (input -> output) and in-place (in_out) processing of
  * 'size' complex samples.
  * NOTE(review): the return value is presumably the phase for the next call. */
 float shift_unroll_cc(complexf *input, complexf* output, int size, shift_unroll_data_t* d, float starting_phase);
 float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase);
 /*********************************************************************/
 /**************/
 /*** ALGO E ***/
 /**************/
 /* similar to shift_unroll_cc() - but with a fixed and limited precalc size
 * idea: smaller cache usage by the table
 * size must be a multiple of PF_SHIFT_LIMITED_SIMD_SZ (= 4)
 */
 #define PF_SHIFT_LIMITED_UNROLL_SIZE  128
 #define PF_SHIFT_LIMITED_SIMD_SZ  4
 /* State of the "limited unroll" shifter: fixed-size dcos/dsin tables of
  * PF_SHIFT_LIMITED_UNROLL_SIZE entries, a complex phase and a phase
  * increment. Unlike the shifters above, the processing functions return
  * void and take no starting_phase argument. */
 typedef struct shift_limited_unroll_data_s
 {
    float dcos[PF_SHIFT_LIMITED_UNROLL_SIZE];
    float dsin[PF_SHIFT_LIMITED_UNROLL_SIZE];
    complexf complex_phase;
    float phase_increment;
 } shift_limited_unroll_data_t;
 shift_limited_unroll_data_t shift_limited_unroll_init(float rate);
 /* size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ */
 /* starting_phase for next call is kept internal in state */
 void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d);
 void shift_limited_unroll_inp_c(complexf* in_out, int size, shift_limited_unroll_data_t* d);
 /*********************************************************************/
 /**************/
 /*** ALGO F ***/
 /**************/
 /* State of SSE variant A of the limited-unroll shifter. Separate dcos and
  * dsin tables, each PF_SHIFT_LIMITED_SIMD_SZ entries longer than
  * PF_SHIFT_LIMITED_UNROLL_SIZE, plus PF_SHIFT_LIMITED_SIMD_SZ phase states. */
 typedef struct shift_limited_unroll_A_sse_data_s
 {
    /* small/limited trig table */
    float dcos[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
    float dsin[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
    /* 4 times complex phase */
    float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
    float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
    /* N_cplx_per_block times increment - for future parallel variants */
    float dcos_blk;
    float dsin_blk;
    /* phase increment; presumably per sample - TODO confirm in pf_mixer.cpp */
    float phase_increment;
 } shift_limited_unroll_A_sse_data_t;
 /* The start phase is given (in radians, per the parameter name) at init. */
 shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad);
 /* In-place processing of N_cplx complex samples.
  * NOTE(review): alignment / block-size requirements are not visible here. */
 void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d);
 /*********************************************************************/
 /**************/
 /*** ALGO G ***/
 /**************/
 /* State of SSE variant B of the limited-unroll shifter. Differs from
  * variant A by having one single table 'dtrig' instead of separate
  * dcos/dsin tables. */
 typedef struct shift_limited_unroll_B_sse_data_s
 {
    /* small/limited trig table */
    float dtrig[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
    /* 4 times complex phase */
    float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
    float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
    /* N_cplx_per_block times increment - for future parallel variants */
    float dcos_blk;
    float dsin_blk;
    /* phase increment; presumably per sample - TODO confirm in pf_mixer.cpp */
    float phase_increment;
 } shift_limited_unroll_B_sse_data_t;
 /* The start phase is given (in radians, per the parameter name) at init. */
 shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad);
 /* In-place processing of N_cplx complex samples. */
 void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d);
 /*********************************************************************/
 /**************/
 /*** ALGO H ***/
 /**************/
 /* State of SSE variant C of the limited-unroll shifter. Uses one
  * interleaved trig table with twice the entry count of the variant A
  * tables. */
 typedef struct shift_limited_unroll_C_sse_data_s
 {
    /* small/limited trig table - interleaved: 4 cos, 4 sin, 4 cos, .. */
    float dinterl_trig[2*(PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ)];
    /* 4 times complex phase */
    float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
    float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
    /* N_cplx_per_block times increment - for future parallel variants */
    float dcos_blk;
    float dsin_blk;
    /* phase increment; presumably per sample - TODO confirm in pf_mixer.cpp */
    float phase_increment;
 } shift_limited_unroll_C_sse_data_t;
 /* The start phase is given (in radians, per the parameter name) at init. */
 shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad);
 /* In-place processing of N_cplx complex samples. */
 void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d);
 /*********************************************************************/
 /**************/
 /*** ALGO I ***/
 /**************/
 /* Recursive Quadrature Oscillator functions "recursive_osc"
 * see https://www.vicanek.de/articles/QuadOsc.pdf
 */
 #define PF_SHIFT_RECURSIVE_SIMD_SZ  8
 /* Oscillator state: PF_SHIFT_RECURSIVE_SIMD_SZ parallel (u_cos, v_sin)
  * pairs. */
 typedef struct shift_recursive_osc_s
 {
    float u_cos[PF_SHIFT_RECURSIVE_SIMD_SZ];
    float v_sin[PF_SHIFT_RECURSIVE_SIMD_SZ];
 } shift_recursive_osc_t;
 /* Oscillator configuration: the two coefficients k1 and k2. It is kept
  * separate from the state and passed as const to the processing functions.
  * NOTE(review): presumably the k1/k2 of the referenced paper, derived from
  * 'rate' - confirm in pf_mixer.cpp. */
 typedef struct shift_recursive_osc_conf_s
 {
    float k1;
    float k2;
 } shift_recursive_osc_conf_t;
 /* init fills both conf and state; update_rate takes a new rate for an
  * already initialized conf/state pair */
 void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state);
 void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
 /* size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ */
 /* NOTE(review): this section defines PF_SHIFT_RECURSIVE_SIMD_SZ (= 8); the
  * line above possibly means that macro - confirm in pf_mixer.cpp */
 /* starting_phase for next call is kept internal in state */
 void shift_recursive_osc_cc(const complexf *input, complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
 void shift_recursive_osc_inp_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
 /* takes no input vector; presumably writes the oscillator signal itself
  * into output - TODO confirm */
 void gen_recursive_osc_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
 /*********************************************************************/
 /**************/
 /*** ALGO J ***/
 /**************/
 #define PF_SHIFT_RECURSIVE_SIMD_SSE_SZ  4
 /* State of the SSE flavour of the recursive oscillator: same layout as
  * shift_recursive_osc_t, but with PF_SHIFT_RECURSIVE_SIMD_SSE_SZ (= 4)
  * parallel (u_cos, v_sin) pairs. */
 typedef struct shift_recursive_osc_sse_s
 {
    float u_cos[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ];
    float v_sin[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ];
 } shift_recursive_osc_sse_t;
 /* Configuration of the SSE flavour: the two coefficients k1 and k2. */
 typedef struct shift_recursive_osc_sse_conf_s
 {
    float k1;
    float k2;
 } shift_recursive_osc_sse_conf_t;
 void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state);
 void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state);
 /* In-place processing of N_cplx complex samples; only an in-place
  * function is declared for this flavour.
  * NOTE(review): N_cplx is presumably required to be a multiple of
  * PF_SHIFT_RECURSIVE_SIMD_SSE_SZ - confirm in pf_mixer.cpp. */
 void shift_recursive_osc_sse_inp_c(complexf* in_out, int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext);
 #ifdef __cplusplus
 }
 #endif
--- a/pffft/pffastconv.c
+++ b/pffft/pffastconv.c
@@ -0,0 +1,264 @@
 /*
  Copyright (c) 2019  Hayati Ayguen ( h_ayguen@web.de )
 */
 #include "pffastconv.h"
 #include "pffft.h"
 #include <stdlib.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <math.h>
 #include <assert.h>
 #include <string.h>
 #define FASTCONV_DBG_OUT  0
 /* detect compiler flavour and select the matching spelling of the
  * 'restrict' qualifier used by pffastconv_apply() */
 #if defined(_MSC_VER)
 #  define RESTRICT __restrict
 #pragma warning( disable : 4244 4305 4204 4456 )
 #elif defined(__GNUC__)
 #  define RESTRICT __restrict
 #else
 /* unknown compiler: 'restrict' is only an optimization hint, so it is
  * safe to drop it instead of failing to compile */
 #  define RESTRICT
 #endif
 /* Allocate nb_bytes through pffft_aligned_malloc(); release the result
  * with pffastconv_free(). */
 void *pffastconv_malloc(size_t nb_bytes)
 {
   void *mem = pffft_aligned_malloc(nb_bytes);
   return mem;
 }
 /* Release a buffer obtained from pffastconv_malloc(); simply forwards to
  * pffft_aligned_free(). */
 void pffastconv_free(void *p)
 {
   void *mem = p;
   pffft_aligned_free(mem);
 }
 /* Report the SIMD width used by the underlying FFT; simply forwards the
  * value of pffft_simd_size().
  * Declared with (void): an empty parameter list in a C definition is an
  * old-style (non-prototype) declarator. */
 int pffastconv_simd_size(void)
 {
   return pffft_simd_size();
 }
 /* Internal state behind the opaque PFFASTCONV_Setup handle.
  * All four float buffers hold Nfft floats and come from pffastconv_malloc().
  * Xt, Xf and Mf are scratch buffers rewritten by pffastconv_apply(), which
  * is why one setup must not be used concurrently. */
 struct PFFASTCONV_Setup
 {
   float * Xt;      /* input == x in time domain - copy for alignment;
                     * NULL when PFFASTCONV_DIRECT_INP is set without
                     * PFFASTCONV_CPLX_INP_OUT */
   float * Xf;      /* input == X in freq domain */
   float * Hf;      /* filterCoeffs == H in freq domain; computed once in
                     * pffastconv_new_setup() */
   float * Mf;      /* input * filterCoeffs in freq domain */
   PFFFT_Setup *st; /* real FFT plan of length Nfft */
   int filterLen;   /* convolution length; 2 * filterLen - 1 in
                     * single-FFT complex mode */
   int Nfft;        /* FFT/block length in floats */
   int flags;       /* pffastconv_flags_t bits given at setup */
   float scale;     /* 1.0 / Nfft, handed to pffft_zconvolve_no_accu() */
 };
 /*
  * Create a fast-convolution setup for a real-valued filter.
  *
  * filterCoeffs : filterLen real filter taps
  * filterLen    : number of taps, must be >= 1
  * blockLen     : in: requested block length; out: the block length
  *                actually used, in (complex) samples. Only written on
  *                success.
  * flags        : bitmask of pffastconv_flags_t
  *
  * Returns the new setup, to be released with pffastconv_destroy_setup().
  * Returns NULL when
  *  - PFFASTCONV_CPLX_FILTER is requested (not implemented),
  *  - filterCoeffs or blockLen is NULL or filterLen < 1,
  *  - a buffer or the FFT plan could not be allocated.
  */
 PFFASTCONV_Setup * pffastconv_new_setup( const float * filterCoeffs, int filterLen, int * blockLen, int flags )
 {
   PFFASTCONV_Setup * s = NULL;
   float * taps = NULL;  /* time domain staging buffer for the filter taps */
   const int cplxFactor = ( (flags & PFFASTCONV_CPLX_INP_OUT) && (flags & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 2 : 1;
   const int minFftLen = 2*pffft_simd_size()*pffft_simd_size();
   /* Xt is not allocated for direct real input */
   const int needXt = !( (flags & PFFASTCONV_DIRECT_INP) && !(flags & PFFASTCONV_CPLX_INP_OUT) );
   int i, Nfft, outBlockLen;
 #if FASTCONV_DBG_OUT
   int iOldBlkLen;
 #endif
   if ( !filterCoeffs || !blockLen || filterLen < 1 )
     return NULL;
   if ( flags & PFFASTCONV_CPLX_FILTER )
     return NULL;
 #if FASTCONV_DBG_OUT
   iOldBlkLen = *blockLen;
 #endif
   Nfft = 2 * pffft_next_power_of_two(filterLen -1);
   if ( Nfft < minFftLen )
     Nfft = minFftLen;
   if ( *blockLen > Nfft ) {
     Nfft = *blockLen;
     Nfft = pffft_next_power_of_two(Nfft);
   }
   outBlockLen = Nfft;  /* this is in (complex) samples */
   Nfft *= cplxFactor;
   s = pffastconv_malloc( sizeof(struct PFFASTCONV_Setup) );
   if ( !s )
     return NULL;
   s->Xt = needXt ? pffastconv_malloc((unsigned)Nfft * sizeof(float)) : NULL;
   s->Xf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
   s->Hf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
   s->Mf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
   s->st = pffft_new_setup(Nfft, PFFFT_REAL);  /* with complex: we do 2 x fft() */
   if ( (needXt && !s->Xt) || !s->Xf || !s->Hf || !s->Mf || !s->st ) {
     /* partial construction: release whatever was obtained */
     if ( s->st )
       pffft_destroy_setup(s->st);
     if ( s->Mf )
       pffastconv_free(s->Mf);
     if ( s->Hf )
       pffastconv_free(s->Hf);
     if ( s->Xf )
       pffastconv_free(s->Xf);
     if ( s->Xt )
       pffastconv_free(s->Xt);
     pffastconv_free(s);
     return NULL;
   }
   *blockLen = outBlockLen;
   s->filterLen = filterLen;        /* filterLen == convolution length == length of impulse response */
   if ( cplxFactor == 2 )
     s->filterLen = 2 * filterLen - 1;
   s->Nfft = Nfft;  /* FFT/block length */
   s->flags = flags;
   s->scale = (float)( 1.0 / Nfft );
   /* Without Xt (direct real input) the taps are staged in Xf instead:
    * Xf is pure scratch that pffastconv_apply() rewrites on every block.
    * Writing through the NULL Xt here would crash. */
   taps = s->Xt ? s->Xt : s->Xf;
   memset( taps, 0, (unsigned)Nfft * sizeof(float) );
   /* store the taps wrapped around index 0 (index mask works as Nfft is a
    * power of two); time-reversed for convolution, as given for correlation */
   if ( flags & PFFASTCONV_CORRELATION ) {
     for ( i = 0; i < filterLen; ++i )
       taps[ ( Nfft - cplxFactor * i ) & (Nfft -1) ] = filterCoeffs[ i ];
   } else {
     for ( i = 0; i < filterLen; ++i )
       taps[ ( Nfft - cplxFactor * i ) & (Nfft -1) ] = filterCoeffs[ filterLen - 1 - i ];
   }
   pffft_transform(s->st, taps, s->Hf, /* tmp = */ s->Mf, PFFFT_FORWARD);
 #if FASTCONV_DBG_OUT
   printf("\n  fastConvSetup(filterLen = %d, blockLen %d) --> blockLen %d, OutLen = %d\n"
     , filterLen, iOldBlkLen, *blockLen, Nfft - filterLen +1 );
 #endif
   return s;
 }
 /* Release a setup created by pffastconv_new_setup(): the FFT plan, the
  * work buffers and finally the struct itself. A NULL setup is ignored. */
 void pffastconv_destroy_setup( PFFASTCONV_Setup * s )
 {
   if ( s != NULL )
   {
     pffft_destroy_setup(s->st);
     pffastconv_free(s->Mf);
     pffastconv_free(s->Hf);
     pffastconv_free(s->Xf);
     /* Xt is optional: it stays NULL for direct real input */
     if ( s->Xt != NULL )
       pffastconv_free(s->Xt);
     pffastconv_free(s);
   }
 }
 /*
  * Run the fast convolution block by block over the input.
  *
  * s            : setup from pffastconv_new_setup(); its work buffers are
  *                overwritten
  * input_       : cplxInputLen samples; interleaved re/im floats when
  *                PFFASTCONV_CPLX_INP_OUT was set at setup
  * cplxInputLen : number of input samples (complex samples in complex mode)
  * output_      : receives the produced samples
  * applyFlush   : when non-zero, a final block shorter than Nfft is
  *                zero-padded and processed as well
  *
  * Returns the number of produced (complex) samples; the same number of
  * input samples has been consumed.
  */
 int pffastconv_apply(PFFASTCONV_Setup * s, const float *input_, int cplxInputLen, float *output_, int applyFlush)
 {
   const float * RESTRICT X = input_;
   float * RESTRICT Y = output_;
   const int Nfft = s->Nfft;
   const int filterLen = s->filterLen;
   const int flags = s->flags;
   /* 2: complex data processed as interleaved floats by one single FFT */
   const int cplxFactor = ( (flags & PFFASTCONV_CPLX_INP_OUT) && (flags & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 2 : 1;
   /* input length counted in floats for the single-FFT complex mode */
   const int inputLen = cplxFactor * cplxInputLen;
   int inpOff, procLen, numOut = 0, j, part, cplxOff;
   /* derivation of the loop bound maxOff used below
    *
    * applyFlush != 0:
    *     inputLen - inpOff -filterLen + 1 > 0
    * <=> inputLen -filterLen + 1 > inpOff
    * <=> inpOff < inputLen -filterLen + 1
    *
    * applyFlush == 0:
    *     inputLen - inpOff >= Nfft
    * <=> inputLen - Nfft >= inpOff
    * <=> inpOff <= inputLen - Nfft
    * <=> inpOff < inputLen - Nfft + 1
    */
   if ( cplxFactor == 2 )
   {
     /* complex data, one FFT over the interleaved floats; all offsets and
      * lengths in this branch count floats */
     const int maxOff = applyFlush ? (inputLen -filterLen + 1) : (inputLen - Nfft + 1);
 #if 0
     printf( "*** inputLen %d, filterLen %d, Nfft %d => maxOff %d\n", inputLen, filterLen, Nfft, maxOff);
 #endif
     for ( inpOff = 0; inpOff < maxOff; inpOff += numOut )
     {
       procLen = ( (inputLen - inpOff) >= Nfft ) ? Nfft : (inputLen - inpOff);
       /* clear bit 0: the number of produced floats must be even */
       numOut = ( procLen - filterLen + 1 ) & ( ~1 );
       /* nothing more can be produced; also prevents an endless loop */
       if (!numOut)
         break;
 #if 0
       if (!inpOff)
         printf("*** inpOff = %d, numOut = %d\n", inpOff, numOut);
       if (inpOff + filterLen + 2 >= maxOff )
         printf("*** inpOff = %d, inpOff + numOut = %d\n", inpOff, inpOff + numOut);
 #endif
       if ( flags & PFFASTCONV_DIRECT_INP )
       {
         /* transform straight from the caller's buffer: always reads Nfft floats */
         pffft_transform(s->st, X + inpOff, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
       }
       else
       {
         /* copy into the internal buffer and zero-pad a short last block */
         memcpy( s->Xt, X + inpOff, (unsigned)procLen * sizeof(float) );
         if ( procLen < Nfft )
           memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
         pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
       }
       /* Mf = Xf * Hf * scale in the frequency domain */
       pffft_zconvolve_no_accu(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale);
       if ( flags & PFFASTCONV_DIRECT_OUT )
       {
         /* transform straight into the caller's buffer: always writes Nfft floats */
         pffft_transform(s->st, s->Mf, Y + inpOff, s->Xf, PFFFT_BACKWARD);
       }
       else
       {
         /* only the first numOut floats of the block are handed out */
         pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
         memcpy( Y + inpOff, s->Xf, (unsigned)numOut * sizeof(float) );
       }
     }
     /* convert the float offset back to complex samples */
     return inpOff / cplxFactor;
   }
   else
   {
     /* real data, or complex data processed as two real passes (re, im);
      * offsets and lengths in this branch count samples */
     const int maxOff = applyFlush ? (inputLen -filterLen + 1) : (inputLen - Nfft + 1);
     const int numParts = (flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1;
     for ( inpOff = 0; inpOff < maxOff; inpOff += numOut )
     {
       procLen = ( (inputLen - inpOff) >= Nfft ) ? Nfft : (inputLen - inpOff);
       numOut = procLen - filterLen + 1;
       for ( part = 0; part < numParts; ++part )  /* iterate per real/imag component */
       {
         if ( flags & PFFASTCONV_CPLX_INP_OUT )
         {
           /* de-interleave component 'part' into Xt; the DIRECT_INP flag
            * is not looked at on this path */
           cplxOff = 2 * inpOff + part;
           for ( j = 0; j < procLen; ++j )
             s->Xt[j] = X[cplxOff + 2 * j];
           if ( procLen < Nfft )
             memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
           pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
         }
         else if ( flags & PFFASTCONV_DIRECT_INP )
         {
           /* transform straight from the caller's buffer: always reads Nfft floats */
           pffft_transform(s->st, X + inpOff, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
         }
         else
         {
           /* copy into the internal buffer and zero-pad a short last block */
           memcpy( s->Xt, X + inpOff, (unsigned)procLen * sizeof(float) );
           if ( procLen < Nfft )
             memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
           pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
         }
         /* Mf = Xf * Hf * scale in the frequency domain */
         pffft_zconvolve_no_accu(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale);
         if ( flags & PFFASTCONV_CPLX_INP_OUT )
         {
           /* re-interleave the numOut results of component 'part'; the
            * DIRECT_OUT flag is not looked at on this path */
           pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
           cplxOff = 2 * inpOff + part;
           for ( j = 0; j < numOut; ++j )
             Y[ cplxOff + 2 * j ] = s->Xf[j];
         }
         else if ( flags & PFFASTCONV_DIRECT_OUT )
         {
           /* transform straight into the caller's buffer: always writes Nfft floats */
           pffft_transform(s->st, s->Mf, Y + inpOff, s->Xf, PFFFT_BACKWARD);
         }
         else
         {
           /* s->Xt is the tmp buffer here; it is NULL when DIRECT_INP is set
            * for real data. NOTE(review): confirm pffft_transform() accepts
            * a NULL work buffer. */
           pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
           memcpy( Y + inpOff, s->Xf, (unsigned)numOut * sizeof(float) );
         }
       }
     }
     return inpOff;
   }
 }
--- a/pffft/pffastconv.h
+++ b/pffft/pffastconv.h
@@ -0,0 +1,171 @@
 /* Copyright (c) 2019  Hayati Ayguen ( h_ayguen@web.de )
   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:
   - Neither the names of PFFFT, PFFASTCONV, nor the names of its
   sponsors or contributors may be used to endorse or promote products
   derived from this Software without specific prior written permission.  
   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.
   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
 */
 /*
   PFFASTCONV : a Pretty Fast Fast Convolution
   This is basically the implementation of fast convolution,
   utilizing the FFT (pffft).
   Restrictions: 
   - 1D transforms only, with 32-bit single precision.
   - all (float*) pointers in the functions below are expected to
   have a "simd-compatible" alignment, that is 16 bytes on x86 and
   powerpc CPUs.
   You can allocate such buffers with the functions
   pffft_aligned_malloc / pffft_aligned_free (or with stuff like
   posix_memalign..)
 */
 #ifndef PFFASTCONV_H
 #define PFFASTCONV_H
 #include <stddef.h> /* for size_t */
 #include "pffft.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
  /* opaque struct holding the internal state of one convolution setup.
     this struct can't be shared by many threads as it contains
     temporary data, computed within the convolution.
     create it with pffastconv_new_setup() and release it with
     pffastconv_destroy_setup().
  */
  typedef struct PFFASTCONV_Setup PFFASTCONV_Setup;
  /* option bits for the 'flags' argument of pffastconv_new_setup();
   * combine them with bitwise OR */
  typedef enum {
    PFFASTCONV_CPLX_INP_OUT = 1,
    /* set when input and output are complex,
     * with real and imag part interleaved in both vectors.
     * input[] has inputLen complex values: 2 * inputLen floats,
     * output[] is also written with complex values.
     * without this flag, the input is interpreted as real vector
     */
    PFFASTCONV_CPLX_FILTER = 2,
    /* set when filterCoeffs is complex,
     * with real and imag part interleaved.
     * filterCoeffs[] has filterLen complex values: 2 * filterLen floats
     * without this flag, the filter is interpreted as real vector
     * ATTENTION: this is not implemented yet!
     * pffastconv_new_setup() returns NULL when this flag is set.
     */
    PFFASTCONV_DIRECT_INP = 4,
    /* set PFFASTCONV_DIRECT_INP only, when following conditions are met:
     * 1- input vector X must be aligned
     * 2- (all) inputLen <= output blockLen
     * 3- X must have minimum length of output BlockLen
     * 4- the additional samples from inputLen .. BlockLen-1
     *   must contain valid small and non-NAN samples (ideally zero)
     *
     * this option is ignored when PFFASTCONV_CPLX_INP_OUT is set
     * NOTE(review): in combination with PFFASTCONV_CPLX_SINGLE_FFT the
     * implementation in pffastconv.c does evaluate this flag - confirm
     * which behavior is intended
     */
    PFFASTCONV_DIRECT_OUT = 8,
    /* set PFFASTCONV_DIRECT_OUT only when following conditions are met:
     * 1- output vector Y must be aligned
     * 2- (all) inputLen <= output blockLen
     * 3- Y must have minimum length of output blockLen
     *
     * this option is ignored when PFFASTCONV_CPLX_INP_OUT is set
     * NOTE(review): in combination with PFFASTCONV_CPLX_SINGLE_FFT the
     * implementation in pffastconv.c does evaluate this flag - confirm
     * which behavior is intended
     */
    PFFASTCONV_CPLX_SINGLE_FFT = 16,
    /* hint to process complex data with one single FFT;
     * default is to use 2 FFTs: one for real part, one for imag part
     * */
    PFFASTCONV_SYMMETRIC = 32,
    /* informational only: the filter is symmetric .. and filterLen is a multiple of 8 */
    PFFASTCONV_CORRELATION = 64,
    /* filterCoeffs[] of pffastconv_new_setup are for correlation;
     * thus, do not flip them for the internal fft calculation
     * - as necessary for the fast convolution */
  } pffastconv_flags_t;
  /*
    prepare for performing fast convolution(s) of 'filterLen' with input 'blockLen'.
    'blockLen' is an in/out parameter: on return it holds the block length
    that is actually used, which might be bigger than the requested one
    to allow the fast convolution.
    'flags' are bitmask over the 'pffastconv_flags_t' enum.
    returns NULL when PFFASTCONV_CPLX_FILTER is set in 'flags'
    (complex filters are not implemented).
    PFFASTCONV_Setup structure can't be shared across multiple filters
    or concurrent threads.
    pffastconv_destroy_setup() releases the setup; a NULL pointer is ignored.
  */
  PFFASTCONV_Setup * pffastconv_new_setup( const float * filterCoeffs, int filterLen, int * blockLen, int flags );
  void pffastconv_destroy_setup(PFFASTCONV_Setup *);
  /*
     Perform the fast convolution.
     'input' and 'output' don't need to be aligned - unless any of
     PFFASTCONV_DIRECT_INP or PFFASTCONV_DIRECT_OUT is set in 'flags'.
     inputLen > output 'blockLen' (from pffastconv_new_setup()) is allowed.
     in this case, multiple FFTs are called internally, to process the
     input[].
     with PFFASTCONV_CPLX_INP_OUT, 'inputLen' and the return value count
     complex samples (2 floats each).
     'output' vector must have size >= (inputLen - filterLen + 1)
     set bool option 'applyFlush' to process the full input[].
     with this option, 'tail samples' of input are also processed.
     This might be inefficient, because the FFT is called to produce
     few(er) output samples, than possible.
     This option is useful to process the last samples of an input (file)
     or to reduce latency.
     return value is the number of produced samples in output[].
     the same amount of samples is processed from input[]. to continue
     processing, the caller must save/move the remaining samples of
     input[].
  */
  int pffastconv_apply(PFFASTCONV_Setup * s, const float *input, int inputLen, float *output, int applyFlush);
  /* allocation helpers: thin wrappers around pffft_aligned_malloc() and
     pffft_aligned_free(); memory from pffastconv_malloc() must be released
     with pffastconv_free() */
  void *pffastconv_malloc(size_t nb_bytes);
  void pffastconv_free(void *);
  /* return 4 or 1 depending on whether support for SSE/Altivec instructions was enabled when building pffft.c */
  int pffastconv_simd_size();
 #ifdef __cplusplus
 }
 #endif
 #endif /* PFFASTCONV_H */
--- a/pffft/pffft.c
+++ b/pffft/pffft.c
@@ -0,0 +1,134 @@
 /* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
   Copyright (c) 2020  Hayati Ayguen ( h_ayguen@web.de )
   Based on original fortran 77 code from FFTPACKv4 from NETLIB
   (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
   of NCAR, in 1985.
   As confirmed by the NCAR fftpack software curators, the following
   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
   released under the same terms.
   FFTPACK license:
   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
   Copyright (c) 2004 the University Corporation for Atmospheric
   Research ("UCAR"). All rights reserved. Developed by NCAR's
   Computational and Information Systems Laboratory, UCAR,
   www.cisl.ucar.edu.
   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:
   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.  
   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.
   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
   PFFFT : a Pretty Fast FFT.
   This file is largely based on the original FFTPACK implementation, modified in
   order to take advantage of SIMD instructions of modern CPUs.
 */
 /*
  ChangeLog: 
  - 2011/10/02, version 1: This is the very first release of this file.
 */
 #include "pffft.h"
 /* detect compiler flavour */
 #if defined(_MSC_VER)
 #  define COMPILER_MSVC
 #elif defined(__GNUC__)
 #  define COMPILER_GCC
 #endif
 #include <stdlib.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <math.h>
 #include <assert.h>
 /* compiler specific helper macros:
  *  ALWAYS_INLINE(type) / NEVER_INLINE(type): wrap the return type of a
  *    function definition to force / forbid inlining
  *  RESTRICT: spelling of the 'restrict' pointer qualifier
  *  VLA_ARRAY_ON_STACK(type, name, size): declare 'name' as stack storage
  *    for 'size' elements - a variable length array with GCC, an _alloca()
  *    pointer with MSVC. Note that only the GCC variant carries its own
  *    trailing semicolon. */
 #if defined(COMPILER_GCC)
 #  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
 #  define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
 #  define RESTRICT __restrict
 #  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
 #elif defined(COMPILER_MSVC)
 #  define ALWAYS_INLINE(return_type) __forceinline return_type
 #  define NEVER_INLINE(return_type) __declspec(noinline) return_type
 #  define RESTRICT __restrict
 /* size__ is parenthesized: an expression argument like 'a+b' must be
  * multiplied with sizeof() as a whole */
 #  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca((size__) * sizeof(type__))
 #endif
 #ifdef COMPILER_MSVC
 #pragma warning( disable : 4244 4305 4204 4456 )
 #endif
 /*
   vector support macros: the rest of the code is independent of
   SSE/Altivec/NEON -- adding support for other platforms with 4-element
   vectors should be limited to these macros
 */
 #include "simd/pf_float.h"
 /* Name mapping for the single precision (float) flavour of the library:
  * generic SETUP_STRUCT / FUNC_* names are mapped onto the pffft_* symbols
  * and onto the float math functions cosf/sinf.
  * NOTE(review): pffft_priv_impl.h, included right after this list, is
  * presumably written against these generic names so that the same code
  * can be compared with / reused for other flavours - confirm there. */
 #define SETUP_STRUCT               PFFFT_Setup
 #define FUNC_NEW_SETUP             pffft_new_setup
 #define FUNC_DESTROY               pffft_destroy_setup
 #define FUNC_TRANSFORM_UNORDRD     pffft_transform
 #define FUNC_TRANSFORM_ORDERED     pffft_transform_ordered
 #define FUNC_ZREORDER              pffft_zreorder
 #define FUNC_ZCONVOLVE_ACCUMULATE  pffft_zconvolve_accumulate
 #define FUNC_ZCONVOLVE_NO_ACCU     pffft_zconvolve_no_accu
 #define FUNC_ALIGNED_MALLOC        pffft_aligned_malloc
 #define FUNC_ALIGNED_FREE          pffft_aligned_free
 #define FUNC_SIMD_SIZE             pffft_simd_size
 #define FUNC_MIN_FFT_SIZE          pffft_min_fft_size
 #define FUNC_IS_VALID_SIZE         pffft_is_valid_size
 #define FUNC_NEAREST_SIZE          pffft_nearest_transform_size
 #define FUNC_SIMD_ARCH             pffft_simd_arch
 #define FUNC_VALIDATE_SIMD_A       validate_pffft_simd
 #define FUNC_VALIDATE_SIMD_EX      validate_pffft_simd_ex
 #define FUNC_CPLX_FINALIZE         pffft_cplx_finalize
 #define FUNC_CPLX_PREPROCESS       pffft_cplx_preprocess
 #define FUNC_REAL_PREPROCESS_4X4   pffft_real_preprocess_4x4
 #define FUNC_REAL_PREPROCESS       pffft_real_preprocess
 #define FUNC_REAL_FINALIZE_4X4     pffft_real_finalize_4x4
 #define FUNC_REAL_FINALIZE         pffft_real_finalize
 #define FUNC_TRANSFORM_INTERNAL    pffft_transform_internal
 /* single precision trig functions */
 #define FUNC_COS  cosf
 #define FUNC_SIN  sinf
 #include "pffft_priv_impl.h"
--- a/pffft/pffft.h
+++ b/pffft/pffft.h
@@ -0,0 +1,241 @@
 /* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com ) 
   Based on original fortran 77 code from FFTPACKv4 from NETLIB,
   authored by Dr Paul Swarztrauber of NCAR, in 1985.
   As confirmed by the NCAR fftpack software curators, the following
   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
   released under the same terms.
   FFTPACK license:
   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
   Copyright (c) 2004 the University Corporation for Atmospheric
   Research ("UCAR"). All rights reserved. Developed by NCAR's
   Computational and Information Systems Laboratory, UCAR,
   www.cisl.ucar.edu.
   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:
   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.  
   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.
   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
 */
 /*
   PFFFT : a Pretty Fast FFT.
   This is basically an adaptation of the single precision fftpack
   (v4) as found on netlib taking advantage of SIMD instruction found
   on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
   For architectures where no SIMD instruction is available, the code
   falls back to a scalar version.  
   Restrictions: 
   - 1D transforms only, with 32-bit single precision.
   - supports only transforms for inputs of length N of the form
   N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
   144, 160, etc are all acceptable lengths). Performance is best for
   128<=N<=8192.
   - all (float*) pointers in the functions below are expected to
   have an "simd-compatible" alignment, that is 16 bytes on x86 and
   powerpc CPUs.
   You can allocate such buffers with the functions
   pffft_aligned_malloc / pffft_aligned_free (or with stuff like
   posix_memalign..)
 */
 #ifndef PFFFT_H
 #define PFFFT_H
 #include <stddef.h> /* for size_t */
 #ifdef __cplusplus
 extern "C" {
 #endif
  /* opaque struct holding internal stuff (precomputed twiddle factors)
     this struct can be shared by many threads as it contains only
     read-only data.  
  */
  typedef struct PFFFT_Setup PFFFT_Setup;
 #ifndef PFFFT_COMMON_ENUMS
 #define PFFFT_COMMON_ENUMS
  /* direction of the transform */
  typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
  /* type of transform */
  typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
 #endif
  /*
    prepare for performing transforms of size N -- the returned
    PFFFT_Setup structure is read-only so it can safely be shared by
    multiple concurrent threads. 
  */
  PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
  void pffft_destroy_setup(PFFFT_Setup *);
  /* 
     Perform a Fourier transform. The z-domain data is stored in the
     most efficient order for transforming it back, or using it for
     convolution. If you need to have its content sorted in the
     "usual" way, that is as an array of interleaved complex numbers,
     either use pffft_transform_ordered, or call pffft_zreorder after
     the forward fft, and before the backward fft.
     Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
     Typically you will want to scale the backward transform by 1/N.
     The 'work' pointer should point to an area of N (2*N for complex
     fft) floats, properly aligned. If 'work' is NULL, then stack will
     be used instead (this is probably the best strategy for small
     FFTs, say for N < 16384). Threads usually have a small stack, so
     there may not be enough memory there, usually leading to a crash!
     Use the heap with pffft_aligned_malloc() in this case.
     For a real forward transform (PFFFT_REAL | PFFFT_FORWARD) with real
     input with input(=transformation) length N, the output array is
     'mostly' complex:
       index k in 1 .. N/2 -1  corresponds to frequency k * Samplerate / N
       index k == 0 is a special case:
         the real() part contains the result for the DC frequency 0,
         the imag() part contains the result for the Nyquist frequency Samplerate/2
     both 0-frequency and half frequency components, which are real,
     are assembled in the first entry as  F(0)+i*F(N/2).
     With the output size being N/2 complex values (=N real/imag values),
     the results for negative frequencies are not output, because they
     follow from symmetry.
     input and output may alias.
  */
  void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
  /* 
     Similar to pffft_transform, but makes sure that the output is
     ordered as expected (interleaved complex numbers).  This is
     similar to calling pffft_transform and then pffft_zreorder.
     input and output may alias.
  */
  void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
  /* 
     call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
     PFFFT_FORWARD) if you want to have the frequency components in
     the correct "canonical" order, as interleaved complex numbers.
     (for real transforms, both 0-frequency and half frequency
     components, which are real, are assembled in the first entry as
     F(0)+i*F(n/2). Note that the original fftpack did place
     F(n/2) at the end of the arrays).
     input and output should not alias.
  */
  void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
  /* 
     Perform a multiplication of the frequency components of dft_a and
     dft_b and accumulate them into dft_ab. The arrays should have
     been obtained with pffft_transform(.., PFFFT_FORWARD) and should
     *not* have been reordered with pffft_zreorder (otherwise just
     perform the operation yourself as the dft coefs are stored as
     interleaved complex numbers).
     the operation performed is: dft_ab += (dft_a * dft_b)*scaling
     The dft_a, dft_b and dft_ab pointers may alias.
  */
  void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
  /* 
     Perform a multiplication of the frequency components of dft_a and
     dft_b and put result in dft_ab. The arrays should have
     been obtained with pffft_transform(.., PFFFT_FORWARD) and should
     *not* have been reordered with pffft_zreorder (otherwise just
     perform the operation yourself as the dft coefs are stored as
     interleaved complex numbers).
     the operation performed is: dft_ab = (dft_a * dft_b)*scaling
     The dft_a, dft_b and dft_ab pointers may alias.
  */
  void pffft_zconvolve_no_accu(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
  /* return 4 or 1, depending on whether support for SSE/NEON/Altivec
     instructions was enabled when building pffft.c.
     Declared with (void): in C before C23 an empty parameter list is not a
     prototype, so calls with stray arguments would go unchecked. */
  int pffft_simd_size(void);
  /* return string identifier of used architecture (SSE/NEON/Altivec/..) */
  const char * pffft_simd_arch(void);
  /* following functions are identical to the pffftd_ functions */
  /* simple helper to get minimum possible fft size */
  int pffft_min_fft_size(pffft_transform_t transform);
  /* simple helper to determine next power of 2
     - without inexact/rounding floating point operations
  */
  int pffft_next_power_of_two(int N);
  /* simple helper to determine if power of 2 - returns bool */
  int pffft_is_power_of_two(int N);
  /* simple helper to determine size N is valid
     - factorizable to pffft_min_fft_size() with factors 2, 3, 5
     returns bool
  */
  int pffft_is_valid_size(int N, pffft_transform_t cplx);
  /* determine nearest valid transform size  (by brute-force testing)
     - factorizable to pffft_min_fft_size() with factors 2, 3, 5.
     higher: bool-flag to find nearest higher value; else lower.
  */
  int pffft_nearest_transform_size(int N, pffft_transform_t cplx, int higher);
  /*
    the float buffers must have the correct alignment (16-byte boundary
    on intel and powerpc). This function may be used to obtain such
    correctly aligned buffers.  
  */
  void *pffft_aligned_malloc(size_t nb_bytes);
  void pffft_aligned_free(void *);
 #ifdef __cplusplus
 }
 #endif
 #endif /* PFFFT_H */
--- a/pffft/pffft.hpp
+++ b/pffft/pffft.hpp
--- a/pffft/pffft_common.c
+++ b/pffft/pffft_common.c
@@ -0,0 +1,53 @@
 #include "pffft.h"
 #include <stdlib.h>
 /* SSE and co like 16-bytes aligned pointers
 * with a 64-byte alignment, we are even aligned on L2 cache lines... */
 #define MALLOC_V4SF_ALIGNMENT 64
 /* Allocate nb_bytes of storage whose address is a multiple of
    MALLOC_V4SF_ALIGNMENT.
    Returns a null pointer when malloc() fails or when the padded request
    size cannot be represented in a size_t.
    The block is over-allocated by MALLOC_V4SF_ALIGNMENT bytes; the pointer
    malloc() returned is stored in the word just below the aligned address
    so that Valigned_free() can recover it. */
 static void * Valigned_malloc(size_t nb_bytes) {
  void *p, *p0;
  /* nb_bytes + MALLOC_V4SF_ALIGNMENT must not wrap around: a wrapped sum
     would allocate a tiny block for a huge request */
  if (nb_bytes > (size_t) -1 - MALLOC_V4SF_ALIGNMENT) return (void *) 0;
  p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
  if (!p0) return (void *) 0;
  /* round up to the next alignment boundary strictly above p0; since
     malloc() results are themselves suitably aligned, this leaves at
     least sizeof(void *) bytes below p for the back-pointer */
  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
  *((void **) p - 1) = p0;
  return p;
 }
 /* Release a block obtained from Valigned_malloc(); a null pointer is
    accepted and ignored. */
 static void Valigned_free(void *p) {
  void **slot;
  if (!p) return;
  /* the word directly below the aligned block holds the address that
     malloc() originally returned */
  slot = (void **) p - 1;
  free(*slot);
 }
 /* Round N up to a power of two (values that already are a power of two
    are returned unchanged). N == 0 yields 0.
    Works on the low 32 bits only, using pure integer arithmetic. */
 static int next_power_of_two(int N) {
  /* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */
  unsigned rounded = (unsigned) N - 1u;
  unsigned shift;
  /* smear the highest set bit into every lower position:
     shifts of 1, 2, 4, 8 and 16 cover a 32-bit value */
  for (shift = 1u; shift <= 16u; shift <<= 1)
    rounded |= rounded >> shift;
  /* all-ones below the top bit, plus one, is the next power of two */
  return rounded + 1u;
 }
 /* Return 1 when N is a positive power of two, 0 otherwise.
    Zero and negative values are never powers of two. */
 static int is_power_of_two(int N) {
  /* https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2 */
  /* the N > 0 guard comes first: it keeps N - 1 from overflowing for
     INT_MIN, which would otherwise be undefined behaviour and, on
     wrapping targets, report INT_MIN as a power of two */
  return N > 0 && !(N & (N - 1));
 }
 /* Public entry points of the single-precision (pffft_) API; each one
    forwards to the file-local helper of the same purpose. */
 void *pffft_aligned_malloc(size_t nb_bytes) {
  return Valigned_malloc(nb_bytes);
 }
 void pffft_aligned_free(void *p) {
  Valigned_free(p);
 }
 int pffft_next_power_of_two(int N) {
  return next_power_of_two(N);
 }
 int pffft_is_power_of_two(int N) {
  return is_power_of_two(N);
 }
 /* Double-precision (pffftd_) aliases: they forward to exactly the same
    helpers as the pffft_ functions above. */
 void *pffftd_aligned_malloc(size_t nb_bytes) {
  return Valigned_malloc(nb_bytes);
 }
 void pffftd_aligned_free(void *p) {
  Valigned_free(p);
 }
 int pffftd_next_power_of_two(int N) {
  return next_power_of_two(N);
 }
 int pffftd_is_power_of_two(int N) {
  return is_power_of_two(N);
 }
--- a/pffft/pffft_double.c
+++ b/pffft/pffft_double.c
@@ -0,0 +1,147 @@
 /* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
   Copyright (c) 2020  Hayati Ayguen ( h_ayguen@web.de )
   Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
   Based on original fortran 77 code from FFTPACKv4 from NETLIB
   (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
   of NCAR, in 1985.
   As confirmed by the NCAR fftpack software curators, the following
   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
   released under the same terms.
   FFTPACK license:
   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
   Copyright (c) 2004 the University Corporation for Atmospheric
   Research ("UCAR"). All rights reserved. Developed by NCAR's
   Computational and Information Systems Laboratory, UCAR,
   www.cisl.ucar.edu.
   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:
   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.  
   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.
   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
   PFFFT : a Pretty Fast FFT.
   This file is largely based on the original FFTPACK implementation, modified in
   order to take advantage of SIMD instructions of modern CPUs.
 */
 /*
   NOTE: This file is adapted from Julien Pommier's original PFFFT,
   which works on 32 bit floating point precision using SSE instructions,
   to work with 64 bit floating point precision using AVX instructions.
   Author: Dario Mambro @ https://github.com/unevens/pffft
 */
 #include "pffft_double.h"
 /* detect compiler flavour: exactly one of COMPILER_MSVC / COMPILER_GCC is
    defined when a known compiler is recognised; neither is defined
    otherwise */
 #if defined(_MSC_VER)
 #  define COMPILER_MSVC
 #elif defined(__GNUC__)
 #  define COMPILER_GCC
 #endif
 /* pull in the header that declares the stack allocator:
    <malloc.h> for MSVC and MinGW, <alloca.h> everywhere else */
 #ifdef COMPILER_MSVC
 /* must be defined before <math.h> is included further down; on MSVC it
    enables the M_PI-style constants */
 #  define _USE_MATH_DEFINES
 #  include <malloc.h>
 #elif defined(__MINGW32__) || defined(__MINGW64__)
 #  include <malloc.h>
 #else
 #  include <alloca.h>
 #endif
 #include <stdlib.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <math.h>
 #include <assert.h>
 /* Compiler-specific spellings of the inlining hints, the restrict
    qualifier and a "variable-length array on the stack" declaration.
    NOTE(review): nothing is defined here when neither COMPILER_GCC nor
    COMPILER_MSVC is set, so any use of these macros would then fail to
    compile. */
 #if defined(COMPILER_GCC)
 #  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
 #  define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
 #  define RESTRICT __restrict
 /* declares a real VLA; note that this variant already ends in ';' while
    the MSVC variant below does not */
 #  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
 #elif defined(COMPILER_MSVC)
 #  define ALWAYS_INLINE(return_type) __forceinline return_type
 #  define NEVER_INLINE(return_type) __declspec(noinline) return_type
 #  define RESTRICT __restrict
 /* no VLAs here: declares a pointer to size__ elements obtained from
    _alloca() instead */
 #  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
 #endif
 /* silence MSVC warnings C4244, C4305, C4204 and C4456 for the rest of
    this translation unit */
 #ifdef COMPILER_MSVC
 #pragma warning( disable : 4244 4305 4204 4456 )
 #endif
 /*
   vector support macros: the rest of the code is independent of
   AVX -- adding support for other platforms with 4-element
   vectors should be limited to these macros
 */
 #include "simd/pf_double.h"
 /* Name bindings for the double-precision build.
    Each generic SETUP_STRUCT / FUNC_* name below is mapped to its
    pffftd_ identifier before "pffft_priv_impl.h" is included at the end
    of this list -- presumably that header is written purely in terms of
    these generic names (it is not visible here; confirm there). */
 /* from here on the preprocessor replaces every 'float' token by
    'double', including inside the implementation header included below */
 #define float double
 /* setup type and the entry points declared in pffft_double.h */
 #define SETUP_STRUCT               PFFFTD_Setup
 #define FUNC_NEW_SETUP             pffftd_new_setup
 #define FUNC_DESTROY               pffftd_destroy_setup
 #define FUNC_TRANSFORM_UNORDRD     pffftd_transform
 #define FUNC_TRANSFORM_ORDERED     pffftd_transform_ordered
 #define FUNC_ZREORDER              pffftd_zreorder
 #define FUNC_ZCONVOLVE_ACCUMULATE  pffftd_zconvolve_accumulate
 #define FUNC_ZCONVOLVE_NO_ACCU     pffftd_zconvolve_no_accu
 #define FUNC_ALIGNED_MALLOC        pffftd_aligned_malloc
 #define FUNC_ALIGNED_FREE          pffftd_aligned_free
 #define FUNC_SIMD_SIZE             pffftd_simd_size
 #define FUNC_MIN_FFT_SIZE          pffftd_min_fft_size
 #define FUNC_IS_VALID_SIZE         pffftd_is_valid_size
 #define FUNC_NEAREST_SIZE          pffftd_nearest_transform_size
 #define FUNC_SIMD_ARCH             pffftd_simd_arch
 /* names that are not declared in pffft_double.h -- presumably SIMD
    self-test hooks and internal transform stages; verify against
    pffft_priv_impl.h */
 #define FUNC_VALIDATE_SIMD_A       validate_pffftd_simd
 #define FUNC_VALIDATE_SIMD_EX      validate_pffftd_simd_ex
 #define FUNC_CPLX_FINALIZE         pffftd_cplx_finalize
 #define FUNC_CPLX_PREPROCESS       pffftd_cplx_preprocess
 #define FUNC_REAL_PREPROCESS_4X4   pffftd_real_preprocess_4x4
 #define FUNC_REAL_PREPROCESS       pffftd_real_preprocess
 #define FUNC_REAL_FINALIZE_4X4     pffftd_real_finalize_4x4
 #define FUNC_REAL_FINALIZE         pffftd_real_finalize
 #define FUNC_TRANSFORM_INTERNAL    pffftd_transform_internal
 /* double-precision libm trigonometric functions */
 #define FUNC_COS  cos
 #define FUNC_SIN  sin
 /* must stay last: it is included after all of the bindings above */
 #include "pffft_priv_impl.h"
--- a/pffft/pffft_double.h
+++ b/pffft/pffft_double.h
@@ -0,0 +1,236 @@
 /* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com ) 
   Based on original fortran 77 code from FFTPACKv4 from NETLIB,
   authored by Dr Paul Swarztrauber of NCAR, in 1985.
   As confirmed by the NCAR fftpack software curators, the following
   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
   released under the same terms.
   FFTPACK license:
   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
   Copyright (c) 2004 the University Corporation for Atmospheric
   Research ("UCAR"). All rights reserved. Developed by NCAR's
   Computational and Information Systems Laboratory, UCAR,
   www.cisl.ucar.edu.
   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:
   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.  
   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.
   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
 */
 /*
   NOTE: This file is adapted from Julien Pommier's original PFFFT,
   which works on 32 bit floating point precision using SSE instructions,
   to work with 64 bit floating point precision using AVX instructions.
   Author: Dario Mambro @ https://github.com/unevens/pffft
 */
 /*
   PFFFT : a Pretty Fast FFT.
   This is basically an adaptation of the single precision fftpack
   (v4) as found on netlib taking advantage of SIMD instruction found
   on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
   For architectures where no SIMD instruction is available, the code
   falls back to a scalar version.  
   Restrictions: 
   - 1D transforms only, with 64-bit double precision.
   - supports only transforms for inputs of length N of the form
   N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
   144, 160, etc are all acceptable lengths). Performance is best for
   128<=N<=8192.
   - all (double*) pointers in the functions below are expected to
   have a "simd-compatible" alignment, that is 32 bytes on x86 and
   powerpc CPUs.
   You can allocate such buffers with the functions
   pffftd_aligned_malloc / pffftd_aligned_free (or with stuff like
   posix_memalign..)
 */
 #ifndef PFFFT_DOUBLE_H
 #define PFFFT_DOUBLE_H
 #include <stddef.h> /* for size_t */
 #ifdef __cplusplus
 extern "C" {
 #endif
  /* opaque struct holding internal stuff (precomputed twiddle factors)
     this struct can be shared by many threads as it contains only
     read-only data.  
  */
  typedef struct PFFFTD_Setup PFFFTD_Setup;
 #ifndef PFFFT_COMMON_ENUMS
 #define PFFFT_COMMON_ENUMS
  /* direction of the transform */
  typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
  /* type of transform */
  typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
 #endif
  /*
    prepare for performing transforms of size N -- the returned
    PFFFTD_Setup structure is read-only so it can safely be shared by
    multiple concurrent threads. 
  */
  PFFFTD_Setup *pffftd_new_setup(int N, pffft_transform_t transform);
  void pffftd_destroy_setup(PFFFTD_Setup *);
  /* 
     Perform a Fourier transform. The z-domain data is stored in the
     most efficient order for transforming it back, or using it for
     convolution. If you need to have its content sorted in the
     "usual" way, that is as an array of interleaved complex numbers,
     either use pffftd_transform_ordered, or call pffftd_zreorder after
     the forward fft, and before the backward fft.
     Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
     Typically you will want to scale the backward transform by 1/N.
     The 'work' pointer should point to an area of N (2*N for complex
     fft) doubles, properly aligned. If 'work' is NULL, then stack will
     be used instead (this is probably the best strategy for small
     FFTs, say for N < 16384). Threads usually have a small stack, so
     there may not be enough memory there, usually leading to a crash!
     Use the heap with pffftd_aligned_malloc() in this case.
     input and output may alias.
  */
  void pffftd_transform(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction);
  /* 
     Similar to pffftd_transform, but makes sure that the output is
     ordered as expected (interleaved complex numbers).  This is
     similar to calling pffftd_transform and then pffftd_zreorder.
     input and output may alias.
  */
  void pffftd_transform_ordered(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction);
  /* 
     call pffftd_zreorder(.., PFFFT_FORWARD) after pffftd_transform(...,
     PFFFT_FORWARD) if you want to have the frequency components in
     the correct "canonical" order, as interleaved complex numbers.
     (for real transforms, both 0-frequency and half frequency
     components, which are real, are assembled in the first entry as
     F(0)+i*F(n/2). Note that the original fftpack did place
     F(n/2) at the end of the arrays).
     input and output should not alias.
  */
  void pffftd_zreorder(PFFFTD_Setup *setup, const double *input, double *output, pffft_direction_t direction);
  /* 
     Perform a multiplication of the frequency components of dft_a and
     dft_b and accumulate them into dft_ab. The arrays should have
     been obtained with pffftd_transform(.., PFFFT_FORWARD) and should
     *not* have been reordered with pffftd_zreorder (otherwise just
     perform the operation yourself as the dft coefs are stored as
     interleaved complex numbers).
     the operation performed is: dft_ab += (dft_a * dft_b)*scaling
     The dft_a, dft_b and dft_ab pointers may alias.
  */
  void pffftd_zconvolve_accumulate(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double *dft_ab, double scaling);
  /* 
     Perform a multiplication of the frequency components of dft_a and
     dft_b and put result in dft_ab. The arrays should have
     been obtained with pffftd_transform(.., PFFFT_FORWARD) and should
     *not* have been reordered with pffftd_zreorder (otherwise just
     perform the operation yourself as the dft coefs are stored as
     interleaved complex numbers).
     the operation performed is: dft_ab = (dft_a * dft_b)*scaling
     The dft_a, dft_b and dft_ab pointers may alias.
  */
  void pffftd_zconvolve_no_accu(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double*dft_ab, double scaling);
  /* return 4 or 1, depending on whether support for AVX instructions was
     enabled when building pffft_double.c.
     Declared with (void): in C before C23 an empty parameter list is not a
     prototype, so calls with stray arguments would go unchecked. */
  int pffftd_simd_size(void);
  /* return string identifier of used architecture (AVX/..) */
  const char * pffftd_simd_arch(void);
  /* simple helper to get minimum possible fft size */
  int pffftd_min_fft_size(pffft_transform_t transform);
  /* simple helper to determine whether size N is valid
     - factorizable to pffftd_min_fft_size() with factors 2, 3, 5
  */
  int pffftd_is_valid_size(int N, pffft_transform_t cplx);
  /* determine nearest valid transform size  (by brute-force testing)
     - factorizable to pffftd_min_fft_size() with factors 2, 3, 5.
     higher: bool-flag to find nearest higher value; else lower.
  */
  int pffftd_nearest_transform_size(int N, pffft_transform_t cplx, int higher);
  /* following functions are identical to the pffft_ functions - both declared */
  /* simple helper to determine next power of 2
     - without inexact/rounding floating point operations
  */
  int pffftd_next_power_of_two(int N);
  int pffft_next_power_of_two(int N);
  /* simple helper to determine if power of 2 - returns bool */
  int pffftd_is_power_of_two(int N);
  int pffft_is_power_of_two(int N);
  /*
    the double buffers must have the correct alignment (32-byte boundary
    on intel and powerpc). This function may be used to obtain such
    correctly aligned buffers.  
  */
  void *pffftd_aligned_malloc(size_t nb_bytes);
  void *pffft_aligned_malloc(size_t nb_bytes);
  void pffftd_aligned_free(void *);
  void pffft_aligned_free(void *);
 #ifdef __cplusplus
 }
 #endif
 #endif /* PFFFT_DOUBLE_H */
--- a/pffft/pffft_priv_impl.h
+++ b/pffft/pffft_priv_impl.h
--- a/pffft/plots.sh
+++ b/pffft/plots.sh
@@ -0,0 +1,50 @@
 #!/bin/bash
 # Render one gnuplot chart per benchmark CSV found in the current
 # directory (files named *-4-*.csv and *-6-*.csv).
 # Column 2 of each CSV is used as the x axis and columns 3..LASTCOL are
 # plotted as lines, titled from the CSV header row.
 # OUTPNG="1" writes <basename>.png; any other value opens a persistent
 # gnuplot window instead.
 OUTPNG="1"
 W="1024"
 H="768"
 # only referenced by the disabled line-style loop further down
 PTS="20"
 LWS="20"
 # Expand the globs directly instead of parsing `ls` output, so that file
 # names containing whitespace or glob characters stay intact; nullglob
 # makes an unmatched pattern expand to nothing instead of itself.
 shopt -s nullglob
 for f in *-4-*.csv *-6-*.csv; do
  b=$(basename -- "$f" ".csv")
  #echo $b
  # number of commas in the header row, used as index of the last column
  LASTCOL="$(head -n 1 -- "$f" |sed 's/,/,\n/g' |grep -c ',')"
  echo "${b}: last column is $LASTCOL"
  # pick the Y axis label from the benchmark kind encoded in the file name
  if [[ "$b" == *-1-* ]]; then
    YL="duration in ms; less is better"
  elif [[ "$b" == *-4-* ]]; then
    YL="duration relative to pffft; less is better"
  else
    YL=""
  fi
  # E accumulates the gnuplot commands, separated by ' ; '
  E=""
  if [ "${OUTPNG}" = "1" ]; then
    E="set terminal png size $W,$H"
    E="${E} ; set output '${b}.png'"
  fi
  if [ -z "${E}" ]; then
    E="set key outside"
  else
    E="${E} ; set key outside"
  fi
  E="${E} ; set datafile separator ','"
  E="${E} ; set title '${b}'"
  E="${E} ; set xlabel 'fft order: fft size N = 2\\^order'"
  if [ -n "${YL}" ]; then
    #echo "  setting  Y label to ${YL}"
    E="${E} ; set ylabel '${YL}'"
  fi
  # unfortunately no effect for
  #for LNO in $(seq 1 ${LASTCOL}) ; do
  #  E="${E} ; set style line ${LNO} ps ${PTS} lw ${LWS}"
  #done
  E="${E} ; plot for [col=3:${LASTCOL}] '${f}' using 2:col with lines title columnhead"
  if [ "${OUTPNG}" = "1" ]; then
    gnuplot -e "${E}"
  else
    gnuplot -e "${E}" --persist
  fi
 done
--- a/pffft/simd/pf_altivec_float.h
+++ b/pffft/simd/pf_altivec_float.h
@@ -0,0 +1,81 @@
 /* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:
   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.
   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.
   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
 */
 #ifndef PF_ALTIVEC_FLT_H
 #define PF_ALTIVEC_FLT_H
 /*
   Altivec support macros
   Only active when SIMD is not disabled (PFFFT_SIMD_DISABLE) and the
   compiler targets PowerPC (__ppc__ / __ppc64__); otherwise this header
   defines nothing.
 */
 #if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
 #pragma message( __FILE__ ": ALTIVEC float macros are defined" )
 /* vector of 4 single-precision floats */
 typedef vector float v4sf;
 #  define SIMD_SZ 4
 /* gives element-wise access to the lanes of a v4sf */
 typedef union v4sf_union {
  v4sf  v;
  float f[SIMD_SZ];
 } v4sf_union;
 #  define VREQUIRES_ALIGN 1  /* not sure, if really required */
 #  define VARCH "ALTIVEC"
 #  define VZERO() ((vector float) vec_splat_u8(0))
 /* multiplication is expressed as a multiply-add with a zero addend */
 #  define VMUL(a,b) vec_madd(a,b, VZERO())
 #  define VADD(a,b) vec_add(a,b)
 #  define VMADD(a,b,c) vec_madd(a,b,c)
 #  define VSUB(a,b) vec_sub(a,b)
 /* Load the single float at p and replicate it into all four lanes; p
    does not have to be 16-byte aligned.
    'static inline' rather than plain 'inline': in C99 a plain inline
    definition in a header provides no external definition, so any call
    the compiler decides not to inline would fail to link. */
 static inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
 #  define LD_PS1(p) ld_ps1(&p)
 /* out1 = first two interleaved pairs of in1/in2, out2 = last two; the
    temporary allows out1 to alias one of the inputs */
 #  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
 /* out1 = elements 0 and 2 of in1 followed by elements 0 and 2 of in2,
    out2 = elements 1 and 3 of in1 followed by elements 1 and 3 of in2
    (byte indices 0..15 select from in1, 16..31 from in2) */
 #  define UNINTERLEAVE2(in1, in2, out1, out2) {                           \
    vector unsigned char vperm1 =  (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
    vector unsigned char vperm2 =  (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
  }
 /* in-place transpose of the 4x4 matrix whose rows are x0..x3 */
 #  define VTRANSPOSE4(x0,x1,x2,x3) {              \
    v4sf y0 = vec_mergeh(x0, x2);               \
    v4sf y1 = vec_mergel(x0, x2);               \
    v4sf y2 = vec_mergeh(x1, x3);               \
    v4sf y3 = vec_mergel(x1, x3);               \
    x0 = vec_mergeh(y0, y2);                    \
    x1 = vec_mergel(y0, y2);                    \
    x2 = vec_mergeh(y1, y3);                    \
    x3 = vec_mergel(y1, y3);                    \
  }
 /* result = { b[0], b[1], a[2], a[3] } */
 #  define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
 /* true when ptr is 16-byte aligned; needs uintptr_t from <stdint.h>,
    which the including file is expected to provide */
 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
 #endif
 #endif /* PF_ALTIVEC_FLT_H */
--- a/pffft/simd/pf_avx_double.h
+++ b/pffft/simd/pf_avx_double.h
@@ -0,0 +1,145 @@
 /*
   Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
 */
 /* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:
   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.
   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.
   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
 */
 #ifndef PF_AVX_DBL_H
 #define PF_AVX_DBL_H
 /*
   vector support macros: the rest of the code is independent of
   AVX -- adding support for other platforms with 4-element
   vectors should be limited to these macros
 */
 /*
  AVX support macros
 */
 #if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && defined(__AVX__)
 #pragma message( __FILE__ ": AVX macros are defined" )
 #include <immintrin.h>
 /* SIMD vector type of this backend: one 256-bit AVX register of doubles. */
 typedef __m256d v4sf;
 /* 4 doubles per simd vector */
 #  define SIMD_SZ 4
 /* Overlay exposing the lanes of a v4sf as a plain double array. */
 typedef union v4sf_union {
  v4sf  v;
  double f[SIMD_SZ];
 } v4sf_union;
 /* Backend name as a string. */
 #  define VARCH "AVX"
 #  define VREQUIRES_ALIGN 1
 /* Lane-wise arithmetic; VMADD(a,b,c) is a separate multiply then add. */
 #  define VZERO() _mm256_setzero_pd()
 #  define VMUL(a,b) _mm256_mul_pd(a,b)
 #  define VADD(a,b) _mm256_add_pd(a,b)
 #  define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
 #  define VSUB(a,b) _mm256_sub_pd(a,b)
 /* Broadcast the scalar p to all lanes. */
 #  define LD_PS1(p) _mm256_set1_pd(p)
 /* Load SIMD_SZ doubles from ptr, without / with an alignment requirement. */
 #  define VLOAD_UNALIGNED(ptr)  _mm256_loadu_pd(ptr)
 #  define VLOAD_ALIGNED(ptr)    _mm256_load_pd(ptr)
 /* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
 out1 = [ in1[0], in2[0], in1[1], in2[1] ]
 out2 = [ in1[2], in2[2], in1[3], in2[3] ]

 Each input is split into its low and high 128-bit half; _mm_shuffle_pd with
 selector 0 pairs the first elements of two halves, selector 3 the second
 elements. out1 is assigned last, through tmp__, so the results are built
 from the original inputs even when out1 names one of them.
 */
 #  define INTERLEAVE2(in1, in2, out1, out2) {							\
 	__m128d low1__ = _mm256_castpd256_pd128(in1);						\
 	__m128d low2__ = _mm256_castpd256_pd128(in2);						\
 	__m128d high1__ = _mm256_extractf128_pd(in1, 1);					\
 	__m128d high2__ = _mm256_extractf128_pd(in2, 1);					\
 	__m256d tmp__ = _mm256_insertf128_pd(								\
 		_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)),		\
 		_mm_shuffle_pd(low1__, low2__, 3),								\
 		1);																\
 	out2 = _mm256_insertf128_pd(										\
 		_mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)),	\
 		_mm_shuffle_pd(high1__, high2__, 3),							\
 		1);																\
 	out1 = tmp__;														\
 }
 /* UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
 out1 = [ in1[0], in1[2], in2[0], in2[2] ]
 out2 = [ in1[1], in1[3], in2[1], in2[3] ]

 Here the shuffles pair the low and high half of the SAME input: selector 0
 collects the even-indexed elements, selector 3 the odd-indexed ones.
 out1 is assigned last, through tmp__, so out1 may name one of the inputs.
 */
 #  define UNINTERLEAVE2(in1, in2, out1, out2) {							\
 	__m128d low1__ = _mm256_castpd256_pd128(in1);						\
 	__m128d low2__ = _mm256_castpd256_pd128(in2);						\
 	__m128d high1__ = _mm256_extractf128_pd(in1, 1);					\
 	__m128d high2__ = _mm256_extractf128_pd(in2, 1); 					\
 	__m256d tmp__ = _mm256_insertf128_pd(								\
 		_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)),		\
 		_mm_shuffle_pd(low2__, high2__, 0),								\
 		1);																\
 	out2 = _mm256_insertf128_pd(										\
 		_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)),		\
 		_mm_shuffle_pd(low2__, high2__, 3),								\
 		1);																\
 	out1 = tmp__;														\
 }
 /* VTRANSPOSE4: in-place rewrite of row0..row3 (all must be lvalues).
    Step 1: _mm256_shuffle_pd pairs row0/row1 and row2/row3, with selector 0x0
            into tmp0/tmp1 and selector 0xF into tmp2/tmp3.
    Step 2: _mm256_permute2f128_pd recombines 128-bit halves of those
            temporaries; selector 0x20 feeds row0/row1, 0x31 feeds row2/row3. */
 #  define VTRANSPOSE4(row0, row1, row2, row3) {				\
        __m256d tmp3, tmp2, tmp1, tmp0;                     \
                                                            \
        tmp0 = _mm256_shuffle_pd((row0),(row1), 0x0);       \
        tmp2 = _mm256_shuffle_pd((row0),(row1), 0xF);       \
        tmp1 = _mm256_shuffle_pd((row2),(row3), 0x0);       \
        tmp3 = _mm256_shuffle_pd((row2),(row3), 0xF);       \
                                                            \
        (row0) = _mm256_permute2f128_pd(tmp0, tmp1, 0x20);	\
        (row1) = _mm256_permute2f128_pd(tmp2, tmp3, 0x20);  \
        (row2) = _mm256_permute2f128_pd(tmp0, tmp1, 0x31);  \
        (row3) = _mm256_permute2f128_pd(tmp2, tmp3, 0x31);  \
    }
 /* VSWAPHL(a, b) pseudo code:
 return [ b[0], b[1], a[2], a[3] ]
 (low 128-bit half taken from b, high half taken from a)
 */
 #  define VSWAPHL(a,b)	\
   _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1)
 /* reverse/flip all elements (doubles in this header): the two 128-bit halves
    trade places and each goes through _mm_permute_pd(.., 1) */
 #  define VREV_S(a)    _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_permute_pd(_mm256_extractf128_pd(a, 1),1)), _mm_permute_pd(_mm256_castpd256_pd128(a), 1), 1)
 /* reverse/flip complex values: the two 128-bit halves trade places, the
    order inside each half is kept */
 #  define VREV_C(a)    _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1)
 /* True when ptr is 32-byte aligned (low five address bits are zero). */
 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
 #endif
 #endif /* PF_AVX_DBL_H */
--- a/pffft/simd/pf_double.h
+++ b/pffft/simd/pf_double.h
@@ -0,0 +1,84 @@
 /* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:
   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.
   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.
   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
 */
 #ifndef PF_DBL_H
 #define PF_DBL_H
 #include <assert.h>
 #include <string.h>
 #include <stdint.h>
 /*
 *  SIMD reference material:
 *
 * general SIMD introduction:
 * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
 *
 * SSE 1:
 * https://software.intel.com/sites/landingpage/IntrinsicsGuide/
 *
 * ARM NEON:
 * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
 *
 * Altivec:
 * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
 * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
 * better one?
 *
 */
 /* Scalar element type used by the double-precision build. */
 typedef double vsfscalar;
 /* Candidate SIMD backends. The AVX and NEON headers define SIMD_SZ (and the
    V* macros) only when their own preprocessor conditions hold; presumably
    pf_sse2_double.h does the same -- it is not shown here. */
 #include "pf_avx_double.h"
 #include "pf_sse2_double.h"
 #include "pf_neon_double.h"
 /* No backend defined SIMD_SZ: announce it at compile time and mark SIMD as
    disabled, unless the user had already defined PFFFT_SIMD_DISABLE. */
 #ifndef SIMD_SZ
 #  if !defined(PFFFT_SIMD_DISABLE)
 #    pragma message( "building double with simd disabled !" )
 #    define PFFFT_SIMD_DISABLE /* fallback to scalar code */
 #  endif
 #endif
 /* Scalar fallbacks; they only take effect while SIMD_SZ is still undefined. */
 #include "pf_scalar_double.h"
 /* shortcuts for complex multiplications */
 /* VCPLXMUL: in place (ar + i*ai) *= (br + i*bi).
    VCPLXMULCONJ: in place (ar + i*ai) *= (br - i*bi).
    ar and ai must be assignable lvalues; br and bi are each evaluated twice.
    The scratch variable has a macro-specific name so that an argument which
    happens to be called `tmp` is not captured by the macro's own local. */
 #define VCPLXMUL(ar,ai,br,bi) { v4sf vcplx_tmp__; vcplx_tmp__=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,vcplx_tmp__); }
 #define VCPLXMULCONJ(ar,ai,br,bi) { v4sf vcplx_tmp__; vcplx_tmp__=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,vcplx_tmp__); }
 #ifndef SVMUL
 /* multiply a scalar with a vector (default, used unless a backend
    supplied its own SVMUL) */
 #define SVMUL(f,v) VMUL(LD_PS1(f),v)
 #endif
 #endif /* PF_DBL_H */
--- a/pffft/simd/pf_float.h
+++ b/pffft/simd/pf_float.h
@@ -0,0 +1,84 @@
 /* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:
   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.
   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.
   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
 */
 #ifndef PF_FLT_H
 #define PF_FLT_H
 #include <assert.h>
 #include <string.h>
 #include <stdint.h>
 /*
 *  SIMD reference material:
 *
 * general SIMD introduction:
 * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
 *
 * SSE 1:
 * https://software.intel.com/sites/landingpage/IntrinsicsGuide/
 *
 * ARM NEON:
 * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
 *
 * Altivec:
 * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
 * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
 * better one?
 *
 */
 /* Scalar element type used by the single-precision build. */
 typedef float vsfscalar;
 /* Candidate SIMD backends. The NEON and AltiVec headers define SIMD_SZ (and
    the V* macros) only when their own preprocessor conditions hold;
    presumably pf_sse1_float.h does the same -- it is not shown here. */
 #include "pf_sse1_float.h"
 #include "pf_neon_float.h"
 #include "pf_altivec_float.h"
 /* No backend defined SIMD_SZ: announce it at compile time and mark SIMD as
    disabled, unless the user had already defined PFFFT_SIMD_DISABLE. */
 #ifndef SIMD_SZ
 #  if !defined(PFFFT_SIMD_DISABLE)
 #    pragma message( "building float with simd disabled !" )
 #    define PFFFT_SIMD_DISABLE /* fallback to scalar code */
 #  endif
 #endif
 /* Scalar fallbacks for the float build. */
 #include "pf_scalar_float.h"
 /* shortcuts for complex multiplications */
 /* VCPLXMUL: in place (ar + i*ai) *= (br + i*bi).
    VCPLXMULCONJ: in place (ar + i*ai) *= (br - i*bi).
    ar and ai must be assignable lvalues; br and bi are each evaluated twice.
    The scratch variable has a macro-specific name so that an argument which
    happens to be called `tmp` is not captured by the macro's own local. */
 #define VCPLXMUL(ar,ai,br,bi) { v4sf vcplx_tmp__; vcplx_tmp__=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,vcplx_tmp__); }
 #define VCPLXMULCONJ(ar,ai,br,bi) { v4sf vcplx_tmp__; vcplx_tmp__=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,vcplx_tmp__); }
 #ifndef SVMUL
 /* multiply a scalar with a vector (default, used unless a backend
    supplied its own SVMUL) */
 #define SVMUL(f,v) VMUL(LD_PS1(f),v)
 #endif
 #endif /* PF_FLT_H */
--- a/pffft/simd/pf_neon_double.h
+++ b/pffft/simd/pf_neon_double.h
@@ -0,0 +1,203 @@
 /*
   Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
 */
 /* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:
   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.
   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.
   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
 */
 #ifndef PF_NEON_DBL_H
 #define PF_NEON_DBL_H
 /*
  NEON 64bit support macros
 */
 #if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__))
 #pragma message (__FILE__ ": NEON (from AVX) macros are defined" )
 #include "pf_neon_double_from_avx.h"
 /* SIMD vector type of this backend: the emulated __m256d declared in
    pf_neon_double_from_avx.h (a pair of float64x2_t). */
 typedef __m256d v4sf;
 /* 4 doubles per simd vector */
 #  define SIMD_SZ 4
 /* Overlay exposing the lanes of a v4sf as a plain double array. */
 typedef union v4sf_union {
  v4sf  v;
  double f[SIMD_SZ];
 } v4sf_union;
 /* Backend name as a string. */
 #  define VARCH "NEON"
 #  define VREQUIRES_ALIGN 1
 /* Lane-wise arithmetic through the AVX-named wrappers; VMADD(a,b,c) is a
    separate multiply followed by an add. */
 #  define VZERO() _mm256_setzero_pd()
 #  define VMUL(a,b) _mm256_mul_pd(a,b)
 #  define VADD(a,b) _mm256_add_pd(a,b)
 #  define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
 #  define VSUB(a,b) _mm256_sub_pd(a,b)
 /* Broadcast the scalar p to all lanes. */
 #  define LD_PS1(p) _mm256_set1_pd(p)
 /* Load SIMD_SZ doubles from ptr. */
 #  define VLOAD_UNALIGNED(ptr)  _mm256_loadu_pd(ptr)
 #  define VLOAD_ALIGNED(ptr)    _mm256_load_pd(ptr)
 /* Returns `a` with its upper 128-bit half (vect_f64[1]) replaced by `b`;
    the lower half is passed through unchanged. */
 FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b)
 {
    /* `a` is a by-value copy, so it can be patched and returned directly */
    a.vect_f64[1] = b;
    return a;
 }
 /* Returns [ a[0], b[0] ]: the low element of each argument. */
 FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
 {
    return vcombine_f64(vget_low_f64(a), vget_low_f64(b));
 }
 /* Returns [ a[1], b[1] ]: the high element of each argument. */
 FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b)
 {
    return vcombine_f64(vget_high_f64(a), vget_high_f64(b));
 }
 /* Applies _mm_shuffle_pd_00 to each 128-bit half independently:
    result = [ a[0], b[0], a[2], b[2] ]. */
 FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b)
 {
    __m256d out;
    int half;
    for (half = 0; half < 2; ++half)
        out.vect_f64[half] = _mm_shuffle_pd_00(a.vect_f64[half], b.vect_f64[half]);
    return out;
 }
 /* Applies _mm_shuffle_pd_11 to each 128-bit half independently:
    result = [ a[1], b[1], a[3], b[3] ]. */
 FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b)
 {
    __m256d out;
    int half;
    for (half = 0; half < 2; ++half)
        out.vect_f64[half] = _mm_shuffle_pd_11(a.vect_f64[half], b.vect_f64[half]);
    return out;
 }
 /* Returns [ low half of a, low half of b ]. */
 FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b)
 {
    /* the by-value copy of `a` already carries the wanted lower half */
    a.vect_f64[1] = b.vect_f64[0];
    return a;
 }
 /* Returns [ high half of a, high half of b ]. */
 FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b)
 {
    /* the by-value copy of `b` already carries the wanted upper half */
    b.vect_f64[0] = a.vect_f64[1];
    return b;
 }
 /* Reverses the order of the four doubles:
    [ x[0], x[1], x[2], x[3] ] -> [ x[3], x[2], x[1], x[0] ]. */
 FORCE_INLINE __m256d _mm256_reverse(__m256d x)
 {
    const float64x2_t lower = x.vect_f64[0];
    const float64x2_t upper = x.vect_f64[1];
    __m256d flipped;
    /* new lower half: the old upper half with its two elements swapped */
    flipped.vect_f64[0] = vcombine_f64(vget_high_f64(upper), vget_low_f64(upper));
    /* new upper half: the old lower half with its two elements swapped */
    flipped.vect_f64[1] = vcombine_f64(vget_high_f64(lower), vget_low_f64(lower));
    return flipped;
 }
 /* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
 out1 = [ in1[0], in2[0], in1[1], in2[1] ]
 out2 = [ in1[2], in2[2], in1[3], in2[3] ]

 Each input is split into its low and high 128-bit half; _mm_shuffle_pd_00
 pairs the first elements of two halves, _mm_shuffle_pd_11 the second ones.
 out1 is assigned last, through tmp__, so the results are built from the
 original inputs even when out1 names one of them.
 */
 #  define INTERLEAVE2(in1, in2, out1, out2) {							\
 	__m128d low1__ = _mm256_castpd256_pd128(in1);						\
 	__m128d low2__ = _mm256_castpd256_pd128(in2);						\
 	__m128d high1__ = _mm256_extractf128_pd(in1, 1);					\
 	__m128d high2__ = _mm256_extractf128_pd(in2, 1);					\
 	__m256d tmp__ = _mm256_insertf128_pd_1(								\
 		_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)),		\
 		_mm_shuffle_pd_11(low1__, low2__));								\
 	out2 = _mm256_insertf128_pd_1(										\
 		_mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)),	\
 		_mm_shuffle_pd_11(high1__, high2__));							\
 	out1 = tmp__;														\
 }
 /* UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
 out1 = [ in1[0], in1[2], in2[0], in2[2] ]
 out2 = [ in1[1], in1[3], in2[1], in2[3] ]

 Here the shuffles pair the low and high half of the SAME input:
 _mm_shuffle_pd_00 collects the even-indexed elements, _mm_shuffle_pd_11 the
 odd-indexed ones. out1 is assigned last, through tmp__, so out1 may name
 one of the inputs.
 */
 #  define UNINTERLEAVE2(in1, in2, out1, out2) {							\
 	__m128d low1__ = _mm256_castpd256_pd128(in1);						\
 	__m128d low2__ = _mm256_castpd256_pd128(in2);						\
 	__m128d high1__ = _mm256_extractf128_pd(in1, 1);					\
 	__m128d high2__ = _mm256_extractf128_pd(in2, 1); 					\
 	__m256d tmp__ = _mm256_insertf128_pd_1(								\
 		_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)),		\
 		_mm_shuffle_pd_00(low2__, high2__));							\
 	out2 = _mm256_insertf128_pd_1(										\
 		_mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)),		\
 		_mm_shuffle_pd_11(low2__, high2__));							\
 	out1 = tmp__;														\
 }
 /* VTRANSPOSE4: in-place 4x4 transpose of row0..row3 (all must be lvalues).
    Step 1: the *_shuffle_pd_00 / *_shuffle_pd_11 helpers pair up row0/row1
            and row2/row3 within each 128-bit half.
    Step 2: the permute2f128 helpers gather the low halves (0x20) into
            row0/row1 and the high halves (0x31) into row2/row3. */
 #  define VTRANSPOSE4(row0, row1, row2, row3) {							\
        __m256d tmp3, tmp2, tmp1, tmp0;                     			\
                                                            			\
        tmp0 = _mm256_shuffle_pd_00((row0),(row1));       				\
        tmp2 = _mm256_shuffle_pd_11((row0),(row1));       				\
        tmp1 = _mm256_shuffle_pd_00((row2),(row3));       				\
        tmp3 = _mm256_shuffle_pd_11((row2),(row3));       				\
                                                            			\
        (row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1);			    \
        (row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3); 		        \
        (row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1); 		        \
        (row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3); 		        \
    }
 /* VSWAPHL(a, b) pseudo code:
 return [ b[0], b[1], a[2], a[3] ]
 (low 128-bit half taken from b, high half taken from a)
 */
 #  define VSWAPHL(a,b)	\
   _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1))
 /* reverse/flip all elements (doubles in this header) */
 #  define VREV_S(a)   _mm256_reverse(a)
 /* reverse/flip complex values: the two 128-bit halves trade places, the
    order inside each half is kept */
 #  define VREV_C(a)    _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a))
 /* True when ptr is 32-byte aligned (low five address bits are zero). */
 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
 #endif
 #endif /* PF_NEON_DBL_H */
--- a/pffft/simd/pf_neon_double_from_avx.h
+++ b/pffft/simd/pf_neon_double_from_avx.h
@@ -0,0 +1,123 @@
 /*
 * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 //see https://github.com/kunpengcompute/AvxToNeon
 #ifndef PF_NEON_DBL_FROM_AVX_H
 #define PF_NEON_DBL_FROM_AVX_H
 #include <arm_neon.h>
 /* FORCE_INLINE: qualifier used on every wrapper function below.
    GCC/Clang: any previous FORCE_INLINE definition is saved with push_macro
    before being replaced by an always_inline variant.
    NOTE(review): no matching pop_macro is visible in this header -- confirm
    whether the previous definition is meant to be restored somewhere. */
 #if defined(__GNUC__) || defined(__clang__)
 #pragma push_macro("FORCE_INLINE")
 #define FORCE_INLINE static inline __attribute__((always_inline))
 #else
 /* Other compilers are rejected outright by the #error; the lines after it
    show the fallback definition that would otherwise be used. */
 #error "Macro name collisions may happens with unknown compiler"
 #ifdef FORCE_INLINE
 #undef FORCE_INLINE
 #endif
 #define FORCE_INLINE static inline
 #endif
 /* Emulated 256-bit float vector: two 128-bit NEON float32x4_t halves. */
 typedef struct {
    float32x4_t vect_f32[2];
 } __m256;
 /* Emulated 256-bit double vector: two 128-bit NEON float64x2_t halves.
    The wrappers below treat vect_f64[0] as the lower and vect_f64[1] as the
    upper half. */
 typedef struct {
    float64x2_t vect_f64[2];
 } __m256d;
 /* 128-bit double vector maps directly onto the native NEON type. */
 typedef float64x2_t __m128d;
 /* Returns a vector whose four doubles are all 0.0. */
 FORCE_INLINE __m256d _mm256_setzero_pd(void)
 {
    const float64x2_t zeros = vdupq_n_f64(0.0);
    __m256d out;
    out.vect_f64[0] = zeros;
    out.vect_f64[1] = zeros;
    return out;
 }
 /* Lane-wise product a*b, computed separately on each 128-bit half. */
 FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
 {
    __m256d prod;
    int half;
    for (half = 0; half < 2; ++half)
        prod.vect_f64[half] = vmulq_f64(a.vect_f64[half], b.vect_f64[half]);
    return prod;
 }
 /* Lane-wise sum a+b, computed separately on each 128-bit half. */
 FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
 {
    __m256d sum;
    int half;
    for (half = 0; half < 2; ++half)
        sum.vect_f64[half] = vaddq_f64(a.vect_f64[half], b.vect_f64[half]);
    return sum;
 }
 /* Lane-wise difference a-b, computed separately on each 128-bit half. */
 FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
 {
    __m256d diff;
    int half;
    for (half = 0; half < 2; ++half)
        diff.vect_f64[half] = vsubq_f64(a.vect_f64[half], b.vect_f64[half]);
    return diff;
 }
 /* Returns a vector with the scalar `a` replicated into all four doubles. */
 FORCE_INLINE __m256d _mm256_set1_pd(double a)
 {
    const float64x2_t splat = vdupq_n_f64(a);
    __m256d out;
    out.vect_f64[0] = splat;
    out.vect_f64[1] = splat;
    return out;
 }
 /* Loads four consecutive doubles starting at mem_addr: elements 0..1 into
    the lower half, elements 2..3 into the upper half. */
 FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr)
 {
    __m256d loaded;
    loaded.vect_f64[0] = vld1q_f64(&mem_addr[0]);
    loaded.vect_f64[1] = vld1q_f64(&mem_addr[2]);
    return loaded;
 }
 /* Loads four consecutive doubles starting at mem_addr; implemented with the
    same two vld1q_f64 loads as _mm256_load_pd. */
 FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr)
 {
    __m256d loaded;
    loaded.vect_f64[0] = vld1q_f64(&mem_addr[0]);
    loaded.vect_f64[1] = vld1q_f64(&mem_addr[2]);
    return loaded;
 }
 /* Returns the lower 128-bit half of `a`. */
 FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
 {
    const __m128d lower = a.vect_f64[0];
    return lower;
 }
 /* Returns one 128-bit half of `a`: imm8 == 0 selects the lower half,
    imm8 == 1 the upper half. Any other selector trips the assertion. */
 FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
 {
    assert(imm8 == 0 || imm8 == 1);
    return a.vect_f64[imm8];
 }
 /* Widens a 128-bit vector to 256 bits: `a` becomes the lower half.
    The AVX intrinsic leaves the upper half undefined, so any value is
    acceptable there; it is zero-filled here so that the returned struct never
    carries an indeterminate member (avoids uninitialized-value warnings and
    reads of garbage if a caller forgets to overwrite it). */
 FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
 {
    __m256d res;
    res.vect_f64[0] = a;
    res.vect_f64[1] = vdupq_n_f64(0.0);
    return res;
 }
 #endif /* PF_NEON_DBL_FROM_AVX_H */
--- a/pffft/simd/pf_neon_float.h
+++ b/pffft/simd/pf_neon_float.h
@@ -0,0 +1,87 @@
 /* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:
   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.
   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.
   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
 */
 #ifndef PF_NEON_FLT_H
 #define PF_NEON_FLT_H
 /*
  ARM NEON support macros
 */
 #if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
 #pragma message( __FILE__ ": ARM NEON macros are defined" )
 #  include <arm_neon.h>
 /* SIMD vector type of this backend: a NEON float32x4_t. */
 typedef float32x4_t v4sf;
 /* Number of float lanes per v4sf; also sizes the f[] view in v4sf_union. */
 #  define SIMD_SZ 4
 /* Overlay exposing the lanes of a v4sf as a plain float array. */
 typedef union v4sf_union {
  v4sf  v;
  float f[SIMD_SZ];
 } v4sf_union;
 /* Backend name as a string. */
 #  define VARCH "NEON"
 #  define VREQUIRES_ALIGN 0  /* usually no alignment required */
 /* Lane-wise arithmetic. Note the argument order of VMADD: a*b+c is passed
    to vmlaq_f32 with the addend c first. */
 #  define VZERO() vdupq_n_f32(0)
 #  define VMUL(a,b) vmulq_f32(a,b)
 #  define VADD(a,b) vaddq_f32(a,b)
 #  define VMADD(a,b,c) vmlaq_f32(c,a,b)
 #  define VSUB(a,b) vsubq_f32(a,b)
 /* Broadcast the float lvalue p to all lanes (its address is taken). */
 #  define LD_PS1(p) vld1q_dup_f32(&(p))
 /* Both loads simply dereference ptr as a v4sf pointer. */
 #  define VLOAD_UNALIGNED(ptr)  (*((v4sf*)(ptr)))
 #  define VLOAD_ALIGNED(ptr)    (*((v4sf*)(ptr)))
 /* INTERLEAVE2 / UNINTERLEAVE2: one vzipq_f32 / vuzpq_f32 of (in1, in2);
    out1 receives val[0] and out2 val[1] of the resulting pair. Both results
    are computed before either output is written. */
 #  define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
 #  define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
 /* VTRANSPOSE4: rewrites x0..x3 in place (all must be lvalues) using two
    rounds of vzipq_f32: first x0 with x2 and x1 with x3, then the val[0]
    parts and the val[1] parts of those results with each other. */
 #  define VTRANSPOSE4(x0,x1,x2,x3) {                                    \
    float32x4x2_t t0_ = vzipq_f32(x0, x2);                              \
    float32x4x2_t t1_ = vzipq_f32(x1, x3);                              \
    float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]);              \
    float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]);              \
    x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
  }
 // marginally faster version (inline assembly, kept disabled)
 //#  define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
 /* VSWAPHL(a, b): low 64-bit half taken from b, high half taken from a,
    i.e. [ b[0], b[1], a[2], a[3] ] */
 #  define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
 /* reverse/flip all floats: the two halves trade places and each is
    reversed with vrev64_f32 */
 #  define VREV_S(a)    vcombine_f32(vrev64_f32(vget_high_f32(a)), vrev64_f32(vget_low_f32(a)))
 /* reverse/flip complex floats: vextq_f32(a, a, 2) moves the vector round by
    two floats, so the two (re, im) pairs trade places */
 #  define VREV_C(a)    vextq_f32(a, a, 2)
 /* True when ptr is 4-byte aligned (low two address bits are zero). */
 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
 #else
 /* #pragma message( __FILE__ ": ARM NEON macros are not defined" ) */
 #endif
 #endif /* PF_NEON_FLT_H */
--- a/pffft/simd/pf_scalar_double.h
+++ b/pffft/simd/pf_scalar_double.h
@@ -0,0 +1,185 @@
 /* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
   Copyright (c) 2020  Hayati Ayguen ( h_ayguen@web.de )
   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:
   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.
   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.
   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
 */
 #ifndef PF_SCAL_DBL_H
 #define PF_SCAL_DBL_H
 /*
  fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead
 */
 #if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
 #pragma message( __FILE__ ": double SCALAR4 macros are defined" )
 /* "4xScalar" vector: four vsfscalar values held in a plain struct, with
    fields a..d acting as lanes 0..3 in the operations below. */
 typedef struct {
  vsfscalar a;
  vsfscalar b;
  vsfscalar c;
  vsfscalar d;
 } v4sf;
 /* Number of lanes per v4sf; also sizes the f[] view in v4sf_union. */
 #  define SIMD_SZ 4
 /* Overlay exposing the lanes of a v4sf as a plain array. */
 typedef union v4sf_union {
  v4sf  v;
  vsfscalar f[SIMD_SZ];
 } v4sf_union;
 /* Backend name as a string. */
 #  define VARCH "4xScalar"
 #  define VREQUIRES_ALIGN 0
  /* Returns a vector with all four lanes set to zero.
     Declared with (void) so the definition carries a real prototype in C;
     the initializers are double literals to match this header's type. */
  static ALWAYS_INLINE(v4sf) VZERO(void) {
    v4sf r = { 0.0, 0.0, 0.0, 0.0 };
    return r;
  }
  /* Lane-wise product: every field of the result is A.x * B.x. */
  static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) {
    v4sf prod;
    prod.a = A.a * B.a;
    prod.b = A.b * B.b;
    prod.c = A.c * B.c;
    prod.d = A.d * B.d;
    return prod;
  }
  /* Lane-wise sum: every field of the result is A.x + B.x. */
  static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) {
    v4sf sum;
    sum.a = A.a + B.a;
    sum.b = A.b + B.b;
    sum.c = A.c + B.c;
    sum.d = A.d + B.d;
    return sum;
  }
  /* Lane-wise multiply-add: every field of the result is A.x * B.x + C.x. */
  static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) {
    v4sf acc;
    acc.a = A.a * B.a + C.a;
    acc.b = A.b * B.b + C.b;
    acc.c = A.c * B.c + C.c;
    acc.d = A.d * B.d + C.d;
    return acc;
  }
  /* Lane-wise difference: every field of the result is A.x - B.x. */
  static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) {
    v4sf diff;
    diff.a = A.a - B.a;
    diff.b = A.b - B.b;
    diff.c = A.c - B.c;
    diff.d = A.d - B.d;
    return diff;
  }
  /* Broadcast: returns a vector whose four lanes all hold v. */
  static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) {
    v4sf splat;
    splat.a = v;
    splat.b = v;
    splat.c = v;
    splat.d = v;
    return splat;
  }
 /* Both loads simply dereference ptr as a v4sf pointer. */
 #  define VLOAD_UNALIGNED(ptr)  (*((v4sf*)(ptr)))
 #  define VLOAD_ALIGNED(ptr)    (*((v4sf*)(ptr)))
 /* True when ptr is a multiple of sizeof(v4sf); the mask form assumes that
    size is a power of two. */
 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0)
  /* INTERLEAVE2(A, B, C, D):
       C = [ A[0], B[0], A[1], B[1] ]
       D = [ A[2], B[2], A[3], B[3] ]
     Both results are built in temporaries before C or D is written, so an
     output may name the same object as an input. Every macro parameter is
     parenthesized so that arguments such as *p or v[i] expand correctly. */
  #define INTERLEAVE2( A, B, C, D) \
  do { \
    v4sf Cr = { (A).a, (B).a, (A).b, (B).b }; \
    v4sf Dr = { (A).c, (B).c, (A).d, (B).d }; \
    (C) = Cr; \
    (D) = Dr; \
  } while (0)
  /* UNINTERLEAVE2(A, B, C, D):
       C = [ A[0], A[2], B[0], B[2] ]
       D = [ A[1], A[3], B[1], B[3] ]
     Both results are built in temporaries before C or D is written, so an
     output may name the same object as an input. Every macro parameter is
     parenthesized so that arguments such as *p or v[i] expand correctly. */
  #define UNINTERLEAVE2(A, B, C, D) \
  do { \
    v4sf Cr = { (A).a, (A).c, (B).a, (B).c }; \
    v4sf Dr = { (A).b, (A).d, (B).b, (B).d }; \
    (C) = Cr; \
    (D) = Dr; \
  } while (0)
  /* VTRANSPOSE4(A, B, C, D): in-place 4x4 transpose, treating A..D as the
     rows of a matrix; afterwards A holds the first element of every former
     row, B the second, and so on. All four arguments must be lvalues.
     The new rows are built in temporaries before any argument is written.
     Every macro parameter is parenthesized so that arguments such as *p or
     v[i] expand correctly. */
  #define VTRANSPOSE4(A, B, C, D) \
  do { \
    v4sf Ar = { (A).a, (B).a, (C).a, (D).a }; \
    v4sf Br = { (A).b, (B).b, (C).b, (D).b }; \
    v4sf Cr = { (A).c, (B).c, (C).c, (D).c }; \
    v4sf Dr = { (A).d, (B).d, (C).d, (D).d }; \
    (A) = Ar; \
    (B) = Br; \
    (C) = Cr; \
    (D) = Dr; \
  } while (0)
  /* VSWAPHL(A, B): returns [ B[0], B[1], A[2], A[3] ] */
  static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) {
    /* B is a by-value copy: keep its first two lanes, patch in A's last two */
    B.c = A.c;
    B.d = A.d;
    return B;
  }
  /* reverse/flip all four lanes: [ A[3], A[2], A[1], A[0] ] */
  static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) {
    v4sf rev;
    rev.a = A.d;
    rev.b = A.c;
    rev.c = A.b;
    rev.d = A.a;
    return rev;
  }
  /* reverse/flip complex values: the two lane pairs trade places,
     [ A[2], A[3], A[0], A[1] ] */
  static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) {
    v4sf swapped;
    swapped.a = A.c;
    swapped.b = A.d;
    swapped.c = A.a;
    swapped.d = A.b;
    return swapped;
  }
 #else
 /* #pragma message( __FILE__ ": double SCALAR4 macros are not defined" ) */
 #endif
 /* Last-resort fallback: used when nothing above defined SIMD_SZ. The
    "vector" is then a single vsfscalar and every V* macro is ordinary scalar
    arithmetic. */
 #if !defined(SIMD_SZ)
 #pragma message( __FILE__ ": double SCALAR1 macros are defined" )
 typedef vsfscalar v4sf;
 /* One lane per "vector". */
 #  define SIMD_SZ 1
 typedef union v4sf_union {
  v4sf  v;
  vsfscalar f[SIMD_SZ];
 } v4sf_union;
 /* Backend name as a string. */
 #  define VARCH "Scalar"
 #  define VREQUIRES_ALIGN 0
 #  define VZERO() 0.0
 #  define VMUL(a,b) ((a)*(b))
 #  define VADD(a,b) ((a)+(b))
 #  define VMADD(a,b,c) ((a)*(b)+(c))
 #  define VSUB(a,b) ((a)-(b))
 /* Broadcasting a scalar to a one-lane vector is the identity. */
 #  define LD_PS1(p) (p)
 #  define VLOAD_UNALIGNED(ptr)  (*(ptr))
 #  define VLOAD_ALIGNED(ptr)    (*(ptr))
 /* True when ptr is a multiple of sizeof(vsfscalar). */
 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)
 #else
 /* #pragma message( __FILE__ ": double SCALAR1 macros are not defined" ) */
 #endif
 #endif /* PF_SCAL_DBL_H */
--- a/pffft/simd/pf_scalar_float.h
+++ b/pffft/simd/pf_scalar_float.h
@@ -0,0 +1,185 @@
 /* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
   Copyright (c) 2020  Hayati Ayguen ( h_ayguen@web.de )
   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:
   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.
   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.
   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
 */
 #ifndef PF_SCAL_FLT_H
 #define PF_SCAL_FLT_H
 /*
  fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead
 */
 #if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
 #pragma message( __FILE__ ": float SCALAR4 macros are defined" )
 /* emulated 4-lane vector: four scalars stored in the order a, b, c, d */
 typedef struct {
  vsfscalar a, b, c, d;
 } v4sf;
 /* number of scalar lanes held by one v4sf */
 #  define SIMD_SZ 4
 /* overlay giving indexed access f[0..SIMD_SZ-1] to the lanes of v */
 typedef union v4sf_union {
  v4sf  v;
  vsfscalar f[SIMD_SZ];
 } v4sf_union;
 /* this implementation never requires aligned data */
 #  define VREQUIRES_ALIGN 0
 /* name of this implementation */
 #  define VARCH "4xScalar"
  /* VZERO(): vector with all four lanes set to zero */
  static ALWAYS_INLINE(v4sf) VZERO() {
    v4sf zero;
    zero.a = 0.f;
    zero.b = 0.f;
    zero.c = 0.f;
    zero.d = 0.f;
    return zero;
  }
  /* VMUL(A, B): lane-wise product A * B */
  static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) {
    v4sf prod;
    prod.a = A.a * B.a;
    prod.b = A.b * B.b;
    prod.c = A.c * B.c;
    prod.d = A.d * B.d;
    return prod;
  }
  /* VADD(A, B): lane-wise sum A + B */
  static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) {
    v4sf sum;
    sum.a = A.a + B.a;
    sum.b = A.b + B.b;
    sum.c = A.c + B.c;
    sum.d = A.d + B.d;
    return sum;
  }
  /* VMADD(A, B, C): lane-wise multiply-add A * B + C */
  static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) {
    v4sf acc;
    acc.a = A.a * B.a + C.a;
    acc.b = A.b * B.b + C.b;
    acc.c = A.c * B.c + C.c;
    acc.d = A.d * B.d + C.d;
    return acc;
  }
  /* VSUB(A, B): lane-wise difference A - B */
  static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) {
    v4sf diff;
    diff.a = A.a - B.a;
    diff.b = A.b - B.b;
    diff.c = A.c - B.c;
    diff.d = A.d - B.d;
    return diff;
  }
  /* LD_PS1(v): broadcast the scalar v into all four lanes */
  static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) {
    v4sf splat;
    splat.a = v;
    splat.b = v;
    splat.c = v;
    splat.d = v;
    return splat;
  }
 /* loads: read the four consecutive scalars at ptr as one v4sf. The cast
    goes to pointer-to-const because these macros only read, so a const
    source pointer keeps its qualifier. Both variants are identical here. */
 #  define VLOAD_UNALIGNED(ptr)  (*((const v4sf*)(ptr)))
 #  define VLOAD_ALIGNED(ptr)    (*((const v4sf*)(ptr)))
 /* nonzero when ptr is a multiple of sizeof(v4sf) bytes */
 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0)
  /* INTERLEAVE2(A, B, C, D): interleave the elements of A and B
   *   C = [ A.a, B.a, A.b, B.b ]
   *   D = [ A.c, B.c, A.d, B.d ]
   * Both results are built in temporaries before being stored, so C and D
   * may be the same objects as A and B. Every argument is parenthesized so
   * that expressions such as *p expand correctly. A and B are evaluated
   * several times: do not pass expressions with side effects. */
  #define INTERLEAVE2( A, B, C, D) \
  do { \
    v4sf il2_lo_ = { (A).a, (B).a, (A).b, (B).b }; \
    v4sf il2_hi_ = { (A).c, (B).c, (A).d, (B).d }; \
    (C) = il2_lo_; \
    (D) = il2_hi_; \
  } while (0)
  /* UNINTERLEAVE2(A, B, C, D): split the even and odd elements of A and B
   *   C = [ A.a, A.c, B.a, B.c ]
   *   D = [ A.b, A.d, B.b, B.d ]
   * Both results are built in temporaries before being stored, so C and D
   * may be the same objects as A and B. Every argument is parenthesized so
   * that expressions such as *p expand correctly. A and B are evaluated
   * several times: do not pass expressions with side effects. */
  #define UNINTERLEAVE2(A, B, C, D) \
  do { \
    v4sf uil2_even_ = { (A).a, (A).c, (B).a, (B).c }; \
    v4sf uil2_odd_ = { (A).b, (A).d, (B).b, (B).d }; \
    (C) = uil2_even_; \
    (D) = uil2_odd_; \
  } while (0)
  /* VTRANSPOSE4(A, B, C, D): in-place transpose of the 4x4 matrix whose
   * rows are A, B, C and D. All four result rows are built in temporaries
   * before any argument is overwritten. Every argument is parenthesized so
   * that expressions such as *p expand correctly; each one is evaluated
   * several times, so do not pass expressions with side effects. */
  #define VTRANSPOSE4(A, B, C, D) \
  do { \
    v4sf vt4_r0_ = { (A).a, (B).a, (C).a, (D).a }; \
    v4sf vt4_r1_ = { (A).b, (B).b, (C).b, (D).b }; \
    v4sf vt4_r2_ = { (A).c, (B).c, (C).c, (D).c }; \
    v4sf vt4_r3_ = { (A).d, (B).d, (C).d, (D).d }; \
    (A) = vt4_r0_; \
    (B) = vt4_r1_; \
    (C) = vt4_r2_; \
    (D) = vt4_r3_; \
  } while (0)
  /* VSWAPHL(A, B): result = [ B.a, B.b, A.c, A.d ] */
  static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) {
    /* A is a by-value copy: replace its low half and hand it back */
    A.a = B.a;
    A.b = B.b;
    return A;
  }
  /* VREV_S(A): all four floats in reverse order, [ A.d, A.c, A.b, A.a ] */
  static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) {
    v4sf rev;
    rev.a = A.d;
    rev.b = A.c;
    rev.c = A.b;
    rev.d = A.a;
    return rev;
  }
  /* VREV_C(A): swap the two complex (two-float) pairs, [ A.c, A.d, A.a, A.b ] */
  static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) {
    v4sf swapped;
    swapped.a = A.c;
    swapped.b = A.d;
    swapped.c = A.a;
    swapped.d = A.b;
    return swapped;
  }
 #else
 /* #pragma message( __FILE__ ": float SCALAR4 macros are not defined" ) */
 #endif
 #if !defined(SIMD_SZ)
 /* plain scalar fallback: used only when nothing earlier has defined SIMD_SZ */
 #pragma message( __FILE__ ": float SCALAR1 macros are defined" )
 /* with a single lane per "vector", v4sf is simply the scalar type */
 typedef vsfscalar v4sf;
 #  define SIMD_SZ 1
 /* overlay giving indexed access f[0..SIMD_SZ-1] to the lanes of v */
 typedef union v4sf_union {
  v4sf  v;
  vsfscalar f[SIMD_SZ];
 } v4sf_union;
 #  define VARCH "Scalar"
 #  define VREQUIRES_ALIGN 0
 /* the vector operations degenerate to ordinary scalar expressions */
 #  define VZERO() 0.f
 #  define VMUL(a,b) ((a)*(b))
 #  define VADD(a,b) ((a)+(b))
 #  define VMADD(a,b,c) ((a)*(b)+(c))
 #  define VSUB(a,b) ((a)-(b))
 /* LD_PS1 yields its argument unchanged: there is only one lane to fill */
 #  define LD_PS1(p) (p)
 #  define VLOAD_UNALIGNED(ptr)  (*(ptr))
 #  define VLOAD_ALIGNED(ptr)    (*(ptr))
 /* nonzero when ptr is a multiple of sizeof(vsfscalar) bytes */
 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)
 #else
 /* #pragma message( __FILE__ ": float SCALAR1 macros are not defined" ) */
 #endif
 #endif /* PF_SCAL_FLT_H */
--- a/pffft/simd/pf_sse1_float.h
+++ b/pffft/simd/pf_sse1_float.h
@@ -0,0 +1,82 @@
 /* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:
   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.
   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.
   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
 */
 #ifndef PF_SSE1_FLT_H
 #define PF_SSE1_FLT_H
 /*
  SSE1 support macros
 */
 #if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86))
 #pragma message( __FILE__ ": SSE1 float macros are defined" )
 #include <xmmintrin.h>
 /* one SSE register holding four packed floats */
 typedef __m128 v4sf;
 /* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions
 *  anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */
 #  define SIMD_SZ 4
 /* overlay giving indexed access f[0..SIMD_SZ-1] to the lanes of v */
 typedef union v4sf_union {
  v4sf  v;
  float f[SIMD_SZ];
 } v4sf_union;
 #  define VARCH "SSE1"
 #  define VREQUIRES_ALIGN 1
 /* element-wise arithmetic, mapped directly onto SSE intrinsics */
 #  define VZERO() _mm_setzero_ps()
 #  define VMUL(a,b) _mm_mul_ps(a,b)
 #  define VADD(a,b) _mm_add_ps(a,b)
 /* multiply-add as a separate multiply followed by an add */
 #  define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
 #  define VSUB(a,b) _mm_sub_ps(a,b)
 /* broadcast the scalar p into all four lanes */
 #  define LD_PS1(p) _mm_set1_ps(p)
 #  define VLOAD_UNALIGNED(ptr)  _mm_loadu_ps(ptr)
 #  define VLOAD_ALIGNED(ptr)    _mm_load_ps(ptr)
 /* INTERLEAVE2:   out1 = [ in1[0], in2[0], in1[1], in2[1] ], out2 = [ in1[2], in2[2], in1[3], in2[3] ]
    UNINTERLEAVE2: out1 = [ in1[0], in1[2], in2[0], in2[2] ], out2 = [ in1[1], in1[3], in2[1], in2[3] ]
    The first result goes through tmp__ so the outputs may alias the inputs.
    Both bodies are wrapped in do { } while (0), like the scalar implementations,
    so that "MACRO(...);" is a single statement and is safe in an unbraced if/else. */
 #  define INTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); (out2) = _mm_unpackhi_ps(in1, in2); (out1) = tmp__; } while (0)
 #  define UNINTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); (out2) = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); (out1) = tmp__; } while (0)
 /* in-place 4x4 transpose of the rows x0..x3 */
 #  define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
 /* result = [ b[0], b[1], a[2], a[3] ] */
 #  define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
 /* reverse/flip all floats */
 #  define VREV_S(a)    _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3))
 /* reverse/flip complex floats */
 #  define VREV_C(a)    _mm_shuffle_ps(a, a, _MM_SHUFFLE(1,0,3,2))
 /* nonzero when ptr is 16-byte aligned */
 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
 #else
 /* #pragma message( __FILE__ ": SSE1 float macros are not defined" ) */
 #endif
 #endif /* PF_SSE1_FLT_H */
--- a/pffft/simd/pf_sse2_double.h
+++ b/pffft/simd/pf_sse2_double.h
@@ -0,0 +1,281 @@
 /*
   Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
 */
 /* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:
   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.
   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.
   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
 */
 #ifndef PF_SSE2_DBL_H
 #define PF_SSE2_DBL_H
 //detect sse2 support under MSVC:
 //when _M_IX86_FP is defined with the value 2, make sure __SSE2__ is defined
 //so that the feature test further down recognizes SSE2
 #if defined ( _M_IX86_FP )
 #  if _M_IX86_FP == 2
 #    if !defined(__SSE2__)
 #      define __SSE2__
 #    endif
 #  endif
 #endif
 /*
  SSE2 64bit support macros
 */
 /* enabled when no other implementation has defined SIMD_SZ, SIMD is not disabled and the target has at least SSE2 */
 #if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined( __SSE4_2__ ) || defined( __SSE4_1__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || defined ( __x86_64__ ) || defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 ))
 #pragma message (__FILE__ ": SSE2 double macros are defined" )
 #include <emmintrin.h>
 /* emulated 256-bit vector of four doubles, built from two 128-bit SSE2
    registers: d128[0] holds elements 0..1, d128[1] holds elements 2..3 */
 typedef struct {
    __m128d d128[2];
 } m256d;
 typedef m256d v4sf;
 #  define SIMD_SZ 4
 /* overlay giving indexed access f[0..SIMD_SZ-1] to the lanes of v */
 typedef union v4sf_union {
  v4sf  v;
  double f[SIMD_SZ];
 } v4sf_union;
 /* FORCE_INLINE: storage class plus always-inline attribute used by the helper functions of this header */
 #if defined(__GNUC__) || defined(__clang__)
 /* NOTE(review): any earlier FORCE_INLINE definition is pushed here, but no
    matching pop_macro appears in this header - confirm this is intended */
 #pragma push_macro("FORCE_INLINE")
 #define FORCE_INLINE static inline __attribute__((always_inline))
 #elif defined (_MSC_VER)
 #define FORCE_INLINE static __forceinline
 #else
 /* NOTE(review): #error aborts compilation, so the fallback definition below is never reached */
 #error "Macro name collisions may happens with unknown compiler"
 #ifdef FORCE_INLINE
 #undef FORCE_INLINE
 #endif
 #define FORCE_INLINE static inline
 #endif
 /* returns a vector with all four doubles set to zero */
 FORCE_INLINE m256d mm256_setzero_pd(void)
 {
    const __m128d zero = _mm_setzero_pd();
    m256d ret;
    ret.d128[0] = zero;
    ret.d128[1] = zero;
    return ret;
 }
 /* element-wise product a * b, computed one 128-bit half at a time */
 FORCE_INLINE m256d mm256_mul_pd(m256d a, m256d b)
 {
    m256d ret;
    int half;
    for (half = 0; half < 2; ++half)
        ret.d128[half] = _mm_mul_pd(a.d128[half], b.d128[half]);
    return ret;
 }
 /* element-wise sum a + b, computed one 128-bit half at a time */
 FORCE_INLINE m256d mm256_add_pd(m256d a, m256d b)
 {
    m256d ret;
    int half;
    for (half = 0; half < 2; ++half)
        ret.d128[half] = _mm_add_pd(a.d128[half], b.d128[half]);
    return ret;
 }
 /* element-wise difference a - b, computed one 128-bit half at a time */
 FORCE_INLINE m256d mm256_sub_pd(m256d a, m256d b)
 {
    m256d ret;
    int half;
    for (half = 0; half < 2; ++half)
        ret.d128[half] = _mm_sub_pd(a.d128[half], b.d128[half]);
    return ret;
 }
 /* broadcasts the scalar a into all four doubles */
 FORCE_INLINE m256d mm256_set1_pd(double a)
 {
    const __m128d splat = _mm_set1_pd(a);
    m256d ret;
    ret.d128[0] = splat;
    ret.d128[1] = splat;
    return ret;
 }
 /* loads four consecutive doubles with two aligned 128-bit loads
    (mem_addr and mem_addr + 2) */
 FORCE_INLINE m256d mm256_load_pd (double const * mem_addr)
 {
    const double * const low_src = (const double *)mem_addr;
    const double * const high_src = low_src + 2;
    m256d res;
    res.d128[0] = _mm_load_pd(low_src);
    res.d128[1] = _mm_load_pd(high_src);
    return res;
 }
 /* loads four consecutive doubles with two unaligned 128-bit loads
    (mem_addr and mem_addr + 2) */
 FORCE_INLINE m256d mm256_loadu_pd (double const * mem_addr)
 {
    const double * const low_src = (const double *)mem_addr;
    const double * const high_src = low_src + 2;
    m256d res;
    res.d128[0] = _mm_loadu_pd(low_src);
    res.d128[1] = _mm_loadu_pd(high_src);
    return res;
 }
 /* mapping of the generic vector macros onto the emulated 256-bit helpers defined in this header */
 #  define VARCH "SSE2"
 #  define VREQUIRES_ALIGN 1
 #  define VZERO() mm256_setzero_pd()
 #  define VMUL(a,b) mm256_mul_pd(a,b)
 #  define VADD(a,b) mm256_add_pd(a,b)
 /* multiply-add as a separate multiply followed by an add */
 #  define VMADD(a,b,c) mm256_add_pd(mm256_mul_pd(a,b), c)
 #  define VSUB(a,b) mm256_sub_pd(a,b)
 /* broadcast the scalar p into all four lanes */
 #  define LD_PS1(p) mm256_set1_pd(p)
 #  define VLOAD_UNALIGNED(ptr)  mm256_loadu_pd(ptr)
 #  define VLOAD_ALIGNED(ptr)    mm256_load_pd(ptr)
 /* returns the low 128-bit half (elements 0..1) of a */
 FORCE_INLINE __m128d mm256_castpd256_pd128(m256d a)
 {
    const __m128d low_half = a.d128[0];
    return low_half;
 }
 /* returns the 128-bit half of a selected by imm8 (0 = low, 1 = high);
    any other imm8 trips the assertion */
 FORCE_INLINE __m128d mm256_extractf128_pd (m256d a, const int imm8)
 {
    const __m128d * const halves = a.d128;
    assert(imm8 >= 0 && imm8 <= 1);
    return halves[imm8];
 }
 /* returns a with its high 128-bit half replaced by b */
 FORCE_INLINE m256d mm256_insertf128_pd_1(m256d a, __m128d b)
 {
    /* a is a by-value copy, so it can be patched and returned directly */
    a.d128[1] = b;
    return a;
 }
 /* widens a to an m256d: a becomes the low half. The high half is set to
    zero so that the returned struct never carries an uninitialized member
    when it is copied; callers in this header overwrite it right away with
    mm256_insertf128_pd_1(). */
 FORCE_INLINE m256d mm256_castpd128_pd256(__m128d a)
 {
    m256d res;
    res.d128[0] = a;
    res.d128[1] = _mm_setzero_pd();
    return res;
 }
 /* per 128-bit half: [ a[even], b[even] ], i.e. result = [ a0, b0, a2, b2 ] */
 FORCE_INLINE m256d mm256_shuffle_pd_00(m256d a, m256d b)
 {
    m256d res;
    int half;
    for (half = 0; half < 2; ++half)
        res.d128[half] = _mm_shuffle_pd(a.d128[half], b.d128[half], 0);
    return res;
 }
 /* per 128-bit half: [ a[odd], b[odd] ], i.e. result = [ a1, b1, a3, b3 ] */
 FORCE_INLINE m256d mm256_shuffle_pd_11(m256d a, m256d b)
 {
    m256d res;
    int half;
    for (half = 0; half < 2; ++half)
        res.d128[half] = _mm_shuffle_pd(a.d128[half], b.d128[half], 3);
    return res;
 }
 /* result = [ low half of a, low half of b ] */
 FORCE_INLINE m256d mm256_permute2f128_pd_0x20(m256d a, m256d b) {
    /* a is a by-value copy that already holds the wanted low half */
    a.d128[1] = b.d128[0];
    return a;
 }
 /* result = [ high half of a, high half of b ] */
 FORCE_INLINE m256d mm256_permute2f128_pd_0x31(m256d a, m256d b)
 {
    /* b is a by-value copy that already holds the wanted high half */
    b.d128[0] = a.d128[1];
    return b;
 }
 /* result = [ x3, x2, x1, x0 ]: the halves trade places and each half has
    its two doubles swapped */
 FORCE_INLINE m256d mm256_reverse(m256d x)
 {
    const __m128d low = x.d128[0];
    const __m128d high = x.d128[1];
    m256d res;
    res.d128[0] = _mm_shuffle_pd(high, high, 1);
    res.d128[1] = _mm_shuffle_pd(low, low, 1);
    return res;
 }
 /* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
 out1 = [ in1[0], in2[0], in1[1], in2[1] ]
 out2 = [ in1[2], in2[2], in1[3], in2[3] ]
 The first result goes through tmp__ so the outputs may alias the inputs.
 The body is wrapped in do { } while (0), like the scalar implementations,
 so that "INTERLEAVE2(...);" is a single statement and is safe in an
 unbraced if/else.
 */
 #  define INTERLEAVE2(in1, in2, out1, out2) do { \
 	__m128d low1__ = mm256_castpd256_pd128(in1); \
 	__m128d low2__ = mm256_castpd256_pd128(in2); \
 	__m128d high1__ = mm256_extractf128_pd(in1, 1); \
 	__m128d high2__ = mm256_extractf128_pd(in2, 1); \
 	m256d tmp__ = mm256_insertf128_pd_1( \
 		mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \
 		_mm_shuffle_pd(low1__, low2__, 3)); \
 	(out2) = mm256_insertf128_pd_1( \
 		mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \
 		_mm_shuffle_pd(high1__, high2__, 3)); \
 	(out1) = tmp__; \
 } while (0)
 /*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
 out1 = [ in1[0], in1[2], in2[0], in2[2] ]
 out2 = [ in1[1], in1[3], in2[1], in2[3] ]
 The first result goes through tmp__ so the outputs may alias the inputs.
 The body is wrapped in do { } while (0), like the scalar implementations,
 so that "UNINTERLEAVE2(...);" is a single statement and is safe in an
 unbraced if/else.
 */
 #  define UNINTERLEAVE2(in1, in2, out1, out2) do { \
 	__m128d low1__ = mm256_castpd256_pd128(in1); \
 	__m128d low2__ = mm256_castpd256_pd128(in2); \
 	__m128d high1__ = mm256_extractf128_pd(in1, 1); \
 	__m128d high2__ = mm256_extractf128_pd(in2, 1); \
 	m256d tmp__ = mm256_insertf128_pd_1( \
 		mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \
 		_mm_shuffle_pd(low2__, high2__, 0)); \
 	(out2) = mm256_insertf128_pd_1( \
 		mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \
 		_mm_shuffle_pd(low2__, high2__, 3)); \
 	(out1) = tmp__; \
 } while (0)
 /* VTRANSPOSE4(row0, row1, row2, row3): in-place transpose of the 4x4
 matrix of doubles whose rows are row0..row3. All rows are read into the
 temporaries before any of them is overwritten. The temporaries carry a
 "__" suffix to keep them apart from caller variables, and the body is
 wrapped in do { } while (0), like the scalar implementations, so that
 "VTRANSPOSE4(...);" is a single statement.
 */
 #  define VTRANSPOSE4(row0, row1, row2, row3) do { \
        m256d tmp3__, tmp2__, tmp1__, tmp0__; \
        \
        tmp0__ = mm256_shuffle_pd_00((row0),(row1)); \
        tmp2__ = mm256_shuffle_pd_11((row0),(row1)); \
        tmp1__ = mm256_shuffle_pd_00((row2),(row3)); \
        tmp3__ = mm256_shuffle_pd_11((row2),(row3)); \
        \
        (row0) = mm256_permute2f128_pd_0x20(tmp0__, tmp1__); \
        (row1) = mm256_permute2f128_pd_0x20(tmp2__, tmp3__); \
        (row2) = mm256_permute2f128_pd_0x31(tmp0__, tmp1__); \
        (row3) = mm256_permute2f128_pd_0x31(tmp2__, tmp3__); \
    } while (0)
 /*VSWAPHL(a, b) pseudo code:
 return [ b[0], b[1], a[2], a[3] ]
 */
 #  define VSWAPHL(a,b)	\
   mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_castpd256_pd128(b)), mm256_extractf128_pd(a, 1))
 /* reverse/flip all doubles: [ a[3], a[2], a[1], a[0] ] */
 #  define VREV_S(a)   mm256_reverse(a)
 /* reverse/flip complex doubles (swap the two 128-bit halves): [ a[2], a[3], a[0], a[1] ] */
 #  define VREV_C(a)    mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_extractf128_pd(a, 1)), mm256_castpd256_pd128(a))
 /* nonzero when ptr is 32-byte aligned */
 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
 #endif
 #endif
--- a/pffft/sse2neon.h
+++ b/pffft/sse2neon.h
--- a/pffft/test_fft_factors.c
+++ b/pffft/test_fft_factors.c
@@ -0,0 +1,142 @@
 #ifdef PFFFT_ENABLE_FLOAT
 #include "pffft.h"
 #endif
 #ifdef PFFFT_ENABLE_DOUBLE
 #include "pffft_double.h"
 #endif
 #include <stdio.h>
 #include <stdlib.h>
 #include <assert.h>
 #ifdef PFFFT_ENABLE_FLOAT
 /* Cross-checks the float API for both loop passes of dir_i and for real
  * and complex transforms: for every N from N_min/2 up to 12*N_min, in
  * steps of N_min/2, pffft_is_valid_size() must agree with a local
  * factorization of N into N_min * 2^a * 3^b * 5^c, and pffft_new_setup()
  * must succeed exactly for the valid sizes.
  * TL is only used to report the nearest transform size.
  * Returns 0 on success and 1 on the first mismatch; every setup created
  * here is destroyed before returning. */
 int test_float(int TL)
 {
  PFFFT_Setup * S;
  for (int dir_i = 0; dir_i <= 1; ++dir_i)
  {
    for (int cplx_i = 0; cplx_i <= 1; ++cplx_i)
    {
      const pffft_transform_t cplx = (!cplx_i) ? PFFFT_REAL : PFFFT_COMPLEX;
      const int N_min = pffft_min_fft_size(cplx);
      const int N_max = N_min * 11 + N_min;
      int NTL = pffft_nearest_transform_size(TL, cplx, (!dir_i));
      double near_off = (NTL - TL) * 100.0 / (double)TL;
      fprintf(stderr, "testing float, %s, %s ..\tminimum transform %d; nearest transform for %d is %d (%.2f%% off)\n",
          (!dir_i) ? "FORWARD" : "BACKWARD", (!cplx_i) ? "REAL" : "COMPLEX", N_min, TL, NTL, near_off );
      for (int N = (N_min/2); N <= N_max; N += (N_min/2))
      {
        int R = N, f2 = 0, f3 = 0, f5 = 0, tmp_f;
        const int factorizable = pffft_is_valid_size(N, cplx);
        /* strip factors 5, 3 and 2 while the remainder stays a multiple of N_min */
        while (R >= 5*N_min && (R % 5) == 0) {  R /= 5; ++f5; }
        while (R >= 3*N_min && (R % 3) == 0) {  R /= 3; ++f3; }
        while (R >= 2*N_min && (R % 2) == 0) {  R /= 2; ++f2; }
        tmp_f = (R == N_min) ? 1 : 0;
        assert( factorizable == tmp_f );
        S = pffft_new_setup(N, cplx);
        if ( S && !factorizable )
        {
          fprintf(stderr, "fft setup successful, but NOT factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
          /* release the unexpected setup instead of leaking it */
          pffft_destroy_setup(S);
          return 1;
        }
        else if ( !S && factorizable)
        {
          fprintf(stderr, "fft setup UNsuccessful, but factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
          return 1;
        }
        if (S)
          pffft_destroy_setup(S);
      }
    }
  }
  return 0;
 }
 #endif
 #ifdef PFFFT_ENABLE_DOUBLE
 /* Cross-checks the double API for both loop passes of dir_i and for real
  * and complex transforms: for every N from N_min/2 up to 12*N_min, in
  * steps of N_min/2, pffftd_is_valid_size() must agree with a local
  * factorization of N into N_min * 2^a * 3^b * 5^c, and pffftd_new_setup()
  * must succeed exactly for the valid sizes.
  * TL is only used to report the nearest transform size.
  * Returns 0 on success and 1 on the first mismatch; every setup created
  * here is destroyed before returning. */
 int test_double(int TL)
 {
  PFFFTD_Setup * S;
  for (int dir_i = 0; dir_i <= 1; ++dir_i)
  {
    for (int cplx_i = 0; cplx_i <= 1; ++cplx_i)
    {
      const pffft_transform_t cplx = (!cplx_i) ? PFFFT_REAL : PFFFT_COMPLEX;
      const int N_min = pffftd_min_fft_size(cplx);
      const int N_max = N_min * 11 + N_min;
      int NTL = pffftd_nearest_transform_size(TL, cplx, (!dir_i));
      double near_off = (NTL - TL) * 100.0 / (double)TL;
      fprintf(stderr, "testing double, %s, %s ..\tminimum transform %d; nearest transform for %d is %d (%.2f%% off)\n",
          (!dir_i) ? "FORWARD" : "BACKWARD", (!cplx_i) ? "REAL" : "COMPLEX", N_min, TL, NTL, near_off );
      for (int N = (N_min/2); N <= N_max; N += (N_min/2))
      {
        int R = N, f2 = 0, f3 = 0, f5 = 0, tmp_f;
        const int factorizable = pffftd_is_valid_size(N, cplx);
        /* strip factors 5, 3 and 2 while the remainder stays a multiple of N_min */
        while (R >= 5*N_min && (R % 5) == 0) {  R /= 5; ++f5; }
        while (R >= 3*N_min && (R % 3) == 0) {  R /= 3; ++f3; }
        while (R >= 2*N_min && (R % 2) == 0) {  R /= 2; ++f2; }
        tmp_f = (R == N_min) ? 1 : 0;
        assert( factorizable == tmp_f );
        S = pffftd_new_setup(N, cplx);
        if ( S && !factorizable )
        {
          fprintf(stderr, "fft setup successful, but NOT factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
          /* release the unexpected setup instead of leaking it */
          pffftd_destroy_setup(S);
          return 1;
        }
        else if ( !S && factorizable)
        {
          fprintf(stderr, "fft setup UNsuccessful, but factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
          return 1;
        }
        if (S)
          pffftd_destroy_setup(S);
      }
    }
  }
  return 0;
 }
 #endif
 /* Runs the enabled factor tests with the transform length given as first
  * command line argument (default 2). The double test is only run when the
  * float test did not fail; the exit code is the result of the last test run. */
 int main(int argc, char *argv[])
 {
  int status = 0;
  int N = 2;
  if (argc > 1)
    N = atoi(argv[1]);
 #ifdef PFFFT_ENABLE_FLOAT
  status = test_float(N);
 #endif
 #ifdef PFFFT_ENABLE_DOUBLE
  if (!status)
    status = test_double(N);
 #endif
  return status;
 }
--- a/pffft/test_pffastconv.c
+++ b/pffft/test_pffastconv.c
@@ -0,0 +1,991 @@
 /*
  Copyright (c) 2013 Julien Pommier.
  Copyright (c) 2019  Hayati Ayguen ( h_ayguen@web.de )
 */
 #define _WANT_SNAN  1
 #include "pffft.h"
 #include "pffastconv.h"
 #include <math.h>
 #include <float.h>
 #include <limits.h>
 #include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 #include <assert.h>
 #include <string.h>
 #ifdef HAVE_SYS_TIMES
 #  include <sys/times.h>
 #  include <unistd.h>
 #endif
 /* benchmark duration: 500 ms */
 #define BENCH_TEST_DURATION_IN_SEC      0.5
 /* 
   vector support macros: the rest of the code is independent of
   SSE/Altivec/NEON -- adding support for other platforms with 4-element
   vectors should be limited to these macros 
 */
 #if 0
 #include "simd/pf_float.h"
 #endif
 /* RESTRICT: __restrict on MSVC and GNU-compatible compilers, empty elsewhere */
 #if defined(_MSC_VER) || defined(__GNUC__)
 #  define RESTRICT __restrict
 #else
 #  define RESTRICT
 #endif
 #if defined(_MSC_VER)
 #pragma warning( disable : 4244 )
 #endif
 /* INVALID_FLOAT_VAL: marker value stored behind buffers to make overruns
    visible. The first available candidate wins: SNANF, then SNAN, NAN,
    INFINITY and finally FLT_MAX. */
 #if defined(SNANF)
  #define INVALID_FLOAT_VAL  SNANF
 #elif defined(SNAN)
  #define INVALID_FLOAT_VAL  SNAN
 #elif defined(NAN)
  #define INVALID_FLOAT_VAL  NAN
 #elif defined(INFINITY)
  #define INVALID_FLOAT_VAL  INFINITY
 #else
  #define INVALID_FLOAT_VAL  FLT_MAX
 #endif
 #if defined(HAVE_SYS_TIMES)
  /* uclock_sec(): user CPU time consumed by this process so far, in seconds.
     Defined as an ordinary function, like the clock() variant below: a plain
     "inline" definition with external linkage provides no external definition
     in C99 and may not contain the static local used here. */
  double uclock_sec(void) {
    /* clock ticks per second, queried once on the first call */
    static double ttclk = 0.;
    struct tms t;
    if (ttclk == 0.)
      ttclk = sysconf(_SC_CLK_TCK);
    times(&t);
    /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
    return ((double)t.tms_utime) / ttclk;
  }
 # else
  /* uclock_sec(): processor time reported by clock(), in seconds */
  double uclock_sec(void)
 { return (double)clock()/(double)CLOCKS_PER_SEC; }
 #endif
 /* convolution routine: filters len input samples from X into Y using the given setup; returns an int count (see the slow_conv_* / fast_conv implementations) */
 typedef int            (*pfnConvolution)  (void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush);
 /* creates a setup object from the filter coefficients Hfwd (Nf taps); BlkLen is passed by pointer */
 typedef void*          (*pfnConvSetup)    (float *Hfwd, int Nf, int * BlkLen, int flags);
 /* returns the convolution routine stored in a setup object */
 typedef pfnConvolution (*pfnGetConvFnPtr) (void * setup);
 /* releases a setup object */
 typedef void           (*pfnConvDestroy)  (void * setup);
 /* setup object used by the slow_conv_* reference implementations; created by convSetupRev() */
 struct ConvSetup
 {
  /* convolution routine handed out by ConvGetFnPtrRev(); convSetupRev() stores NULL */
  pfnConvolution pfn;
  /* number of filter taps */
  int N;
  /* block length as passed to convSetupRev() */
  int B;
  /* filter taps in reversed order, followed by 4 INVALID_FLOAT_VAL sentinels */
  float * H;
  /* PFFASTCONV_* flags */
  int flags;
 };
 /* Creates the setup for the slow_conv_* reference implementations.
  * The N taps of H are stored in reversed order; with
  * PFFASTCONV_CPLX_INP_OUT every tap is stored twice (once for the real
  * and once for the imaginary slot). Four INVALID_FLOAT_VAL sentinels
  * follow the taps to make reads past the end of the filter visible.
  * *BlkLen is only copied into the setup.
  * Returns the new setup, to be released with convDestroyRev(), or NULL
  * when an allocation fails (assumes pffastconv_malloc() reports failure
  * by returning NULL - TODO confirm). */
 void * convSetupRev( float * H, int N, int * BlkLen, int flags )
 {
  struct ConvSetup * s = pffastconv_malloc( sizeof(struct ConvSetup) );
  int i, Nr = N;
  if (!s)
    return NULL;
  if (flags & PFFASTCONV_CPLX_INP_OUT)
    Nr *= 2;
  /* room for the sentinels behind the taps */
  Nr += 4;
  s->pfn = NULL;
  s->N = N;
  s->B = *BlkLen;
  s->H = pffastconv_malloc((unsigned)Nr * sizeof(float));
  s->flags = flags;
  if (!s->H)
  {
    pffastconv_free(s);
    return NULL;
  }
  memset(s->H, 0, (unsigned)Nr * sizeof(float));
  if (flags & PFFASTCONV_CPLX_INP_OUT)
  {
    for ( i = 0; i < N; ++i ) {
      s->H[2*(N-1 -i)  ] = H[i];
      s->H[2*(N-1 -i)+1] = H[i];
    }
  }
  else
  {
    for ( i = 0; i < N; ++i )
      s->H[ N-1 -i ] = H[i];
  }
  /* simpler detection of overruns: the last 4 floats are the sentinels */
  for ( i = Nr - 4; i < Nr; ++i )
    s->H[i] = INVALID_FLOAT_VAL;
  return s;
 }
 /* Releases a setup created by convSetupRev(): first the tap buffer, then
  * the setup itself. A NULL setup is ignored, matching ConvGetFnPtrRev(). */
 void convDestroyRev( void * setup )
 {
  struct ConvSetup * s = (struct ConvSetup*)setup;
  if (!s)
    return;
  pffastconv_free(s->H);
  pffastconv_free(setup);
 }
 /* Returns the convolution routine stored in the setup, or NULL for a NULL setup. */
 pfnConvolution ConvGetFnPtrRev( void * setup )
 {
  struct ConvSetup * cfg = (struct ConvSetup*)setup;
  if (cfg)
    return cfg->pfn;
  return NULL;
 }
 /* Releases a setup by forwarding to convDestroyRev(). */
 void convSimdDestroy( void * setup )
 {
  convDestroyRev(setup);
 }
 /* Creates a pffastconv setup via pffastconv_new_setup(). When that fails,
  * the requested parameters are printed to stdout and NULL is returned. */
 void * fastConvSetup( float * H, int N, int * BlkLen, int flags )
 {
  void * setup = pffastconv_new_setup( H, N, BlkLen, flags );
  if (setup)
    return setup;
  printf("fastConvSetup(N = %d, *BlkLen = %d, flags = %d) = NULL\n", N, *BlkLen, flags);
  return NULL;
 }
 /* Releases a setup created by fastConvSetup() by forwarding to pffastconv_destroy_setup(). */
 void fastConvDestroy( void * setup )
 {
  pffastconv_destroy_setup( (PFFASTCONV_Setup*)setup );
 }
 int slow_conv_R(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
 {
  struct ConvSetup * p = (struct ConvSetup*)setup;
  const float * RESTRICT X = input;
  const float * RESTRICT Hrev = p->H;
  float * RESTRICT Y = output;
  const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
  const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
  int i, j;
  (void)Yref;
  (void)applyFlush;
  if (p->flags & PFFASTCONV_CPLX_INP_OUT)
  {
    for ( i = 0; i <= lenNr; i += 2 )
    {
      float sumRe = 0.0F, sumIm = 0.0F;
      for ( j = 0; j < Nr; j += 2 )
      {
        sumRe += X[i+j  ] * Hrev[j];
        sumIm += X[i+j+1] * Hrev[j+1];
      }
      Y[i  ] = sumRe;
      Y[i+1] = sumIm;
    }
    return i/2;
  }
  else
  {
    for ( i = 0; i <= lenNr; ++i )
    {
      float sum = 0.0F;
      for (j = 0; j < Nr; ++j )
        sum += X[i+j]   * Hrev[j];
      Y[i] = sum;
    }
    return i;
  }
 }
 /* Reference convolution using four partial sums per output value.
  * setup is a struct ConvSetup holding the reversed taps; len counts input
  * samples (complex samples when PFFASTCONV_CPLX_INP_OUT is set).
  * Tap counts that are not a multiple of 4 floats are handled by a separate
  * tail loop. Yref and applyFlush are unused.
  * Returns the number of output samples written (complex samples in complex
  * mode), or 0 when len is smaller than the filter length. */
 int slow_conv_A(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
 {
  float sum[4];
  struct ConvSetup * p = (struct ConvSetup*)setup;
  const float * RESTRICT X = input;
  const float * RESTRICT Hrev = p->H;
  float * RESTRICT Y = output;
  /* number of tap floats: 2 per tap in complex mode */
  const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
  /* float index of the last output sample */
  const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
  int i, j;
  (void)Yref;
  (void)applyFlush;
  if (p->flags & PFFASTCONV_CPLX_INP_OUT)
  {
    if ( (Nr & 3) == 0 )
    {
      /* complex, tap floats divisible by 4: sum[0]/sum[2] collect real parts, sum[1]/sum[3] imaginary parts */
      for ( i = 0; i <= lenNr; i += 2 )
      {
        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
        for (j = 0; j < Nr; j += 4 )
        {
          sum[0] += X[i+j]   * Hrev[j];
          sum[1] += X[i+j+1] * Hrev[j+1];
          sum[2] += X[i+j+2] * Hrev[j+2];
          sum[3] += X[i+j+3] * Hrev[j+3];
        }
        Y[i  ] = sum[0] + sum[2];
        Y[i+1] = sum[1] + sum[3];
      }
    }
    else
    {
      /* complex with a remainder: M floats go through the 4-way loop, the rest through the tail loop */
      const int M = Nr & (~3);
      for ( i = 0; i <= lenNr; i += 2 )
      {
        float tailSumRe = 0.0F, tailSumIm = 0.0F;
        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
        for (j = 0; j < M; j += 4 )
        {
          sum[0] += X[i+j  ] * Hrev[j  ];
          sum[1] += X[i+j+1] * Hrev[j+1];
          sum[2] += X[i+j+2] * Hrev[j+2];
          sum[3] += X[i+j+3] * Hrev[j+3];
        }
        for ( ; j < Nr; j += 2 ) {
          tailSumRe += X[i+j  ] * Hrev[j  ];
          tailSumIm += X[i+j+1] * Hrev[j+1];
        }
        Y[i  ] = ( sum[0] + sum[2] ) + tailSumRe;
        Y[i+1] = ( sum[1] + sum[3] ) + tailSumIm;
      }
    }
    /* i counts floats: two per complex output sample */
    return i/2;
  }
  else
  {
    if ( (Nr & 3) == 0 )
    {
      /* real, tap count divisible by 4 */
      for ( i = 0; i <= lenNr; ++i )
      {
        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
        for (j = 0; j < Nr; j += 4 )
        {
          sum[0] += X[i+j]   * Hrev[j];
          sum[1] += X[i+j+1] * Hrev[j+1];
          sum[2] += X[i+j+2] * Hrev[j+2];
          sum[3] += X[i+j+3] * Hrev[j+3];
        }
        Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
      }
      return i;
    }
    else
    {
      /* real with a remainder: M taps go through the 4-way loop, the rest through the tail loop */
      const int M = Nr & (~3);
      /* printf("A: Nr = %d, M = %d, H[M] = %f, H[M+1] = %f, H[M+2] = %f, H[M+3] = %f\n", Nr, M, Hrev[M], Hrev[M+1], Hrev[M+2], Hrev[M+3] ); */
      for ( i = 0; i <= lenNr; ++i )
      {
        float tailSum = 0.0;
        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
        for (j = 0; j < M; j += 4 )
        {
          sum[0] += X[i+j]   * Hrev[j];
          sum[1] += X[i+j+1] * Hrev[j+1];
          sum[2] += X[i+j+2] * Hrev[j+2];
          sum[3] += X[i+j+3] * Hrev[j+3];
        }
        for ( ; j < Nr; ++j )
          tailSum += X[i+j] * Hrev[j];
        Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
      }
      return i;
    }
  }
 }
 /* Reference convolution using four partial sums per output value, with an
  * extra code path for filters flagged PFFASTCONV_SYMMETRIC.
  * setup is a struct ConvSetup holding the reversed taps; len counts input
  * samples (complex samples when PFFASTCONV_CPLX_INP_OUT is set).
  * Yref and applyFlush are unused.
  * Returns the number of output samples written (complex samples in complex
  * mode), or 0 when len is smaller than the filter length. */
 int slow_conv_B(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
 {
  float sum[4];
  struct ConvSetup * p = (struct ConvSetup*)setup;
  (void)Yref;
  (void)applyFlush;
  if (p->flags & PFFASTCONV_SYMMETRIC)
  {
    /* symmetric filter: only the first half of the taps is read; each tap
       multiplies the sum of the two input samples that share it.
       NOTE(review): the loop bounds look like they expect Nr to be a multiple of 8 - confirm */
    const float * RESTRICT X = input;
    const float * RESTRICT Hrev = p->H;
    float * RESTRICT Y = output;
    const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
    const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
    /* start index of the last group of 4 floats within the first half of the taps */
    const int h = Nr / 2 -4;
    /* start index of the last group of 4 floats of the whole filter */
    const int E = Nr -4;
    int i, j;
    if (p->flags & PFFASTCONV_CPLX_INP_OUT)
    {
      for ( i = 0; i <= lenNr; i += 2 )
      {
        const int k = i + E;
        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
        for (j = 0; j <= h; j += 4 )
        {
          /* mirrored partner keeps the re/im slot: re pairs with re, im with im */
          sum[0] += Hrev[j  ] * ( X[i+j  ] + X[k-j+2] );
          sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+3] );
          sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j  ] );
          sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j+1] );
        }
        Y[i  ] = sum[0] + sum[2];
        Y[i+1] = sum[1] + sum[3];
      }
      /* i counts floats: two per complex output sample */
      return i/2;
    }
    else
    {
      for ( i = 0; i <= lenNr; ++i )
      {
        const int k = i + E;
        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
        for (j = 0; j <= h; j += 4 )
        {
          sum[0] += Hrev[j  ] * ( X[i+j  ] + X[k-j+3] );
          sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+2] );
          sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j+1] );
          sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j  ] );
        }
        Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
      }
      return i;
    }
  }
  else
  {
    const float * RESTRICT X = input;
    const float * RESTRICT Hrev = p->H;
    float * RESTRICT Y = output;
    const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
    const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
    int i, j;
    if (p->flags & PFFASTCONV_CPLX_INP_OUT)
    {
      /* NOTE(review): unlike slow_conv_A there is no tail loop here, so with
         Nr not a multiple of 4 the last iteration reads Hrev[Nr] and Hrev[Nr+1] - confirm callers avoid that case */
      for ( i = 0; i <= lenNr; i += 2 )
      {
        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
        for (j = 0; j < Nr; j += 4 )
        {
          sum[0] += X[i+j]   * Hrev[j];
          sum[1] += X[i+j+1] * Hrev[j+1];
          sum[2] += X[i+j+2] * Hrev[j+2];
          sum[3] += X[i+j+3] * Hrev[j+3];
        }
        Y[i  ] = sum[0] + sum[2];
        Y[i+1] = sum[1] + sum[3];
      }
      /* i counts floats: two per complex output sample */
      return i/2;
    }
    else
    {
      if ( (Nr & 3) == 0 )
      {
        /* real, tap count divisible by 4 */
        for ( i = 0; i <= lenNr; ++i )
        {
          sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
          for (j = 0; j < Nr; j += 4 )
          {
            sum[0] += X[i+j]   * Hrev[j];
            sum[1] += X[i+j+1] * Hrev[j+1];
            sum[2] += X[i+j+2] * Hrev[j+2];
            sum[3] += X[i+j+3] * Hrev[j+3];
          }
          Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]);
        }
        return i;
      }
      else
      {
        /* real with a remainder: M taps go through the 4-way loop, the rest through the tail loop */
        const int M = Nr & (~3);
        /* printf("B: Nr = %d\n", Nr ); */
        for ( i = 0; i <= lenNr; ++i )
        {
          float tailSum = 0.0;
          sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
          for (j = 0; j < M; j += 4 )
          {
            sum[0] += X[i+j]   * Hrev[j];
            sum[1] += X[i+j+1] * Hrev[j+1];
            sum[2] += X[i+j+2] * Hrev[j+2];
            sum[3] += X[i+j+3] * Hrev[j+3];
          }
          for ( ; j < Nr; ++j )
            tailSum += X[i+j] * Hrev[j];
          Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
        }
        return i;
      }
    }
  }
 }
 int fast_conv(void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush)
 {
  (void)Yref;
  return pffastconv_apply( (PFFASTCONV_Setup*)setup, X, len, Y, applyFlush );
 }
 /*
  * Debug helper meant to dump the first N values of V, labelled with st,
  * starting a new output line every perLine values.
  *
  * The dump is compiled out (#if 0), so the function currently does nothing;
  * the arguments are only referenced to silence unused-parameter warnings.
  */
 void printFirst( const float * V, const char * st, const int N, const int perLine )
 {
 #if 0
  int idx;
  for ( idx = 0; idx < N; ++idx )
  {
    if ( (idx % perLine) == 0 )
      printf("\n%s[%d]", st, idx);
    printf("\t%.1f", V[idx]);
  }
  printf("\n");
 #else
  /* dump disabled: intentionally a no-op */
  (void)V;
  (void)st;
  (void)N;
  (void)perLine;
 #endif
 }
 #define NUMY       15
 /*
  * Run all enabled convolution variants for one filter length, either as an
  * output-length check or as a timed benchmark.
  *
  * FILTERLEN          : number of filter coefficients (H holds FILTERLEN floats)
  * convFlags          : PFFASTCONV_* flags, passed to every setup function;
  *                      PFFASTCONV_CPLX_INP_OUT selects interleaved complex test
  *                      input, PFFASTCONV_SYMMETRIC selects symmetric coefficients
  * testOutLen         : non-zero: one call per algorithm over the whole input with
  *                      applyFlush = 1, then output sizes are compared against
  *                      algorithm 0; zero: block-wise calls are repeated for
  *                      BENCH_TEST_DURATION_IN_SEC seconds and throughput is measured
  * printDbg           : non-zero prints per-algorithm timing and size details
  * printSpeed         : non-zero prints the speed summary / benchmark rows
  * abortFirstFastAlgo : non-zero runs only the first eligible fast algorithm
  * printErrValues     : non-zero prints up to 6 samples whose error reaches the limit
  * printAsCSV         : non-zero prints benchmark rows as comma separated values
  * pIsFirstFilterLen  : in/out; when non-zero (and printAsCSV) the CSV column
  *                      header is printed and the flag is cleared
  *
  * Returns 1 when, in testOutLen mode, an algorithm's output size differs from
  * the reference output size; otherwise 0.  Value differences are only reported,
  * they do not influence the return value.
  */
 int test(int FILTERLEN, int convFlags, const int testOutLen, int printDbg, int printSpeed, int abortFirstFastAlgo, int printErrValues, int printAsCSV, int *pIsFirstFilterLen) {
  /* td / tdref start at 0 so they are never read uninitialized when a setup fails */
  double t0, t1, tstop, td = 0.0, tdref = 0.0;
  float *X, *H;
  float *Y[NUMY];
  int64_t outN[NUMY];
  /* 256 KFloats or 16 MFloats data */
 #if 1
  const int len = testOutLen ? (1 << 18) : (1 << 24);
 #elif 0
  const int len = testOutLen ? (1 << 18) : (1 << 13);
 #else
  const int len = testOutLen ? (1 << 18) : (1024);
 #endif
  const int cplxFactor = ( convFlags & PFFASTCONV_CPLX_INP_OUT ) ? 2 : 1;
  const int lenC = len / cplxFactor;
  int yi, yc, posMaxErr;
  float yRangeMin, yRangeMax, yErrLimit, maxErr = 0.0;
  int i, j, numErrOverLimit, iter;
  int retErr = 0;
  /* Per-algorithm tables. Only 14 entries are listed; remaining slots up to NUMY are zero-initialized and therefore never run. */
  /*                                  0               1               2               3                   4                   5                   6                   7                   8                      9,                   10,                  11,                   12,                   13                     */
  pfnConvSetup   aSetup[NUMY]     = { convSetupRev,   convSetupRev,   convSetupRev,   fastConvSetup,      fastConvSetup,      fastConvSetup,      fastConvSetup,      fastConvSetup,      fastConvSetup,         fastConvSetup,       fastConvSetup,       fastConvSetup,        fastConvSetup,        fastConvSetup,         };
  pfnConvDestroy aDestroy[NUMY]   = { convDestroyRev, convDestroyRev, convDestroyRev, fastConvDestroy,    fastConvDestroy,    fastConvDestroy,    fastConvDestroy,    fastConvDestroy,    fastConvDestroy,       fastConvDestroy,     fastConvDestroy,     fastConvDestroy,      fastConvDestroy,      fastConvDestroy,       };
  pfnGetConvFnPtr aGetFnPtr[NUMY] = { NULL,           NULL,           NULL,           NULL,               NULL,               NULL,               NULL,               NULL,               NULL,                  NULL,                NULL,                NULL,                 NULL,                 NULL,                  };
  pfnConvolution aConv[NUMY]      = { slow_conv_R,    slow_conv_A,    slow_conv_B,    fast_conv,          fast_conv,          fast_conv,          fast_conv,          fast_conv,          fast_conv,             fast_conv,           fast_conv,           fast_conv,            fast_conv,            fast_conv,             };
  const char * convText[NUMY]     = { "R(non-simd)",  "A(non-simd)",  "B(non-simd)",  "fast_conv_64",     "fast_conv_128",    "fast_conv_256",    "fast_conv_512",    "fast_conv_1K",     "fast_conv_2K",        "fast_conv_4K",      "fast_conv_8K",      "fast_conv_16K",      "fast_conv_32K",      "fast_conv_64K",       };
  int    aFastAlgo[NUMY]          = { 0,              0,              0,              1,                  1,                  1,                  1,                  1,                  1,                     1,                   1,                   1,                    1,                    1,                     };
  void * aSetupCfg[NUMY]          = { NULL,           NULL,           NULL,           NULL,               NULL,               NULL,               NULL,               NULL,               NULL,                  NULL,                NULL,                NULL,                 NULL,                 NULL,                  };
 //int    aBlkLen[NUMY]            = { 1024,           1024,           1024,           64,                 128,                256,                512,                1024,               2048,                  4096,                8192,                16384,                32768,                65536,                 };
  int    aBlkLen[NUMY]            = { 8192,           8192,           8192,           64,                 128,                256,                512,                1024,               2048,                  4096,                8192,                16384,                32768,                65536,                 };
 #if 1
  int    aRunAlgo[NUMY]           = { 1,              1,              1,              FILTERLEN<64,       FILTERLEN<128,      FILTERLEN<256,      FILTERLEN<512,      FILTERLEN<1024,     FILTERLEN<2048,        FILTERLEN<4096,      FILTERLEN<8192,      FILTERLEN<16384,      FILTERLEN<32768,      FILTERLEN<65536,       };
 #elif 0
  int    aRunAlgo[NUMY]           = { 1,              0,              0,              0 && FILTERLEN<64,  1 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048,  0 && FILTERLEN<4096, 0 && FILTERLEN<8192, 0 && FILTERLEN<16384, 0 && FILTERLEN<32768, 0 && FILTERLEN<65536,  };
 #else
  int    aRunAlgo[NUMY]           = { 1,              1,              1,              0 && FILTERLEN<64,  0 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048,  0 && FILTERLEN<4096, 0 && FILTERLEN<8192, 0 && FILTERLEN<16384, 0 && FILTERLEN<32768, 0 && FILTERLEN<65536,  };
 #endif
  double aSpeedFactor[NUMY], aDuration[NUMY], procSmpPerSec[NUMY];
  int aNumIters[NUMY], aNumLoops[NUMY];
  /* 4 extra floats behind the input are filled with INVALID_FLOAT_VAL below */
  X = pffastconv_malloc( (unsigned)(len+4) * sizeof(float) );
  for ( i=0; i < NUMY; ++i)
  {
    if ( 1 || i < 2 )
      Y[i] = pffastconv_malloc( (unsigned)len * sizeof(float) );
    else
      Y[i] = Y[1];
    Y[i][0] = 123.F;  /* test for pffft_zconvolve_no_accu() */
    aSpeedFactor[i] = -1.0;
    aDuration[i] = -1.0;
    procSmpPerSec[i] = -1.0;
    aNumIters[i] = 0;
    aNumLoops[i] = 0;
  }
  H = pffastconv_malloc((unsigned)FILTERLEN * sizeof(float));
  /* initialize input */
  if ( convFlags & PFFASTCONV_CPLX_INP_OUT )
  {
    for ( i = 0; i < lenC; ++i )
    {
      X[2*i  ] = (float)(i % 4093);  /* 4093 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
      X[2*i+1] = (float)((i+2048) % 4093);
    }
  }
  else
  {
    for ( i = 0; i < len; ++i )
      X[i] = (float)(i % 4093);  /* 4093 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
  }
  X[ len    ] = INVALID_FLOAT_VAL;
  X[ len +1 ] = INVALID_FLOAT_VAL;
  X[ len +2 ] = INVALID_FLOAT_VAL;
  X[ len +3 ] = INVALID_FLOAT_VAL;
  if (!testOutLen)
    printFirst( X, "X", 64, 8 );
  /* filter coeffs */
  memset( H, 0, FILTERLEN * sizeof(float) );
 #if 1
  if ( convFlags & PFFASTCONV_SYMMETRIC )
  {
    /* mirror the -1 / 1 / 0.5 pattern around the filter center */
    const int half = FILTERLEN / 2;
    for ( j = 0; j < half; ++j ) {
      switch (j % 3) {
        case 0: H[j] = H[FILTERLEN-1-j] = -1.0F;  break;
        case 1: H[j] = H[FILTERLEN-1-j] =  1.0F;  break;
        case 2: H[j] = H[FILTERLEN-1-j] =  0.5F;  break;
      }
    }
  }
  else
  {
    for ( j = 0; j < FILTERLEN; ++j ) {
      switch (j % 3) {
        case 0: H[j] = -1.0F;  break;
        case 1: H[j] = 1.0F;   break;
        case 2: H[j] = 0.5F;   break;
      }
    }
  }
 #else
  H[0] = 1.0F;
  H[FILTERLEN -1] = 1.0F;
 #endif
  if (!testOutLen)
    printFirst( H, "H", FILTERLEN, 8 );
  if (!printAsCSV)
  {
    printf("\n");
    printf("filterLen = %d\t%s%s\t%s:\n", FILTERLEN,
      ((convFlags & PFFASTCONV_CPLX_INP_OUT)?"cplx":"real"),
      (convFlags & PFFASTCONV_CPLX_INP_OUT)?((convFlags & PFFASTCONV_CPLX_SINGLE_FFT)?" single":" 2x") : "",
      ((convFlags & PFFASTCONV_SYMMETRIC)?"symmetric":"non-sym") );
  }
  int hadFastAlgo = 0;
  /* single pass; the loop only exists so that 'break' can leave the block */
  while (1)
  {
    for ( yi = 0; yi < NUMY; ++yi )
    {
      if (!aRunAlgo[yi])
        continue;
      if ( aFastAlgo[yi] && abortFirstFastAlgo && hadFastAlgo )
      {
        /* a fast algorithm already ran: disable all further fast ones */
        aRunAlgo[yi] = 0;
        continue;
      }
      hadFastAlgo = hadFastAlgo | aFastAlgo[yi];
      aSetupCfg[yi] = aSetup[yi]( H, FILTERLEN, &aBlkLen[yi], convFlags );
      /* get effective apply function ptr */
      if ( aSetupCfg[yi] && aGetFnPtr[yi] )
        aConv[yi] = aGetFnPtr[yi]( aSetupCfg[yi] );
      if ( aSetupCfg[yi] && aConv[yi] )
      {
        if (testOutLen)
        {
          t0 = uclock_sec();
          outN[yi] = aConv[yi]( aSetupCfg[yi], X, lenC, Y[yi], Y[0], 1 /* applyFlush */ );
          t1 = uclock_sec();
          td = t1 - t0;
        }
        else
        {
          //const int blkLen = 4096;  /* required for 'fast_conv_4K' */
          const int blkLen = aBlkLen[yi];
          int64_t offC = 0, offS, Nout;
          int k;
          iter = 0;
          outN[yi] = 0;
          aNumLoops[yi] = 1;
          t0 = uclock_sec();
          tstop = t0 + BENCH_TEST_DURATION_IN_SEC;
          do
          {
            const int prev_iter = iter;
            /* read the clock only once per (up to) 128 block calls */
            for ( k = 0; k < 128 && offC +blkLen < lenC; ++k )
            {
              offS = cplxFactor * offC;
              Nout = aConv[yi]( aSetupCfg[yi], X +offS, blkLen, Y[yi] +offS, Y[0], 0 /* applyFlush */ );
              offC += Nout;
              ++iter;
              if ( !Nout )
                break;
            }
            //if ( !Nout )
            //  break;
            t1 = uclock_sec();
            if ( prev_iter == iter )    // restart from begin of input?
            {
                offC = 0;
                ++aNumLoops[yi];
            }
          } while ( t1 < tstop );
          outN[yi] = offC;
          td = t1 - t0;
          procSmpPerSec[yi] = cplxFactor * (double)outN[yi] * (1.0 / td);
          aNumIters[yi] = iter;
          aDuration[yi] = td;
          //printf("algo '%s':\t%.2f MSmp\tin\t%.1f ms\t= %g kSmpPerSec\t%d iters\t%.1f ms\n",
          //  convText[yi], (double)outN[yi]/(1000.0 * 1000.0), 1000.0 * aDuration[yi], procSmpPerSec[yi] * 0.001, aNumIters[yi], 1000.0 * td );
        }
      }
      else
      {
        outN[yi] = 0;
      }
      if ( yi == 0 ) {
        /* algorithm 0 is the reference: remember its duration and value range */
        const float * Yvals = Y[0];
        const int64_t refOutLen = cplxFactor * outN[0];
        tdref = td;
        if (printDbg) {
          printf("convolution '%s' took: %f ms\n", convText[yi], td*1000.0);
          printf("  convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
        }
        aSpeedFactor[yi] = 1.0;
        /* FLT_MIN is the smallest POSITIVE float; the maximum search must start at -FLT_MAX */
        yRangeMin = FLT_MAX;
        yRangeMax = -FLT_MAX;
        for ( i = 0; i < refOutLen; ++i )
        {
          if ( yRangeMax < Yvals[i] )  yRangeMax = Yvals[i];
          if ( yRangeMin > Yvals[i] )  yRangeMin = Yvals[i];
        }
        /* error limit is 1/100000 of the reference output range */
        yErrLimit = fabsf(yRangeMax - yRangeMin) / ( 100.0F * 1000.0F );
        /* yErrLimit = 0.01F; */
        if (testOutLen) {
          if (1) {
            printf("reference output len = %" PRId64 " smp\n", outN[0]);
            printf("reference output range |%.1f ..%.1f| = %.1f ==> err limit = %f\n", yRangeMin, yRangeMax, yRangeMax - yRangeMin, yErrLimit);
          }
          printFirst( Yvals, "Yref", 64, 8 );
        }
      }
      else
      {
        aSpeedFactor[yi] = tdref / td;
        if (printDbg) {
          printf("\nconvolution '%s' took: %f ms == %f %% == %f X\n", convText[yi], td*1000.0, td * 100 / tdref, tdref / td);
          printf("  convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
        }
      }
    }
    int iMaxSpeedSlowAlgo = -1;
    int iFirstFastAlgo = -1;
    int iMaxSpeedFastAlgo = -1;
    int iPrintedRefOutLen = 0;
    {
      /* find the fastest slow / fast algorithm (reference 0 is excluded) */
      for ( yc = 1; yc < NUMY; ++yc )
      {
        if (!aRunAlgo[yc])
          continue;
        if (aFastAlgo[yc]) {
          if ( iMaxSpeedFastAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedFastAlgo] )
            iMaxSpeedFastAlgo = yc;
          if (iFirstFastAlgo < 0)
            iFirstFastAlgo = yc;
        }
        else
        {
          if ( iMaxSpeedSlowAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedSlowAlgo] )
            iMaxSpeedSlowAlgo = yc;
        }
      }
      if (printSpeed)
      {
        if (testOutLen)
        {
          if (iMaxSpeedSlowAlgo >= 0 )
            printf("fastest slow algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedSlowAlgo], aSpeedFactor[iMaxSpeedSlowAlgo], 1000.0 * aDuration[iMaxSpeedSlowAlgo]);
          if (0 != iMaxSpeedSlowAlgo && aRunAlgo[0])
            printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[0], aSpeedFactor[0], 1000.0 * aDuration[0]);
          if (1 != iMaxSpeedSlowAlgo && aRunAlgo[1])
            printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[1], aSpeedFactor[1], 1000.0 * aDuration[1]);
          if (iFirstFastAlgo >= 0 && iFirstFastAlgo != iMaxSpeedFastAlgo && aRunAlgo[iFirstFastAlgo])
            printf("first   fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo],    aSpeedFactor[iFirstFastAlgo],    1000.0 * aDuration[iFirstFastAlgo]);
          if (iFirstFastAlgo >= 0 && iFirstFastAlgo+1 != iMaxSpeedFastAlgo && iFirstFastAlgo+1 < NUMY && aRunAlgo[iFirstFastAlgo+1])
            printf("2nd     fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo+1],  aSpeedFactor[iFirstFastAlgo+1],  1000.0 * aDuration[iFirstFastAlgo+1]);
          if ( 0 <= iMaxSpeedFastAlgo && iMaxSpeedFastAlgo < NUMY && aRunAlgo[iMaxSpeedFastAlgo] )
          {
            printf("fastest fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedFastAlgo], aSpeedFactor[iMaxSpeedFastAlgo], 1000.0 * aDuration[iMaxSpeedFastAlgo]);
            if ( 0 <= iMaxSpeedSlowAlgo && iMaxSpeedSlowAlgo < NUMY && aRunAlgo[iMaxSpeedSlowAlgo] )
              printf("fast / slow ratio: %f X\n", aSpeedFactor[iMaxSpeedFastAlgo] / aSpeedFactor[iMaxSpeedSlowAlgo] );
          }
          printf("\n");
        }
        else
        {
          // print columns in 1st line
          if (printAsCSV && *pIsFirstFilterLen)
          {
            printf("\n# filterLen, filterOrder, Re/Cx, type, sym, ");
            for ( yc = 0; yc < NUMY; ++yc )
            {
              if (!aRunAlgo[yc] || procSmpPerSec[yc] <= 0.0)
                continue;
              if (printAsCSV)
                printf("%s, ", convText[yc]);
            }
            *pIsFirstFilterLen = 0;
          }
          for ( yc = 0; yc < NUMY; ++yc )
          {
            if (!yc)
            {
              /* row prefix: filter length and its base-2 logarithm */
              double filterExp = log10((double)FILTERLEN) / log10(2.0);
              printf("\n%5d, %5.1f, %s, %s, %s, ", FILTERLEN, filterExp,
                     ((convFlags & PFFASTCONV_CPLX_INP_OUT)?"cplx":"real"),
                     (convFlags & PFFASTCONV_CPLX_INP_OUT)?((convFlags & PFFASTCONV_CPLX_SINGLE_FFT)?" single":" 2x") : "",
                     ((convFlags & PFFASTCONV_SYMMETRIC)?"symmetric":"non-sym")
                     );
            }
            if (!aRunAlgo[yc] || procSmpPerSec[yc] <= 0.0)
              continue;
            if (printAsCSV)
              printf("%.0f, ", procSmpPerSec[yc] * 0.001);
            else
              printf("algo '%s':\t%.2f MSmp\tin\t%.1f ms\t= %g kSmpPerSec\t%d iters\t%d loops\n",
                     convText[yc], (double)outN[yc]/(1000.0 * 1000.0), 1000.0 * aDuration[yc], procSmpPerSec[yc] * 0.001, aNumIters[yc], aNumLoops[yc] );
          }
        }
      }
    }
    /* compare every executed algorithm against the reference output Y[0] */
    for ( yc = 1; yc < NUMY; ++yc )
    {
      const float * Yref;
      const float * Ycurr;
      int outMin;
      if (!aRunAlgo[yc])
        continue;
      if (printDbg)
        printf("\n");
      if ( outN[yc] == 0 )
      {
        if (!printAsCSV)
          printf("output size 0: '%s' not implemented\n", convText[yc]);
      }
      else if ( outN[0] != outN[yc] /* && aFastAlgo[yc] */ && testOutLen )
      {
        if (!iPrintedRefOutLen)
        {
          printf("reference output size = %" PRId64 ", delta to (cplx) input length = %" PRId64 " smp\n", outN[0], (len / cplxFactor) - outN[0]);
          iPrintedRefOutLen = 1;
        }
        printf("output size doesn't match!: ref (FILTERLEN %d) returned %" PRId64 " smp, '%s' returned %" PRId64 " smp : delta = %" PRId64 " smp\n",
          FILTERLEN, outN[0], convText[yc], outN[yc], outN[yc] - outN[0] );
        retErr = 1;
      }
      posMaxErr = 0;
      maxErr = -1.0;
      Yref = Y[0];
      Ycurr = Y[yc];
      outMin = ( outN[yc] < outN[0] ) ? outN[yc] : outN[0];
      numErrOverLimit = 0;
      for ( i = 0; i < outMin; ++i )
      {
        if ( numErrOverLimit < 6 && fabs(Ycurr[i] - Yref[i]) >= yErrLimit && printErrValues )
        {
          printf("algo '%s': at %d: ***ERROR*** = %f, errLimit = %f, ref = %f, actual = %f\n",
            convText[yc], i, fabs(Ycurr[i] - Yref[i]), yErrLimit, Yref[i], Ycurr[i] );
          ++numErrOverLimit;
        }
        if ( fabs(Ycurr[i] - Yref[i]) > maxErr )
        {
          maxErr = fabsf(Ycurr[i] - Yref[i]);
          posMaxErr = i;
        }
      }
      /* Compare against the algorithm index yc; 'i' is the sample index left over
       * from the loop above. The summary is suppressed in CSV mode, like the other
       * informational messages, to keep the CSV rows intact. */
      if ( printDbg || ( !printAsCSV && ( (iMaxSpeedSlowAlgo == yc) || (iMaxSpeedFastAlgo == yc) ) ) )
        printf("max difference for '%s' is %g at sample idx %d of max inp 4093-1 == %f %%\n", convText[yc], maxErr, posMaxErr, maxErr * 100.0 / 4092.0 );
    }
    break;
  }
  pffastconv_free(X);
  for ( i=0; i < NUMY; ++i)
  {
    if ( 1 || i < 2 )
      pffastconv_free( Y[i] );
    /* nothing to destroy for algorithms that never ran or whose setup failed */
    if (!aRunAlgo[i] || !aSetupCfg[i])
      continue;
    aDestroy[i]( aSetupCfg[i] );
  }
  pffastconv_free(H);
  return retErr;
 }
 /* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions */
 void validate_pffft_simd();
 int  validate_pffft_simd_ex(FILE * DbgOut);
 /*
  * Command line driver for the fast-convolution tests and benchmarks.
  *
  * Options:
  *   --test-simd  run validate_pffft_simd_ex() only and exit (1 on errors)
  *   --no-len     skip the output-length tests
  *   --no-bench   skip the benchmarks
  *   --quick      reduced set of filter lengths
  *   --slow       skip the short filter lengths (32 .. 128) in the benchmark
  *   --real       real input only
  *   --cplx       complex input only
  *   --sym        use symmetric filter coefficients
  * Any other argument prints the usage line and exits with status 1.
  *
  * Returns the OR-ed results of all test() calls (0 == success).
  */
 int main(int argc, char **argv)
 {
  int result = 0;
  int i, k, M, flagsA, flagsB, flagsC, testOutLen, printDbg, printSpeed;
  int testOutLens = 1, benchConv = 1, quickTest = 0, slowTest = 0;
  int testReal = 1, testCplx = 1, testSymetric = 0, abortFirstFastAlgo = 1, printErrValues = 0, printAsCSV = 1;
  int isFirstFilterLen = 1;
  for ( i = 1; i < argc; ++i ) {
    if (!strcmp(argv[i], "--test-simd")) {
      int numErrs = validate_pffft_simd_ex(stdout);
      fprintf( ( numErrs != 0 ? stderr : stdout ), "validate_pffft_simd_ex() returned %d errors!\n", numErrs);
      return ( numErrs > 0 ? 1 : 0 );
    }
    if (!strcmp(argv[i], "--no-len")) {
      testOutLens = 0;
    }
    else if (!strcmp(argv[i], "--no-bench")) {
      benchConv = 0;
    }
    else if (!strcmp(argv[i], "--quick")) {
      quickTest = 1;
    }
    else if (!strcmp(argv[i], "--slow")) {
      slowTest = 1;
    }
    else if (!strcmp(argv[i], "--real")) {
      testCplx = 0;
    }
    else if (!strcmp(argv[i], "--cplx")) {
      testReal = 0;
    }
    else if (!strcmp(argv[i], "--sym")) {
      testSymetric = 1;
    }
    else /* if (!strcmp(argv[i], "--help")) */ {
      printf("usage: %s [--test-simd] [--no-len] [--no-bench] [--quick|--slow] [--real|--cplx] [--sym]\n", argv[0]);
      exit(1);
    }
  }
  if (testOutLens)
  {
    /* k = 0: real, k = 1: complex with 2 FFTs, k = 2: complex with single FFT */
    for ( k = 0; k < 3; ++k )
    {
      if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
        continue;
      printf("\n\n==========\n");
      printf("testing %s %s output lengths ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
      printf("==========\n");
      flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
      flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
      /* Same flag selection as the benchmark section below: only pass k == 2
       * requests the single-FFT variant, so the run matches the banner above.
       * Previously flagsB was passed for every k, which repeated the "2x" pass
       * and never exercised the "single" variant. */
      flagsC = flagsB | ( k == 2 ? PFFASTCONV_CPLX_SINGLE_FFT : 0 );
      testOutLen = 1;
      printDbg = 0;
      printSpeed = 0;
      for ( M = 128 - 4; M <= (quickTest ? 128+16 : 256); ++M )
      {
        /* symmetric filters are only tested for lengths that are multiples of 16 */
        if ( (M % 16) != 0 && testSymetric )
          continue;
        result |= test(M, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, 0, &isFirstFilterLen);
      }
    }
  }
  if (benchConv)
  {
      printf("quickTest is %d\n", quickTest);
      printf("slowTest is %d\n", slowTest);
    for ( k = 0; k < 3; ++k )
    {
      if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
        continue;
      if (!printAsCSV)
      {
        printf("\n\n==========\n");
        printf("starting %s %s benchmark against linear convolutions ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
        printf("==========\n");
      }
      flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
      flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
      flagsC = flagsB | ( k == 2 ? PFFASTCONV_CPLX_SINGLE_FFT : 0 );
      testOutLen = 0;
      printDbg = 0;
      printSpeed = 1;
      /* short filter lengths: skipped with --slow, reduced to 64 with --quick */
      if (!slowTest) {
        if (!quickTest) {
          result |= test(32, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
          result |= test(32 + 16, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
        }
        result |= test(64, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
        if (!quickTest) {
          result |= test(64 + 32, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
          result |= test(128, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
        }
      }
      /* long filter lengths: skipped with --quick */
      if (!quickTest) {
        result |= test(128+ 64, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
        result |= test(256,     flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
        result |= test(256+128, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
        result |= test(512,     flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
        result |= test(1024,    flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
        result |= test(2048,    flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
        result |= test(4096,    flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
        result |= test(8192,    flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
        result |= test(16384,   flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
        result |= test(32768,   flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
      }
      if (printAsCSV)
        printf("\n");
    }
  }
  return result;
 }
--- a/pffft/test_pffft.c
+++ b/pffft/test_pffft.c
@@ -0,0 +1,371 @@
 /*
  Copyright (c) 2013 Julien Pommier.
  Small test for PFFFT
  How to build: 
  on linux, with fftw3:
  gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
  on macos, without fftw3:
  clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
  on macos, with fftw3:
  clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate
  as alternative: replace clang by gcc.
  on windows, with visual c++:
  cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
  build without SIMD instructions:
  gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm
 */
 #ifdef PFFFT_ENABLE_FLOAT
 #include "pffft.h"
 typedef float pffft_scalar;
 #else
 /*
 Note: adapted for double precision dynamic range version.
 */
 #include "pffft_double.h"
 typedef double pffft_scalar;
 #endif
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 #include <assert.h>
 #include <string.h>
 /* define own constants required to turn off g++ extensions .. */
 #ifndef M_PI
  #define M_PI    3.14159265358979323846  /* pi */
 #endif
 /* EXPECTED_DYN_RANGE in dB:
 * single precision float has 24 bits mantissa
 * => 24 Bits * 6 dB = 144 dB
 * allow a few dB tolerance (even 144 dB looks good on my PC)
 */
 #ifdef PFFFT_ENABLE_FLOAT
 #define EXPECTED_DYN_RANGE  140.0
 #else
 #define EXPECTED_DYN_RANGE  215.0
 #endif
 /* maximum allowed phase error in degree */
 #define DEG_ERR_LIMIT   1E-4
 /* maximum allowed magnitude error in amplitude (of 1.0 or 1.1) */
 #define MAG_ERR_LIMIT  1E-6
 #define PRINT_SPEC  0
 #define PWR2LOG(PWR)  ( (PWR) < 1E-30 ? 10.0*log10(1E-30) : 10.0*log10(PWR) )
 int test(int N, int cplx, int useOrdered) {
  int Nfloat = (cplx ? N*2 : N);
 #ifdef PFFFT_ENABLE_FLOAT
  pffft_scalar *X = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
  pffft_scalar *Y = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
  pffft_scalar *R = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
  pffft_scalar *Z = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
  pffft_scalar *W = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
 #else
  pffft_scalar *X = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
  pffft_scalar *Y = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
  pffft_scalar *R = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
  pffft_scalar *Z = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
  pffft_scalar *W = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
 #endif
  pffft_scalar amp = (pffft_scalar)1.0;
  double freq, dPhi, phi, phi0;
  double pwr, pwrCar, pwrOther, err, errSum, mag, expextedMag;
  int k, j, m, iter, kmaxOther, retError = 0;
 #ifdef PFFFT_ENABLE_FLOAT
  assert( pffft_is_power_of_two(N) );
  PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
 #else
  assert( pffftd_is_power_of_two(N) );
  PFFFTD_Setup *s = pffftd_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
 #endif
  assert(s);
  if (!s) {
    printf("Error setting up PFFFT!\n");
    return 1;
  }
  for ( k = m = 0; k < (cplx? N : (1 + N/2) ); k += N/16, ++m )
  {
    amp = (pffft_scalar)( ( (m % 3) == 0 ) ? 1.0 : 1.1 );
    freq = (k < N/2) ? ((double)k / N) : ((double)(k-N) / N);
    dPhi = 2.0 * M_PI * freq;
    if ( dPhi < 0.0 )
      dPhi += 2.0 * M_PI;
    iter = -1;
    while (1)
    {
      ++iter;
      if (iter)
        printf("bin %d: dphi = %f for freq %f\n", k, dPhi, freq);
      /* generate cosine carrier as time signal - start at defined phase phi0 */
      phi = phi0 = (m % 4) * 0.125 * M_PI;  /* have phi0 < 90 deg to be normalized */
      for ( j = 0; j < N; ++j )
      {
        if (cplx) {
          X[2*j] = amp * (pffft_scalar)cos(phi);  /* real part */
          X[2*j+1] = amp * (pffft_scalar)sin(phi);  /* imag part */
        }
        else
          X[j] = amp * (pffft_scalar)cos(phi);  /* only real part */
        /* phase increment .. stay normalized - cos()/sin() might degrade! */
        phi += dPhi;
        if ( phi >= M_PI )
          phi -= 2.0 * M_PI;
      }
      /* forward transform from X --> Y  .. using work buffer W */
 #ifdef PFFFT_ENABLE_FLOAT
      if ( useOrdered )
        pffft_transform_ordered(s, X, Y, W, PFFFT_FORWARD );
      else
      {
        pffft_transform(s, X, R, W, PFFFT_FORWARD );  /* use R for reordering */
        pffft_zreorder(s, R, Y, PFFFT_FORWARD ); /* reorder into Y[] for power calculations */
      }
 #else
      if ( useOrdered )
        pffftd_transform_ordered(s, X, Y, W, PFFFT_FORWARD );
      else
      {
        pffftd_transform(s, X, R, W, PFFFT_FORWARD );  /* use R for reordering */
        pffftd_zreorder(s, R, Y, PFFFT_FORWARD ); /* reorder into Y[] for power calculations */
      }
 #endif
      pwrOther = -1.0;
      pwrCar = 0;
      /* for positive frequencies: 0 to 0.5 * samplerate */
      /* and also for negative frequencies: -0.5 * samplerate to 0 */
      for ( j = 0; j < ( cplx ? N : (1 + N/2) ); ++j )
      {
        if (!cplx && !j)  /* special treatment for DC for real input */
          pwr = Y[j]*Y[j];
        else if (!cplx && j == N/2)  /* treat 0.5 * samplerate */
          pwr = Y[1] * Y[1];  /* despite j (for freq calculation) we have index 1 */
        else
          pwr = Y[2*j] * Y[2*j] + Y[2*j+1] * Y[2*j+1];
        if (iter || PRINT_SPEC)
          printf("%s fft %d:  pwr[j = %d] = %g == %f dB\n", (cplx ? "cplx":"real"), N, j, pwr, PWR2LOG(pwr) );
        if (k == j)
          pwrCar = pwr;
        else if ( pwr > pwrOther ) {
          pwrOther = pwr;
          kmaxOther = j;
        }
      }
      if ( PWR2LOG(pwrCar) - PWR2LOG(pwrOther) < EXPECTED_DYN_RANGE ) {
        printf("%s fft %d amp %f iter %d:\n", (cplx ? "cplx":"real"), N, amp, iter);
        printf("  carrier power  at bin %d: %g == %f dB\n", k, pwrCar, PWR2LOG(pwrCar) );
        printf("  carrier mag || at bin %d: %g\n", k, sqrt(pwrCar) );
        printf("  max other pwr  at bin %d: %g == %f dB\n", kmaxOther, pwrOther, PWR2LOG(pwrOther) );
        printf("  dynamic range: %f dB\n\n", PWR2LOG(pwrCar) - PWR2LOG(pwrOther) );
        retError = 1;
        if ( iter == 0 )
          continue;
      }
      if ( k > 0 && k != N/2 )
      {
        phi = atan2( Y[2*k+1], Y[2*k] );
        if ( fabs( phi - phi0) > DEG_ERR_LIMIT * M_PI / 180.0 )
        {
        retError = 1;
        printf("%s fft %d  bin %d amp %f : phase mismatch! phase = %f deg   expected = %f deg\n",
            (cplx ? "cplx":"real"), N, k, amp, phi * 180.0 / M_PI, phi0 * 180.0 / M_PI );
        }
      }
      expextedMag = cplx ? amp : ( (k == 0 || k == N/2) ? amp : (amp/2) );
      mag = sqrt(pwrCar) / N;
      if ( fabs(mag - expextedMag) > MAG_ERR_LIMIT )
      {
        retError = 1;
        printf("%s fft %d  bin %d amp %f : mag = %g   expected = %g\n", (cplx ? "cplx":"real"), N, k, amp, mag, expextedMag );
      }
      /* now convert spectrum back */
 #ifdef PFFFT_ENABLE_FLOAT
      if (useOrdered)
        pffft_transform_ordered(s, Y, Z, W, PFFFT_BACKWARD);
      else
        pffft_transform(s, R, Z, W, PFFFT_BACKWARD);
 #else
      if (useOrdered)
        pffftd_transform_ordered(s, Y, Z, W, PFFFT_BACKWARD);
      else
        pffftd_transform(s, R, Z, W, PFFFT_BACKWARD);
 #endif
      errSum = 0.0;
      for ( j = 0; j < (cplx ? (2*N) : N); ++j )
      {
        /* scale back */
        Z[j] /= N;
        /* square sum errors over real (and imag parts) */
        err = (X[j]-Z[j]) * (X[j]-Z[j]);
        errSum += err;
      }
      if ( errSum > N * 1E-7 )
      {
        retError = 1;
        printf("%s fft %d  bin %d : inverse FFT doesn't match original signal! errSum = %g ; mean err = %g\n", (cplx ? "cplx":"real"), N, k, errSum, errSum / N);
      }
      break;
    }
  }
 #ifdef PFFFT_ENABLE_FLOAT
  pffft_destroy_setup(s);
  pffft_aligned_free(X);
  pffft_aligned_free(Y);
  pffft_aligned_free(Z);
  pffft_aligned_free(R);
  pffft_aligned_free(W);
 #else
  pffftd_destroy_setup(s);
  pffftd_aligned_free(X);
  pffftd_aligned_free(Y);
  pffftd_aligned_free(Z);
  pffftd_aligned_free(R);
  pffftd_aligned_free(W);
 #endif
  return retError;
 }
 /* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions */
 void validate_pffft_simd();
 int  validate_pffft_simd_ex(FILE * DbgOut);
 void validate_pffftd_simd();
 int  validate_pffftd_simd_ex(FILE * DbgOut);
 /*
  * Test driver: checks the next-power-of-two / is-power-of-two helpers against
  * a reference table, then runs test() for all sizes 32 .. 65536 in the four
  * combinations complex/real and ordered/unordered.
  *
  * With "--test-simd" only the SIMD validation is run and its result returned
  * (1 when errors were reported, else 0).
  *
  * Returns 0 when every test passed, non-zero otherwise.
  */
 int main(int argc, char **argv)
 {
  int N, result, resN, resAll, i, k, resNextPw2, resIsPw2, resFFT;
  int inp_power_of_two[] = { 1, 2, 3, 4, 5, 6, 7, 8,  9, 511, 512,  513 };
  int ref_power_of_two[] = { 1, 2, 4, 4, 8, 8, 8, 8, 16, 512, 512, 1024 };
  /* table size as int: avoids comparing the signed index with an unsigned sizeof quotient */
  const int numPw2Cases = (int)(sizeof(inp_power_of_two)/sizeof(inp_power_of_two[0]));
  for ( i = 1; i < argc; ++i ) {
    if (!strcmp(argv[i], "--test-simd")) {
 #ifdef PFFFT_ENABLE_FLOAT
      int numErrs = validate_pffft_simd_ex(stdout);
 #else
      int numErrs = validate_pffftd_simd_ex(stdout);
 #endif
      fprintf( ( numErrs != 0 ? stderr : stdout ), "validate_pffft_simd_ex() returned %d errors!\n", numErrs);
      return ( numErrs > 0 ? 1 : 0 );
    }
  }
  resNextPw2 = 0;
  resIsPw2 = 0;
  for ( k = 0; k < numPw2Cases; ++k) {
 #ifdef PFFFT_ENABLE_FLOAT
    N = pffft_next_power_of_two(inp_power_of_two[k]);
 #else
    N = pffftd_next_power_of_two(inp_power_of_two[k]);
 #endif
    if (N != ref_power_of_two[k]) {
      resNextPw2 = 1;
      printf("pffft_next_power_of_two(%d) does deliver %d, which is not reference result %d!\n",
        inp_power_of_two[k], N, ref_power_of_two[k] );
    }
 #ifdef PFFFT_ENABLE_FLOAT
    result = pffft_is_power_of_two(inp_power_of_two[k]);
 #else
    result = pffftd_is_power_of_two(inp_power_of_two[k]);
 #endif
    /* an input is a power of two exactly when it equals its reference value */
    if (inp_power_of_two[k] == ref_power_of_two[k]) {
      if (!result) {
        resIsPw2 = 1;
        printf("pffft_is_power_of_two(%d) delivers false; expected true!\n", inp_power_of_two[k]);
      }
    } else {
      if (result) {
        resIsPw2 = 1;
        printf("pffft_is_power_of_two(%d) delivers true; expected false!\n", inp_power_of_two[k]);
      }
    }
  }
  if (!resNextPw2)
    printf("tests for pffft_next_power_of_two() succeeded successfully.\n");
  if (!resIsPw2)
    printf("tests for pffft_is_power_of_two() succeeded successfully.\n");
  resFFT = 0;
  for ( N = 32; N <= 65536; N *= 2 )
  {
    result = test(N, 1 /* cplx fft */, 1 /* useOrdered */);
    resN = result;
    resFFT |= result;
    result = test(N, 0 /* real fft */, 1 /* useOrdered */);
    resN |= result;
    resFFT |= result;
    result = test(N, 1 /* cplx fft */, 0 /* unordered + zreorder */);
    resN |= result;
    resFFT |= result;
    result = test(N, 0 /* real fft */, 0 /* unordered + zreorder */);
    resN |= result;
    resFFT |= result;
    if (!resN)
      printf("tests for size %d succeeded successfully.\n", N);
  }
  if (!resFFT) {
 #ifdef PFFFT_ENABLE_FLOAT
    printf("all pffft transform tests (FORWARD/BACKWARD, REAL/COMPLEX, float) succeeded successfully.\n");
 #else
    printf("all pffft transform tests (FORWARD/BACKWARD, REAL/COMPLEX, double) succeeded successfully.\n");
 #endif
  }
  resAll = resNextPw2 | resIsPw2 | resFFT;
  if (!resAll)
    printf("all tests succeeded successfully.\n");
  else
    printf("there are failed tests!\n");
  return resAll;
 }
--- a/pffft/test_pffft.cpp
+++ b/pffft/test_pffft.cpp
@@ -0,0 +1,377 @@
 /*
  Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
  Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
  Copyright (c) 2020  Hayati Ayguen ( h_ayguen@web.de )
  Small test & bench for PFFFT, comparing its performance with the scalar
  FFTPACK, FFTW, and Apple vDSP
  How to build:
  on linux, with fftw3:
  gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c
  test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
  on macos, without fftw3:
  clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c
  -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
  on macos, with fftw3:
  clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c
  test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f
  -framework Accelerate
  as alternative: replace clang by gcc.
  on windows, with visual c++:
  cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
  build without SIMD instructions:
  gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c
  fftpack.c -lm
 */
 #include "pffft.hpp"
 #include <assert.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
 /* define own constants required to turn off g++ extensions .. */
 #ifndef M_PI
  #define M_PI    3.14159265358979323846  /* pi */
 #endif
 /* maximum allowed phase error in degree */
 #define DEG_ERR_LIMIT 1E-4
 /* maximum allowed magnitude error in amplitude (of 1.0 or 1.1) */
 #define MAG_ERR_LIMIT 1E-6
 #define PRINT_SPEC 0
 #define PWR2LOG(PWR) ((PWR) < 1E-30 ? 10.0 * log10(1E-30) : 10.0 * log10(PWR))
 /*
  * Round-trip self test of pffft::Fft<T> for one transform length.
  *
  * For a set of carrier bins k (stepping by N/16) a cosine carrier (real T) or
  * a complex exponential (complex T) with known amplitude and start phase is
  * generated, transformed forward, and checked for
  *   - dynamic range: carrier bin power vs. strongest other bin
  *     (>= 215 dB for double scalars, >= 140 dB for float scalars),
  *   - phase at the carrier bin (DEG_ERR_LIMIT),
  *   - magnitude at the carrier bin (MAG_ERR_LIMIT),
  * then transformed back and compared against the input signal.
  *
  * N          : transform length; must be a power of two (asserted).
  * useOrdered : true  -> forward()/inverse() with canonically ordered spectrum,
  *              false -> forwardToInternalLayout()/inverseFromInternalLayout(),
  *                       with reorderSpectrum() only for the spectrum checks.
  * returns    : true if ANY check failed (error flag), false if all passed.
  */
 template<typename T>
 bool
 Ttest(int N, bool useOrdered)
 {
   typedef pffft::Fft<T> Fft;
   typedef typename pffft::Fft<T>::Scalar  FftScalar;
   typedef typename Fft::Complex FftComplex;
   const bool cplx = pffft::Fft<T>::isComplexTransform();
   const double EXPECTED_DYN_RANGE = Fft::isDoubleScalar() ? 215.0 : 140.0;
   assert(Fft::isPowerOfTwo(N));
   Fft fft = Fft(N);  // instantiate and prepareLength() for length N
 #if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900)
   // possible ways to declare/instantiate aligned vectors with C++11
   //   some lines require a typedef of above
   auto X = fft.valueVector();                    // for X = input vector
   pffft::AlignedVector<typename Fft::Complex> Y = fft.spectrumVector();  // for Y = forward(X)
   pffft::AlignedVector<FftScalar> R = fft.internalLayoutVector(); // for R = forwardInternalLayout(X)
   pffft::AlignedVector<T> Z = fft.valueVector(); // for Z = inverse(Y) = inverse( forward(X) )
                                                  //  or Z = inverseInternalLayout(R)
 #else
   // possible ways to declare/instantiate aligned vectors with C++98
   pffft::AlignedVector<T> X = fft.valueVector();     // for X = input vector
   pffft::AlignedVector<FftComplex>   Y = fft.spectrumVector();  // for Y = forward(X)
   pffft::AlignedVector<typename Fft::Scalar>  R = fft.internalLayoutVector(); // for R = forwardInternalLayout(X)
   pffft::AlignedVector<T> Z = fft.valueVector();     // for Z = inverse(Y) = inverse( forward(X) )
                                                      //  or Z = inverseInternalLayout(R)
 #endif
   // work with complex - without the capabilities of a higher c++ standard:
   // access the buffers as flat scalar arrays (interleaved re/im for complex data)
   FftScalar* Xs = reinterpret_cast<FftScalar*>(X.data()); // for X = input vector
   FftScalar* Ys = reinterpret_cast<FftScalar*>(Y.data()); // for Y = forward(X)
   FftScalar* Zs = reinterpret_cast<FftScalar*>(Z.data()); // for Z = inverse(Y) = inverse( forward(X) )
   int k, j, m, iter;
   /* bin index of the strongest non-carrier bin; -1 = none found (yet).
    * Initialized so the diagnostic printf never reads an indeterminate value. */
   int kmaxOther = -1;
   bool retError = false;
   double freq, dPhi, phi, phi0;
   double pwr, pwrCar, pwrOther, err, errSum, mag, expextedMag;
   double amp = 1.0;
   /* k = carrier bin under test, m = running test index (selects amp and phi0) */
   for (k = m = 0; k < (cplx ? N : (1 + N / 2)); k += N / 16, ++m) {
     amp = ((m % 3) == 0) ? 1.0F : 1.1F;
     /* bins in the upper half represent negative frequencies */
     freq = (k < N / 2) ? ((double)k / N) : ((double)(k - N) / N);
     dPhi = 2.0 * M_PI * freq;
     if (dPhi < 0.0)
       dPhi += 2.0 * M_PI;
     /* iter 0 is the quiet pass; if the dynamic range check fails there,
      * the same bin is repeated once (iter 1) with verbose per-bin output */
     iter = -1;
     while (1) {
       ++iter;
       if (iter)
         printf("bin %d: dphi = %f for freq %f\n", k, dPhi, freq);
       /* generate cosine carrier as time signal - start at defined phase phi0 */
       phi = phi0 =
         (m % 4) * 0.125 * M_PI; /* have phi0 < 90 deg to be normalized */
       for (j = 0; j < N; ++j) {
         if (cplx) {
           Xs[2 * j] = (FftScalar)( amp * cos(phi) );     /* real part */
           Xs[2 * j + 1] = (FftScalar)( amp * sin(phi) ); /* imag part */
         } else
           Xs[j] = (FftScalar)( amp * cos(phi) ); /* only real part */
         /* phase increment .. stay normalized - cos()/sin() might degrade! */
         phi += dPhi;
         if (phi >= M_PI)
           phi -= 2.0 * M_PI;
       }
       /* forward transform from X --> Y */
       if (useOrdered)
         fft.forward(X, Y);
       else {
         fft.forwardToInternalLayout(X, R); /* use R for reordering */
         fft.reorderSpectrum(R, Y); /* have canonical order in Y[] for power calculations */
       }
       /* -1.0 is below any real power value, so the first non-carrier bin wins */
       pwrOther = -1.0;
       kmaxOther = -1;
       pwrCar = 0;
       /* for positive frequencies: 0 to 0.5 * samplerate */
       /* and also for negative frequencies: -0.5 * samplerate to 0 */
       for (j = 0; j < (cplx ? N : (1 + N / 2)); ++j) {
         if (!cplx && !j) /* special treatment for DC for real input */
           pwr = Ys[j] * Ys[j];
         else if (!cplx && j == N / 2) /* treat 0.5 * samplerate */
           pwr = Ys[1] *
                 Ys[1]; /* despite j (for freq calculation) we have index 1 */
         else
           pwr = Ys[2 * j] * Ys[2 * j] + Ys[2 * j + 1] * Ys[2 * j + 1];
         if (iter || PRINT_SPEC)
           printf("%s fft %d:  pwr[j = %d] = %g == %f dB\n",
                  (cplx ? "cplx" : "real"),
                  N,
                  j,
                  pwr,
                  PWR2LOG(pwr));
         if (k == j)
           pwrCar = pwr;
         else if (pwr > pwrOther) {
           pwrOther = pwr;
           kmaxOther = j;
         }
       }
       if (PWR2LOG(pwrCar) - PWR2LOG(pwrOther) < EXPECTED_DYN_RANGE) {
         printf("%s fft %d amp %f iter %d:\n",
                (cplx ? "cplx" : "real"),
                N,
                amp,
                iter);
         printf("  carrier power  at bin %d: %g == %f dB\n",
                k,
                pwrCar,
                PWR2LOG(pwrCar));
         printf("  carrier mag || at bin %d: %g\n", k, sqrt(pwrCar));
         printf("  max other pwr  at bin %d: %g == %f dB\n",
                kmaxOther,
                pwrOther,
                PWR2LOG(pwrOther));
         printf("  dynamic range: %f dB\n\n",
                PWR2LOG(pwrCar) - PWR2LOG(pwrOther));
         retError = true;
         /* first failure: rerun this bin once with verbose output */
         if (iter == 0)
           continue;
       }
       /* phase check is skipped for DC and the N/2 bin */
       if (k > 0 && k != N / 2) {
         phi = atan2(Ys[2 * k + 1], Ys[2 * k]);
         if (fabs(phi - phi0) > DEG_ERR_LIMIT * M_PI / 180.0) {
           retError = true;
           printf("%s fft %d  bin %d amp %f : phase mismatch! phase = %f deg   "
                  "expected = %f deg\n",
                  (cplx ? "cplx" : "real"),
                  N,
                  k,
                  amp,
                  phi * 180.0 / M_PI,
                  phi0 * 180.0 / M_PI);
         }
       }
       /* real input: carrier energy is split between +f and -f, except at
        * DC and N/2 where both coincide */
       expextedMag = cplx ? amp : ((k == 0 || k == N / 2) ? amp : (amp / 2));
       mag = sqrt(pwrCar) / N;
       if (fabs(mag - expextedMag) > MAG_ERR_LIMIT) {
         retError = true;
         printf("%s fft %d  bin %d amp %f : mag = %g   expected = %g\n",
                (cplx ? "cplx" : "real"),
                N,
                k,
                amp,
                mag,
                expextedMag);
       }
       /* now convert spectrum back */
       if (useOrdered)
         fft.inverse(Y, Z);
       else
         fft.inverseFromInternalLayout(R, Z); /* inverse() from internal Layout */
       errSum = 0.0;
       for (j = 0; j < (cplx ? (2 * N) : N); ++j) {
         /* scale back */
         Zs[j] /= N;
         /* square sum errors over real (and imag parts) */
         err = (Xs[j] - Zs[j]) * (Xs[j] - Zs[j]);
         errSum += err;
       }
       if (errSum > N * 1E-7) {
         retError = true;
         printf("%s fft %d  bin %d : inverse FFT doesn't match original signal! "
                "errSum = %g ; mean err = %g\n",
                (cplx ? "cplx" : "real"),
                N,
                k,
                errSum,
                errSum / N);
       }
       break;
     }
   }
   // using the std::vector<> base classes .. no need for alignedFree() for X, Y, Z and R
   return retError;
 }
 /*
  * Runs Ttest() for every scalar precision enabled at compile time
  * (PFFFT_ENABLE_FLOAT and/or PFFFT_ENABLE_DOUBLE).
  *
  * N          : transform length, forwarded to Ttest().
  * useComplex : true -> complex transform types, false -> real types.
  * useOrdered : forwarded to Ttest().
  * returns    : true if at least one enabled precision reported an error.
  *
  * Ttest() returns an ERROR flag, so the per-precision results are OR-ed.
  * Every enabled precision is always executed: no short-circuiting, so a
  * passing float run can not hide (or skip) the double run.
  */
 bool
 test(int N, bool useComplex, bool useOrdered)
 {
   bool anyError = false;
   /* keep the compiler quiet if no precision is enabled at all */
   (void)N;
   (void)useOrdered;
   if (useComplex) {
 #ifdef PFFFT_ENABLE_FLOAT
     if (Ttest< std::complex<float> >(N, useOrdered))
       anyError = true;
 #endif
 #ifdef PFFFT_ENABLE_DOUBLE
     if (Ttest< std::complex<double> >(N, useOrdered))
       anyError = true;
 #endif
   } else {
 #ifdef PFFFT_ENABLE_FLOAT
     if (Ttest<float>(N, useOrdered))
       anyError = true;
 #endif
 #ifdef PFFFT_ENABLE_DOUBLE
     if (Ttest<double>(N, useOrdered))
       anyError = true;
 #endif
   }
   return anyError;
 }
 /*
  * Test driver.
  *  1. checks nextPowerOfTwo() / isPowerOfTwo() against a reference table,
  *  2. runs test() for all power-of-two lengths 32 .. 65536 in all four
  *     combinations of complex/real and ordered/unordered.
  * Command line arguments are not evaluated.
  * returns 0 if all tests passed, non-zero otherwise (usable as exit code).
  */
 int
 main(int argc, char** argv)
 {
   int N, result, resN, resAll, k, resNextPw2, resIsPw2, resFFT;
   int inp_power_of_two[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 511, 512, 513 };
   int ref_power_of_two[] = { 1, 2, 4, 4, 8, 8, 8, 8, 16, 512, 512, 1024 };
   /* element count as int: avoids a signed/unsigned comparison in the loop */
   const int numPw2Cases =
     (int)(sizeof(inp_power_of_two) / sizeof(inp_power_of_two[0]));
   (void)argc; /* unused */
   (void)argv; /* unused */
   resNextPw2 = 0;
   resIsPw2 = 0;
   for (k = 0; k < numPw2Cases; ++k) {
 #ifdef PFFFT_ENABLE_FLOAT
     N = pffft::Fft<float>::nextPowerOfTwo(inp_power_of_two[k]);
 #else
     N = pffft::Fft<double>::nextPowerOfTwo(inp_power_of_two[k]);
 #endif
     if (N != ref_power_of_two[k]) {
       resNextPw2 = 1;
       printf("pffft_next_power_of_two(%d) does deliver %d, which is not "
              "reference result %d!\n",
              inp_power_of_two[k],
              N,
              ref_power_of_two[k]);
     }
 #ifdef PFFFT_ENABLE_FLOAT
     result = pffft::Fft<float>::isPowerOfTwo(inp_power_of_two[k]);
 #else
     result = pffft::Fft<double>::isPowerOfTwo(inp_power_of_two[k]);
 #endif
     /* an input is a power of two exactly when it equals its own reference */
     if (inp_power_of_two[k] == ref_power_of_two[k]) {
       if (!result) {
         resIsPw2 = 1;
         printf("pffft_is_power_of_two(%d) delivers false; expected true!\n",
                inp_power_of_two[k]);
       }
     } else {
       if (result) {
         resIsPw2 = 1;
         printf("pffft_is_power_of_two(%d) delivers true; expected false!\n",
                inp_power_of_two[k]);
       }
     }
   }
   if (!resNextPw2)
     printf("tests for pffft_next_power_of_two() succeeded successfully.\n");
   if (!resIsPw2)
     printf("tests for pffft_is_power_of_two() succeeded successfully.\n");
   /* resN collects the errors of one length, resFFT those of all lengths */
   resFFT = 0;
   for (N = 32; N <= 65536; N *= 2) {
     result = test(N, 1 /* cplx fft */, 1 /* useOrdered */);
     resN = result;
     resFFT |= result;
     result = test(N, 0 /* cplx fft */, 1 /* useOrdered */);
     resN |= result;
     resFFT |= result;
     result = test(N, 1 /* cplx fft */, 0 /* useOrdered */);
     resN |= result;
     resFFT |= result;
     result = test(N, 0 /* cplx fft */, 0 /* useOrdered */);
     resN |= result;
     resFFT |= result;
     if (!resN)
       printf("tests for size %d succeeded successfully.\n", N);
   }
   /* the message is assembled from adjacent string literals, depending on
    * which precisions were compiled in */
   if (!resFFT)
     printf("all pffft transform tests (FORWARD/BACKWARD, REAL/COMPLEX, "
 #ifdef PFFFT_ENABLE_FLOAT
            "float"
 #endif
 #if defined(PFFFT_ENABLE_FLOAT) && defined(PFFFT_ENABLE_DOUBLE)
            "/"
 #endif
 #ifdef PFFFT_ENABLE_DOUBLE
            "double"
 #endif
            ") succeeded successfully.\n");
   resAll = resNextPw2 | resIsPw2 | resFFT;
   if (!resAll)
     printf("all tests succeeded successfully.\n");
   else
     printf("there are failed tests!\n");
   return resAll;
 }
--- a/pffft/uninstall.cmake
+++ b/pffft/uninstall.cmake
@@ -0,0 +1,24 @@
 # Uninstall script: removes every file listed in the install manifest that
 # "cmake --install" / "make install" wrote into the current binary directory.
 # Aborts with a fatal error if the manifest is missing or a file can not be
 # removed; files that are already gone are only reported.
 set(MANIFEST "${CMAKE_CURRENT_BINARY_DIR}/install_manifest.txt")
 if(NOT EXISTS "${MANIFEST}")
     message(FATAL_ERROR "Cannot find install manifest: '${MANIFEST}'")
 endif()
 # one list entry per manifest line
 file(STRINGS "${MANIFEST}" files)
 foreach(file IN LISTS files)
     # IS_SYMLINK also catches dangling symlinks, for which EXISTS is false
     if(EXISTS "${file}" OR IS_SYMLINK "${file}")
         message(STATUS "Removing file: '${file}'")
         # execute_process() replaces the deprecated exec_program() (CMP0153);
         # passing the path as its own quoted argument keeps paths containing
         # spaces intact.
         execute_process(
             COMMAND "${CMAKE_COMMAND}" -E remove "${file}"
             OUTPUT_VARIABLE stdout
             RESULT_VARIABLE result
         )
         if(NOT "${result}" STREQUAL "0")
             message(FATAL_ERROR "Failed to remove file: '${file}'.")
         endif()
     else()
         message(STATUS "File '${file}' does not exist.")
     endif()
 endforeach()
--- a/pffft/use_gcc8.inc
+++ b/pffft/use_gcc8.inc
@@ -0,0 +1,2 @@
 # Exports the paths of the gcc-8 / g++-8 executables, as reported by `which`,
 # in the environment variables GCC_WITH_CMAKE and GPP_WITH_CMAKE.
 # If a compiler is not found, the corresponding variable is exported empty.
 # NOTE(review): presumably meant to be sourced (". use_gcc8.inc") before a
 # build script that reads these variables - confirm against the caller.
 export GCC_WITH_CMAKE=$(which gcc-8)
 export GPP_WITH_CMAKE=$(which g++-8)
		`@@ -0,0 +1,2 @@`
							`export GCC_WITH_CMAKE=$(which gcc-8)`
							`export GPP_WITH_CMAKE=$(which g++-8)`