From a1790b8977e787e7740fd546b1af8bfd309d7850 Mon Sep 17 00:00:00 2001 From: "John K. Luebs" Date: Sat, 9 Nov 2024 14:57:18 -0600 Subject: [PATCH] add pffft --- pffft/.github/workflows/c-cpp.yml | 279 + pffft/.gitignore | 4 + pffft/.gitmodules | 9 + pffft/CMakeLists.txt | 663 ++ pffft/LICENSE.txt | 38 + pffft/README.md | 352 + pffft/bench/CMakeLists.txt | 224 + pffft/bench/unix_info.sh | 9 + pffft/bench_conv.cpp | 345 + pffft/bench_mixers.cpp | 889 +++ pffft/bench_pffft.c | 1402 ++++ pffft/cmake/FindMIPP.cmake | 26 + pffft/cmake/FindPAPI.cmake | 25 + pffft/cmake/compiler_warnings.cmake | 11 + pffft/cmake/target_optimizations.cmake | 197 + pffft/cross_build_mingw32.sh | 25 + pffft/cross_build_mingw64.sh | 25 + pffft/examples/CMakeLists.txt | 63 + pffft/examples/example_c_cplx_dbl_fwd.c | 69 + pffft/examples/example_c_real_flt_fwd.c | 66 + pffft/examples/example_cpp11_cplx_dbl_fwd.cpp | 66 + pffft/examples/example_cpp11_real_dbl_fwd.cpp | 66 + pffft/examples/example_cpp98_cplx_flt_fwd.cpp | 66 + pffft/examples/example_cpp98_real_flt_fwd.cpp | 66 + pffft/fftpack.c | 3130 +++++++++ pffft/fftpack.h | 799 +++ pffft/fmv.h | 20 + pffft/mingw-w32-i686.cmake | 25 + pffft/mingw-w64-x64_64.cmake | 25 + pffft/papi_perf_counter.h | 97 + pffft/pf_carrier.cpp | 298 + pffft/pf_carrier.h | 75 + pffft/pf_cic.cpp | 255 + pffft/pf_cic.h | 58 + pffft/pf_conv.cpp | 322 + pffft/pf_conv.h | 109 + pffft/pf_conv_dispatcher.cpp | 61 + pffft/pf_conv_dispatcher.h | 6 + pffft/pf_cplx.h | 44 + pffft/pf_mixer.cpp | 1148 ++++ pffft/pf_mixer.h | 270 + pffft/pffastconv.c | 264 + pffft/pffastconv.h | 171 + pffft/pffft.c | 134 + pffft/pffft.h | 241 + pffft/pffft.hpp | 1060 +++ pffft/pffft_common.c | 53 + pffft/pffft_double.c | 147 + pffft/pffft_double.h | 236 + pffft/pffft_priv_impl.h | 2233 ++++++ pffft/plots.sh | 50 + pffft/simd/pf_altivec_float.h | 81 + pffft/simd/pf_avx_double.h | 145 + pffft/simd/pf_double.h | 84 + pffft/simd/pf_float.h | 84 + pffft/simd/pf_neon_double.h | 203 + pffft/simd/pf_neon_double_from_avx.h | 123 + pffft/simd/pf_neon_float.h | 87 + pffft/simd/pf_scalar_double.h | 185 + pffft/simd/pf_scalar_float.h | 185 + pffft/simd/pf_sse1_float.h | 82 + pffft/simd/pf_sse2_double.h | 281 + pffft/sse2neon.h | 5956 +++++++++++++++++ pffft/test_fft_factors.c | 142 + pffft/test_pffastconv.c | 991 +++ pffft/test_pffft.c | 371 + pffft/test_pffft.cpp | 377 ++ pffft/uninstall.cmake | 24 + pffft/use_gcc8.inc | 2 + 69 files changed, 25719 insertions(+) create mode 100644 pffft/.github/workflows/c-cpp.yml create mode 100644 pffft/.gitignore create mode 100644 pffft/.gitmodules create mode 100644 pffft/CMakeLists.txt create mode 100644 pffft/LICENSE.txt create mode 100644 pffft/README.md create mode 100644 pffft/bench/CMakeLists.txt create mode 100755 pffft/bench/unix_info.sh create mode 100644 pffft/bench_conv.cpp create mode 100644 pffft/bench_mixers.cpp create mode 100644 pffft/bench_pffft.c create mode 100644 pffft/cmake/FindMIPP.cmake create mode 100644 pffft/cmake/FindPAPI.cmake create mode 100644 pffft/cmake/compiler_warnings.cmake create mode 100644 pffft/cmake/target_optimizations.cmake create mode 100755 pffft/cross_build_mingw32.sh create mode 100755 pffft/cross_build_mingw64.sh create mode 100644 pffft/examples/CMakeLists.txt create mode 100644 pffft/examples/example_c_cplx_dbl_fwd.c create mode 100644 pffft/examples/example_c_real_flt_fwd.c create mode 100644 pffft/examples/example_cpp11_cplx_dbl_fwd.cpp create mode 100644 pffft/examples/example_cpp11_real_dbl_fwd.cpp create mode 100644 
pffft/examples/example_cpp98_cplx_flt_fwd.cpp create mode 100644 pffft/examples/example_cpp98_real_flt_fwd.cpp create mode 100644 pffft/fftpack.c create mode 100644 pffft/fftpack.h create mode 100644 pffft/fmv.h create mode 100644 pffft/mingw-w32-i686.cmake create mode 100644 pffft/mingw-w64-x64_64.cmake create mode 100644 pffft/papi_perf_counter.h create mode 100644 pffft/pf_carrier.cpp create mode 100644 pffft/pf_carrier.h create mode 100644 pffft/pf_cic.cpp create mode 100644 pffft/pf_cic.h create mode 100644 pffft/pf_conv.cpp create mode 100644 pffft/pf_conv.h create mode 100644 pffft/pf_conv_dispatcher.cpp create mode 100644 pffft/pf_conv_dispatcher.h create mode 100644 pffft/pf_cplx.h create mode 100644 pffft/pf_mixer.cpp create mode 100644 pffft/pf_mixer.h create mode 100644 pffft/pffastconv.c create mode 100644 pffft/pffastconv.h create mode 100644 pffft/pffft.c create mode 100644 pffft/pffft.h create mode 100644 pffft/pffft.hpp create mode 100644 pffft/pffft_common.c create mode 100644 pffft/pffft_double.c create mode 100644 pffft/pffft_double.h create mode 100644 pffft/pffft_priv_impl.h create mode 100755 pffft/plots.sh create mode 100644 pffft/simd/pf_altivec_float.h create mode 100644 pffft/simd/pf_avx_double.h create mode 100644 pffft/simd/pf_double.h create mode 100644 pffft/simd/pf_float.h create mode 100644 pffft/simd/pf_neon_double.h create mode 100644 pffft/simd/pf_neon_double_from_avx.h create mode 100644 pffft/simd/pf_neon_float.h create mode 100644 pffft/simd/pf_scalar_double.h create mode 100644 pffft/simd/pf_scalar_float.h create mode 100644 pffft/simd/pf_sse1_float.h create mode 100644 pffft/simd/pf_sse2_double.h create mode 100644 pffft/sse2neon.h create mode 100644 pffft/test_fft_factors.c create mode 100644 pffft/test_pffastconv.c create mode 100644 pffft/test_pffft.c create mode 100644 pffft/test_pffft.cpp create mode 100644 pffft/uninstall.cmake create mode 100644 pffft/use_gcc8.inc diff --git a/pffft/.github/workflows/c-cpp.yml b/pffft/.github/workflows/c-cpp.yml new file mode 100644 index 0000000..7eabe3b --- /dev/null +++ b/pffft/.github/workflows/c-cpp.yml @@ -0,0 +1,279 @@ +name: C/C++ CI + +on: + push: + branches: + - master + - github_actions + pull_request: + branches: + - master + - github_actions + +env: + # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.) + BUILD_TYPE: Release + +jobs: + build_w_mipp_ubuntu-amd64: + runs-on: ubuntu-latest + + steps: + - name: check out MIPP + uses: actions/checkout@master + with: + repository: hayguen/MIPP + path: ./MIPP + - name: cmake configure MIPP + run: cmake -S MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$HOME/.local + - name: cmake install MIPP headers + run: cmake --build MIPP_build --target install && ls -alh $HOME/.local/ && ls -alh $HOME/.local/include/ + + - uses: actions/checkout@v2 + - name: cmake_make_simd_float_double + run: mkdir build_simd_full && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_full + - name: cmake_make_simd_float + run: mkdir build_simd_float && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_DOUBLE=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_float + - name: cmake_make_simd_double + run: mkdir build_simd_double && cmake -S . 
-B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_double + - name: cmake_make_no-simd_float_double + run: mkdir build_no-simd_full && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_full + - name: cmake_make_no-simd_scalar_float_double + run: mkdir build_no-simd_scalar_full && cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_scalar_full + - name: compress + run: tar zcvf pffft_w_mipp_ubuntu-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_simd_full build_simd_float build_simd_double build_no-simd_full build_no-simd_scalar_full + - name: 'Upload Artifact' + uses: actions/upload-artifact@v2 + with: + name: pffft_ubuntu_builds + path: pffft_w_mipp_ubuntu-amd64.tar.gz + + build_ubuntu-amd64: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: cmake_make_simd_float_double + run: mkdir build_simd_full && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_full + - name: cmake_make_simd_float + run: mkdir build_simd_float && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_DOUBLE=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_float + - name: cmake_make_simd_double + run: mkdir build_simd_double && cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_double + - name: cmake_make_no-simd_float_double + run: mkdir build_no-simd_full && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_full + - name: cmake_make_no-simd_scalar_float_double + run: mkdir build_no-simd_scalar_full && cmake -S . 
-B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_scalar_full + - name: compress + run: tar zcvf pffft_ubuntu-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_simd_full build_simd_float build_simd_double build_no-simd_full build_no-simd_scalar_full + - name: 'Upload Artifact' + uses: actions/upload-artifact@v2 + with: + name: pffft_ubuntu_builds + path: pffft_ubuntu-amd64.tar.gz + + cross_build_win_from_linux: + runs-on: ubuntu-20.04 + + steps: + - name: prerequisites + run: sudo apt -qq update && sudo apt -yqq install gcc-mingw-w64 g++-mingw-w64 + + - name: check out MIPP + uses: actions/checkout@master + with: + repository: hayguen/MIPP + path: ./MIPP + - name: cmake configure MIPP + working-directory: ${{runner.workspace}} + run: cmake -S pffft/MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$(pwd) + - name: cmake install MIPP headers + working-directory: ${{runner.workspace}} + run: cmake --build MIPP_build --target install + + - uses: actions/checkout@v2 + - name: build_w32_no-simd + working-directory: ${{runner.workspace}} + run: cd $GITHUB_WORKSPACE && bash ./cross_build_mingw32.sh no-simd -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF + - name: build_w32_simd_full + working-directory: ${{runner.workspace}} + run: X=$(pwd) && cd $GITHUB_WORKSPACE && bash ./cross_build_mingw32.sh simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=pentium4 -DTARGET_C_ARCH=pentium4 -DMIPP_INCLUDE_DIRS=$X/include/mipp + + - name: build_w64_no-simd + working-directory: ${{runner.workspace}} + run: cd $GITHUB_WORKSPACE && bash ./cross_build_mingw64.sh no-simd -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF + - name: build_w64_simd_full + working-directory: ${{runner.workspace}} + run: X=$(pwd) && cd $GITHUB_WORKSPACE && bash ./cross_build_mingw64.sh simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=sandybridge -DTARGET_C_ARCH=sandybridge -DMIPP_INCLUDE_DIRS=$X/include/mipp + + - name: compress + run: tar zcvf pffft_cross-build-windows-from-linux-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_w32_no-simd build_w32_simd_full build_w64_no-simd build_w64_simd_full + - name: 'Upload Artifact' + uses: actions/upload-artifact@v2 + with: + name: pffft_windows_from_cross_builds + path: pffft_cross-build-windows-from-linux-amd64.tar.gz + + + build_win_msvc: + # The CMake configure and build commands are platform agnostic and should work equally + # well on Windows or Mac. You can convert this to a matrix build if you need + # cross-platform coverage. 
+ # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix + runs-on: windows-2019 + + steps: + - name: check out MIPP + uses: actions/checkout@master + with: + repository: hayguen/MIPP + path: ./MIPP + - name: cmake configure MIPP + shell: bash + working-directory: ${{runner.workspace}} + run: cmake -S pffft/MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$(pwd) + - name: cmake install MIPP headers + working-directory: ${{runner.workspace}} + run: cmake --build MIPP_build --target install + + - uses: actions/checkout@v2 + + - name: Configure CMake No-SIMD + shell: bash + working-directory: ${{runner.workspace}} + run: cmake -S $GITHUB_WORKSPACE -B build_no-simd -G "Visual Studio 16 2019" -A x64 -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DENABLE_PFDSP=ON -DPFFFT_USE_SIMD=OFF -DTARGET_CXX_ARCH=none -DTARGET_C_ARCH=none + - name: Build No-SIMD + shell: bash + working-directory: ${{runner.workspace}} + # Execute the build. You can specify a specific target with "--target " + run: cmake --build build_no-simd --config $BUILD_TYPE + + - name: Configure CMake SSE2 + shell: bash + working-directory: ${{runner.workspace}} + run: cmake -S $GITHUB_WORKSPACE -B build_sse2 -G "Visual Studio 16 2019" -A x64 -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DENABLE_PFDSP=ON -DTARGET_CXX_ARCH=SSE2 -DTARGET_C_ARCH=SSE2 -DMIPP_INCLUDE_DIRS=$(pwd)/include/mipp + - name: Build SSE2 + shell: bash + working-directory: ${{runner.workspace}} + # Execute the build. You can specify a specific target with "--target " + run: cmake --build build_sse2 --config $BUILD_TYPE + + - name: Configure CMake AVX + # Use a bash shell so we can use the same syntax for environment variable + # access regardless of the host operating system + shell: bash + working-directory: ${{runner.workspace}} + run: cmake -S $GITHUB_WORKSPACE -B build_avx -G "Visual Studio 16 2019" -A x64 -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DENABLE_PFDSP=ON -DTARGET_CXX_ARCH=AVX -DTARGET_C_ARCH=AVX -DMIPP_INCLUDE_DIRS=$(pwd)/include/mipp + - name: Build AVX + working-directory: ${{runner.workspace}} + shell: bash + # Execute the build. You can specify a specific target with "--target " + run: cmake --build build_avx --config $BUILD_TYPE + + - name: Configure CMake AVX2 + # Use a bash shell so we can use the same syntax for environment variable + # access regardless of the host operating system + shell: bash + working-directory: ${{runner.workspace}} + run: cmake -S $GITHUB_WORKSPACE -B build_avx2 -G "Visual Studio 16 2019" -A x64 -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DENABLE_PFDSP=ON -DTARGET_CXX_ARCH=AVX2 -DTARGET_C_ARCH=AVX2 -DMIPP_INCLUDE_DIRS=$(pwd)/include/mipp + - name: Build AVX2 + working-directory: ${{runner.workspace}} + shell: bash + # Execute the build. 
You can specify a specific target with "--target " + run: cmake --build build_avx2 --config $BUILD_TYPE + + - name: compress + working-directory: ${{runner.workspace}} + run: tar zcvf pffft_windows-msvc-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_no-simd build_sse2 build_avx build_avx2 + - name: 'Upload Artifact' + uses: actions/upload-artifact@v2 + with: + name: pffft_windows_msvc_builds + path: ${{runner.workspace}}/pffft_windows-msvc-amd64.tar.gz + + + build_win_mingw: + runs-on: windows-2019 + strategy: + matrix: + compiler: [gcc] + msystem: [MINGW64] + defaults: + run: + shell: msys2 {0} + steps: + - uses: actions/checkout@v2 + - uses: msys2/setup-msys2@v2 + with: + msystem: MINGW64 + install: gcc cmake make + - name: Configure cmake + run: CC=gcc cmake -DMINGW=ON -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native -S . -B build_mgw64 + - name: Build + run: cmake --build build_mgw64 + + - name: compress + run: tar zcvf pffft_windows-mingw-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_mgw64 + - name: 'Upload Artifact' + uses: actions/upload-artifact@v2 + with: + name: pffft_windows_mingw_builds + path: pffft_windows-mingw-amd64.tar.gz + + + build_macos11: + # copied from build_ubuntu-amd64 with minor renaming + runs-on: macos-11 + + steps: + - uses: actions/checkout@v2 + - name: cmake_make_simd_float_double + run: mkdir build_simd_full && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_full + - name: cmake_make_simd_float + run: mkdir build_simd_float && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_DOUBLE=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_float + - name: cmake_make_simd_double + run: mkdir build_simd_double && cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_double + - name: cmake_make_no-simd_float_double + run: mkdir build_no-simd_full && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_full + - name: cmake_make_no-simd_scalar_float_double + run: mkdir build_no-simd_scalar_full && cmake -S . 
-B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_scalar_full + - name: compress + run: tar zcvf pffft_macos-11.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_simd_full build_simd_float build_simd_double build_no-simd_full build_no-simd_scalar_full + - name: 'Upload Artifact' + uses: actions/upload-artifact@v2 + with: + name: pffft_macos_builds + path: pffft_macos-11.tar.gz + + build_w_mipp_macos11: + # copied from build_w_mipp_ubuntu-amd64 with minor renaming + runs-on: macos-11 + + steps: + - name: check out MIPP + uses: actions/checkout@master + with: + repository: hayguen/MIPP + path: ./MIPP + - name: cmake configure MIPP + run: cmake -S MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$HOME/.local + - name: cmake install MIPP headers + run: cmake --build MIPP_build --target install && ls -alh $HOME/.local/ && ls -alh $HOME/.local/include/ + + - uses: actions/checkout@v2 + - name: cmake_make_simd_float_double + run: mkdir build_simd_full && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_full + - name: cmake_make_simd_float + run: mkdir build_simd_float && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_DOUBLE=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_float + - name: cmake_make_simd_double + run: mkdir build_simd_double && cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_double + - name: cmake_make_no-simd_float_double + run: mkdir build_no-simd_full && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_full + - name: cmake_make_no-simd_scalar_float_double + run: mkdir build_no-simd_scalar_full && cmake -S . 
-B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_scalar_full + - name: compress + run: tar zcvf pffft_w_mipp_macos-11.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_simd_full build_simd_float build_simd_double build_no-simd_full build_no-simd_scalar_full + - name: 'Upload Artifact' + uses: actions/upload-artifact@v2 + with: + name: pffft_macos_builds + path: pffft_w_mipp_macos-11.tar.gz diff --git a/pffft/.gitignore b/pffft/.gitignore new file mode 100644 index 0000000..a476319 --- /dev/null +++ b/pffft/.gitignore @@ -0,0 +1,4 @@ +build +build_benches +build_* +.vscode diff --git a/pffft/.gitmodules b/pffft/.gitmodules new file mode 100644 index 0000000..9ef3633 --- /dev/null +++ b/pffft/.gitmodules @@ -0,0 +1,9 @@ +[submodule "greenffts"] + path = greenffts + url = https://github.com/hayguen/greenffts.git +[submodule "kissfft"] + path = kissfft + url = https://github.com/hayguen/kissfft.git +[submodule "pocketfft"] + path = pocketfft + url = https://github.com/hayguen/pocketfft.git diff --git a/pffft/CMakeLists.txt b/pffft/CMakeLists.txt new file mode 100644 index 0000000..c159a91 --- /dev/null +++ b/pffft/CMakeLists.txt @@ -0,0 +1,663 @@ +cmake_minimum_required(VERSION 2.8) +project(PRETTY_FAST_FFT) + +# smaller library size? +option(PFFFT_USE_TYPE_FLOAT "activate single precision 'float'?" ON) +option(PFFFT_USE_TYPE_DOUBLE "activate 'double' precision float?" ON) + +# architecture/optimization options +option(PFFFT_USE_SIMD "use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? - " ON) +option(PFFFT_USE_SCALAR_VECT "use 4-element vector scalar operations (if no other SIMD)" ON) + +# what to install? +option(INSTALL_PFFFT "install pffft to CMAKE_INSTALL_PREFIX?" ON) +option(INSTALL_PFDSP "install pfdsp to CMAKE_INSTALL_PREFIX?" OFF) +option(INSTALL_PFFASTCONV "install pffastconv to CMAKE_INSTALL_PREFIX?" OFF) + +# test options +option(PFFFT_USE_BENCH_FFTW "use (system-installed) FFTW3 in fft benchmark?" OFF) +option(PFFFT_USE_BENCH_GREEN "use Green FFT in fft benchmark? - if exists in subdir" ON) +option(PFFFT_USE_BENCH_KISS "use KissFFT in fft benchmark? - if exists in subdir" ON) +option(PFFFT_USE_BENCH_POCKET "use PocketFFT in fft benchmark? - if exists in subdir" ON) +option(PFFFT_USE_BENCH_MKL "use Intel MKL in fft benchmark? needs to be installed" OFF) +option(PFFFT_USE_FFTPACK "compile and use FFTPACK in fft benchmark & validation?" ON) + +option(PFFFT_USE_DEBUG_ASAN "use GCC's address sanitizer?" 
OFF) + +option(PFFFT_DISABLE_LINK_WITH_M "Disables linking with m library to build with clangCL from MSVC" OFF) + +# C90 requires the gcc extensions for function attributes like always_inline +# C99 provides the function attributes: no gcc extensions required +set(CMAKE_C_STANDARD 99) +set(CMAKE_C_EXTENSIONS OFF) + +set(CMAKE_CXX_STANDARD 98) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +# populate what to install +set(INSTALL_TARGETS "") +set(INSTALL_HEADERS "") + + +if ( (NOT PFFFT_USE_TYPE_FLOAT) AND (NOT PFFFT_USE_TYPE_DOUBLE) ) + message(FATAL_ERROR "activate at least one of PFFFT_USE_TYPE_FLOAT or PFFFT_USE_TYPE_DOUBLE") +endif() + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/") +include(cmake/target_optimizations.cmake) +include(cmake/compiler_warnings.cmake) +find_package(PAPI) +find_package(MIPP) +if (MIPP_FOUND) +# if (TARGET MIPP) + message(STATUS "found MIPP") +else() + message(STATUS "NOT found MIPP") +endif() + + +if (PFFFT_USE_DEBUG_ASAN) + set(ASANLIB "asan") +else() + set(ASANLIB "") +endif() + +message(STATUS "INFO: CMAKE_C_COMPILER_ID is ${CMAKE_C_COMPILER_ID}") +message(STATUS "INFO: CMAKE_CXX_COMPILER_ID is ${CMAKE_CXX_COMPILER_ID}") +if (WIN32) + message(STATUS "INFO: detected WIN32") +else() + message(STATUS "INFO: NOT WIN32") +endif() +if (MINGW) + message(STATUS "INFO: detected MINGW with compiler ${CMAKE_C_COMPILER_ID}") +else() + message(STATUS "INFO: NOT MINGW") +endif() +if ( CMAKE_C_COMPILER_ID MATCHES "MSVC" ) + message(STATUS "INFO: detected MSVC with compiler ${CMAKE_C_COMPILER_ID}") +endif() + + +if (PFFFT_USE_BENCH_GREEN) + if (EXISTS "${CMAKE_CURRENT_LIST_DIR}/greenffts/CMakeLists.txt") + message(STATUS "found subdir greenffts") + set(PATH_GREEN "${CMAKE_CURRENT_LIST_DIR}/greenffts") + add_subdirectory( "${PATH_GREEN}" ) + else() + message(WARNING "GreenFFT not found in subdir greenffts") + endif() +endif() + +if (PFFFT_USE_BENCH_KISS) + # git submodule add https://github.com/hayguen/kissfft.git + if (EXISTS "${CMAKE_CURRENT_LIST_DIR}/kissfft/CMakeLists.txt") + message(STATUS "found subdir kissfft") + set(PATH_KISS "${CMAKE_CURRENT_LIST_DIR}/kissfft") + add_subdirectory( "${PATH_KISS}" ) + else() + message(WARNING "KissFFT not found in subdir kissfft") + endif() +endif() + +if (PFFFT_USE_BENCH_POCKET) + # git submodule add https://github.com/hayguen/pocketfft.git + if (EXISTS "${CMAKE_CURRENT_LIST_DIR}/pocketfft/pocketfft_double.c") + message(STATUS "found subdir pocketfft") + set(PATH_POCKET "${CMAKE_CURRENT_LIST_DIR}/pocketfft") + add_subdirectory( "${PATH_POCKET}" ) + else() + message(WARNING "PocketFFT not found in subdir pocketfft") + endif() +endif() + + +######################################################################## +# select the release build type by default to get optimization flags +######################################################################## +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release") + message(STATUS "Build type not specified: defaulting to release.") +endif(NOT CMAKE_BUILD_TYPE) + +if ( CMAKE_C_COMPILER_ID MATCHES "MSVC" ) + # using Visual Studio C++ + message(STATUS "INFO: detected MSVC: will not link math lib m") + set(MATHLIB "") + + add_definitions("/D_CRT_SECURE_NO_WARNINGS") + + set(MSVC_DISABLED_WARNINGS_LIST + "C4996" + ) + +else() + if(PFFFT_DISABLE_LINK_WITH_M) + else() + message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m") + set(MATHLIB "m") + endif() +endif() + +set(STDCXXLIB "") +if (MINGW) + set(STDCXXLIB 
"stdc++") +endif() + + +set( SIMD_FLOAT_HDRS simd/pf_float.h simd/pf_sse1_float.h simd/pf_altivec_float.h simd/pf_neon_float.h simd/pf_scalar_float.h ) +set( SIMD_DOUBLE_HDRS simd/pf_double.h simd/pf_avx_double.h simd/pf_scalar_double.h ) + +if (PFFFT_USE_TYPE_FLOAT) + set( FLOAT_SOURCES pffft.c pffft.h ${SIMD_FLOAT_HDRS} ) + if (INSTALL_PFFFT) + set(INSTALL_HEADERS ${INSTALL_HEADERS} pffft.h) + endif() +else() + set( FLOAT_SOURCES ) +endif() + + +if (PFFFT_USE_TYPE_DOUBLE) + set( DOUBLE_SOURCES pffft_double.c pffft_double.h ${SIMD_DOUBLE_HDRS} ) + if (INSTALL_PFFFT) + set(INSTALL_HEADERS ${INSTALL_HEADERS} pffft_double.h) + endif() +else() + set( DOUBLE_SOURCES ) +endif() + +###################################################### + +add_library(PFFFT STATIC ${FLOAT_SOURCES} ${DOUBLE_SOURCES} pffft_common.c pffft_priv_impl.h pffft.hpp ) +set_target_properties(PFFFT PROPERTIES OUTPUT_NAME "pffft") +target_compile_definitions(PFFFT PRIVATE _USE_MATH_DEFINES) +target_activate_c_compiler_warnings(PFFFT) +if (PFFFT_USE_SCALAR_VECT) + target_compile_definitions(PFFFT PRIVATE PFFFT_SCALVEC_ENABLED=1) +endif() +if (PFFFT_USE_DEBUG_ASAN) + target_compile_options(PFFFT PRIVATE "-fsanitize=address") +endif() +target_set_c_arch_flags(PFFFT) +if (NOT PFFFT_USE_SIMD) + target_compile_definitions(PFFFT PRIVATE PFFFT_SIMD_DISABLE=1) +endif() +target_link_libraries( PFFFT ${ASANLIB} ${MATHLIB} ) +set_property(TARGET PFFFT APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES + $ +) +if (INSTALL_PFFFT) + set(INSTALL_TARGETS ${INSTALL_TARGETS} PFFFT) + set(INSTALL_HEADERS ${INSTALL_HEADERS} pffft.hpp) +endif() + +###################################################### + +if (PFFFT_USE_TYPE_FLOAT) + add_library(PFDSP STATIC pf_mixer.cpp pf_mixer.h pf_cplx.h pf_carrier.cpp pf_carrier.h pf_cic.cpp pf_cic.h fmv.h ) + set_property(TARGET PFDSP PROPERTY CXX_STANDARD 11) + set_property(TARGET PFDSP PROPERTY CXX_STANDARD_REQUIRED ON) + set_target_properties(PFDSP PROPERTIES OUTPUT_NAME "pfdsp") + target_compile_definitions(PFDSP PRIVATE _USE_MATH_DEFINES) + target_activate_cxx_compiler_warnings(PFDSP) + if (PFFFT_USE_DEBUG_ASAN) + target_compile_options(PFDSP PRIVATE "-fsanitize=address") + endif() + if (PFFFT_USE_SIMD) + target_set_cxx_arch_flags(PFDSP) + else() + target_compile_definitions(PFDSP PRIVATE PFFFT_SIMD_DISABLE=1) + endif() + target_link_libraries( PFDSP ${MATHLIB} ) + set_property(TARGET PFDSP APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES + $ + ) + if (INSTALL_PFDSP) + set(INSTALL_TARGETS ${INSTALL_TARGETS} PFDSP) + set(INSTALL_HEADERS ${INSTALL_HEADERS} pf_mixer.h pf_cplx.h pf_carrier.h pf_cic.h) + endif() +endif() + +###################################################### + +if (PFFFT_USE_FFTPACK) + + # float / single precision + add_library(FFTPACK_FLOAT STATIC fftpack.c fftpack.h) + target_compile_definitions(FFTPACK_FLOAT PRIVATE _USE_MATH_DEFINES) + target_activate_c_compiler_warnings(FFTPACK_FLOAT) + target_link_libraries( FFTPACK_FLOAT ${MATHLIB} ) + set_property(TARGET FFTPACK_FLOAT APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES + $ + ) + + # double precision + add_library(FFTPACK_DOUBLE STATIC fftpack.c fftpack.h) + target_compile_definitions(FFTPACK_DOUBLE PRIVATE _USE_MATH_DEFINES) + target_compile_definitions(FFTPACK_DOUBLE PUBLIC FFTPACK_DOUBLE_PRECISION) + target_activate_c_compiler_warnings(FFTPACK_DOUBLE) + target_link_libraries( FFTPACK_DOUBLE ${MATHLIB} ) + set_property(TARGET FFTPACK_DOUBLE APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES + $ + ) + + # builtin test program of fftpack + 
add_executable(test_fftpack_float fftpack.c fftpack.h) + target_compile_definitions(test_fftpack_float PRIVATE _USE_MATH_DEFINES TESTING_FFTPACK) + target_link_libraries(test_fftpack_float ${MATHLIB}) + + add_executable(test_fftpack_double fftpack.c fftpack.h) + target_compile_definitions(test_fftpack_double PRIVATE _USE_MATH_DEFINES FFTPACK_DOUBLE_PRECISION TESTING_FFTPACK) + target_link_libraries(test_fftpack_double ${MATHLIB}) + +endif() + +###################################################### + +if (PFFFT_USE_TYPE_FLOAT) + # only 'float' supported in PFFASTCONV + add_library(PFFASTCONV STATIC pffastconv.c pffastconv.h pffft.h ) + set_target_properties(PFFASTCONV PROPERTIES OUTPUT_NAME "pffastconv") + target_compile_definitions(PFFASTCONV PRIVATE _USE_MATH_DEFINES) + target_activate_c_compiler_warnings(PFFASTCONV) + if (PFFFT_USE_DEBUG_ASAN) + target_compile_options(PFFASTCONV PRIVATE "-fsanitize=address") + endif() + target_link_libraries( PFFASTCONV PFFFT ${ASANLIB} ${MATHLIB} ) + set_property(TARGET PFFASTCONV APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES + $ + ) + if (INSTALL_PFFASTCONV) + set(INSTALL_TARGETS ${INSTALL_TARGETS} PFFASTCONV) + set(INSTALL_HEADERS ${INSTALL_HEADERS} pffastconv.h) + endif() +endif() + + +###################################################### + +install( TARGETS ${INSTALL_TARGETS} DESTINATION lib) +install( FILES ${INSTALL_HEADERS} DESTINATION include) + +add_custom_target(uninstall + "${CMAKE_COMMAND}" -P "${CMAKE_SOURCE_DIR}/uninstall.cmake" +) + +####################################################### + +if (PFFFT_USE_TYPE_FLOAT) + add_executable( test_pffft_float test_pffft.c ) + target_compile_definitions(test_pffft_float PRIVATE _USE_MATH_DEFINES) + target_compile_definitions(test_pffft_float PRIVATE PFFFT_ENABLE_FLOAT) + target_link_libraries( test_pffft_float PFFFT ${ASANLIB} ) +endif() + +###################################################### + +if (PFFFT_USE_TYPE_DOUBLE) + add_executable( test_pffft_double test_pffft.c ) + target_compile_definitions(test_pffft_double PRIVATE _USE_MATH_DEFINES) + target_compile_definitions(test_pffft_double PRIVATE PFFFT_ENABLE_DOUBLE) + target_link_libraries( test_pffft_double PFFFT ${ASANLIB} ) +endif() + +###################################################### + +add_executable( test_fft_factors test_fft_factors.c ) +if (PFFFT_USE_TYPE_FLOAT) + target_compile_definitions(test_fft_factors PRIVATE PFFFT_ENABLE_FLOAT) +endif() +if (PFFFT_USE_TYPE_DOUBLE) + target_compile_definitions(test_fft_factors PRIVATE PFFFT_ENABLE_DOUBLE) +endif() +target_link_libraries(test_fft_factors PFFFT ${ASANLIB} ${MATHLIB}) + +###################################################### + +add_executable( test_pffft_cpp test_pffft.cpp ) +target_compile_definitions(test_pffft_cpp PRIVATE _USE_MATH_DEFINES) +if (PFFFT_USE_TYPE_FLOAT) + target_compile_definitions(test_pffft_cpp PRIVATE PFFFT_ENABLE_FLOAT) +endif() +if (PFFFT_USE_TYPE_DOUBLE) + target_compile_definitions(test_pffft_cpp PRIVATE PFFFT_ENABLE_DOUBLE) +endif() +target_link_libraries( test_pffft_cpp PFFFT ${STDCXXLIB} ${ASANLIB} ) + +###################################################### + +add_executable( test_pffft_cpp_11 test_pffft.cpp ) +target_compile_definitions(test_pffft_cpp_11 PRIVATE _USE_MATH_DEFINES) +if (PFFFT_USE_TYPE_FLOAT) + target_compile_definitions(test_pffft_cpp_11 PRIVATE PFFFT_ENABLE_FLOAT) +endif() +if (PFFFT_USE_TYPE_DOUBLE) + target_compile_definitions(test_pffft_cpp_11 PRIVATE PFFFT_ENABLE_DOUBLE) +endif() +target_link_libraries( test_pffft_cpp_11 PFFFT 
${STDCXXLIB} ${ASANLIB} ) + +set_property(TARGET test_pffft_cpp_11 PROPERTY CXX_STANDARD 11) +set_property(TARGET test_pffft_cpp_11 PROPERTY CXX_STANDARD_REQUIRED ON) + +###################################################### + +if (PFFFT_USE_TYPE_FLOAT) + add_executable(test_pffastconv test_pffastconv.c + ${SIMD_FLOAT_HDRS} ${SIMD_DOUBLE_HDRS} + ) + target_compile_definitions(test_pffastconv PRIVATE _USE_MATH_DEFINES) + if (PFFFT_USE_DEBUG_ASAN) + target_compile_options(test_pffastconv PRIVATE "-fsanitize=address") + endif() + target_set_c_arch_flags(test_pffastconv) + if (NOT PFFFT_USE_SIMD) + target_compile_definitions(test_pffastconv PRIVATE PFFFT_SIMD_DISABLE=1) + endif() + target_link_libraries( test_pffastconv PFFASTCONV ${ASANLIB} ${MATHLIB} ) + +endif() + +###################################################### + +if (PFFFT_USE_TYPE_FLOAT) + add_executable(bench_pffft_float bench_pffft.c pffft.h) + target_compile_definitions(bench_pffft_float PRIVATE _USE_MATH_DEFINES) + target_compile_definitions(bench_pffft_float PRIVATE PFFFT_ENABLE_FLOAT) + if (PFFFT_USE_DEBUG_ASAN) + target_compile_options(bench_pffft_float PRIVATE "-fsanitize=address") + endif() + + target_link_libraries( bench_pffft_float PFFFT ${ASANLIB} ) + + if (PFFFT_USE_FFTPACK) + target_compile_definitions(bench_pffft_float PRIVATE HAVE_FFTPACK=1) + target_link_libraries(bench_pffft_float FFTPACK_FLOAT) + endif() + + if (PFFFT_USE_BENCH_FFTW) + target_compile_definitions(bench_pffft_float PRIVATE HAVE_FFTW=1) + target_link_libraries(bench_pffft_float fftw3f) + endif() + + if (PATH_GREEN AND PFFFT_USE_BENCH_GREEN) + target_compile_definitions(bench_pffft_float PRIVATE HAVE_GREEN_FFTS=1) + target_link_libraries(bench_pffft_float GreenFFT) + endif() + + if (PATH_KISS AND PFFFT_USE_BENCH_KISS) + target_compile_definitions(bench_pffft_float PRIVATE HAVE_KISS_FFT=1) + target_link_libraries(bench_pffft_float KissFFT) + endif() + + if (PATH_POCKET AND PFFFT_USE_BENCH_POCKET) + target_compile_definitions(bench_pffft_float PRIVATE HAVE_POCKET_FFT=1) + target_link_libraries(bench_pffft_float PocketFFT) + endif() + + if (PFFFT_USE_BENCH_MKL) + if ( (CMAKE_SYSTEM_PROCESSOR STREQUAL "i686") OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") ) + # has chances to work + else() + # other PROCESSORs could be "ppc", "ppc64", "arm", "aarch64", "armv7l" - or something else?! 
+ message(WARNING "using Intel MKL on '${CMAKE_SYSTEM_PROCESSOR}' might fail.") + endif() + message(STATUS "In case compiling/linking with Intel MKL fails, check CMakeLists.txt or deactivate PFFFT_USE_BENCH_MKL") + target_compile_definitions(bench_pffft_float PRIVATE HAVE_MKL=1) + target_link_libraries(bench_pffft_float mkl_intel_lp64 mkl_sequential -lmkl_core) + endif() +endif() + +if (PFFFT_USE_TYPE_DOUBLE) + add_executable(bench_pffft_double bench_pffft.c pffft.h) + target_compile_definitions(bench_pffft_double PRIVATE _USE_MATH_DEFINES) + target_compile_definitions(bench_pffft_double PRIVATE PFFFT_ENABLE_DOUBLE) + if (PFFFT_USE_DEBUG_ASAN) + target_compile_options(bench_pffft_double PRIVATE "-fsanitize=address") + endif() + target_link_libraries( bench_pffft_double PFFFT ${ASANLIB} ) + + if (PFFFT_USE_FFTPACK) + target_compile_definitions(bench_pffft_double PRIVATE HAVE_FFTPACK=1) + target_link_libraries(bench_pffft_double FFTPACK_DOUBLE) + endif() + + if (PFFFT_USE_BENCH_FFTW) + target_compile_definitions(bench_pffft_double PRIVATE HAVE_FFTW=1) + target_link_libraries(bench_pffft_double fftw3) + endif() + + if (PATH_POCKET AND PFFFT_USE_BENCH_POCKET) + target_compile_definitions(bench_pffft_double PRIVATE HAVE_POCKET_FFT=1) + target_link_libraries(bench_pffft_double PocketFFT) + endif() + + if (PFFFT_USE_BENCH_MKL) + if ( (CMAKE_SYSTEM_PROCESSOR STREQUAL "i686") OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") ) + # has chances to work + else() + # other PROCESSORs could be "ppc", "ppc64", "arm", "aarch64", "armv7l" - or something else?! + message(WARNING "using Intel MKL on '${CMAKE_SYSTEM_PROCESSOR}' might fail.") + endif() + message(STATUS "In case compiling/linking with Intel MKL fails, check CMakeLists.txt or deactivate PFFFT_USE_BENCH_MKL") + target_compile_definitions(bench_pffft_double PRIVATE HAVE_MKL=1) + target_link_libraries(bench_pffft_double mkl_intel_lp64 mkl_sequential -lmkl_core) + endif() +endif() + +###################################################### + +if (PFFFT_USE_TYPE_FLOAT) + + add_executable(bench_pf_mixer_float bench_mixers.cpp papi_perf_counter.h) + target_compile_definitions(bench_pf_mixer_float PRIVATE _USE_MATH_DEFINES) + target_compile_definitions(bench_pf_mixer_float PRIVATE PFFFT_ENABLE_FLOAT) + target_link_libraries( bench_pf_mixer_float ${ASANLIB} ) + if (PFFFT_USE_DEBUG_ASAN) + target_compile_options(bench_pf_mixer_float PRIVATE "-fsanitize=address") + endif() + if (PAPI_FOUND) + target_compile_definitions(bench_pf_mixer_float PRIVATE HAVE_PAPI=1) + target_link_libraries(bench_pf_mixer_float ${PAPI_LIBRARIES}) + endif() + target_link_libraries( bench_pf_mixer_float PFDSP $<$:stdc++> ) + + + ############################################################################ + + add_library(pf_conv_arch_none pf_conv.cpp pf_conv.h pf_cplx.h) + target_compile_definitions(pf_conv_arch_none PRIVATE CONV_ARCH_POST=none MIPP_NO_INTRINSICS=1) + set_property(TARGET pf_conv_arch_none PROPERTY CXX_STANDARD 11) + set_property(TARGET pf_conv_arch_none PROPERTY CXX_STANDARD_REQUIRED ON) + target_activate_cxx_compiler_warnings(pf_conv_arch_none) + add_library(pf_conv_dispatcher pf_conv_dispatcher.cpp pf_conv_dispatcher.h pf_conv.h pf_cplx.h) + set_property(TARGET pf_conv_dispatcher PROPERTY CXX_STANDARD 11) + set_property(TARGET pf_conv_dispatcher PROPERTY CXX_STANDARD_REQUIRED ON) + target_activate_cxx_compiler_warnings(pf_conv_dispatcher) + + add_library(pf_conv_arch_dflt pf_conv.cpp pf_conv.h pf_cplx.h) + target_compile_definitions(pf_conv_arch_dflt PRIVATE 
CONV_ARCH_POST=dflt) + set_property(TARGET pf_conv_arch_dflt PROPERTY CXX_STANDARD 11) + set_property(TARGET pf_conv_arch_dflt PROPERTY CXX_STANDARD_REQUIRED ON) + target_activate_cxx_compiler_warnings(pf_conv_arch_dflt) + target_set_cxx_arch_flags(pf_conv_arch_dflt) + + target_link_libraries(pf_conv_dispatcher pf_conv_arch_none pf_conv_arch_dflt) + + if ((CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64")) + + if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set(PF_CONV_ARCHES "sse3;sse4;avx;avx2") + set(PF_CONV_OPT_sse3 "core2") # emulate a map + set(PF_CONV_OPT_sse4 "nehalem") + set(PF_CONV_OPT_avx "sandybridge") + set(PF_CONV_OPT_avx2 "haswell") + target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_GCC_AMD64) + elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + set(PF_CONV_ARCHES "sse2;avx;avx2") + set(PF_CONV_OPT_sse2 "SSE2") # emulate a map + set(PF_CONV_OPT_avx "AVX") + set(PF_CONV_OPT_avx2 "AVX2") + target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_MSVC_AMD64) + else() + set(PF_CONV_ARCHES "") + message(WARNING "unknown compiler ${CMAKE_CXX_COMPILER_ID} on CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation") + endif() + + elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + + if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set(PF_CONV_ARCHES "armv8a") + set(PF_CONV_OPT_armv8a "armv8-a") # emulate a map for arch + + target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_GCC_AARCH64) + else() + set(PF_CONV_ARCHES "") + message(WARNING "unknown compiler ${CMAKE_CXX_COMPILER_ID} on CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation") + endif() + + elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "armv7l") + + if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set(PF_CONV_ARCHES "neon_vfpv4;neon_rpi3_a53;neon_rpi4_a72") + set(PF_CONV_OPT_neon_vfpv4 "armv7-a") # emulate a map for arch + set(PF_CONV_EXTRA_neon_vfpv4 "neon_vfpv4") # emulate a map for additional options (EXTRA) + set(PF_CONV_OPT_neon_rpi3_a53 "armv7-a") + set(PF_CONV_EXTRA_neon_rpi3_a53 "neon_rpi3_a53") + set(PF_CONV_OPT_neon_rpi4_a72 "armv7-a") + set(PF_CONV_EXTRA_neon_rpi4_a72 "neon_rpi4_a72") + + target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_GCC_ARM32NEON) + else() + set(PF_CONV_ARCHES "") + message(WARNING "unknown compiler ${CMAKE_CXX_COMPILER_ID} on CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation") + endif() + + else() + message(WARNING "this is unforseen CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation") + endif() + + foreach (arch_opt ${PF_CONV_ARCHES}) + add_library(pf_conv_arch_${arch_opt} pf_conv.cpp pf_conv.h pf_cplx.h) + set_property(TARGET pf_conv_arch_${arch_opt} PROPERTY CXX_STANDARD 11) + set_property(TARGET pf_conv_arch_${arch_opt} PROPERTY CXX_STANDARD_REQUIRED ON) + target_activate_cxx_compiler_warnings(pf_conv_arch_${arch_opt}) + target_compile_definitions(pf_conv_arch_${arch_opt} PRIVATE CONV_ARCH_POST=${arch_opt}) + + target_set_cxx_arch_option(pf_conv_arch_${arch_opt} "${PF_CONV_OPT_${arch_opt}}" "${PF_CONV_EXTRA_${arch_opt}}" "${PF_CONV_OPT_${arch_opt}}") + target_link_libraries(pf_conv_dispatcher pf_conv_arch_${arch_opt}) + message(STATUS "added library pf_conv_arch_${arch_opt} with CONV_ARCH_POST=${arch_opt}") + endforeach() + + if 
(PFFFT_USE_DEBUG_ASAN) + foreach (arch_opt ${PF_CONV_ARCHES}) + target_compile_options(pf_conv_arch_${arch_opt} PRIVATE "-fsanitize=address") + target_link_libraries( pf_conv_arch_${arch_opt} ${ASANLIB}) + endforeach() + + target_compile_options(pf_conv_arch_none PRIVATE "-fsanitize=address") + target_link_libraries( pf_conv_arch_none ${ASANLIB}) + + target_compile_options(pf_conv_dispatcher PRIVATE "-fsanitize=address") + target_link_libraries(pf_conv_dispatcher ${ASANLIB}) + endif() + + if(MIPP_FOUND) + foreach (arch_opt ${PF_CONV_ARCHES}) + message(STATUS "link pf_conv_arch_${arch_opt} against MIPP") + target_link_libraries(pf_conv_arch_${arch_opt} MIPP) + endforeach() + + message(STATUS "link pf_conv_arch_none against MIPP") + target_link_libraries(pf_conv_arch_none MIPP) + endif() + + ############################################################################ + + add_executable(bench_pf_conv_float bench_conv.cpp papi_perf_counter.h) + set_property(TARGET bench_pf_conv_float PROPERTY CXX_STANDARD 11) + set_property(TARGET bench_pf_conv_float PROPERTY CXX_STANDARD_REQUIRED ON) + target_compile_definitions(bench_pf_conv_float PRIVATE _USE_MATH_DEFINES) + target_compile_definitions(bench_pf_conv_float PRIVATE PFFFT_ENABLE_FLOAT) + if (PFFFT_USE_DEBUG_ASAN) + target_compile_options(bench_pf_conv_float PRIVATE "-fsanitize=address") + endif() + target_link_libraries( bench_pf_conv_float ${ASANLIB} ) + if (PAPI_FOUND) + target_compile_definitions(bench_pf_conv_float PRIVATE HAVE_PAPI=1) + target_link_libraries(bench_pf_conv_float ${PAPI_LIBRARIES}) + endif() + if(MIPP_FOUND) + target_link_libraries(bench_pf_conv_float MIPP) + endif() + target_link_libraries( bench_pf_conv_float pf_conv_dispatcher PFDSP $<$:stdc++> ) + +endif() + +###################################################### + +add_subdirectory(examples) + +###################################################### + +enable_testing() + + +add_test(NAME test_fft_factors + COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_fft_factors" + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} +) + +if (PFFFT_USE_FFTPACK) + add_test(NAME test_fftpack_float + COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_fftpack_float" + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + + add_test(NAME test_fftpack_double + COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_fftpack_double" + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) +endif() + + +if (PFFFT_USE_TYPE_FLOAT) + + add_test(NAME bench_pffft_pow2 + COMMAND "${CMAKE_CURRENT_BINARY_DIR}/bench_pffft_float" "--max-len" "128" "--quick" + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + + add_test(NAME bench_pffft_non2 + COMMAND "${CMAKE_CURRENT_BINARY_DIR}/bench_pffft_float" "--non-pow2" "--max-len" "192" "--quick" + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + + # add_test(NAME bench_plots + # COMMAND bash "-c" "${CMAKE_CURRENT_SOURCE_DIR}/plots.sh" + # WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + # ) + + add_test(NAME test_pfconv_lens_symetric + COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" "--no-bench" "--quick" "--sym" + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + + add_test(NAME test_pfconv_lens_non_sym + COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" "--no-bench" "--quick" + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + + add_test(NAME bench_pfconv_symetric + COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" "--no-len" "--quick" "--sym" + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + + add_test(NAME bench_pfconv_non_sym + COMMAND 
"${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" "--no-len" "--quick" + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + +endif() + diff --git a/pffft/LICENSE.txt b/pffft/LICENSE.txt new file mode 100644 index 0000000..1ee09cd --- /dev/null +++ b/pffft/LICENSE.txt @@ -0,0 +1,38 @@ + +Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com ) +Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de ) +Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + +Copyright (c) 2004 the University Corporation for Atmospheric +Research ("UCAR"). All rights reserved. Developed by NCAR's +Computational and Information Systems Laboratory, UCAR, +www.cisl.ucar.edu. + +Redistribution and use of the Software in source and binary forms, +with or without modification, is permitted provided that the +following conditions are met: + +- Neither the names of NCAR's Computational and Information Systems +Laboratory, the University Corporation for Atmospheric Research, +nor the names of its sponsors or contributors may be used to +endorse or promote products derived from this Software without +specific prior written permission. + +- Redistributions of source code must retain the above copyright +notices, this list of conditions, and the disclaimer below. + +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions, and the disclaimer below in the +documentation and/or other materials provided with the +distribution. + +THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +SOFTWARE. + diff --git a/pffft/README.md b/pffft/README.md new file mode 100644 index 0000000..275c4e1 --- /dev/null +++ b/pffft/README.md @@ -0,0 +1,352 @@ + +--- + +# PFFFT: a pretty fast FFT and fast convolution with PFFASTCONV + +--- + + + +- [Brief Description](#brief-description) +- [Why does it exist?](#why-does-it-exist) +- [CMake](#cmake) +- [History / Origin / Changes](#history--origin--changes) +- [Comparison with other FFTs](#comparison-with-other-ffts) +- [Dependencies / Required Linux packages](#dependencies--required-linux-packages) +- [Benchmarks and results](#benchmarks-and-results) + + + +--- + +## Brief description: + +PFFFT does 1D Fast Fourier Transforms, of single precision real and +complex vectors. It tries do it fast, it tries to be correct, and it +tries to be small. Computations do take advantage of SSE1 instructions +on x86 cpus, Altivec on powerpc cpus, and NEON on ARM cpus. The +license is BSD-like. + +PFFFT is a fork of [Julien Pommier's library on bitbucket](https://bitbucket.org/jpommier/pffft/) +with some changes and additions. + + +PFFASTCONV does fast convolution (FIR filtering), of single precision +real vectors, utilizing the PFFFT library. The license is BSD-like. + +PFDSP contains a few other signal processing functions. +Currently, mixing and carrier generation functions are contained. +It is work in progress - also the API! +The fast convolution from PFFASTCONV might get merged into PFDSP. 
+ + +## Why does it exist: + +I (Julien Pommier) was in search of a good performing FFT library , +preferably very small and with a very liberal license. + +When one says "fft library", FFTW ("Fastest Fourier Transform in the +West") is probably the first name that comes to mind -- I guess that +99% of open-source projects that need a FFT do use FFTW, and are happy +with it. However, it is quite a large library , which does everything +fft related (2d transforms, 3d transforms, other transformations such +as discrete cosine , or fast hartley). And it is licensed under the +GNU GPL , which means that it cannot be used in non open-source +products. + +An alternative to FFTW that is really small, is the venerable FFTPACK +v4, which is available on NETLIB. A more recent version (v5) exists, +but it is larger as it deals with multi-dimensional transforms. This +is a library that is written in FORTRAN 77, a language that is now +considered as a bit antiquated by many. FFTPACKv4 was written in 1985, +by Dr Paul Swarztrauber of NCAR, more than 25 years ago ! And despite +its age, benchmarks show it that it still a very good performing FFT +library, see for example the 1d single precision benchmarks +[here](http://www.fftw.org/speed/opteron-2.2GHz-32bit/). It is however not +competitive with the fastest ones, such as FFTW, Intel MKL, AMD ACML, +Apple vDSP. The reason for that is that those libraries do take +advantage of the SSE SIMD instructions available on Intel CPUs, +available since the days of the Pentium III. These instructions deal +with small vectors of 4 floats at a time, instead of a single float +for a traditionnal FPU, so when using these instructions one may expect +a 4-fold performance improvement. + +The idea was to take this fortran fftpack v4 code, translate to C, +modify it to deal with those SSE instructions, and check that the +final performance is not completely ridiculous when compared to other +SIMD FFT libraries. Translation to C was performed with [f2c]( +http://www.netlib.org/f2c/). The resulting file was a bit edited in +order to remove the thousands of gotos that were introduced by +f2c. You will find the fftpack.h and fftpack.c sources in the +repository, this a complete translation of [fftpack]( +http://www.netlib.org/fftpack/), with the discrete cosine transform +and the test program. There is no license information in the netlib +repository, but it was confirmed to me by the fftpack v5 curators that +the [same terms do apply to fftpack v4] +(http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html). This is a +"BSD-like" license, it is compatible with proprietary projects. + +Adapting fftpack to deal with the SIMD 4-element vectors instead of +scalar single precision numbers was more complex than I originally +thought, especially with the real transforms, and I ended up writing +more code than I planned.. + + +## The code: + +### Good old C: +The FFT API is very very simple, just make sure that you read the comments in `pffft.h`. + +The Fast convolution's API is also very simple, just make sure that you read the comments +in `pffastconv.h`. + +### C++: +A simple C++ wrapper is available in `pffft.hpp`. + +### Git: +This archive's source can be downloaded with git (without the submodules): +``` +git clone https://github.com/marton78/pffft.git +``` + +### Only two files?: +_"Only two files, in good old C, pffft.c and pffft.h"_ + +This statement does **NO LONGER** hold! + +With new functionality and support for AVX, there was need to restructure the sources. 
+But you can compile and link **pffft** as a static library. + + +## CMake: +There's now CMake support to build the static libraries `libPFFFT.a` +and `libPFFASTCONV.a` from the source files, plus the additional +`libFFTPACK.a` library. Later one's sources are there anyway for the benchmark. + +There are several CMake options to modify library size and optimization. +You can explore all available options with `cmake-gui` or `ccmake`, +the console version - after having installed (on Debian/Ubuntu Linux) one of +``` +sudo apt-get install cmake-qt-gui +sudo apt-get install cmake-curses-gui +``` + +Some of the options: +* `PFFFT_USE_TYPE_FLOAT` to activate single precision 'float' (default: ON) +* `PFFFT_USE_TYPE_DOUBLE` to activate 'double' precision float (default: ON) +* `PFFFT_USE_SIMD` to use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? (default: ON) +* `DISABLE_SIMD_AVX` to disable AVX CPU features (default: OFF) +* `PFFFT_USE_SIMD_NEON` to force using NEON on ARM (requires PFFFT_USE_SIMD) (default: OFF) +* `PFFFT_USE_SCALAR_VECT` to use 4-element vector scalar operations (if no other SIMD) (default: ON) + +Options can be passed to `cmake` at command line, e.g. +``` +cmake -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_TYPE_DOUBLE=ON +``` + +My Linux distribution defaults to GCC. With installed CLANG and the bash shell, you can use it with +``` +mkdir build +cd build +CC=/usr/bin/clang CXX=/usr/bin/clang++ cmake -DCMAKE_BUILD_TYPE=Debug ../ +cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=~ ../ +ccmake . # or: cmake-gui . +cmake --build . # or simply: make +ctest # to execute some tests - including benchmarks +cmake --build . --target install # or simply: [sudo] make install +``` + +With MSVC on Windows, you need some different options. Following ones to build a 64-bit Release with Visual Studio 2019: +``` +mkdir build +cd build +cmake -G "Visual Studio 16 2019" -A x64 .. +cmake --build . --config Release +ctest -C Release +``` + +see [https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators](https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators) + + +## History / Origin / Changes: +Origin for this code/fork is Julien Pommier's pffft on bitbucket: +[https://bitbucket.org/jpommier/pffft/](https://bitbucket.org/jpommier/pffft/) + +Git history shows following first commits of the major contributors: +* Julien Pommier: November 19, 2011 +* Marton Danoczy: September 30, 2015 +* Hayati Ayguen: December 22, 2019 +* Dario Mambro: March 24, 2020 + +There are a few other contributors not listed here. + +The main changes include: +* improved benchmarking, see [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks) +* double support +* avx(2) support +* c++ headers (wrapper) +* additional API helper functions +* additional library for fast convolution +* cmake support +* ctest + + +## Comparison with other FFTs: +The idea was not to break speed records, but to get a decently fast +fft that is at least 50% as fast as the fastest FFT -- especially on +slowest computers . I'm more focused on getting the best performance +on slow cpus (Atom, Intel Core 1, old Athlons, ARM Cortex-A9...), than +on getting top performance on today fastest cpus. + +It can be used in a real-time context as the fft functions do not +perform any memory allocation -- that is why they accept a 'work' +array in their arguments. 
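+As an illustration of that point, a sketch of how a caller might wrap the
+setup and the scratch buffer so that the per-block transform performs no
+allocation at all. The `fft_ctx` struct and its functions are hypothetical
+helpers invented for this example, not part of the pffft API.
+
+```
+#include "pffft.h"
+
+typedef struct {
+    PFFFT_Setup *setup;
+    float *work;     /* caller-owned scratch area, N floats for a real FFT */
+    int n;
+} fft_ctx;
+
+static void fft_ctx_init(fft_ctx *c, int n)
+{
+    c->setup = pffft_new_setup(n, PFFFT_REAL);
+    c->work  = (float*)pffft_aligned_malloc((size_t)n * sizeof(float));
+    c->n     = n;
+}
+
+/* per-block call: no malloc/free inside, the 'work' buffer is reused */
+static void fft_ctx_forward(const fft_ctx *c, const float *in, float *out)
+{
+    pffft_transform_ordered(c->setup, in, out, c->work, PFFFT_FORWARD);
+}
+
+static void fft_ctx_free(fft_ctx *c)
+{
+    pffft_aligned_free(c->work);
+    pffft_destroy_setup(c->setup);
+}
+```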
+
+Very interesting is [https://www.nayuki.io/page/free-small-fft-in-multiple-languages](https://www.nayuki.io/page/free-small-fft-in-multiple-languages).
+It shows how small an FFT can be - including the Bluestein algorithm - but it is anything but fast.
+The whole C++ implementation file is 161 lines, including the copyright header, see
+[https://github.com/nayuki/Nayuki-web-published-code/blob/master/free-small-fft-in-multiple-languages/FftComplex.cpp](https://github.com/nayuki/Nayuki-web-published-code/blob/master/free-small-fft-in-multiple-languages/FftComplex.cpp)
+
+## Dependencies / Required Linux packages
+
+On Debian/Ubuntu Linux the following packages should be installed:
+
+```
+sudo apt-get install build-essential gcc g++ cmake
+```
+
+
+## Benchmarks and results
+
+#### Quicklink
+Find results at [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks).
+
+#### General
+My (Hayati Ayguen's) first look at FFT benchmarks was with [benchFFT](http://www.fftw.org/benchfft/),
+and especially its benchmark [results](http://www.fftw.org/speed/),
+which demonstrate the performance of [FFTW](http://www.fftw.org/).
+Looking at the benchmarked computer systems from today's view (2021), these are quite outdated.
+
+Having a look into the [benchFFT source code](http://www.fftw.org/benchfft/benchfft-3.1.tar.gz),
+the latest source changes, including the competing FFT implementations, are dated November 2003.
+
+In 2019, when pffft got my attention on [Bitbucket](https://bitbucket.org/jpommier/pffft/src/master/),
+there were also some benchmark results.
+Unfortunately, those results are tables of numbers - without graphical plots.
+Without plots, I could not get a good impression, which is why I started
+[https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks),
+which includes GnuPlot figures.
+
+Today, in June 2021, I realized that [https://github.com/FFTW/benchfft](https://github.com/FFTW/benchfft) exists.
+This repository is much more up-to-date, with a commit in December 2020.
+Unfortunately, it does not look so simple to get it running - including the generation of plots.
+
+Is there any website showing benchFFT results for more recent computer systems?
+
+Of course, it is very important that a benchmark covers a bunch of
+different FFT algorithms/implementations.
+This requires having them compiled/built and usable.
+
+
+#### Git submodules for Green-, Kiss- and Pocket-FFT
+Sources for [Green-](https://github.com/hayguen/greenffts),
+[Kiss-](https://github.com/hayguen/kissfft)
+and [Pocket-FFT](https://github.com/hayguen/pocketfft)
+can be downloaded directly with the sources of this repository - using git submodules:
+```
+git clone --recursive https://github.com/marton78/pffft.git
+```
+
+The `--recursive` option is important: it also fetches the submodules directly.
+But you can also retrieve the submodules later:
+```
+git submodule update --init
+```
+
+#### Fastest Fourier Transform in the West: FFTW
+To allow comparison with FFTW [http://www.fftw.org/](http://www.fftw.org/),
+the cmake option `-DPFFFT_USE_BENCH_FFTW=ON` has to be used with the commands below.
+This cmake option requires the following (Debian/Ubuntu) package to be installed beforehand:
+```
+sudo apt-get install libfftw3-dev
+```
+
+#### Intel Math Kernel Library: MKL
+Intel's MKL [https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl.html](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl.html)
+currently looks even faster than FFTW.
+
+On Ubuntu Linux it is easy to set up with the package `intel-mkl`.
+Similarly on Debian: `intel-mkl-full`.
+
+There are special repositories for the following Linux distributions:
+* Debian/apt: [https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html](https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html)
+* RedHat/yum: [https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-yum-repo.html](https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-yum-repo.html)
+* Gentoo/ebuild: [https://packages.gentoo.org/packages/sci-libs/mkl](https://packages.gentoo.org/packages/sci-libs/mkl)
+
+#### Performing the benchmarks - with CMake
+The benchmarks should be prepared by creating a dedicated build folder:
+```
+mkdir build_benches
+cd build_benches
+cmake ../bench
+```
+
+There are several CMake options to parametrize which FFT implementations should be benchmarked.
+You can explore all available options with `cmake-gui` or `ccmake`, see [CMake](#cmake).
+
+Some of the options:
+* `BENCH_ID` names the benchmark - used in the result filename
+* `BENCH_ARCH` target architecture passed to the compiler for code optimization
+* `PFFFT_USE_BENCH_FFTW` use (system-installed) FFTW3 in the fft benchmark? (default: OFF)
+* `PFFFT_USE_BENCH_GREEN` use Green FFT in the fft benchmark? (default: ON)
+* `PFFFT_USE_BENCH_KISS` use KissFFT in the fft benchmark? (default: ON)
+* `PFFFT_USE_BENCH_POCKET` use PocketFFT in the fft benchmark? (default: ON)
+* `PFFFT_USE_BENCH_MKL` use Intel MKL in the fft benchmark? (default: OFF)
+
+These options can be passed to `cmake` at the command line, e.g.
+```
+cmake -DBENCH_ARCH=native -DPFFFT_USE_BENCH_FFTW=ON -DPFFFT_USE_BENCH_MKL=ON ../bench
+```
+
+The benchmarks are built and executed with
+```
+cmake --build .
+```
+
+You can also specify a different compiler/version with the cmake step, e.g.:
+
+```
+CC=/usr/bin/gcc-9 CXX=/usr/bin/g++-9 cmake -DBENCH_ID=gcc9 -DBENCH_ARCH=native -DPFFFT_USE_BENCH_FFTW=ON -DPFFFT_USE_BENCH_MKL=ON ../bench
+```
+
+```
+CC=/usr/bin/clang-11 CXX=/usr/bin/clang++-11 cmake -DBENCH_ID=clang11 -DBENCH_ARCH=native -DPFFFT_USE_BENCH_FFTW=ON -DPFFFT_USE_BENCH_MKL=ON ../bench
+```
+
+For MSVC on Windows, the cmake command requires the generator and architecture options and has to be called from the VS Developer prompt:
+```
+cmake -G "Visual Studio 16 2019" -A x64 ../bench/
+```
+
+See [https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators](https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators).
+
+
+
+For re-running with a different compiler (version):
+* copy the result file (.tgz), e.g. `cp *.tgz ../`
+* delete the build directory: `rm -rf *`
+* then continue with the cmake step
+
+
+#### Benchmark results and contribution
+You might contribute by providing us with the results from your computer(s).
+ +The benchmark results are stored in a separate git-repository: +See [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks). + +This is to keep this repositories' sources small. + diff --git a/pffft/bench/CMakeLists.txt b/pffft/bench/CMakeLists.txt new file mode 100644 index 0000000..2bc49c6 --- /dev/null +++ b/pffft/bench/CMakeLists.txt @@ -0,0 +1,224 @@ +cmake_minimum_required(VERSION 2.8) +project(BENCH_PFFFT) + +set(BENCH_ID "default" CACHE STRING "ID: use single word without spaces. gets part of result filename") + +option(BENCH_FAST_MATH "Build with fast math - non IEEE compliant" ON) + +if (CMAKE_C_COMPILER_ID STREQUAL "GNU") + set(BENCH_ARCH "native" CACHE STRING "target architecture (-march): native/SSE:core2/AVX:sandybridge/ARM-NEON:armv7-a") +elseif (CMAKE_C_COMPILER_ID STREQUAL "Clang") + set(BENCH_ARCH "native" CACHE STRING "target architecture (-march): native/SSE:core2/AVX:sandybridge") +elseif (CMAKE_C_COMPILER_ID STREQUAL "MSVC") # others: "Intel" + set(BENCH_ARCH "AVX" CACHE STRING "target architecture (/arch): SSE2/AVX") +else() + set(BENCH_ARCH "" CACHE STRING "target architecture - use full compiler option!") +endif() + +# architecture/optimization options +option(PFFFT_USE_SIMD "use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? - " ON) +option(DISABLE_SIMD_AVX "disable AVX CPU features? - " OFF) +option(PFFFT_USE_SIMD_NEON "force using NEON on ARM? (requires PFFFT_USE_SIMD)" OFF) +option(PFFFT_USE_SCALAR_VECT "use 4-element vector scalar operations (if no other SIMD)" ON) + +option(PFFFT_USE_BENCH_FFTW "use (system-installed) FFTW3 in fft benchmark?" OFF) +option(PFFFT_USE_BENCH_GREEN "use Green FFT in fft benchmark? - if exists in subdir" ON) +option(PFFFT_USE_BENCH_KISS "use KissFFT in fft benchmark? - if exists in subdir" ON) +option(PFFFT_USE_BENCH_POCKET "use PocketFFT in fft benchmark? - if exists in subdir" ON) +option(PFFFT_USE_BENCH_MKL "use Intel MKL in fft benchmark? 
needs to be installed" OFF) + + +set(OSSTR "") +if (WIN32) + set(OSSTR "Win32") +endif (WIN32) +if (UNIX) + set(OSSTR "Unix") +endif (UNIX) + +set(BUILD_DIR_TO_EXE "") +set(CMAKE_PLATFORM_OPT "") +set(CMAKE_MAKE_OPT "") +if (MSVC) + set(BUILD_DIR_TO_EXE "Release/") + set(CMAKE_PLATFORM_OPT "-A \"${CMAKE_GENERATOR_PLATFORM}\"") + set(CMAKE_MAKE_OPT "-DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}") +endif() + + +set(benchdir "${CMAKE_BINARY_DIR}/bench_${BENCH_ID}") +set(benchdir_flt "${CMAKE_BINARY_DIR}/bench_${BENCH_ID}/float") +set(benchdir_dbl "${CMAKE_BINARY_DIR}/bench_${BENCH_ID}/double") +set(builddir_flt "${CMAKE_BINARY_DIR}/build_${BENCH_ID}_float") +set(builddir_dbl "${CMAKE_BINARY_DIR}/build_${BENCH_ID}_double") + +add_custom_command(OUTPUT "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E make_directory "${benchdir}" + COMMAND ${CMAKE_COMMAND} -E echo "benchmark ${BENCH_ID}" > "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "CMake major: ${CMAKE_MAJOR_VERSION}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "CMake minor: ${CMAKE_MINOR_VERSION}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "OS: ${OSSTR}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "System: ${CMAKE_SYSTEM_NAME}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "System CPU: ${CMAKE_SYSTEM_PROCESSOR}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "System Version: ${CMAKE_HOST_SYSTEM_VERSION}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "C Compiler: ${CMAKE_C_COMPILER_ID}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "C Version: ${CMAKE_C_COMPILER_VERSION}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "C++ Compiler: ${CMAKE_CXX_COMPILER_ID}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "C++ Version: ${CMAKE_CXX_COMPILER_VERSION}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "MSVC Version: ${MSVC_VERSION}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "MSVC Toolset: ${MSVC_TOOLSET_VERSION}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "Exe Suffix: ${CMAKE_EXECUTABLE_SUFFIX}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "C Byte Order: ${CMAKE_C_BYTE_ORDER}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "C++ Byte Order: ${CMAKE_CXX_BYTE_ORDER}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "Architecture: ${BENCH_ARCH}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "Fast math: ${BENCH_FAST_MATH}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_SIMD=${PFFFT_USE_SIMD}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "config DISABLE_SIMD_AVX=${DISABLE_SIMD_AVX}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_SIMD_NEON=${PFFFT_USE_SIMD_NEON}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_SCALAR_VECT=${PFFFT_USE_SCALAR_VECT}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_FFTW=${PFFFT_USE_BENCH_FFTW}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_GREEN=${PFFFT_USE_BENCH_GREEN}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_KISS=${PFFFT_USE_BENCH_KISS}" >> "${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_POCKET=${PFFFT_USE_BENCH_POCKET}" >> 
"${benchdir}/info.txt" + COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_MKL=${PFFFT_USE_BENCH_MKL}" >> "${benchdir}/info.txt" +) + +if (UNIX) + add_custom_command(OUTPUT "${benchdir}/unix_info.txt" + COMMAND ${CMAKE_COMMAND} -E touch "${benchdir}/unix_info.txt" + COMMAND bash "-c" "${CMAKE_CURRENT_SOURCE_DIR}/unix_info.sh" + DEPENDS "${benchdir}/info.txt" + WORKING_DIRECTORY ${benchdir} + ) +else() + add_custom_command(OUTPUT "${benchdir}/unix_info.txt" + COMMAND ${CMAKE_COMMAND} -E touch "${benchdir}/unix_info.txt" + DEPENDS "${benchdir}/info.txt" + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) +endif() + + +add_custom_command(OUTPUT "${builddir_flt}/directory.txt" + COMMAND ${CMAKE_COMMAND} -E make_directory "${builddir_flt}" + COMMAND ${CMAKE_COMMAND} -E touch "${builddir_flt}/directory.txt" +) + +add_custom_command(OUTPUT "${builddir_dbl}/directory.txt" + COMMAND ${CMAKE_COMMAND} -E make_directory "${builddir_dbl}" + COMMAND ${CMAKE_COMMAND} -E touch "${builddir_dbl}/directory.txt" +) + +add_custom_command(OUTPUT "${benchdir_flt}/directory.txt" + COMMAND ${CMAKE_COMMAND} -E make_directory "${benchdir_flt}" + COMMAND ${CMAKE_COMMAND} -E touch "${benchdir_flt}/directory.txt" +) + +add_custom_command(OUTPUT "${benchdir_dbl}/directory.txt" + COMMAND ${CMAKE_COMMAND} -E make_directory "${benchdir_dbl}" + COMMAND ${CMAKE_COMMAND} -E touch "${benchdir_dbl}/directory.txt" +) + + + +add_custom_target(build_float + COMMAND ${CMAKE_COMMAND} -E echo "start cmake for float in ${builddir_flt}" + COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" ${CMAKE_PLATFORM_OPT} + "${CMAKE_MAKE_OPT}" + -DCMAKE_BUILD_TYPE=Release + "-DARCH=${BENCH_ARCH}" + -DUSE_FAST_MATH=${BENCH_FAST_MATH} + -DPFFFT_USE_TYPE_FLOAT=ON + -DPFFFT_USE_TYPE_DOUBLE=OFF + -DUSE_FLOAT_PREC=ON + -DPFFFT_USE_SIMD=${PFFFT_USE_SIMD} + -DDISABLE_SIMD_AVX=${DISABLE_SIMD_AVX} + -DPFFFT_USE_SIMD_NEON=${PFFFT_USE_SIMD_NEON} + -DPFFFT_USE_SCALAR_VECT=${PFFFT_USE_SCALAR_VECT} + -DPFFFT_USE_BENCH_FFTW=${PFFFT_USE_BENCH_FFTW} + -DPFFFT_USE_BENCH_GREEN=${PFFFT_USE_BENCH_GREEN} + -DPFFFT_USE_BENCH_KISS=${PFFFT_USE_BENCH_KISS} + -DPFFFT_USE_BENCH_POCKET=${PFFFT_USE_BENCH_POCKET} + -DPFFFT_USE_BENCH_MKL=${PFFFT_USE_BENCH_MKL} + "${CMAKE_SOURCE_DIR}/.." + # COMMAND ${CMAKE_COMMAND} -E echo "start cmake --build . for float in ${builddir_flt}" + COMMAND ${CMAKE_COMMAND} --build . --config Release + DEPENDS "${builddir_flt}/directory.txt" + WORKING_DIRECTORY "${builddir_flt}" +) + +add_custom_target(build_double + COMMAND ${CMAKE_COMMAND} -E echo "start cmake for double in ${builddir_dbl}" + COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" ${CMAKE_PLATFORM_OPT} + "${CMAKE_MAKE_OPT}" + -DCMAKE_BUILD_TYPE=Release + "-DARCH=${BENCH_ARCH}" + -DUSE_FAST_MATH=${BENCH_FAST_MATH} + -DPFFFT_USE_TYPE_FLOAT=OFF + -DPFFFT_USE_TYPE_DOUBLE=ON + -DUSE_FLOAT_PREC=OFF + -DPFFFT_USE_SIMD=${PFFFT_USE_SIMD} + -DDISABLE_SIMD_AVX=${DISABLE_SIMD_AVX} + -DPFFFT_USE_SIMD_NEON=${PFFFT_USE_SIMD_NEON} + -DPFFFT_USE_SCALAR_VECT=${PFFFT_USE_SCALAR_VECT} + -DPFFFT_USE_BENCH_FFTW=${PFFFT_USE_BENCH_FFTW} + -DPFFFT_USE_BENCH_GREEN=${PFFFT_USE_BENCH_GREEN} + -DPFFFT_USE_BENCH_KISS=${PFFFT_USE_BENCH_KISS} + -DPFFFT_USE_BENCH_POCKET=${PFFFT_USE_BENCH_POCKET} + -DPFFFT_USE_BENCH_MKL=${PFFFT_USE_BENCH_MKL} + "${CMAKE_SOURCE_DIR}/.." + COMMAND ${CMAKE_COMMAND} -E echo "start cmake --build . for double in ${builddir_dbl}" + COMMAND ${CMAKE_COMMAND} --build . 
--config Release + DEPENDS "${builddir_dbl}/directory.txt" + WORKING_DIRECTORY "${builddir_dbl}" +) + +add_custom_target(bench_float + COMMAND ${CMAKE_COMMAND} -E echo "start benchmark for float" + COMMAND "${builddir_flt}/${BUILD_DIR_TO_EXE}bench_pffft_float${CMAKE_EXECUTABLE_SUFFIX}" + DEPENDS "${benchdir_flt}/directory.txt" build_float + WORKING_DIRECTORY "${benchdir_flt}" +) + +add_custom_target(bench_double + COMMAND ${CMAKE_COMMAND} -E echo "start benchmark for double" + COMMAND "${builddir_dbl}/${BUILD_DIR_TO_EXE}bench_pffft_double${CMAKE_EXECUTABLE_SUFFIX}" + DEPENDS "${benchdir_dbl}/directory.txt" build_double + WORKING_DIRECTORY "${benchdir_dbl}" +) + +add_custom_target(bench ALL + COMMAND ${CMAKE_COMMAND} -E echo "" + COMMAND ${CMAKE_COMMAND} -E tar cvz "bench_${BENCH_ID}.tgz" ${benchdir} + COMMAND ${CMAKE_COMMAND} -E echo "" + COMMAND ${CMAKE_COMMAND} -E echo "now mail result file bench_${BENCH_ID}.tgz" + # DEPENDS "${benchdir}/info.txt" "${benchdir}/unix_info.txt" + DEPENDS "${benchdir}/info.txt" bench_float bench_double "${benchdir}/unix_info.txt" + WORKING_DIRECTORY "${CMAKE_BINARY_DIR}" +) + +add_custom_target(bench_float_tar + COMMAND ${CMAKE_COMMAND} -E echo "" + COMMAND ${CMAKE_COMMAND} -E tar cvz "bench_${BENCH_ID}.tgz" ${benchdir} + COMMAND ${CMAKE_COMMAND} -E echo "" + COMMAND ${CMAKE_COMMAND} -E echo "now mail result file bench_${BENCH_ID}.tgz" + DEPENDS "${benchdir}/info.txt" bench_float "${benchdir}/unix_info.txt" + WORKING_DIRECTORY "${CMAKE_BINARY_DIR}" +) + +add_custom_target(bench_double_tar + COMMAND ${CMAKE_COMMAND} -E echo "" + COMMAND ${CMAKE_COMMAND} -E tar cvz "bench_${BENCH_ID}.tgz" ${benchdir} + COMMAND ${CMAKE_COMMAND} -E echo "" + COMMAND ${CMAKE_COMMAND} -E echo "now mail result file bench_${BENCH_ID}.tgz" + DEPENDS "${benchdir}/info.txt" bench_double "${benchdir}/unix_info.txt" + WORKING_DIRECTORY "${CMAKE_BINARY_DIR}" +) + +add_custom_target(clean_results + COMMAND ${CMAKE_COMMAND} -E remove_directory "${builddir_flt}" + COMMAND ${CMAKE_COMMAND} -E remove_directory "${builddir_dbl}" + WORKING_DIRECTORY "${CMAKE_BINARY_DIR}" +) + diff --git a/pffft/bench/unix_info.sh b/pffft/bench/unix_info.sh new file mode 100755 index 0000000..7ef6687 --- /dev/null +++ b/pffft/bench/unix_info.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +lscpu > unix_lscpu.txt +cat /proc/cpuinfo > unix_cpuinfo.txt +lsb_release -a > unix_lsb_release.txt +FILES=$(ls -1 /etc/*-release) +if [ ! -z "$FILES" ]; then + cp /etc/*-release ./ +fi diff --git a/pffft/bench_conv.cpp b/pffft/bench_conv.cpp new file mode 100644 index 0000000..a42d8ef --- /dev/null +++ b/pffft/bench_conv.cpp @@ -0,0 +1,345 @@ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "papi_perf_counter.h" + +//#if defined(HAVE_MIPP) && !defined(NO_MIPP) +#if defined(HAVE_MIPP) +#include + +#define MIPP_VECTOR mipp::vector +#else +#define MIPP_VECTOR std::vector +#endif + +#include "pf_conv_dispatcher.h" +#include "pf_conv.h" + + +#define TEST_WITH_MIN_LEN 0 + + +MIPP_VECTOR generate_rng_vec(int M, int N = -1, int seed_value = 1) +{ + MIPP_VECTOR v(N < 0 ? 
M : N); + std::mt19937 g; + g.seed(seed_value); + constexpr float scale = 1.0F / (1.0F + float(INT_FAST32_MAX)); + for (int k = 0; k < M; ++k) + v[k] = float(int_fast32_t(g())) * scale; + for (int k = M; k < N; ++k) + v[k] = 0.0F; + return v; +} + + +int bench_oop_core( + const conv_f_ptrs & conv_arch, + const float * signal, const int sz_signal, + const float * filter, const int sz_filter, + const int blockLen, + float * y + ) +{ + conv_buffer_state state; + const auto conv_oop = conv_arch.fp_conv_float_oop; + int n_out_sum = 0; + state.offset = 0; + state.size = 0; + papi_perf_counter perf_counter(1); + for (int off = 0; off + blockLen <= sz_signal; off += blockLen) + { + state.size += blockLen; + int n_out = conv_oop(signal, &state, filter, sz_filter, y); + n_out_sum += n_out; + } + return n_out_sum; +} + +int bench_inplace_core( + const conv_f_ptrs & conv_arch, + float * signal, const int sz_signal, + const float * filter, const int sz_filter, + const int blockLen + ) +{ + conv_buffer_state state; + const auto conv_inplace = conv_arch.fp_conv_float_inplace; + int n_out_sum = 0; + state.offset = 0; + state.size = 0; + papi_perf_counter perf_counter(1); + for (int off = 0; off + blockLen <= sz_signal; off += blockLen) + { + state.size += blockLen; + int n_out = conv_inplace(signal, &state, filter, sz_filter); + n_out_sum += n_out; + } + return n_out_sum; +} + + +int bench_oop( + const conv_f_ptrs & conv_arch, + float * buffer, + const float * signal, const int sz_signal, + const float * filter, const int sz_filter, + const int blockLen, + float * y + ) +{ + conv_buffer_state state; + const auto conv_oop = conv_arch.fp_conv_float_oop; + const auto move_rest = conv_arch.fp_conv_float_move_rest; + int n_out_sum = 0; + state.offset = 0; + state.size = 0; + papi_perf_counter perf_counter(1); + for (int off = 0; off + blockLen <= sz_signal; off += blockLen) + { + move_rest(buffer, &state); + //memcpy(buffer+state.size, &s[off], B * sizeof(s[0])); + std::copy(&signal[off], &signal[off+blockLen], buffer+state.size); + state.size += blockLen; + int n_out = conv_oop(buffer, &state, filter, sz_filter, &y[n_out_sum]); + n_out_sum += n_out; + } + return n_out_sum; +} + +int bench_cx_real_oop( + const conv_f_ptrs & conv_arch, + complexf * buffer, + const float * signal_re, const int sz_signal_re, + const float * filter, const int sz_filter, + const int blockLen, + float * y_re + ) +{ + conv_buffer_state state; + const auto conv_oop = conv_arch.fp_conv_cplx_float_oop; + const auto move_rest = conv_arch.fp_conv_cplx_move_rest; + // interpret buffer, signal and output vector y as complex data + complexf * y = reinterpret_cast(y_re); + const complexf * signal = reinterpret_cast(signal_re); + const int sz_signal = sz_signal_re / 2; + int n_out_sum = 0; + state.offset = 0; + state.size = 0; + papi_perf_counter perf_counter(1); + for (int off = 0; off + blockLen <= sz_signal; off += blockLen) + { + move_rest(buffer, &state); + //memcpy(buffer+state.size, &s[off], B * sizeof(s[0])); + std::copy(&signal[off], &signal[off+blockLen], &buffer[state.size]); + state.size += blockLen; + int n_out = conv_oop(buffer, &state, filter, sz_filter, &y[n_out_sum]); + n_out_sum += n_out; + } + return n_out_sum; +} + + +int main(int argc, char *argv[]) +{ + // cli defaults: + // process up to 64 MSample (512 MByte) in blocks of 1 kSamples (=64 kByte) with filterLen 128 + int arch = 0, N = 64 * 1024 * 1024; + int filterLen = 128, blockLen = 1024; + int seed_sig = 1, seed_filter = 2; + bool verbose = false, exitFromUsage = 
false, showUsage = (argc <= 1); + + for (int i = 1; i < argc; ++i) + { + if (i+1 < argc && !strcmp(argv[i], "-a")) + arch = atoi(argv[++i]); + else if (i+1 < argc && !strcmp(argv[i], "-n")) + N = atoi(argv[++i]) * 1024 * 1024; + else if (i+1 < argc && !strcmp(argv[i], "-f")) + filterLen = atoi(argv[++i]); + else if (i+1 < argc && !strcmp(argv[i], "-b")) + blockLen = atoi(argv[++i]); + else if (i+1 < argc && !strcmp(argv[i], "-ss")) + seed_sig = atoi(argv[++i]); + else if (i+1 < argc && !strcmp(argv[i], "-sf")) + seed_filter = atoi(argv[++i]); + else if (!strcmp(argv[i], "-v")) + verbose = true; + else if (!strcmp(argv[i], "-h")) + showUsage = exitFromUsage = true; + else + fprintf(stderr, "warning: ignoring/skipping unknown option '%s'\n", argv[i]); + } + + int num_arch = 0; + const ptr_to_conv_f_ptrs * conv_arch_ptrs = get_all_conv_arch_ptrs(&num_arch); + + if (verbose) + { + fprintf(stderr, "num_arch is %d\n", num_arch); + for (int a = 0; a < num_arch; ++a) + if (conv_arch_ptrs[a]) + fprintf(stderr, " arch %d is '%s'\n", a, conv_arch_ptrs[a]->id ); + else + fprintf(stderr, " arch %d is nullptr !!!\n", a ); + fprintf(stderr, "\n"); + } + + if ( arch < 0 || arch >= num_arch || !blockLen || !N || !filterLen || showUsage ) + { + fprintf(stderr, "%s [-v] [-a ] [-n [-f ] [-b ]\n", argv[0]); + fprintf(stderr, " [-ss ] [-sf ]\n"); + fprintf(stderr, "arch is one of:"); + for (int a = 0; a < num_arch; ++a) + if (conv_arch_ptrs[a]) + fprintf(stderr, " %d for '%s'%s", a, conv_arch_ptrs[a]->id, (a < num_arch-1 ? ",":"") ); + fprintf(stderr, "\n"); + if ( exitFromUsage || !blockLen || !N || !filterLen || arch < 0 || arch >= num_arch ) + return 0; + } + + if (verbose) + { + #ifdef HAVE_PAPI + fprintf(stderr, "PAPI is available\n"); + #else + fprintf(stderr, "PAPI is NOT available!\n"); + #endif + } + #if !defined(HAVE_MIPP) + fprintf(stderr, "MIPP is NOT available!\n"); + #endif + + //int float_simd_size[num_arch]; + int max_simd_size = -1; + for (int a = 0; a < num_arch; ++a) + { + if (conv_arch_ptrs[a]) + { + const int sz = conv_arch_ptrs[a]->fp_conv_float_simd_size(); + //float_simd_size[a] = sz; + if (max_simd_size < sz) + max_simd_size = sz; + if (verbose) + fprintf(stderr, "float simd size for '%s': %d\n", conv_arch_ptrs[a]->id, sz); + } + //else + // float_simd_size[a] = 0; + } + //const int max_simd_size = *std::max_element( &float_simd_size[0], &float_simd_size[num_arch] ); + if (verbose) + fprintf(stderr, "max float simd size: %d\n", max_simd_size); + +#if TEST_WITH_MIN_LEN + filterLen = 2; +#endif + + // round up filter length + filterLen = max_simd_size * ( ( filterLen + max_simd_size -1 ) / max_simd_size ); + +#if TEST_WITH_MIN_LEN + blockLen = 1; + N = 2 * (3 + filterLen); // produce 3+1 samples +#endif + + if (!conv_arch_ptrs[arch]) + { + fprintf(stderr, "Error: architecture %d is NOT available!\n", arch); + return 1; + } + const conv_f_ptrs & conv_arch = *conv_arch_ptrs[arch]; + if (verbose) + fprintf(stderr, "arch is using mipp: %d\n", conv_arch.using_mipp); + + fprintf(stderr, "processing N = %d MSamples with block length of %d samples with filter length %d taps on '%s'\n", + N / (1024 * 1024), blockLen, filterLen, conv_arch.id ); + + MIPP_VECTOR s = generate_rng_vec(N + 1, N + 1, seed_sig); + MIPP_VECTOR y(N + 1, 0.0F); + MIPP_VECTOR filter = generate_rng_vec(filterLen, filterLen, seed_filter); + MIPP_VECTOR buffer(blockLen + filterLen + 1, 0.0F); + MIPP_VECTOR buffer_cx(blockLen + filterLen + 1); + +#if 1 && TEST_WITH_MIN_LEN + for (int k = 0; k < N; ++k) + s[k] = (k+1); + for (int 
k = 0; k < filterLen; ++k) + filter[k] = (k+1); +#endif + + s[N] = 123.0F; + y[N] = 321.0F; + buffer[blockLen + filterLen] = 789.0F; + buffer_cx[blockLen + filterLen].i = 987.0F; + + fprintf(stderr, "\nrunning out-of-place convolution core for '%s':\n", conv_arch.id); + int n_oop_out = bench_oop_core(conv_arch, s.data(), N, filter.data(), filterLen, blockLen, y.data()); + fprintf(stderr, "oop produced %d output samples\n", n_oop_out); +#if TEST_WITH_MIN_LEN + for (int k = 0; k < n_oop_out; ++k ) + fprintf(stderr, "y[%2d] = %g\n", k, y[k]); + fprintf(stderr, "\n"); +#endif + + fprintf(stderr, "\nrunning out-of-place convolution for '%s':\n", conv_arch.id); + n_oop_out = bench_oop(conv_arch, buffer.data(), s.data(), N, filter.data(), filterLen, blockLen, y.data()); + fprintf(stderr, "oop produced %d output samples\n", n_oop_out); + assert(s[N] == 123.0F); + assert(y[N] == 321.0F); + assert(buffer[blockLen + filterLen] == 789.0F); + assert(buffer_cx[blockLen + filterLen].i == 987.0F); +#if TEST_WITH_MIN_LEN + for (int k = 0; k < n_oop_out; ++k ) + fprintf(stderr, "y[%2d] = %g\n", k, y[k]); + fprintf(stderr, "\n"); +#endif + + fprintf(stderr, "\nrunning out-of-place complex/real convolution for '%s':\n", conv_arch.id); + n_oop_out = bench_cx_real_oop(conv_arch, buffer_cx.data(), s.data(), N, filter.data(), filterLen, blockLen, y.data()); + fprintf(stderr, "oop produced %d output samples\n", n_oop_out); + assert(s[N] == 123.0F); + assert(y[N] == 321.0F); + assert(buffer[blockLen + filterLen] == 789.0F); + assert(buffer_cx[blockLen + filterLen].i == 987.0F); +#if TEST_WITH_MIN_LEN + fprintf(stderr, "complex output (%d complex samples):\n", n_oop_out); + for (int k = 0; k < n_oop_out; ++k ) + fprintf(stderr, "y[%2d] = %g %+g * i\n", k, y[2*k], y[2*k+1]); + fprintf(stderr, "\n"); + + const std::complex * sc = reinterpret_cast< std::complex* >( s.data() ); + const int Nc = N /2; + fprintf(stderr, "reference with std::complex:\n"); + for (int off = 0; off +filterLen <= Nc; ++off ) + { + std::complex sum(0.0F, 0.0F); + for (int k=0; k < filterLen; ++k) + sum += sc[off+k] * filter[k]; + fprintf(stderr, "yv[%2d] = %g %+g * i\n", off, sum.real(), sum.imag() ); + } +#endif + + fprintf(stderr, "\nrunning inplace convolution core for '%s':\n", conv_arch.id); + int n_inp_out = bench_inplace_core(conv_arch, s.data(), N, filter.data(), filterLen, blockLen); + fprintf(stderr, "inp produced %d output samples\n", n_inp_out); + assert(s[N] == 123.0F); + assert(y[N] == 321.0F); + assert(buffer[blockLen + filterLen] == 789.0F); + assert(buffer_cx[blockLen + filterLen].i == 987.0F); +#if TEST_WITH_MIN_LEN + for (int k = 0; k < n_inp_out; ++k ) + fprintf(stderr, "y[%2d] = %g\n", k, s[k]); + fprintf(stderr, "\n"); +#endif + + fprintf(stderr, "\n"); + return 0; +} diff --git a/pffft/bench_mixers.cpp b/pffft/bench_mixers.cpp new file mode 100644 index 0000000..c08a51a --- /dev/null +++ b/pffft/bench_mixers.cpp @@ -0,0 +1,889 @@ +/* + Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de ) + + bench for mixer algorithm/implementations + + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include "papi_perf_counter.h" + +#if defined(__linux__) +#define HAVE_SYS_TIMES +#endif + +#ifdef HAVE_SYS_TIMES +# include +# include +#endif + +#ifdef WIN32 +#define WIN32_LEAN_AND_MEAN +#define VC_EXTRALEAN +#include +#endif + +#define BENCH_REF_TRIG_FUNC 1 +#define BENCH_OUT_OF_PLACE_ALGOS 0 +#define BENCH_INPLACE_ALGOS 1 + +#define SAVE_BY_DEFAULT 0 +#define SAVE_LIMIT_MSPS 16 + +#if 0 + #define 
BENCH_FILE_SHIFT_MATH_CC "/home/ayguen/WindowsDesktop/mixer_test/A_shift_math_cc.bin" + #define BENCH_FILE_ADD_FAST_CC "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_cc.bin" + #define BENCH_FILE_ADD_FAST_INP_C "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_inp_c.bin" + #define BENCH_FILE_UNROLL_INP_C "/home/ayguen/WindowsDesktop/mixer_test/D_shift_unroll_inp_c.bin" + #define BENCH_FILE_LTD_UNROLL_INP_C "/home/ayguen/WindowsDesktop/mixer_test/E_shift_limited_unroll_inp_c.bin" + #define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/F_shift_limited_unroll_A_sse_inp_c.bin" + #define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/G_shift_limited_unroll_B_sse_inp_c.bin" + #define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/H_shift_limited_unroll_C_sse_inp_c.bin" + #define BENCH_FILE_REC_OSC_CC "" + #define BENCH_FILE_REC_OSC_INP_C "/home/ayguen/WindowsDesktop/mixer_test/I_shift_recursive_osc_inp_c.bin" + #define BENCH_FILE_REC_OSC_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/J_shift_recursive_osc_sse_inp_c.bin" +#else + #define BENCH_FILE_SHIFT_MATH_CC "" + #define BENCH_FILE_ADD_FAST_CC "" + #define BENCH_FILE_ADD_FAST_INP_C "" + #define BENCH_FILE_UNROLL_INP_C "" + #define BENCH_FILE_LTD_UNROLL_INP_C "" + #define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C "" + #define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C "" + #define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C "" + #define BENCH_FILE_REC_OSC_CC "" + #define BENCH_FILE_REC_OSC_INP_C "" + #define BENCH_FILE_REC_OSC_SSE_INP_C "" +#endif + + + +#if defined(HAVE_SYS_TIMES) + static double ttclk = 0.; + + static double uclock_sec(int find_start) + { + struct tms t0, t; + if (ttclk == 0.) + { + ttclk = sysconf(_SC_CLK_TCK); + fprintf(stderr, "sysconf(_SC_CLK_TCK) => %f\n", ttclk); + } + times(&t); + if (find_start) + { + t0 = t; + while (t0.tms_utime == t.tms_utime) + times(&t); + } + /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */ + return ((double)t.tms_utime) / ttclk; + } + +#elif defined(WIN32) + // https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getprocesstimes + double uclock_sec(int find_start) + { + FILETIME a, b, c, d; + if (GetProcessTimes(GetCurrentProcess(), &a, &b, &c, &d) != 0) + { + // Returns total user time. + // Can be tweaked to include kernel times as well. + return + (double)(d.dwLowDateTime | + ((unsigned long long)d.dwHighDateTime << 32)) * 0.0000001; + } + else { + // Handle error + return 0; + } + } + +#else + double uclock_sec(int find_start) + { return (double)clock()/(double)CLOCKS_PER_SEC; } +#endif + + +void save(complexf * d, int B, int N, const char * fn) +{ + if (!fn || !fn[0]) + { + if (! 
SAVE_BY_DEFAULT) + return; + fn = "/dev/shm/bench.bin"; + } + FILE * f = fopen(fn, "wb"); + if (!f) { + fprintf(stderr, "error writing result to %s\n", fn); + return; + } + if ( N >= SAVE_LIMIT_MSPS * 1024 * 1024 ) + N = SAVE_LIMIT_MSPS * 1024 * 1024; + for (int off = 0; off + B <= N; off += B) + { + fwrite(d+off, sizeof(complexf), B, f); + } + fclose(f); +} + + +double bench_core_shift_math_cc( + const int B, const int N, const bool ignore_time, + const complexf *input, + complexf *output, + int &iters_out, int &off_out + ) +{ + const double t0 = uclock_sec(1); + const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + double t1; + float phase = 0.0F; + int off = 0, iter = 0; + papi_perf_counter perf_counter(1); + + do { + // work + phase = shift_math_cc(input+off, output+off, B, -0.0009F, phase); + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( off + B < N && (ignore_time || t1 < tstop) ); + + iters_out = iter; + off_out = off; + return t1 - t0; +} + +double bench_shift_math_cc(const int B, const int N, const bool ignore_time) { + int iter, off; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + complexf *output = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + double T = bench_core_shift_math_cc(B, N, ignore_time, input, output, iter, off); + + save(output, B, off, BENCH_FILE_SHIFT_MATH_CC); + + free(input); + free(output); + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_shift_table_cc(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + int table_size=65536; + float phase = 0.0F; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + complexf *output = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + + shift_table_data_t table_data = shift_table_init(table_size); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(1); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + phase = shift_table_cc(input+off, output+off, B, -0.0009F, table_data, phase); + + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( t1 < tstop && off + B < N ); + + save(output, B, off, NULL); + free(input); + free(output); + T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_shift_addfast(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + float phase = 0.0F; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + complexf *output = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_addfast_data_t state = shift_addfast_init(-0.0009F); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(1); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + phase = 
shift_addfast_cc(input+off, output+off, B, &state, phase); + + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( t1 < tstop && off + B < N ); + + save(output, B, off, BENCH_FILE_ADD_FAST_CC); + + free(input); + free(output); + T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_core_shift_addfast_inplace( + const int B, const int N, const bool ignore_time, + complexf *data, + shift_addfast_data_t &state, + int &iters_out, int &off_out + ) +{ + const double t0 = uclock_sec(1); + const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + double t1; + float phase = 0.0F; + int off = 0, iter = 0; + papi_perf_counter perf_counter(1); + + do { + // work + phase = shift_addfast_inp_c(data+off, B, &state, phase); + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( off + B < N && (ignore_time || t1 < tstop) ); + + iters_out = iter; + off_out = off; + return t1 - t0; +} + +double bench_shift_addfast_inp(int B, int N, const bool ignore_time) { + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_addfast_data_t state = shift_addfast_init(-0.0009F); + int iter, off; + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + double T = bench_core_shift_addfast_inplace( + B, N, ignore_time, input, state, + iter, off + ); + + save(input, B, off, BENCH_FILE_ADD_FAST_INP_C); + + free(input); + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_shift_unroll_oop(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + float phase = 0.0F; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + complexf *output = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_unroll_data_t state = shift_unroll_init(-0.0009F, B); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(1); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + phase = shift_unroll_cc(input+off, output+off, B, &state, phase); + + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( t1 < tstop && off + B < N ); + + save(output, B, off, NULL); + free(input); + free(output); + T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_core_shift_unroll_inplace( + const int B, const int N, const bool ignore_time, + complexf *data, + shift_unroll_data_t &state, + int &iters_out, int &off_out + ) +{ + const double t0 = uclock_sec(1); + const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + double t1; + float phase = 0.0F; + int off = 0, iter = 0; + papi_perf_counter perf_counter(1); + + do { + // work + phase = shift_unroll_inp_c(data+off, B, &state, phase); + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( off + B < N && (ignore_time || t1 < tstop) ); + + 
iters_out = iter; + off_out = off; + return t1 - t0; +} + +double bench_shift_unroll_inp(const int B, const int N, const bool ignore_time) { + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_unroll_data_t state = shift_unroll_init(-0.0009F, B); + int iter, off; + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + double T = bench_core_shift_unroll_inplace( + B, N, ignore_time, input, state, + iter, off + ); + + save(input, B, off, BENCH_FILE_UNROLL_INP_C); + + free(input); + shift_unroll_deinit(&state); + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + + +double bench_shift_limited_unroll_oop(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + complexf *output = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(1); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + shift_limited_unroll_cc(input+off, output+off, B, &state); + + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( t1 < tstop && off + B < N ); + + save(output, B, off, NULL); + free(input); + free(output); + T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_core_shift_limited_unroll_inplace( + const int B, const int N, const bool ignore_time, + complexf *data, + shift_limited_unroll_data_t &state, + int &iters_out, int &off_out + ) +{ + const double t0 = uclock_sec(1); + const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + double t1; + int off = 0, iter = 0; + papi_perf_counter perf_counter(1); + + do { + // work + shift_limited_unroll_inp_c(data+off, B, &state); + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( off + B < N && (ignore_time || t1 < tstop) ); + + iters_out = iter; + off_out = off; + return t1 - t0; +} + +double bench_shift_limited_unroll_inp(const int B, const int N, const bool ignore_time) { + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F); + int iter, off; + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + double T = bench_core_shift_limited_unroll_inplace( + B, N, ignore_time, input, state, + iter, off + ); + + save(input, B, off, BENCH_FILE_LTD_UNROLL_INP_C); + + free(input); + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_core_shift_limited_unroll_A_sse_inplace( + const int B, const int N, const bool ignore_time, + complexf *data, + 
shift_limited_unroll_A_sse_data_t &state, + int &iters_out, int &off_out + ) +{ + const double t0 = uclock_sec(1); + const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + double t1; + int off = 0, iter = 0; + papi_perf_counter perf_counter(1); + + do { + // work + shift_limited_unroll_A_sse_inp_c(data+off, B, &state); + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( off + B < N && (ignore_time || t1 < tstop) ); + + iters_out = iter; + off_out = off; + return t1 - t0; +} + +double bench_shift_limited_unroll_A_sse_inp(const int B, const int N, const bool ignore_time) { + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_limited_unroll_A_sse_data_t *state = (shift_limited_unroll_A_sse_data_t*)malloc(sizeof(shift_limited_unroll_A_sse_data_t)); + int iter, off; + + *state = shift_limited_unroll_A_sse_init(-0.0009F, 0.0F); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + double T = bench_core_shift_limited_unroll_A_sse_inplace( + B, N, ignore_time, input, *state, + iter, off + ); + + save(input, B, off, BENCH_FILE_LTD_UNROLL_A_SSE_INP_C); + + free(input); + free(state); + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_core_shift_limited_unroll_B_sse_inplace( + const int B, const int N, const bool ignore_time, + complexf *data, + shift_limited_unroll_B_sse_data_t &state, + int &iters_out, int &off_out + ) +{ + const double t0 = uclock_sec(1); + const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + double t1; + int off = 0, iter = 0; + papi_perf_counter perf_counter(1); + + do { + // work + shift_limited_unroll_B_sse_inp_c(data+off, B, &state); + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( off + B < N && (ignore_time || t1 < tstop) ); + + iters_out = iter; + off_out = off; + return t1 - t0; +} + +double bench_shift_limited_unroll_B_sse_inp(const int B, const int N, const bool ignore_time) { + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_limited_unroll_B_sse_data_t *state = (shift_limited_unroll_B_sse_data_t*)malloc(sizeof(shift_limited_unroll_B_sse_data_t)); + int iter, off; + + *state = shift_limited_unroll_B_sse_init(-0.0009F, 0.0F); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + //shift_recursive_osc_init(0.0F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + double T = bench_core_shift_limited_unroll_B_sse_inplace( + B, N, ignore_time, input, *state, + iter, off + ); + + save(input, B, off, BENCH_FILE_LTD_UNROLL_B_SSE_INP_C); + + free(input); + free(state); + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_core_shift_limited_unroll_C_sse_inplace( + const int B, const int N, const bool ignore_time, + complexf *data, + shift_limited_unroll_C_sse_data_t &state, + int &iters_out, int &off_out + ) +{ + const double t0 = uclock_sec(1); + const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + double t1; + int off = 0, iter = 0; + papi_perf_counter perf_counter(1); + + do 
{ + // work + shift_limited_unroll_C_sse_inp_c(data+off, B, &state); + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( off + B < N && (ignore_time || t1 < tstop) ); + + iters_out = iter; + off_out = off; + return t1 - t0; +} + +double bench_shift_limited_unroll_C_sse_inp(const int B, const int N, const bool ignore_time) { + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_limited_unroll_C_sse_data_t *state = (shift_limited_unroll_C_sse_data_t*)malloc(sizeof(shift_limited_unroll_C_sse_data_t)); + int iter, off; + + *state = shift_limited_unroll_C_sse_init(-0.0009F, 0.0F); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + double T = bench_core_shift_limited_unroll_C_sse_inplace( + B, N, ignore_time, input, *state, + iter, off + ); + + save(input, B, off, BENCH_FILE_LTD_UNROLL_C_SSE_INP_C); + + free(input); + free(state); + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_shift_rec_osc_cc_oop(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + complexf *output = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state, shift_state; + shift_recursive_osc_conf_t gen_conf, shift_conf; + + shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state); + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(1); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + shift_recursive_osc_cc(input+off, output+off, B, &shift_conf, &shift_state); + + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( t1 < tstop && off + B < N ); + + save(input, B, off, BENCH_FILE_REC_OSC_CC); + + save(output, B, off, NULL); + free(input); + free(output); + T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_core_shift_rec_osc_cc_inplace( + const int B, const int N, const bool ignore_time, + complexf *data, + shift_recursive_osc_conf_t &conf, shift_recursive_osc_t &state, + int &iters_out, int &off_out + ) +{ + const double t0 = uclock_sec(1); + const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + double t1; + int off = 0, iter = 0; + papi_perf_counter perf_counter(1); + + do { + // work + shift_recursive_osc_inp_c(data+off, B, &conf, &state); + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( off + B < N && (ignore_time || t1 < tstop) ); + + iters_out = iter; + off_out = off; + return t1 - t0; +} + +double bench_shift_rec_osc_cc_inp(const int B, const int N, const bool ignore_time) { + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state, shift_state; + shift_recursive_osc_conf_t gen_conf, shift_conf; + int iter, off; + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state); + + double T = bench_core_shift_rec_osc_cc_inplace( + B, N, ignore_time, 
input, shift_conf, shift_state, + iter, off + ); + + save(input, B, off, BENCH_FILE_REC_OSC_INP_C); + free(input); + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_core_shift_rec_osc_sse_c_inplace( + const int B, const int N, const bool ignore_time, + complexf *data, + shift_recursive_osc_sse_conf_t &conf, shift_recursive_osc_sse_t &state, + int &iters_out, int &off_out + ) +{ + const double t0 = uclock_sec(1); + const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + double t1; + int off = 0, iter = 0; + papi_perf_counter perf_counter(1); + + do { + // work + shift_recursive_osc_sse_inp_c(data+off, B, &conf, &state); + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( off + B < N && (ignore_time || t1 < tstop) ); + + iters_out = iter; + off_out = off; + return t1 - t0; +} + +double bench_shift_rec_osc_sse_c_inp(const int B, const int N, const bool ignore_time) { + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + + shift_recursive_osc_sse_t *shift_state = (shift_recursive_osc_sse_t*)malloc(sizeof(shift_recursive_osc_sse_t)); + shift_recursive_osc_sse_conf_t shift_conf; + int iter, off; + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + shift_recursive_osc_sse_init(-0.0009F, 0.0F, &shift_conf, shift_state); + + double T = bench_core_shift_rec_osc_sse_c_inplace( + B, N, ignore_time, input, shift_conf, *shift_state, + iter, off + ); + + save(input, B, off, BENCH_FILE_REC_OSC_SSE_INP_C); + free(input); + free(shift_state); + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + + +int main(int argc, char **argv) +{ + double rt; + + // process up to 64 MSample (512 MByte) in blocks of 8 kSamples (=64 kByte) + int B = 8 * 1024; + int N = 64 * 1024 * 1024; + int showUsage = 0; + bool ignore_time = true; + + if (argc == 1) + showUsage = 1; + + if (1 < argc) + B = atoi(argv[1]); + if (2 < argc) + N = atoi(argv[2]) * 1024 * 1024; + + if ( !B || !N || showUsage ) + { + fprintf(stderr, "%s [ [] ]\n", argv[0]); + if ( !B || !N ) + return 0; + } + + fprintf(stderr, "processing up to N = %d MSamples with block length of %d samples\n", + N / (1024 * 1024), B ); + + +#if BENCH_REF_TRIG_FUNC + printf("\nstarting bench of shift_math_cc (out-of-place) with trig functions ..\n"); + rt = bench_shift_math_cc(B, N, ignore_time); + printf(" %f MSamples/sec\n\n", rt * 1E-6); +#endif + +#if BENCH_OUT_OF_PLACE_ALGOS + printf("starting bench of shift_table_cc (out-of-place) ..\n"); + rt = bench_shift_table_cc(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("starting bench of shift_addfast_cc (out-of-place) ..\n"); + rt = bench_shift_addfast(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("\nstarting bench of shift_unroll_cc (out-of-place) ..\n"); + rt = bench_shift_unroll_oop(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("\nstarting bench of shift_limited_unroll_cc (out-of-place) ..\n"); + rt = bench_shift_limited_unroll_oop(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("\nstarting bench of shift_recursive_osc_cc (out-of-place) ..\n"); + rt = 
bench_shift_rec_osc_cc_oop(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); +#endif + +#if BENCH_INPLACE_ALGOS + + printf("starting bench of shift_addfast_inp_c in-place ..\n"); + rt = bench_shift_addfast_inp(B, N, ignore_time); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("starting bench of shift_unroll_inp_c in-place ..\n"); + rt = bench_shift_unroll_inp(B, N, ignore_time); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("starting bench of shift_limited_unroll_inp_c in-place ..\n"); + rt = bench_shift_limited_unroll_inp(B, N, ignore_time); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + if ( have_sse_shift_mixer_impl() ) + { + printf("starting bench of shift_limited_unroll_A_sse_inp_c in-place ..\n"); + rt = bench_shift_limited_unroll_A_sse_inp(B, N, ignore_time); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("starting bench of shift_limited_unroll_B_sse_inp_c in-place ..\n"); + rt = bench_shift_limited_unroll_B_sse_inp(B, N, ignore_time); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("starting bench of shift_limited_unroll_C_sse_inp_c in-place ..\n"); + rt = bench_shift_limited_unroll_C_sse_inp(B, N, ignore_time); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + } + + printf("starting bench of shift_recursive_osc_cc in-place ..\n"); + rt = bench_shift_rec_osc_cc_inp(B, N, ignore_time); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + if ( have_sse_shift_mixer_impl() ) + { + printf("starting bench of shift_recursive_osc_sse_c in-place ..\n"); + rt = bench_shift_rec_osc_sse_c_inp(B, N, ignore_time); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + } +#endif + + return 0; +} + diff --git a/pffft/bench_pffft.c b/pffft/bench_pffft.c new file mode 100644 index 0000000..7abb48d --- /dev/null +++ b/pffft/bench_pffft.c @@ -0,0 +1,1402 @@ +/* + Copyright (c) 2013 Julien Pommier. + Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de ) + + Small test & bench for PFFFT, comparing its performance with the scalar FFTPACK, FFTW, Intel MKL, and Apple vDSP + + How to build: + + on linux, with fftw3: + gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm + + on macos, without fftw3: + clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate + + on macos, with fftw3: + clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate + + as alternative: replace clang by gcc. + + on macos, with fftw3 and Intel MKL: + clang -o test_pffft -I /opt/intel/mkl/include -DHAVE_FFTW -DHAVE_VECLIB -DHAVE_MKL -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate /opt/intel/mkl/lib/libmkl_{intel_lp64,sequential,core}.a + + on windows, with visual c++: + cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c + + build without SIMD instructions: + gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm + + */ + +#define CONCAT_TOKENS(A, B) A ## B +#define CONCAT_THREE_TOKENS(A, B, C) A ## B ## C + +#ifdef PFFFT_ENABLE_FLOAT +#include "pffft.h" + +typedef float pffft_scalar; +typedef PFFFT_Setup PFFFT_SETUP; +#define PFFFT_FUNC(F) CONCAT_TOKENS(pffft_, F) + +#else +/* +Note: adapted for double precision dynamic range version. 
+*/ +#include "pffft_double.h" + +typedef double pffft_scalar; +typedef PFFFTD_Setup PFFFT_SETUP; +#define PFFFT_FUNC(F) CONCAT_TOKENS(pffftd_, F) +#endif + +#ifdef HAVE_FFTPACK +#include "fftpack.h" +#endif + +#ifdef PFFFT_ENABLE_FLOAT + +#ifdef HAVE_GREEN_FFTS +#include "fftext.h" +#endif + +#ifdef HAVE_KISS_FFT +#include +#include +#endif + +#endif + +#ifdef HAVE_POCKET_FFT +#include +#include +#endif + +#ifdef PFFFT_ENABLE_FLOAT + #define POCKFFTR_PRE(R) CONCAT_TOKENS(rffts, R) + #define POCKFFTC_PRE(R) CONCAT_TOKENS(cffts, R) + #define POCKFFTR_MID(L,R) CONCAT_THREE_TOKENS(L, rffts, R) + #define POCKFFTC_MID(L,R) CONCAT_THREE_TOKENS(L, cffts, R) +#else + #define POCKFFTR_PRE(R) CONCAT_TOKENS(rfft, R) + #define POCKFFTC_PRE(R) CONCAT_TOKENS(cfft, R) + #define POCKFFTR_MID(L,R) CONCAT_THREE_TOKENS(L, rfft, R) + #define POCKFFTC_MID(L,R) CONCAT_THREE_TOKENS(L, cfft, R) +#endif + + + +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_SYS_TIMES +# include +# include +#endif + +#ifdef HAVE_VECLIB +# include +#endif + +#ifdef HAVE_FFTW +# include + +#ifdef PFFFT_ENABLE_FLOAT +typedef fftwf_plan FFTW_PLAN; +typedef fftwf_complex FFTW_COMPLEX; +#define FFTW_FUNC(F) CONCAT_TOKENS(fftwf_, F) +#else +typedef fftw_plan FFTW_PLAN; +typedef fftw_complex FFTW_COMPLEX; +#define FFTW_FUNC(F) CONCAT_TOKENS(fftw_, F) +#endif + +#endif /* HAVE_FFTW */ + +#ifdef HAVE_MKL +# include +#endif + +#ifndef M_LN2 + #define M_LN2 0.69314718055994530942 /* log_e 2 */ +#endif + + +#define NUM_FFT_ALGOS 10 +enum { + ALGO_FFTPACK = 0, + ALGO_VECLIB, + ALGO_FFTW_ESTIM, + ALGO_FFTW_AUTO, + ALGO_GREEN, + ALGO_KISS, + ALGO_POCKET, + ALGO_MKL, + ALGO_PFFFT_U, /* = 8 */ + ALGO_PFFFT_O /* = 9 */ +}; + +#define NUM_TYPES 7 +enum { + TYPE_PREP = 0, /* time for preparation in ms */ + TYPE_DUR_NS = 1, /* time per fft in ns */ + TYPE_DUR_FASTEST = 2, /* relative time to fastest */ + TYPE_REL_PFFFT = 3, /* relative time to ALGO_PFFFT */ + TYPE_ITER = 4, /* # of iterations in measurement */ + TYPE_MFLOPS = 5, /* MFlops/sec */ + TYPE_DUR_TOT = 6 /* test duration in sec */ +}; +/* double tmeas[NUM_TYPES][NUM_FFT_ALGOS]; */ + +const char * algoName[NUM_FFT_ALGOS] = { + "FFTPack ", + "vDSP (vec) ", + "FFTW F(estim)", + "FFTW F(auto) ", + "Green ", + "Kiss ", + "Pocket ", + "Intel MKL ", + "PFFFT-U(simd)", /* unordered */ + "PFFFT (simd) " /* ordered */ +}; + + +int compiledInAlgo[NUM_FFT_ALGOS] = { +#ifdef HAVE_FFTPACK + 1, /* "FFTPack " */ +#else + 0, /* "FFTPack " */ +#endif +#if defined(HAVE_VECLIB) && defined(PFFFT_ENABLE_FLOAT) + 1, /* "vDSP (vec) " */ +#else + 0, +#endif +#if defined(HAVE_FFTW) + 1, /* "FFTW(estim)" */ + 1, /* "FFTW (auto)" */ +#else + 0, 0, +#endif +#if defined(HAVE_GREEN_FFTS) && defined(PFFFT_ENABLE_FLOAT) + 1, /* "Green " */ +#else + 0, +#endif +#if defined(HAVE_KISS_FFT) && defined(PFFFT_ENABLE_FLOAT) + 1, /* "Kiss " */ +#else + 0, +#endif +#if defined(HAVE_POCKET_FFT) + 1, /* "Pocket " */ +#else + 0, +#endif +#if defined(HAVE_MKL) + 1, /* "Intel MKL " */ +#else + 0, +#endif + 1, /* "PFFFT_U " */ + 1 /* "PFFFT_O " */ +}; + +const char * algoTableHeader[NUM_FFT_ALGOS][2] = { +{ "| real FFTPack ", "| cplx FFTPack " }, +{ "| real vDSP ", "| cplx vDSP " }, +{ "|real FFTWestim", "|cplx FFTWestim" }, +{ "|real FFTWauto ", "|cplx FFTWauto " }, +{ "| real Green ", "| cplx Green " }, +{ "| real Kiss ", "| cplx Kiss " }, +{ "| real Pocket ", "| cplx Pocket " }, +{ "| real MKL ", "| cplx MKL " }, +{ "| real PFFFT-U ", "| cplx PFFFT-U " }, +{ "| real PFFFT ", "| cplx PFFFT " } }; + +const char * 
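
The CONCAT_TOKENS / PFFFT_FUNC macros above let the whole benchmark be written once and compiled against either the single-precision pffft_ API or the double-precision pffftd_ API. The standalone sketch below, using DEMO_-prefixed names so it does not clash with the real macros, shows how the token pasting resolves.

#include <stdio.h>

#define DEMO_CONCAT(A, B) A ## B
#ifdef PFFFT_ENABLE_FLOAT
  #define DEMO_PREFIXED(F) DEMO_CONCAT(pffft_, F)
#else
  #define DEMO_PREFIXED(F) DEMO_CONCAT(pffftd_, F)
#endif
#define DEMO_STR_(x) #x
#define DEMO_STR(x)  DEMO_STR_(x)

static void print_selected_api(void)
{
    /* prints "pffft_new_setup" in a float build, "pffftd_new_setup" otherwise */
    printf("PFFFT_FUNC(new_setup) resolves to: %s\n",
           DEMO_STR(DEMO_PREFIXED(new_setup)));
}
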
typeText[NUM_TYPES] = { + "preparation in ms", + "time per fft in ns", + "relative to fastest", + "relative to pffft", + "measured_num_iters", + "mflops", + "test duration in sec" +}; + +const char * typeFilenamePart[NUM_TYPES] = { + "1-preparation-in-ms", + "2-timePerFft-in-ns", + "3-rel-fastest", + "4-rel-pffft", + "5-num-iter", + "6-mflops", + "7-duration-in-sec" +}; + +#define SAVE_ALL_TYPES 0 + +const int saveType[NUM_TYPES] = { + 1, /* "1-preparation-in-ms" */ + 0, /* "2-timePerFft-in-ns" */ + 0, /* "3-rel-fastest" */ + 1, /* "4-rel-pffft" */ + 1, /* "5-num-iter" */ + 1, /* "6-mflops" */ + 1, /* "7-duration-in-sec" */ +}; + + +#define MAX(x,y) ((x)>(y)?(x):(y)) +#define MIN(x,y) ((x)<(y)?(x):(y)) + +unsigned Log2(unsigned v) { + /* we don't need speed records .. obvious way is good enough */ + /* https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogObvious */ + /* Find the log base 2 of an integer with the MSB N set in O(N) operations (the obvious way): + * unsigned v: 32-bit word to find the log base 2 of */ + unsigned r = 0; /* r will be lg(v) */ + while (v >>= 1) + { + r++; + } + return r; +} + + +double frand() { + return rand()/(double)RAND_MAX; +} + +#if defined(HAVE_SYS_TIMES) + inline double uclock_sec(void) { + static double ttclk = 0.; + struct tms t; + if (ttclk == 0.) + ttclk = sysconf(_SC_CLK_TCK); + times(&t); + /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */ + return ((double)t.tms_utime)) / ttclk; + } +# else + double uclock_sec(void) +{ return (double)clock()/(double)CLOCKS_PER_SEC; } +#endif + + +/* compare results with the regular fftpack */ +int pffft_validate_N(int N, int cplx) { + +#ifdef HAVE_FFTPACK + + int Nfloat = N*(cplx?2:1); + int Nbytes = Nfloat * sizeof(pffft_scalar); + pffft_scalar *ref, *in, *out, *tmp, *tmp2; + PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(N, cplx ? 
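
The uclock_sec() above deliberately measures user CPU time via times(), so OS scheduling noise does not leak into the measurements; the clock() fallback likewise measures processor time. If wall-clock timing were wanted instead, a POSIX alternative could look like the sketch below. This is only an illustrative alternative, not something used by the benchmark.

#include <time.h>

static double wallclock_sec(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);      /* monotonic wall clock (POSIX) */
    return (double)ts.tv_sec + 1e-9 * (double)ts.tv_nsec;
}
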
PFFFT_COMPLEX : PFFFT_REAL); + int pass; + + + if (!s) { printf("Skipping N=%d, not supported\n", N); return 0; } + ref = PFFFT_FUNC(aligned_malloc)(Nbytes); + in = PFFFT_FUNC(aligned_malloc)(Nbytes); + out = PFFFT_FUNC(aligned_malloc)(Nbytes); + tmp = PFFFT_FUNC(aligned_malloc)(Nbytes); + tmp2 = PFFFT_FUNC(aligned_malloc)(Nbytes); + + for (pass=0; pass < 2; ++pass) { + float ref_max = 0; + int k; + /* printf("N=%d pass=%d cplx=%d\n", N, pass, cplx); */ + /* compute reference solution with FFTPACK */ + if (pass == 0) { + fftpack_real *wrk = malloc(2*Nbytes+15*sizeof(pffft_scalar)); + for (k=0; k < Nfloat; ++k) { + ref[k] = in[k] = (float)( frand()*2-1 ); + out[k] = 1e30F; + } + if (!cplx) { + rffti(N, wrk); + rfftf(N, ref, wrk); + /* use our ordering for real ffts instead of the one of fftpack */ + { + float refN=ref[N-1]; + for (k=N-2; k >= 1; --k) ref[k+1] = ref[k]; + ref[1] = refN; + } + } else { + cffti(N, wrk); + cfftf(N, ref, wrk); + } + free(wrk); + } + + for (k = 0; k < Nfloat; ++k) ref_max = MAX(ref_max, (float)( fabs(ref[k]) )); + + + /* pass 0 : non canonical ordering of transform coefficients */ + if (pass == 0) { + /* test forward transform, with different input / output */ + PFFFT_FUNC(transform)(s, in, tmp, 0, PFFFT_FORWARD); + memcpy(tmp2, tmp, Nbytes); + memcpy(tmp, in, Nbytes); + PFFFT_FUNC(transform)(s, tmp, tmp, 0, PFFFT_FORWARD); + for (k = 0; k < Nfloat; ++k) { + assert(tmp2[k] == tmp[k]); + } + + /* test reordering */ + PFFFT_FUNC(zreorder)(s, tmp, out, PFFFT_FORWARD); + PFFFT_FUNC(zreorder)(s, out, tmp, PFFFT_BACKWARD); + for (k = 0; k < Nfloat; ++k) { + assert(tmp2[k] == tmp[k]); + } + PFFFT_FUNC(zreorder)(s, tmp, out, PFFFT_FORWARD); + } else { + /* pass 1 : canonical ordering of transform coeffs. */ + PFFFT_FUNC(transform_ordered)(s, in, tmp, 0, PFFFT_FORWARD); + memcpy(tmp2, tmp, Nbytes); + memcpy(tmp, in, Nbytes); + PFFFT_FUNC(transform_ordered)(s, tmp, tmp, 0, PFFFT_FORWARD); + for (k = 0; k < Nfloat; ++k) { + assert(tmp2[k] == tmp[k]); + } + memcpy(out, tmp, Nbytes); + } + + { + for (k=0; k < Nfloat; ++k) { + if (!(fabs(ref[k] - out[k]) < 1e-3*ref_max)) { + printf("%s forward PFFFT mismatch found for N=%d\n", (cplx?"CPLX":"REAL"), N); + return 1; + } + } + + if (pass == 0) PFFFT_FUNC(transform)(s, tmp, out, 0, PFFFT_BACKWARD); + else PFFFT_FUNC(transform_ordered)(s, tmp, out, 0, PFFFT_BACKWARD); + memcpy(tmp2, out, Nbytes); + memcpy(out, tmp, Nbytes); + if (pass == 0) PFFFT_FUNC(transform)(s, out, out, 0, PFFFT_BACKWARD); + else PFFFT_FUNC(transform_ordered)(s, out, out, 0, PFFFT_BACKWARD); + for (k = 0; k < Nfloat; ++k) { + assert(tmp2[k] == out[k]); + out[k] *= 1.f/N; + } + for (k = 0; k < Nfloat; ++k) { + if (fabs(in[k] - out[k]) > 1e-3 * ref_max) { + printf("pass=%d, %s IFFFT does not match for N=%d\n", pass, (cplx?"CPLX":"REAL"), N); break; + return 1; + } + } + } + + /* quick test of the circular convolution in fft domain */ + { + float conv_err = 0, conv_max = 0; + + PFFFT_FUNC(zreorder)(s, ref, tmp, PFFFT_FORWARD); + memset(out, 0, Nbytes); + PFFFT_FUNC(zconvolve_accumulate)(s, ref, ref, out, 1.0); + PFFFT_FUNC(zreorder)(s, out, tmp2, PFFFT_FORWARD); + + for (k=0; k < Nfloat; k += 2) { + float ar = tmp[k], ai=tmp[k+1]; + if (cplx || k > 0) { + tmp[k] = ar*ar - ai*ai; + tmp[k+1] = 2*ar*ai; + } else { + tmp[0] = ar*ar; + tmp[1] = ai*ai; + } + } + + for (k=0; k < Nfloat; ++k) { + float d = fabs(tmp[k] - tmp2[k]), e = fabs(tmp[k]); + if (d > conv_err) conv_err = d; + if (e > conv_max) conv_max = e; + } + if (conv_err > 1e-5*conv_max) { + printf("zconvolve 
error ? %g %g\n", conv_err, conv_max); + return 1; + } + } + + } + + printf("%s PFFFT is OK for N=%d\n", (cplx?"CPLX":"REAL"), N); fflush(stdout); + + PFFFT_FUNC(destroy_setup)(s); + PFFFT_FUNC(aligned_free)(ref); + PFFFT_FUNC(aligned_free)(in); + PFFFT_FUNC(aligned_free)(out); + PFFFT_FUNC(aligned_free)(tmp); + PFFFT_FUNC(aligned_free)(tmp2); + return 0; + +#else + return 2; +#endif /* HAVE_FFTPACK */ +} + +int pffft_validate(int cplx) { + static int Ntest[] = { 16, 32, 64, 96, 128, 160, 192, 256, 288, 384, 5*96, 512, 576, 5*128, 800, 864, 1024, 2048, 2592, 4000, 4096, 12000, 36864, 0}; + int k, r; + for (k = 0; Ntest[k]; ++k) { + int N = Ntest[k]; + if (N == 16 && !cplx) continue; + r = pffft_validate_N(N, cplx); + if (r) + return r; + } + return 0; +} + +int array_output_format = 1; + + +void print_table(const char *txt, FILE *tableFile) { + fprintf(stdout, "%s", txt); + if (tableFile && tableFile != stdout) + fprintf(tableFile, "%s", txt); +} + +void print_table_flops(float mflops, FILE *tableFile) { + fprintf(stdout, "|%11.0f ", mflops); + if (tableFile && tableFile != stdout) + fprintf(tableFile, "|%11.0f ", mflops); +} + +void print_table_fftsize(int N, FILE *tableFile) { + fprintf(stdout, "|%9d ", N); + if (tableFile && tableFile != stdout) + fprintf(tableFile, "|%9d ", N); +} + +double show_output(const char *name, int N, int cplx, float flops, float t0, float t1, int max_iter, FILE *tableFile) { + double T = (double)(t1-t0)/2/max_iter * 1e9; + float mflops = flops/1e6/(t1 - t0 + 1e-16); + if (array_output_format) { + if (flops != -1) + print_table_flops(mflops, tableFile); + else + print_table("| n/a ", tableFile); + } else { + if (flops != -1) { + printf("N=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", N, (cplx?"CPLX":"REAL"), name, mflops, (t1-t0)/2/max_iter * 1e9, max_iter); + } + } + fflush(stdout); + return T; +} + +double cal_benchmark(int N, int cplx) { + const int log2N = Log2(N); + int Nfloat = (cplx ? N*2 : N); + int Nbytes = Nfloat * sizeof(pffft_scalar); + pffft_scalar *X = PFFFT_FUNC(aligned_malloc)(Nbytes), *Y = PFFFT_FUNC(aligned_malloc)(Nbytes), *Z = PFFFT_FUNC(aligned_malloc)(Nbytes); + double t0, t1, tstop, T, nI; + int k, iter; + + assert( PFFFT_FUNC(is_power_of_two)(N) ); + for (k = 0; k < Nfloat; ++k) { + X[k] = sqrtf(k+1); + } + + /* PFFFT-U (unordered) benchmark */ + PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL); + assert(s); + iter = 0; + t0 = uclock_sec(); + tstop = t0 + 0.25; /* benchmark duration: 250 ms */ + do { + for ( k = 0; k < 512; ++k ) { + PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_FORWARD); + PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_BACKWARD); + ++iter; + } + t1 = uclock_sec(); + } while ( t1 < tstop ); + PFFFT_FUNC(destroy_setup)(s); + PFFFT_FUNC(aligned_free)(X); + PFFFT_FUNC(aligned_free)(Y); + PFFFT_FUNC(aligned_free)(Z); + + T = ( t1 - t0 ); /* duration per fft() */ + nI = ((double)iter) * ( log2N * N ); /* number of iterations "normalized" to O(N) = N*log2(N) */ + return (nI / T); /* normalized iterations per second */ +} + + + +void benchmark_ffts(int N, int cplx, int withFFTWfullMeas, double iterCal, double tmeas[NUM_TYPES][NUM_FFT_ALGOS], int haveAlgo[NUM_FFT_ALGOS], FILE *tableFile ) { + const int log2N = Log2(N); + int nextPow2N = PFFFT_FUNC(next_power_of_two)(N); + int log2NextN = Log2(nextPow2N); + int pffftPow2N = nextPow2N; + + int Nfloat = (cplx ? 
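
cal_benchmark() above returns iterations per second normalized by N*log2(N), so the main benchmark can size its iteration budget for any transform length from a single calibration run at N=512. A minimal sketch of that conversion, mirroring the numIter computation in benchmark_ffts() below:

#include <math.h>

static int iteration_budget(double target_sec, double iterCal, int N)
{
    /* iterCal is "normalized iterations per second", i.e. iterations * N*log2(N)
       per second, so the budget scales inversely with N*log2(N) */
    double log2N = log((double)N) / log(2.0);
    double numIter = target_sec * iterCal / (log2N * (double)N);
    return numIter < 1.0 ? 1 : (int)numIter;  /* at least one iteration */
}
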
MAX(nextPow2N, pffftPow2N)*2 : MAX(nextPow2N, pffftPow2N)); + int Nmax, k, iter; + int Nbytes = Nfloat * sizeof(pffft_scalar); + + pffft_scalar *X = PFFFT_FUNC(aligned_malloc)(Nbytes + sizeof(pffft_scalar)), *Y = PFFFT_FUNC(aligned_malloc)(Nbytes + 2*sizeof(pffft_scalar) ), *Z = PFFFT_FUNC(aligned_malloc)(Nbytes); + double te, t0, t1, tstop, flops, Tfastest; + + const double max_test_duration = 0.150; /* test duration 150 ms */ + double numIter = max_test_duration * iterCal / ( log2N * N ); /* number of iteration for max_test_duration */ + const int step_iter = MAX(1, ((int)(0.01 * numIter)) ); /* one hundredth */ + int max_iter = MAX(1, ((int)numIter) ); /* minimum 1 iteration */ + + const float checkVal = 12345.0F; + + /* printf("benchmark_ffts(N = %d, cplx = %d): Nfloat = %d, X_mem = 0x%p, X = %p\n", N, cplx, Nfloat, X_mem, X); */ + + memset( X, 0, Nfloat * sizeof(pffft_scalar) ); + if ( Nfloat < 32 ) { + for (k = 0; k < Nfloat; k += 4) + X[k] = sqrtf(k+1); + } else { + for (k = 0; k < Nfloat; k += (Nfloat/16) ) + X[k] = sqrtf(k+1); + } + + for ( k = 0; k < NUM_TYPES; ++k ) + { + for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter ) + tmeas[k][iter] = 0.0; + } + + + /* FFTPack benchmark */ + Nmax = (cplx ? N*2 : N); + X[Nmax] = checkVal; +#ifdef HAVE_FFTPACK + { + fftpack_real *wrk = malloc(2*Nbytes + 15*sizeof(pffft_scalar)); + te = uclock_sec(); + if (cplx) cffti(N, wrk); + else rffti(N, wrk); + t0 = uclock_sec(); + tstop = t0 + max_test_duration; + max_iter = 0; + do { + for ( k = 0; k < step_iter; ++k ) { + if (cplx) { + assert( X[Nmax] == checkVal ); + cfftf(N, X, wrk); + assert( X[Nmax] == checkVal ); + cfftb(N, X, wrk); + assert( X[Nmax] == checkVal ); + } else { + assert( X[Nmax] == checkVal ); + rfftf(N, X, wrk); + assert( X[Nmax] == checkVal ); + rfftb(N, X, wrk); + assert( X[Nmax] == checkVal ); + } + ++max_iter; + } + t1 = uclock_sec(); + } while ( t1 < tstop ); + + free(wrk); + + flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */ + tmeas[TYPE_ITER][ALGO_FFTPACK] = max_iter; + tmeas[TYPE_MFLOPS][ALGO_FFTPACK] = flops/1e6/(t1 - t0 + 1e-16); + tmeas[TYPE_DUR_TOT][ALGO_FFTPACK] = t1 - t0; + tmeas[TYPE_DUR_NS][ALGO_FFTPACK] = show_output("FFTPack", N, cplx, flops, t0, t1, max_iter, tableFile); + tmeas[TYPE_PREP][ALGO_FFTPACK] = (t0 - te) * 1e3; + haveAlgo[ALGO_FFTPACK] = 1; + } +#endif + +#if defined(HAVE_VECLIB) && defined(PFFFT_ENABLE_FLOAT) + Nmax = (cplx ? nextPow2N*2 : nextPow2N); + X[Nmax] = checkVal; + te = uclock_sec(); + if ( 1 || PFFFT_FUNC(is_power_of_two)(N) ) { + FFTSetup setup; + + setup = vDSP_create_fftsetup(log2NextN, FFT_RADIX2); + DSPSplitComplex zsamples; + zsamples.realp = &X[0]; + zsamples.imagp = &X[Nfloat/2]; + t0 = uclock_sec(); + tstop = t0 + max_test_duration; + max_iter = 0; + do { + for ( k = 0; k < step_iter; ++k ) { + if (cplx) { + assert( X[Nmax] == checkVal ); + vDSP_fft_zip(setup, &zsamples, 1, log2NextN, kFFTDirection_Forward); + assert( X[Nmax] == checkVal ); + vDSP_fft_zip(setup, &zsamples, 1, log2NextN, kFFTDirection_Inverse); + assert( X[Nmax] == checkVal ); + } else { + assert( X[Nmax] == checkVal ); + vDSP_fft_zrip(setup, &zsamples, 1, log2NextN, kFFTDirection_Forward); + assert( X[Nmax] == checkVal ); + vDSP_fft_zrip(setup, &zsamples, 1, log2NextN, kFFTDirection_Inverse); + assert( X[Nmax] == checkVal ); + } + ++max_iter; + } + t1 = uclock_sec(); + } while ( t1 < tstop ); + + vDSP_destroy_fftsetup(setup); + flops = (max_iter*2) * ((cplx ? 
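
The flops figures above follow the operation-count convention cited from fftw.org/speed/method.html: an FFT of size N is counted as 5*N*log2(N) floating point operations for complex data and 2.5*N*log2(N) for real data, regardless of what the implementation actually executes, with two transforms (forward plus backward) per measured iteration. As a standalone helper:

#include <math.h>

#ifndef M_LN2
  #define M_LN2 0.69314718055994530942
#endif

static double estimated_flops(int N, int cplx, int max_iter)
{
    double per_fft = (cplx ? 5.0 : 2.5) * (double)N * (log((double)N) / M_LN2);
    return (double)(max_iter * 2) * per_fft;   /* forward + backward per iteration */
}
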
5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */ + tmeas[TYPE_ITER][ALGO_VECLIB] = max_iter; + tmeas[TYPE_MFLOPS][ALGO_VECLIB] = flops/1e6/(t1 - t0 + 1e-16); + tmeas[TYPE_DUR_TOT][ALGO_VECLIB] = t1 - t0; + tmeas[TYPE_DUR_NS][ALGO_VECLIB] = show_output("vDSP", N, cplx, flops, t0, t1, max_iter, tableFile); + tmeas[TYPE_PREP][ALGO_VECLIB] = (t0 - te) * 1e3; + haveAlgo[ALGO_VECLIB] = 1; + } else { + show_output("vDSP", N, cplx, -1, -1, -1, -1, tableFile); + } +#endif + +#if defined(HAVE_FFTW) + Nmax = (cplx ? N*2 : N); + X[Nmax] = checkVal; + { + /* int flags = (N <= (256*1024) ? FFTW_MEASURE : FFTW_ESTIMATE); measure takes a lot of time on largest ffts */ + int flags = FFTW_ESTIMATE; + te = uclock_sec(); + + FFTW_PLAN planf, planb; + FFTW_COMPLEX *in = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N); + FFTW_COMPLEX *out = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N); + memset(in, 0, sizeof(FFTW_COMPLEX) * N); + if (cplx) { + planf = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_FORWARD, flags); + planb = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_BACKWARD, flags); + } else { + planf = FFTW_FUNC(plan_dft_r2c_1d)(N, (pffft_scalar*)in, out, flags); + planb = FFTW_FUNC(plan_dft_c2r_1d)(N, in, (pffft_scalar*)out, flags); + } + + t0 = uclock_sec(); + tstop = t0 + max_test_duration; + max_iter = 0; + do { + for ( k = 0; k < step_iter; ++k ) { + assert( X[Nmax] == checkVal ); + FFTW_FUNC(execute)(planf); + assert( X[Nmax] == checkVal ); + FFTW_FUNC(execute)(planb); + assert( X[Nmax] == checkVal ); + ++max_iter; + } + t1 = uclock_sec(); + } while ( t1 < tstop ); + + FFTW_FUNC(destroy_plan)(planf); + FFTW_FUNC(destroy_plan)(planb); + FFTW_FUNC(free)(in); FFTW_FUNC(free)(out); + + flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */ + tmeas[TYPE_ITER][ALGO_FFTW_ESTIM] = max_iter; + tmeas[TYPE_MFLOPS][ALGO_FFTW_ESTIM] = flops/1e6/(t1 - t0 + 1e-16); + tmeas[TYPE_DUR_TOT][ALGO_FFTW_ESTIM] = t1 - t0; + tmeas[TYPE_DUR_NS][ALGO_FFTW_ESTIM] = show_output((flags == FFTW_MEASURE ? algoName[ALGO_FFTW_AUTO] : algoName[ALGO_FFTW_ESTIM]), N, cplx, flops, t0, t1, max_iter, tableFile); + tmeas[TYPE_PREP][ALGO_FFTW_ESTIM] = (t0 - te) * 1e3; + haveAlgo[ALGO_FFTW_ESTIM] = 1; + } + Nmax = (cplx ? N*2 : N); + X[Nmax] = checkVal; + do { + /* int flags = (N <= (256*1024) ? FFTW_MEASURE : FFTW_ESTIMATE); measure takes a lot of time on largest ffts */ + /* int flags = FFTW_MEASURE; */ +#if ( defined(__arm__) || defined(__aarch64__) || defined(__arm64__) ) + int limitFFTsize = 31; /* takes over a second on Raspberry Pi 3 B+ -- and much much more on higher ffts sizes! */ +#else + int limitFFTsize = 2400; /* take over a second on i7 for fft size 2400 */ +#endif + int flags = (N < limitFFTsize ? FFTW_MEASURE : (withFFTWfullMeas ? FFTW_MEASURE : FFTW_ESTIMATE)); + + if (flags == FFTW_ESTIMATE) { + show_output((flags == FFTW_MEASURE ? 
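
Every timing loop above plants checkVal (12345.0F) one element past the region the library under test is allowed to touch and re-asserts it after every call, so an out-of-bounds write trips an assertion immediately instead of silently skewing the measurements. The pattern in isolation:

#include <assert.h>

static void canary_pattern(float *X, int Nmax)
{
    const float checkVal = 12345.0F;
    X[Nmax] = checkVal;           /* plant canary just past the live data */
    /* ... run the code under test on X[0 .. Nmax-1] ... */
    assert(X[Nmax] == checkVal);  /* fails if the code wrote past its buffer */
}
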
algoName[ALGO_FFTW_AUTO] : algoName[ALGO_FFTW_ESTIM]), N, cplx, -1, -1, -1, -1, tableFile); + /* copy values from estimation */ + tmeas[TYPE_ITER][ALGO_FFTW_AUTO] = tmeas[TYPE_ITER][ALGO_FFTW_ESTIM]; + tmeas[TYPE_DUR_TOT][ALGO_FFTW_AUTO] = tmeas[TYPE_DUR_TOT][ALGO_FFTW_ESTIM]; + tmeas[TYPE_DUR_NS][ALGO_FFTW_AUTO] = tmeas[TYPE_DUR_NS][ALGO_FFTW_ESTIM]; + tmeas[TYPE_PREP][ALGO_FFTW_AUTO] = tmeas[TYPE_PREP][ALGO_FFTW_ESTIM]; + } else { + te = uclock_sec(); + FFTW_PLAN planf, planb; + FFTW_COMPLEX *in = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N); + FFTW_COMPLEX *out = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N); + memset(in, 0, sizeof(FFTW_COMPLEX) * N); + if (cplx) { + planf = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_FORWARD, flags); + planb = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_BACKWARD, flags); + } else { + planf = FFTW_FUNC(plan_dft_r2c_1d)(N, (pffft_scalar*)in, out, flags); + planb = FFTW_FUNC(plan_dft_c2r_1d)(N, in, (pffft_scalar*)out, flags); + } + + t0 = uclock_sec(); + tstop = t0 + max_test_duration; + max_iter = 0; + do { + for ( k = 0; k < step_iter; ++k ) { + assert( X[Nmax] == checkVal ); + FFTW_FUNC(execute)(planf); + assert( X[Nmax] == checkVal ); + FFTW_FUNC(execute)(planb); + assert( X[Nmax] == checkVal ); + ++max_iter; + } + t1 = uclock_sec(); + } while ( t1 < tstop ); + + FFTW_FUNC(destroy_plan)(planf); + FFTW_FUNC(destroy_plan)(planb); + FFTW_FUNC(free)(in); FFTW_FUNC(free)(out); + + flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */ + tmeas[TYPE_ITER][ALGO_FFTW_AUTO] = max_iter; + tmeas[TYPE_MFLOPS][ALGO_FFTW_AUTO] = flops/1e6/(t1 - t0 + 1e-16); + tmeas[TYPE_DUR_TOT][ALGO_FFTW_AUTO] = t1 - t0; + tmeas[TYPE_DUR_NS][ALGO_FFTW_AUTO] = show_output((flags == FFTW_MEASURE ? algoName[ALGO_FFTW_AUTO] : algoName[ALGO_FFTW_ESTIM]), N, cplx, flops, t0, t1, max_iter, tableFile); + tmeas[TYPE_PREP][ALGO_FFTW_AUTO] = (t0 - te) * 1e3; + haveAlgo[ALGO_FFTW_AUTO] = 1; + } + } while (0); +#else + (void)withFFTWfullMeas; +#endif + +#if defined(HAVE_GREEN_FFTS) && defined(PFFFT_ENABLE_FLOAT) + Nmax = (cplx ? nextPow2N*2 : nextPow2N); + X[Nmax] = checkVal; + if ( 1 || PFFFT_FUNC(is_power_of_two)(N) ) + { + te = uclock_sec(); + fftInit(log2NextN); + + t0 = uclock_sec(); + tstop = t0 + max_test_duration; + max_iter = 0; + do { + for ( k = 0; k < step_iter; ++k ) { + if (cplx) { + assert( X[Nmax] == checkVal ); + ffts(X, log2NextN, 1); + assert( X[Nmax] == checkVal ); + iffts(X, log2NextN, 1); + assert( X[Nmax] == checkVal ); + } else { + rffts(X, log2NextN, 1); + riffts(X, log2NextN, 1); + } + + ++max_iter; + } + t1 = uclock_sec(); + } while ( t1 < tstop ); + + fftFree(); + + flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */ + tmeas[TYPE_ITER][ALGO_GREEN] = max_iter; + tmeas[TYPE_MFLOPS][ALGO_GREEN] = flops/1e6/(t1 - t0 + 1e-16); + tmeas[TYPE_DUR_TOT][ALGO_GREEN] = t1 - t0; + tmeas[TYPE_DUR_NS][ALGO_GREEN] = show_output("Green", N, cplx, flops, t0, t1, max_iter, tableFile); + tmeas[TYPE_PREP][ALGO_GREEN] = (t0 - te) * 1e3; + haveAlgo[ALGO_GREEN] = 1; + } else { + show_output("Green", N, cplx, -1, -1, -1, -1, tableFile); + } +#endif + +#if defined(HAVE_KISS_FFT) && defined(PFFFT_ENABLE_FLOAT) + Nmax = (cplx ? 
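
The two FFTW passes above differ only in the planner flag: FFTW_ESTIMATE plans almost instantly, while FFTW_MEASURE times candidate plans and can take seconds for large N, which is why it is gated behind --fftw-full-measure and a size limit. A minimal sketch of the planning pattern being timed (single precision, assumes fftw3f is available and linked with -lfftw3f):

#include <string.h>
#include <fftw3.h>

static void fftw_plan_and_run_once(int N)
{
    fftwf_complex *in  = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex) * N);
    fftwf_complex *out = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex) * N);
    memset(in, 0, sizeof(fftwf_complex) * N);
    /* FFTW_ESTIMATE: cheap heuristic plan; FFTW_MEASURE would benchmark
       candidate plans and can take seconds for large N */
    fftwf_plan p = fftwf_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
    fftwf_execute(p);
    fftwf_destroy_plan(p);
    fftwf_free(in);
    fftwf_free(out);
}
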
nextPow2N*2 : nextPow2N); + X[Nmax] = checkVal; + if ( 1 || PFFFT_FUNC(is_power_of_two)(N) ) + { + kiss_fft_cfg stf; + kiss_fft_cfg sti; + kiss_fftr_cfg stfr; + kiss_fftr_cfg stir; + + te = uclock_sec(); + if (cplx) { + stf = kiss_fft_alloc(nextPow2N, 0, 0, 0); + sti = kiss_fft_alloc(nextPow2N, 1, 0, 0); + } else { + stfr = kiss_fftr_alloc(nextPow2N, 0, 0, 0); + stir = kiss_fftr_alloc(nextPow2N, 1, 0, 0); + } + + t0 = uclock_sec(); + tstop = t0 + max_test_duration; + max_iter = 0; + do { + for ( k = 0; k < step_iter; ++k ) { + if (cplx) { + assert( X[Nmax] == checkVal ); + kiss_fft(stf, (const kiss_fft_cpx *)X, (kiss_fft_cpx *)Y); + assert( X[Nmax] == checkVal ); + kiss_fft(sti, (const kiss_fft_cpx *)Y, (kiss_fft_cpx *)X); + assert( X[Nmax] == checkVal ); + } else { + assert( X[Nmax] == checkVal ); + kiss_fftr(stfr, X, (kiss_fft_cpx *)Y); + assert( X[Nmax] == checkVal ); + kiss_fftri(stir, (const kiss_fft_cpx *)Y, X); + assert( X[Nmax] == checkVal ); + } + ++max_iter; + } + t1 = uclock_sec(); + } while ( t1 < tstop ); + + kiss_fft_cleanup(); + + flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */ + tmeas[TYPE_ITER][ALGO_KISS] = max_iter; + tmeas[TYPE_MFLOPS][ALGO_KISS] = flops/1e6/(t1 - t0 + 1e-16); + tmeas[TYPE_DUR_TOT][ALGO_KISS] = t1 - t0; + tmeas[TYPE_DUR_NS][ALGO_KISS] = show_output("Kiss", N, cplx, flops, t0, t1, max_iter, tableFile); + tmeas[TYPE_PREP][ALGO_KISS] = (t0 - te) * 1e3; + haveAlgo[ALGO_KISS] = 1; + } else { + show_output("Kiss", N, cplx, -1, -1, -1, -1, tableFile); + } +#endif + +#if defined(HAVE_POCKET_FFT) + + Nmax = (cplx ? nextPow2N*2 : nextPow2N); + X[Nmax] = checkVal; + if ( 1 || PFFFT_FUNC(is_power_of_two)(N) ) + { + POCKFFTR_PRE(_plan) planr; + POCKFFTC_PRE(_plan) planc; + + te = uclock_sec(); + if (cplx) { + planc = POCKFFTC_MID(make_,_plan)(nextPow2N); + } else { + planr = POCKFFTR_MID(make_,_plan)(nextPow2N); + } + + t0 = uclock_sec(); + tstop = t0 + max_test_duration; + max_iter = 0; + do { + for ( k = 0; k < step_iter; ++k ) { + if (cplx) { + assert( X[Nmax] == checkVal ); + memcpy(Y, X, 2*nextPow2N * sizeof(pffft_scalar) ); + POCKFFTC_PRE(_forward)(planc, Y, 1.); + assert( X[Nmax] == checkVal ); + memcpy(X, Y, 2*nextPow2N * sizeof(pffft_scalar) ); + POCKFFTC_PRE(_backward)(planc, X, 1./nextPow2N); + assert( X[Nmax] == checkVal ); + } else { + assert( X[Nmax] == checkVal ); + memcpy(Y, X, nextPow2N * sizeof(pffft_scalar) ); + POCKFFTR_PRE(_forward)(planr, Y, 1.); + assert( X[Nmax] == checkVal ); + memcpy(X, Y, nextPow2N * sizeof(pffft_scalar) ); + POCKFFTR_PRE(_backward)(planr, X, 1./nextPow2N); + assert( X[Nmax] == checkVal ); + } + ++max_iter; + } + t1 = uclock_sec(); + } while ( t1 < tstop ); + + if (cplx) { + POCKFFTC_MID(destroy_,_plan)(planc); + } else { + POCKFFTR_MID(destroy_,_plan)(planr); + } + + flops = (max_iter*2) * ((cplx ? 
5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */ + tmeas[TYPE_ITER][ALGO_POCKET] = max_iter; + tmeas[TYPE_MFLOPS][ALGO_POCKET] = flops/1e6/(t1 - t0 + 1e-16); + tmeas[TYPE_DUR_TOT][ALGO_POCKET] = t1 - t0; + tmeas[TYPE_DUR_NS][ALGO_POCKET] = show_output("Pocket", N, cplx, flops, t0, t1, max_iter, tableFile); + tmeas[TYPE_PREP][ALGO_POCKET] = (t0 - te) * 1e3; + haveAlgo[ALGO_POCKET] = 1; + } else { + show_output("Pocket", N, cplx, -1, -1, -1, -1, tableFile); + } +#endif + + +#if defined(HAVE_MKL) + { + DFTI_DESCRIPTOR_HANDLE fft_handle; + MKL_LONG mkl_status, mkl_ret; + te = uclock_sec(); + if (sizeof(float) == sizeof(pffft_scalar)) + mkl_status = DftiCreateDescriptor(&fft_handle, DFTI_SINGLE, (cplx ? DFTI_COMPLEX : DFTI_REAL), 1, N); + else if (sizeof(double) == sizeof(pffft_scalar)) + mkl_status = DftiCreateDescriptor(&fft_handle, DFTI_DOUBLE, (cplx ? DFTI_COMPLEX : DFTI_REAL), 1, N); + else + mkl_status = 1; + + while (mkl_status == 0) { + mkl_ret = DftiSetValue(fft_handle, DFTI_PLACEMENT, DFTI_NOT_INPLACE); + if (mkl_ret) { + DftiFreeDescriptor(&fft_handle); + mkl_status = 1; + break; + } + mkl_ret = DftiCommitDescriptor(fft_handle); + if (mkl_ret) { + DftiFreeDescriptor(&fft_handle); + mkl_status = 1; + break; + } + break; + } + + if (mkl_status == 0) { + t0 = uclock_sec(); + tstop = t0 + max_test_duration; + max_iter = 0; + + do { + for ( k = 0; k < step_iter; ++k ) { + assert( X[Nmax] == checkVal ); + DftiComputeForward(fft_handle, &X[0], &Y[0]); + assert( X[Nmax] == checkVal ); + DftiComputeBackward(fft_handle, &X[0], &Y[0]); + assert( X[Nmax] == checkVal ); + ++max_iter; + } + t1 = uclock_sec(); + } while ( t1 < tstop ); + + DftiFreeDescriptor(&fft_handle); + + flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */ + tmeas[TYPE_ITER][ALGO_MKL] = max_iter; + tmeas[TYPE_MFLOPS][ALGO_MKL] = flops/1e6/(t1 - t0 + 1e-16); + tmeas[TYPE_DUR_TOT][ALGO_MKL] = t1 - t0; + tmeas[TYPE_DUR_NS][ALGO_MKL] = show_output("MKL", N, cplx, flops, t0, t1, max_iter, tableFile); + tmeas[TYPE_PREP][ALGO_MKL] = (t0 - te) * 1e3; + haveAlgo[ALGO_MKL] = 1; + } else { + show_output("MKL", N, cplx, -1, -1, -1, -1, tableFile); + } + } +#endif + + /* PFFFT-U (unordered) benchmark */ + Nmax = (cplx ? pffftPow2N*2 : pffftPow2N); + X[Nmax] = checkVal; + if ( pffftPow2N >= PFFFT_FUNC(min_fft_size)(cplx ? PFFFT_COMPLEX : PFFFT_REAL) ) + { + te = uclock_sec(); + PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(pffftPow2N, cplx ? PFFFT_COMPLEX : PFFFT_REAL); + if (s) { + t0 = uclock_sec(); + tstop = t0 + max_test_duration; + max_iter = 0; + do { + for ( k = 0; k < step_iter; ++k ) { + assert( X[Nmax] == checkVal ); + PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_FORWARD); + assert( X[Nmax] == checkVal ); + PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_BACKWARD); + assert( X[Nmax] == checkVal ); + ++max_iter; + } + t1 = uclock_sec(); + } while ( t1 < tstop ); + + PFFFT_FUNC(destroy_setup)(s); + + flops = (max_iter*2) * ((cplx ? 
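
For reference, the pffft call sequence timed in the PFFFT-U and ordered PFFFT sections is sketched below in isolation (float API; the double API is identical with the pffftd_ prefix). The work buffer W has the same size as the data, and an unscaled backward transform returns the input multiplied by N, which is what the validation code above relies on.

#include <string.h>
#include "pffft.h"

static void pffft_complex_roundtrip(int N)
{
    if (N < pffft_min_fft_size(PFFFT_COMPLEX))
        return;
    PFFFT_Setup *s = pffft_new_setup(N, PFFFT_COMPLEX);
    if (!s)
        return;  /* N not decomposable into supported small prime factors */
    float *X = (float*)pffft_aligned_malloc(N * 2 * sizeof(float));  /* re/im interleaved */
    float *Y = (float*)pffft_aligned_malloc(N * 2 * sizeof(float));
    float *W = (float*)pffft_aligned_malloc(N * 2 * sizeof(float));  /* scratch */
    memset(X, 0, N * 2 * sizeof(float));
    pffft_transform_ordered(s, X, Y, W, PFFFT_FORWARD);   /* canonical-order spectrum */
    pffft_transform_ordered(s, Y, X, W, PFFFT_BACKWARD);  /* returns input scaled by N */
    pffft_aligned_free(W);
    pffft_aligned_free(Y);
    pffft_aligned_free(X);
    pffft_destroy_setup(s);
}
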
5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */ + tmeas[TYPE_ITER][ALGO_PFFFT_U] = max_iter; + tmeas[TYPE_MFLOPS][ALGO_PFFFT_U] = flops/1e6/(t1 - t0 + 1e-16); + tmeas[TYPE_DUR_TOT][ALGO_PFFFT_U] = t1 - t0; + tmeas[TYPE_DUR_NS][ALGO_PFFFT_U] = show_output("PFFFT-U", N, cplx, flops, t0, t1, max_iter, tableFile); + tmeas[TYPE_PREP][ALGO_PFFFT_U] = (t0 - te) * 1e3; + haveAlgo[ALGO_PFFFT_U] = 1; + } + } else { + show_output("PFFFT-U", N, cplx, -1, -1, -1, -1, tableFile); + } + + + if ( pffftPow2N >= PFFFT_FUNC(min_fft_size)(cplx ? PFFFT_COMPLEX : PFFFT_REAL) ) + { + te = uclock_sec(); + PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(pffftPow2N, cplx ? PFFFT_COMPLEX : PFFFT_REAL); + if (s) { + t0 = uclock_sec(); + tstop = t0 + max_test_duration; + max_iter = 0; + do { + for ( k = 0; k < step_iter; ++k ) { + assert( X[Nmax] == checkVal ); + PFFFT_FUNC(transform_ordered)(s, X, Z, Y, PFFFT_FORWARD); + assert( X[Nmax] == checkVal ); + PFFFT_FUNC(transform_ordered)(s, X, Z, Y, PFFFT_BACKWARD); + assert( X[Nmax] == checkVal ); + ++max_iter; + } + t1 = uclock_sec(); + } while ( t1 < tstop ); + + PFFFT_FUNC(destroy_setup)(s); + + flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */ + tmeas[TYPE_ITER][ALGO_PFFFT_O] = max_iter; + tmeas[TYPE_MFLOPS][ALGO_PFFFT_O] = flops/1e6/(t1 - t0 + 1e-16); + tmeas[TYPE_DUR_TOT][ALGO_PFFFT_O] = t1 - t0; + tmeas[TYPE_DUR_NS][ALGO_PFFFT_O] = show_output("PFFFT", N, cplx, flops, t0, t1, max_iter, tableFile); + tmeas[TYPE_PREP][ALGO_PFFFT_O] = (t0 - te) * 1e3; + haveAlgo[ALGO_PFFFT_O] = 1; + } + } else { + show_output("PFFFT", N, cplx, -1, -1, -1, -1, tableFile); + } + + if (!array_output_format) + { + printf("prepare/ms: "); + for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter ) + { + if ( haveAlgo[iter] && tmeas[TYPE_DUR_NS][iter] > 0.0 ) { + printf("%s %.3f ", algoName[iter], tmeas[TYPE_PREP][iter] ); + } + } + printf("\n"); + } + Tfastest = 0.0; + for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter ) + { + if ( Tfastest == 0.0 || ( tmeas[TYPE_DUR_NS][iter] != 0.0 && tmeas[TYPE_DUR_NS][iter] < Tfastest ) ) + Tfastest = tmeas[TYPE_DUR_NS][iter]; + } + if ( Tfastest > 0.0 ) + { + if (!array_output_format) + printf("relative fast: "); + for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter ) + { + if ( haveAlgo[iter] && tmeas[TYPE_DUR_NS][iter] > 0.0 ) { + tmeas[TYPE_DUR_FASTEST][iter] = tmeas[TYPE_DUR_NS][iter] / Tfastest; + if (!array_output_format) + printf("%s %.3f ", algoName[iter], tmeas[TYPE_DUR_FASTEST][iter] ); + } + } + if (!array_output_format) + printf("\n"); + } + + { + if (!array_output_format) + printf("relative pffft: "); + for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter ) + { + if ( haveAlgo[iter] && tmeas[TYPE_DUR_NS][iter] > 0.0 ) { + tmeas[TYPE_REL_PFFFT][iter] = tmeas[TYPE_DUR_NS][iter] / tmeas[TYPE_DUR_NS][ALGO_PFFFT_O]; + if (!array_output_format) + printf("%s %.3f ", algoName[iter], tmeas[TYPE_REL_PFFFT][iter] ); + } + } + if (!array_output_format) + printf("\n"); + } + + if (!array_output_format) { + printf("--\n"); + } + + PFFFT_FUNC(aligned_free)(X); + PFFFT_FUNC(aligned_free)(Y); + PFFFT_FUNC(aligned_free)(Z); +} + + +/* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions */ +void validate_pffft_simd(); +int validate_pffft_simd_ex(FILE * DbgOut); +void validate_pffftd_simd(); +int validate_pffftd_simd_ex(FILE * DbgOut); + + + +int main(int argc, char **argv) { + /* unfortunately, the fft size must be a multiple of 16 for complex FFTs + and 
32 for real FFTs -- a lot of stuff would need to be rewritten to + handle other cases (or maybe just switch to a scalar fft, I don't know..) */ + +#if 0 /* include powers of 2 ? */ +#define NUMNONPOW2LENS 23 + int NnonPow2[NUMNONPOW2LENS] = { + 64, 96, 128, 160, 192, 256, 384, 5*96, 512, 5*128, + 3*256, 800, 1024, 2048, 2400, 4096, 8192, 9*1024, 16384, 32768, + 256*1024, 1024*1024, -1 }; +#else +#define NUMNONPOW2LENS 11 + int NnonPow2[NUMNONPOW2LENS] = { + 96, 160, 192, 384, 5*96, 5*128,3*256, 800, 2400, 9*1024, + -1 }; +#endif + +#define NUMPOW2FFTLENS 22 +#define MAXNUMFFTLENS MAX( NUMPOW2FFTLENS, NUMNONPOW2LENS ) + int Npow2[NUMPOW2FFTLENS]; /* exp = 1 .. 21, -1 */ + const int *Nvalues = NULL; + double tmeas[2][MAXNUMFFTLENS][NUM_TYPES][NUM_FFT_ALGOS]; + double iterCalReal = 0.0, iterCalCplx = 0.0; + + int benchReal=1, benchCplx=1, withFFTWfullMeas=0, outputTable2File=1, usePow2=1; + int max_N = 1024 * 1024 * 2; + int quicktest = 0; + int realCplxIdx, typeIdx; + int i, k; + FILE *tableFile = NULL; + + int haveAlgo[NUM_FFT_ALGOS]; + char acCsvFilename[64]; + + for ( k = 1; k <= NUMPOW2FFTLENS; ++k ) + Npow2[k-1] = (k == NUMPOW2FFTLENS) ? -1 : (1 << k); + Nvalues = Npow2; /* set default .. for comparisons .. */ + + for ( i = 0; i < NUM_FFT_ALGOS; ++i ) + haveAlgo[i] = 0; + + printf("pffft architecture: '%s'\n", PFFFT_FUNC(simd_arch)()); + printf("pffft SIMD size: %d\n", PFFFT_FUNC(simd_size)()); + printf("pffft min real fft: %d\n", PFFFT_FUNC(min_fft_size)(PFFFT_REAL)); + printf("pffft min complex fft: %d\n", PFFFT_FUNC(min_fft_size)(PFFFT_COMPLEX)); + printf("\n"); + + for ( i = 1; i < argc; ++i ) { + if (!strcmp(argv[i], "--array-format") || !strcmp(argv[i], "--table")) { + array_output_format = 1; + } + else if (!strcmp(argv[i], "--no-tab")) { + array_output_format = 0; + } + else if (!strcmp(argv[i], "--real")) { + benchCplx = 0; + } + else if (!strcmp(argv[i], "--cplx")) { + benchReal = 0; + } + else if (!strcmp(argv[i], "--fftw-full-measure")) { + withFFTWfullMeas = 1; + } + else if (!strcmp(argv[i], "--non-pow2")) { + Nvalues = NnonPow2; + usePow2 = 0; + } + else if (!strcmp(argv[i], "--max-len") && i+1 < argc) { + max_N = atoi(argv[i+1]); + ++i; + } + else if (!strcmp(argv[i], "--quick")) { + fprintf(stdout, "actived quicktest mode\n"); + quicktest = 1; + } + else if (!strcmp(argv[i], "--validate")) { +#ifdef HAVE_FFTPACK + int r; + fprintf(stdout, "validating PFFFT against %s FFTPACK ..\n", (benchCplx ? "complex" : "real")); + r = pffft_validate(benchCplx); + fprintf((r ? stderr : stderr), "pffft %s\n", (r ? "validation failed!" 
: "successful")); + return r; +#else + fprintf(stderr, "validation not available without FFTPACK!\n"); +#endif + return 0; + } + else /* if (!strcmp(argv[i], "--help")) */ { + printf("usage: %s [--array-format|--table] [--no-tab] [--real|--cplx] [--validate] [--fftw-full-measure] [--non-pow2] [--max-len ] [--quick]\n", argv[0]); + exit(0); + } + } + +#ifdef HAVE_FFTW +#ifdef PFFFT_ENABLE_DOUBLE + algoName[ALGO_FFTW_ESTIM] = "FFTW D(estim)"; + algoName[ALGO_FFTW_AUTO] = "FFTW D(auto) "; +#endif + + if (withFFTWfullMeas) + { +#ifdef PFFFT_ENABLE_FLOAT + algoName[ALGO_FFTW_AUTO] = "FFTWF(meas)"; /* "FFTW (auto)" */ +#else + algoName[ALGO_FFTW_AUTO] = "FFTWD(meas)"; /* "FFTW (auto)" */ +#endif + algoTableHeader[ALGO_FFTW_AUTO][0] = "|real FFTWmeas "; /* "|real FFTWauto " */ + algoTableHeader[ALGO_FFTW_AUTO][1] = "|cplx FFTWmeas "; /* "|cplx FFTWauto " */ + } +#endif + + if ( PFFFT_FUNC(simd_size)() == 1 ) + { + algoName[ALGO_PFFFT_U] = "PFFFTU scal-1"; + algoName[ALGO_PFFFT_O] = "PFFFT scal-1 "; + } + else if ( !strcmp(PFFFT_FUNC(simd_arch)(), "4xScalar") ) + { + algoName[ALGO_PFFFT_U] = "PFFFT-U scal4"; + algoName[ALGO_PFFFT_O] = "PFFFT scal-4 "; + } + + + clock(); + /* double TClockDur = 1.0 / CLOCKS_PER_SEC; + printf("clock() duration for CLOCKS_PER_SEC = %f sec = %f ms\n", TClockDur, 1000.0 * TClockDur ); + */ + + /* calibrate test duration */ + if (!quicktest) + { + double t0, t1, dur; + printf("calibrating fft benchmark duration at size N = 512 ..\n"); + t0 = uclock_sec(); + if (benchReal) { + iterCalReal = cal_benchmark(512, 0 /* real fft */); + printf("real fft iterCal = %f\n", iterCalReal); + } + if (benchCplx) { + iterCalCplx = cal_benchmark(512, 1 /* cplx fft */); + printf("cplx fft iterCal = %f\n", iterCalCplx); + } + t1 = uclock_sec(); + dur = t1 - t0; + printf("calibration done in %f sec.\n\n", dur); + } + + if (!array_output_format) { + if (benchReal) { + for (i=0; Nvalues[i] > 0 && Nvalues[i] <= max_N; ++i) + benchmark_ffts(Nvalues[i], 0 /* real fft */, withFFTWfullMeas, iterCalReal, tmeas[0][i], haveAlgo, NULL); + } + if (benchCplx) { + for (i=0; Nvalues[i] > 0 && Nvalues[i] <= max_N; ++i) + benchmark_ffts(Nvalues[i], 1 /* cplx fft */, withFFTWfullMeas, iterCalCplx, tmeas[1][i], haveAlgo, NULL); + } + + } else { + + if (outputTable2File) { + tableFile = fopen( usePow2 ? 
"bench-fft-table-pow2.txt" : "bench-fft-table-non2.txt", "w"); + } + /* print table headers */ + printf("table shows MFlops; higher values indicate faster computation\n\n"); + + { + print_table("| input len ", tableFile); + for (realCplxIdx = 0; realCplxIdx < 2; ++realCplxIdx) + { + if ( (realCplxIdx == 0 && !benchReal) || (realCplxIdx == 1 && !benchCplx) ) + continue; + for (k=0; k < NUM_FFT_ALGOS; ++k) + { + if ( compiledInAlgo[k] ) + print_table(algoTableHeader[k][realCplxIdx], tableFile); + } + } + print_table("|\n", tableFile); + } + /* print table value seperators */ + { + print_table("|----------", tableFile); + for (realCplxIdx = 0; realCplxIdx < 2; ++realCplxIdx) + { + if ( (realCplxIdx == 0 && !benchReal) || (realCplxIdx == 1 && !benchCplx) ) + continue; + for (k=0; k < NUM_FFT_ALGOS; ++k) + { + if ( compiledInAlgo[k] ) + print_table(":|-------------", tableFile); + } + } + print_table(":|\n", tableFile); + } + + for (i=0; Nvalues[i] > 0 && Nvalues[i] <= max_N; ++i) { + { + double t0, t1; + print_table_fftsize(Nvalues[i], tableFile); + t0 = uclock_sec(); + if (benchReal) + benchmark_ffts(Nvalues[i], 0, withFFTWfullMeas, iterCalReal, tmeas[0][i], haveAlgo, tableFile); + if (benchCplx) + benchmark_ffts(Nvalues[i], 1, withFFTWfullMeas, iterCalCplx, tmeas[1][i], haveAlgo, tableFile); + t1 = uclock_sec(); + print_table("|\n", tableFile); + /* printf("all ffts for size %d took %f sec\n", Nvalues[i], t1-t0); */ + (void)t0; + (void)t1; + } + } + fprintf(stdout, " (numbers are given in MFlops)\n"); + if (outputTable2File) { + fclose(tableFile); + } + } + + printf("\n"); + printf("now writing .csv files ..\n"); + + for (realCplxIdx = 0; realCplxIdx < 2; ++realCplxIdx) + { + if ( (benchReal && realCplxIdx == 0) || (benchCplx && realCplxIdx == 1) ) + { + for (typeIdx = 0; typeIdx < NUM_TYPES; ++typeIdx) + { + FILE *f = NULL; + if ( !(SAVE_ALL_TYPES || saveType[typeIdx]) ) + continue; + acCsvFilename[0] = 0; +#ifdef PFFFT_SIMD_DISABLE + strcat(acCsvFilename, "scal-"); +#else + strcat(acCsvFilename, "simd-"); +#endif + strcat(acCsvFilename, (realCplxIdx == 0 ? "real-" : "cplx-")); + strcat(acCsvFilename, ( usePow2 ? 
"pow2-" : "non2-")); + assert( strlen(acCsvFilename) + strlen(typeFilenamePart[typeIdx]) + 5 < (sizeof(acCsvFilename) / sizeof(acCsvFilename[0])) ); + strcat(acCsvFilename, typeFilenamePart[typeIdx]); + strcat(acCsvFilename, ".csv"); + f = fopen(acCsvFilename, "w"); + if (!f) + continue; + { + fprintf(f, "size, log2, "); + for (k=0; k < NUM_FFT_ALGOS; ++k) + if ( haveAlgo[k] ) + fprintf(f, "%s, ", algoName[k]); + fprintf(f, "\n"); + } + for (i=0; Nvalues[i] > 0 && Nvalues[i] <= max_N; ++i) + { + { + fprintf(f, "%d, %.3f, ", Nvalues[i], log10((double)Nvalues[i])/log10(2.0) ); + for (k=0; k < NUM_FFT_ALGOS; ++k) + if ( haveAlgo[k] ) + fprintf(f, "%f, ", tmeas[realCplxIdx][i][typeIdx][k]); + fprintf(f, "\n"); + } + } + fclose(f); + } + } + } + + return 0; +} + diff --git a/pffft/cmake/FindMIPP.cmake b/pffft/cmake/FindMIPP.cmake new file mode 100644 index 0000000..afd840d --- /dev/null +++ b/pffft/cmake/FindMIPP.cmake @@ -0,0 +1,26 @@ + +if(MIPP_INCLUDE_DIRS) + set(MIPP_FIND_QUIETLY TRUE) +endif() + +find_path(MIPP_INCLUDE_DIRS NAMES mipp.h + HINTS + ${MIPP_ROOT} + $ENV{HOME}/.local + PATH_SUFFIXES include/mipp +) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(MIPP DEFAULT_MSG MIPP_INCLUDE_DIRS) + +if(MIPP_FOUND AND NOT TARGET MIPP) + message(STATUS "MIPP_FOUND -> creating interface library MIPP at ${MIPP_INCLUDE_DIRS}") + add_library(MIPP INTERFACE) + target_compile_definitions(MIPP INTERFACE HAVE_MIPP=1) + target_include_directories(MIPP INTERFACE ${MIPP_INCLUDE_DIRS}) + target_compile_features(MIPP INTERFACE cxx_std_11) +else() + message(WARNING "MIPP not found.") +endif() + +mark_as_advanced(MIPP_INCLUDE_DIRS) diff --git a/pffft/cmake/FindPAPI.cmake b/pffft/cmake/FindPAPI.cmake new file mode 100644 index 0000000..81e7a6a --- /dev/null +++ b/pffft/cmake/FindPAPI.cmake @@ -0,0 +1,25 @@ +# Find PAPI libraries +# Once done this will define +# PAPI_FOUND - System has PAPI +# PAPI_INCLUDE_DIRS - The PAPI include directories +# PAPI_LIBRARIES - The libraries needed to use PAPI + +if(PAPI_INCLUDE_DIRS AND PAPI_LIBRARIES) + set(PAPI_FIND_QUIETLY TRUE) +endif() + +find_path(PAPI_INCLUDE_DIRS NAMES papi.h HINTS ${PAPI_ROOT} PATH_SUFFIXES include) +find_library(PAPI_LIBRARIES NAMES papi HINTS ${PAPI_ROOT} PATH_SUFFIXES lib lib64) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(PAPI DEFAULT_MSG PAPI_LIBRARIES PAPI_INCLUDE_DIRS) +if(PAPI_FOUND AND NOT TARGET PAPI::PAPI) + set(PAPI_LIBRARIES ${PAPI_LIBRARIES} rt) + + add_library(PAPI::PAPI SHARED IMPORTED) + set_target_properties(PAPI::PAPI PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${PAPI_INCLUDE_DIRS}" + IMPORTED_LOCATION "${PAPI_LIBRARIES}") +endif() + +mark_as_advanced(PAPI_INCLUDE_DIRS PAPI_LIBRARIES) diff --git a/pffft/cmake/compiler_warnings.cmake b/pffft/cmake/compiler_warnings.cmake new file mode 100644 index 0000000..32c1782 --- /dev/null +++ b/pffft/cmake/compiler_warnings.cmake @@ -0,0 +1,11 @@ + +function(target_activate_cxx_compiler_warnings target) + target_compile_options(${target} PRIVATE $<$:-Wall -Wextra -pedantic>) + target_compile_options(${target} PRIVATE $<$:-Wall -Wextra -pedantic>) +endfunction() + +function(target_activate_c_compiler_warnings target) + target_compile_options(${target} PRIVATE $<$:-Wall -Wextra -pedantic>) + target_compile_options(${target} PRIVATE $<$:-Wall -Wextra -pedantic>) +endfunction() + diff --git a/pffft/cmake/target_optimizations.cmake b/pffft/cmake/target_optimizations.cmake new file mode 100644 index 0000000..6d19fdb --- /dev/null +++ 
b/pffft/cmake/target_optimizations.cmake @@ -0,0 +1,197 @@ + +# cmake options: TARGET_C_ARCH / TARGET_CPP_ARCH: +# and optionally: TARGET_C_EXTRA TARGET_CXX_EXTRA +# +# provided: +# - function: target_set_c_arch_flags() # uses options TARGET_C_ARCH and TARGET_C_EXTRA +# - function: target_set_cxx_arch_flags() # uses options TARGET_CXX_ARCH and TARGET_CXX_EXTRA +# - macro: target_set_cxx_arch_option( ) +# +# see https://en.wikichip.org/wiki/x86/extensions +# and https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html +# for gcc specific architecture options +# and https://docs.microsoft.com/en-us/cpp/build/reference/arch-x64 +# or https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86 +# for msvc specific architecture options + +# https://en.wikichip.org/wiki/arm/versions +# https://en.wikipedia.org/wiki/Raspberry_Pi +# https://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html#ARM-Options +# https://en.wikipedia.org/wiki/Comparison_of_ARMv7-A_cores +# https://en.wikipedia.org/wiki/Comparison_of_ARMv8-A_cores + +# arm32_rpi1 untested +# -mcpu=arm1176jzf-s -mfloat-abi=hard -mfpu=vfp -mtune=arm1176jzf-s +# arm32_rpi2 untested +# "-march=armv7-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4" +# "-march=armv8-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4" +# arm32_rpi3 with "armv7-a" tested on Raspbian GNU/Linux 10 (buster), 32-bit => MIPP test reports: NEONv1, 128 bits +# "-march=armv7-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4" +# arm32_rpi3 with "armv8-a" tested on Raspbian GNU/Linux 10 (buster), 32-bit => MIPP test reports: NEONv1, 128 bits +# "-march=armv8-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4" +# arm32_rpi3 with "armv8-a" tested on Raspbian GNU/Linux 10 (buster), 32-bit => MIPP test reports: NEONv1, 128 bits +# "-march=armv8-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4" "-mtune=cortex-a53" +# arm32_rpi4 untested +# RPi 4 Model B: Cortex-A72 => "-mtune=cortex-a72" ? +# "-mcpu=cortex-a72 -mfloat-abi=hard -mfpu=neon-fp-armv8 -mneon-for-64bits -mtune=cortex-a72" + +set(MSVC_EXTRA_OPT_none "") +set(GCC_EXTRA_OPT_none "") +set(GCC_EXTRA_OPT_neon_vfpv4 "-mfloat-abi=hard" "-mfpu=neon-vfpv4") +set(GCC_EXTRA_OPT_neon_rpi3_a53 "-mfloat-abi=hard" "-mfpu=neon-vfpv4" "-mtune=cortex-a53") +set(GCC_EXTRA_OPT_neon_rpi4_a72 "-mfloat-abi=hard" "-mfpu=neon-fp-armv8" "-mtune=cortex-a72") + +if ( (CMAKE_SYSTEM_PROCESSOR STREQUAL "i686") OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") ) + set(GCC_MARCH_DESC "native/SSE2:pentium4/SSE3:core2/SSE4:nehalem/AVX:sandybridge/AVX2:haswell") + set(GCC_MARCH_VALUES "none;native;pentium4;core2;nehalem;sandybridge;haswell" CACHE INTERNAL "List of possible architectures") + set(GCC_EXTRA_VALUES "" CACHE INTERNAL "List of possible EXTRA options") +elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + set(GCC_MARCH_DESC "native/ARMwNEON:armv8-a") + set(GCC_MARCH_VALUES "none;native;armv8-a" CACHE INTERNAL "List of possible architectures") + set(GCC_EXTRA_VALUES "" CACHE INTERNAL "List of possible additional options") +elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "armv7l") + set(GCC_MARCH_DESC "native/ARMwNEON:armv7-a") + set(GCC_MARCH_VALUES "none;native;armv7-a" CACHE INTERNAL "List of possible architectures") + set(GCC_EXTRA_VALUES "none;neon_vfpv4;neon_rpi3_a53;neon_rpi4_a72" CACHE INTERNAL "List of possible additional options") +else() + message(WARNING "unsupported CMAKE_SYSTEM_PROCESSOR '${CMAKE_SYSTEM_PROCESSOR}'") + # other PROCESSORs could be "ppc", "ppc64", "arm" - or something else?! 
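
The -march / -mcpu / -mfpu combinations collected above are what make GCC and Clang predefine the SIMD feature macros that SIMD-enabled C sources typically key on. An illustrative compile-time check using standard compiler-predefined macros (general pattern, not pffft code):

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
  /* NEON available: -mfpu=neon-vfpv4 / -mfpu=neon-fp-armv8, or any aarch64 target */
#elif defined(__AVX__)
  /* AVX available: e.g. -march=sandybridge or later */
#elif defined(__SSE2__)
  /* SSE2 available: the x86-64 baseline, or -march=pentium4 and later on i686 */
#endif
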
+ set(GCC_MARCH_DESC "native") + set(GCC_MARCH_VALUES "none;native" CACHE INTERNAL "List of possible architectures") + set(GCC_EXTRA_VALUES "" CACHE INTERNAL "List of possible additional options") +endif() + +# cmake options - depending on C/C++ compiler +# how are chances, that C and C++ compilers are from different vendors? +if (CMAKE_C_COMPILER_ID STREQUAL "GNU") + set(TARGET_C_ARCH "none" CACHE STRING "gcc target C architecture (-march): ${GCC_MARCH_DESC}") + set_property(CACHE TARGET_C_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES}) + if ( NOT (GCC_EXTRA_VALUES STREQUAL "") ) + set(TARGET_C_EXTRA "none" CACHE STRING "gcc additional options for C") + set_property(CACHE TARGET_C_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES}) + endif() +elseif (CMAKE_C_COMPILER_ID STREQUAL "Clang") + set(TARGET_C_ARCH "none" CACHE STRING "clang target C architecture (-march): ${GCC_MARCH_DESC}") + set_property(CACHE TARGET_C_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES}) + if ( NOT (GCC_EXTRA_VALUES STREQUAL "") ) + set(TARGET_C_EXTRA "none" CACHE STRING "gcc additional options for C") + set_property(CACHE TARGET_C_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES}) + endif() +elseif (CMAKE_C_COMPILER_ID MATCHES "MSVC") + set(TARGET_C_ARCH "none" CACHE STRING "msvc target C architecture (/arch): SSE2/AVX/AVX2/AVX512") + set(TARGET_C_EXTRA "none" CACHE STRING "msvc additional options") +else() + message(WARNING "unsupported C compiler '${CMAKE_C_COMPILER_ID}', see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html") +endif() + +if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set(TARGET_CXX_ARCH "none" CACHE STRING "gcc target C++ architecture (-march): ${GCC_MARCH_DESC}") + set_property(CACHE TARGET_CXX_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES}) + if ( NOT (GCC_EXTRA_VALUES STREQUAL "") ) + set(TARGET_CXX_EXTRA "none" CACHE STRING "gcc additional options for C++") + set_property(CACHE TARGET_CXX_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES}) + endif() +elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set(TARGET_CXX_ARCH "none" CACHE STRING "clang target C++ architecture (-march): ${GCC_MARCH_DESC}") + set_property(CACHE TARGET_CXX_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES}) + if ( NOT (GCC_EXTRA_VALUES STREQUAL "") ) + set(TARGET_CXX_EXTRA "none" CACHE STRING "clang additional options for C++") + set_property(CACHE TARGET_CXX_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES}) + endif() +elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + set(TARGET_CXX_ARCH "none" CACHE STRING "msvc target C++ architecture (/arch): SSE2/AVX/AVX2/AVX512") + set(TARGET_CXX_EXTRA "none" CACHE STRING "msvc additional options") +else() + message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}', see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html") +endif() + +###################################################### + +function(target_set_c_arch_flags target) + if ( ("${TARGET_C_ARCH}" STREQUAL "") OR ("${TARGET_C_ARCH}" STREQUAL "none") ) + message(STATUS "C ARCH for target ${target} is not set!") + else() + if ( (CMAKE_C_COMPILER_ID STREQUAL "GNU") OR (CMAKE_C_COMPILER_ID STREQUAL "Clang") ) + target_compile_options(${target} PRIVATE "-march=${TARGET_C_ARCH}") + message(STATUS "C ARCH for target ${target} set: ${TARGET_C_ARCH}") + elseif (CMAKE_C_COMPILER_ID MATCHES "MSVC") + target_compile_options(${target} PRIVATE "/arch:${TARGET_C_ARCH}") + message(STATUS "C ARCH for target ${target} set: ${TARGET_C_ARCH}") + else() + message(WARNING "unsupported C compiler '${CMAKE_C_COMPILER_ID}' for 
target_set_c_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html") + endif() + endif() + if ( ("${TARGET_C_EXTRA}" STREQUAL "") OR ("${TARGET_C_EXTRA}" STREQUAL "none") ) + message(STATUS "C additional options for target ${target} is not set!") + else() + if ( (CMAKE_C_COMPILER_ID STREQUAL "GNU") OR (CMAKE_C_COMPILER_ID STREQUAL "Clang") ) + target_compile_options(${target} PRIVATE "${GCC_EXTRA_OPT_${TARGET_C_EXTRA}}") + message(STATUS "C additional options for target ${target} set: ${GCC_EXTRA_OPT_${TARGET_C_EXTRA}}") + elseif (CMAKE_C_COMPILER_ID MATCHES "MSVC") + # target_compile_options(${target} PRIVATE "${MSVC_EXTRA_OPT_${TARGET_C_EXTRA}}") + message(STATUS "C additional options for target ${target} not usable with MSVC") + else() + message(WARNING "unsupported C compiler '${CMAKE_C_COMPILER_ID}' for target_set_c_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html") + endif() + if ( ("${TARGET_C_EXTRA}" MATCHES "^neon_.*") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") ) + message(STATUS "additional option contains neon: setting PFFFT_ENABLE_NEON for C target ${target}") + target_compile_definitions(${target} PRIVATE PFFFT_ENABLE_NEON=1) + endif() + endif() +endfunction() + +function(target_set_cxx_arch_flags target) + if ( ("${TARGET_CXX_ARCH}" STREQUAL "") OR ("${TARGET_CXX_ARCH}" STREQUAL "none") ) + message(STATUS "C++ ARCH for target ${target} is not set!") + else() + if ( (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") ) + target_compile_options(${target} PRIVATE "-march=${TARGET_CXX_ARCH}") + message(STATUS "C++ ARCH for target ${target} set: ${TARGET_CXX_ARCH}") + elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + target_compile_options(${target} PRIVATE "/arch:${TARGET_CXX_ARCH}") + message(STATUS "C++ ARCH for target ${target} set: ${TARGET_CXX_ARCH}") + else() + message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}' for target_set_cxx_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html") + endif() + endif() + if ( ("${TARGET_CXX_EXTRA}" STREQUAL "") OR ("${TARGET_CXX_EXTRA}" STREQUAL "none") ) + message(STATUS "C++ additional options for target ${target} is not set!") + else() + if ( (CMAKE_C_COMPILER_ID STREQUAL "GNU") OR (CMAKE_C_COMPILER_ID STREQUAL "Clang") ) + target_compile_options(${target} PRIVATE "${GCC_EXTRA_OPT_${TARGET_CXX_EXTRA}}") + message(STATUS "C++ additional options for target ${target} set: ${GCC_EXTRA_OPT_${TARGET_CXX_EXTRA}}") + elseif (CMAKE_C_COMPILER_ID MATCHES "MSVC") + # target_compile_options(${target} PRIVATE "${MSVC_EXTRA_OPT_${TARGET_CXX_EXTRA}}") + message(STATUS "C++ additional options for target ${target} not usable with MSVC") + else() + message(WARNING "unsupported C compiler '${CMAKE_C_COMPILER_ID}' for target_set_c_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html") + endif() + if ( ("${TARGET_CXX_EXTRA}" MATCHES "^neon_.*") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") ) + message(STATUS "additional option contains 'neon': setting PFFFT_ENABLE_NEON for C++ target ${target}") + target_compile_definitions(${target} PRIVATE PFFFT_ENABLE_NEON=1) + endif() + endif() +endfunction() + + +macro(target_set_cxx_arch_option target gcc_clang_arch gcc_clang_extra msvc_arch ) + if ( (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") ) + + if ( NOT (("${gcc_clang_arch}" STREQUAL "") OR ("${gcc_clang_arch}" STREQUAL 
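
target_set_c_arch_flags() and target_set_cxx_arch_flags() above inject PFFFT_ENABLE_NEON=1 as a compile definition whenever a neon_* extra option or an aarch64 processor is selected. On the C side such a definition is typically consumed as in the sketch below; this is an illustration of the general pattern, not pffft's actual source.

#if defined(PFFFT_ENABLE_NEON)
  /* build was configured with a neon_* extra option or for aarch64:
     a NEON code path may be selected here */
#else
  /* scalar or x86 SIMD build */
#endif
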
"none") ) ) + target_compile_options(${target} PRIVATE "-march=${gcc_clang_arch}") + message(STATUS "C++ ARCH for target ${target}: ${gcc_clang_arch}") + endif() + if (NOT ( ("${gcc_clang_extra}" STREQUAL "") OR ("${gcc_clang_extra}" STREQUAL "none") ) ) + target_compile_options(${target} PRIVATE "${GCC_EXTRA_OPT_${gcc_clang_extra}}") + message(STATUS "C++ additional options for target ${target}: ${GCC_EXTRA_OPT_${gcc_clang_extra}}") + endif() + elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + if ( NOT (("${msvc_arch}" STREQUAL "") OR ("${msvc_arch}" STREQUAL "none") ) ) + target_compile_options(${target} PRIVATE "/arch:${msvc_arch}") + message(STATUS "C++ ARCH for target ${target} set: ${msvc_arch}") + endif() + else() + message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}' for target_set_cxx_arch_option(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html") + endif() +endmacro() + diff --git a/pffft/cross_build_mingw32.sh b/pffft/cross_build_mingw32.sh new file mode 100755 index 0000000..94f05f9 --- /dev/null +++ b/pffft/cross_build_mingw32.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# requires debian/ubuntu packages: zip gcc-mingw-w64 + +if [ -z "$1" ]; then + echo "usage: $0 " + exit 1 +fi + +ZIP_POST="$1" +shift + +CROSS="i686-w64-mingw32" +WN="w32" +TOOLCHAIN="mingw-w32-i686.cmake" + +rm -rf build_${WN}_${ZIP_POST} +echo -e "\n\n********************************************************" +echo "start build of pffft_${WN}_${ZIP_POST}" +mkdir build_${WN}_${ZIP_POST} && \ +cmake -S . -B build_${WN}_${ZIP_POST} \ + -DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN} \ + -DCMAKE_INSTALL_PREFIX=pffft_bin-${WN}_${ZIP_POST} \ + "$@" && \ +cmake --build build_${WN}_${ZIP_POST} diff --git a/pffft/cross_build_mingw64.sh b/pffft/cross_build_mingw64.sh new file mode 100755 index 0000000..23c251f --- /dev/null +++ b/pffft/cross_build_mingw64.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# requires debian/ubuntu packages: zip gcc-mingw-w64 + +if [ -z "$1" ]; then + echo "usage: $0 " + exit 1 +fi + +ZIP_POST="$1" +shift + +# CROSS="x86_64-w64-mingw32" +WN="w64" +TOOLCHAIN="mingw-w64-x64_64.cmake" + +rm -rf build_${WN}_${ZIP_POST} +echo -e "\n\n********************************************************" +echo "start build of pffft_${WN}_${ZIP_POST}" +mkdir build_${WN}_${ZIP_POST} && \ +cmake -S . 
-B build_${WN}_${ZIP_POST} \ + -DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN} \ + -DCMAKE_INSTALL_PREFIX=pffft_bin-${WN}_${ZIP_POST} \ + "$@" && \ +cmake --build build_${WN}_${ZIP_POST} diff --git a/pffft/examples/CMakeLists.txt b/pffft/examples/CMakeLists.txt new file mode 100644 index 0000000..0fe733b --- /dev/null +++ b/pffft/examples/CMakeLists.txt @@ -0,0 +1,63 @@ +cmake_minimum_required(VERSION 3.1) +project(examples) + +if ( CMAKE_C_COMPILER_ID MATCHES "MSVC" ) + # using Visual Studio C++ + message(STATUS "INFO: detected MSVC: will not link math lib m") + set(MATHLIB "") + add_definitions("/D_CRT_SECURE_NO_WARNINGS") + set(MSVC_DISABLED_WARNINGS_LIST "C4996") +else() + if(PFFFT_DISABLE_LINK_WITH_M) + else() + message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m") + set(MATHLIB "m") + endif() +endif() + +set(STDCXXLIB "") +if (MINGW) + set(STDCXXLIB "stdc++") +endif() + + +set(CMAKE_CXX_EXTENSIONS OFF) + + +if (PFFFT_USE_TYPE_DOUBLE) + add_executable(example_cpp11_real_dbl_fwd example_cpp11_real_dbl_fwd.cpp) + target_compile_definitions(example_cpp11_real_dbl_fwd PRIVATE PFFFT_ENABLE_DOUBLE) + target_link_libraries(example_cpp11_real_dbl_fwd PFFFT ${STDCXXLIB} ${MATHLIB}) + set_property(TARGET example_cpp11_real_dbl_fwd PROPERTY CXX_STANDARD 11) + set_property(TARGET example_cpp11_real_dbl_fwd PROPERTY CXX_STANDARD_REQUIRED ON) + + add_executable(example_cpp11_cplx_dbl_fwd example_cpp11_cplx_dbl_fwd.cpp) + target_compile_definitions(example_cpp11_cplx_dbl_fwd PRIVATE PFFFT_ENABLE_DOUBLE) + target_link_libraries(example_cpp11_cplx_dbl_fwd PFFFT ${STDCXXLIB} ${MATHLIB}) + set_property(TARGET example_cpp11_cplx_dbl_fwd PROPERTY CXX_STANDARD 11) + set_property(TARGET example_cpp11_cplx_dbl_fwd PROPERTY CXX_STANDARD_REQUIRED ON) + + add_executable(example_c_cplx_dbl_fwd example_c_cplx_dbl_fwd.c) + target_compile_definitions(example_c_cplx_dbl_fwd PRIVATE PFFFT_ENABLE_FLOAT) + target_link_libraries(example_c_cplx_dbl_fwd PFFFT ${MATHLIB}) +endif() + + +if (PFFFT_USE_TYPE_FLOAT) + add_executable(example_cpp98_real_flt_fwd example_cpp98_real_flt_fwd.cpp) + target_compile_definitions(example_cpp98_real_flt_fwd PRIVATE PFFFT_ENABLE_FLOAT) + target_link_libraries(example_cpp98_real_flt_fwd PFFFT ${STDCXXLIB} ${MATHLIB}) + set_property(TARGET example_cpp98_real_flt_fwd PROPERTY CXX_STANDARD 98) + set_property(TARGET example_cpp98_real_flt_fwd PROPERTY CXX_STANDARD_REQUIRED ON) + + add_executable(example_cpp98_cplx_flt_fwd example_cpp98_cplx_flt_fwd.cpp) + target_compile_definitions(example_cpp98_cplx_flt_fwd PRIVATE PFFFT_ENABLE_FLOAT) + target_link_libraries(example_cpp98_cplx_flt_fwd PFFFT ${STDCXXLIB} ${MATHLIB}) + set_property(TARGET example_cpp98_cplx_flt_fwd PROPERTY CXX_STANDARD 98) + set_property(TARGET example_cpp98_cplx_flt_fwd PROPERTY CXX_STANDARD_REQUIRED ON) + + add_executable(example_c_real_flt_fwd example_c_real_flt_fwd.c) + target_compile_definitions(example_c_real_flt_fwd PRIVATE PFFFT_ENABLE_FLOAT) + target_link_libraries(example_c_real_flt_fwd PFFFT ${MATHLIB}) +endif() + diff --git a/pffft/examples/example_c_cplx_dbl_fwd.c b/pffft/examples/example_c_cplx_dbl_fwd.c new file mode 100644 index 0000000..e9adcd9 --- /dev/null +++ b/pffft/examples/example_c_cplx_dbl_fwd.c @@ -0,0 +1,69 @@ + +#include "pffft_double.h" + +#include +#include + + +void c_forward_complex_double(const int transformLen) +{ + printf("running %s()\n", __FUNCTION__); + + /* first check - might be skipped */ + if (transformLen < pffftd_min_fft_size(PFFFT_COMPLEX)) + { + fprintf(stderr, 
"Error: minimum FFT transformation length is %d\n", pffftd_min_fft_size(PFFFT_COMPLEX)); + return; + } + + /* instantiate FFT and prepare transformation for length N */ + PFFFTD_Setup *ffts = pffftd_new_setup(transformLen, PFFFT_COMPLEX); + + /* one more check */ + if (!ffts) + { + fprintf(stderr, + "Error: transformation length %d is not decomposable into small prime factors. " + "Next valid transform size is: %d ; next power of 2 is: %d\n", + transformLen, + pffftd_nearest_transform_size(transformLen, PFFFT_COMPLEX, 1), + pffftd_next_power_of_two(transformLen) ); + return; + } + + /* allocate aligned vectors for input X and output Y */ + double *X = (double*)pffftd_aligned_malloc(transformLen * 2 * sizeof(double)); /* complex: re/im interleaved */ + double *Y = (double*)pffftd_aligned_malloc(transformLen * 2 * sizeof(double)); /* complex: re/im interleaved */ + double *W = (double*)pffftd_aligned_malloc(transformLen * 2 * sizeof(double)); + + /* prepare some input data */ + for (int k = 0; k < 2 * transformLen; k += 4) + { + X[k] = k / 2; /* real */ + X[k+1] = (k / 2) & 1; /* imag */ + + X[k+2] = -1 - k / 2; /* real */ + X[k+3] = (k / 2) & 1; /* imag */ + } + + /* do the forward transform; write complex spectrum result into Y */ + pffftd_transform_ordered(ffts, X, Y, W, PFFFT_FORWARD); + + /* print spectral output */ + printf("output should be complex spectrum with %d complex bins\n", transformLen); + for (int k = 0; k < 2 * transformLen; k += 2) + printf("Y[%d] = %f + i * %f\n", k/2, Y[k], Y[k+1]); + + pffftd_aligned_free(W); + pffftd_aligned_free(Y); + pffftd_aligned_free(X); + pffftd_destroy_setup(ffts); +} + + +int main(int argc, char *argv[]) +{ + int N = (1 < argc) ? atoi(argv[1]) : 16; + c_forward_complex_double(N); + return 0; +} diff --git a/pffft/examples/example_c_real_flt_fwd.c b/pffft/examples/example_c_real_flt_fwd.c new file mode 100644 index 0000000..f52df41 --- /dev/null +++ b/pffft/examples/example_c_real_flt_fwd.c @@ -0,0 +1,66 @@ + +#include "pffft.h" + +#include +#include + + +void c_forward_real_float(const int transformLen) +{ + printf("running %s()\n", __FUNCTION__); + + /* first check - might be skipped */ + if (transformLen < pffft_min_fft_size(PFFFT_REAL)) + { + fprintf(stderr, "Error: minimum FFT transformation length is %d\n", pffft_min_fft_size(PFFFT_REAL)); + return; + } + + /* instantiate FFT and prepare transformation for length N */ + PFFFT_Setup *ffts = pffft_new_setup(transformLen, PFFFT_REAL); + + /* one more check */ + if (!ffts) + { + fprintf(stderr, + "Error: transformation length %d is not decomposable into small prime factors. 
" + "Next valid transform size is: %d ; next power of 2 is: %d\n", + transformLen, + pffft_nearest_transform_size(transformLen, PFFFT_REAL, 1), + pffft_next_power_of_two(transformLen) ); + return; + } + + /* allocate aligned vectors for input X and output Y */ + float *X = (float*)pffft_aligned_malloc(transformLen * sizeof(float)); + float *Y = (float*)pffft_aligned_malloc(transformLen * sizeof(float)); /* complex: re/im interleaved */ + float *W = (float*)pffft_aligned_malloc(transformLen * sizeof(float)); + + /* prepare some input data */ + for (int k = 0; k < transformLen; k += 2) + { + X[k] = k; + X[k+1] = -1-k; + } + + /* do the forward transform; write complex spectrum result into Y */ + pffft_transform_ordered(ffts, X, Y, W, PFFFT_FORWARD); + + /* print spectral output */ + printf("output should be complex spectrum with %d complex bins\n", transformLen /2); + for (int k = 0; k < transformLen; k += 2) + printf("Y[%d] = %f + i * %f\n", k/2, Y[k], Y[k+1]); + + pffft_aligned_free(W); + pffft_aligned_free(Y); + pffft_aligned_free(X); + pffft_destroy_setup(ffts); +} + + +int main(int argc, char *argv[]) +{ + int N = (1 < argc) ? atoi(argv[1]) : 32; + c_forward_real_float(N); + return 0; +} diff --git a/pffft/examples/example_cpp11_cplx_dbl_fwd.cpp b/pffft/examples/example_cpp11_cplx_dbl_fwd.cpp new file mode 100644 index 0000000..e60dbc9 --- /dev/null +++ b/pffft/examples/example_cpp11_cplx_dbl_fwd.cpp @@ -0,0 +1,66 @@ + +#include "pffft.hpp" + +#include +#include + + +void cxx11_forward_complex_double(const int transformLen) +{ + std::cout << "running " << __FUNCTION__ << "()" << std::endl; + + // first check - might be skipped + using FFT_T = pffft::Fft< std::complex >; + if (transformLen < FFT_T::minFFtsize()) + { + std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl; + return; + } + + // instantiate FFT and prepare transformation for length N + pffft::Fft< std::complex > fft(transformLen); + + // one more check + if (!fft.isValid()) + { + std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. " + << "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen) + << "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl; + return; + } + + // allocate aligned vectors for input X and output Y + auto X = fft.valueVector(); + auto Y = fft.spectrumVector(); + + // alternative access: get raw pointers to aligned vectors + std::complex *Xs = X.data(); + std::complex *Ys = Y.data(); + + // prepare some input data + for (int k = 0; k < transformLen; k += 2) + { + X[k] = std::complex(k, k&1); // access through AlignedVector + Xs[k+1] = std::complex(-1-k, k&1); // access through raw pointer + } + + // do the forward transform; write complex spectrum result into Y + fft.forward(X, Y); + + // print spectral output + std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl; + std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl; + for (unsigned k = 0; k < Y.size(); k += 2) + { + std::cout << "Y[" << k << "] = " << Y[k] << std::endl; + std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl; + } +} + + +int main(int argc, char *argv[]) +{ + int N = (1 < argc) ? 
atoi(argv[1]) : 16;
+  cxx11_forward_complex_double(N);
+  return 0;
+}
diff --git a/pffft/examples/example_cpp11_real_dbl_fwd.cpp b/pffft/examples/example_cpp11_real_dbl_fwd.cpp
new file mode 100644
index 0000000..433865a
--- /dev/null
+++ b/pffft/examples/example_cpp11_real_dbl_fwd.cpp
@@ -0,0 +1,66 @@
+
+#include "pffft.hpp"
+
+#include <complex>
+#include <iostream>
+
+
+void cxx11_forward_real_double(const int transformLen)
+{
+  std::cout << "running " << __FUNCTION__ << "()" << std::endl;
+
+  // first check - might be skipped
+  using FFT_T = pffft::Fft<double>;
+  if (transformLen < FFT_T::minFFtsize())
+  {
+    std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
+    return;
+  }
+
+  // instantiate FFT and prepare transformation for length N
+  pffft::Fft<double> fft { transformLen };
+
+  // one more check
+  if (!fft.isValid())
+  {
+    std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
+              << "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
+              << "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
+    return;
+  }
+
+  // allocate aligned vectors for (real) input X and (complex) output Y
+  auto X = fft.valueVector();     // input vector; type is AlignedVector<double>
+  auto Y = fft.spectrumVector();  // output vector; type is AlignedVector< std::complex<double> >
+
+  // alternative access: get raw pointers to aligned vectors
+  double *Xs = X.data();
+  std::complex<double> *Ys = Y.data();
+
+  // prepare some input data
+  for (int k = 0; k < transformLen; k += 2)
+  {
+    X[k]    = k;     // access through AlignedVector
+    Xs[k+1] = -1-k;  // access through raw pointer
+  }
+
+  // do the forward transform; write complex spectrum result into Y
+  fft.forward(X, Y);
+
+  // print spectral output
+  std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
+  std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
+  for (unsigned k = 0; k < Y.size(); k += 2)
+  {
+    std::cout << "Y[" << k << "] = " << Y[k] << std::endl;
+    std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl;
+  }
+}
+
+
+int main(int argc, char *argv[])
+{
+  int N = (1 < argc) ? atoi(argv[1]) : 32;
+  cxx11_forward_real_double(N);
+  return 0;
+}
diff --git a/pffft/examples/example_cpp98_cplx_flt_fwd.cpp b/pffft/examples/example_cpp98_cplx_flt_fwd.cpp
new file mode 100644
index 0000000..91e48cd
--- /dev/null
+++ b/pffft/examples/example_cpp98_cplx_flt_fwd.cpp
@@ -0,0 +1,66 @@
+
+#include "pffft.hpp"
+
+#include <complex>
+#include <iostream>
+
+
+void cxx98_forward_complex_float(const int transformLen)
+{
+  std::cout << "running " << __FUNCTION__ << "()" << std::endl;
+
+  // first check - might be skipped
+  typedef pffft::Fft< std::complex<float> > FFT_T;
+  if (transformLen < FFT_T::minFFtsize())
+  {
+    std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
+    return;
+  }
+
+  // instantiate FFT and prepare transformation for length N
+  pffft::Fft< std::complex<float> > fft(transformLen);
+
+  // one more check
+  if (!fft.isValid())
+  {
+    std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. 
" + << "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen) + << "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl; + return; + } + + // allocate aligned vectors for input X and output Y + pffft::AlignedVector< std::complex > X = fft.valueVector(); + pffft::AlignedVector< std::complex > Y = fft.spectrumVector(); + + // alternative access: get raw pointers to aligned vectors + std::complex *Xs = X.data(); + std::complex *Ys = Y.data(); + + // prepare some input data + for (int k = 0; k < transformLen; k += 2) + { + X[k] = std::complex(k, k&1); // access through AlignedVector + Xs[k+1] = std::complex(-1-k, k&1); // access through raw pointer + } + + // do the forward transform; write complex spectrum result into Y + fft.forward(X, Y); + + // print spectral output + std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl; + std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl; + for (unsigned k = 0; k < Y.size(); k += 2) + { + std::cout << "Y[" << k << "] = " << Y[k] << std::endl; + std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl; + } +} + + +int main(int argc, char *argv[]) +{ + int N = (1 < argc) ? atoi(argv[1]) : 16; + cxx98_forward_complex_float(N); + return 0; +} diff --git a/pffft/examples/example_cpp98_real_flt_fwd.cpp b/pffft/examples/example_cpp98_real_flt_fwd.cpp new file mode 100644 index 0000000..c5ffe2b --- /dev/null +++ b/pffft/examples/example_cpp98_real_flt_fwd.cpp @@ -0,0 +1,66 @@ + +#include "pffft.hpp" + +#include +#include + + +void cxx98_forward_real_float(const int transformLen) +{ + std::cout << "running " << __FUNCTION__ << "()" << std::endl; + + // first check - might be skipped + typedef pffft::Fft FFT_T; + if (transformLen < FFT_T::minFFtsize()) + { + std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl; + return; + } + + // instantiate FFT and prepare transformation for length N + pffft::Fft fft(transformLen); + + // one more check + if (!fft.isValid()) + { + std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. " + << "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen) + << "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl; + return; + } + + // allocate aligned vectors for input X and output Y + pffft::AlignedVector X = fft.valueVector(); + pffft::AlignedVector< std::complex > Y = fft.spectrumVector(); + + // alternative access: get raw pointers to aligned vectors + float *Xs = X.data(); + std::complex *Ys = Y.data(); + + // prepare some input data + for (int k = 0; k < transformLen; k += 2) + { + X[k] = k; // access through AlignedVector + Xs[k+1] = -1-k; // access through raw pointer + } + + // do the forward transform; write complex spectrum result into Y + fft.forward(X, Y); + + // print spectral output + std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl; + std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl; + for (unsigned k = 0; k < Y.size(); k += 2) + { + std::cout << "Y[" << k << "] = " << Y[k] << std::endl; + std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl; + } +} + + +int main(int argc, char *argv[]) +{ + int N = (1 < argc) ? 
atoi(argv[1]) : 32; + cxx98_forward_real_float(N); + return 0; +} diff --git a/pffft/fftpack.c b/pffft/fftpack.c new file mode 100644 index 0000000..0645390 --- /dev/null +++ b/pffft/fftpack.c @@ -0,0 +1,3130 @@ +/* + compile with cc -DTESTING_FFTPACK fftpack.c in order to build the + test application. + + This is an f2c translation of the full fftpack sources as found on + http://www.netlib.org/fftpack/ The translated code has been + slightlty edited to remove the ugliest artefacts of the translation + (a hundred of wild GOTOs were wiped during that operation). + + The original fftpack file was written by Paul N. Swarztrauber + (Version 4, 1985), in fortran 77. + + FFTPACK license: + + http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html + + Copyright (c) 2004 the University Corporation for Atmospheric + Research ("UCAR"). All rights reserved. Developed by NCAR's + Computational and Information Systems Laboratory, UCAR, + www.cisl.ucar.edu. + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. + + ChangeLog: + 2011/10/02: this is my first release of this file. +*/ + +#include "fftpack.h" +#include + +typedef fftpack_real real; +typedef fftpack_int integer; + +#ifndef FFTPACK_DOUBLE_PRECISION + #define FFTPACK_COS cosf + #define FFTPACK_SIN sinf +#else + #define FFTPACK_COS cos + #define FFTPACK_SIN sin +#endif + + +typedef struct f77complex { + real r, i; +} f77complex; + +#ifdef TESTING_FFTPACK +static real c_abs(f77complex *c) { return sqrt(c->r*c->r + c->i*c->i); } +static double dmax(double a, double b) { return a < b ? b : a; } +#endif + +/* define own constants required to turn off g++ extensions .. 
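+   (Build sketch, assuming a typical Unix toolchain: the TESTING_FFTPACK
+    self-test mentioned at the top of this file can be compiled with e.g.
+        cc -DTESTING_FFTPACK fftpack.c -lm -o fftpack_test
+    and then run as ./fftpack_test.)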
*/ +#ifndef M_PI + #define M_PI 3.14159265358979323846 /* pi */ +#endif + +#ifndef M_SQRT2 + #define M_SQRT2 1.41421356237309504880 /* sqrt(2) */ +#endif + + +/* translated by f2c (version 20061008), and slightly edited */ + +static void passfb(integer *nac, integer ido, integer ip, integer l1, integer idl1, + real *cc, real *c1, real *c2, real *ch, real *ch2, const real *wa, real fsign) +{ + /* System generated locals */ + integer ch_offset, cc_offset, + c1_offset, c2_offset, ch2_offset; + + /* Local variables */ + integer i, j, k, l, jc, lc, ik, idj, idl, inc, idp; + real wai, war; + integer ipp2, idij, idlj, idot, ipph; + + +#define c1_ref(a_1,a_2,a_3) c1[((a_3)*l1 + (a_2))*ido + a_1] +#define c2_ref(a_1,a_2) c2[(a_2)*idl1 + a_1] +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*ip + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1] +#define ch2_ref(a_1,a_2) ch2[(a_2)*idl1 + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + ido * (1 + l1); + ch -= ch_offset; + c1_offset = 1 + ido * (1 + l1); + c1 -= c1_offset; + cc_offset = 1 + ido * (1 + ip); + cc -= cc_offset; + ch2_offset = 1 + idl1; + ch2 -= ch2_offset; + c2_offset = 1 + idl1; + c2 -= c2_offset; + --wa; + + /* Function Body */ + idot = ido / 2; + ipp2 = ip + 2; + ipph = (ip + 1) / 2; + idp = ip * ido; + + if (ido >= l1) { + for (j = 2; j <= ipph; ++j) { + jc = ipp2 - j; + for (k = 1; k <= l1; ++k) { + for (i = 1; i <= ido; ++i) { + ch_ref(i, k, j) = cc_ref(i, j, k) + cc_ref(i, jc, k); + ch_ref(i, k, jc) = cc_ref(i, j, k) - cc_ref(i, jc, k); + } + } + } + for (k = 1; k <= l1; ++k) { + for (i = 1; i <= ido; ++i) { + ch_ref(i, k, 1) = cc_ref(i, 1, k); + } + } + } else { + for (j = 2; j <= ipph; ++j) { + jc = ipp2 - j; + for (i = 1; i <= ido; ++i) { + for (k = 1; k <= l1; ++k) { + ch_ref(i, k, j) = cc_ref(i, j, k) + cc_ref(i, jc, k); + ch_ref(i, k, jc) = cc_ref(i, j, k) - cc_ref(i, jc, k); + } + } + } + for (i = 1; i <= ido; ++i) { + for (k = 1; k <= l1; ++k) { + ch_ref(i, k, 1) = cc_ref(i, 1, k); + } + } + } + idl = 2 - ido; + inc = 0; + for (l = 2; l <= ipph; ++l) { + lc = ipp2 - l; + idl += ido; + for (ik = 1; ik <= idl1; ++ik) { + c2_ref(ik, l) = ch2_ref(ik, 1) + wa[idl - 1] * ch2_ref(ik, 2); + c2_ref(ik, lc) = fsign*wa[idl] * ch2_ref(ik, ip); + } + idlj = idl; + inc += ido; + for (j = 3; j <= ipph; ++j) { + jc = ipp2 - j; + idlj += inc; + if (idlj > idp) { + idlj -= idp; + } + war = wa[idlj - 1]; + wai = wa[idlj]; + for (ik = 1; ik <= idl1; ++ik) { + c2_ref(ik, l) = c2_ref(ik, l) + war * ch2_ref(ik, j); + c2_ref(ik, lc) = c2_ref(ik, lc) + fsign*wai * ch2_ref(ik, jc); + } + } + } + for (j = 2; j <= ipph; ++j) { + for (ik = 1; ik <= idl1; ++ik) { + ch2_ref(ik, 1) = ch2_ref(ik, 1) + ch2_ref(ik, j); + } + } + for (j = 2; j <= ipph; ++j) { + jc = ipp2 - j; + for (ik = 2; ik <= idl1; ik += 2) { + ch2_ref(ik - 1, j) = c2_ref(ik - 1, j) - c2_ref(ik, jc); + ch2_ref(ik - 1, jc) = c2_ref(ik - 1, j) + c2_ref(ik, jc); + ch2_ref(ik, j) = c2_ref(ik, j) + c2_ref(ik - 1, jc); + ch2_ref(ik, jc) = c2_ref(ik, j) - c2_ref(ik - 1, jc); + } + } + *nac = 1; + if (ido == 2) { + return; + } + *nac = 0; + for (ik = 1; ik <= idl1; ++ik) { + c2_ref(ik, 1) = ch2_ref(ik, 1); + } + for (j = 2; j <= ip; ++j) { + for (k = 1; k <= l1; ++k) { + c1_ref(1, k, j) = ch_ref(1, k, j); + c1_ref(2, k, j) = ch_ref(2, k, j); + } + } + if (idot <= l1) { + idij = 0; + for (j = 2; j <= ip; ++j) { + idij += 2; + for (i = 4; i <= ido; i += 2) { + idij += 2; + for (k = 1; k <= l1; ++k) { + c1_ref(i - 1, k, j) = wa[idij - 1] * ch_ref(i - 1, k, j) - 
fsign*wa[idij] * ch_ref(i, k, j); + c1_ref(i, k, j) = wa[idij - 1] * ch_ref(i, k, j) + fsign*wa[idij] * ch_ref(i - 1, k, j); + } + } + } + return; + } + idj = 2 - ido; + for (j = 2; j <= ip; ++j) { + idj += ido; + for (k = 1; k <= l1; ++k) { + idij = idj; + for (i = 4; i <= ido; i += 2) { + idij += 2; + c1_ref(i - 1, k, j) = wa[idij - 1] * ch_ref(i - 1, k, j) - fsign*wa[idij] * ch_ref(i, k, j); + c1_ref(i, k, j) = wa[idij - 1] * ch_ref(i, k, j) + fsign*wa[idij] * ch_ref(i - 1, k, j); + } + } + } +} /* passb */ + +#undef ch2_ref +#undef ch_ref +#undef cc_ref +#undef c2_ref +#undef c1_ref + + +static void passb2(integer ido, integer l1, const real *cc, real *ch, const real *wa1) +{ + /* System generated locals */ + integer cc_offset, ch_offset; + + /* Local variables */ + integer i, k; + real ti2, tr2; + + +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*2 + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + ido * (1 + l1); + ch -= ch_offset; + cc_offset = 1 + ido * 3; + cc -= cc_offset; + --wa1; + + /* Function Body */ + if (ido <= 2) { + for (k = 1; k <= l1; ++k) { + ch_ref(1, k, 1) = cc_ref(1, 1, k) + cc_ref(1, 2, k); + ch_ref(1, k, 2) = cc_ref(1, 1, k) - cc_ref(1, 2, k); + ch_ref(2, k, 1) = cc_ref(2, 1, k) + cc_ref(2, 2, k); + ch_ref(2, k, 2) = cc_ref(2, 1, k) - cc_ref(2, 2, k); + } + return; + } + for (k = 1; k <= l1; ++k) { + for (i = 2; i <= ido; i += 2) { + ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + cc_ref(i - 1, 2, k); + tr2 = cc_ref(i - 1, 1, k) - cc_ref(i - 1, 2, k); + ch_ref(i, k, 1) = cc_ref(i, 1, k) + cc_ref(i, 2, k); + ti2 = cc_ref(i, 1, k) - cc_ref(i, 2, k); + ch_ref(i, k, 2) = wa1[i - 1] * ti2 + wa1[i] * tr2; + ch_ref(i - 1, k, 2) = wa1[i - 1] * tr2 - wa1[i] * ti2; + } + } +} /* passb2 */ + +#undef ch_ref +#undef cc_ref + + +static void passb3(integer ido, integer l1, const real *cc, real *ch, const real *wa1, const real *wa2) +{ + static const real taur = -.5f; + static const real taui = .866025403784439f; + + /* System generated locals */ + integer cc_offset, ch_offset; + + /* Local variables */ + integer i, k; + real ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2; + + +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*3 + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + ido * (1 + l1); + ch -= ch_offset; + cc_offset = 1 + (ido << 2); + cc -= cc_offset; + --wa1; + --wa2; + + /* Function Body */ + if (ido == 2) { + for (k = 1; k <= l1; ++k) { + tr2 = cc_ref(1, 2, k) + cc_ref(1, 3, k); + cr2 = cc_ref(1, 1, k) + taur * tr2; + ch_ref(1, k, 1) = cc_ref(1, 1, k) + tr2; + ti2 = cc_ref(2, 2, k) + cc_ref(2, 3, k); + ci2 = cc_ref(2, 1, k) + taur * ti2; + ch_ref(2, k, 1) = cc_ref(2, 1, k) + ti2; + cr3 = taui * (cc_ref(1, 2, k) - cc_ref(1, 3, k)); + ci3 = taui * (cc_ref(2, 2, k) - cc_ref(2, 3, k)); + ch_ref(1, k, 2) = cr2 - ci3; + ch_ref(1, k, 3) = cr2 + ci3; + ch_ref(2, k, 2) = ci2 + cr3; + ch_ref(2, k, 3) = ci2 - cr3; + } + } else { + for (k = 1; k <= l1; ++k) { + for (i = 2; i <= ido; i += 2) { + tr2 = cc_ref(i - 1, 2, k) + cc_ref(i - 1, 3, k); + cr2 = cc_ref(i - 1, 1, k) + taur * tr2; + ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + tr2; + ti2 = cc_ref(i, 2, k) + cc_ref(i, 3, k); + ci2 = cc_ref(i, 1, k) + taur * ti2; + ch_ref(i, k, 1) = cc_ref(i, 1, k) + ti2; + cr3 = taui * (cc_ref(i - 1, 2, k) - cc_ref(i - 1, 3, k)); + ci3 = taui * (cc_ref(i, 2, k) - cc_ref(i, 3, k)); + dr2 = cr2 - ci3; + dr3 = cr2 + ci3; + di2 = ci2 + cr3; + di3 = ci2 - cr3; + 
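+                /* di2/di3/dr2/dr3 above are the raw radix-3 butterfly outputs;
+                   the assignments below rotate them by the twiddle factors in
+                   wa1/wa2 - an ordinary complex multiplication with wa[i-1]
+                   holding the cosine and wa[i] the sine of the twiddle angle. */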
ch_ref(i, k, 2) = wa1[i - 1] * di2 + wa1[i] * dr2; + ch_ref(i - 1, k, 2) = wa1[i - 1] * dr2 - wa1[i] * di2; + ch_ref(i, k, 3) = wa2[i - 1] * di3 + wa2[i] * dr3; + ch_ref(i - 1, k, 3) = wa2[i - 1] * dr3 - wa2[i] * di3; + } + } + } +} /* passb3 */ + +#undef ch_ref +#undef cc_ref + + +static void passb4(integer ido, integer l1, const real *cc, real *ch, + const real *wa1, const real *wa2, const real *wa3) +{ + /* System generated locals */ + integer cc_offset, ch_offset; + + /* Local variables */ + integer i, k; + real ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4; + + +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*4 + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + ido * (1 + l1); + ch -= ch_offset; + cc_offset = 1 + ido * 5; + cc -= cc_offset; + --wa1; + --wa2; + --wa3; + + /* Function Body */ + if (ido == 2) { + for (k = 1; k <= l1; ++k) { + ti1 = cc_ref(2, 1, k) - cc_ref(2, 3, k); + ti2 = cc_ref(2, 1, k) + cc_ref(2, 3, k); + tr4 = cc_ref(2, 4, k) - cc_ref(2, 2, k); + ti3 = cc_ref(2, 2, k) + cc_ref(2, 4, k); + tr1 = cc_ref(1, 1, k) - cc_ref(1, 3, k); + tr2 = cc_ref(1, 1, k) + cc_ref(1, 3, k); + ti4 = cc_ref(1, 2, k) - cc_ref(1, 4, k); + tr3 = cc_ref(1, 2, k) + cc_ref(1, 4, k); + ch_ref(1, k, 1) = tr2 + tr3; + ch_ref(1, k, 3) = tr2 - tr3; + ch_ref(2, k, 1) = ti2 + ti3; + ch_ref(2, k, 3) = ti2 - ti3; + ch_ref(1, k, 2) = tr1 + tr4; + ch_ref(1, k, 4) = tr1 - tr4; + ch_ref(2, k, 2) = ti1 + ti4; + ch_ref(2, k, 4) = ti1 - ti4; + } + } else { + for (k = 1; k <= l1; ++k) { + for (i = 2; i <= ido; i += 2) { + ti1 = cc_ref(i, 1, k) - cc_ref(i, 3, k); + ti2 = cc_ref(i, 1, k) + cc_ref(i, 3, k); + ti3 = cc_ref(i, 2, k) + cc_ref(i, 4, k); + tr4 = cc_ref(i, 4, k) - cc_ref(i, 2, k); + tr1 = cc_ref(i - 1, 1, k) - cc_ref(i - 1, 3, k); + tr2 = cc_ref(i - 1, 1, k) + cc_ref(i - 1, 3, k); + ti4 = cc_ref(i - 1, 2, k) - cc_ref(i - 1, 4, k); + tr3 = cc_ref(i - 1, 2, k) + cc_ref(i - 1, 4, k); + ch_ref(i - 1, k, 1) = tr2 + tr3; + cr3 = tr2 - tr3; + ch_ref(i, k, 1) = ti2 + ti3; + ci3 = ti2 - ti3; + cr2 = tr1 + tr4; + cr4 = tr1 - tr4; + ci2 = ti1 + ti4; + ci4 = ti1 - ti4; + ch_ref(i - 1, k, 2) = wa1[i - 1] * cr2 - wa1[i] * ci2; + ch_ref(i, k, 2) = wa1[i - 1] * ci2 + wa1[i] * cr2; + ch_ref(i - 1, k, 3) = wa2[i - 1] * cr3 - wa2[i] * ci3; + ch_ref(i, k, 3) = wa2[i - 1] * ci3 + wa2[i] * cr3; + ch_ref(i - 1, k, 4) = wa3[i - 1] * cr4 - wa3[i] * ci4; + ch_ref(i, k, 4) = wa3[i - 1] * ci4 + wa3[i] * cr4; + } + } + } +} /* passb4 */ + +#undef ch_ref +#undef cc_ref + +/* passf5 and passb5 merged */ +static void passfb5(integer ido, integer l1, const real *cc, real *ch, + const real *wa1, const real *wa2, const real *wa3, const real *wa4, real fsign) +{ + const real tr11 = .309016994374947f; + const real ti11 = .951056516295154f*fsign; + const real tr12 = -.809016994374947f; + const real ti12 = .587785252292473f*fsign; + + /* System generated locals */ + integer cc_offset, ch_offset; + + /* Local variables */ + integer i, k; + real ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3, + ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5; + + +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + ido * (1 + l1); + ch -= ch_offset; + cc_offset = 1 + ido * 6; + cc -= cc_offset; + --wa1; + --wa2; + --wa3; + --wa4; + + /* Function Body */ + if (ido == 2) { + for (k = 1; k <= l1; ++k) { + ti5 = cc_ref(2, 2, k) - 
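+            /* passfb5 handles both directions: fsign only flips the sign of the
+               sine constants ti11/ti12 above, which is all that distinguishes
+               the forward from the backward radix-5 pass; the tr/ti temporaries
+               here are sums and differences of the symmetric input pairs. */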
cc_ref(2, 5, k); + ti2 = cc_ref(2, 2, k) + cc_ref(2, 5, k); + ti4 = cc_ref(2, 3, k) - cc_ref(2, 4, k); + ti3 = cc_ref(2, 3, k) + cc_ref(2, 4, k); + tr5 = cc_ref(1, 2, k) - cc_ref(1, 5, k); + tr2 = cc_ref(1, 2, k) + cc_ref(1, 5, k); + tr4 = cc_ref(1, 3, k) - cc_ref(1, 4, k); + tr3 = cc_ref(1, 3, k) + cc_ref(1, 4, k); + ch_ref(1, k, 1) = cc_ref(1, 1, k) + tr2 + tr3; + ch_ref(2, k, 1) = cc_ref(2, 1, k) + ti2 + ti3; + cr2 = cc_ref(1, 1, k) + tr11 * tr2 + tr12 * tr3; + ci2 = cc_ref(2, 1, k) + tr11 * ti2 + tr12 * ti3; + cr3 = cc_ref(1, 1, k) + tr12 * tr2 + tr11 * tr3; + ci3 = cc_ref(2, 1, k) + tr12 * ti2 + tr11 * ti3; + cr5 = ti11 * tr5 + ti12 * tr4; + ci5 = ti11 * ti5 + ti12 * ti4; + cr4 = ti12 * tr5 - ti11 * tr4; + ci4 = ti12 * ti5 - ti11 * ti4; + ch_ref(1, k, 2) = cr2 - ci5; + ch_ref(1, k, 5) = cr2 + ci5; + ch_ref(2, k, 2) = ci2 + cr5; + ch_ref(2, k, 3) = ci3 + cr4; + ch_ref(1, k, 3) = cr3 - ci4; + ch_ref(1, k, 4) = cr3 + ci4; + ch_ref(2, k, 4) = ci3 - cr4; + ch_ref(2, k, 5) = ci2 - cr5; + } + } else { + for (k = 1; k <= l1; ++k) { + for (i = 2; i <= ido; i += 2) { + ti5 = cc_ref(i, 2, k) - cc_ref(i, 5, k); + ti2 = cc_ref(i, 2, k) + cc_ref(i, 5, k); + ti4 = cc_ref(i, 3, k) - cc_ref(i, 4, k); + ti3 = cc_ref(i, 3, k) + cc_ref(i, 4, k); + tr5 = cc_ref(i - 1, 2, k) - cc_ref(i - 1, 5, k); + tr2 = cc_ref(i - 1, 2, k) + cc_ref(i - 1, 5, k); + tr4 = cc_ref(i - 1, 3, k) - cc_ref(i - 1, 4, k); + tr3 = cc_ref(i - 1, 3, k) + cc_ref(i - 1, 4, k); + ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + tr2 + tr3; + ch_ref(i, k, 1) = cc_ref(i, 1, k) + ti2 + ti3; + cr2 = cc_ref(i - 1, 1, k) + tr11 * tr2 + tr12 * tr3; + ci2 = cc_ref(i, 1, k) + tr11 * ti2 + tr12 * ti3; + cr3 = cc_ref(i - 1, 1, k) + tr12 * tr2 + tr11 * tr3; + ci3 = cc_ref(i, 1, k) + tr12 * ti2 + tr11 * ti3; + cr5 = ti11 * tr5 + ti12 * tr4; + ci5 = ti11 * ti5 + ti12 * ti4; + cr4 = ti12 * tr5 - ti11 * tr4; + ci4 = ti12 * ti5 - ti11 * ti4; + dr3 = cr3 - ci4; + dr4 = cr3 + ci4; + di3 = ci3 + cr4; + di4 = ci3 - cr4; + dr5 = cr2 + ci5; + dr2 = cr2 - ci5; + di5 = ci2 - cr5; + di2 = ci2 + cr5; + ch_ref(i - 1, k, 2) = wa1[i - 1] * dr2 - fsign*wa1[i] * di2; + ch_ref(i, k, 2) = wa1[i - 1] * di2 + fsign*wa1[i] * dr2; + ch_ref(i - 1, k, 3) = wa2[i - 1] * dr3 - fsign*wa2[i] * di3; + ch_ref(i, k, 3) = wa2[i - 1] * di3 + fsign*wa2[i] * dr3; + ch_ref(i - 1, k, 4) = wa3[i - 1] * dr4 - fsign*wa3[i] * di4; + ch_ref(i, k, 4) = wa3[i - 1] * di4 + fsign*wa3[i] * dr4; + ch_ref(i - 1, k, 5) = wa4[i - 1] * dr5 - fsign*wa4[i] * di5; + ch_ref(i, k, 5) = wa4[i - 1] * di5 + fsign*wa4[i] * dr5; + } + } + } +} /* passb5 */ + +#undef ch_ref +#undef cc_ref + +static void passf2(integer ido, integer l1, const real *cc, real *ch, const real *wa1) +{ + /* System generated locals */ + integer cc_offset, ch_offset; + + /* Local variables */ + integer i, k; + real ti2, tr2; + + +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*2 + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + ido * (1 + l1); + ch -= ch_offset; + cc_offset = 1 + ido * 3; + cc -= cc_offset; + --wa1; + + /* Function Body */ + if (ido == 2) { + for (k = 1; k <= l1; ++k) { + ch_ref(1, k, 1) = cc_ref(1, 1, k) + cc_ref(1, 2, k); + ch_ref(1, k, 2) = cc_ref(1, 1, k) - cc_ref(1, 2, k); + ch_ref(2, k, 1) = cc_ref(2, 1, k) + cc_ref(2, 2, k); + ch_ref(2, k, 2) = cc_ref(2, 1, k) - cc_ref(2, 2, k); + } + } else { + for (k = 1; k <= l1; ++k) { + for (i = 2; i <= ido; i += 2) { + ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + cc_ref(i - 1, 2, + k); + tr2 = cc_ref(i - 1, 1, 
k) - cc_ref(i - 1, 2, k); + ch_ref(i, k, 1) = cc_ref(i, 1, k) + cc_ref(i, 2, k); + ti2 = cc_ref(i, 1, k) - cc_ref(i, 2, k); + ch_ref(i, k, 2) = wa1[i - 1] * ti2 - wa1[i] * tr2; + ch_ref(i - 1, k, 2) = wa1[i - 1] * tr2 + wa1[i] * ti2; + } + } + } +} /* passf2 */ + +#undef ch_ref +#undef cc_ref + + +static void passf3(integer ido, integer l1, const real *cc, real *ch, + const real *wa1, const real *wa2) +{ + static const real taur = -.5f; + static const real taui = -.866025403784439f; + + /* System generated locals */ + integer cc_offset, ch_offset; + + /* Local variables */ + integer i, k; + real ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2; + + +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*3 + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + ido * (1 + l1); + ch -= ch_offset; + cc_offset = 1 + (ido << 2); + cc -= cc_offset; + --wa1; + --wa2; + + /* Function Body */ + if (ido == 2) { + for (k = 1; k <= l1; ++k) { + tr2 = cc_ref(1, 2, k) + cc_ref(1, 3, k); + cr2 = cc_ref(1, 1, k) + taur * tr2; + ch_ref(1, k, 1) = cc_ref(1, 1, k) + tr2; + ti2 = cc_ref(2, 2, k) + cc_ref(2, 3, k); + ci2 = cc_ref(2, 1, k) + taur * ti2; + ch_ref(2, k, 1) = cc_ref(2, 1, k) + ti2; + cr3 = taui * (cc_ref(1, 2, k) - cc_ref(1, 3, k)); + ci3 = taui * (cc_ref(2, 2, k) - cc_ref(2, 3, k)); + ch_ref(1, k, 2) = cr2 - ci3; + ch_ref(1, k, 3) = cr2 + ci3; + ch_ref(2, k, 2) = ci2 + cr3; + ch_ref(2, k, 3) = ci2 - cr3; + } + } else { + for (k = 1; k <= l1; ++k) { + for (i = 2; i <= ido; i += 2) { + tr2 = cc_ref(i - 1, 2, k) + cc_ref(i - 1, 3, k); + cr2 = cc_ref(i - 1, 1, k) + taur * tr2; + ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + tr2; + ti2 = cc_ref(i, 2, k) + cc_ref(i, 3, k); + ci2 = cc_ref(i, 1, k) + taur * ti2; + ch_ref(i, k, 1) = cc_ref(i, 1, k) + ti2; + cr3 = taui * (cc_ref(i - 1, 2, k) - cc_ref(i - 1, 3, k)); + ci3 = taui * (cc_ref(i, 2, k) - cc_ref(i, 3, k)); + dr2 = cr2 - ci3; + dr3 = cr2 + ci3; + di2 = ci2 + cr3; + di3 = ci2 - cr3; + ch_ref(i, k, 2) = wa1[i - 1] * di2 - wa1[i] * dr2; + ch_ref(i - 1, k, 2) = wa1[i - 1] * dr2 + wa1[i] * di2; + ch_ref(i, k, 3) = wa2[i - 1] * di3 - wa2[i] * dr3; + ch_ref(i - 1, k, 3) = wa2[i - 1] * dr3 + wa2[i] * di3; + } + } + } +} /* passf3 */ + +#undef ch_ref +#undef cc_ref + + +static void passf4(integer ido, integer l1, const real *cc, real *ch, + const real *wa1, const real *wa2, const real *wa3) +{ + /* System generated locals */ + integer cc_offset, ch_offset; + + /* Local variables */ + integer i, k; + real ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4; + + +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*4 + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + ido * (1 + l1); + ch -= ch_offset; + cc_offset = 1 + ido * 5; + cc -= cc_offset; + --wa1; + --wa2; + --wa3; + + /* Function Body */ + if (ido == 2) { + for (k = 1; k <= l1; ++k) { + ti1 = cc_ref(2, 1, k) - cc_ref(2, 3, k); + ti2 = cc_ref(2, 1, k) + cc_ref(2, 3, k); + tr4 = cc_ref(2, 2, k) - cc_ref(2, 4, k); + ti3 = cc_ref(2, 2, k) + cc_ref(2, 4, k); + tr1 = cc_ref(1, 1, k) - cc_ref(1, 3, k); + tr2 = cc_ref(1, 1, k) + cc_ref(1, 3, k); + ti4 = cc_ref(1, 4, k) - cc_ref(1, 2, k); + tr3 = cc_ref(1, 2, k) + cc_ref(1, 4, k); + ch_ref(1, k, 1) = tr2 + tr3; + ch_ref(1, k, 3) = tr2 - tr3; + ch_ref(2, k, 1) = ti2 + ti3; + ch_ref(2, k, 3) = ti2 - ti3; + ch_ref(1, k, 2) = tr1 + tr4; + ch_ref(1, k, 4) = tr1 - tr4; + ch_ref(2, k, 2) = ti1 + ti4; + ch_ref(2, k, 4) = ti1 - 
ti4; + } + } else { + for (k = 1; k <= l1; ++k) { + for (i = 2; i <= ido; i += 2) { + ti1 = cc_ref(i, 1, k) - cc_ref(i, 3, k); + ti2 = cc_ref(i, 1, k) + cc_ref(i, 3, k); + ti3 = cc_ref(i, 2, k) + cc_ref(i, 4, k); + tr4 = cc_ref(i, 2, k) - cc_ref(i, 4, k); + tr1 = cc_ref(i - 1, 1, k) - cc_ref(i - 1, 3, k); + tr2 = cc_ref(i - 1, 1, k) + cc_ref(i - 1, 3, k); + ti4 = cc_ref(i - 1, 4, k) - cc_ref(i - 1, 2, k); + tr3 = cc_ref(i - 1, 2, k) + cc_ref(i - 1, 4, k); + ch_ref(i - 1, k, 1) = tr2 + tr3; + cr3 = tr2 - tr3; + ch_ref(i, k, 1) = ti2 + ti3; + ci3 = ti2 - ti3; + cr2 = tr1 + tr4; + cr4 = tr1 - tr4; + ci2 = ti1 + ti4; + ci4 = ti1 - ti4; + ch_ref(i - 1, k, 2) = wa1[i - 1] * cr2 + wa1[i] * ci2; + ch_ref(i, k, 2) = wa1[i - 1] * ci2 - wa1[i] * cr2; + ch_ref(i - 1, k, 3) = wa2[i - 1] * cr3 + wa2[i] * ci3; + ch_ref(i, k, 3) = wa2[i - 1] * ci3 - wa2[i] * cr3; + ch_ref(i - 1, k, 4) = wa3[i - 1] * cr4 + wa3[i] * ci4; + ch_ref(i, k, 4) = wa3[i - 1] * ci4 - wa3[i] * cr4; + } + } + } +} /* passf4 */ + +#undef ch_ref +#undef cc_ref + +static void radb2(integer ido, integer l1, const real *cc, real *ch, const real *wa1) +{ + /* System generated locals */ + integer cc_offset, ch_offset; + + /* Local variables */ + integer i, k, ic; + real ti2, tr2; + integer idp2; + + +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*2 + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + ido * (1 + l1); + ch -= ch_offset; + cc_offset = 1 + ido * 3; + cc -= cc_offset; + --wa1; + + /* Function Body */ + for (k = 1; k <= l1; ++k) { + ch_ref(1, k, 1) = cc_ref(1, 1, k) + cc_ref(ido, 2, k); + ch_ref(1, k, 2) = cc_ref(1, 1, k) - cc_ref(ido, 2, k); + } + if (ido < 2) return; + else if (ido != 2) { + idp2 = ido + 2; + for (k = 1; k <= l1; ++k) { + for (i = 3; i <= ido; i += 2) { + ic = idp2 - i; + ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + cc_ref(ic - 1, 2, + k); + tr2 = cc_ref(i - 1, 1, k) - cc_ref(ic - 1, 2, k); + ch_ref(i, k, 1) = cc_ref(i, 1, k) - cc_ref(ic, 2, k); + ti2 = cc_ref(i, 1, k) + cc_ref(ic, 2, k); + ch_ref(i - 1, k, 2) = wa1[i - 2] * tr2 - wa1[i - 1] * ti2; + ch_ref(i, k, 2) = wa1[i - 2] * ti2 + wa1[i - 1] * tr2; + } + } + if (ido % 2 == 1) return; + } + for (k = 1; k <= l1; ++k) { + ch_ref(ido, k, 1) = cc_ref(ido, 1, k) + cc_ref(ido, 1, k); + ch_ref(ido, k, 2) = -(cc_ref(1, 2, k) + cc_ref(1, 2, k)); + } +} /* radb2 */ + +#undef ch_ref +#undef cc_ref + + +static void radb3(integer ido, integer l1, const real *cc, real *ch, + const real *wa1, const real *wa2) +{ + /* Initialized data */ + + static const real taur = -.5f; + static const real taui = .866025403784439f; + + /* System generated locals */ + integer cc_offset, ch_offset; + + /* Local variables */ + integer i, k, ic; + real ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2; + integer idp2; + + +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*3 + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + ido * (1 + l1); + ch -= ch_offset; + cc_offset = 1 + (ido << 2); + cc -= cc_offset; + --wa1; + --wa2; + + /* Function Body */ + for (k = 1; k <= l1; ++k) { + tr2 = cc_ref(ido, 2, k) + cc_ref(ido, 2, k); + cr2 = cc_ref(1, 1, k) + taur * tr2; + ch_ref(1, k, 1) = cc_ref(1, 1, k) + tr2; + ci3 = taui * (cc_ref(1, 3, k) + cc_ref(1, 3, k)); + ch_ref(1, k, 2) = cr2 - ci3; + ch_ref(1, k, 3) = cr2 + ci3; + } + if (ido == 1) { + return; + } + idp2 = ido + 2; + for (k = 1; k <= l1; ++k) { + for (i = 3; i <= ido; i += 2) { + ic = idp2 - i; + tr2 = 
cc_ref(i - 1, 3, k) + cc_ref(ic - 1, 2, k); + cr2 = cc_ref(i - 1, 1, k) + taur * tr2; + ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + tr2; + ti2 = cc_ref(i, 3, k) - cc_ref(ic, 2, k); + ci2 = cc_ref(i, 1, k) + taur * ti2; + ch_ref(i, k, 1) = cc_ref(i, 1, k) + ti2; + cr3 = taui * (cc_ref(i - 1, 3, k) - cc_ref(ic - 1, 2, k)); + ci3 = taui * (cc_ref(i, 3, k) + cc_ref(ic, 2, k)); + dr2 = cr2 - ci3; + dr3 = cr2 + ci3; + di2 = ci2 + cr3; + di3 = ci2 - cr3; + ch_ref(i - 1, k, 2) = wa1[i - 2] * dr2 - wa1[i - 1] * di2; + ch_ref(i, k, 2) = wa1[i - 2] * di2 + wa1[i - 1] * dr2; + ch_ref(i - 1, k, 3) = wa2[i - 2] * dr3 - wa2[i - 1] * di3; + ch_ref(i, k, 3) = wa2[i - 2] * di3 + wa2[i - 1] * dr3; + } + } +} /* radb3 */ + +#undef ch_ref +#undef cc_ref + + +static void radb4(integer ido, integer l1, const real *cc, real *ch, + const real *wa1, const real *wa2, const real *wa3) +{ + /* Initialized data */ + + static const real sqrt2 = 1.414213562373095f; + + /* System generated locals */ + integer cc_offset, ch_offset; + + /* Local variables */ + integer i, k, ic; + real ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4; + integer idp2; + + +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*4 + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + ido * (1 + l1); + ch -= ch_offset; + cc_offset = 1 + ido * 5; + cc -= cc_offset; + --wa1; + --wa2; + --wa3; + + /* Function Body */ + for (k = 1; k <= l1; ++k) { + tr1 = cc_ref(1, 1, k) - cc_ref(ido, 4, k); + tr2 = cc_ref(1, 1, k) + cc_ref(ido, 4, k); + tr3 = cc_ref(ido, 2, k) + cc_ref(ido, 2, k); + tr4 = cc_ref(1, 3, k) + cc_ref(1, 3, k); + ch_ref(1, k, 1) = tr2 + tr3; + ch_ref(1, k, 2) = tr1 - tr4; + ch_ref(1, k, 3) = tr2 - tr3; + ch_ref(1, k, 4) = tr1 + tr4; + } + if (ido < 2) return; + if (ido != 2) { + idp2 = ido + 2; + for (k = 1; k <= l1; ++k) { + for (i = 3; i <= ido; i += 2) { + ic = idp2 - i; + ti1 = cc_ref(i, 1, k) + cc_ref(ic, 4, k); + ti2 = cc_ref(i, 1, k) - cc_ref(ic, 4, k); + ti3 = cc_ref(i, 3, k) - cc_ref(ic, 2, k); + tr4 = cc_ref(i, 3, k) + cc_ref(ic, 2, k); + tr1 = cc_ref(i - 1, 1, k) - cc_ref(ic - 1, 4, k); + tr2 = cc_ref(i - 1, 1, k) + cc_ref(ic - 1, 4, k); + ti4 = cc_ref(i - 1, 3, k) - cc_ref(ic - 1, 2, k); + tr3 = cc_ref(i - 1, 3, k) + cc_ref(ic - 1, 2, k); + ch_ref(i - 1, k, 1) = tr2 + tr3; + cr3 = tr2 - tr3; + ch_ref(i, k, 1) = ti2 + ti3; + ci3 = ti2 - ti3; + cr2 = tr1 - tr4; + cr4 = tr1 + tr4; + ci2 = ti1 + ti4; + ci4 = ti1 - ti4; + ch_ref(i - 1, k, 2) = wa1[i - 2] * cr2 - wa1[i - 1] * ci2; + ch_ref(i, k, 2) = wa1[i - 2] * ci2 + wa1[i - 1] * cr2; + ch_ref(i - 1, k, 3) = wa2[i - 2] * cr3 - wa2[i - 1] * ci3; + ch_ref(i, k, 3) = wa2[i - 2] * ci3 + wa2[i - 1] * cr3; + ch_ref(i - 1, k, 4) = wa3[i - 2] * cr4 - wa3[i - 1] * ci4; + ch_ref(i, k, 4) = wa3[i - 2] * ci4 + wa3[i - 1] * cr4; + } + } + if (ido % 2 == 1) return; + } + for (k = 1; k <= l1; ++k) { + ti1 = cc_ref(1, 2, k) + cc_ref(1, 4, k); + ti2 = cc_ref(1, 4, k) - cc_ref(1, 2, k); + tr1 = cc_ref(ido, 1, k) - cc_ref(ido, 3, k); + tr2 = cc_ref(ido, 1, k) + cc_ref(ido, 3, k); + ch_ref(ido, k, 1) = tr2 + tr2; + ch_ref(ido, k, 2) = sqrt2 * (tr1 - ti1); + ch_ref(ido, k, 3) = ti2 + ti2; + ch_ref(ido, k, 4) = -sqrt2 * (tr1 + ti1); + } +} /* radb4 */ + +#undef ch_ref +#undef cc_ref + + +static void radb5(integer ido, integer l1, const real *cc, real *ch, + const real *wa1, const real *wa2, const real *wa3, const real *wa4) +{ + /* Initialized data */ + + static const real tr11 = .309016994374947f; + static const real 
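+    /* fixed radix-5 twiddle constants: tr11 = cos(2*pi/5), ti11 = sin(2*pi/5),
+       tr12 = cos(4*pi/5), ti12 = sin(4*pi/5) */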
ti11 = .951056516295154f; + static const real tr12 = -.809016994374947f; + static const real ti12 = .587785252292473f; + + /* System generated locals */ + integer cc_offset, ch_offset; + + /* Local variables */ + integer i, k, ic; + real ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3, + ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5; + integer idp2; + + +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + ido * (1 + l1); + ch -= ch_offset; + cc_offset = 1 + ido * 6; + cc -= cc_offset; + --wa1; + --wa2; + --wa3; + --wa4; + + /* Function Body */ + for (k = 1; k <= l1; ++k) { + ti5 = cc_ref(1, 3, k) + cc_ref(1, 3, k); + ti4 = cc_ref(1, 5, k) + cc_ref(1, 5, k); + tr2 = cc_ref(ido, 2, k) + cc_ref(ido, 2, k); + tr3 = cc_ref(ido, 4, k) + cc_ref(ido, 4, k); + ch_ref(1, k, 1) = cc_ref(1, 1, k) + tr2 + tr3; + cr2 = cc_ref(1, 1, k) + tr11 * tr2 + tr12 * tr3; + cr3 = cc_ref(1, 1, k) + tr12 * tr2 + tr11 * tr3; + ci5 = ti11 * ti5 + ti12 * ti4; + ci4 = ti12 * ti5 - ti11 * ti4; + ch_ref(1, k, 2) = cr2 - ci5; + ch_ref(1, k, 3) = cr3 - ci4; + ch_ref(1, k, 4) = cr3 + ci4; + ch_ref(1, k, 5) = cr2 + ci5; + } + if (ido == 1) { + return; + } + idp2 = ido + 2; + for (k = 1; k <= l1; ++k) { + for (i = 3; i <= ido; i += 2) { + ic = idp2 - i; + ti5 = cc_ref(i, 3, k) + cc_ref(ic, 2, k); + ti2 = cc_ref(i, 3, k) - cc_ref(ic, 2, k); + ti4 = cc_ref(i, 5, k) + cc_ref(ic, 4, k); + ti3 = cc_ref(i, 5, k) - cc_ref(ic, 4, k); + tr5 = cc_ref(i - 1, 3, k) - cc_ref(ic - 1, 2, k); + tr2 = cc_ref(i - 1, 3, k) + cc_ref(ic - 1, 2, k); + tr4 = cc_ref(i - 1, 5, k) - cc_ref(ic - 1, 4, k); + tr3 = cc_ref(i - 1, 5, k) + cc_ref(ic - 1, 4, k); + ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + tr2 + tr3; + ch_ref(i, k, 1) = cc_ref(i, 1, k) + ti2 + ti3; + cr2 = cc_ref(i - 1, 1, k) + tr11 * tr2 + tr12 * tr3; + ci2 = cc_ref(i, 1, k) + tr11 * ti2 + tr12 * ti3; + cr3 = cc_ref(i - 1, 1, k) + tr12 * tr2 + tr11 * tr3; + ci3 = cc_ref(i, 1, k) + tr12 * ti2 + tr11 * ti3; + cr5 = ti11 * tr5 + ti12 * tr4; + ci5 = ti11 * ti5 + ti12 * ti4; + cr4 = ti12 * tr5 - ti11 * tr4; + ci4 = ti12 * ti5 - ti11 * ti4; + dr3 = cr3 - ci4; + dr4 = cr3 + ci4; + di3 = ci3 + cr4; + di4 = ci3 - cr4; + dr5 = cr2 + ci5; + dr2 = cr2 - ci5; + di5 = ci2 - cr5; + di2 = ci2 + cr5; + ch_ref(i - 1, k, 2) = wa1[i - 2] * dr2 - wa1[i - 1] * di2; + ch_ref(i, k, 2) = wa1[i - 2] * di2 + wa1[i - 1] * dr2; + ch_ref(i - 1, k, 3) = wa2[i - 2] * dr3 - wa2[i - 1] * di3; + ch_ref(i, k, 3) = wa2[i - 2] * di3 + wa2[i - 1] * dr3; + ch_ref(i - 1, k, 4) = wa3[i - 2] * dr4 - wa3[i - 1] * di4; + ch_ref(i, k, 4) = wa3[i - 2] * di4 + wa3[i - 1] * dr4; + ch_ref(i - 1, k, 5) = wa4[i - 2] * dr5 - wa4[i - 1] * di5; + ch_ref(i, k, 5) = wa4[i - 2] * di5 + wa4[i - 1] * dr5; + } + } +} /* radb5 */ + +#undef ch_ref +#undef cc_ref + + +static void radbg(integer ido, integer ip, integer l1, integer idl1, + const real *cc, real *c1, real *c2, real *ch, real *ch2, const real *wa) +{ + /* System generated locals */ + integer ch_offset, cc_offset, + c1_offset, c2_offset, ch2_offset; + + /* Local variables */ + integer i, j, k, l, j2, ic, jc, lc, ik, is; + real dc2, ai1, ai2, ar1, ar2, ds2; + integer nbd; + real dcp, arg, dsp, ar1h, ar2h; + integer idp2, ipp2, idij, ipph; + + +#define c1_ref(a_1,a_2,a_3) c1[((a_3)*l1 + (a_2))*ido + a_1] +#define c2_ref(a_1,a_2) c2[(a_2)*idl1 + a_1] +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*ip + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + 
(a_2))*ido + a_1] +#define ch2_ref(a_1,a_2) ch2[(a_2)*idl1 + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + ido * (1 + l1); + ch -= ch_offset; + c1_offset = 1 + ido * (1 + l1); + c1 -= c1_offset; + cc_offset = 1 + ido * (1 + ip); + cc -= cc_offset; + ch2_offset = 1 + idl1; + ch2 -= ch2_offset; + c2_offset = 1 + idl1; + c2 -= c2_offset; + --wa; + + /* Function Body */ + arg = (2*M_PI) / (real) (ip); + dcp = FFTPACK_COS(arg); + dsp = FFTPACK_SIN(arg); + idp2 = ido + 2; + nbd = (ido - 1) / 2; + ipp2 = ip + 2; + ipph = (ip + 1) / 2; + if (ido >= l1) { + for (k = 1; k <= l1; ++k) { + for (i = 1; i <= ido; ++i) { + ch_ref(i, k, 1) = cc_ref(i, 1, k); + } + } + } else { + for (i = 1; i <= ido; ++i) { + for (k = 1; k <= l1; ++k) { + ch_ref(i, k, 1) = cc_ref(i, 1, k); + } + } + } + for (j = 2; j <= ipph; ++j) { + jc = ipp2 - j; + j2 = j + j; + for (k = 1; k <= l1; ++k) { + ch_ref(1, k, j) = cc_ref(ido, j2 - 2, k) + cc_ref(ido, j2 - 2, k); + ch_ref(1, k, jc) = cc_ref(1, j2 - 1, k) + cc_ref(1, j2 - 1, k); + } + } + if (ido != 1) { + if (nbd >= l1) { + for (j = 2; j <= ipph; ++j) { + jc = ipp2 - j; + for (k = 1; k <= l1; ++k) { + for (i = 3; i <= ido; i += 2) { + ic = idp2 - i; + ch_ref(i - 1, k, j) = cc_ref(i - 1, (j << 1) - 1, k) + cc_ref(ic - 1, (j << 1) - 2, k); + ch_ref(i - 1, k, jc) = cc_ref(i - 1, (j << 1) - 1, k) - cc_ref(ic - 1, (j << 1) - 2, k); + ch_ref(i, k, j) = cc_ref(i, (j << 1) - 1, k) - cc_ref(ic, (j << 1) - 2, k); + ch_ref(i, k, jc) = cc_ref(i, (j << 1) - 1, k) + cc_ref(ic, (j << 1) - 2, k); + } + } + } + } else { + for (j = 2; j <= ipph; ++j) { + jc = ipp2 - j; + for (i = 3; i <= ido; i += 2) { + ic = idp2 - i; + for (k = 1; k <= l1; ++k) { + ch_ref(i - 1, k, j) = cc_ref(i - 1, (j << 1) - 1, k) + cc_ref(ic - 1, (j << 1) - 2, k); + ch_ref(i - 1, k, jc) = cc_ref(i - 1, (j << 1) - 1, k) - cc_ref(ic - 1, (j << 1) - 2, k); + ch_ref(i, k, j) = cc_ref(i, (j << 1) - 1, k) - cc_ref(ic, (j << 1) - 2, k); + ch_ref(i, k, jc) = cc_ref(i, (j << 1) - 1, k) + cc_ref(ic, (j << 1) - 2, k); + } + } + } + } + } + ar1 = 1.f; + ai1 = 0.f; + for (l = 2; l <= ipph; ++l) { + lc = ipp2 - l; + ar1h = dcp * ar1 - dsp * ai1; + ai1 = dcp * ai1 + dsp * ar1; + ar1 = ar1h; + for (ik = 1; ik <= idl1; ++ik) { + c2_ref(ik, l) = ch2_ref(ik, 1) + ar1 * ch2_ref(ik, 2); + c2_ref(ik, lc) = ai1 * ch2_ref(ik, ip); + } + dc2 = ar1; + ds2 = ai1; + ar2 = ar1; + ai2 = ai1; + for (j = 3; j <= ipph; ++j) { + jc = ipp2 - j; + ar2h = dc2 * ar2 - ds2 * ai2; + ai2 = dc2 * ai2 + ds2 * ar2; + ar2 = ar2h; + for (ik = 1; ik <= idl1; ++ik) { + c2_ref(ik, l) = c2_ref(ik, l) + ar2 * ch2_ref(ik, j); + c2_ref(ik, lc) = c2_ref(ik, lc) + ai2 * ch2_ref(ik, jc); + } + } + } + for (j = 2; j <= ipph; ++j) { + for (ik = 1; ik <= idl1; ++ik) { + ch2_ref(ik, 1) = ch2_ref(ik, 1) + ch2_ref(ik, j); + } + } + for (j = 2; j <= ipph; ++j) { + jc = ipp2 - j; + for (k = 1; k <= l1; ++k) { + ch_ref(1, k, j) = c1_ref(1, k, j) - c1_ref(1, k, jc); + ch_ref(1, k, jc) = c1_ref(1, k, j) + c1_ref(1, k, jc); + } + } + if (ido != 1) { + if (nbd >= l1) { + for (j = 2; j <= ipph; ++j) { + jc = ipp2 - j; + for (k = 1; k <= l1; ++k) { + for (i = 3; i <= ido; i += 2) { + ch_ref(i - 1, k, j) = c1_ref(i - 1, k, j) - c1_ref(i, k, jc); + ch_ref(i - 1, k, jc) = c1_ref(i - 1, k, j) + c1_ref(i, k, jc); + ch_ref(i, k, j) = c1_ref(i, k, j) + c1_ref(i - 1, k, jc); + ch_ref(i, k, jc) = c1_ref(i, k, j) - c1_ref(i - 1, k, jc); + } + } + } + } else { + for (j = 2; j <= ipph; ++j) { + jc = ipp2 - j; + for (i = 3; i <= ido; i += 2) { + for (k = 1; k <= l1; ++k) { + ch_ref(i - 1, k, 
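+                    /* note: the ar1/ai1 recurrence further up (ar1h = dcp*ar1 - dsp*ai1,
+                       with dcp = cos(2*pi/ip) and dsp = sin(2*pi/ip)) generates the
+                       cos/sin coefficients of the length-ip sub-DFT on the fly, so this
+                       generic-radix path needs no trig calls inside its loops. */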
j) = c1_ref(i - 1, k, j) - c1_ref(i, k, jc); + ch_ref(i - 1, k, jc) = c1_ref(i - 1, k, j) + c1_ref(i, k, jc); + ch_ref(i, k, j) = c1_ref(i, k, j) + c1_ref(i - 1, k, jc); + ch_ref(i, k, jc) = c1_ref(i, k, j) - c1_ref(i - 1, k, jc); + } + } + } + } + } + if (ido == 1) { + return; + } + for (ik = 1; ik <= idl1; ++ik) { + c2_ref(ik, 1) = ch2_ref(ik, 1); + } + for (j = 2; j <= ip; ++j) { + for (k = 1; k <= l1; ++k) { + c1_ref(1, k, j) = ch_ref(1, k, j); + } + } + if (nbd <= l1) { + is = -(ido); + for (j = 2; j <= ip; ++j) { + is += ido; + idij = is; + for (i = 3; i <= ido; i += 2) { + idij += 2; + for (k = 1; k <= l1; ++k) { + c1_ref(i - 1, k, j) = wa[idij - 1] * ch_ref(i - 1, k, j) + - wa[idij] * ch_ref(i, k, j); + c1_ref(i, k, j) = wa[idij - 1] * ch_ref(i, k, j) + wa[idij] * ch_ref(i - 1, k, j); + } + } + } + } else { + is = -(ido); + for (j = 2; j <= ip; ++j) { + is += ido; + for (k = 1; k <= l1; ++k) { + idij = is; + for (i = 3; i <= ido; i += 2) { + idij += 2; + c1_ref(i - 1, k, j) = wa[idij - 1] * ch_ref(i - 1, k, j) + - wa[idij] * ch_ref(i, k, j); + c1_ref(i, k, j) = wa[idij - 1] * ch_ref(i, k, j) + wa[idij] * ch_ref(i - 1, k, j); + } + } + } + } +} /* radbg */ + +#undef ch2_ref +#undef ch_ref +#undef cc_ref +#undef c2_ref +#undef c1_ref + + +static void radf2(integer ido, integer l1, const real *cc, real *ch, + const real *wa1) +{ + /* System generated locals */ + integer ch_offset, cc_offset; + + /* Local variables */ + integer i, k, ic; + real ti2, tr2; + integer idp2; + + +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*2 + (a_2))*ido + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + ido * 3; + ch -= ch_offset; + cc_offset = 1 + ido * (1 + l1); + cc -= cc_offset; + --wa1; + + /* Function Body */ + for (k = 1; k <= l1; ++k) { + ch_ref(1, 1, k) = cc_ref(1, k, 1) + cc_ref(1, k, 2); + ch_ref(ido, 2, k) = cc_ref(1, k, 1) - cc_ref(1, k, 2); + } + if (ido < 2) return; + if (ido != 2) { + idp2 = ido + 2; + for (k = 1; k <= l1; ++k) { + for (i = 3; i <= ido; i += 2) { + ic = idp2 - i; + tr2 = wa1[i - 2] * cc_ref(i - 1, k, 2) + wa1[i - 1] * + cc_ref(i, k, 2); + ti2 = wa1[i - 2] * cc_ref(i, k, 2) - wa1[i - 1] * cc_ref( + i - 1, k, 2); + ch_ref(i, 1, k) = cc_ref(i, k, 1) + ti2; + ch_ref(ic, 2, k) = ti2 - cc_ref(i, k, 1); + ch_ref(i - 1, 1, k) = cc_ref(i - 1, k, 1) + tr2; + ch_ref(ic - 1, 2, k) = cc_ref(i - 1, k, 1) - tr2; + } + } + if (ido % 2 == 1) { + return; + } + } + for (k = 1; k <= l1; ++k) { + ch_ref(1, 2, k) = -cc_ref(ido, k, 2); + ch_ref(ido, 1, k) = cc_ref(ido, k, 1); + } +} /* radf2 */ + +#undef ch_ref +#undef cc_ref + + +static void radf3(integer ido, integer l1, const real *cc, real *ch, + const real *wa1, const real *wa2) +{ + static const real taur = -.5f; + static const real taui = .866025403784439f; + + /* System generated locals */ + integer ch_offset, cc_offset; + + /* Local variables */ + integer i, k, ic; + real ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3; + integer idp2; + + +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*3 + (a_2))*ido + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + (ido << 2); + ch -= ch_offset; + cc_offset = 1 + ido * (1 + l1); + cc -= cc_offset; + --wa1; + --wa2; + + /* Function Body */ + for (k = 1; k <= l1; ++k) { + cr2 = cc_ref(1, k, 2) + cc_ref(1, k, 3); + ch_ref(1, 1, k) = cc_ref(1, k, 1) + cr2; + ch_ref(1, 3, k) = taui * (cc_ref(1, k, 3) - cc_ref(1, k, 2)); + ch_ref(ido, 2, k) = cc_ref(1, k, 1) + taur * cr2; + } + if (ido 
== 1) { + return; + } + idp2 = ido + 2; + for (k = 1; k <= l1; ++k) { + for (i = 3; i <= ido; i += 2) { + ic = idp2 - i; + dr2 = wa1[i - 2] * cc_ref(i - 1, k, 2) + wa1[i - 1] * + cc_ref(i, k, 2); + di2 = wa1[i - 2] * cc_ref(i, k, 2) - wa1[i - 1] * cc_ref( + i - 1, k, 2); + dr3 = wa2[i - 2] * cc_ref(i - 1, k, 3) + wa2[i - 1] * + cc_ref(i, k, 3); + di3 = wa2[i - 2] * cc_ref(i, k, 3) - wa2[i - 1] * cc_ref( + i - 1, k, 3); + cr2 = dr2 + dr3; + ci2 = di2 + di3; + ch_ref(i - 1, 1, k) = cc_ref(i - 1, k, 1) + cr2; + ch_ref(i, 1, k) = cc_ref(i, k, 1) + ci2; + tr2 = cc_ref(i - 1, k, 1) + taur * cr2; + ti2 = cc_ref(i, k, 1) + taur * ci2; + tr3 = taui * (di2 - di3); + ti3 = taui * (dr3 - dr2); + ch_ref(i - 1, 3, k) = tr2 + tr3; + ch_ref(ic - 1, 2, k) = tr2 - tr3; + ch_ref(i, 3, k) = ti2 + ti3; + ch_ref(ic, 2, k) = ti3 - ti2; + } + } +} /* radf3 */ + +#undef ch_ref +#undef cc_ref + + +static void radf4(integer ido, integer l1, const real *cc, real *ch, + const real *wa1, const real *wa2, const real *wa3) +{ + /* Initialized data */ + + static const real hsqt2 = .7071067811865475f; + + /* System generated locals */ + integer cc_offset, ch_offset; + + /* Local variables */ + integer i, k, ic; + real ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4; + integer idp2; + + +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*4 + (a_2))*ido + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + ido * 5; + ch -= ch_offset; + cc_offset = 1 + ido * (1 + l1); + cc -= cc_offset; + --wa1; + --wa2; + --wa3; + + /* Function Body */ + for (k = 1; k <= l1; ++k) { + tr1 = cc_ref(1, k, 2) + cc_ref(1, k, 4); + tr2 = cc_ref(1, k, 1) + cc_ref(1, k, 3); + ch_ref(1, 1, k) = tr1 + tr2; + ch_ref(ido, 4, k) = tr2 - tr1; + ch_ref(ido, 2, k) = cc_ref(1, k, 1) - cc_ref(1, k, 3); + ch_ref(1, 3, k) = cc_ref(1, k, 4) - cc_ref(1, k, 2); + } + if (ido < 2) return; + if (ido != 2) { + idp2 = ido + 2; + for (k = 1; k <= l1; ++k) { + for (i = 3; i <= ido; i += 2) { + ic = idp2 - i; + cr2 = wa1[i - 2] * cc_ref(i - 1, k, 2) + wa1[i - 1] * + cc_ref(i, k, 2); + ci2 = wa1[i - 2] * cc_ref(i, k, 2) - wa1[i - 1] * cc_ref( + i - 1, k, 2); + cr3 = wa2[i - 2] * cc_ref(i - 1, k, 3) + wa2[i - 1] * + cc_ref(i, k, 3); + ci3 = wa2[i - 2] * cc_ref(i, k, 3) - wa2[i - 1] * cc_ref( + i - 1, k, 3); + cr4 = wa3[i - 2] * cc_ref(i - 1, k, 4) + wa3[i - 1] * + cc_ref(i, k, 4); + ci4 = wa3[i - 2] * cc_ref(i, k, 4) - wa3[i - 1] * cc_ref( + i - 1, k, 4); + tr1 = cr2 + cr4; + tr4 = cr4 - cr2; + ti1 = ci2 + ci4; + ti4 = ci2 - ci4; + ti2 = cc_ref(i, k, 1) + ci3; + ti3 = cc_ref(i, k, 1) - ci3; + tr2 = cc_ref(i - 1, k, 1) + cr3; + tr3 = cc_ref(i - 1, k, 1) - cr3; + ch_ref(i - 1, 1, k) = tr1 + tr2; + ch_ref(ic - 1, 4, k) = tr2 - tr1; + ch_ref(i, 1, k) = ti1 + ti2; + ch_ref(ic, 4, k) = ti1 - ti2; + ch_ref(i - 1, 3, k) = ti4 + tr3; + ch_ref(ic - 1, 2, k) = tr3 - ti4; + ch_ref(i, 3, k) = tr4 + ti3; + ch_ref(ic, 2, k) = tr4 - ti3; + } + } + if (ido % 2 == 1) { + return; + } + } + for (k = 1; k <= l1; ++k) { + ti1 = -hsqt2 * (cc_ref(ido, k, 2) + cc_ref(ido, k, 4)); + tr1 = hsqt2 * (cc_ref(ido, k, 2) - cc_ref(ido, k, 4)); + ch_ref(ido, 1, k) = tr1 + cc_ref(ido, k, 1); + ch_ref(ido, 3, k) = cc_ref(ido, k, 1) - tr1; + ch_ref(1, 2, k) = ti1 - cc_ref(ido, k, 3); + ch_ref(1, 4, k) = ti1 + cc_ref(ido, k, 3); + } +} /* radf4 */ + +#undef ch_ref +#undef cc_ref + + +static void radf5(integer ido, integer l1, const real *cc, real *ch, + const real *wa1, const real *wa2, const real *wa3, const real *wa4) +{ + /* 
Initialized data */ + + static const real tr11 = .309016994374947f; + static const real ti11 = .951056516295154f; + static const real tr12 = -.809016994374947f; + static const real ti12 = .587785252292473f; + + /* System generated locals */ + integer cc_offset, ch_offset; + + /* Local variables */ + integer i, k, ic; + real ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5, + cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5; + integer idp2; + + +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + ido * 6; + ch -= ch_offset; + cc_offset = 1 + ido * (1 + l1); + cc -= cc_offset; + --wa1; + --wa2; + --wa3; + --wa4; + + /* Function Body */ + for (k = 1; k <= l1; ++k) { + cr2 = cc_ref(1, k, 5) + cc_ref(1, k, 2); + ci5 = cc_ref(1, k, 5) - cc_ref(1, k, 2); + cr3 = cc_ref(1, k, 4) + cc_ref(1, k, 3); + ci4 = cc_ref(1, k, 4) - cc_ref(1, k, 3); + ch_ref(1, 1, k) = cc_ref(1, k, 1) + cr2 + cr3; + ch_ref(ido, 2, k) = cc_ref(1, k, 1) + tr11 * cr2 + tr12 * cr3; + ch_ref(1, 3, k) = ti11 * ci5 + ti12 * ci4; + ch_ref(ido, 4, k) = cc_ref(1, k, 1) + tr12 * cr2 + tr11 * cr3; + ch_ref(1, 5, k) = ti12 * ci5 - ti11 * ci4; + } + if (ido == 1) { + return; + } + idp2 = ido + 2; + for (k = 1; k <= l1; ++k) { + for (i = 3; i <= ido; i += 2) { + ic = idp2 - i; + dr2 = wa1[i - 2] * cc_ref(i - 1, k, 2) + wa1[i - 1] * cc_ref(i, k, 2); + di2 = wa1[i - 2] * cc_ref(i, k, 2) - wa1[i - 1] * cc_ref(i - 1, k, 2); + dr3 = wa2[i - 2] * cc_ref(i - 1, k, 3) + wa2[i - 1] * cc_ref(i, k, 3); + di3 = wa2[i - 2] * cc_ref(i, k, 3) - wa2[i - 1] * cc_ref(i - 1, k, 3); + dr4 = wa3[i - 2] * cc_ref(i - 1, k, 4) + wa3[i - 1] * cc_ref(i, k, 4); + di4 = wa3[i - 2] * cc_ref(i, k, 4) - wa3[i - 1] * cc_ref(i - 1, k, 4); + dr5 = wa4[i - 2] * cc_ref(i - 1, k, 5) + wa4[i - 1] * cc_ref(i, k, 5); + di5 = wa4[i - 2] * cc_ref(i, k, 5) - wa4[i - 1] * cc_ref(i - 1, k, 5); + cr2 = dr2 + dr5; + ci5 = dr5 - dr2; + cr5 = di2 - di5; + ci2 = di2 + di5; + cr3 = dr3 + dr4; + ci4 = dr4 - dr3; + cr4 = di3 - di4; + ci3 = di3 + di4; + ch_ref(i - 1, 1, k) = cc_ref(i - 1, k, 1) + cr2 + cr3; + ch_ref(i, 1, k) = cc_ref(i, k, 1) + ci2 + ci3; + tr2 = cc_ref(i - 1, k, 1) + tr11 * cr2 + tr12 * cr3; + ti2 = cc_ref(i, k, 1) + tr11 * ci2 + tr12 * ci3; + tr3 = cc_ref(i - 1, k, 1) + tr12 * cr2 + tr11 * cr3; + ti3 = cc_ref(i, k, 1) + tr12 * ci2 + tr11 * ci3; + tr5 = ti11 * cr5 + ti12 * cr4; + ti5 = ti11 * ci5 + ti12 * ci4; + tr4 = ti12 * cr5 - ti11 * cr4; + ti4 = ti12 * ci5 - ti11 * ci4; + ch_ref(i - 1, 3, k) = tr2 + tr5; + ch_ref(ic - 1, 2, k) = tr2 - tr5; + ch_ref(i, 3, k) = ti2 + ti5; + ch_ref(ic, 2, k) = ti5 - ti2; + ch_ref(i - 1, 5, k) = tr3 + tr4; + ch_ref(ic - 1, 4, k) = tr3 - tr4; + ch_ref(i, 5, k) = ti3 + ti4; + ch_ref(ic, 4, k) = ti4 - ti3; + } + } +} /* radf5 */ + +#undef ch_ref +#undef cc_ref + + +static void radfg(integer ido, integer ip, integer l1, integer idl1, + real *cc, real *c1, real *c2, real *ch, real *ch2, const real *wa) +{ + /* System generated locals */ + integer ch_offset, cc_offset, + c1_offset, c2_offset, ch2_offset; + + /* Local variables */ + integer i, j, k, l, j2, ic, jc, lc, ik, is; + real dc2, ai1, ai2, ar1, ar2, ds2; + integer nbd; + real dcp, arg, dsp, ar1h, ar2h; + integer idp2, ipp2, idij, ipph; + + +#define c1_ref(a_1,a_2,a_3) c1[((a_3)*l1 + (a_2))*ido + a_1] +#define c2_ref(a_1,a_2) c2[(a_2)*idl1 + a_1] +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*ip + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + 
(a_2))*ido + a_1] +#define ch2_ref(a_1,a_2) ch2[(a_2)*idl1 + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + ido * (1 + l1); + ch -= ch_offset; + c1_offset = 1 + ido * (1 + l1); + c1 -= c1_offset; + cc_offset = 1 + ido * (1 + ip); + cc -= cc_offset; + ch2_offset = 1 + idl1; + ch2 -= ch2_offset; + c2_offset = 1 + idl1; + c2 -= c2_offset; + --wa; + + /* Function Body */ + arg = (2*M_PI) / (real) (ip); + dcp = FFTPACK_COS(arg); + dsp = FFTPACK_SIN(arg); + ipph = (ip + 1) / 2; + ipp2 = ip + 2; + idp2 = ido + 2; + nbd = (ido - 1) / 2; + if (ido == 1) { + for (ik = 1; ik <= idl1; ++ik) { + c2_ref(ik, 1) = ch2_ref(ik, 1); + } + } else { + for (ik = 1; ik <= idl1; ++ik) { + ch2_ref(ik, 1) = c2_ref(ik, 1); + } + for (j = 2; j <= ip; ++j) { + for (k = 1; k <= l1; ++k) { + ch_ref(1, k, j) = c1_ref(1, k, j); + } + } + if (nbd <= l1) { + is = -(ido); + for (j = 2; j <= ip; ++j) { + is += ido; + idij = is; + for (i = 3; i <= ido; i += 2) { + idij += 2; + for (k = 1; k <= l1; ++k) { + ch_ref(i - 1, k, j) = wa[idij - 1] * c1_ref(i - 1, k, j) + + wa[idij] * c1_ref(i, k, j); + ch_ref(i, k, j) = wa[idij - 1] * c1_ref(i, k, j) - wa[ + idij] * c1_ref(i - 1, k, j); + } + } + } + } else { + is = -(ido); + for (j = 2; j <= ip; ++j) { + is += ido; + for (k = 1; k <= l1; ++k) { + idij = is; + for (i = 3; i <= ido; i += 2) { + idij += 2; + ch_ref(i - 1, k, j) = wa[idij - 1] * c1_ref(i - 1, k, j) + + wa[idij] * c1_ref(i, k, j); + ch_ref(i, k, j) = wa[idij - 1] * c1_ref(i, k, j) - wa[ + idij] * c1_ref(i - 1, k, j); + } + } + } + } + if (nbd >= l1) { + for (j = 2; j <= ipph; ++j) { + jc = ipp2 - j; + for (k = 1; k <= l1; ++k) { + for (i = 3; i <= ido; i += 2) { + c1_ref(i - 1, k, j) = ch_ref(i - 1, k, j) + ch_ref(i - + 1, k, jc); + c1_ref(i - 1, k, jc) = ch_ref(i, k, j) - ch_ref(i, k, + jc); + c1_ref(i, k, j) = ch_ref(i, k, j) + ch_ref(i, k, jc); + c1_ref(i, k, jc) = ch_ref(i - 1, k, jc) - ch_ref(i - 1, + k, j); + } + } + } + } else { + for (j = 2; j <= ipph; ++j) { + jc = ipp2 - j; + for (i = 3; i <= ido; i += 2) { + for (k = 1; k <= l1; ++k) { + c1_ref(i - 1, k, j) = ch_ref(i - 1, k, j) + ch_ref(i - + 1, k, jc); + c1_ref(i - 1, k, jc) = ch_ref(i, k, j) - ch_ref(i, k, + jc); + c1_ref(i, k, j) = ch_ref(i, k, j) + ch_ref(i, k, jc); + c1_ref(i, k, jc) = ch_ref(i - 1, k, jc) - ch_ref(i - 1, + k, j); + } + } + } + } + } + for (j = 2; j <= ipph; ++j) { + jc = ipp2 - j; + for (k = 1; k <= l1; ++k) { + c1_ref(1, k, j) = ch_ref(1, k, j) + ch_ref(1, k, jc); + c1_ref(1, k, jc) = ch_ref(1, k, jc) - ch_ref(1, k, j); + } + } + + ar1 = 1.f; + ai1 = 0.f; + for (l = 2; l <= ipph; ++l) { + lc = ipp2 - l; + ar1h = dcp * ar1 - dsp * ai1; + ai1 = dcp * ai1 + dsp * ar1; + ar1 = ar1h; + for (ik = 1; ik <= idl1; ++ik) { + ch2_ref(ik, l) = c2_ref(ik, 1) + ar1 * c2_ref(ik, 2); + ch2_ref(ik, lc) = ai1 * c2_ref(ik, ip); + } + dc2 = ar1; + ds2 = ai1; + ar2 = ar1; + ai2 = ai1; + for (j = 3; j <= ipph; ++j) { + jc = ipp2 - j; + ar2h = dc2 * ar2 - ds2 * ai2; + ai2 = dc2 * ai2 + ds2 * ar2; + ar2 = ar2h; + for (ik = 1; ik <= idl1; ++ik) { + ch2_ref(ik, l) = ch2_ref(ik, l) + ar2 * c2_ref(ik, j); + ch2_ref(ik, lc) = ch2_ref(ik, lc) + ai2 * c2_ref(ik, jc); + } + } + } + for (j = 2; j <= ipph; ++j) { + for (ik = 1; ik <= idl1; ++ik) { + ch2_ref(ik, 1) = ch2_ref(ik, 1) + c2_ref(ik, j); + } + } + + if (ido >= l1) { + for (k = 1; k <= l1; ++k) { + for (i = 1; i <= ido; ++i) { + cc_ref(i, 1, k) = ch_ref(i, k, 1); + } + } + } else { + for (i = 1; i <= ido; ++i) { + for (k = 1; k <= l1; ++k) { + cc_ref(i, 1, k) = ch_ref(i, k, 1); + } + } + } + for (j = 
2; j <= ipph; ++j) { + jc = ipp2 - j; + j2 = j + j; + for (k = 1; k <= l1; ++k) { + cc_ref(ido, j2 - 2, k) = ch_ref(1, k, j); + cc_ref(1, j2 - 1, k) = ch_ref(1, k, jc); + } + } + if (ido == 1) { + return; + } + if (nbd >= l1) { + for (j = 2; j <= ipph; ++j) { + jc = ipp2 - j; + j2 = j + j; + for (k = 1; k <= l1; ++k) { + for (i = 3; i <= ido; i += 2) { + ic = idp2 - i; + cc_ref(i - 1, j2 - 1, k) = ch_ref(i - 1, k, j) + ch_ref( + i - 1, k, jc); + cc_ref(ic - 1, j2 - 2, k) = ch_ref(i - 1, k, j) - ch_ref( + i - 1, k, jc); + cc_ref(i, j2 - 1, k) = ch_ref(i, k, j) + ch_ref(i, k, + jc); + cc_ref(ic, j2 - 2, k) = ch_ref(i, k, jc) - ch_ref(i, k, j) + ; + } + } + } + } else { + for (j = 2; j <= ipph; ++j) { + jc = ipp2 - j; + j2 = j + j; + for (i = 3; i <= ido; i += 2) { + ic = idp2 - i; + for (k = 1; k <= l1; ++k) { + cc_ref(i - 1, j2 - 1, k) = ch_ref(i - 1, k, j) + ch_ref( + i - 1, k, jc); + cc_ref(ic - 1, j2 - 2, k) = ch_ref(i - 1, k, j) - ch_ref( + i - 1, k, jc); + cc_ref(i, j2 - 1, k) = ch_ref(i, k, j) + ch_ref(i, k, + jc); + cc_ref(ic, j2 - 2, k) = ch_ref(i, k, jc) - ch_ref(i, k, j) + ; + } + } + } + } +} /* radfg */ + +#undef ch2_ref +#undef ch_ref +#undef cc_ref +#undef c2_ref +#undef c1_ref + + +static void cfftb1(integer n, real *c, real *ch, const real *wa, integer *ifac) +{ + integer i, k1, l1, l2, na, nf, ip, iw, ix2, ix3, ix4, nac, ido, + idl1, idot; + + /* Function Body */ + nf = ifac[1]; + na = 0; + l1 = 1; + iw = 0; + for (k1 = 1; k1 <= nf; ++k1) { + ip = ifac[k1 + 1]; + l2 = ip * l1; + ido = n / l2; + idot = ido + ido; + idl1 = idot * l1; + switch (ip) { + case 4: + ix2 = iw + idot; + ix3 = ix2 + idot; + passb4(idot, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2], &wa[ix3]); + na = 1 - na; + break; + case 2: + passb2(idot, l1, na?ch:c, na?c:ch, &wa[iw]); + na = 1 - na; + break; + case 3: + ix2 = iw + idot; + passb3(idot, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2]); + na = 1 - na; + break; + case 5: + ix2 = iw + idot; + ix3 = ix2 + idot; + ix4 = ix3 + idot; + passfb5(idot, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], +1); + na = 1 - na; + break; + default: + if (na == 0) { + passfb(&nac, idot, ip, l1, idl1, c, c, c, ch, ch, &wa[iw], +1); + } else { + passfb(&nac, idot, ip, l1, idl1, ch, ch, ch, c, c, &wa[iw], +1); + } + if (nac != 0) { + na = 1 - na; + } + break; + } + l1 = l2; + iw += (ip - 1) * idot; + } + if (na == 0) { + return; + } + for (i = 0; i < 2*n; ++i) { + c[i] = ch[i]; + } +} /* cfftb1 */ + +void cfftb(integer n, real *c, real *wsave) +{ + integer iw1, iw2; + + /* Parameter adjustments */ + --wsave; + --c; + + /* Function Body */ + if (n == 1) { + return; + } + iw1 = 2*n + 1; + iw2 = iw1 + 2*n; + cfftb1(n, &c[1], &wsave[1], &wsave[iw1], (int*)&wsave[iw2]); +} /* cfftb */ + +static void cfftf1(integer n, real *c, real *ch, const real *wa, integer *ifac) +{ + /* Local variables */ + integer i, k1, l1, l2, na, nf, ip, iw, ix2, ix3, ix4, nac, ido, + idl1, idot; + + /* Function Body */ + nf = ifac[1]; + na = 0; + l1 = 1; + iw = 0; + for (k1 = 1; k1 <= nf; ++k1) { + ip = ifac[k1 + 1]; + l2 = ip * l1; + ido = n / l2; + idot = ido + ido; + idl1 = idot * l1; + switch (ip) { + case 4: + ix2 = iw + idot; + ix3 = ix2 + idot; + passf4(idot, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2], &wa[ix3]); + na = 1 - na; + break; + case 2: + passf2(idot, l1, na?ch:c, na?c:ch, &wa[iw]); + na = 1 - na; + break; + case 3: + ix2 = iw + idot; + passf3(idot, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2]); + na = 1 - na; + break; + case 5: + ix2 = iw + idot; + ix3 = ix2 + idot; + ix4 = ix3 + idot; + 
passfb5(idot, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], -1); + na = 1 - na; + break; + default: + if (na == 0) { + passfb(&nac, idot, ip, l1, idl1, c, c, c, ch, ch, &wa[iw], -1); + } else { + passfb(&nac, idot, ip, l1, idl1, ch, ch, ch, c, c, &wa[iw], -1); + } + if (nac != 0) { + na = 1 - na; + } + break; + } + l1 = l2; + iw += (ip - 1)*idot; + } + if (na == 0) { + return; + } + for (i = 0; i < 2*n; ++i) { + c[i] = ch[i]; + } +} /* cfftf1 */ + +void cfftf(integer n, real *c, real *wsave) +{ + integer iw1, iw2; + + /* Parameter adjustments */ + --wsave; + --c; + + /* Function Body */ + if (n == 1) { + return; + } + iw1 = 2*n + 1; + iw2 = iw1 + 2*n; + cfftf1(n, &c[1], &wsave[1], &wsave[iw1], (int*)&wsave[iw2]); +} /* cfftf */ + +static int decompose(integer n, integer *ifac, integer ntryh[4]) { + integer ntry=0, nl = n, nf = 0, nq, nr, i, j = 0; + do { + if (j < 4) { + ntry = ntryh[j]; + } else { + ntry += 2; + } + ++j; + L104: + nq = nl / ntry; + nr = nl - ntry * nq; + if (nr != 0) continue; + ++nf; + ifac[nf + 2] = ntry; + nl = nq; + if (ntry == 2 && nf != 1) { + for (i = 2; i <= nf; ++i) { + integer ib = nf - i + 2; + ifac[ib + 2] = ifac[ib + 1]; + } + ifac[3] = 2; + } + if (nl != 1) { + goto L104; + } + } while (nl != 1); + ifac[1] = n; + ifac[2] = nf; + return nf; +} + +static void cffti1(integer n, real *wa, integer *ifac) +{ + static integer ntryh[4] = { 3,4,2,5 }; + + /* Local variables */ + integer i, j, i1, k1, l1, l2; + real fi; + integer ld, ii, nf, ip; + real arg; + integer ido, ipm; + real argh; + integer idot; + real argld; + + /* Parameter adjustments */ + --ifac; + --wa; + + nf = decompose(n, ifac, ntryh); + + argh = (2*M_PI) / (real) (n); + i = 2; + l1 = 1; + for (k1 = 1; k1 <= nf; ++k1) { + ip = ifac[k1 + 2]; + ld = 0; + l2 = l1 * ip; + ido = n / l2; + idot = ido + ido + 2; + ipm = ip - 1; + for (j = 1; j <= ipm; ++j) { + i1 = i; + wa[i - 1] = 1.f; + wa[i] = 0.f; + ld += l1; + fi = 0.f; + argld = (real) ld * argh; + for (ii = 4; ii <= idot; ii += 2) { + i += 2; + fi += 1.f; + arg = fi * argld; + wa[i - 1] = FFTPACK_COS(arg); + wa[i] = FFTPACK_SIN(arg); + } + if (ip > 5) { + wa[i1 - 1] = wa[i - 1]; + wa[i1] = wa[i]; + }; + } + l1 = l2; + } +} /* cffti1 */ + +void cffti(integer n, real *wsave) +{ + integer iw1, iw2; + /* Parameter adjustments */ + --wsave; + + /* Function Body */ + if (n == 1) { + return; + } + iw1 = 2*n + 1; + iw2 = iw1 + 2*n; + cffti1(n, &wsave[iw1], (int*)&wsave[iw2]); + return; +} /* cffti */ + +static void rfftb1(integer n, real *c, real *ch, const real *wa, integer *ifac) +{ + /* Local variables */ + integer i, k1, l1, l2, na, nf, ip, iw, ix2, ix3, ix4, ido, idl1; + + /* Function Body */ + nf = ifac[1]; + na = 0; + l1 = 1; + iw = 0; + for (k1 = 1; k1 <= nf; ++k1) { + ip = ifac[k1 + 1]; + l2 = ip * l1; + ido = n / l2; + idl1 = ido * l1; + switch (ip) { + case 4: + ix2 = iw + ido; + ix3 = ix2 + ido; + radb4(ido, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2], &wa[ix3]); + na = 1 - na; + break; + case 2: + radb2(ido, l1, na?ch:c, na?c:ch, &wa[iw]); + na = 1 - na; + break; + case 3: + ix2 = iw + ido; + radb3(ido, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2]); + na = 1 - na; + break; + case 5: + ix2 = iw + ido; + ix3 = ix2 + ido; + ix4 = ix3 + ido; + radb5(ido, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]); + na = 1 - na; + break; + default: + if (na == 0) { + radbg(ido, ip, l1, idl1, c, c, c, ch, ch, &wa[iw]); + } else { + radbg(ido, ip, l1, idl1, ch, ch, ch, c, c, &wa[iw]); + } + if (ido == 1) { + na = 1 - na; + } + break; + } + l1 = 
l2; + iw += (ip - 1) * ido; + } + if (na == 0) { + return; + } + for (i = 0; i < n; ++i) { + c[i] = ch[i]; + } +} /* rfftb1 */ + +static void rfftf1(integer n, real *c, real *ch, const real *wa, integer *ifac) +{ + /* Local variables */ + integer i, k1, l1, l2, na, kh, nf, ip, iw, ix2, ix3, ix4, ido, idl1; + + /* Function Body */ + nf = ifac[1]; + na = 1; + l2 = n; + iw = n-1; + for (k1 = 1; k1 <= nf; ++k1) { + kh = nf - k1; + ip = ifac[kh + 2]; + l1 = l2 / ip; + ido = n / l2; + idl1 = ido * l1; + iw -= (ip - 1) * ido; + na = 1 - na; + switch (ip) { + case 4: + ix2 = iw + ido; + ix3 = ix2 + ido; + radf4(ido, l1, na ? ch : c, na ? c : ch, &wa[iw], &wa[ix2], &wa[ix3]); + break; + case 2: + radf2(ido, l1, na ? ch : c, na ? c : ch, &wa[iw]); + break; + case 3: + ix2 = iw + ido; + radf3(ido, l1, na ? ch : c, na ? c : ch, &wa[iw], &wa[ix2]); + break; + case 5: + ix2 = iw + ido; + ix3 = ix2 + ido; + ix4 = ix3 + ido; + radf5(ido, l1, na ? ch : c, na ? c : ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]); + break; + default: + if (ido == 1) { + na = 1 - na; + } + if (na == 0) { + radfg(ido, ip, l1, idl1, c, c, c, ch, ch, &wa[iw]); + na = 1; + } else { + radfg(ido, ip, l1, idl1, ch, ch, ch, c, c, &wa[iw]); + na = 0; + } + break; + } + l2 = l1; + } + if (na == 1) { + return; + } + for (i = 0; i < n; ++i) { + c[i] = ch[i]; + } +} + +void rfftb(integer n, real *r, real *wsave) +{ + + /* Parameter adjustments */ + --wsave; + --r; + + /* Function Body */ + if (n == 1) { + return; + } + rfftb1(n, &r[1], &wsave[1], &wsave[n + 1], (int*)&wsave[(n << 1) + 1]); +} /* rfftb */ + +static void rffti1(integer n, real *wa, integer *ifac) +{ + static integer ntryh[4] = { 4,2,3,5 }; + + /* Local variables */ + integer i, j, k1, l1, l2; + real fi; + integer ld, ii, nf, ip, is; + real arg; + integer ido, ipm; + integer nfm1; + real argh; + real argld; + + /* Parameter adjustments */ + --ifac; + --wa; + + nf = decompose(n, ifac, ntryh); + + argh = (2*M_PI) / (real) (n); + is = 0; + nfm1 = nf - 1; + l1 = 1; + if (nfm1 == 0) { + return; + } + for (k1 = 1; k1 <= nfm1; ++k1) { + ip = ifac[k1 + 2]; + ld = 0; + l2 = l1 * ip; + ido = n / l2; + ipm = ip - 1; + for (j = 1; j <= ipm; ++j) { + ld += l1; + i = is; + argld = (real) ld * argh; + fi = 0.f; + for (ii = 3; ii <= ido; ii += 2) { + i += 2; + fi += 1.f; + arg = fi * argld; + wa[i - 1] = FFTPACK_COS(arg); + wa[i] = FFTPACK_SIN(arg); + } + is += ido; + } + l1 = l2; + } +} /* rffti1 */ + +void rfftf(integer n, real *r, real *wsave) +{ + + /* Parameter adjustments */ + --wsave; + --r; + + /* Function Body */ + if (n == 1) { + return; + } + rfftf1(n, &r[1], &wsave[1], &wsave[n + 1], (int*)&wsave[(n << 1) + 1]); +} /* rfftf */ + +void rffti(integer n, real *wsave) +{ + /* Parameter adjustments */ + --wsave; + + /* Function Body */ + if (n == 1) { + return; + } + rffti1(n, &wsave[n + 1], (int*)&wsave[(n << 1) + 1]); + return; +} /* rffti */ + +static void cosqb1(integer n, real *x, real *w, real *xh) +{ + /* Local variables */ + integer i, k, kc, np2, ns2; + real xim1; + integer modn; + + /* Parameter adjustments */ + --xh; + --w; + --x; + + /* Function Body */ + ns2 = (n + 1) / 2; + np2 = n + 2; + for (i = 3; i <= n; i += 2) { + xim1 = x[i - 1] + x[i]; + x[i] -= x[i - 1]; + x[i - 1] = xim1; + } + x[1] += x[1]; + modn = n % 2; + if (modn == 0) { + x[n] += x[n]; + } + rfftb(n, &x[1], &xh[1]); + for (k = 2; k <= ns2; ++k) { + kc = np2 - k; + xh[k] = w[k - 1] * x[kc] + w[kc - 1] * x[k]; + xh[kc] = w[k - 1] * x[k] - w[kc - 1] * x[kc]; + } + if (modn == 0) { + x[ns2 + 1] = w[ns2] * (x[ns2 
+ 1] + x[ns2 + 1]); + } + for (k = 2; k <= ns2; ++k) { + kc = np2 - k; + x[k] = xh[k] + xh[kc]; + x[kc] = xh[k] - xh[kc]; + } + x[1] += x[1]; +} /* cosqb1 */ + +void cosqb(integer n, real *x, real *wsave) +{ + static const real tsqrt2 = 2.82842712474619f; + + /* Local variables */ + real x1; + + /* Parameter adjustments */ + --wsave; + --x; + + if (n < 2) { + x[1] *= 4.f; + } else if (n == 2) { + x1 = (x[1] + x[2]) * 4.f; + x[2] = tsqrt2 * (x[1] - x[2]); + x[1] = x1; + } else { + cosqb1(n, &x[1], &wsave[1], &wsave[n + 1]); + } +} /* cosqb */ + +static void cosqf1(integer n, real *x, real *w, real *xh) +{ + /* Local variables */ + integer i, k, kc, np2, ns2; + real xim1; + integer modn; + + /* Parameter adjustments */ + --xh; + --w; + --x; + + /* Function Body */ + ns2 = (n + 1) / 2; + np2 = n + 2; + for (k = 2; k <= ns2; ++k) { + kc = np2 - k; + xh[k] = x[k] + x[kc]; + xh[kc] = x[k] - x[kc]; + } + modn = n % 2; + if (modn == 0) { + xh[ns2 + 1] = x[ns2 + 1] + x[ns2 + 1]; + } + for (k = 2; k <= ns2; ++k) { + kc = np2 - k; + x[k] = w[k - 1] * xh[kc] + w[kc - 1] * xh[k]; + x[kc] = w[k - 1] * xh[k] - w[kc - 1] * xh[kc]; + } + if (modn == 0) { + x[ns2 + 1] = w[ns2] * xh[ns2 + 1]; + } + rfftf(n, &x[1], &xh[1]); + for (i = 3; i <= n; i += 2) { + xim1 = x[i - 1] - x[i]; + x[i] = x[i - 1] + x[i]; + x[i - 1] = xim1; + } +} /* cosqf1 */ + +void cosqf(integer n, real *x, real *wsave) +{ + static const real sqrt2 = 1.4142135623731f; + + /* Local variables */ + real tsqx; + + /* Parameter adjustments */ + --wsave; + --x; + + if (n == 2) { + tsqx = sqrt2 * x[2]; + x[2] = x[1] - tsqx; + x[1] += tsqx; + } else if (n > 2) { + cosqf1(n, &x[1], &wsave[1], &wsave[n + 1]); + } +} /* cosqf */ + +void cosqi(integer n, real *wsave) +{ + /* Local variables */ + integer k; + real fk, dt; + + /* Parameter adjustments */ + --wsave; + + dt = M_PI/2 / (real) (n); + fk = 0.f; + for (k = 1; k <= n; ++k) { + fk += 1.f; + wsave[k] = FFTPACK_COS(fk * dt); + } + rffti(n, &wsave[n + 1]); +} /* cosqi */ + +void cost(integer n, real *x, real *wsave) +{ + /* Local variables */ + integer i, k; + real c1, t1, t2; + integer kc; + real xi; + integer nm1, np1; + real x1h; + integer ns2; + real tx2, x1p3, xim2; + integer modn; + + /* Parameter adjustments */ + --wsave; + --x; + + /* Function Body */ + nm1 = n - 1; + np1 = n + 1; + ns2 = n / 2; + if (n == 2) { + x1h = x[1] + x[2]; + x[2] = x[1] - x[2]; + x[1] = x1h; + } else if (n == 3) { + x1p3 = x[1] + x[3]; + tx2 = x[2] + x[2]; + x[2] = x[1] - x[3]; + x[1] = x1p3 + tx2; + x[3] = x1p3 - tx2; + } else if (n > 3) { + c1 = x[1] - x[n]; + x[1] += x[n]; + for (k = 2; k <= ns2; ++k) { + kc = np1 - k; + t1 = x[k] + x[kc]; + t2 = x[k] - x[kc]; + c1 += wsave[kc] * t2; + t2 = wsave[k] * t2; + x[k] = t1 - t2; + x[kc] = t1 + t2; + } + modn = n % 2; + if (modn != 0) { + x[ns2 + 1] += x[ns2 + 1]; + } + rfftf(nm1, &x[1], &wsave[n + 1]); + xim2 = x[2]; + x[2] = c1; + for (i = 4; i <= n; i += 2) { + xi = x[i]; + x[i] = x[i - 2] - x[i - 1]; + x[i - 1] = xim2; + xim2 = xi; + } + if (modn != 0) { + x[n] = xim2; + } + } +} /* cost */ + +void costi(integer n, real *wsave) +{ + /* Initialized data */ + + /* Local variables */ + integer k, kc; + real fk, dt; + integer nm1, np1, ns2; + + /* Parameter adjustments */ + --wsave; + + /* Function Body */ + if (n <= 3) { + return; + } + nm1 = n - 1; + np1 = n + 1; + ns2 = n / 2; + dt = M_PI / (real) nm1; + fk = 0.f; + for (k = 2; k <= ns2; ++k) { + kc = np1 - k; + fk += 1.f; + wsave[k] = FFTPACK_SIN(fk * dt) * 2.f; + wsave[kc] = FFTPACK_COS(fk * dt) * 2.f; + } + 
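    /* note (descriptive comment, not in the upstream source): the tail of wsave,
       starting at index n+1, is initialized below as an rfft work area of length
       n-1, which cost() uses via rfftf(nm1, ...) */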
rffti(nm1, &wsave[n + 1]); +} /* costi */ + +void sinqb(integer n, real *x, real *wsave) +{ + /* Local variables */ + integer k, kc, ns2; + real xhold; + + /* Parameter adjustments */ + --wsave; + --x; + + /* Function Body */ + if (n <= 1) { + x[1] *= 4.f; + return; + } + ns2 = n / 2; + for (k = 2; k <= n; k += 2) { + x[k] = -x[k]; + } + cosqb(n, &x[1], &wsave[1]); + for (k = 1; k <= ns2; ++k) { + kc = n - k; + xhold = x[k]; + x[k] = x[kc + 1]; + x[kc + 1] = xhold; + } +} /* sinqb */ + +void sinqf(integer n, real *x, real *wsave) +{ + /* Local variables */ + integer k, kc, ns2; + real xhold; + + /* Parameter adjustments */ + --wsave; + --x; + + /* Function Body */ + if (n == 1) { + return; + } + ns2 = n / 2; + for (k = 1; k <= ns2; ++k) { + kc = n - k; + xhold = x[k]; + x[k] = x[kc + 1]; + x[kc + 1] = xhold; + } + cosqf(n, &x[1], &wsave[1]); + for (k = 2; k <= n; k += 2) { + x[k] = -x[k]; + } +} /* sinqf */ + +void sinqi(integer n, real *wsave) +{ + + /* Parameter adjustments */ + --wsave; + + /* Function Body */ + cosqi(n, &wsave[1]); +} /* sinqi */ + +static void sint1(integer n, real *war, real *was, real *xh, real * + x, integer *ifac) +{ + /* Initialized data */ + + static const real sqrt3 = 1.73205080756888f; + + /* Local variables */ + integer i, k; + real t1, t2; + integer kc, np1, ns2, modn; + real xhold; + + /* Parameter adjustments */ + --ifac; + --x; + --xh; + --was; + --war; + + /* Function Body */ + for (i = 1; i <= n; ++i) { + xh[i] = war[i]; + war[i] = x[i]; + } + + if (n < 2) { + xh[1] += xh[1]; + } else if (n == 2) { + xhold = sqrt3 * (xh[1] + xh[2]); + xh[2] = sqrt3 * (xh[1] - xh[2]); + xh[1] = xhold; + } else { + np1 = n + 1; + ns2 = n / 2; + x[1] = 0.f; + for (k = 1; k <= ns2; ++k) { + kc = np1 - k; + t1 = xh[k] - xh[kc]; + t2 = was[k] * (xh[k] + xh[kc]); + x[k + 1] = t1 + t2; + x[kc + 1] = t2 - t1; + } + modn = n % 2; + if (modn != 0) { + x[ns2 + 2] = xh[ns2 + 1] * 4.f; + } + rfftf1(np1, &x[1], &xh[1], &war[1], &ifac[1]); + xh[1] = x[1] * .5f; + for (i = 3; i <= n; i += 2) { + xh[i - 1] = -x[i]; + xh[i] = xh[i - 2] + x[i - 1]; + } + if (modn == 0) { + xh[n] = -x[n + 1]; + } + } + for (i = 1; i <= n; ++i) { + x[i] = war[i]; + war[i] = xh[i]; + } +} /* sint1 */ + +void sinti(integer n, real *wsave) +{ + /* Local variables */ + integer k; + real dt; + integer np1, ns2; + + /* Parameter adjustments */ + --wsave; + + /* Function Body */ + if (n <= 1) { + return; + } + ns2 = n / 2; + np1 = n + 1; + dt = M_PI / (real) np1; + for (k = 1; k <= ns2; ++k) { + wsave[k] = sin(k * dt) * 2.f; + } + rffti(np1, &wsave[ns2 + 1]); +} /* sinti */ + +void sint(integer n, real *x, real *wsave) +{ + integer np1, iw1, iw2, iw3; + + /* Parameter adjustments */ + --wsave; + --x; + + /* Function Body */ + np1 = n + 1; + iw1 = n / 2 + 1; + iw2 = iw1 + np1; + iw3 = iw2 + np1; + sint1(n, &x[1], &wsave[1], &wsave[iw1], &wsave[iw2], (int*)&wsave[iw3]); +} /* sint */ + +#ifdef TESTING_FFTPACK +#include + +int main(void) +{ + static integer nd[] = { 120,91,54,49,32,28,24,8,4,3,2 }; + + /* System generated locals */ + real r1, r2, r3; + f77complex q1, q2, q3; + + /* Local variables */ + integer i, j, k, n; + real w[2000], x[200], y[200], cf, fn, dt; + f77complex cx[200], cy[200]; + real xh[200]; + integer nz, nm1, np1, ns2; + real arg, tfn; + real sum, arg1, arg2; + real sum1, sum2, dcfb; + integer modn; + real rftb, rftf; + real sqrt2; + real rftfb; + real costt, sintt, dcfftb, dcfftf, cosqfb, costfb; + real sinqfb; + real sintfb; + real cosqbt, cosqft, sinqbt, sinqft; + + + + /* * * * * * * * * * * * 
* * * * * * * * * * * * * * * * * * * * * * */ + + /* VERSION 4 APRIL 1985 */ + + /* A TEST DRIVER FOR */ + /* A PACKAGE OF FORTRAN SUBPROGRAMS FOR THE FAST FOURIER */ + /* TRANSFORM OF PERIODIC AND OTHER SYMMETRIC SEQUENCES */ + + /* BY */ + + /* PAUL N SWARZTRAUBER */ + + /* NATIONAL CENTER FOR ATMOSPHERIC RESEARCH BOULDER,COLORADO 80307 */ + + /* WHICH IS SPONSORED BY THE NATIONAL SCIENCE FOUNDATION */ + + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + + + /* THIS PROGRAM TESTS THE PACKAGE OF FAST FOURIER */ + /* TRANSFORMS FOR BOTH COMPLEX AND REAL PERIODIC SEQUENCES AND */ + /* CERTIAN OTHER SYMMETRIC SEQUENCES THAT ARE LISTED BELOW. */ + + /* 1. RFFTI INITIALIZE RFFTF AND RFFTB */ + /* 2. RFFTF FORWARD TRANSFORM OF A REAL PERIODIC SEQUENCE */ + /* 3. RFFTB BACKWARD TRANSFORM OF A REAL COEFFICIENT ARRAY */ + + /* 4. EZFFTI INITIALIZE EZFFTF AND EZFFTB */ + /* 5. EZFFTF A SIMPLIFIED REAL PERIODIC FORWARD TRANSFORM */ + /* 6. EZFFTB A SIMPLIFIED REAL PERIODIC BACKWARD TRANSFORM */ + + /* 7. SINTI INITIALIZE SINT */ + /* 8. SINT SINE TRANSFORM OF A REAL ODD SEQUENCE */ + + /* 9. COSTI INITIALIZE COST */ + /* 10. COST COSINE TRANSFORM OF A REAL EVEN SEQUENCE */ + + /* 11. SINQI INITIALIZE SINQF AND SINQB */ + /* 12. SINQF FORWARD SINE TRANSFORM WITH ODD WAVE NUMBERS */ + /* 13. SINQB UNNORMALIZED INVERSE OF SINQF */ + + /* 14. COSQI INITIALIZE COSQF AND COSQB */ + /* 15. COSQF FORWARD COSINE TRANSFORM WITH ODD WAVE NUMBERS */ + /* 16. COSQB UNNORMALIZED INVERSE OF COSQF */ + + /* 17. CFFTI INITIALIZE CFFTF AND CFFTB */ + /* 18. CFFTF FORWARD TRANSFORM OF A COMPLEX PERIODIC SEQUENCE */ + /* 19. CFFTB UNNORMALIZED INVERSE OF CFFTF */ + + + sqrt2 = sqrt(2.f); + int all_ok = 1; + for (nz = 1; nz <= (int)(sizeof nd/sizeof nd[0]); ++nz) { + n = nd[nz - 1]; + modn = n % 2; + fn = (real) n; + tfn = fn + fn; + np1 = n + 1; + nm1 = n - 1; + for (j = 1; j <= np1; ++j) { + x[j - 1] = sin((real) j * sqrt2); + y[j - 1] = x[j - 1]; + xh[j - 1] = x[j - 1]; + } + + /* TEST SUBROUTINES RFFTI,RFFTF AND RFFTB */ + + rffti(n, w); + dt = (2*M_PI) / fn; + ns2 = (n + 1) / 2; + if (ns2 < 2) { + goto L104; + } + for (k = 2; k <= ns2; ++k) { + sum1 = 0.f; + sum2 = 0.f; + arg = (real) (k - 1) * dt; + for (i = 1; i <= n; ++i) { + arg1 = (real) (i - 1) * arg; + sum1 += x[i - 1] * cos(arg1); + sum2 += x[i - 1] * sin(arg1); + } + y[(k << 1) - 3] = sum1; + y[(k << 1) - 2] = -sum2; + } + L104: + sum1 = 0.f; + sum2 = 0.f; + for (i = 1; i <= nm1; i += 2) { + sum1 += x[i - 1]; + sum2 += x[i]; + } + if (modn == 1) { + sum1 += x[n - 1]; + } + y[0] = sum1 + sum2; + if (modn == 0) { + y[n - 1] = sum1 - sum2; + } + rfftf(n, x, w); + rftf = 0.f; + for (i = 1; i <= n; ++i) { + /* Computing MAX */ + r2 = rftf, r3 = (r1 = x[i - 1] - y[i - 1], fabs(r1)); + rftf = dmax(r2,r3); + x[i - 1] = xh[i - 1]; + } + rftf /= fn; + for (i = 1; i <= n; ++i) { + sum = x[0] * .5f; + arg = (real) (i - 1) * dt; + if (ns2 < 2) { + goto L108; + } + for (k = 2; k <= ns2; ++k) { + arg1 = (real) (k - 1) * arg; + sum = sum + x[(k << 1) - 3] * cos(arg1) - x[(k << 1) - 2] * + sin(arg1); + } + L108: + if (modn == 0) { + sum += (real)pow(-1, i-1) * .5f * x[n - 1]; + } + y[i - 1] = sum + sum; + } + rfftb(n, x, w); + rftb = 0.f; + for (i = 1; i <= n; ++i) { + /* Computing MAX */ + r2 = rftb, r3 = (r1 = x[i - 1] - y[i - 1], fabs(r1)); + rftb = dmax(r2,r3); + x[i - 1] = xh[i - 1]; + y[i - 1] = xh[i - 1]; + } + rfftb(n, y, w); + rfftf(n, y, w); + cf = 1.f / fn; + rftfb = 0.f; + for (i = 1; i <= n; ++i) { + /* Computing MAX */ + r2 = rftfb, 
r3 = (r1 = cf * y[i - 1] - x[i - 1], fabs( + r1)); + rftfb = dmax(r2,r3); + } + + /* TEST SUBROUTINES SINTI AND SINT */ + + dt = M_PI / fn; + for (i = 1; i <= nm1; ++i) { + x[i - 1] = xh[i - 1]; + } + for (i = 1; i <= nm1; ++i) { + y[i - 1] = 0.f; + arg1 = (real) i * dt; + for (k = 1; k <= nm1; ++k) { + y[i - 1] += x[k - 1] * sin((real) k * arg1); + } + y[i - 1] += y[i - 1]; + } + sinti(nm1, w); + sint(nm1, x, w); + cf = .5f / fn; + sintt = 0.f; + for (i = 1; i <= nm1; ++i) { + /* Computing MAX */ + r2 = sintt, r3 = (r1 = x[i - 1] - y[i - 1], fabs(r1)); + sintt = dmax(r2,r3); + x[i - 1] = xh[i - 1]; + y[i - 1] = x[i - 1]; + } + sintt = cf * sintt; + sint(nm1, x, w); + sint(nm1, x, w); + sintfb = 0.f; + for (i = 1; i <= nm1; ++i) { + /* Computing MAX */ + r2 = sintfb, r3 = (r1 = cf * x[i - 1] - y[i - 1], fabs( + r1)); + sintfb = dmax(r2,r3); + } + + /* TEST SUBROUTINES COSTI AND COST */ + + for (i = 1; i <= np1; ++i) { + x[i - 1] = xh[i - 1]; + } + for (i = 1; i <= np1; ++i) { + y[i - 1] = (x[0] + (real) pow(-1, i+1) * x[n]) * .5f; + arg = (real) (i - 1) * dt; + for (k = 2; k <= n; ++k) { + y[i - 1] += x[k - 1] * FFTPACK_COS((real) (k - 1) * arg); + } + y[i - 1] += y[i - 1]; + } + costi(np1, w); + cost(np1, x, w); + costt = 0.f; + for (i = 1; i <= np1; ++i) { + /* Computing MAX */ + r2 = costt, r3 = (r1 = x[i - 1] - y[i - 1], fabs(r1)); + costt = dmax(r2,r3); + x[i - 1] = xh[i - 1]; + y[i - 1] = xh[i - 1]; + } + costt = cf * costt; + cost(np1, x, w); + cost(np1, x, w); + costfb = 0.f; + for (i = 1; i <= np1; ++i) { + /* Computing MAX */ + r2 = costfb, r3 = (r1 = cf * x[i - 1] - y[i - 1], fabs( + r1)); + costfb = dmax(r2,r3); + } + + /* TEST SUBROUTINES SINQI,SINQF AND SINQB */ + + cf = .25f / fn; + for (i = 1; i <= n; ++i) { + y[i - 1] = xh[i - 1]; + } + dt = M_PI / (fn + fn); + for (i = 1; i <= n; ++i) { + x[i - 1] = 0.f; + arg = dt * (real) i; + for (k = 1; k <= n; ++k) { + x[i - 1] += y[k - 1] * sin((real) (k + k - 1) * arg); + } + x[i - 1] *= 4.f; + } + sinqi(n, w); + sinqb(n, y, w); + sinqbt = 0.f; + for (i = 1; i <= n; ++i) { + /* Computing MAX */ + r2 = sinqbt, r3 = (r1 = y[i - 1] - x[i - 1], fabs(r1)) + ; + sinqbt = dmax(r2,r3); + x[i - 1] = xh[i - 1]; + } + sinqbt = cf * sinqbt; + for (i = 1; i <= n; ++i) { + arg = (real) (i + i - 1) * dt; + y[i - 1] = (real) pow(-1, i+1) * .5f * x[n - 1]; + for (k = 1; k <= nm1; ++k) { + y[i - 1] += x[k - 1] * sin((real) k * arg); + } + y[i - 1] += y[i - 1]; + } + sinqf(n, x, w); + sinqft = 0.f; + for (i = 1; i <= n; ++i) { + /* Computing MAX */ + r2 = sinqft, r3 = (r1 = x[i - 1] - y[i - 1], fabs(r1)) + ; + sinqft = dmax(r2,r3); + y[i - 1] = xh[i - 1]; + x[i - 1] = xh[i - 1]; + } + sinqf(n, y, w); + sinqb(n, y, w); + sinqfb = 0.f; + for (i = 1; i <= n; ++i) { + /* Computing MAX */ + r2 = sinqfb, r3 = (r1 = cf * y[i - 1] - x[i - 1], fabs( + r1)); + sinqfb = dmax(r2,r3); + } + + /* TEST SUBROUTINES COSQI,COSQF AND COSQB */ + + for (i = 1; i <= n; ++i) { + y[i - 1] = xh[i - 1]; + } + for (i = 1; i <= n; ++i) { + x[i - 1] = 0.f; + arg = (real) (i - 1) * dt; + for (k = 1; k <= n; ++k) { + x[i - 1] += y[k - 1] * FFTPACK_COS((real) (k + k - 1) * arg); + } + x[i - 1] *= 4.f; + } + cosqi(n, w); + cosqb(n, y, w); + cosqbt = 0.f; + for (i = 1; i <= n; ++i) { + /* Computing MAX */ + r2 = cosqbt, r3 = (r1 = x[i - 1] - y[i - 1], fabs(r1)) + ; + cosqbt = dmax(r2,r3); + x[i - 1] = xh[i - 1]; + } + cosqbt = cf * cosqbt; + for (i = 1; i <= n; ++i) { + y[i - 1] = x[0] * .5f; + arg = (real) (i + i - 1) * dt; + for (k = 2; k <= n; ++k) { + y[i - 1] += x[k - 1] * 
FFTPACK_COS((real) (k - 1) * arg); + } + y[i - 1] += y[i - 1]; + } + cosqf(n, x, w); + cosqft = 0.f; + for (i = 1; i <= n; ++i) { + /* Computing MAX */ + r2 = cosqft, r3 = (r1 = y[i - 1] - x[i - 1], fabs(r1)) + ; + cosqft = dmax(r2,r3); + x[i - 1] = xh[i - 1]; + y[i - 1] = xh[i - 1]; + } + cosqft = cf * cosqft; + cosqb(n, x, w); + cosqf(n, x, w); + cosqfb = 0.f; + for (i = 1; i <= n; ++i) { + /* Computing MAX */ + r2 = cosqfb, r3 = (r1 = cf * x[i - 1] - y[i - 1], fabs(r1)); + cosqfb = dmax(r2,r3); + } + + /* TEST CFFTI,CFFTF,CFFTB */ + + for (i = 1; i <= n; ++i) { + r1 = FFTPACK_COS(sqrt2 * (real) i); + r2 = FFTPACK_SIN(sqrt2 * (real) (i * i)); + q1.r = r1, q1.i = r2; + cx[i-1].r = q1.r, cx[i-1].i = q1.i; + } + dt = (2*M_PI) / fn; + for (i = 1; i <= n; ++i) { + arg1 = -((real) (i - 1)) * dt; + cy[i-1].r = 0.f, cy[i-1].i = 0.f; + for (k = 1; k <= n; ++k) { + arg2 = (real) (k - 1) * arg1; + r1 = FFTPACK_COS(arg2); + r2 = FFTPACK_SIN(arg2); + q3.r = r1, q3.i = r2; + q2.r = q3.r * cx[k-1].r - q3.i * cx[k-1].i, q2.i = + q3.r * cx[k-1].i + q3.i * cx[k-1].r; + q1.r = cy[i-1].r + q2.r, q1.i = cy[i-1].i + q2.i; + cy[i-1].r = q1.r, cy[i-1].i = q1.i; + } + } + cffti(n, w); + cfftf(n, (real*)cx, w); + dcfftf = 0.f; + for (i = 1; i <= n; ++i) { + /* Computing MAX */ + q1.r = cx[i-1].r - cy[i-1].r, q1.i = cx[i-1].i - cy[i-1] + .i; + r1 = dcfftf, r2 = c_abs(&q1); + dcfftf = dmax(r1,r2); + q1.r = cx[i-1].r / fn, q1.i = cx[i-1].i / fn; + cx[i-1].r = q1.r, cx[i-1].i = q1.i; + } + dcfftf /= fn; + for (i = 1; i <= n; ++i) { + arg1 = (real) (i - 1) * dt; + cy[i-1].r = 0.f, cy[i-1].i = 0.f; + for (k = 1; k <= n; ++k) { + arg2 = (real) (k - 1) * arg1; + r1 = FFTPACK_COS(arg2); + r2 = FFTPACK_SIN(arg2); + q3.r = r1, q3.i = r2; + q2.r = q3.r * cx[k-1].r - q3.i * cx[k-1].i, q2.i = + q3.r * cx[k-1].i + q3.i * cx[k-1].r; + q1.r = cy[i-1].r + q2.r, q1.i = cy[i-1].i + q2.i; + cy[i-1].r = q1.r, cy[i-1].i = q1.i; + } + } + cfftb(n, (real*)cx, w); + dcfftb = 0.f; + for (i = 1; i <= n; ++i) { + /* Computing MAX */ + q1.r = cx[i-1].r - cy[i-1].r, q1.i = cx[i-1].i - cy[i-1].i; + r1 = dcfftb, r2 = c_abs(&q1); + dcfftb = dmax(r1,r2); + cx[i-1].r = cy[i-1].r, cx[i-1].i = cy[i-1].i; + } + cf = 1.f / fn; + cfftf(n, (real*)cx, w); + cfftb(n, (real*)cx, w); + dcfb = 0.f; + for (i = 1; i <= n; ++i) { + /* Computing MAX */ + q2.r = cf * cx[i-1].r, q2.i = cf * cx[i-1].i; + q1.r = q2.r - cy[i-1].r, q1.i = q2.i - cy[i-1].i; + r1 = dcfb, r2 = c_abs(&q1); + dcfb = dmax(r1,r2); + } + printf("%d\tRFFTF %10.3g\tRFFTB %10.ge\tRFFTFB %10.3g", n, rftf, rftb, rftfb); + printf( "\tSINT %10.3g\tSINTFB %10.ge\tCOST %10.3g\n", sintt, sintfb, costt); + printf( "\tCOSTFB %10.3g\tSINQF %10.ge\tSINQB %10.3g", costfb, sinqft, sinqbt); + printf( "\tSINQFB %10.3g\tCOSQF %10.ge\tCOSQB %10.3g\n", sinqfb, cosqft, cosqbt); + printf( "\tCOSQFB %10.3g\t", cosqfb); + printf( "\tCFFTF %10.ge\tCFFTB %10.3g\n", dcfftf, dcfftb); + printf( "\tCFFTFB %10.3g\n", dcfb); + +#define CHECK(x) if (x > 1e-3) { printf(#x " failed: %g\n", x); all_ok = 0; } + CHECK(rftf); CHECK(rftb); CHECK(rftfb); CHECK(sintt); CHECK(sintfb); CHECK(costt); + CHECK(costfb); CHECK(sinqft); CHECK(sinqbt); CHECK(sinqfb); CHECK(cosqft); CHECK(cosqbt); + CHECK(cosqfb); CHECK(dcfftf); CHECK(dcfftb); + } + + if (all_ok) printf("Everything looks fine.\n"); + else printf("ERRORS WERE DETECTED.\n"); + /* + expected: + 120 RFFTF 2.786e-06 RFFTB 6.847e-04 RFFTFB 2.795e-07 SINT 1.312e-06 SINTFB 1.237e-06 COST 1.319e-06 + COSTFB 4.355e-06 SINQF 3.281e-04 SINQB 1.876e-06 SINQFB 2.198e-07 COSQF 6.199e-07 COSQB 
2.193e-06 + COSQFB 2.300e-07 DEZF 5.573e-06 DEZB 1.363e-05 DEZFB 1.371e-06 CFFTF 5.590e-06 CFFTB 4.751e-05 + CFFTFB 4.215e-07 + 54 RFFTF 4.708e-07 RFFTB 3.052e-05 RFFTFB 3.439e-07 SINT 3.532e-07 SINTFB 4.145e-07 COST 3.002e-07 + COSTFB 6.343e-07 SINQF 4.959e-05 SINQB 4.415e-07 SINQFB 2.882e-07 COSQF 2.826e-07 COSQB 2.472e-07 + COSQFB 3.439e-07 DEZF 9.388e-07 DEZB 5.066e-06 DEZFB 5.960e-07 CFFTF 1.426e-06 CFFTB 9.482e-06 + CFFTFB 2.980e-07 + 49 RFFTF 4.476e-07 RFFTB 5.341e-05 RFFTFB 2.574e-07 SINT 9.196e-07 SINTFB 9.401e-07 COST 8.174e-07 + COSTFB 1.331e-06 SINQF 4.005e-05 SINQB 9.342e-07 SINQFB 3.057e-07 COSQF 2.530e-07 COSQB 6.228e-07 + COSQFB 4.826e-07 DEZF 9.071e-07 DEZB 4.590e-06 DEZFB 5.960e-07 CFFTF 2.095e-06 CFFTB 1.414e-05 + CFFTFB 7.398e-07 + 32 RFFTF 4.619e-07 RFFTB 2.861e-05 RFFTFB 1.192e-07 SINT 3.874e-07 SINTFB 4.172e-07 COST 4.172e-07 + COSTFB 1.699e-06 SINQF 2.551e-05 SINQB 6.407e-07 SINQFB 2.980e-07 COSQF 1.639e-07 COSQB 1.714e-07 + COSQFB 2.384e-07 DEZF 1.013e-06 DEZB 2.339e-06 DEZFB 7.749e-07 CFFTF 1.127e-06 CFFTB 6.744e-06 + CFFTFB 2.666e-07 + 4 RFFTF 1.490e-08 RFFTB 1.490e-07 RFFTFB 5.960e-08 SINT 7.451e-09 SINTFB 0.000e+00 COST 2.980e-08 + COSTFB 1.192e-07 SINQF 4.768e-07 SINQB 2.980e-08 SINQFB 5.960e-08 COSQF 2.608e-08 COSQB 5.960e-08 + COSQFB 1.192e-07 DEZF 2.980e-08 DEZB 5.960e-08 DEZFB 0.000e+00 CFFTF 6.664e-08 CFFTB 5.960e-08 + CFFTFB 6.144e-08 + 3 RFFTF 3.974e-08 RFFTB 1.192e-07 RFFTFB 3.303e-08 SINT 1.987e-08 SINTFB 1.069e-08 COST 4.967e-08 + COSTFB 5.721e-08 SINQF 8.941e-08 SINQB 2.980e-08 SINQFB 1.259e-07 COSQF 7.451e-09 COSQB 4.967e-08 + COSQFB 7.029e-08 DEZF 1.192e-07 DEZB 5.960e-08 DEZFB 5.960e-08 CFFTF 7.947e-08 CFFTB 8.429e-08 + CFFTFB 9.064e-08 + 2 RFFTF 0.000e+00 RFFTB 0.000e+00 RFFTFB 0.000e+00 SINT 0.000e+00 SINTFB 0.000e+00 COST 0.000e+00 + COSTFB 0.000e+00 SINQF 1.192e-07 SINQB 2.980e-08 SINQFB 5.960e-08 COSQF 7.451e-09 COSQB 1.490e-08 + COSQFB 0.000e+00 DEZF 0.000e+00 DEZB 0.000e+00 DEZFB 0.000e+00 CFFTF 0.000e+00 CFFTB 5.960e-08 + CFFTFB 5.960e-08 + Everything looks fine. + + */ + + return all_ok ? 0 : 1; +} +#endif /* TESTING_FFTPACK */ diff --git a/pffft/fftpack.h b/pffft/fftpack.h new file mode 100644 index 0000000..45cb742 --- /dev/null +++ b/pffft/fftpack.h @@ -0,0 +1,799 @@ +/* + Interface for the f2c translation of fftpack as found on http://www.netlib.org/fftpack/ + + FFTPACK license: + + http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html + + Copyright (c) 2004 the University Corporation for Atmospheric + Research ("UCAR"). All rights reserved. Developed by NCAR's + Computational and Information Systems Laboratory, UCAR, + www.cisl.ucar.edu. + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. 
+ + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. + + ChangeLog: + 2011/10/02: this is my first release of this file. +*/ + +#ifndef FFTPACK_H +#define FFTPACK_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* just define FFTPACK_DOUBLE_PRECISION if you want to build it as a double precision fft */ + +#ifndef FFTPACK_DOUBLE_PRECISION + typedef float fftpack_real; + typedef int fftpack_int; +#else + typedef double fftpack_real; + typedef int fftpack_int; +#endif + + void cffti(fftpack_int n, fftpack_real *wsave); + + void cfftf(fftpack_int n, fftpack_real *c, fftpack_real *wsave); + + void cfftb(fftpack_int n, fftpack_real *c, fftpack_real *wsave); + + void rffti(fftpack_int n, fftpack_real *wsave); + void rfftf(fftpack_int n, fftpack_real *r, fftpack_real *wsave); + void rfftb(fftpack_int n, fftpack_real *r, fftpack_real *wsave); + + void cosqi(fftpack_int n, fftpack_real *wsave); + void cosqf(fftpack_int n, fftpack_real *x, fftpack_real *wsave); + void cosqb(fftpack_int n, fftpack_real *x, fftpack_real *wsave); + + void costi(fftpack_int n, fftpack_real *wsave); + void cost(fftpack_int n, fftpack_real *x, fftpack_real *wsave); + + void sinqi(fftpack_int n, fftpack_real *wsave); + void sinqb(fftpack_int n, fftpack_real *x, fftpack_real *wsave); + void sinqf(fftpack_int n, fftpack_real *x, fftpack_real *wsave); + + void sinti(fftpack_int n, fftpack_real *wsave); + void sint(fftpack_int n, fftpack_real *x, fftpack_real *wsave); + +#ifdef __cplusplus +} +#endif + +#endif /* FFTPACK_H */ + +/* + + FFTPACK + +* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + + version 4 april 1985 + + a package of fortran subprograms for the fast fourier + transform of periodic and other symmetric sequences + + by + + paul n swarztrauber + + national center for atmospheric research boulder,colorado 80307 + + which is sponsored by the national science foundation + +* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + + +this package consists of programs which perform fast fourier +transforms for both complex and real periodic sequences and +certain other symmetric sequences that are listed below. + +1. rffti initialize rfftf and rfftb +2. rfftf forward transform of a real periodic sequence +3. rfftb backward transform of a real coefficient array + +4. ezffti initialize ezfftf and ezfftb +5. ezfftf a simplified real periodic forward transform +6. ezfftb a simplified real periodic backward transform + +7. sinti initialize sint +8. sint sine transform of a real odd sequence + +9. costi initialize cost +10. cost cosine transform of a real even sequence + +11. sinqi initialize sinqf and sinqb +12. sinqf forward sine transform with odd wave numbers +13. sinqb unnormalized inverse of sinqf + +14. cosqi initialize cosqf and cosqb +15. cosqf forward cosine transform with odd wave numbers +16. cosqb unnormalized inverse of cosqf + +17. cffti initialize cfftf and cfftb +18. cfftf forward transform of a complex periodic sequence +19. 
cfftb unnormalized inverse of cfftf + + +****************************************************************** + +subroutine rffti(n,wsave) + + **************************************************************** + +subroutine rffti initializes the array wsave which is used in +both rfftf and rfftb. the prime factorization of n together with +a tabulation of the trigonometric functions are computed and +stored in wsave. + +input parameter + +n the length of the sequence to be transformed. + +output parameter + +wsave a work array which must be dimensioned at least 2*n+15. + the same work array can be used for both rfftf and rfftb + as long as n remains unchanged. different wsave arrays + are required for different values of n. the contents of + wsave must not be changed between calls of rfftf or rfftb. + +****************************************************************** + +subroutine rfftf(n,r,wsave) + +****************************************************************** + +subroutine rfftf computes the fourier coefficients of a real +perodic sequence (fourier analysis). the transform is defined +below at output parameter r. + +input parameters + +n the length of the array r to be transformed. the method + is most efficient when n is a product of small primes. + n may change so long as different work arrays are provided + +r a real array of length n which contains the sequence + to be transformed + +wsave a work array which must be dimensioned at least 2*n+15. + in the program that calls rfftf. the wsave array must be + initialized by calling subroutine rffti(n,wsave) and a + different wsave array must be used for each different + value of n. this initialization does not have to be + repeated so long as n remains unchanged thus subsequent + transforms can be obtained faster than the first. + the same wsave array can be used by rfftf and rfftb. + + +output parameters + +r r(1) = the sum from i=1 to i=n of r(i) + + if n is even set l =n/2 , if n is odd set l = (n+1)/2 + + then for k = 2,...,l + + r(2*k-2) = the sum from i = 1 to i = n of + + r(i)*cos((k-1)*(i-1)*2*pi/n) + + r(2*k-1) = the sum from i = 1 to i = n of + + -r(i)*sin((k-1)*(i-1)*2*pi/n) + + if n is even + + r(n) = the sum from i = 1 to i = n of + + (-1)**(i-1)*r(i) + + ***** note + this transform is unnormalized since a call of rfftf + followed by a call of rfftb will multiply the input + sequence by n. + +wsave contains results which must not be destroyed between + calls of rfftf or rfftb. + + +****************************************************************** + +subroutine rfftb(n,r,wsave) + +****************************************************************** + +subroutine rfftb computes the real perodic sequence from its +fourier coefficients (fourier synthesis). the transform is defined +below at output parameter r. + +input parameters + +n the length of the array r to be transformed. the method + is most efficient when n is a product of small primes. + n may change so long as different work arrays are provided + +r a real array of length n which contains the sequence + to be transformed + +wsave a work array which must be dimensioned at least 2*n+15. + in the program that calls rfftb. the wsave array must be + initialized by calling subroutine rffti(n,wsave) and a + different wsave array must be used for each different + value of n. this initialization does not have to be + repeated so long as n remains unchanged thus subsequent + transforms can be obtained faster than the first. + the same wsave array can be used by rfftf and rfftb. 
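
a minimal usage sketch (not part of the original fftpack text) of the calling
sequence described above, using the C wrappers declared earlier in this header;
the 2*n+15 work-array size and the 1/n normalization follow the rules stated in
this file, all other names are illustrative:

    #include <stdlib.h>
    #include "fftpack.h"

    void real_fft_roundtrip(fftpack_int n, fftpack_real *r)
    {
        fftpack_int i;
        // work array: at least 2*n+15 locations, reusable while n is unchanged
        fftpack_real *wsave = (fftpack_real *)malloc((2 * n + 15) * sizeof(fftpack_real));
        rffti(n, wsave);      // initialize once for this value of n
        rfftf(n, r, wsave);   // forward: r now holds the unnormalized coefficients
        rfftb(n, r, wsave);   // backward: r is the original sequence multiplied by n
        for (i = 0; i < n; ++i)
            r[i] /= (fftpack_real)n;   // undo the factor n from rfftf followed by rfftb
        free(wsave);
    }
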
+ + +output parameters + +r for n even and for i = 1,...,n + + r(i) = r(1)+(-1)**(i-1)*r(n) + + plus the sum from k=2 to k=n/2 of + + 2.*r(2*k-2)*cos((k-1)*(i-1)*2*pi/n) + + -2.*r(2*k-1)*sin((k-1)*(i-1)*2*pi/n) + + for n odd and for i = 1,...,n + + r(i) = r(1) plus the sum from k=2 to k=(n+1)/2 of + + 2.*r(2*k-2)*cos((k-1)*(i-1)*2*pi/n) + + -2.*r(2*k-1)*sin((k-1)*(i-1)*2*pi/n) + + ***** note + this transform is unnormalized since a call of rfftf + followed by a call of rfftb will multiply the input + sequence by n. + +wsave contains results which must not be destroyed between + calls of rfftb or rfftf. + +****************************************************************** + +subroutine sinti(n,wsave) + +****************************************************************** + +subroutine sinti initializes the array wsave which is used in +subroutine sint. the prime factorization of n together with +a tabulation of the trigonometric functions are computed and +stored in wsave. + +input parameter + +n the length of the sequence to be transformed. the method + is most efficient when n+1 is a product of small primes. + +output parameter + +wsave a work array with at least int(2.5*n+15) locations. + different wsave arrays are required for different values + of n. the contents of wsave must not be changed between + calls of sint. + +****************************************************************** + +subroutine sint(n,x,wsave) + +****************************************************************** + +subroutine sint computes the discrete fourier sine transform +of an odd sequence x(i). the transform is defined below at +output parameter x. + +sint is the unnormalized inverse of itself since a call of sint +followed by another call of sint will multiply the input sequence +x by 2*(n+1). + +the array wsave which is used by subroutine sint must be +initialized by calling subroutine sinti(n,wsave). + +input parameters + +n the length of the sequence to be transformed. the method + is most efficient when n+1 is the product of small primes. + +x an array which contains the sequence to be transformed + + +wsave a work array with dimension at least int(2.5*n+15) + in the program that calls sint. the wsave array must be + initialized by calling subroutine sinti(n,wsave) and a + different wsave array must be used for each different + value of n. this initialization does not have to be + repeated so long as n remains unchanged thus subsequent + transforms can be obtained faster than the first. + +output parameters + +x for i=1,...,n + + x(i)= the sum from k=1 to k=n + + 2*x(k)*sin(k*i*pi/(n+1)) + + a call of sint followed by another call of + sint will multiply the sequence x by 2*(n+1). + hence sint is the unnormalized inverse + of itself. + +wsave contains initialization calculations which must not be + destroyed between calls of sint. + +****************************************************************** + +subroutine costi(n,wsave) + +****************************************************************** + +subroutine costi initializes the array wsave which is used in +subroutine cost. the prime factorization of n together with +a tabulation of the trigonometric functions are computed and +stored in wsave. + +input parameter + +n the length of the sequence to be transformed. the method + is most efficient when n-1 is a product of small primes. + +output parameter + +wsave a work array which must be dimensioned at least 3*n+15. + different wsave arrays are required for different values + of n. 
the contents of wsave must not be changed between + calls of cost. + +****************************************************************** + +subroutine cost(n,x,wsave) + +****************************************************************** + +subroutine cost computes the discrete fourier cosine transform +of an even sequence x(i). the transform is defined below at output +parameter x. + +cost is the unnormalized inverse of itself since a call of cost +followed by another call of cost will multiply the input sequence +x by 2*(n-1). the transform is defined below at output parameter x + +the array wsave which is used by subroutine cost must be +initialized by calling subroutine costi(n,wsave). + +input parameters + +n the length of the sequence x. n must be greater than 1. + the method is most efficient when n-1 is a product of + small primes. + +x an array which contains the sequence to be transformed + +wsave a work array which must be dimensioned at least 3*n+15 + in the program that calls cost. the wsave array must be + initialized by calling subroutine costi(n,wsave) and a + different wsave array must be used for each different + value of n. this initialization does not have to be + repeated so long as n remains unchanged thus subsequent + transforms can be obtained faster than the first. + +output parameters + +x for i=1,...,n + + x(i) = x(1)+(-1)**(i-1)*x(n) + + + the sum from k=2 to k=n-1 + + 2*x(k)*cos((k-1)*(i-1)*pi/(n-1)) + + a call of cost followed by another call of + cost will multiply the sequence x by 2*(n-1) + hence cost is the unnormalized inverse + of itself. + +wsave contains initialization calculations which must not be + destroyed between calls of cost. + +****************************************************************** + +subroutine sinqi(n,wsave) + +****************************************************************** + +subroutine sinqi initializes the array wsave which is used in +both sinqf and sinqb. the prime factorization of n together with +a tabulation of the trigonometric functions are computed and +stored in wsave. + +input parameter + +n the length of the sequence to be transformed. the method + is most efficient when n is a product of small primes. + +output parameter + +wsave a work array which must be dimensioned at least 3*n+15. + the same work array can be used for both sinqf and sinqb + as long as n remains unchanged. different wsave arrays + are required for different values of n. the contents of + wsave must not be changed between calls of sinqf or sinqb. + +****************************************************************** + +subroutine sinqf(n,x,wsave) + +****************************************************************** + +subroutine sinqf computes the fast fourier transform of quarter +wave data. that is , sinqf computes the coefficients in a sine +series representation with only odd wave numbers. the transform +is defined below at output parameter x. + +sinqb is the unnormalized inverse of sinqf since a call of sinqf +followed by a call of sinqb will multiply the input sequence x +by 4*n. + +the array wsave which is used by subroutine sinqf must be +initialized by calling subroutine sinqi(n,wsave). + + +input parameters + +n the length of the array x to be transformed. the method + is most efficient when n is a product of small primes. + +x an array which contains the sequence to be transformed + +wsave a work array which must be dimensioned at least 3*n+15. + in the program that calls sinqf. 
the wsave array must be + initialized by calling subroutine sinqi(n,wsave) and a + different wsave array must be used for each different + value of n. this initialization does not have to be + repeated so long as n remains unchanged thus subsequent + transforms can be obtained faster than the first. + +output parameters + +x for i=1,...,n + + x(i) = (-1)**(i-1)*x(n) + + + the sum from k=1 to k=n-1 of + + 2*x(k)*sin((2*i-1)*k*pi/(2*n)) + + a call of sinqf followed by a call of + sinqb will multiply the sequence x by 4*n. + therefore sinqb is the unnormalized inverse + of sinqf. + +wsave contains initialization calculations which must not + be destroyed between calls of sinqf or sinqb. + +****************************************************************** + +subroutine sinqb(n,x,wsave) + +****************************************************************** + +subroutine sinqb computes the fast fourier transform of quarter +wave data. that is , sinqb computes a sequence from its +representation in terms of a sine series with odd wave numbers. +the transform is defined below at output parameter x. + +sinqf is the unnormalized inverse of sinqb since a call of sinqb +followed by a call of sinqf will multiply the input sequence x +by 4*n. + +the array wsave which is used by subroutine sinqb must be +initialized by calling subroutine sinqi(n,wsave). + + +input parameters + +n the length of the array x to be transformed. the method + is most efficient when n is a product of small primes. + +x an array which contains the sequence to be transformed + +wsave a work array which must be dimensioned at least 3*n+15. + in the program that calls sinqb. the wsave array must be + initialized by calling subroutine sinqi(n,wsave) and a + different wsave array must be used for each different + value of n. this initialization does not have to be + repeated so long as n remains unchanged thus subsequent + transforms can be obtained faster than the first. + +output parameters + +x for i=1,...,n + + x(i)= the sum from k=1 to k=n of + + 4*x(k)*sin((2k-1)*i*pi/(2*n)) + + a call of sinqb followed by a call of + sinqf will multiply the sequence x by 4*n. + therefore sinqf is the unnormalized inverse + of sinqb. + +wsave contains initialization calculations which must not + be destroyed between calls of sinqb or sinqf. + +****************************************************************** + +subroutine cosqi(n,wsave) + +****************************************************************** + +subroutine cosqi initializes the array wsave which is used in +both cosqf and cosqb. the prime factorization of n together with +a tabulation of the trigonometric functions are computed and +stored in wsave. + +input parameter + +n the length of the array to be transformed. the method + is most efficient when n is a product of small primes. + +output parameter + +wsave a work array which must be dimensioned at least 3*n+15. + the same work array can be used for both cosqf and cosqb + as long as n remains unchanged. different wsave arrays + are required for different values of n. the contents of + wsave must not be changed between calls of cosqf or cosqb. + +****************************************************************** + +subroutine cosqf(n,x,wsave) + +****************************************************************** + +subroutine cosqf computes the fast fourier transform of quarter +wave data. that is , cosqf computes the coefficients in a cosine +series representation with only odd wave numbers. 
the transform +is defined below at output parameter x + +cosqf is the unnormalized inverse of cosqb since a call of cosqf +followed by a call of cosqb will multiply the input sequence x +by 4*n. + +the array wsave which is used by subroutine cosqf must be +initialized by calling subroutine cosqi(n,wsave). + + +input parameters + +n the length of the array x to be transformed. the method + is most efficient when n is a product of small primes. + +x an array which contains the sequence to be transformed + +wsave a work array which must be dimensioned at least 3*n+15 + in the program that calls cosqf. the wsave array must be + initialized by calling subroutine cosqi(n,wsave) and a + different wsave array must be used for each different + value of n. this initialization does not have to be + repeated so long as n remains unchanged thus subsequent + transforms can be obtained faster than the first. + +output parameters + +x for i=1,...,n + + x(i) = x(1) plus the sum from k=2 to k=n of + + 2*x(k)*cos((2*i-1)*(k-1)*pi/(2*n)) + + a call of cosqf followed by a call of + cosqb will multiply the sequence x by 4*n. + therefore cosqb is the unnormalized inverse + of cosqf. + +wsave contains initialization calculations which must not + be destroyed between calls of cosqf or cosqb. + +****************************************************************** + +subroutine cosqb(n,x,wsave) + +****************************************************************** + +subroutine cosqb computes the fast fourier transform of quarter +wave data. that is , cosqb computes a sequence from its +representation in terms of a cosine series with odd wave numbers. +the transform is defined below at output parameter x. + +cosqb is the unnormalized inverse of cosqf since a call of cosqb +followed by a call of cosqf will multiply the input sequence x +by 4*n. + +the array wsave which is used by subroutine cosqb must be +initialized by calling subroutine cosqi(n,wsave). + + +input parameters + +n the length of the array x to be transformed. the method + is most efficient when n is a product of small primes. + +x an array which contains the sequence to be transformed + +wsave a work array that must be dimensioned at least 3*n+15 + in the program that calls cosqb. the wsave array must be + initialized by calling subroutine cosqi(n,wsave) and a + different wsave array must be used for each different + value of n. this initialization does not have to be + repeated so long as n remains unchanged thus subsequent + transforms can be obtained faster than the first. + +output parameters + +x for i=1,...,n + + x(i)= the sum from k=1 to k=n of + + 4*x(k)*cos((2*k-1)*(i-1)*pi/(2*n)) + + a call of cosqb followed by a call of + cosqf will multiply the sequence x by 4*n. + therefore cosqf is the unnormalized inverse + of cosqb. + +wsave contains initialization calculations which must not + be destroyed between calls of cosqb or cosqf. + +****************************************************************** + +subroutine cffti(n,wsave) + +****************************************************************** + +subroutine cffti initializes the array wsave which is used in +both cfftf and cfftb. the prime factorization of n together with +a tabulation of the trigonometric functions are computed and +stored in wsave. + +input parameter + +n the length of the sequence to be transformed + +output parameter + +wsave a work array which must be dimensioned at least 4*n+15 + the same work array can be used for both cfftf and cfftb + as long as n remains unchanged. 
different wsave arrays + are required for different values of n. the contents of + wsave must not be changed between calls of cfftf or cfftb. + +****************************************************************** + +subroutine cfftf(n,c,wsave) + +****************************************************************** + +subroutine cfftf computes the forward complex discrete fourier +transform (the fourier analysis). equivalently , cfftf computes +the fourier coefficients of a complex periodic sequence. +the transform is defined below at output parameter c. + +the transform is not normalized. to obtain a normalized transform +the output must be divided by n. otherwise a call of cfftf +followed by a call of cfftb will multiply the sequence by n. + +the array wsave which is used by subroutine cfftf must be +initialized by calling subroutine cffti(n,wsave). + +input parameters + + +n the length of the complex sequence c. the method is + more efficient when n is the product of small primes. n + +c a complex array of length n which contains the sequence + +wsave a real work array which must be dimensioned at least 4n+15 + in the program that calls cfftf. the wsave array must be + initialized by calling subroutine cffti(n,wsave) and a + different wsave array must be used for each different + value of n. this initialization does not have to be + repeated so long as n remains unchanged thus subsequent + transforms can be obtained faster than the first. + the same wsave array can be used by cfftf and cfftb. + +output parameters + +c for j=1,...,n + + c(j)=the sum from k=1,...,n of + + c(k)*exp(-i*(j-1)*(k-1)*2*pi/n) + + where i=sqrt(-1) + +wsave contains initialization calculations which must not be + destroyed between calls of subroutine cfftf or cfftb + +****************************************************************** + +subroutine cfftb(n,c,wsave) + +****************************************************************** + +subroutine cfftb computes the backward complex discrete fourier +transform (the fourier synthesis). equivalently , cfftb computes +a complex periodic sequence from its fourier coefficients. +the transform is defined below at output parameter c. + +a call of cfftf followed by a call of cfftb will multiply the +sequence by n. + +the array wsave which is used by subroutine cfftb must be +initialized by calling subroutine cffti(n,wsave). + +input parameters + + +n the length of the complex sequence c. the method is + more efficient when n is the product of small primes. + +c a complex array of length n which contains the sequence + +wsave a real work array which must be dimensioned at least 4n+15 + in the program that calls cfftb. the wsave array must be + initialized by calling subroutine cffti(n,wsave) and a + different wsave array must be used for each different + value of n. this initialization does not have to be + repeated so long as n remains unchanged thus subsequent + transforms can be obtained faster than the first. + the same wsave array can be used by cfftf and cfftb. 
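
a corresponding sketch (again not part of the original fftpack text) for the
complex transforms; it assumes the interleaved layout also used by the test
driver in fftpack.c, where a complex array of length n is passed as 2*n
(re,im) reals, together with the 4*n+15 work-array size and 1/n normalization
stated in this file; all names are illustrative:

    #include <stdlib.h>
    #include "fftpack.h"

    void complex_fft_roundtrip(fftpack_int n, fftpack_real *c)  // c: 2*n interleaved re,im
    {
        fftpack_int i;
        // work array: at least 4*n+15 locations, shared by cfftf and cfftb
        fftpack_real *wsave = (fftpack_real *)malloc((4 * n + 15) * sizeof(fftpack_real));
        cffti(n, wsave);
        cfftf(n, c, wsave);   // forward (analysis), unnormalized
        cfftb(n, c, wsave);   // backward (synthesis): result is n times the input
        for (i = 0; i < 2 * n; ++i)
            c[i] /= (fftpack_real)n;   // divide by n to recover the original sequence
        free(wsave);
    }
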
+ +output parameters + +c for j=1,...,n + + c(j)=the sum from k=1,...,n of + + c(k)*exp(i*(j-1)*(k-1)*2*pi/n) + + where i=sqrt(-1) + +wsave contains initialization calculations which must not be + destroyed between calls of subroutine cfftf or cfftb + +*/ diff --git a/pffft/fmv.h b/pffft/fmv.h new file mode 100644 index 0000000..0aa439d --- /dev/null +++ b/pffft/fmv.h @@ -0,0 +1,20 @@ +#ifndef FMV_H + +#if HAVE_FUNC_ATTRIBUTE_IFUNC +#if defined(__has_attribute) +#if __has_attribute(target_clones) +#if defined(__x86_64) + +// see https://gcc.gnu.org/wiki/FunctionMultiVersioning +#define PF_TARGET_CLONES __attribute__((target_clones("avx","sse4.2","sse3","sse2","sse","default"))) +#define HAVE_PF_TARGET_CLONES 1 +#endif +#endif +#endif +#endif + +#ifndef PF_TARGET_CLONES +#define PF_TARGET_CLONES +#endif + +#endif diff --git a/pffft/mingw-w32-i686.cmake b/pffft/mingw-w32-i686.cmake new file mode 100644 index 0000000..eecd236 --- /dev/null +++ b/pffft/mingw-w32-i686.cmake @@ -0,0 +1,25 @@ +# Sample toolchain file for building for Windows from an Ubuntu Linux system. +# +# Typical usage: +# *) install cross compiler: `sudo apt-get install mingw-w64` +# *) cd build +# *) cmake -DCMAKE_TOOLCHAIN_FILE=~/mingw-w32-i686.cmake .. +# +# build for Windows' 32 bit architecture + +set(CMAKE_SYSTEM_NAME Windows) +set(CMAKE_SYSTEM_PROCESSOR x86_64) +set(TOOLCHAIN_PREFIX i686-w64-mingw32) + +# cross compilers to use for C, C++ and Fortran +set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}-gcc) +set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}-g++) +set(CMAKE_RC_COMPILER ${TOOLCHAIN_PREFIX}-windres) + +# target environment on the build host system +set(CMAKE_FIND_ROOT_PATH /usr/${TOOLCHAIN_PREFIX}) + +# modify default behavior of FIND_XXX() commands +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) diff --git a/pffft/mingw-w64-x64_64.cmake b/pffft/mingw-w64-x64_64.cmake new file mode 100644 index 0000000..1ed08f0 --- /dev/null +++ b/pffft/mingw-w64-x64_64.cmake @@ -0,0 +1,25 @@ +# Sample toolchain file for building for Windows from an Ubuntu Linux system. +# +# Typical usage: +# *) install cross compiler: `sudo apt-get install mingw-w64` +# *) cd build +# *) cmake -DCMAKE_TOOLCHAIN_FILE=~/mingw-w64-x86_64.cmake .. +# +# build for Windows' 64 bit architecture + +set(CMAKE_SYSTEM_NAME Windows) +set(CMAKE_SYSTEM_PROCESSOR x86_64) +set(TOOLCHAIN_PREFIX x86_64-w64-mingw32) + +# cross compilers to use for C, C++ and Fortran +set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}-gcc) +set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}-g++) +set(CMAKE_RC_COMPILER ${TOOLCHAIN_PREFIX}-windres) + +# target environment on the build host system +set(CMAKE_FIND_ROOT_PATH /usr/${TOOLCHAIN_PREFIX}) + +# modify default behavior of FIND_XXX() commands +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) diff --git a/pffft/papi_perf_counter.h b/pffft/papi_perf_counter.h new file mode 100644 index 0000000..c8e7943 --- /dev/null +++ b/pffft/papi_perf_counter.h @@ -0,0 +1,97 @@ +#pragma once + +/* for measurement of CPU cycles .. 
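+ *
+ * rough usage sketch (assuming the translation unit is compiled with
+ * HAVE_PAPI defined and linked against libpapi):
+ *
+ *   {
+ *     papi_perf_counter counter(1);   // starts counting immediately
+ *     run_workload();                 // hypothetical code under test
+ *   }                                 // destructor prints the counters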
+ * + * requires + * sudo apt-get install libpapi-dev papi-tools + * on debian/ubuntu linux distributions + * + */ + +#ifdef HAVE_PAPI +#include +#endif + +#include + + +struct papi_perf_counter +{ + papi_perf_counter() + : realTime(0.0F), processTime(0.0F), instructions(0LL), ipc(0.0F) + , started(false), finished(false), print_at_destruction(false) + { } + + papi_perf_counter(int _start, bool print_at_destruction_ = true) + : print_at_destruction(print_at_destruction_) + { + (void)_start; + start(); + } + + ~papi_perf_counter() + { + if (print_at_destruction) + print(stderr); + } + + bool start() + { + static bool reported_start_error = false; +#ifdef HAVE_PAPI + int ret = PAPI_ipc(&realTime, &processTime, &instructions, &ipc); + if (ret && !reported_start_error) + { + reported_start_error = true; + fprintf(stderr, "papi_perf_counter::start(): PAPI_ipc() returned error %d\n", ret); + } +#else + if (!reported_start_error) + { + reported_start_error = true; + fprintf(stderr, "papi_perf_counter::start(): no HAVE_PAPI\n"); + } + int ret = 1; +#endif + started = (!ret); + finished = false; + return started; + } + + bool finish() + { + papi_perf_counter end(1, false); + if (started && !finished && end.started) + { + realTime = end.realTime - realTime; + processTime = end.processTime - processTime; + instructions = end.instructions - instructions; + ipc = end.ipc; + finished = true; + return true; + } + return false; + } + + void print(FILE *f = stdout) + { + if (started && !finished) + finish(); + if (!started || !finished) + return; + double cycles = instructions / ipc; + fprintf(f, "real %g, process %g, instructions %lld, ins/cycle %f => cycles %g\n" + , realTime, processTime, instructions, ipc, cycles + ); + started = false; + } + + float realTime; + float processTime; + long long instructions; + float ipc; + bool started; + bool finished; + bool print_at_destruction; +}; + diff --git a/pffft/pf_carrier.cpp b/pffft/pf_carrier.cpp new file mode 100644 index 0000000..d751a55 --- /dev/null +++ b/pffft/pf_carrier.cpp @@ -0,0 +1,298 @@ +/* +This software is part of pffft/pfdsp, a set of simple DSP routines. + +Copyright (c) 2014, Andras Retzler +Copyright (c) 2020 Hayati Ayguen + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* include own header first, to see missing includes */ +#include "pf_carrier.h" +#include "fmv.h" + +#include +#include + + +PF_TARGET_CLONES +void generate_dc_f(float* output, int size) +{ + for(int i=0;i<2*size;) + { + /* exp(i*0) = 1+i*0 */ + output[i++]=(127.0F / 128.0F); + output[i++]=0.0F; + } +} + +PF_TARGET_CLONES +void generate_dc_s16(short* output, int size) +{ + for(int i=0;i<2*size;) + { + /* exp(i*0) = 1+i*0 */ + output[i++]=SHRT_MAX; + output[i++]=0; + } +} + +PF_TARGET_CLONES +void generate_pos_fs4_f(float* output, int size) +{ + /* size must be multiple of 4 */ + assert(!(size&3)); + for(int i=0;i<2*size;) + { + /* exp(i*0) = 1+i*0 */ + output[i++]=(127.0F / 128.0F); + output[i++]=0.0F; + /* exp(i* +pi/2) = 0+i*1 */ + output[i++]=0.0F; + output[i++]=(127.0F / 128.0F); + /* exp(i* +pi) = -1+i*0 */ + output[i++]=(-127.0F / 128.0F); + output[i++]=0.0F; + /* exp(i* -pi/2) = 0+i*-1 */ + output[i++]=0.0F; + output[i++]=(-127.0F / 128.0F); + } +} + +PF_TARGET_CLONES +void generate_pos_fs4_s16(short* output, int size) +{ + /* size must be multiple of 4 */ + assert(!(size&3)); + for(int i=0;i<2*size;) + { + /* exp(i*0) = 1+i*0 */ + output[i++]=SHRT_MAX; + output[i++]=0; + /* exp(i* +pi/2) = 0+i*1 */ + output[i++]=0; + output[i++]=SHRT_MAX; + /* exp(i* +pi) = -1+i*0 */ + output[i++]=-SHRT_MAX; + output[i++]=0; + /* exp(i* -pi/2) = 0+i*-1 */ + output[i++]=0; + output[i++]=-SHRT_MAX; + } +} + +PF_TARGET_CLONES +void generate_neg_fs4_f(float* output, int size) +{ + /* size must be multiple of 4 */ + assert(!(size&3)); + for(int i=0;i<2*size;) + { + /* exp(i*0) = 1+i*0 */ + output[i++]=(127.0F / 128.0F); + output[i++]=0.0F; + /* exp(i* -pi/2) = 0+i*-1 */ + output[i++]=0.0F; + output[i++]=(-127.0F / 128.0F); + /* exp(i* +pi) = -1+i*0 */ + output[i++]=(-127.0F / 128.0F); + output[i++]=0.0F; + /* exp(i* +pi/2) = 0+i*1 */ + output[i++]=0.0F; + output[i++]=(127.0F / 128.0F); + } +} + +PF_TARGET_CLONES +void generate_neg_fs4_s16(short* output, int size) +{ + /* size must be multiple of 4 */ + assert(!(size&3)); + for(int i=0;i<2*size;) + { + /* exp(i*0) = 1+i*0 */ + output[i++]=SHRT_MAX; + output[i++]=0; + /* exp(i* -pi/2) = 0+i*-1 */ + output[i++]=0; + output[i++]=-SHRT_MAX; + /* exp(i* +pi) = -1+i*0 */ + output[i++]=-SHRT_MAX; + output[i++]=0; + /* exp(i* +pi/2) = 0+i*1 */ + output[i++]=0; + output[i++]=SHRT_MAX; + } +} + +/****************************************************/ + +PF_TARGET_CLONES +void generate_dc_pos_fs4_s16(short* output, int size) +{ + const int m = SHRT_MAX / 2; + /* size must be multiple of 4 */ + assert(!(size&3)); + for(int i=0;i<2*size;) + { + /* exp(i*0) = 1+1+i*0 */ + output[i++]=m+m; + output[i++]=0; + /* exp(i* +pi/2) = 1+0+i*1 */ + output[i++]=m+0; + output[i++]=m; + /* exp(i* +pi) = 1-1+i*0 */ + output[i++]=m-m; + output[i++]=0; + /* exp(i* -pi/2) = 1+0+i*-1 */ + output[i++]=m; + output[i++]=-m; + } +} + +PF_TARGET_CLONES +void generate_dc_neg_fs4_s16(short* output, int size) +{ + const int m = SHRT_MAX / 2; + /* size must be multiple of 4 */ + assert(!(size&3)); 
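+  /* each 4-sample period below is the sum of a DC term and a -fs/4
+     carrier, both at half of full scale (m = SHRT_MAX/2), so the
+     summed samples still fit the int16 range */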
+ for(int i=0;i<2*size;) + { + /* exp(i*0) = 1+1+i*0 */ + output[i++]=m+m; + output[i++]=0; + /* exp(i* -pi/2) = 1+0+i*-1 */ + output[i++]=m+0; + output[i++]=-m; + /* exp(i* +pi) = 1-1+i*0 */ + output[i++]=m-m; + output[i++]=0; + /* exp(i* +pi/2) = 1+0+i*1 */ + output[i++]=m+0; + output[i++]=m; + } +} + +PF_TARGET_CLONES +void generate_pos_neg_fs4_s16(short* output, int size) +{ + const int m = SHRT_MAX / 2; + /* size must be multiple of 4 */ + assert(!(size&3)); + for(int i=0;i<2*size;) + { + /* pos(0) + neg(0) = exp(i* 0 ) + exp(i* 0 ) = 1 +i* 0 + 1 +i* 0 */ + output[i++]=m; + output[i++]=-m; + + /* pos(1) + neg(1) = exp(i* +pi/2) + exp(i* -pi/2) = 0 +i* 1 + 0 +i* -1 */ + output[i++]=-m; + output[i++]=m; + + /* pos(2) + neg(2) = exp(i* +pi ) + exp(i* +pi ) = -1 +i* 0 + -1 +i* 0 */ + output[i++]=-m; + output[i++]=m; + + /* pos(3) + neg(3) = exp(i* -pi/2) + exp(i* +pi/2) = 0 +i* -1 + 0 +i* 1 */ + output[i++]=m; + output[i++]=-m; + } +} + +PF_TARGET_CLONES +void generate_dc_pos_neg_fs4_s16(short* output, int size) +{ + const int m = SHRT_MAX / 2; + /* size must be multiple of 4 */ + assert(!(size&3)); + for(int i=0;i<2*size;) + { + /* dc + pos(0) + neg(0) = dc + exp(i* 0 ) + exp(i* 0 ) = 1 +i* 0 + 1 +i* 0 */ + output[i++]=m+m; + output[i++]=-m; + + /* dc + pos(1) + neg(1) = dc + exp(i* +pi/2) + exp(i* -pi/2) = 0 +i* 1 + 0 +i* -1 */ + output[i++]=0; + output[i++]=m; + + /* dc + pos(2) + neg(2) = dc + exp(i* +pi ) + exp(i* +pi ) = -1 +i* 0 + -1 +i* 0 */ + output[i++]=0; + output[i++]=m; + + /* dc + pos(3) + neg(3) = dc + exp(i* -pi/2) + exp(i* +pi/2) = 0 +i* -1 + 0 +i* 1 */ + output[i++]=m+m; + output[i++]=-m; + } +} + + +PF_TARGET_CLONES +void generate_pos_neg_fs2_s16(short* output, int size) +{ + const int m = SHRT_MAX / 2; + /* size must be multiple of 4 */ + assert(!(size&3)); + for(int i=0;i<2*size;) + { + /* dc + exp(i* 0 ) = +1 */ + output[i++]=m; + output[i++]=0; + /* dc + exp(i* pi) = -1 */ + output[i++]=-m; + output[i++]=0; + /* dc + exp(i* 0 ) = +1 */ + output[i++]=m; + output[i++]=0; + /* dc + exp(i* pi) = -1 */ + output[i++]=-m; + output[i++]=0; + } +} + +PF_TARGET_CLONES +void generate_dc_pos_neg_fs2_s16(short* output, int size) +{ + const int m = SHRT_MAX / 2; + /* size must be multiple of 4 */ + assert(!(size&3)); + for(int i=0;i<2*size;) + { + /* with dc = i*1 */ + /* dc + exp(i* 0 ) = i*1 +1 */ + output[i++]=m; + output[i++]=m; + /* dc + exp(i* pi) = i*1 -1 */ + output[i++]=-m; + output[i++]=m; + /* dc + exp(i* 0 ) = i*1 +1 */ + output[i++]=m; + output[i++]=m; + /* dc + exp(i* pi) = i*1 -1 */ + output[i++]=-m; + output[i++]=m; + } +} + + diff --git a/pffft/pf_carrier.h b/pffft/pf_carrier.h new file mode 100644 index 0000000..c328ce0 --- /dev/null +++ b/pffft/pf_carrier.h @@ -0,0 +1,75 @@ +/* +This software is part of pffft/pfdsp, a set of simple DSP routines. + +Copyright (c) 2014, Andras Retzler +Copyright (c) 2020 Hayati Ayguen +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + _____ _ + / ____| | | + | | ___ _ __ ___ _ __ | | _____ __ + | | / _ \| '_ ` _ \| '_ \| |/ _ \ \/ / + | |___| (_) | | | | | | |_) | | __/> < + \_____\___/|_| |_| |_| .__/|_|\___/_/\_\ + | | + |_| +*/ + +typedef struct complexf_s { float i; float q; } complexf; + + +/* generation functions */ +void generate_dc_f(float* output, int size); +void generate_dc_s16(short* output, int size); +void generate_pos_fs4_f(float* output, int size); +void generate_pos_fs4_s16(short* output, int size); +void generate_neg_fs4_f(float* output, int size); +void generate_neg_fs4_s16(short* output, int size); + +void generate_dc_pos_fs4_s16(short* output, int size); +void generate_dc_neg_fs4_s16(short* output, int size); +void generate_pos_neg_fs4_s16(short* output, int size); +void generate_dc_pos_neg_fs4_s16(short* output, int size); + +void generate_pos_neg_fs2_s16(short* output, int size); +void generate_dc_pos_neg_fs2_s16(short* output, int size); + + +#ifdef __cplusplus +} +#endif + diff --git a/pffft/pf_cic.cpp b/pffft/pf_cic.cpp new file mode 100644 index 0000000..2362853 --- /dev/null +++ b/pffft/pf_cic.cpp @@ -0,0 +1,255 @@ +/* +This software is part of pffft/pfdsp, a set of simple DSP routines. + +Copyright (c) 2014, Andras Retzler +Copyright (c) 2020 Hayati Ayguen +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* gcc requires this for M_PI !? */ +#undef __STRICT_ANSI__ + +/* include own header first, to see missing includes */ +#include "pf_cic.h" +#include "fmv.h" + +#include +#include +#include +#include + + +/* + ____ ___ ____ ____ ____ ____ + / ___|_ _/ ___| | _ \| _ \ / ___| + | | | | | | | | | | | | | + | |___ | | |___ | |_| | |_| | |___ + \____|___\____| |____/|____/ \____| +*/ + +#define SINESHIFT 12 +#define SINESIZE (1<factor = factor; + s->gain = 1.0f / SHRT_MAX / sineamp / factor / factor / factor; // compensate for gain of 3 integrators + + s->sinetable = (int16_t *)malloc(sinesize2 * sizeof(*s->sinetable)); + double f = 2.0 * M_PI / (double)SINESIZE; + for(i = 0; i < sinesize2; i++) { + s->sinetable[i] = sineamp * cos(f * i); + } + return s; +} + +void cicddc_free(void *state) { + cicddc_t *s = (cicddc_t *)state; + free(s->sinetable); + free(s); +} + + +PF_TARGET_CLONES +void cicddc_s16_c(void *state, int16_t *input, complexf *output, int outsize, float rate) { + cicddc_t *s = (cicddc_t *)state; + int k; + int factor = s->factor; + cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b; + cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b; + uint64_t phase = s->phase, freq; + int16_t *sinetable = s->sinetable; + float gain = s->gain; + + freq = rate * ((float)(1ULL << 63) * 2); + + int16_t *inp = input; + for(k = 0; k < outsize; k++) { + int i; + cic_dt out0a, out0b, out1a, out1b; + cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum + for(i = 0; i < factor; i++) { + cic_dt in_a, in_b; + int sinep = phase >> (64-SINESHIFT); + in_a = (int32_t)inp[i] * (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))]; + in_b = (int32_t)inp[i] * (int32_t)sinetable[sinep]; + phase += freq; + /* integrators: + The calculations are ordered so that each integrator + takes a result from previous loop iteration + to make the code more "pipeline-friendly". 
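+         In natural order the per-sample update would read
+             ig0 += in;  ig1 += ig0;  ig2 += ig1;
+         here each stage instead consumes the value its predecessor
+         produced in the previous iteration.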
*/ + ig2a += ig1a; ig2b += ig1b; + ig1a += ig0a; ig1b += ig0b; + ig0a += in_a; ig0b += in_b; + } + inp += factor; + // comb filters: + out0a = ig2a - comb0a; out0b = ig2b - comb0b; + comb0a = ig2a; comb0b = ig2b; + out1a = out0a - comb1a; out1b = out0b - comb1b; + comb1a = out0a; comb1b = out0b; + + output[k].i = (float)out1a * gain; + output[k].q = (float)out1b * gain; + } + + s->ig0a = ig0a; s->ig0b = ig0b; + s->ig1a = ig1a; s->ig1b = ig1b; + s->comb0a = comb0a; s->comb0b = comb0b; + s->comb1a = comb1a; s->comb1b = comb1b; + s->phase = phase; +} + +PF_TARGET_CLONES +void cicddc_cs16_c(void *state, int16_t *input, complexf *output, int outsize, float rate) { + cicddc_t *s = (cicddc_t *)state; + int k; + int factor = s->factor; + cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b; + cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b; + uint64_t phase = s->phase, freq; + int16_t *sinetable = s->sinetable; + float gain = s->gain; + + freq = rate * ((float)(1ULL << 63) * 2); + + int16_t *inp = input; + for(k = 0; k < outsize; k++) { + int i; + cic_dt out0a, out0b, out1a, out1b; + cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum + for(i = 0; i < factor; i++) { + cic_dt in_a, in_b; + int32_t m_a, m_b, m_c, m_d; + int sinep = phase >> (64-SINESHIFT); + m_a = inp[2*i]; + m_b = inp[2*i+1]; + m_c = (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))]; + m_d = (int32_t)sinetable[sinep]; + // complex multiplication: + in_a = m_a*m_c - m_b*m_d; + in_b = m_a*m_d + m_b*m_c; + phase += freq; + /* integrators: + The calculations are ordered so that each integrator + takes a result from previous loop iteration + to make the code more "pipeline-friendly". */ + ig2a += ig1a; ig2b += ig1b; + ig1a += ig0a; ig1b += ig0b; + ig0a += in_a; ig0b += in_b; + } + inp += 2*factor; + // comb filters: + out0a = ig2a - comb0a; out0b = ig2b - comb0b; + comb0a = ig2a; comb0b = ig2b; + out1a = out0a - comb1a; out1b = out0b - comb1b; + comb1a = out0a; comb1b = out0b; + + output[k].i = (float)out1a * gain; + output[k].q = (float)out1b * gain; + } + + s->ig0a = ig0a; s->ig0b = ig0b; + s->ig1a = ig1a; s->ig1b = ig1b; + s->comb0a = comb0a; s->comb0b = comb0b; + s->comb1a = comb1a; s->comb1b = comb1b; + s->phase = phase; +} + + +/* This is almost copy paste from cicddc_cs16_c. + I'm afraid this is going to be annoying to maintain... 
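+   The only intended differences are the uint8_t input samples and the
+   removal of a fixed DC offset (roughly 127.4, scaled by 256) before
+   the mixing stage.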
*/ +PF_TARGET_CLONES +void cicddc_cu8_c(void *state, uint8_t *input, complexf *output, int outsize, float rate) { + cicddc_t *s = (cicddc_t *)state; + int k; + int factor = s->factor; + cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b; + cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b; + uint64_t phase = s->phase, freq; + int16_t *sinetable = s->sinetable; + float gain = s->gain; + + freq = rate * ((float)(1ULL << 63) * 2); + + uint8_t *inp = input; + for(k = 0; k < outsize; k++) { + int i; + cic_dt out0a, out0b, out1a, out1b; + cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum + for(i = 0; i < factor; i++) { + cic_dt in_a, in_b; + int32_t m_a, m_b, m_c, m_d; + int sinep = phase >> (64-SINESHIFT); + // subtract 127.4 (good for rtl-sdr) + m_a = (((int32_t)inp[2*i]) << 8) - 32614; + m_b = (((int32_t)inp[2*i+1]) << 8) - 32614; + m_c = (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))]; + m_d = (int32_t)sinetable[sinep]; + // complex multiplication: + in_a = m_a*m_c - m_b*m_d; + in_b = m_a*m_d + m_b*m_c; + phase += freq; + /* integrators: + The calculations are ordered so that each integrator + takes a result from previous loop iteration + to make the code more "pipeline-friendly". */ + ig2a += ig1a; ig2b += ig1b; + ig1a += ig0a; ig1b += ig0b; + ig0a += in_a; ig0b += in_b; + } + inp += 2*factor; + // comb filters: + out0a = ig2a - comb0a; out0b = ig2b - comb0b; + comb0a = ig2a; comb0b = ig2b; + out1a = out0a - comb1a; out1b = out0b - comb1b; + comb1a = out0a; comb1b = out0b; + + output[k].i = (float)out1a * gain; + output[k].q = (float)out1b * gain; + } + + s->ig0a = ig0a; s->ig0b = ig0b; + s->ig1a = ig1a; s->ig1b = ig1b; + s->comb0a = comb0a; s->comb0b = comb0b; + s->comb1a = comb1a; s->comb1b = comb1b; + s->phase = phase; +} + diff --git a/pffft/pf_cic.h b/pffft/pf_cic.h new file mode 100644 index 0000000..681ee4f --- /dev/null +++ b/pffft/pf_cic.h @@ -0,0 +1,58 @@ +/* +This software is part of pffft/pfdsp, a set of simple DSP routines. + +Copyright (c) 2014, Andras Retzler +Copyright (c) 2020 Hayati Ayguen +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + ____ ___ ____ ____ ____ ____ + / ___|_ _/ ___| | _ \| _ \ / ___| + | | | | | | | | | | | | | + | |___ | | |___ | |_| | |_| | |___ + \____|___\____| |____/|____/ \____| +*/ + +typedef struct complexf_s { float i; float q; } complexf; + +void *cicddc_init(int factor); +void cicddc_free(void *state); +void cicddc_s16_c(void *state, int16_t *input, complexf *output, int outsize, float rate); +void cicddc_cs16_c(void *state, int16_t *input, complexf *output, int outsize, float rate); +void cicddc_cu8_c(void *state, uint8_t *input, complexf *output, int outsize, float rate); + +#ifdef __cplusplus +} +#endif + diff --git a/pffft/pf_conv.cpp b/pffft/pf_conv.cpp new file mode 100644 index 0000000..45e56d5 --- /dev/null +++ b/pffft/pf_conv.cpp @@ -0,0 +1,322 @@ + +#include "pf_conv.h" + +#include +#include + +#include + +#if 0 +#include + +#define DPRINT(...) fprintf(stderr, __VA_ARGS__) + +#else +#define DPRINT(...) do { } while (0) +#endif + + +#ifdef HAVE_MIPP +#include +#endif + + +#ifndef CONV_ARCH_POST +#error CONV_ARCH_POST not defined +#endif + +#define PP_STRINGIFY(X) #X +#define PP_TOSTRING(X) PP_STRINGIFY(X) +#define PP_CONCAT_IMPL(x, y) x##y +#define PP_CONCAT(x, y) PP_CONCAT_IMPL( x, y ) + +#define ARCHFUNCNAME(X) PP_CONCAT(X##_,CONV_ARCH_POST) + + +const char * ARCHFUNCNAME(id)() +{ + return PP_TOSTRING(CONV_ARCH_POST); +} + + +int ARCHFUNCNAME(conv_float_simd_size)() +{ +#if defined(MIPP_NO_INTRINSICS) || !defined(HAVE_MIPP) + // have a completely MIPP independent implementation + return 1; +#else + return mipp::N(); +#endif +} + + +void ARCHFUNCNAME(conv_float_move_rest)(float * RESTRICT s, conv_buffer_state * RESTRICT state) +{ + int R = state->size - state->offset; // this many samples from prev conv_float were not processed + if (R > 0) + { + // memmove(s, &s[state->offset], R * sizeof(s[0])); // move them to the begin + std::copy(&s[state->offset], &s[state->size], s); + } + else + R = 0; + state->offset = 0; // data - to be processed - is at begin + state->size = R; // this many unprocessed samples +} + + +void ARCHFUNCNAME(conv_cplx_move_rest)(complexf * RESTRICT s, conv_buffer_state * RESTRICT state) +{ + int R = state->size - state->offset; // this many samples from prev conv_float were not processed + if (R > 0) + { + // memmove(s, &s[state->offset], R * sizeof(s[0])); // move them to the begin + std::copy(&s[state->offset], &s[state->size], s); + } + else + R = 0; + state->offset = 0; // data - to be processed - is at begin + state->size = R; // this many unprocessed samples +} + + +#if defined(MIPP_NO_INTRINSICS) +// have a completely MIPP independent implementation +// #error missing HAVE_MIPP: there is no MIPP-independent implementation + +int ARCHFUNCNAME(conv_float_inplace)( + float * RESTRICT s, conv_buffer_state * RESTRICT state, + const float * RESTRICT filter, const int sz_filter + ) +{ + const int off0 = state->offset; + const int sz_s = state->size; + int offset; + + for ( offset = off0; 
offset + sz_filter <= sz_s; ++offset) + { + float accu = 0.0F; + for (int k = 0; k < sz_filter; ++k) + accu += s[offset+k] * filter[k]; + s[offset] = accu; + } + + state->offset = offset; + return offset - off0; +} + + +int ARCHFUNCNAME(conv_float_oop)( + const float * RESTRICT s, conv_buffer_state * RESTRICT state, + const float * RESTRICT filter, const int sz_filter, + float * RESTRICT y + ) +{ + const int off0 = state->offset; + const int sz_s = state->size; + int offset; + + for ( offset = off0; offset + sz_filter <= sz_s; ++offset) + { + float accu = 0.0F; + for (int k = 0; k < sz_filter; ++k) + accu += s[offset+k] * filter[k]; + y[offset] = accu; + } + + state->offset = offset; + return offset - off0; +} + + +int ARCHFUNCNAME(conv_cplx_float_oop)( + const complexf * RESTRICT s_cplx, conv_buffer_state * RESTRICT state, + const float * RESTRICT filter, const int sz_filter, + complexf * RESTRICT y_cplx + ) +{ + const int off0 = state->offset; + const int sz_s = state->size; + const int sz_f = sz_filter; + int offset; + + for ( offset = off0; offset + sz_f <= sz_s; ++offset) + { + float accu_re = 0.0F; + float accu_im = 0.0F; + for (int k = 0; k < sz_filter; ++k) + { + accu_re = s_cplx[offset+k].i * filter[k]; // accu += rS * rH; + accu_im = s_cplx[offset+k].q * filter[k]; // accu += rS * rH; + } + y_cplx[offset].i = accu_re; // == hadd() == sum of real parts + y_cplx[offset].q = accu_im; // == hadd() == sum of imag parts + } + + state->offset = offset; + return offset - off0; +} + + +#elif defined(HAVE_MIPP) + + +int ARCHFUNCNAME(conv_float_inplace)( + float * RESTRICT s, conv_buffer_state * RESTRICT state, + const float * RESTRICT filter, const int sz_filter + ) +{ + assert( (sz_filter % mipp::N()) == 0 ); // size of filter must be divisible by conv_float_simd_size() + + mipp::Reg accu, rS, rH; + const int off0 = state->offset; + const int sz_s = state->size; + int offset; + + for ( offset = off0; offset + sz_filter <= sz_s; ++offset) + { + accu.set0(); + for (int k = 0; k < sz_filter; k += mipp::N()) + { + rS.load(&s[offset+k]); + rH.load(&filter[k]); + accu = mipp::fmadd(rS, rH, accu); // accu += rS * rH; + } + s[offset] = accu.sum(); // == hadd() + } + + state->offset = offset; + return offset - off0; +} + + +int ARCHFUNCNAME(conv_float_oop)( + const float * RESTRICT s, conv_buffer_state * RESTRICT state, + const float * RESTRICT filter, const int sz_filter, + float * RESTRICT y + ) +{ + assert( (sz_filter % mipp::N()) == 0 ); // size of filter must be divisible by conv_float_simd_size() + + mipp::Reg accu, rS, rH; + const int off0 = state->offset; + const int sz_s = state->size; + int offset; + + for ( offset = off0; offset + sz_filter <= sz_s; ++offset) + { + accu.set0(); + for (int k = 0; k < sz_filter; k += mipp::N()) + { + rS.loadu(&s[offset+k]); + rH.load(&filter[k]); + accu = mipp::fmadd(rS, rH, accu); // accu += rS * rH; + } + y[offset] = accu.sum(); // == hadd() + } + + state->offset = offset; + return offset - off0; +} + + +int ARCHFUNCNAME(conv_cplx_float_oop)( + const complexf * RESTRICT s_cplx, conv_buffer_state * RESTRICT state, + const float * RESTRICT filter, const int sz_filter, + complexf * RESTRICT y_cplx + ) +{ + assert( (sz_filter % mipp::N()) == 0 ); // size of filter must be divisible by conv_float_simd_size() + const float * RESTRICT s = &(s_cplx[0].i); + float * RESTRICT y = &(y_cplx[0].i); + + mipp::Regx2 accu_x2, rS_x2, H_x2; + const int off0 = 2 * state->offset; + const int sz_s = 2 * state->size; + const int sz_f2 = 2 * sz_filter; + int offset; + + for ( 
offset = off0; offset + sz_f2 <= sz_s; offset += 2) + { + accu_x2.val[0].set0(); + accu_x2.val[1].set0(); + for (int k = 0; k < sz_filter; k += mipp::N()) + { + mipp::Reg rH; + rS_x2.loadu(&s[offset+2*k]); + rH.load(&filter[k]); + H_x2 = mipp::interleave(rH, rH); + accu_x2.val[0] = mipp::fmadd(rS_x2.val[0], H_x2.val[0], accu_x2.val[0]); // accu += rS * rH; + accu_x2.val[1] = mipp::fmadd(rS_x2.val[1], H_x2.val[1], accu_x2.val[1]); // accu += rS * rH; + } + H_x2 = mipp::deinterleave(accu_x2); + y[offset] = H_x2.val[0].sum(); // == hadd() == sum of real parts + y[offset+1] = H_x2.val[1].sum(); // == hadd() == sum of imag parts + } + + state->offset = offset /2; + return (offset - off0) / 2; +} + +#endif + + +static const conv_f_ptrs conv_ptrs = +{ + PP_TOSTRING(CONV_ARCH_POST), +#ifndef MIPP_NO_INTRINSICS + 1, +#else + 0, +#endif + + ARCHFUNCNAME(id), + ARCHFUNCNAME(conv_float_simd_size), + +#if defined(MIPP_NO_INTRINSICS) || defined(HAVE_MIPP) + ARCHFUNCNAME(conv_float_move_rest), + ARCHFUNCNAME(conv_float_inplace), + ARCHFUNCNAME(conv_float_oop), + + ARCHFUNCNAME(conv_cplx_move_rest), + ARCHFUNCNAME(conv_cplx_float_oop) +#else + nullptr, + nullptr, + nullptr, + + nullptr, + nullptr +#endif +}; + + +const conv_f_ptrs* ARCHFUNCNAME(conv_ptrs)() +{ + DPRINT("arch pointer for '%s':\n", conv_ptrs.id); + if (!strcmp(conv_ptrs.id, "none")) + return &conv_ptrs; + +#if defined(MIPP_NO_INTRINSICS) + DPRINT("arch pointer for '%s' - BUT defined(MIPP_NO_INTRINSICS)\n", conv_ptrs.id); + return &conv_ptrs; +#elif defined(HAVE_MIPP) + DPRINT("arch pointer for '%s' - defined(HAVE_MIPP)\n", conv_ptrs.id); + DPRINT("'%s': conv_ptrs.using_mipp %d\n", conv_ptrs.id, conv_ptrs.using_mipp); + DPRINT("'%s': simd_size() %d\n", conv_ptrs.id, conv_ptrs.fp_conv_float_simd_size()); + if (conv_ptrs.using_mipp && conv_ptrs.fp_conv_float_simd_size() > 1) + return &conv_ptrs; + else + DPRINT("arch pointer for '%s': HAVE_MIPP BUT using_mipp %d, float_simd_size %d\n", conv_ptrs.id, conv_ptrs.using_mipp, conv_ptrs.fp_conv_float_simd_size()); +#else + DPRINT("arch pointer for '%s': neither MIPP_NO_INTRINSICS nor HAVE_MIPP\n", conv_ptrs.id); +#endif + DPRINT("arch pointer for '%s' => nullptr\n", conv_ptrs.id); + return nullptr; +} + +#if defined(__cplusplus) && (__cplusplus >= 201703L) +[[maybe_unused]] +#endif +static f_conv_ptrs test_f_ptrs = ARCHFUNCNAME(conv_ptrs); + diff --git a/pffft/pf_conv.h b/pffft/pf_conv.h new file mode 100644 index 0000000..0194b98 --- /dev/null +++ b/pffft/pf_conv.h @@ -0,0 +1,109 @@ +#pragma once + +/* pf_conv.h/.cpp implements linear "slow" convolution. + * this code is primarily for test/demonstration of runtime dispatching. + * each "kernel" is compiled with different compiler/architecture options, + * that activates different implementations in the MIPP headers. + * + * the dispatcher library 'pf_conv_dispatcher' collects (links agains) + * all the pf_conv_arch_ libraries .. + * and provides the get_all_conv_arch_ptrs() function, + * which delivers an array of pointers to the struct (conv_f_ptrs) + * containing the function pointers for the different implementations. + * + * requirement(s): + * - installed MIPP headers + * - compiler definitions for the different architecture types: + * see CMakeLists.txt CONV_ARCH_MSVC_AMD64, CONV_ARCH_GCC_ARM32NEON, .. + * - one cmake library target pf_conv_arch_ for each architecture option. 
+ * each one gets it's specific architecture/compiler options + * utilizing the target_set_cxx_arch_option() macro in the CMakeLists.txt + */ + +#include "pf_cplx.h" + +#if defined(_MSC_VER) +# define RESTRICT __restrict +#elif defined(__GNUC__) +# define RESTRICT __restrict +#else +# define RESTRICT +#endif + + +struct conv_buffer_state +{ + int offset; // sample index where data (to process) starts + int size; // actual - or previous - size in amount of samples from buffer start (NOT offset) +}; + +// declare provided function pointer types + +typedef const char * (*f_conv_id)(); + +typedef int (*f_conv_float_simd_size)(); + +typedef void (*f_conv_float_move_rest)(float * RESTRICT s, conv_buffer_state * RESTRICT state); +typedef void (*f_conv_cplx_move_rest)(complexf * RESTRICT s, conv_buffer_state * RESTRICT state); + +typedef int (*f_conv_float_inplace)( + float * RESTRICT s, conv_buffer_state * RESTRICT state, + const float * RESTRICT filter, const int sz_filter + ); + +typedef int (*f_conv_float_oop)( + const float * RESTRICT s, conv_buffer_state * RESTRICT state, + const float * RESTRICT filter, const int sz_filter, + float * RESTRICT y + ); + +typedef int (*f_conv_cplx_float_oop)( + const complexf * RESTRICT s, conv_buffer_state * RESTRICT state, + const float * RESTRICT filter, const int sz_filter, + complexf * RESTRICT y + ); + + +// struct with the provided function pointers +struct conv_f_ptrs +{ + const char * id; + const int using_mipp; + f_conv_id fp_id; + f_conv_float_simd_size fp_conv_float_simd_size; + + f_conv_float_move_rest fp_conv_float_move_rest; + f_conv_float_inplace fp_conv_float_inplace; + f_conv_float_oop fp_conv_float_oop; + + f_conv_cplx_move_rest fp_conv_cplx_move_rest; + f_conv_cplx_float_oop fp_conv_cplx_float_oop; +}; + +typedef const conv_f_ptrs * ptr_to_conv_f_ptrs; + +// function pointer type, delivering the struct with the function pointers +typedef const conv_f_ptrs* (*f_conv_ptrs)(); + + +// helper for systematic function names +#define CONV_FN_ARCH(FN, ARCH) FN##_##ARCH + +// declare all functions - returning the structs with the function pointers +extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, none)(); // = conv_ptrs_none() +extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, dflt)(); // simd / mipp is activated + +extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, sse3)(); // = conv_ptrs_sse3() +extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, sse4)(); +extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx)(); +extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx2)(); + +extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, sse2)(); +//extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx)(); // already declared +//extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx2)(); // already declared + +extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, neon_vfpv4)(); // for armv7l / 32-bit ARM +extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, neon_rpi3_a53)(); +extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, neon_rpi4_a72)(); + +extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, armv8a)(); // for aarch64 diff --git a/pffft/pf_conv_dispatcher.cpp b/pffft/pf_conv_dispatcher.cpp new file mode 100644 index 0000000..8a5f725 --- /dev/null +++ b/pffft/pf_conv_dispatcher.cpp @@ -0,0 +1,61 @@ + +#include "pf_conv_dispatcher.h" + +#if 0 +#include + +#define DPRINT(...) fprintf(stderr, __VA_ARGS__) + +#else +#define DPRINT(...) 
do { } while (0) +#endif + + +#define N_DEFAULT_ARCHES 2 +// 0 is "none" +// 1 "dflt" + +ptr_to_conv_f_ptrs * get_all_conv_arch_ptrs(int * p_num_arch) +{ + static ptr_to_conv_f_ptrs * all_arches = nullptr; + static int n_arch = 0; + if (!all_arches) + { + n_arch = N_DEFAULT_ARCHES; + // @TODO: runtime check if actual CPU supports specific architecture +#if defined(CONV_ARCH_GCC_AMD64) + static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+4] = {0}; + DPRINT("CONV_ARCH_GCC_AMD64: sse3, sse4, avx, avx2\n"); + conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, sse3)(); + conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, sse4)(); + conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx) (); + conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx2)(); +#elif defined(CONV_ARCH_MSVC_AMD64) + static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+3] = {0}; + DPRINT("CONV_ARCH_MSVC_AMD64: sse2, avx, avx2\n"); + conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, sse2)(); + conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx) (); + conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx2)(); +#elif defined(CONV_ARCH_GCC_ARM32NEON) + static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+3] = {0}; + DPRINT("CONV_ARCH_GCC_ARM32NEON: neon_vfpv4, neon_rpi3_a53\n"); + conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, neon_vfpv4)(); + conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, neon_rpi3_a53)(); + conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, neon_rpi4_a72)(); +#elif defined(CONV_ARCH_GCC_AARCH64) + static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+1] = {0}; + DPRINT("CONV_ARCH_GCC_AARCH64: -\n"); + conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, armv8a)(); +#else + static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES] = {0}; + DPRINT("unknown CONV_ARCH: -\n"); +#endif + conv_arch_ptrs[0] = CONV_FN_ARCH(conv_ptrs, none)(); + conv_arch_ptrs[1] = CONV_FN_ARCH(conv_ptrs, dflt)(); + all_arches = conv_arch_ptrs; + } + if (p_num_arch) + *p_num_arch = n_arch; + return all_arches; +} + diff --git a/pffft/pf_conv_dispatcher.h b/pffft/pf_conv_dispatcher.h new file mode 100644 index 0000000..eb70d5e --- /dev/null +++ b/pffft/pf_conv_dispatcher.h @@ -0,0 +1,6 @@ +#pragma once + +#include "pf_conv.h" + +ptr_to_conv_f_ptrs * get_all_conv_arch_ptrs(int * p_num_arch); + diff --git a/pffft/pf_cplx.h b/pffft/pf_cplx.h new file mode 100644 index 0000000..61d8486 --- /dev/null +++ b/pffft/pf_cplx.h @@ -0,0 +1,44 @@ +/* +This software is part of pffft/pfdsp, a set of simple DSP routines. + +Copyright (c) 2020 Hayati Ayguen +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +/* + _____ _ + / ____| | | + | | ___ _ __ ___ _ __ | | _____ __ + | | / _ \| '_ ` _ \| '_ \| |/ _ \ \/ / + | |___| (_) | | | | | | |_) | | __/> < + \_____\___/|_| |_| |_| .__/|_|\___/_/\_\ + | | + |_| +*/ + +typedef struct complexf_s { float i; float q; } complexf; + diff --git a/pffft/pf_mixer.cpp b/pffft/pf_mixer.cpp new file mode 100644 index 0000000..504e059 --- /dev/null +++ b/pffft/pf_mixer.cpp @@ -0,0 +1,1148 @@ +/* +This software is part of pffft/pfdsp, a set of simple DSP routines. + +Copyright (c) 2014, Andras Retzler +Copyright (c) 2020 Hayati Ayguen +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/* include own header first, to see missing includes */ +#include "pf_mixer.h" +#include "fmv.h" + +#include +#include +#include + +//they dropped M_PI in C99, so we define it: +#define PI ((float)3.14159265358979323846) + +//apply to pointers: +#define iof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i))) +#define qof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i)+1)) + +#define USE_ALIGNED_ADDRESSES 0 + + + +/* + _____ _____ _____ __ _ _ + | __ \ / ____| __ \ / _| | | (_) + | | | | (___ | |__) | | |_ _ _ _ __ ___| |_ _ ___ _ __ ___ + | | | |\___ \| ___/ | _| | | | '_ \ / __| __| |/ _ \| '_ \/ __| + | |__| |____) | | | | | |_| | | | | (__| |_| | (_) | | | \__ \ + |_____/|_____/|_| |_| \__,_|_| |_|\___|\__|_|\___/|_| |_|___/ + +*/ + + +#if defined(__GNUC__) +# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) +# define RESTRICT __restrict +#elif defined(_MSC_VER) +# define ALWAYS_INLINE(return_type) __forceinline return_type +# define RESTRICT __restrict +#endif + + +#ifndef PFFFT_SIMD_DISABLE +#if (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86)) + #pragma message("Manual SSE x86/x64 optimizations are ON") + #include + #define HAVE_SSE_INTRINSICS 1 + +#elif defined(PFFFT_ENABLE_NEON) && defined(__arm__) + #pragma message "Manual NEON (arm32) optimizations are ON" + #include "sse2neon.h" + #define HAVE_SSE_INTRINSICS 1 + +#elif defined(PFFFT_ENABLE_NEON) && defined(__aarch64__) + #pragma message "Manual NEON (aarch64) optimizations are ON" + #include "sse2neon.h" + #define HAVE_SSE_INTRINSICS 1 + +#endif +#endif + +#ifdef HAVE_SSE_INTRINSICS + +typedef __m128 v4sf; +# define SIMD_SZ 4 + +typedef union v4_union { + __m128 v; + float f[4]; +} v4_union; + +#define VMUL(a,b) _mm_mul_ps(a,b) +#define VDIV(a,b) _mm_div_ps(a,b) +#define VADD(a,b) _mm_add_ps(a,b) +#define VSUB(a,b) _mm_sub_ps(a,b) +#define LD_PS1(s) _mm_set1_ps(s) +#define VLOAD_UNALIGNED(ptr) _mm_loadu_ps((const float *)(ptr)) +#define VLOAD_ALIGNED(ptr) _mm_load_ps((const float *)(ptr)) +#define VSTORE_UNALIGNED(ptr, v) _mm_storeu_ps((float*)(ptr), v) +#define VSTORE_ALIGNED(ptr, v) _mm_store_ps((float*)(ptr), v) +#define INTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; } +#define UNINTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } + +#if USE_ALIGNED_ADDRESSES + #define VLOAD(ptr) _mm_load_ps((const float *)(ptr)) + #define VSTORE(ptr, v) _mm_store_ps((float*)(ptr), v) +#else + #define VLOAD(ptr) _mm_loadu_ps((const float *)(ptr)) + #define VSTORE(ptr, v) _mm_storeu_ps((float*)(ptr), v) +#endif + + +int have_sse_shift_mixer_impl() +{ + return 1; +} + +#else + +int have_sse_shift_mixer_impl() +{ + return 0; +} + +#endif + + +/*********************************************************************/ + +/**************/ +/*** ALGO A ***/ +/**************/ + +PF_TARGET_CLONES +float shift_math_cc(const complexf *input, complexf* output, int input_size, float rate, float starting_phase) +{ + rate*=2; + //Shifts the complex spectrum. Basically a complex mixer. This version uses cmath. 
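+    //conceptually: every output sample is the input sample multiplied by
+    //exp(j*phase), with phase advancing by rate*PI per sample (note the
+    //rate*=2 just above); the final phase is returned so the caller can
+    //continue the next block seamlessly.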
+ float phase=starting_phase; + float phase_increment=rate*PI; + float cosval, sinval; + for(int i=0;i2*PI) phase-=2*PI; //@shift_math_cc: normalize phase + while(phase<0) phase+=2*PI; + } + return phase; +} + +/*********************************************************************/ + +/**************/ +/*** ALGO B ***/ +/**************/ + +shift_table_data_t shift_table_init(int table_size) +{ + shift_table_data_t output; + output.table=(float*)malloc(sizeof(float)*table_size); + output.table_size=table_size; + for(int i=0;i1)?-1:1; //in quadrant 2 and 3 + cos_sign=(quadrant&&quadrant<3)?-1:1; //in quadrant 1 and 2 + sinval=sin_sign*table_data.table[sin_index]; + cosval=cos_sign*table_data.table[cos_index]; + //we multiply two complex numbers. + //how? enter this to maxima (software) for explanation: + // (a+b*%i)*(c+d*%i), rectform; + iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i); + qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i); + phase+=phase_increment; + while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase + while(phase<0) phase+=2*PI; + } + return phase; +} + +/*********************************************************************/ + +/**************/ +/*** ALGO C ***/ +/**************/ + +shift_addfast_data_t shift_addfast_init(float rate) +{ + shift_addfast_data_t output; + output.phase_increment=2*rate*PI; + for(int i=0;i<4;i++) + { + output.dsin[i]=sinf(output.phase_increment*(i+1)); + output.dcos[i]=cosf(output.phase_increment*(i+1)); + } + return output; +} + +#define SADF_L1(j) \ + cos_vals_ ## j = cos_start * dcos_ ## j - sin_start * dsin_ ## j; \ + sin_vals_ ## j = sin_start * dcos_ ## j + cos_start * dsin_ ## j; +#define SADF_L2(j) \ + iof(output,4*i+j)=(cos_vals_ ## j)*iof(input,4*i+j)-(sin_vals_ ## j)*qof(input,4*i+j); \ + qof(output,4*i+j)=(sin_vals_ ## j)*iof(input,4*i+j)+(cos_vals_ ## j)*qof(input,4*i+j); + +PF_TARGET_CLONES +float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase) +{ + //input_size should be multiple of 4 + //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size); + float cos_start=cosf(starting_phase); + float sin_start=sinf(starting_phase); + float cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3, + sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3, + dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3], + dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3]; + + for(int i=0;iphase_increment; + while(starting_phase>PI) starting_phase-=2*PI; + while(starting_phase<-PI) starting_phase+=2*PI; + return starting_phase; +} + +#undef SADF_L2 + + +#define SADF_L2(j) \ + tmp_inp_cos = iof(in_out,4*i+j); \ + tmp_inp_sin = qof(in_out,4*i+j); \ + iof(in_out,4*i+j)=(cos_vals_ ## j)*tmp_inp_cos - (sin_vals_ ## j)*tmp_inp_sin; \ + qof(in_out,4*i+j)=(sin_vals_ ## j)*tmp_inp_cos + (cos_vals_ ## j)*tmp_inp_sin; + +PF_TARGET_CLONES +float shift_addfast_inp_c(complexf *in_out, int N_cplx, shift_addfast_data_t* d, float starting_phase) +{ + //input_size should be multiple of 4 + //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size); + float cos_start=cosf(starting_phase); + float sin_start=sinf(starting_phase); + float tmp_inp_cos, tmp_inp_sin, + cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3, + sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3, + dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3], + dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = 
d->dcos[3]; + + for(int i=0;iphase_increment; + while(starting_phase>PI) starting_phase-=2*PI; + while(starting_phase<-PI) starting_phase+=2*PI; + return starting_phase; +} + +#undef SADF_L1 +#undef SADF_L2 + + +/*********************************************************************/ + +/**************/ +/*** ALGO D ***/ +/**************/ + +shift_unroll_data_t shift_unroll_init(float rate, int size) +{ + shift_unroll_data_t output; + output.phase_increment=2*rate*PI; + output.size = size; + output.dsin=(float*)malloc(sizeof(float)*size); + output.dcos=(float*)malloc(sizeof(float)*size); + float myphase = 0; + for(int i=0;iPI) myphase-=2*PI; + while(myphase<-PI) myphase+=2*PI; + output.dsin[i]=sinf(myphase); + output.dcos[i]=cosf(myphase); + } + return output; +} + +void shift_unroll_deinit(shift_unroll_data_t* d) +{ + if (!d) + return; + free(d->dsin); + free(d->dcos); + d->dsin = NULL; + d->dcos = NULL; +} + +PF_TARGET_CLONES +float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_unroll_data_t* d, float starting_phase) +{ + //input_size should be multiple of 4 + //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size); + float cos_start = cosf(starting_phase); + float sin_start = sinf(starting_phase); + float cos_val = cos_start, sin_val = sin_start; + for(int i=0;idcos[i] - sin_start * d->dsin[i]; + sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i]; + } + starting_phase+=input_size*d->phase_increment; + while(starting_phase>PI) starting_phase-=2*PI; + while(starting_phase<-PI) starting_phase+=2*PI; + return starting_phase; +} + +PF_TARGET_CLONES +float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase) +{ + float cos_start = cosf(starting_phase); + float sin_start = sinf(starting_phase); + float cos_val = cos_start, sin_val = sin_start; + for(int i=0;idcos[i] - sin_start * d->dsin[i]; + sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i]; + } + starting_phase += size * d->phase_increment; + while(starting_phase>PI) starting_phase-=2*PI; + while(starting_phase<-PI) starting_phase+=2*PI; + return starting_phase; +} + + +/*********************************************************************/ + +/**************/ +/*** ALGO E ***/ +/**************/ + +shift_limited_unroll_data_t shift_limited_unroll_init(float rate) +{ + shift_limited_unroll_data_t output; + output.phase_increment=2*rate*PI; + float myphase = 0; + for(int i=0; i < PF_SHIFT_LIMITED_UNROLL_SIZE; i++) + { + myphase += output.phase_increment; + while(myphase>PI) myphase-=2*PI; + while(myphase<-PI) myphase+=2*PI; + output.dcos[i] = cosf(myphase); + output.dsin[i] = sinf(myphase); + } + output.complex_phase.i = 1.0F; + output.complex_phase.q = 0.0F; + return output; +} + +PF_TARGET_CLONES +void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d) +{ + float cos_start = d->complex_phase.i; + float sin_start = d->complex_phase.q; + float cos_val = cos_start, sin_val = sin_start, mag; + while (size > 0) + { + int N = (size >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? 
PF_SHIFT_LIMITED_UNROLL_SIZE : size; + for(int i=0;idcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; + sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; + } + } + // "starts := vals := vals / |vals|" + mag = sqrtf(cos_val * cos_val + sin_val * sin_val); + cos_val /= mag; + sin_val /= mag; + cos_start = cos_val; + sin_start = sin_val; + + input += PF_SHIFT_LIMITED_UNROLL_SIZE; + output += PF_SHIFT_LIMITED_UNROLL_SIZE; + size -= PF_SHIFT_LIMITED_UNROLL_SIZE; + } + d->complex_phase.i = cos_val; + d->complex_phase.q = sin_val; +} + +PF_TARGET_CLONES +void shift_limited_unroll_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_data_t* d) +{ + float inp_i[PF_SHIFT_LIMITED_SIMD_SZ]; + float inp_q[PF_SHIFT_LIMITED_SIMD_SZ]; + // "vals := starts := phase_state" + float cos_start = d->complex_phase.i; + float sin_start = d->complex_phase.q; + float cos_val = cos_start, sin_val = sin_start, mag; + while (N_cplx) + { + int N = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx; + for(int i=0;idcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; + sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; + } + } + // "starts := vals := vals / |vals|" + mag = sqrtf(cos_val * cos_val + sin_val * sin_val); + cos_val /= mag; + sin_val /= mag; + cos_start = cos_val; + sin_start = sin_val; + + in_out += PF_SHIFT_LIMITED_UNROLL_SIZE; + N_cplx -= PF_SHIFT_LIMITED_UNROLL_SIZE; + } + // "phase_state := starts" + d->complex_phase.i = cos_start; + d->complex_phase.q = sin_start; +} + + +#ifdef HAVE_SSE_INTRINSICS + +/*********************************************************************/ + +/**************/ +/*** ALGO F ***/ +/**************/ + +shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad) +{ + shift_limited_unroll_A_sse_data_t output; + float myphase; + + output.phase_increment = 2*relative_freq*PI; + + myphase = 0.0F; + for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ) + { + for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++) + { + myphase += output.phase_increment; + while(myphase>PI) myphase-=2*PI; + while(myphase<-PI) myphase+=2*PI; + } + output.dcos[i] = cosf(myphase); + output.dsin[i] = sinf(myphase); + for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++) + { + output.dcos[i+k] = output.dcos[i]; + output.dsin[i+k] = output.dsin[i]; + } + } + + output.dcos_blk = 0.0F; + output.dsin_blk = 0.0F; + + myphase = phase_start_rad; + for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++) + { + output.phase_state_i[i] = cosf(myphase); + output.phase_state_q[i] = sinf(myphase); + myphase += output.phase_increment; + while(myphase>PI) myphase-=2*PI; + while(myphase<-PI) myphase+=2*PI; + } + return output; +} + + +PF_TARGET_CLONES +void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d) +{ + // "vals := starts := phase_state" + __m128 cos_starts = VLOAD( &d->phase_state_i[0] ); + __m128 sin_starts = VLOAD( &d->phase_state_q[0] ); + __m128 cos_vals = cos_starts; + __m128 sin_vals = sin_starts; + __m128 inp_re, inp_im; + __m128 product_re, product_im; + __m128 interl_prod_a, interl_prod_b; + __m128 * RESTRICT p_trig_cos_tab; + __m128 * RESTRICT p_trig_sin_tab; + __m128 * RESTRICT u = (__m128*)in_out; + + while (N_cplx) + { + const int 
NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx; + int B = NB; + p_trig_cos_tab = (__m128*)( &d->dcos[0] ); + p_trig_sin_tab = (__m128*)( &d->dsin[0] ); + while (B) + { + // complex multiplication of 4 complex values from/to in_out[] + // == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]): + // "out[] = inp[] * vals" + UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */ + product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) ); + product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) ); + INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b); + VSTORE(u, interl_prod_a); + VSTORE(u+1, interl_prod_b); + u += 2; + // calculate complex phasor for next iteration + // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; + // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; + // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-) + // "vals := d[] * starts" + inp_re = VLOAD(p_trig_cos_tab); + inp_im = VLOAD(p_trig_sin_tab); + cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) ); + sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) ); + ++p_trig_cos_tab; + ++p_trig_sin_tab; + B -= 4; + } + N_cplx -= NB; + /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */ + /* re-use product_re[]/product_im[] for normalization */ + // "starts := vals := vals / |vals|" + product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) ); +#if 0 + // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64 + // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16 + product_im = _mm_rsqrt_ps(product_re); + cos_starts = cos_vals = VMUL(cos_vals, product_im); + sin_starts = sin_vals = VMUL(sin_vals, product_im); +#else + // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower! 
+ // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again + product_im = _mm_sqrt_ps(product_re); + cos_starts = cos_vals = VDIV(cos_vals, product_im); + sin_starts = sin_vals = VDIV(sin_vals, product_im); +#endif + } + // "phase_state := starts" + VSTORE( &d->phase_state_i[0], cos_starts ); + VSTORE( &d->phase_state_q[0], sin_starts ); +} + + +/*********************************************************************/ + +/**************/ +/*** ALGO G ***/ +/**************/ + +shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad) +{ + shift_limited_unroll_B_sse_data_t output; + float myphase; + + output.phase_increment = 2*relative_freq*PI; + + myphase = 0.0F; + for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ) + { + for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++) + { + myphase += output.phase_increment; + while(myphase>PI) myphase-=2*PI; + while(myphase<-PI) myphase+=2*PI; + } + output.dtrig[i+0] = cosf(myphase); + output.dtrig[i+1] = sinf(myphase); + output.dtrig[i+2] = output.dtrig[i+0]; + output.dtrig[i+3] = output.dtrig[i+1]; + } + + output.dcos_blk = 0.0F; + output.dsin_blk = 0.0F; + + myphase = phase_start_rad; + for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++) + { + output.phase_state_i[i] = cosf(myphase); + output.phase_state_q[i] = sinf(myphase); + myphase += output.phase_increment; + while(myphase>PI) myphase-=2*PI; + while(myphase<-PI) myphase+=2*PI; + } + return output; +} + + +PF_TARGET_CLONES +void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d) +{ + // "vals := starts := phase_state" + __m128 cos_starts = VLOAD( &d->phase_state_i[0] ); + __m128 sin_starts = VLOAD( &d->phase_state_q[0] ); + __m128 cos_vals = cos_starts; + __m128 sin_vals = sin_starts; + __m128 inp_re, inp_im; + __m128 product_re, product_im; + __m128 interl_prod_a, interl_prod_b; + __m128 * RESTRICT p_trig_tab; + __m128 * RESTRICT u = (__m128*)in_out; + + while (N_cplx) + { + const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx; + int B = NB; + p_trig_tab = (__m128*)( &d->dtrig[0] ); + while (B) + { + // complex multiplication of 4 complex values from/to in_out[] + // == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]): + // "out[] = inp[] * vals" + UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */ + product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) ); + product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) ); + INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b); + VSTORE(u, interl_prod_a); + VSTORE(u+1, interl_prod_b); + u += 2; + // calculate complex phasor for next iteration + // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; + // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; + // cos_val[]/sin_val[] .. 
can't fade towards 0 inside this while loop :-) + // "vals := d[] * starts" + product_re = VLOAD(p_trig_tab); + UNINTERLEAVE2(product_re, product_re, inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */ + cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) ); + sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) ); + ++p_trig_tab; + B -= 4; + } + N_cplx -= NB; + /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */ + /* re-use product_re[]/product_im[] for normalization */ + // "starts := vals := vals / |vals|" + product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) ); +#if 0 + // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64 + // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16 + product_im = _mm_rsqrt_ps(product_re); + cos_starts = cos_vals = VMUL(cos_vals, product_im); + sin_starts = sin_vals = VMUL(sin_vals, product_im); +#else + // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower! + // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again + product_im = _mm_sqrt_ps(product_re); + cos_starts = cos_vals = VDIV(cos_vals, product_im); + sin_starts = sin_vals = VDIV(sin_vals, product_im); +#endif + } + // "phase_state := starts" + VSTORE( &d->phase_state_i[0], cos_starts ); + VSTORE( &d->phase_state_q[0], sin_starts ); +} + + +/*********************************************************************/ + + +/**************/ +/*** ALGO H ***/ +/**************/ + +shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad) +{ + shift_limited_unroll_C_sse_data_t output; + float myphase; + + output.phase_increment = 2*relative_freq*PI; + + myphase = 0.0F; + for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ) + { + for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++) + { + myphase += output.phase_increment; + while(myphase>PI) myphase-=2*PI; + while(myphase<-PI) myphase+=2*PI; + } + output.dinterl_trig[2*i] = cosf(myphase); + output.dinterl_trig[2*i+4] = sinf(myphase); + for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++) + { + output.dinterl_trig[2*i+k] = output.dinterl_trig[2*i]; + output.dinterl_trig[2*i+k+4] = output.dinterl_trig[2*i+4]; + } + } + + output.dcos_blk = 0.0F; + output.dsin_blk = 0.0F; + + myphase = phase_start_rad; + for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++) + { + output.phase_state_i[i] = cosf(myphase); + output.phase_state_q[i] = sinf(myphase); + myphase += output.phase_increment; + while(myphase>PI) myphase-=2*PI; + while(myphase<-PI) myphase+=2*PI; + } + return output; +} + + +PF_TARGET_CLONES +void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d) +{ + // "vals := starts := phase_state" + __m128 cos_starts = VLOAD( &d->phase_state_i[0] ); + __m128 sin_starts = VLOAD( &d->phase_state_q[0] ); + __m128 cos_vals = cos_starts; + __m128 sin_vals = sin_starts; + __m128 inp_re, inp_im; + __m128 product_re, product_im; + __m128 interl_prod_a, interl_prod_b; + __m128 * RESTRICT p_trig_tab; + __m128 * RESTRICT u = (__m128*)in_out; + + while (N_cplx) + { + const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? 
PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx; + int B = NB; + p_trig_tab = (__m128*)( &d->dinterl_trig[0] ); + while (B) + { + // complex multiplication of 4 complex values from/to in_out[] + // == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]): + // "out[] = inp[] * vals" + UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */ + product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) ); + product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) ); + INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b); + VSTORE(u, interl_prod_a); + VSTORE(u+1, interl_prod_b); + u += 2; + // calculate complex phasor for next iteration + // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; + // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; + // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-) + // "vals := d[] * starts" + inp_re = VLOAD(p_trig_tab); + inp_im = VLOAD(p_trig_tab+1); + cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) ); + sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) ); + p_trig_tab += 2; + B -= 4; + } + N_cplx -= NB; + /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */ + /* re-use product_re[]/product_im[] for normalization */ + // "starts := vals := vals / |vals|" + product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) ); +#if 0 + // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64 + // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16 + product_im = _mm_rsqrt_ps(product_re); + cos_starts = cos_vals = VMUL(cos_vals, product_im); + sin_starts = sin_vals = VMUL(sin_vals, product_im); +#else + // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower! 
+ // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again + product_im = _mm_sqrt_ps(product_re); + cos_starts = cos_vals = VDIV(cos_vals, product_im); + sin_starts = sin_vals = VDIV(sin_vals, product_im); +#endif + } + // "phase_state := starts" + VSTORE( &d->phase_state_i[0], cos_starts ); + VSTORE( &d->phase_state_q[0], sin_starts ); +} + + +#else + +/*********************************************************************/ + +shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad) { + assert(0); + shift_limited_unroll_A_sse_data_t r; + return r; +} +shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad) { + assert(0); + shift_limited_unroll_B_sse_data_t r; + return r; +} +shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad) { + assert(0); + shift_limited_unroll_C_sse_data_t r; + return r; +} + +void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d) { + assert(0); +} +void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d) { + assert(0); +} +void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d) { + assert(0); +} + +#endif + + +/*********************************************************************/ + +/**************/ +/*** ALGO I ***/ +/**************/ + +void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state) +{ + // constants for single phase step + float phase_increment_s = rate*PI; + float k1 = tanf(0.5f*phase_increment_s); + float k2 = 2*k1 /(1 + k1 * k1); + for (int j=1; ju_cos[j] = state->u_cos[j-1]; + state->v_sin[j] = state->v_sin[j-1]; + // small steps + tmp = state->u_cos[j] - k1 * state->v_sin[j]; + state->v_sin[j] += k2 * tmp; + state->u_cos[j] = tmp - k1 * state->v_sin[j]; + } + + // constants for PF_SHIFT_RECURSIVE_SIMD_SZ times phase step + float phase_increment_b = phase_increment_s * PF_SHIFT_RECURSIVE_SIMD_SZ; + while(phase_increment_b > PI) phase_increment_b-=2*PI; + while(phase_increment_b < -PI) phase_increment_b+=2*PI; + conf->k1 = tanf(0.5f*phase_increment_b); + conf->k2 = 2*conf->k1 / (1 + conf->k1 * conf->k1); +} + +void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state) +{ + if (starting_phase != 0.0F) + { + state->u_cos[0] = cosf(starting_phase); + state->v_sin[0] = sinf(starting_phase); + } + else + { + state->u_cos[0] = 1.0F; + state->v_sin[0] = 0.0F; + } + shift_recursive_osc_update_rate(rate, conf, state); +} + + +PF_TARGET_CLONES +void shift_recursive_osc_cc(const complexf *input, complexf* output, + int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext) +{ + float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ]; + float inp_i[PF_SHIFT_RECURSIVE_SIMD_SZ]; + float inp_q[PF_SHIFT_RECURSIVE_SIMD_SZ]; + shift_recursive_osc_t state = *state_ext; + const float k1 = conf->k1; + const float k2 = conf->k2; + for(int i=0;ik1; + const float k2 = conf->k2; + for(int i=0;ik1; + const float k2 = conf->k2; + for(int i=0;iu_cos[j] = state->u_cos[j-1]; + state->v_sin[j] = state->v_sin[j-1]; + // small steps + tmp = state->u_cos[j] - k1 * state->v_sin[j]; + state->v_sin[j] += k2 * tmp; + state->u_cos[j] = tmp - k1 * state->v_sin[j]; + } + + // constants for 
PF_SHIFT_RECURSIVE_SIMD_SSE_SZ times phase step + float phase_increment_b = phase_increment_s * PF_SHIFT_RECURSIVE_SIMD_SSE_SZ; + while(phase_increment_b > PI) phase_increment_b-=2*PI; + while(phase_increment_b < -PI) phase_increment_b+=2*PI; + conf->k1 = tanf(0.5f*phase_increment_b); + conf->k2 = 2*conf->k1 / (1 + conf->k1 * conf->k1); +} + + +void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state) +{ + if (starting_phase != 0.0F) + { + state->u_cos[0] = cosf(starting_phase); + state->v_sin[0] = sinf(starting_phase); + } + else + { + state->u_cos[0] = 1.0F; + state->v_sin[0] = 0.0F; + } + shift_recursive_osc_sse_update_rate(rate, conf, state); +} + + +PF_TARGET_CLONES +void shift_recursive_osc_sse_inp_c(complexf* in_out, + int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext) +{ + const __m128 k1 = LD_PS1( conf->k1 ); + const __m128 k2 = LD_PS1( conf->k2 ); + __m128 u_cos = VLOAD( &state_ext->u_cos[0] ); + __m128 v_sin = VLOAD( &state_ext->v_sin[0] ); + __m128 inp_re, inp_im; + __m128 product_re, product_im; + __m128 interl_prod_a, interl_prod_b; + __m128 * RESTRICT u = (__m128*)in_out; + + while (N_cplx) + { + //inp_i[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].i; + //inp_q[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].q; + UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */ + + //we multiply two complex numbers - similar to shift_math_cc + //iof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j]; + //qof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j]; + product_re = VSUB( VMUL(inp_re, u_cos), VMUL(inp_im, v_sin) ); + product_im = VADD( VMUL(inp_im, u_cos), VMUL(inp_re, v_sin) ); + INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b); + VSTORE(u, interl_prod_a); + VSTORE(u+1, interl_prod_b); + u += 2; + + // update complex phasor - like incrementing phase + // tmp[j] = state.u_cos[j] - k1 * state.v_sin[j]; + product_re = VSUB( u_cos, VMUL(k1, v_sin) ); + // state.v_sin[j] += k2 * tmp[j]; + v_sin = VADD( v_sin, VMUL(k2, product_re) ); + // state.u_cos[j] = tmp[j] - k1 * state.v_sin[j]; + u_cos = VSUB( product_re, VMUL(k1, v_sin) ); + + N_cplx -= 4; + } + VSTORE( &state_ext->u_cos[0], u_cos ); + VSTORE( &state_ext->v_sin[0], v_sin ); +} + +#else + +void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state) +{ + assert(0); +} + +void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state) +{ + assert(0); +} + + +void shift_recursive_osc_sse_inp_c(complexf* in_out, + int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext) +{ + assert(0); +} + +#endif + diff --git a/pffft/pf_mixer.h b/pffft/pf_mixer.h new file mode 100644 index 0000000..e153ad0 --- /dev/null +++ b/pffft/pf_mixer.h @@ -0,0 +1,270 @@ +/* +This software is part of pffft/pfdsp, a set of simple DSP routines. + +Copyright (c) 2014, Andras Retzler +Copyright (c) 2020 Hayati Ayguen +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include + +#include "pf_cplx.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +// ================================================================================= + +int have_sse_shift_mixer_impl(); + + +/*********************************************************************/ + +/**************/ +/*** ALGO A ***/ +/**************/ + +float shift_math_cc(const complexf *input, complexf* output, int input_size, float rate, float starting_phase); + + +/*********************************************************************/ + +/**************/ +/*** ALGO B ***/ +/**************/ + +typedef struct shift_table_data_s +{ + float* table; + int table_size; +} shift_table_data_t; + +void shift_table_deinit(shift_table_data_t table_data); +shift_table_data_t shift_table_init(int table_size); +float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase); + +/*********************************************************************/ + +/**************/ +/*** ALGO C ***/ +/**************/ + +typedef struct shift_addfast_data_s +{ + float dsin[4]; + float dcos[4]; + float phase_increment; +} shift_addfast_data_t; + +shift_addfast_data_t shift_addfast_init(float rate); +float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase); +float shift_addfast_inp_c(complexf *in_out, int N_cplx, shift_addfast_data_t* d, float starting_phase); + + +/*********************************************************************/ + +/**************/ +/*** ALGO D ***/ +/**************/ + +typedef struct shift_unroll_data_s +{ + float* dsin; + float* dcos; + float phase_increment; + int size; +} shift_unroll_data_t; + +shift_unroll_data_t shift_unroll_init(float rate, int size); +void shift_unroll_deinit(shift_unroll_data_t* d); +float shift_unroll_cc(complexf *input, complexf* output, int size, shift_unroll_data_t* d, float starting_phase); +float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase); + + 
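+/* Usage sketch for ALGO D (illustrative only; 'rate', 'n', 'inp' and 'out'
+ * are hypothetical caller variables, not part of this API):
+ *
+ *   shift_unroll_data_t d = shift_unroll_init(rate, n);
+ *   float phase = 0.0F;
+ *   phase = shift_unroll_cc(inp, out, n, &d, phase); // returned phase feeds the next block
+ *   shift_unroll_deinit(&d);
+ */
+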
+/*********************************************************************/ + +/**************/ +/*** ALGO E ***/ +/**************/ + +/* similar to shift_unroll_cc() - but, have fixed and limited precalc size + * idea: smaller cache usage by table + * size must be multiple of CSDR_SHIFT_LIMITED_SIMD (= 4) + */ +#define PF_SHIFT_LIMITED_UNROLL_SIZE 128 +#define PF_SHIFT_LIMITED_SIMD_SZ 4 + +typedef struct shift_limited_unroll_data_s +{ + float dcos[PF_SHIFT_LIMITED_UNROLL_SIZE]; + float dsin[PF_SHIFT_LIMITED_UNROLL_SIZE]; + complexf complex_phase; + float phase_increment; +} shift_limited_unroll_data_t; + +shift_limited_unroll_data_t shift_limited_unroll_init(float rate); +/* size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ */ +/* starting_phase for next call is kept internal in state */ +void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d); +void shift_limited_unroll_inp_c(complexf* in_out, int size, shift_limited_unroll_data_t* d); + + +/*********************************************************************/ + +/**************/ +/*** ALGO F ***/ +/**************/ + +typedef struct shift_limited_unroll_A_sse_data_s +{ + /* small/limited trig table */ + float dcos[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ]; + float dsin[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ]; + /* 4 times complex phase */ + float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ]; + float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ]; + /* N_cplx_per_block times increment - for future parallel variants */ + float dcos_blk; + float dsin_blk; + /* */ + float phase_increment; +} shift_limited_unroll_A_sse_data_t; + +shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad); +void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d); + + +/*********************************************************************/ + +/**************/ +/*** ALGO G ***/ +/**************/ + +typedef struct shift_limited_unroll_B_sse_data_s +{ + /* small/limited trig table */ + float dtrig[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ]; + /* 4 times complex phase */ + float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ]; + float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ]; + /* N_cplx_per_block times increment - for future parallel variants */ + float dcos_blk; + float dsin_blk; + /* */ + float phase_increment; +} shift_limited_unroll_B_sse_data_t; + +shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad); +void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d); + +/*********************************************************************/ + +/**************/ +/*** ALGO H ***/ +/**************/ + +typedef struct shift_limited_unroll_C_sse_data_s +{ + /* small/limited trig table - interleaved: 4 cos, 4 sin, 4 cos, .. 
*/ + float dinterl_trig[2*(PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ)]; + /* 4 times complex phase */ + float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ]; + float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ]; + /* N_cplx_per_block times increment - for future parallel variants */ + float dcos_blk; + float dsin_blk; + /* */ + float phase_increment; +} shift_limited_unroll_C_sse_data_t; + +shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad); +void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d); + + + +/*********************************************************************/ + +/**************/ +/*** ALGO I ***/ +/**************/ + +/* Recursive Quadrature Oscillator functions "recursive_osc" + * see https://www.vicanek.de/articles/QuadOsc.pdf + */ +#define PF_SHIFT_RECURSIVE_SIMD_SZ 8 +typedef struct shift_recursive_osc_s +{ + float u_cos[PF_SHIFT_RECURSIVE_SIMD_SZ]; + float v_sin[PF_SHIFT_RECURSIVE_SIMD_SZ]; +} shift_recursive_osc_t; + +typedef struct shift_recursive_osc_conf_s +{ + float k1; + float k2; +} shift_recursive_osc_conf_t; + +void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state); +void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state); + +/* size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ */ +/* starting_phase for next call is kept internal in state */ +void shift_recursive_osc_cc(const complexf *input, complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state); +void shift_recursive_osc_inp_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state); +void gen_recursive_osc_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state); + +/*********************************************************************/ + +/**************/ +/*** ALGO J ***/ +/**************/ + +#define PF_SHIFT_RECURSIVE_SIMD_SSE_SZ 4 +typedef struct shift_recursive_osc_sse_s +{ + float u_cos[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ]; + float v_sin[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ]; +} shift_recursive_osc_sse_t; + +typedef struct shift_recursive_osc_sse_conf_s +{ + float k1; + float k2; +} shift_recursive_osc_sse_conf_t; + +void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state); +void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state); +void shift_recursive_osc_sse_inp_c(complexf* in_out, int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext); + + +#ifdef __cplusplus +} +#endif + diff --git a/pffft/pffastconv.c b/pffft/pffastconv.c new file mode 100644 index 0000000..8bb2a65 --- /dev/null +++ b/pffft/pffastconv.c @@ -0,0 +1,264 @@ +/* + Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de ) + */ + +#include "pffastconv.h" +#include "pffft.h" + +#include +#include +#include +#include +#include +#include + +#define FASTCONV_DBG_OUT 0 + + +/* detect compiler flavour */ +#if defined(_MSC_VER) +# define RESTRICT __restrict +#pragma warning( disable : 4244 4305 4204 4456 ) +#elif defined(__GNUC__) +# define RESTRICT __restrict +#endif + + +void *pffastconv_malloc(size_t nb_bytes) +{ + return pffft_aligned_malloc(nb_bytes); +} + +void pffastconv_free(void *p) +{ + 
pffft_aligned_free(p); +} + +int pffastconv_simd_size() +{ + return pffft_simd_size(); +} + + + +struct PFFASTCONV_Setup +{ + float * Xt; /* input == x in time domain - copy for alignment */ + float * Xf; /* input == X in freq domain */ + float * Hf; /* filterCoeffs == H in freq domain */ + float * Mf; /* input * filterCoeffs in freq domain */ + PFFFT_Setup *st; + int filterLen; /* convolution length */ + int Nfft; /* FFT/block length */ + int flags; + float scale; +}; + + +PFFASTCONV_Setup * pffastconv_new_setup( const float * filterCoeffs, int filterLen, int * blockLen, int flags ) +{ + PFFASTCONV_Setup * s = NULL; + const int cplxFactor = ( (flags & PFFASTCONV_CPLX_INP_OUT) && (flags & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 2 : 1; + const int minFftLen = 2*pffft_simd_size()*pffft_simd_size(); + int i, Nfft = 2 * pffft_next_power_of_two(filterLen -1); +#if FASTCONV_DBG_OUT + const int iOldBlkLen = *blockLen; +#endif + + if ( Nfft < minFftLen ) + Nfft = minFftLen; + + if ( flags & PFFASTCONV_CPLX_FILTER ) + return NULL; + + s = pffastconv_malloc( sizeof(struct PFFASTCONV_Setup) ); + + if ( *blockLen > Nfft ) { + Nfft = *blockLen; + Nfft = pffft_next_power_of_two(Nfft); + } + *blockLen = Nfft; /* this is in (complex) samples */ + + Nfft *= cplxFactor; + + if ( (flags & PFFASTCONV_DIRECT_INP) && !(flags & PFFASTCONV_CPLX_INP_OUT) ) + s->Xt = NULL; + else + s->Xt = pffastconv_malloc((unsigned)Nfft * sizeof(float)); + s->Xf = pffastconv_malloc((unsigned)Nfft * sizeof(float)); + s->Hf = pffastconv_malloc((unsigned)Nfft * sizeof(float)); + s->Mf = pffastconv_malloc((unsigned)Nfft * sizeof(float)); + s->st = pffft_new_setup(Nfft, PFFFT_REAL); /* with complex: we do 2 x fft() */ + s->filterLen = filterLen; /* filterLen == convolution length == length of impulse response */ + if ( cplxFactor == 2 ) + s->filterLen = 2 * filterLen - 1; + s->Nfft = Nfft; /* FFT/block length */ + s->flags = flags; + s->scale = (float)( 1.0 / Nfft ); + + memset( s->Xt, 0, (unsigned)Nfft * sizeof(float) ); + if ( flags & PFFASTCONV_CORRELATION ) { + for ( i = 0; i < filterLen; ++i ) + s->Xt[ ( Nfft - cplxFactor * i ) & (Nfft -1) ] = filterCoeffs[ i ]; + } else { + for ( i = 0; i < filterLen; ++i ) + s->Xt[ ( Nfft - cplxFactor * i ) & (Nfft -1) ] = filterCoeffs[ filterLen - 1 - i ]; + } + + pffft_transform(s->st, s->Xt, s->Hf, /* tmp = */ s->Mf, PFFFT_FORWARD); + +#if FASTCONV_DBG_OUT + printf("\n fastConvSetup(filterLen = %d, blockLen %d) --> blockLen %d, OutLen = %d\n" + , filterLen, iOldBlkLen, *blockLen, Nfft - filterLen +1 ); +#endif + + return s; +} + + +void pffastconv_destroy_setup( PFFASTCONV_Setup * s ) +{ + if (!s) + return; + pffft_destroy_setup(s->st); + pffastconv_free(s->Mf); + pffastconv_free(s->Hf); + pffastconv_free(s->Xf); + if ( s->Xt ) + pffastconv_free(s->Xt); + pffastconv_free(s); +} + + +int pffastconv_apply(PFFASTCONV_Setup * s, const float *input_, int cplxInputLen, float *output_, int applyFlush) +{ + const float * RESTRICT X = input_; + float * RESTRICT Y = output_; + const int Nfft = s->Nfft; + const int filterLen = s->filterLen; + const int flags = s->flags; + const int cplxFactor = ( (flags & PFFASTCONV_CPLX_INP_OUT) && (flags & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 
2 : 1; + const int inputLen = cplxFactor * cplxInputLen; + int inpOff, procLen, numOut = 0, j, part, cplxOff; + + /* applyFlush != 0: + * inputLen - inpOff -filterLen + 1 > 0 + * <=> inputLen -filterLen + 1 > inpOff + * <=> inpOff < inputLen -filterLen + 1 + * + * applyFlush == 0: + * inputLen - inpOff >= Nfft + * <=> inputLen - Nfft >= inpOff + * <=> inpOff <= inputLen - Nfft + * <=> inpOff < inputLen - Nfft + 1 + */ + + if ( cplxFactor == 2 ) + { + const int maxOff = applyFlush ? (inputLen -filterLen + 1) : (inputLen - Nfft + 1); +#if 0 + printf( "*** inputLen %d, filterLen %d, Nfft %d => maxOff %d\n", inputLen, filterLen, Nfft, maxOff); +#endif + for ( inpOff = 0; inpOff < maxOff; inpOff += numOut ) + { + procLen = ( (inputLen - inpOff) >= Nfft ) ? Nfft : (inputLen - inpOff); + numOut = ( procLen - filterLen + 1 ) & ( ~1 ); + if (!numOut) + break; +#if 0 + if (!inpOff) + printf("*** inpOff = %d, numOut = %d\n", inpOff, numOut); + if (inpOff + filterLen + 2 >= maxOff ) + printf("*** inpOff = %d, inpOff + numOut = %d\n", inpOff, inpOff + numOut); +#endif + + if ( flags & PFFASTCONV_DIRECT_INP ) + { + pffft_transform(s->st, X + inpOff, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD); + } + else + { + memcpy( s->Xt, X + inpOff, (unsigned)procLen * sizeof(float) ); + if ( procLen < Nfft ) + memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) ); + + pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD); + } + + pffft_zconvolve_no_accu(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale); + + if ( flags & PFFASTCONV_DIRECT_OUT ) + { + pffft_transform(s->st, s->Mf, Y + inpOff, s->Xf, PFFFT_BACKWARD); + } + else + { + pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD); + memcpy( Y + inpOff, s->Xf, (unsigned)numOut * sizeof(float) ); + } + } + return inpOff / cplxFactor; + } + else + { + const int maxOff = applyFlush ? (inputLen -filterLen + 1) : (inputLen - Nfft + 1); + const int numParts = (flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1; + + for ( inpOff = 0; inpOff < maxOff; inpOff += numOut ) + { + procLen = ( (inputLen - inpOff) >= Nfft ) ? 
Nfft : (inputLen - inpOff); + numOut = procLen - filterLen + 1; + + for ( part = 0; part < numParts; ++part ) /* iterate per real/imag component */ + { + + if ( flags & PFFASTCONV_CPLX_INP_OUT ) + { + cplxOff = 2 * inpOff + part; + for ( j = 0; j < procLen; ++j ) + s->Xt[j] = X[cplxOff + 2 * j]; + if ( procLen < Nfft ) + memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) ); + + pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD); + } + else if ( flags & PFFASTCONV_DIRECT_INP ) + { + pffft_transform(s->st, X + inpOff, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD); + } + else + { + memcpy( s->Xt, X + inpOff, (unsigned)procLen * sizeof(float) ); + if ( procLen < Nfft ) + memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) ); + + pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD); + } + + pffft_zconvolve_no_accu(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale); + + if ( flags & PFFASTCONV_CPLX_INP_OUT ) + { + pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD); + + cplxOff = 2 * inpOff + part; + for ( j = 0; j < numOut; ++j ) + Y[ cplxOff + 2 * j ] = s->Xf[j]; + } + else if ( flags & PFFASTCONV_DIRECT_OUT ) + { + pffft_transform(s->st, s->Mf, Y + inpOff, s->Xf, PFFFT_BACKWARD); + } + else + { + pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD); + memcpy( Y + inpOff, s->Xf, (unsigned)numOut * sizeof(float) ); + } + + } + } + + return inpOff; + } +} + diff --git a/pffft/pffastconv.h b/pffft/pffastconv.h new file mode 100644 index 0000000..6bc5e47 --- /dev/null +++ b/pffft/pffastconv.h @@ -0,0 +1,171 @@ +/* Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de ) + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of PFFFT, PFFASTCONV, nor the names of its + sponsors or contributors may be used to endorse or promote products + derived from this Software without specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. +*/ + +/* + PFFASTCONV : a Pretty Fast Fast Convolution + + This is basically the implementation of fast convolution, + utilizing the FFT (pffft). + + Restrictions: + + - 1D transforms only, with 32-bit single precision. + + - all (float*) pointers in the functions below are expected to + have an "simd-compatible" alignment, that is 16 bytes on x86 and + powerpc CPUs. + + You can allocate such buffers with the functions + pffft_aligned_malloc / pffft_aligned_free (or with stuff like + posix_memalign..) 
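+
+  Minimal usage sketch (illustrative only; 'coeffs', 'filterLen', 'x', 'nx'
+  and 'y' are hypothetical caller variables; y needs room for at least
+  nx - filterLen + 1 floats):
+
+    int blockLen = 0;  // 0 lets pffastconv_new_setup() pick the minimum block length
+    PFFASTCONV_Setup *s = pffastconv_new_setup(coeffs, filterLen, &blockLen, 0);
+    int nOut = pffastconv_apply(s, x, nx, y, 1);  // applyFlush = 1
+    pffastconv_destroy_setup(s);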
+ +*/ + +#ifndef PFFASTCONV_H +#define PFFASTCONV_H + +#include /* for size_t */ +#include "pffft.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + /* opaque struct holding internal stuff + this struct can't be shared by many threads as it contains + temporary data, computed within the convolution + */ + typedef struct PFFASTCONV_Setup PFFASTCONV_Setup; + + typedef enum { + PFFASTCONV_CPLX_INP_OUT = 1, + /* set when input and output is complex, + * with real and imag part interleaved in both vectors. + * input[] has inputLen complex values: 2 * inputLen floats, + * output[] is also written with complex values. + * without this flag, the input is interpreted as real vector + */ + + PFFASTCONV_CPLX_FILTER = 2, + /* set when filterCoeffs is complex, + * with real and imag part interleaved. + * filterCoeffs[] has filterLen complex values: 2 * filterLen floats + * without this flag, the filter is interpreted as real vector + * ATTENTION: this is not implemented yet! + */ + + PFFASTCONV_DIRECT_INP = 4, + /* set PFFASTCONV_DIRECT_INP only, when following conditions are met: + * 1- input vecor X must be aligned + * 2- (all) inputLen <= ouput blockLen + * 3- X must have minimum length of output BlockLen + * 4- the additional samples from inputLen .. BlockLen-1 + * must contain valid small and non-NAN samples (ideally zero) + * + * this option is ignored when PFFASTCONV_CPLX_INP_OUT is set + */ + + PFFASTCONV_DIRECT_OUT = 8, + /* set PFFASTCONV_DIRECT_OUT only when following conditions are met: + * 1- output vector Y must be aligned + * 2- (all) inputLen <= ouput blockLen + * 3- Y must have minimum length of output blockLen + * + * this option is ignored when PFFASTCONV_CPLX_INP_OUT is set + */ + + PFFASTCONV_CPLX_SINGLE_FFT = 16, + /* hint to process complex data with one single FFT; + * default is to use 2 FFTs: one for real part, one for imag part + * */ + + + PFFASTCONV_SYMMETRIC = 32, + /* just informal, that filter is symmetric .. and filterLen is multiple of 8 */ + + PFFASTCONV_CORRELATION = 64, + /* filterCoeffs[] of pffastconv_new_setup are for correlation; + * thus, do not flip them for the internal fft calculation + * - as necessary for the fast convolution */ + + } pffastconv_flags_t; + + /* + prepare for performing fast convolution(s) of 'filterLen' with input 'blockLen'. + The output 'blockLen' might be bigger to allow the fast convolution. + + 'flags' are bitmask over the 'pffastconv_flags_t' enum. + + PFFASTCONV_Setup structure can't be shared accross multiple filters + or concurrent threads. + */ + PFFASTCONV_Setup * pffastconv_new_setup( const float * filterCoeffs, int filterLen, int * blockLen, int flags ); + + void pffastconv_destroy_setup(PFFASTCONV_Setup *); + + /* + Perform the fast convolution. + + 'input' and 'output' don't need to be aligned - unless any of + PFFASTCONV_DIRECT_INP or PFFASTCONV_DIRECT_OUT is set in 'flags'. + + inputLen > output 'blockLen' (from pffastconv_new_setup()) is allowed. + in this case, multiple FFTs are called internally, to process the + input[]. + + 'output' vector must have size >= (inputLen - filterLen + 1) + + set bool option 'applyFlush' to process the full input[]. + with this option, 'tail samples' of input are also processed. + This might be inefficient, because the FFT is called to produce + few(er) output samples, than possible. + This option is useful to process the last samples of an input (file) + or to reduce latency. + + return value is the number of produced samples in output[]. 
+ the same amount of samples is processed from input[]. to continue + processing, the caller must save/move the remaining samples of + input[]. + + */ + int pffastconv_apply(PFFASTCONV_Setup * s, const float *input, int inputLen, float *output, int applyFlush); + + void *pffastconv_malloc(size_t nb_bytes); + void pffastconv_free(void *); + + /* return 4 or 1 wether support SSE/Altivec instructions was enabled when building pffft.c */ + int pffastconv_simd_size(); + + +#ifdef __cplusplus +} +#endif + +#endif /* PFFASTCONV_H */ diff --git a/pffft/pffft.c b/pffft/pffft.c new file mode 100644 index 0000000..4862a4f --- /dev/null +++ b/pffft/pffft.c @@ -0,0 +1,134 @@ +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de ) + + Based on original fortran 77 code from FFTPACKv4 from NETLIB + (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber + of NCAR, in 1985. + + As confirmed by the NCAR fftpack software curators, the following + FFTPACKv5 license applies to FFTPACKv4 sources. My changes are + released under the same terms. + + FFTPACK license: + + http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html + + Copyright (c) 2004 the University Corporation for Atmospheric + Research ("UCAR"). All rights reserved. Developed by NCAR's + Computational and Information Systems Laboratory, UCAR, + www.cisl.ucar.edu. + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. + + + PFFFT : a Pretty Fast FFT. + + This file is largerly based on the original FFTPACK implementation, modified in + order to take advantage of SIMD instructions of modern CPUs. +*/ + +/* + ChangeLog: + - 2011/10/02, version 1: This is the very first release of this file. 
+*/ + +#include "pffft.h" + +/* detect compiler flavour */ +#if defined(_MSC_VER) +# define COMPILER_MSVC +#elif defined(__GNUC__) +# define COMPILER_GCC +#endif + +#include +#include +#include +#include +#include + +#if defined(COMPILER_GCC) +# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) +# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline)) +# define RESTRICT __restrict +# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__]; +#elif defined(COMPILER_MSVC) +# define ALWAYS_INLINE(return_type) __forceinline return_type +# define NEVER_INLINE(return_type) __declspec(noinline) return_type +# define RESTRICT __restrict +# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__)) +#endif + + +#ifdef COMPILER_MSVC +#pragma warning( disable : 4244 4305 4204 4456 ) +#endif + +/* + vector support macros: the rest of the code is independant of + SSE/Altivec/NEON -- adding support for other platforms with 4-element + vectors should be limited to these macros +*/ +#include "simd/pf_float.h" + +/* have code comparable with this definition */ +#define SETUP_STRUCT PFFFT_Setup +#define FUNC_NEW_SETUP pffft_new_setup +#define FUNC_DESTROY pffft_destroy_setup +#define FUNC_TRANSFORM_UNORDRD pffft_transform +#define FUNC_TRANSFORM_ORDERED pffft_transform_ordered +#define FUNC_ZREORDER pffft_zreorder +#define FUNC_ZCONVOLVE_ACCUMULATE pffft_zconvolve_accumulate +#define FUNC_ZCONVOLVE_NO_ACCU pffft_zconvolve_no_accu + +#define FUNC_ALIGNED_MALLOC pffft_aligned_malloc +#define FUNC_ALIGNED_FREE pffft_aligned_free +#define FUNC_SIMD_SIZE pffft_simd_size +#define FUNC_MIN_FFT_SIZE pffft_min_fft_size +#define FUNC_IS_VALID_SIZE pffft_is_valid_size +#define FUNC_NEAREST_SIZE pffft_nearest_transform_size +#define FUNC_SIMD_ARCH pffft_simd_arch +#define FUNC_VALIDATE_SIMD_A validate_pffft_simd +#define FUNC_VALIDATE_SIMD_EX validate_pffft_simd_ex + +#define FUNC_CPLX_FINALIZE pffft_cplx_finalize +#define FUNC_CPLX_PREPROCESS pffft_cplx_preprocess +#define FUNC_REAL_PREPROCESS_4X4 pffft_real_preprocess_4x4 +#define FUNC_REAL_PREPROCESS pffft_real_preprocess +#define FUNC_REAL_FINALIZE_4X4 pffft_real_finalize_4x4 +#define FUNC_REAL_FINALIZE pffft_real_finalize +#define FUNC_TRANSFORM_INTERNAL pffft_transform_internal + +#define FUNC_COS cosf +#define FUNC_SIN sinf + + +#include "pffft_priv_impl.h" + + diff --git a/pffft/pffft.h b/pffft/pffft.h new file mode 100644 index 0000000..7ad925c --- /dev/null +++ b/pffft/pffft.h @@ -0,0 +1,241 @@ +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Based on original fortran 77 code from FFTPACKv4 from NETLIB, + authored by Dr Paul Swarztrauber of NCAR, in 1985. + + As confirmed by the NCAR fftpack software curators, the following + FFTPACKv5 license applies to FFTPACKv4 sources. My changes are + released under the same terms. + + FFTPACK license: + + http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html + + Copyright (c) 2004 the University Corporation for Atmospheric + Research ("UCAR"). All rights reserved. Developed by NCAR's + Computational and Information Systems Laboratory, UCAR, + www.cisl.ucar.edu. 
+ + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. +*/ + +/* + PFFFT : a Pretty Fast FFT. + + This is basically an adaptation of the single precision fftpack + (v4) as found on netlib taking advantage of SIMD instruction found + on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON). + + For architectures where no SIMD instruction is available, the code + falls back to a scalar version. + + Restrictions: + + - 1D transforms only, with 32-bit single precision. + + - supports only transforms for inputs of length N of the form + N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128, + 144, 160, etc are all acceptable lengths). Performance is best for + 128<=N<=8192. + + - all (float*) pointers in the functions below are expected to + have an "simd-compatible" alignment, that is 16 bytes on x86 and + powerpc CPUs. + + You can allocate such buffers with the functions + pffft_aligned_malloc / pffft_aligned_free (or with stuff like + posix_memalign..) + +*/ + +#ifndef PFFFT_H +#define PFFFT_H + +#include /* for size_t */ + +#ifdef __cplusplus +extern "C" { +#endif + + /* opaque struct holding internal stuff (precomputed twiddle factors) + this struct can be shared by many threads as it contains only + read-only data. + */ + typedef struct PFFFT_Setup PFFFT_Setup; + +#ifndef PFFFT_COMMON_ENUMS +#define PFFFT_COMMON_ENUMS + + /* direction of the transform */ + typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t; + + /* type of transform */ + typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t; + +#endif + + /* + prepare for performing transforms of size N -- the returned + PFFFT_Setup structure is read-only so it can safely be shared by + multiple concurrent threads. + */ + PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform); + void pffft_destroy_setup(PFFFT_Setup *); + /* + Perform a Fourier transform , The z-domain data is stored in the + most efficient order for transforming it back, or using it for + convolution. If you need to have its content sorted in the + "usual" way, that is as an array of interleaved complex numbers, + either use pffft_transform_ordered , or call pffft_zreorder after + the forward fft, and before the backward fft. 
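+
+     Sketch of that second route (illustrative; 'in', 'freq', 'ordered' and
+     'work' are hypothetical, suitably aligned buffers):
+
+       pffft_transform(setup, in, freq, work, PFFFT_FORWARD);
+       pffft_zreorder(setup, freq, ordered, PFFFT_FORWARD);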
+ + Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x. + Typically you will want to scale the backward transform by 1/N. + + The 'work' pointer should point to an area of N (2*N for complex + fft) floats, properly aligned. If 'work' is NULL, then stack will + be used instead (this is probably the best strategy for small + FFTs, say for N < 16384). Threads usually have a small stack, that + there's no sufficient amount of memory, usually leading to a crash! + Use the heap with pffft_aligned_malloc() in this case. + + For a real forward transform (PFFFT_REAL | PFFFT_FORWARD) with real + input with input(=transformation) length N, the output array is + 'mostly' complex: + index k in 1 .. N/2 -1 corresponds to frequency k * Samplerate / N + index k == 0 is a special case: + the real() part contains the result for the DC frequency 0, + the imag() part contains the result for the Nyquist frequency Samplerate/2 + both 0-frequency and half frequency components, which are real, + are assembled in the first entry as F(0)+i*F(N/2). + With the output size N/2 complex values (=N real/imag values), it is + obvious, that the result for negative frequencies are not output, + cause of symmetry. + + input and output may alias. + */ + void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); + + /* + Similar to pffft_transform, but makes sure that the output is + ordered as expected (interleaved complex numbers). This is + similar to calling pffft_transform and then pffft_zreorder. + + input and output may alias. + */ + void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); + + /* + call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(..., + PFFFT_FORWARD) if you want to have the frequency components in + the correct "canonical" order, as interleaved complex numbers. + + (for real transforms, both 0-frequency and half frequency + components, which are real, are assembled in the first entry as + F(0)+i*F(n/2+1). Note that the original fftpack did place + F(n/2+1) at the end of the arrays). + + input and output should not alias. + */ + void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction); + + /* + Perform a multiplication of the frequency components of dft_a and + dft_b and accumulate them into dft_ab. The arrays should have + been obtained with pffft_transform(.., PFFFT_FORWARD) and should + *not* have been reordered with pffft_zreorder (otherwise just + perform the operation yourself as the dft coefs are stored as + interleaved complex numbers). + + the operation performed is: dft_ab += (dft_a * fdt_b)*scaling + + The dft_a, dft_b and dft_ab pointers may alias. + */ + void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); + + /* + Perform a multiplication of the frequency components of dft_a and + dft_b and put result in dft_ab. The arrays should have + been obtained with pffft_transform(.., PFFFT_FORWARD) and should + *not* have been reordered with pffft_zreorder (otherwise just + perform the operation yourself as the dft coefs are stored as + interleaved complex numbers). + + the operation performed is: dft_ab = (dft_a * fdt_b)*scaling + + The dft_a, dft_b and dft_ab pointers may alias. 
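+
+     Fast-convolution sketch using this function (illustrative; x, h, X, H,
+     Y, y and work are hypothetical, suitably aligned buffers and N is the
+     transform size):
+
+       pffft_transform(setup, x, X, work, PFFFT_FORWARD);
+       pffft_transform(setup, h, H, work, PFFFT_FORWARD);
+       pffft_zconvolve_no_accu(setup, X, H, Y, 1.0f / N);
+       pffft_transform(setup, Y, y, work, PFFFT_BACKWARD);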
+ */ + void pffft_zconvolve_no_accu(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); + + /* return 4 or 1 wether support SSE/NEON/Altivec instructions was enabled when building pffft.c */ + int pffft_simd_size(); + + /* return string identifier of used architecture (SSE/NEON/Altivec/..) */ + const char * pffft_simd_arch(); + + + /* following functions are identical to the pffftd_ functions */ + + /* simple helper to get minimum possible fft size */ + int pffft_min_fft_size(pffft_transform_t transform); + + /* simple helper to determine next power of 2 + - without inexact/rounding floating point operations + */ + int pffft_next_power_of_two(int N); + + /* simple helper to determine if power of 2 - returns bool */ + int pffft_is_power_of_two(int N); + + /* simple helper to determine size N is valid + - factorizable to pffft_min_fft_size() with factors 2, 3, 5 + returns bool + */ + int pffft_is_valid_size(int N, pffft_transform_t cplx); + + /* determine nearest valid transform size (by brute-force testing) + - factorizable to pffft_min_fft_size() with factors 2, 3, 5. + higher: bool-flag to find nearest higher value; else lower. + */ + int pffft_nearest_transform_size(int N, pffft_transform_t cplx, int higher); + + /* + the float buffers must have the correct alignment (16-byte boundary + on intel and powerpc). This function may be used to obtain such + correctly aligned buffers. + */ + void *pffft_aligned_malloc(size_t nb_bytes); + void pffft_aligned_free(void *); + +#ifdef __cplusplus +} +#endif + +#endif /* PFFFT_H */ + diff --git a/pffft/pffft.hpp b/pffft/pffft.hpp new file mode 100644 index 0000000..28e9db1 --- /dev/null +++ b/pffft/pffft.hpp @@ -0,0 +1,1060 @@ +/* Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com ) + Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de ) + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of PFFFT, nor the names of its + sponsors or contributors may be used to endorse or promote products + derived from this Software without specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. 
+*/ + +#pragma once + +#include +#include +#include +#include + +namespace pffft { +namespace detail { +#if defined(PFFFT_ENABLE_FLOAT) || ( !defined(PFFFT_ENABLE_FLOAT) && !defined(PFFFT_ENABLE_DOUBLE) ) +#include "pffft.h" +#endif +#if defined(PFFFT_ENABLE_DOUBLE) +#include "pffft_double.h" +#endif +} +} + +namespace pffft { + +// enum { PFFFT_REAL, PFFFT_COMPLEX } +typedef detail::pffft_transform_t TransformType; + +// define 'Scalar' and 'Complex' (in namespace pffft) with template Types<> +// and other type specific helper functions +template struct Types {}; +#if defined(PFFFT_ENABLE_FLOAT) || ( !defined(PFFFT_ENABLE_FLOAT) && !defined(PFFFT_ENABLE_DOUBLE) ) +template<> struct Types { + typedef float Scalar; + typedef std::complex Complex; + static int simd_size() { return detail::pffft_simd_size(); } + static const char * simd_arch() { return detail::pffft_simd_arch(); } + static int minFFtsize() { return pffft_min_fft_size(detail::PFFFT_REAL); } + static bool isValidSize(int N) { return pffft_is_valid_size(N, detail::PFFFT_REAL); } + static int nearestTransformSize(int N, bool higher) { return pffft_nearest_transform_size(N, detail::PFFFT_REAL, higher ? 1 : 0); } +}; +template<> struct Types< std::complex > { + typedef float Scalar; + typedef std::complex Complex; + static int simd_size() { return detail::pffft_simd_size(); } + static const char * simd_arch() { return detail::pffft_simd_arch(); } + static int minFFtsize() { return pffft_min_fft_size(detail::PFFFT_COMPLEX); } + static bool isValidSize(int N) { return pffft_is_valid_size(N, detail::PFFFT_COMPLEX); } + static int nearestTransformSize(int N, bool higher) { return pffft_nearest_transform_size(N, detail::PFFFT_COMPLEX, higher ? 1 : 0); } +}; +#endif +#if defined(PFFFT_ENABLE_DOUBLE) +template<> struct Types { + typedef double Scalar; + typedef std::complex Complex; + static int simd_size() { return detail::pffftd_simd_size(); } + static const char * simd_arch() { return detail::pffftd_simd_arch(); } + static int minFFtsize() { return pffftd_min_fft_size(detail::PFFFT_REAL); } + static bool isValidSize(int N) { return pffftd_is_valid_size(N, detail::PFFFT_REAL); } + static int nearestTransformSize(int N, bool higher) { return pffftd_nearest_transform_size(N, detail::PFFFT_REAL, higher ? 1 : 0); } +}; +template<> struct Types< std::complex > { + typedef double Scalar; + typedef std::complex Complex; + static int simd_size() { return detail::pffftd_simd_size(); } + static const char * simd_arch() { return detail::pffftd_simd_arch(); } + static int minFFtsize() { return pffftd_min_fft_size(detail::PFFFT_COMPLEX); } + static bool isValidSize(int N) { return pffftd_is_valid_size(N, detail::PFFFT_COMPLEX); } + static int nearestTransformSize(int N, bool higher) { return pffftd_nearest_transform_size(N, detail::PFFFT_COMPLEX, higher ? 
1 : 0); } +}; +#endif + +// Allocator +template class PFAlloc; + +namespace detail { + template class Setup; +} + +#if (__cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900)) + +// define AlignedVector utilizing 'using' in C++11 +template +using AlignedVector = typename std::vector< T, PFAlloc >; + +#else + +// define AlignedVector having to derive std::vector<> +template +struct AlignedVector : public std::vector< T, PFAlloc > { + AlignedVector() : std::vector< T, PFAlloc >() { } + AlignedVector(int N) : std::vector< T, PFAlloc >(N) { } +}; + +#endif + + +// T can be float, double, std::complex or std::complex +// define PFFFT_ENABLE_DOUBLE before include this file for double and std::complex +template +class Fft +{ +public: + + // define types value_type, Scalar and Complex + typedef T value_type; + typedef typename Types::Scalar Scalar; + typedef typename Types::Complex Complex; + + // static retrospection functions + static bool isComplexTransform() { return sizeof(T) == sizeof(Complex); } + static bool isFloatScalar() { return sizeof(Scalar) == sizeof(float); } + static bool isDoubleScalar() { return sizeof(Scalar) == sizeof(double); } + + // simple helper to determine next power of 2 - without inexact/rounding floating point operations + static int nextPowerOfTwo(int N) { return detail::pffft_next_power_of_two(N); } + static bool isPowerOfTwo(int N) { return detail::pffft_is_power_of_two(N) ? true : false; } + + + static int simd_size() { return Types::simd_size(); } + static const char * simd_arch() { return Types::simd_arch(); } + + // simple helper to get minimum possible fft length + static int minFFtsize() { return Types::minFFtsize(); } + + // helper to determine nearest transform size - factorizable to minFFtsize() with factors 2, 3, 5 + static bool isValidSize(int N) { return Types::isValidSize(N); } + static int nearestTransformSize(int N, bool higher=true) { return Types::nearestTransformSize(N, higher); } + + + ////////////////// + + /* + * Contructor, with transformation length, preparing transforms. + * + * For length <= stackThresholdLen, the stack is used for the internal + * work memory. for bigger length', the heap is used. + * + * Using the stack is probably the best strategy for small + * FFTs, say for N <= 4096). Threads usually have a small stack, that + * there's no sufficient amount of memory, usually leading to a crash! + */ + Fft( int length, int stackThresholdLen = 4096 ); + + + /* + * constructor or prepareLength() produced a valid FFT instance? + * delivers false for invalid FFT sizes + */ + bool isValid() const; + + + ~Fft(); + + /* + * prepare for transformation length 'newLength'. + * length is identical to forward()'s input vector's size, + * and also equals inverse()'s output vector size. + * this function is no simple setter. it pre-calculates twiddle factors. + * returns true if newLength is >= minFFtsize, false otherwise + */ + bool prepareLength(int newLength); + + /* + * retrieve the transformation length. + */ + int getLength() const { return length; } + + /* + * retrieve size of complex spectrum vector, + * the output of forward() + */ + int getSpectrumSize() const { return isComplexTransform() ? length : ( length / 2 ); } + + /* + * retrieve size of spectrum vector - in internal layout; + * the output of forwardToInternalLayout() + */ + int getInternalLayoutSize() const { return isComplexTransform() ? 
( 2 * length ) : length; } + + + //////////////////////////////////////////// + //// + //// API 1, with std::vector<> based containers, + //// which free the allocated memory themselves (RAII). + //// + //// uses an Allocator for the alignment of SIMD data. + //// + //////////////////////////////////////////// + + // create suitably preallocated aligned vector for one FFT + AlignedVector valueVector() const; + AlignedVector spectrumVector() const; + AlignedVector internalLayoutVector() const; + + //////////////////////////////////////////// + // although using Vectors for output .. + // they need to have resize() applied before! + + // core API, having the spectrum in canonical order + + /* + * Perform the forward Fourier transform. + * + * Transforms are not scaled: inverse(forward(x)) = N*x. + * Typically you will want to scale the backward transform by 1/N. + * + * The output 'spectrum' is canonically ordered - as expected. + * + * a) for complex input isComplexTransform() == true, + * and transformation length N the output array is complex: + * index k in 0 .. N/2 -1 corresponds to frequency k * Samplerate / N + * index k in N/2 .. N -1 corresponds to frequency (k -N) * Samplerate / N, + * resulting in negative frequencies + * + * b) for real input isComplexTransform() == false, + * and transformation length N the output array is 'mostly' complex: + * index k in 1 .. N/2 -1 corresponds to frequency k * Samplerate / N + * index k == 0 is a special case: + * the real() part contains the result for the DC frequency 0, + * the imag() part contains the result for the Nyquist frequency Samplerate/2 + * both 0-frequency and half frequency components, which are real, + * are assembled in the first entry as F(0)+i*F(N/2). + * with the output size N/2 complex values, it is obvious, that the + * result for negative frequencies are not output, cause of symmetry. + * + * input and output may alias - if you do nasty type conversion. + * return is just the given output parameter 'spectrum'. + */ + AlignedVector & forward(const AlignedVector & input, AlignedVector & spectrum); + + /* + * Perform the inverse Fourier transform, see forward(). + * return is just the given output parameter 'output'. + */ + AlignedVector & inverse(const AlignedVector & spectrum, AlignedVector & output); + + + // provide additional functions with spectrum in some internal Layout. + // these are faster, cause the implementation omits the reordering. + // these are useful in special applications, like fast convolution, + // where inverse() is following anyway .. + + /* + * Perform the forward Fourier transform - similar to forward(), BUT: + * + * The z-domain data is stored in the most efficient order + * for transforming it back, or using it for convolution. + * If you need to have its content sorted in the "usual" canonical order, + * either use forward(), or call reorderSpectrum() after calling + * forwardToInternalLayout(), and before the backward fft + * + * return is just the given output parameter 'spectrum_internal_layout'. + */ + AlignedVector & forwardToInternalLayout( + const AlignedVector & input, + AlignedVector & spectrum_internal_layout ); + + /* + * Perform the inverse Fourier transform, see forwardToInternalLayout() + * + * return is just the given output parameter 'output'. 
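As noted above, transforms are unscaled, so inverse(forward(x)) returns N*x. A short sketch of the vector-based API under the default single-precision build (neither PFFFT_ENABLE_FLOAT nor PFFFT_ENABLE_DOUBLE defined); the length and the impulse input are made up for illustration:

#include <complex>
#include "pffft.hpp"

void roundtrip_sketch()
{
    pffft::Fft<float> fft(512);                                   // real transform, N = 512
    pffft::AlignedVector<float> x = fft.valueVector();            // N reals
    pffft::AlignedVector< std::complex<float> > X = fft.spectrumVector();  // N/2 complex bins

    for (int i = 0; i < fft.getLength(); ++i)
        x[i] = (i == 0) ? 1.0f : 0.0f;                            // unit impulse

    fft.forward(x, X);                                            // spectrum in canonical order
    fft.inverse(X, x);                                            // gives N * original signal
    for (int i = 0; i < fft.getLength(); ++i)
        x[i] *= 1.0f / fft.getLength();                           // undo the factor N
}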
+ */ + AlignedVector & inverseFromInternalLayout( + const AlignedVector & spectrum_internal_layout, + AlignedVector & output ); + + /* + * Reorder the spectrum from internal layout to have the + * frequency components in the correct "canonical" order. + * see forward() for a description of the canonical order. + * + * input and output should not alias. + */ + void reorderSpectrum( + const AlignedVector & input, + AlignedVector & output ); + + /* + * Perform a multiplication of the frequency components of + * spectrum_internal_a and spectrum_internal_b + * into spectrum_internal_ab. + * The arrays should have been obtained with forwardToInternalLayout) + * and should *not* have been reordered with reorderSpectrum(). + * + * the operation performed is: + * spectrum_internal_ab = (spectrum_internal_a * spectrum_internal_b)*scaling + * + * The spectrum_internal_[a][b], pointers may alias. + * return is just the given output parameter 'spectrum_internal_ab'. + */ + AlignedVector & convolve( + const AlignedVector & spectrum_internal_a, + const AlignedVector & spectrum_internal_b, + AlignedVector & spectrum_internal_ab, + const Scalar scaling ); + + /* + * Perform a multiplication and accumulation of the frequency components + * - similar to convolve(). + * + * the operation performed is: + * spectrum_internal_ab += (spectrum_internal_a * spectrum_internal_b)*scaling + * + * The spectrum_internal_[a][b], pointers may alias. + * return is just the given output parameter 'spectrum_internal_ab'. + */ + AlignedVector & convolveAccumulate( + const AlignedVector & spectrum_internal_a, + const AlignedVector & spectrum_internal_b, + AlignedVector & spectrum_internal_ab, + const Scalar scaling ); + + + //////////////////////////////////////////// + //// + //// API 2, dealing with raw pointers, + //// which need to be deallocated using alignedFree() + //// + //// the special allocation is required cause SIMD + //// implementations require aligned memory + //// + //// Method descriptions are equal to the methods above, + //// having AlignedVector parameters - instead of raw pointers. + //// That is why following methods have no documentation. + //// + //////////////////////////////////////////// + + static void alignedFree(void* ptr); + + static T * alignedAllocType(int length); + static Scalar* alignedAllocScalar(int length); + static Complex* alignedAllocComplex(int length); + + // core API, having the spectrum in canonical order + + Complex* forward(const T* input, Complex* spectrum); + + T* inverse(const Complex* spectrum, T* output); + + + // provide additional functions with spectrum in some internal Layout. + // these are faster, cause the implementation omits the reordering. + // these are useful in special applications, like fast convolution, + // where inverse() is following anyway .. 
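Since the internal-layout methods above exist precisely for fast convolution, here is a hedged sketch of that pattern with the vector API; the block length, the contents of a and b, and the 1/N scaling choice are illustrative only:

#include "pffft.hpp"

void convolution_sketch()
{
    pffft::Fft<float> fft(1024);
    pffft::AlignedVector<float> a   = fft.valueVector();          // signal block
    pffft::AlignedVector<float> b   = fft.valueVector();          // zero-padded filter
    pffft::AlignedVector<float> A   = fft.internalLayoutVector();
    pffft::AlignedVector<float> B   = fft.internalLayoutVector();
    pffft::AlignedVector<float> AB  = fft.internalLayoutVector();
    pffft::AlignedVector<float> out = fft.valueVector();

    /* ... fill a and b ... */
    fft.forwardToInternalLayout(a, A);                            // no reordering performed
    fft.forwardToInternalLayout(b, B);
    fft.convolve(A, B, AB, 1.0f / fft.getLength());               // AB = (A * B) * 1/N
    fft.inverseFromInternalLayout(AB, out);                       // circular convolution of a and b
}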
+ + Scalar* forwardToInternalLayout(const T* input, + Scalar* spectrum_internal_layout); + + T* inverseFromInternalLayout(const Scalar* spectrum_internal_layout, T* output); + + void reorderSpectrum(const Scalar* input, Complex* output ); + + Scalar* convolve(const Scalar* spectrum_internal_a, + const Scalar* spectrum_internal_b, + Scalar* spectrum_internal_ab, + const Scalar scaling); + + Scalar* convolveAccumulate(const Scalar* spectrum_internal_a, + const Scalar* spectrum_internal_b, + Scalar* spectrum_internal_ab, + const Scalar scaling); + +private: + detail::Setup setup; + Scalar* work; + int length; + int stackThresholdLen; +}; + + +template +inline T* alignedAlloc(int length) { + return (T*)detail::pffft_aligned_malloc( length * sizeof(T) ); +} + +inline void alignedFree(void *ptr) { + detail::pffft_aligned_free(ptr); +} + + +// simple helper to determine next power of 2 - without inexact/rounding floating point operations +inline int nextPowerOfTwo(int N) { + return detail::pffft_next_power_of_two(N); +} + +inline bool isPowerOfTwo(int N) { + return detail::pffft_is_power_of_two(N) ? true : false; +} + + + +//////////////////////////////////////////////////////////////////// + +// implementation + +namespace detail { + +template +class Setup +{}; + +#if defined(PFFFT_ENABLE_FLOAT) || ( !defined(PFFFT_ENABLE_FLOAT) && !defined(PFFFT_ENABLE_DOUBLE) ) + +template<> +class Setup +{ + PFFFT_Setup* self; + +public: + typedef float value_type; + typedef Types< value_type >::Scalar Scalar; + + Setup() + : self(NULL) + {} + + ~Setup() { pffft_destroy_setup(self); } + + void prepareLength(int length) + { + if (self) { + pffft_destroy_setup(self); + } + self = pffft_new_setup(length, PFFFT_REAL); + } + + bool isValid() const { return (self); } + + void transform_ordered(const Scalar* input, + Scalar* output, + Scalar* work, + pffft_direction_t direction) + { + pffft_transform_ordered(self, input, output, work, direction); + } + + void transform(const Scalar* input, + Scalar* output, + Scalar* work, + pffft_direction_t direction) + { + pffft_transform(self, input, output, work, direction); + } + + void reorder(const Scalar* input, Scalar* output, pffft_direction_t direction) + { + pffft_zreorder(self, input, output, direction); + } + + void convolveAccumulate(const Scalar* dft_a, + const Scalar* dft_b, + Scalar* dft_ab, + const Scalar scaling) + { + pffft_zconvolve_accumulate(self, dft_a, dft_b, dft_ab, scaling); + } + + void convolve(const Scalar* dft_a, + const Scalar* dft_b, + Scalar* dft_ab, + const Scalar scaling) + { + pffft_zconvolve_no_accu(self, dft_a, dft_b, dft_ab, scaling); + } +}; + + +template<> +class Setup< std::complex > +{ + PFFFT_Setup* self; + +public: + typedef std::complex value_type; + typedef Types< value_type >::Scalar Scalar; + + Setup() + : self(NULL) + {} + + ~Setup() { pffft_destroy_setup(self); } + + void prepareLength(int length) + { + if (self) { + pffft_destroy_setup(self); + } + self = pffft_new_setup(length, PFFFT_COMPLEX); + } + + bool isValid() const { return (self); } + + void transform_ordered(const Scalar* input, + Scalar* output, + Scalar* work, + pffft_direction_t direction) + { + pffft_transform_ordered(self, input, output, work, direction); + } + + void transform(const Scalar* input, + Scalar* output, + Scalar* work, + pffft_direction_t direction) + { + pffft_transform(self, input, output, work, direction); + } + + void reorder(const Scalar* input, Scalar* output, pffft_direction_t direction) + { + pffft_zreorder(self, input, output, direction); + } 
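A corresponding sketch of the raw-pointer API (API 2) declared above; only getLength() and getSpectrumSize() drive the allocation sizes, the rest is illustrative:

#include "pffft.hpp"

void raw_pointer_sketch()
{
    typedef pffft::Fft<float> FFT;
    FFT fft(2048);

    float*        in   = FFT::alignedAllocType(fft.getLength());
    FFT::Complex* spec = FFT::alignedAllocComplex(fft.getSpectrumSize());

    /* ... fill in[0 .. getLength()-1] ... */
    fft.forward(in, spec);                    // spectrum in canonical order

    FFT::alignedFree(spec);
    FFT::alignedFree(in);
}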
+ + void convolve(const Scalar* dft_a, + const Scalar* dft_b, + Scalar* dft_ab, + const Scalar scaling) + { + pffft_zconvolve_no_accu(self, dft_a, dft_b, dft_ab, scaling); + } +}; + +#endif /* defined(PFFFT_ENABLE_FLOAT) || ( !defined(PFFFT_ENABLE_FLOAT) && !defined(PFFFT_ENABLE_DOUBLE) ) */ + + +#if defined(PFFFT_ENABLE_DOUBLE) + +template<> +class Setup +{ + PFFFTD_Setup* self; + +public: + typedef double value_type; + typedef Types< value_type >::Scalar Scalar; + + Setup() + : self(NULL) + {} + + ~Setup() { pffftd_destroy_setup(self); } + + void prepareLength(int length) + { + if (self) { + pffftd_destroy_setup(self); + self = NULL; + } + if (length > 0) { + self = pffftd_new_setup(length, PFFFT_REAL); + } + } + + bool isValid() const { return (self); } + + void transform_ordered(const Scalar* input, + Scalar* output, + Scalar* work, + pffft_direction_t direction) + { + pffftd_transform_ordered(self, input, output, work, direction); + } + + void transform(const Scalar* input, + Scalar* output, + Scalar* work, + pffft_direction_t direction) + { + pffftd_transform(self, input, output, work, direction); + } + + void reorder(const Scalar* input, Scalar* output, pffft_direction_t direction) + { + pffftd_zreorder(self, input, output, direction); + } + + void convolveAccumulate(const Scalar* dft_a, + const Scalar* dft_b, + Scalar* dft_ab, + const Scalar scaling) + { + pffftd_zconvolve_accumulate(self, dft_a, dft_b, dft_ab, scaling); + } + + void convolve(const Scalar* dft_a, + const Scalar* dft_b, + Scalar* dft_ab, + const Scalar scaling) + { + pffftd_zconvolve_no_accu(self, dft_a, dft_b, dft_ab, scaling); + } +}; + +template<> +class Setup< std::complex > +{ + PFFFTD_Setup* self; + +public: + typedef std::complex value_type; + typedef Types< value_type >::Scalar Scalar; + + Setup() + : self(NULL) + {} + + ~Setup() { pffftd_destroy_setup(self); } + + void prepareLength(int length) + { + if (self) { + pffftd_destroy_setup(self); + } + self = pffftd_new_setup(length, PFFFT_COMPLEX); + } + + bool isValid() const { return (self); } + + void transform_ordered(const Scalar* input, + Scalar* output, + Scalar* work, + pffft_direction_t direction) + { + pffftd_transform_ordered(self, input, output, work, direction); + } + + void transform(const Scalar* input, + Scalar* output, + Scalar* work, + pffft_direction_t direction) + { + pffftd_transform(self, input, output, work, direction); + } + + void reorder(const Scalar* input, Scalar* output, pffft_direction_t direction) + { + pffftd_zreorder(self, input, output, direction); + } + + void convolveAccumulate(const Scalar* dft_a, + const Scalar* dft_b, + Scalar* dft_ab, + const Scalar scaling) + { + pffftd_zconvolve_accumulate(self, dft_a, dft_b, dft_ab, scaling); + } + + void convolve(const Scalar* dft_a, + const Scalar* dft_b, + Scalar* dft_ab, + const Scalar scaling) + { + pffftd_zconvolve_no_accu(self, dft_a, dft_b, dft_ab, scaling); + } +}; + +#endif /* defined(PFFFT_ENABLE_DOUBLE) */ + +} // end of anonymous namespace for Setup<> + + +template +inline Fft::Fft(int length, int stackThresholdLen) + : work(NULL) + , length(0) + , stackThresholdLen(stackThresholdLen) +{ +#if (__cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900)) + static_assert( sizeof(Complex) == 2 * sizeof(Scalar), "pffft requires sizeof(std::complex<>) == 2 * sizeof(Scalar)" ); +#elif defined(__GNUC__) + char static_assert_like[(sizeof(Complex) == 2 * sizeof(Scalar)) ? 
1 : -1]; // pffft requires sizeof(std::complex<>) == 2 * sizeof(Scalar) +#endif + prepareLength(length); +} + +template +inline Fft::~Fft() +{ + alignedFree(work); +} + +template +inline bool +Fft::isValid() const +{ + return setup.isValid(); +} + +template +inline bool +Fft::prepareLength(int newLength) +{ + if(newLength < minFFtsize()) + return false; + + const bool wasOnHeap = ( work != NULL ); + + const bool useHeap = newLength > stackThresholdLen; + + if (useHeap == wasOnHeap && newLength == length) { + return true; + } + + length = 0; + + setup.prepareLength(newLength); + if (!setup.isValid()) + return false; + + length = newLength; + + if (work) { + alignedFree(work); + work = NULL; + } + + if (useHeap) { + work = reinterpret_cast( alignedAllocType(length) ); + } + + return true; +} + + +template +inline AlignedVector +Fft::valueVector() const +{ + return AlignedVector(length); +} + +template +inline AlignedVector< typename Fft::Complex > +Fft::spectrumVector() const +{ + return AlignedVector( getSpectrumSize() ); +} + +template +inline AlignedVector< typename Fft::Scalar > +Fft::internalLayoutVector() const +{ + return AlignedVector( getInternalLayoutSize() ); +} + + +template +inline AlignedVector< typename Fft::Complex > & +Fft::forward(const AlignedVector & input, AlignedVector & spectrum) +{ + forward( input.data(), spectrum.data() ); + return spectrum; +} + +template +inline AlignedVector & +Fft::inverse(const AlignedVector & spectrum, AlignedVector & output) +{ + inverse( spectrum.data(), output.data() ); + return output; +} + + +template +inline AlignedVector< typename Fft::Scalar > & +Fft::forwardToInternalLayout( + const AlignedVector & input, + AlignedVector & spectrum_internal_layout ) +{ + forwardToInternalLayout( input.data(), spectrum_internal_layout.data() ); + return spectrum_internal_layout; +} + +template +inline AlignedVector & +Fft::inverseFromInternalLayout( + const AlignedVector & spectrum_internal_layout, + AlignedVector & output ) +{ + inverseFromInternalLayout( spectrum_internal_layout.data(), output.data() ); + return output; +} + +template +inline void +Fft::reorderSpectrum( + const AlignedVector & input, + AlignedVector & output ) +{ + reorderSpectrum( input.data(), output.data() ); +} + +template +inline AlignedVector< typename Fft::Scalar > & +Fft::convolveAccumulate( + const AlignedVector & spectrum_internal_a, + const AlignedVector & spectrum_internal_b, + AlignedVector & spectrum_internal_ab, + const Scalar scaling ) +{ + convolveAccumulate( spectrum_internal_a.data(), spectrum_internal_b.data(), + spectrum_internal_ab.data(), scaling ); + return spectrum_internal_ab; +} + +template +inline AlignedVector< typename Fft::Scalar > & +Fft::convolve( + const AlignedVector & spectrum_internal_a, + const AlignedVector & spectrum_internal_b, + AlignedVector & spectrum_internal_ab, + const Scalar scaling ) +{ + convolve( spectrum_internal_a.data(), spectrum_internal_b.data(), + spectrum_internal_ab.data(), scaling ); + return spectrum_internal_ab; +} + + +template +inline typename Fft::Complex * +Fft::forward(const T* input, Complex * spectrum) +{ + assert(isValid()); + setup.transform_ordered(reinterpret_cast(input), + reinterpret_cast(spectrum), + work, + detail::PFFFT_FORWARD); + return spectrum; +} + +template +inline T* +Fft::inverse(Complex const* spectrum, T* output) +{ + assert(isValid()); + setup.transform_ordered(reinterpret_cast(spectrum), + reinterpret_cast(output), + work, + detail::PFFFT_BACKWARD); + return output; +} + +template +inline 
typename pffft::Fft::Scalar* +Fft::forwardToInternalLayout(const T* input, Scalar* spectrum_internal_layout) +{ + assert(isValid()); + setup.transform(reinterpret_cast(input), + spectrum_internal_layout, + work, + detail::PFFFT_FORWARD); + return spectrum_internal_layout; +} + +template +inline T* +Fft::inverseFromInternalLayout(const Scalar* spectrum_internal_layout, T* output) +{ + assert(isValid()); + setup.transform(spectrum_internal_layout, + reinterpret_cast(output), + work, + detail::PFFFT_BACKWARD); + return output; +} + +template +inline void +Fft::reorderSpectrum( const Scalar* input, Complex* output ) +{ + assert(isValid()); + setup.reorder(input, reinterpret_cast(output), detail::PFFFT_FORWARD); +} + +template +inline typename pffft::Fft::Scalar* +Fft::convolveAccumulate(const Scalar* dft_a, + const Scalar* dft_b, + Scalar* dft_ab, + const Scalar scaling) +{ + assert(isValid()); + setup.convolveAccumulate(dft_a, dft_b, dft_ab, scaling); + return dft_ab; +} + +template +inline typename pffft::Fft::Scalar* +Fft::convolve(const Scalar* dft_a, + const Scalar* dft_b, + Scalar* dft_ab, + const Scalar scaling) +{ + assert(isValid()); + setup.convolve(dft_a, dft_b, dft_ab, scaling); + return dft_ab; +} + +template +inline void +Fft::alignedFree(void* ptr) +{ + pffft::alignedFree(ptr); +} + + +template +inline T* +pffft::Fft::alignedAllocType(int length) +{ + return alignedAlloc(length); +} + +template +inline typename pffft::Fft::Scalar* +pffft::Fft::alignedAllocScalar(int length) +{ + return alignedAlloc(length); +} + +template +inline typename Fft::Complex * +Fft::alignedAllocComplex(int length) +{ + return alignedAlloc(length); +} + + + +//////////////////////////////////////////////////////////////////// + +// Allocator - for std::vector<>: +// origin: http://www.josuttis.com/cppcode/allocator.html +// http://www.josuttis.com/cppcode/myalloc.hpp +// +// minor renaming and utilizing of pffft (de)allocation functions +// are applied to Jossutis' allocator + +/* The following code example is taken from the book + * "The C++ Standard Library - A Tutorial and Reference" + * by Nicolai M. Josuttis, Addison-Wesley, 1999 + * + * (C) Copyright Nicolai M. Josuttis 1999. + * Permission to copy, use, modify, sell and distribute this software + * is granted provided this copyright notice appears in all copies. + * This software is provided "as is" without express or implied + * warranty, and with no claim as to its suitability for any purpose. 
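The allocator below is what lets AlignedVector hand out SIMD-aligned storage through std::vector. A tiny sketch, with the element count and the printed modulus chosen only for illustration:

#include <cstdio>
#include <cstdint>
#include <vector>
#include "pffft.hpp"

void allocator_sketch()
{
    // pffft::AlignedVector<float> is simply std::vector<float, pffft::PFAlloc<float> >
    std::vector<float, pffft::PFAlloc<float> > v(256);
    std::printf("address mod 16 = %u\n",
                (unsigned)(reinterpret_cast<std::uintptr_t>(&v[0]) % 16u));   // expected: 0
}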
+ */ + +template +class PFAlloc { + public: + // type definitions + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + // rebind allocator to type U + template + struct rebind { + typedef PFAlloc other; + }; + + // return address of values + pointer address (reference value) const { + return &value; + } + const_pointer address (const_reference value) const { + return &value; + } + + /* constructors and destructor + * - nothing to do because the allocator has no state + */ + PFAlloc() throw() { + } + PFAlloc(const PFAlloc&) throw() { + } + template + PFAlloc (const PFAlloc&) throw() { + } + ~PFAlloc() throw() { + } + + // return maximum number of elements that can be allocated + size_type max_size () const throw() { + return std::numeric_limits::max() / sizeof(T); + } + + // allocate but don't initialize num elements of type T + pointer allocate (size_type num, const void* = 0) { + pointer ret = (pointer)( alignedAlloc(int(num)) ); + return ret; + } + + // initialize elements of allocated storage p with value value + void construct (pointer p, const T& value) { + // initialize memory with placement new + new((void*)p)T(value); + } + + // destroy elements of initialized storage p + void destroy (pointer p) { + // destroy objects by calling their destructor + p->~T(); + } + + // deallocate storage p of deleted elements + void deallocate (pointer p, size_type num) { + // deallocate memory with pffft + alignedFree( (void*)p ); + } +}; + +// return that all specializations of this allocator are interchangeable +template +bool operator== (const PFAlloc&, + const PFAlloc&) throw() { + return true; +} +template +bool operator!= (const PFAlloc&, + const PFAlloc&) throw() { + return false; +} + + +} // namespace pffft + diff --git a/pffft/pffft_common.c b/pffft/pffft_common.c new file mode 100644 index 0000000..106fdd2 --- /dev/null +++ b/pffft/pffft_common.c @@ -0,0 +1,53 @@ + +#include "pffft.h" + +#include + +/* SSE and co like 16-bytes aligned pointers + * with a 64-byte alignment, we are even aligned on L2 cache lines... 
*/ +#define MALLOC_V4SF_ALIGNMENT 64 + +static void * Valigned_malloc(size_t nb_bytes) { + void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT); + if (!p0) return (void *) 0; + p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1)))); + *((void **) p - 1) = p0; + return p; +} + +static void Valigned_free(void *p) { + if (p) free(*((void **) p - 1)); +} + + +static int next_power_of_two(int N) { + /* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */ + /* compute the next highest power of 2 of 32-bit v */ + unsigned v = N; + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return v; +} + +static int is_power_of_two(int N) { + /* https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2 */ + int f = N && !(N & (N - 1)); + return f; +} + + + +void *pffft_aligned_malloc(size_t nb_bytes) { return Valigned_malloc(nb_bytes); } +void pffft_aligned_free(void *p) { Valigned_free(p); } +int pffft_next_power_of_two(int N) { return next_power_of_two(N); } +int pffft_is_power_of_two(int N) { return is_power_of_two(N); } + +void *pffftd_aligned_malloc(size_t nb_bytes) { return Valigned_malloc(nb_bytes); } +void pffftd_aligned_free(void *p) { Valigned_free(p); } +int pffftd_next_power_of_two(int N) { return next_power_of_two(N); } +int pffftd_is_power_of_two(int N) { return is_power_of_two(N); } diff --git a/pffft/pffft_double.c b/pffft/pffft_double.c new file mode 100644 index 0000000..066782b --- /dev/null +++ b/pffft/pffft_double.c @@ -0,0 +1,147 @@ +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de ) + Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com ) + + Based on original fortran 77 code from FFTPACKv4 from NETLIB + (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber + of NCAR, in 1985. + + As confirmed by the NCAR fftpack software curators, the following + FFTPACKv5 license applies to FFTPACKv4 sources. My changes are + released under the same terms. + + FFTPACK license: + + http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html + + Copyright (c) 2004 the University Corporation for Atmospheric + Research ("UCAR"). All rights reserved. Developed by NCAR's + Computational and Information Systems Laboratory, UCAR, + www.cisl.ucar.edu. + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. 
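The bit-twiddling helpers above are exported through the pffft_/pffftd_ wrappers; a small sketch of the values they produce (the sample inputs are arbitrary):

#include <cassert>
#include "pffft.h"

void size_helper_sketch(void)
{
    assert(pffft_next_power_of_two(300) == 512);   /* rounds 300 up to the next power of 2 */
    assert(pffft_next_power_of_two(512) == 512);   /* exact powers of 2 map to themselves */
    assert(pffft_is_power_of_two(384) == 0);       /* 384 = 2^7 * 3 is not a power of 2 */
    assert(pffft_is_power_of_two(0) == 0);         /* 0 is rejected by the N && ... test */
}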
IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. + + + PFFFT : a Pretty Fast FFT. + + This file is largerly based on the original FFTPACK implementation, modified in + order to take advantage of SIMD instructions of modern CPUs. +*/ + +/* + NOTE: This file is adapted from Julien Pommier's original PFFFT, + which works on 32 bit floating point precision using SSE instructions, + to work with 64 bit floating point precision using AVX instructions. + Author: Dario Mambro @ https://github.com/unevens/pffft +*/ + +#include "pffft_double.h" + +/* detect compiler flavour */ +#if defined(_MSC_VER) +# define COMPILER_MSVC +#elif defined(__GNUC__) +# define COMPILER_GCC +#endif + +#ifdef COMPILER_MSVC +# define _USE_MATH_DEFINES +# include +#elif defined(__MINGW32__) || defined(__MINGW64__) +# include +#else +# include +#endif + +#include +#include +#include +#include +#include + +#if defined(COMPILER_GCC) +# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) +# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline)) +# define RESTRICT __restrict +# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__]; +#elif defined(COMPILER_MSVC) +# define ALWAYS_INLINE(return_type) __forceinline return_type +# define NEVER_INLINE(return_type) __declspec(noinline) return_type +# define RESTRICT __restrict +# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__)) +#endif + + +#ifdef COMPILER_MSVC +#pragma warning( disable : 4244 4305 4204 4456 ) +#endif + +/* + vector support macros: the rest of the code is independant of + AVX -- adding support for other platforms with 4-element + vectors should be limited to these macros +*/ +#include "simd/pf_double.h" + +/* have code comparable with this definition */ +#define float double +#define SETUP_STRUCT PFFFTD_Setup +#define FUNC_NEW_SETUP pffftd_new_setup +#define FUNC_DESTROY pffftd_destroy_setup +#define FUNC_TRANSFORM_UNORDRD pffftd_transform +#define FUNC_TRANSFORM_ORDERED pffftd_transform_ordered +#define FUNC_ZREORDER pffftd_zreorder +#define FUNC_ZCONVOLVE_ACCUMULATE pffftd_zconvolve_accumulate +#define FUNC_ZCONVOLVE_NO_ACCU pffftd_zconvolve_no_accu + +#define FUNC_ALIGNED_MALLOC pffftd_aligned_malloc +#define FUNC_ALIGNED_FREE pffftd_aligned_free +#define FUNC_SIMD_SIZE pffftd_simd_size +#define FUNC_MIN_FFT_SIZE pffftd_min_fft_size +#define FUNC_IS_VALID_SIZE pffftd_is_valid_size +#define FUNC_NEAREST_SIZE pffftd_nearest_transform_size +#define FUNC_SIMD_ARCH pffftd_simd_arch +#define FUNC_VALIDATE_SIMD_A validate_pffftd_simd +#define FUNC_VALIDATE_SIMD_EX validate_pffftd_simd_ex + +#define FUNC_CPLX_FINALIZE pffftd_cplx_finalize +#define FUNC_CPLX_PREPROCESS pffftd_cplx_preprocess +#define FUNC_REAL_PREPROCESS_4X4 pffftd_real_preprocess_4x4 +#define FUNC_REAL_PREPROCESS pffftd_real_preprocess +#define FUNC_REAL_FINALIZE_4X4 pffftd_real_finalize_4x4 +#define FUNC_REAL_FINALIZE pffftd_real_finalize +#define FUNC_TRANSFORM_INTERNAL pffftd_transform_internal + +#define FUNC_COS cos +#define FUNC_SIN sin + + +#include "pffft_priv_impl.h" + + diff --git a/pffft/pffft_double.h b/pffft/pffft_double.h new file mode 100644 index 0000000..afa8de0 --- 
/dev/null +++ b/pffft/pffft_double.h @@ -0,0 +1,236 @@ +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Based on original fortran 77 code from FFTPACKv4 from NETLIB, + authored by Dr Paul Swarztrauber of NCAR, in 1985. + + As confirmed by the NCAR fftpack software curators, the following + FFTPACKv5 license applies to FFTPACKv4 sources. My changes are + released under the same terms. + + FFTPACK license: + + http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html + + Copyright (c) 2004 the University Corporation for Atmospheric + Research ("UCAR"). All rights reserved. Developed by NCAR's + Computational and Information Systems Laboratory, UCAR, + www.cisl.ucar.edu. + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. +*/ +/* + NOTE: This file is adapted from Julien Pommier's original PFFFT, + which works on 32 bit floating point precision using SSE instructions, + to work with 64 bit floating point precision using AVX instructions. + Author: Dario Mambro @ https://github.com/unevens/pffft +*/ +/* + PFFFT : a Pretty Fast FFT. + + This is basically an adaptation of the single precision fftpack + (v4) as found on netlib taking advantage of SIMD instruction found + on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON). + + For architectures where no SIMD instruction is available, the code + falls back to a scalar version. + + Restrictions: + + - 1D transforms only, with 64-bit double precision. + + - supports only transforms for inputs of length N of the form + N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128, + 144, 160, etc are all acceptable lengths). Performance is best for + 128<=N<=8192. + + - all (double*) pointers in the functions below are expected to + have an "simd-compatible" alignment, that is 32 bytes on x86 and + powerpc CPUs. + + You can allocate such buffers with the functions + pffft_aligned_malloc / pffft_aligned_free (or with stuff like + posix_memalign..) + +*/ + +#ifndef PFFFT_DOUBLE_H +#define PFFFT_DOUBLE_H + +#include /* for size_t */ + +#ifdef __cplusplus +extern "C" { +#endif + + /* opaque struct holding internal stuff (precomputed twiddle factors) + this struct can be shared by many threads as it contains only + read-only data. 
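For orientation, a hedged sketch of the double-precision entry points this header declares, using the unordered transform plus pffftd_zreorder described below; the length 4096 and the buffer names are illustrative:

#include "pffft_double.h"

void forward_real_double_sketch(void)
{
    int N = 4096;                                   /* 2^12, acceptable for PFFFT_REAL */
    PFFFTD_Setup *setup = pffftd_new_setup(N, PFFFT_REAL);
    double *input   = (double*)pffftd_aligned_malloc((size_t)N * sizeof(double));
    double *zdomain = (double*)pffftd_aligned_malloc((size_t)N * sizeof(double));
    double *ordered = (double*)pffftd_aligned_malloc((size_t)N * sizeof(double));
    double *work    = (double*)pffftd_aligned_malloc((size_t)N * sizeof(double));

    /* ... fill input[0 .. N-1] ... */
    pffftd_transform(setup, input, zdomain, work, PFFFT_FORWARD);   /* z-domain internal order */
    pffftd_zreorder(setup, zdomain, ordered, PFFFT_FORWARD);        /* canonical interleaved order */

    pffftd_aligned_free(work);
    pffftd_aligned_free(ordered);
    pffftd_aligned_free(zdomain);
    pffftd_aligned_free(input);
    pffftd_destroy_setup(setup);
}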
+ */ + typedef struct PFFFTD_Setup PFFFTD_Setup; + +#ifndef PFFFT_COMMON_ENUMS +#define PFFFT_COMMON_ENUMS + + /* direction of the transform */ + typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t; + + /* type of transform */ + typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t; + +#endif + + /* + prepare for performing transforms of size N -- the returned + PFFFTD_Setup structure is read-only so it can safely be shared by + multiple concurrent threads. + */ + PFFFTD_Setup *pffftd_new_setup(int N, pffft_transform_t transform); + void pffftd_destroy_setup(PFFFTD_Setup *); + /* + Perform a Fourier transform , The z-domain data is stored in the + most efficient order for transforming it back, or using it for + convolution. If you need to have its content sorted in the + "usual" way, that is as an array of interleaved complex numbers, + either use pffft_transform_ordered , or call pffft_zreorder after + the forward fft, and before the backward fft. + + Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x. + Typically you will want to scale the backward transform by 1/N. + + The 'work' pointer should point to an area of N (2*N for complex + fft) doubles, properly aligned. If 'work' is NULL, then stack will + be used instead (this is probably the best strategy for small + FFTs, say for N < 16384). Threads usually have a small stack, that + there's no sufficient amount of memory, usually leading to a crash! + Use the heap with pffft_aligned_malloc() in this case. + + input and output may alias. + */ + void pffftd_transform(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction); + + /* + Similar to pffft_transform, but makes sure that the output is + ordered as expected (interleaved complex numbers). This is + similar to calling pffft_transform and then pffft_zreorder. + + input and output may alias. + */ + void pffftd_transform_ordered(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction); + + /* + call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(..., + PFFFT_FORWARD) if you want to have the frequency components in + the correct "canonical" order, as interleaved complex numbers. + + (for real transforms, both 0-frequency and half frequency + components, which are real, are assembled in the first entry as + F(0)+i*F(n/2+1). Note that the original fftpack did place + F(n/2+1) at the end of the arrays). + + input and output should not alias. + */ + void pffftd_zreorder(PFFFTD_Setup *setup, const double *input, double *output, pffft_direction_t direction); + + /* + Perform a multiplication of the frequency components of dft_a and + dft_b and accumulate them into dft_ab. The arrays should have + been obtained with pffft_transform(.., PFFFT_FORWARD) and should + *not* have been reordered with pffft_zreorder (otherwise just + perform the operation yourself as the dft coefs are stored as + interleaved complex numbers). + + the operation performed is: dft_ab += (dft_a * fdt_b)*scaling + + The dft_a, dft_b and dft_ab pointers may alias. + */ + void pffftd_zconvolve_accumulate(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double *dft_ab, double scaling); + + /* + Perform a multiplication of the frequency components of dft_a and + dft_b and put result in dft_ab. 
The arrays should have + been obtained with pffft_transform(.., PFFFT_FORWARD) and should + *not* have been reordered with pffft_zreorder (otherwise just + perform the operation yourself as the dft coefs are stored as + interleaved complex numbers). + + the operation performed is: dft_ab = (dft_a * fdt_b)*scaling + + The dft_a, dft_b and dft_ab pointers may alias. + */ + void pffftd_zconvolve_no_accu(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double*dft_ab, double scaling); + + /* return 4 or 1 wether support AVX instructions was enabled when building pffft-double.c */ + int pffftd_simd_size(); + + /* return string identifier of used architecture (AVX/..) */ + const char * pffftd_simd_arch(); + + /* simple helper to get minimum possible fft size */ + int pffftd_min_fft_size(pffft_transform_t transform); + + /* simple helper to determine size N is valid + - factorizable to pffft_min_fft_size() with factors 2, 3, 5 + */ + int pffftd_is_valid_size(int N, pffft_transform_t cplx); + + /* determine nearest valid transform size (by brute-force testing) + - factorizable to pffft_min_fft_size() with factors 2, 3, 5. + higher: bool-flag to find nearest higher value; else lower. + */ + int pffftd_nearest_transform_size(int N, pffft_transform_t cplx, int higher); + + + /* following functions are identical to the pffft_ functions - both declared */ + + /* simple helper to determine next power of 2 + - without inexact/rounding floating point operations + */ + int pffftd_next_power_of_two(int N); + int pffft_next_power_of_two(int N); + + /* simple helper to determine if power of 2 - returns bool */ + int pffftd_is_power_of_two(int N); + int pffft_is_power_of_two(int N); + + /* + the double buffers must have the correct alignment (32-byte boundary + on intel and powerpc). This function may be used to obtain such + correctly aligned buffers. + */ + void *pffftd_aligned_malloc(size_t nb_bytes); + void *pffft_aligned_malloc(size_t nb_bytes); + void pffftd_aligned_free(void *); + void pffft_aligned_free(void *); + +#ifdef __cplusplus +} +#endif + +#endif /* PFFFT_DOUBLE_H */ + diff --git a/pffft/pffft_priv_impl.h b/pffft/pffft_priv_impl.h new file mode 100644 index 0000000..e92fdc8 --- /dev/null +++ b/pffft/pffft_priv_impl.h @@ -0,0 +1,2233 @@ +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de ) + Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com ) + + Based on original fortran 77 code from FFTPACKv4 from NETLIB + (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber + of NCAR, in 1985. + + As confirmed by the NCAR fftpack software curators, the following + FFTPACKv5 license applies to FFTPACKv4 sources. My changes are + released under the same terms. + + FFTPACK license: + + http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html + + Copyright (c) 2004 the University Corporation for Atmospheric + Research ("UCAR"). All rights reserved. Developed by NCAR's + Computational and Information Systems Laboratory, UCAR, + www.cisl.ucar.edu. + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. 
+ + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. + + + PFFFT : a Pretty Fast FFT. + + This file is largerly based on the original FFTPACK implementation, modified in + order to take advantage of SIMD instructions of modern CPUs. +*/ + +/* this file requires architecture specific preprocessor definitions + * it's only for library internal use + */ + + +/* define own constants required to turn off g++ extensions .. */ +#ifndef M_PI + #define M_PI 3.14159265358979323846 /* pi */ +#endif + +#ifndef M_SQRT2 + #define M_SQRT2 1.41421356237309504880 /* sqrt(2) */ +#endif + + +int FUNC_SIMD_SIZE() { return SIMD_SZ; } + +int FUNC_MIN_FFT_SIZE(pffft_transform_t transform) { + /* unfortunately, the fft size must be a multiple of 16 for complex FFTs + and 32 for real FFTs -- a lot of stuff would need to be rewritten to + handle other cases (or maybe just switch to a scalar fft, I don't know..) */ + int simdSz = FUNC_SIMD_SIZE(); + if (transform == PFFFT_REAL) + return ( 2 * simdSz * simdSz ); + else if (transform == PFFFT_COMPLEX) + return ( simdSz * simdSz ); + else + return 1; +} + +int FUNC_IS_VALID_SIZE(int N, pffft_transform_t cplx) { + const int N_min = FUNC_MIN_FFT_SIZE(cplx); + int R = N; + while (R >= 5*N_min && (R % 5) == 0) R /= 5; + while (R >= 3*N_min && (R % 3) == 0) R /= 3; + while (R >= 2*N_min && (R % 2) == 0) R /= 2; + return (R == N_min) ? 1 : 0; +} + +int FUNC_NEAREST_SIZE(int N, pffft_transform_t cplx, int higher) { + int d; + const int N_min = FUNC_MIN_FFT_SIZE(cplx); + if (N < N_min) + N = N_min; + d = (higher) ? 
N_min : -N_min; + if (d > 0) + N = N_min * ((N+N_min-1) / N_min); /* round up */ + else + N = N_min * (N / N_min); /* round down */ + + for (; ; N += d) + if (FUNC_IS_VALID_SIZE(N, cplx)) + return N; +} + +const char * FUNC_SIMD_ARCH() { return VARCH; } + + +/* + passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2 +*/ +static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) { + int k, i; + int l1ido = l1*ido; + if (ido <= 2) { + for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) { + ch[0] = VADD(cc[0], cc[ido+0]); + ch[l1ido] = VSUB(cc[0], cc[ido+0]); + ch[1] = VADD(cc[1], cc[ido+1]); + ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]); + } + } else { + for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) { + for (i=0; i 2); + for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) { + for (i=0; i 2); + for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) { + for (i = 0; i < ido-1; i += 2) { + ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5)); + ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5)); + ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4)); + ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4)); + tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5)); + tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5)); + tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4)); + tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4)); + ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3)); + ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3)); + cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3))); + ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3))); + cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3))); + ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3))); + cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4)); + ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); + cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4)); + ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); + dr3 = VSUB(cr3, ci4); + dr4 = VADD(cr3, ci4); + di3 = VADD(ci3, cr4); + di4 = VSUB(ci3, cr4); + dr5 = VADD(cr2, ci5); + dr2 = VSUB(cr2, ci5); + di5 = VSUB(ci2, cr5); + di2 = VADD(ci2, cr5); + wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1]; + wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1]; + VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); + ch_ref(i - 1, 2) = dr2; + ch_ref(i, 2) = di2; + VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); + ch_ref(i - 1, 3) = dr3; + ch_ref(i, 3) = di3; + VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3)); + ch_ref(i - 1, 4) = dr4; + ch_ref(i, 4) = di4; + VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4)); + ch_ref(i - 1, 5) = dr5; + ch_ref(i, 5) = di5; + } + } +#undef ch_ref +#undef cc_ref +} + +static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) { + static const float minus_one = -1.f; + int i, k, l1ido = l1*ido; + for (k=0; k < l1ido; k += ido) { + v4sf a = cc[k], b = cc[k + l1ido]; + ch[2*k] = VADD(a, b); + ch[2*(k+ido)-1] = VSUB(a, b); + } + if (ido < 2) return; + if (ido != 2) { + for (k=0; k < l1ido; k += ido) { + for (i=2; i 5) { + wa[i1-1] = wa[i-1]; + wa[i1] = wa[i]; + } + } + l1 = l2; + } +} /* cffti1 */ + + +static v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) { + v4sf *in = (v4sf*)input_readonly; + v4sf *out = (in == work2 ? 
work1 : work2); + int nf = ifac[1], k1; + int l1 = 1; + int iw = 0; + assert(in != out && work1 != work2); + for (k1=2; k1<=nf+1; k1++) { + int ip = ifac[k1]; + int l2 = ip*l1; + int ido = n / l2; + int idot = ido + ido; + switch (ip) { + case 5: { + int ix2 = iw + idot; + int ix3 = ix2 + idot; + int ix4 = ix3 + idot; + passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); + } break; + case 4: { + int ix2 = iw + idot; + int ix3 = ix2 + idot; + passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign); + } break; + case 2: { + passf2_ps(idot, l1, in, out, &wa[iw], isign); + } break; + case 3: { + int ix2 = iw + idot; + passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign); + } break; + default: + assert(0); + } + l1 = l2; + iw += (ip - 1)*idot; + if (out == work2) { + out = work1; in = work2; + } else { + out = work2; in = work1; + } + } + + return in; /* this is in fact the output .. */ +} + + +struct SETUP_STRUCT { + int N; + int Ncvec; /* nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) */ + int ifac[15]; + pffft_transform_t transform; + v4sf *data; /* allocated room for twiddle coefs */ + float *e; /* points into 'data', N/4*3 elements */ + float *twiddle; /* points into 'data', N/4 elements */ +}; + +SETUP_STRUCT *FUNC_NEW_SETUP(int N, pffft_transform_t transform) { + SETUP_STRUCT *s = 0; + int k, m; + /* unfortunately, the fft size must be a multiple of 16 for complex FFTs + and 32 for real FFTs -- a lot of stuff would need to be rewritten to + handle other cases (or maybe just switch to a scalar fft, I don't know..) */ + if (transform == PFFFT_REAL) { if ((N%(2*SIMD_SZ*SIMD_SZ)) || N<=0) return s; } + if (transform == PFFFT_COMPLEX) { if ((N%( SIMD_SZ*SIMD_SZ)) || N<=0) return s; } + s = (SETUP_STRUCT*)malloc(sizeof(SETUP_STRUCT)); + if (!s) return s; + /* assert((N % 32) == 0); */ + s->N = N; + s->transform = transform; + /* nb of complex simd vectors */ + s->Ncvec = (transform == PFFFT_REAL ? 
N/2 : N)/SIMD_SZ; + s->data = (v4sf*)FUNC_ALIGNED_MALLOC(2*s->Ncvec * sizeof(v4sf)); + if (!s->data) { free(s); return 0; } + s->e = (float*)s->data; + s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ); + + if (transform == PFFFT_REAL) { + for (k=0; k < s->Ncvec; ++k) { + int i = k/SIMD_SZ; + int j = k%SIMD_SZ; + for (m=0; m < SIMD_SZ-1; ++m) { + float A = -2*(float)M_PI*(m+1)*k / N; + s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = FUNC_COS(A); + s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = FUNC_SIN(A); + } + } + rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); + } else { + for (k=0; k < s->Ncvec; ++k) { + int i = k/SIMD_SZ; + int j = k%SIMD_SZ; + for (m=0; m < SIMD_SZ-1; ++m) { + float A = -2*(float)M_PI*(m+1)*k / N; + s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = FUNC_COS(A); + s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = FUNC_SIN(A); + } + } + cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); + } + + /* check that N is decomposable with allowed prime factors */ + for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; } + if (m != N/SIMD_SZ) { + FUNC_DESTROY(s); s = 0; + } + + return s; +} + + +void FUNC_DESTROY(SETUP_STRUCT *s) { + if (!s) + return; + FUNC_ALIGNED_FREE(s->data); + free(s); +} + +#if ( SIMD_SZ == 4 ) /* !defined(PFFFT_SIMD_DISABLE) */ + +/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */ +static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) { + v4sf g0, g1; + int k; + INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride; + + *--out = VSWAPHL(g0, g1); /* [g0l, g0h], [g1l g1h] -> [g1l, g0h] */ + for (k=1; k < N; ++k) { + v4sf h0, h1; + INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride; + *--out = VSWAPHL(g1, h0); + *--out = VSWAPHL(h0, h1); + g1 = h1; + } + *--out = VSWAPHL(g1, g0); +} + +static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) { + v4sf g0, g1, h0, h1; + int k; + g0 = g1 = in[0]; ++in; + for (k=1; k < N; ++k) { + h0 = *in++; h1 = *in++; + g1 = VSWAPHL(g1, h0); + h0 = VSWAPHL(h0, h1); + UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride; + g1 = h1; + } + h0 = *in++; h1 = g0; + g1 = VSWAPHL(g1, h0); + h0 = VSWAPHL(h0, h1); + UNINTERLEAVE2(h0, g1, out[0], out[1]); +} + +void FUNC_ZREORDER(SETUP_STRUCT *setup, const float *in, float *out, pffft_direction_t direction) { + int k, N = setup->N, Ncvec = setup->Ncvec; + const v4sf *vin = (const v4sf*)in; + v4sf *vout = (v4sf*)out; + assert(in != out); + if (setup->transform == PFFFT_REAL) { + int k, dk = N/32; + if (direction == PFFFT_FORWARD) { + for (k=0; k < dk; ++k) { + INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]); + INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]); + } + reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2)); + reversed_copy(dk, vin+6, 8, (v4sf*)(out + N)); + } else { + for (k=0; k < dk; ++k) { + UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]); + UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]); + } + unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8); + unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8); + } + } else { + if (direction == PFFFT_FORWARD) { + for (k=0; k < Ncvec; ++k) { + int kk = (k/4) + (k%4)*(Ncvec/4); + INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]); + } + } else { + for (k=0; k < Ncvec; ++k) { + int kk = (k/4) + (k%4)*(Ncvec/4); + UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]); + } + } + } +} + +void 
FUNC_CPLX_FINALIZE(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { + int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */ + v4sf r0, i0, r1, i1, r2, i2, r3, i3; + v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; + assert(in != out); + for (k=0; k < dk; ++k) { + r0 = in[8*k+0]; i0 = in[8*k+1]; + r1 = in[8*k+2]; i1 = in[8*k+3]; + r2 = in[8*k+4]; i2 = in[8*k+5]; + r3 = in[8*k+6]; i3 = in[8*k+7]; + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]); + VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]); + VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]); + + sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2); + sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3); + si0 = VADD(i0,i2); di0 = VSUB(i0, i2); + si1 = VADD(i1,i3); di1 = VSUB(i1, i3); + + /* + transformation for each column is: + + [1 1 1 1 0 0 0 0] [r0] + [1 0 -1 0 0 -1 0 1] [r1] + [1 -1 1 -1 0 0 0 0] [r2] + [1 0 -1 0 0 1 0 -1] [r3] + [0 0 0 0 1 1 1 1] * [i0] + [0 1 0 -1 1 0 -1 0] [i1] + [0 0 0 0 1 -1 1 -1] [i2] + [0 -1 0 1 1 0 -1 0] [i3] + */ + + r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); + r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1); + r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); + r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1); + + *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; + *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; + } +} + +void FUNC_CPLX_PREPROCESS(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { + int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */ + v4sf r0, i0, r1, i1, r2, i2, r3, i3; + v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; + assert(in != out); + for (k=0; k < dk; ++k) { + r0 = in[8*k+0]; i0 = in[8*k+1]; + r1 = in[8*k+2]; i1 = in[8*k+3]; + r2 = in[8*k+4]; i2 = in[8*k+5]; + r3 = in[8*k+6]; i3 = in[8*k+7]; + + sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2); + sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3); + si0 = VADD(i0,i2); di0 = VSUB(i0, i2); + si1 = VADD(i1,i3); di1 = VSUB(i1, i3); + + r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); + r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1); + r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); + r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1); + + VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]); + VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]); + VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]); + + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + + *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; + *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; + } +} + + +static ALWAYS_INLINE(void) FUNC_REAL_FINALIZE_4X4(const v4sf *in0, const v4sf *in1, const v4sf *in, + const v4sf *e, v4sf *out) { + v4sf r0, i0, r1, i1, r2, i2, r3, i3; + v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; + r0 = *in0; i0 = *in1; + r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++; + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + + /* + transformation for each column is: + + [1 1 1 1 0 0 0 0] [r0] + [1 0 -1 0 0 -1 0 1] [r1] + [1 0 -1 0 0 1 0 -1] [r2] + [1 -1 1 -1 0 0 0 0] [r3] + [0 0 0 0 1 1 1 1] * [i0] + [0 -1 0 1 -1 0 1 0] [i1] + [0 -1 0 1 1 0 -1 0] [i2] + [0 0 0 0 -1 1 -1 1] [i3] + */ + + /* cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; */ + /* cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; */ + + VCPLXMUL(r1,i1,e[0],e[1]); + VCPLXMUL(r2,i2,e[2],e[3]); + VCPLXMUL(r3,i3,e[4],e[5]); + + /* cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; */ + /* cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << 
i2 << "\n 1: " << i3 << "\n"; */ + + sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2); + sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1); + si0 = VADD(i0,i2); di0 = VSUB(i0,i2); + si1 = VADD(i1,i3); di1 = VSUB(i3,i1); + + r0 = VADD(sr0, sr1); + r3 = VSUB(sr0, sr1); + i0 = VADD(si0, si1); + i3 = VSUB(si1, si0); + r1 = VADD(dr0, di1); + r2 = VSUB(dr0, di1); + i1 = VSUB(dr1, di0); + i2 = VADD(dr1, di0); + + *out++ = r0; + *out++ = i0; + *out++ = r1; + *out++ = i1; + *out++ = r2; + *out++ = i2; + *out++ = r3; + *out++ = i3; + +} + +static NEVER_INLINE(void) FUNC_REAL_FINALIZE(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { + int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */ + /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */ + + v4sf_union cr, ci, *uout = (v4sf_union*)out; + v4sf save = in[7], zero=VZERO(); + float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3; + static const float s = (float)M_SQRT2/2; + + cr.v = in[0]; ci.v = in[Ncvec*2-1]; + assert(in != out); + FUNC_REAL_FINALIZE_4X4(&zero, &zero, in+1, e, out); + + /* + [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3] + + [Xr(1)] ] [1 1 1 1 0 0 0 0] + [Xr(N/4) ] [0 0 0 0 1 s 0 -s] + [Xr(N/2) ] [1 0 -1 0 0 0 0 0] + [Xr(3N/4)] [0 0 0 0 1 -s 0 s] + [Xi(1) ] [1 -1 1 -1 0 0 0 0] + [Xi(N/4) ] [0 0 0 0 0 -s -1 -s] + [Xi(N/2) ] [0 -1 0 1 0 0 0 0] + [Xi(3N/4)] [0 0 0 0 0 -s 1 -s] + */ + + xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0; + xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0; + xr2=(cr.f[0]-cr.f[2]); uout[4].f[0] = xr2; + xi2=(cr.f[3]-cr.f[1]); uout[5].f[0] = xi2; + xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1; + xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1; + xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3; + xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3; + + for (k=1; k < dk; ++k) { + v4sf save_next = in[8*k+7]; + FUNC_REAL_FINALIZE_4X4(&save, &in[8*k+0], in + 8*k+1, + e + k*6, out + k*8); + save = save_next; + } + +} + +static ALWAYS_INLINE(void) FUNC_REAL_PREPROCESS_4X4(const v4sf *in, + const v4sf *e, v4sf *out, int first) { + v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7]; + /* + transformation for each column is: + + [1 1 1 1 0 0 0 0] [r0] + [1 0 0 -1 0 -1 -1 0] [r1] + [1 -1 -1 1 0 0 0 0] [r2] + [1 0 0 -1 0 1 1 0] [r3] + [0 0 0 0 1 -1 1 -1] * [i0] + [0 -1 1 0 1 0 0 1] [i1] + [0 0 0 0 1 1 -1 -1] [i2] + [0 1 -1 0 1 0 0 1] [i3] + */ + + v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3); + v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2); + v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3); + v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2); + + r0 = VADD(sr0, sr1); + r2 = VSUB(sr0, sr1); + r1 = VSUB(dr0, si1); + r3 = VADD(dr0, si1); + i0 = VSUB(di0, di1); + i2 = VADD(di0, di1); + i1 = VSUB(si0, dr1); + i3 = VADD(si0, dr1); + + VCPLXMULCONJ(r1,i1,e[0],e[1]); + VCPLXMULCONJ(r2,i2,e[2],e[3]); + VCPLXMULCONJ(r3,i3,e[4],e[5]); + + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + + if (!first) { + *out++ = r0; + *out++ = i0; + } + *out++ = r1; + *out++ = i1; + *out++ = r2; + *out++ = i2; + *out++ = r3; + *out++ = i3; +} + +static NEVER_INLINE(void) FUNC_REAL_PREPROCESS(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { + int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */ + /* fftpack order is f0r f1r f1i f2r f2i ... 
f(n-1)r f(n-1)i f(n)r */ + + v4sf_union Xr, Xi, *uout = (v4sf_union*)out; + float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3; + static const float s = (float)M_SQRT2; + assert(in != out); + for (k=0; k < 4; ++k) { + Xr.f[k] = ((float*)in)[8*k]; + Xi.f[k] = ((float*)in)[8*k+4]; + } + + FUNC_REAL_PREPROCESS_4X4(in, e, out+1, 1); /* will write only 6 values */ + + /* + [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3] + + [cr0] [1 0 2 0 1 0 0 0] + [cr1] [1 0 0 0 -1 0 -2 0] + [cr2] [1 0 -2 0 1 0 0 0] + [cr3] [1 0 0 0 -1 0 2 0] + [ci0] [0 2 0 2 0 0 0 0] + [ci1] [0 s 0 -s 0 -s 0 -s] + [ci2] [0 0 0 0 0 -2 0 2] + [ci3] [0 -s 0 s 0 -s 0 -s] + */ + for (k=1; k < dk; ++k) { + FUNC_REAL_PREPROCESS_4X4(in+8*k, e + k*6, out-1+k*8, 0); + } + + cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0; + cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1; + cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2; + cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3; + ci0= 2*(Xr.f[1]+Xr.f[3]); uout[2*Ncvec-1].f[0] = ci0; + ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1; + ci2= 2*(Xi.f[3]-Xi.f[1]); uout[2*Ncvec-1].f[2] = ci2; + ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3; +} + + +void FUNC_TRANSFORM_INTERNAL(SETUP_STRUCT *setup, const float *finput, float *foutput, v4sf *scratch, + pffft_direction_t direction, int ordered) { + int k, Ncvec = setup->Ncvec; + int nf_odd = (setup->ifac[1] & 1); + + /* temporary buffer is allocated on the stack if the scratch pointer is NULL */ + int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); + VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); + + const v4sf *vinput = (const v4sf*)finput; + v4sf *voutput = (v4sf*)foutput; + v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack }; + int ib = (nf_odd ^ ordered ? 1 : 0); + + assert(VALIGNED(finput) && VALIGNED(foutput)); + + /* assert(finput != foutput); */ + if (direction == PFFFT_FORWARD) { + ib = !ib; + if (setup->transform == PFFFT_REAL) { + ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + FUNC_REAL_FINALIZE(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e); + } else { + v4sf *tmp = buff[ib]; + for (k=0; k < Ncvec; ++k) { + UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]); + } + ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], + setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1); + FUNC_CPLX_FINALIZE(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e); + } + if (ordered) { + FUNC_ZREORDER(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD); + } else ib = !ib; + } else { + if (vinput == buff[ib]) { + ib = !ib; /* may happen when finput == foutput */ + } + if (ordered) { + FUNC_ZREORDER(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD); + vinput = buff[ib]; ib = !ib; + } + if (setup->transform == PFFFT_REAL) { + FUNC_REAL_PREPROCESS(Ncvec, vinput, buff[ib], (v4sf*)setup->e); + ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], + setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + } else { + FUNC_CPLX_PREPROCESS(Ncvec, vinput, buff[ib], (v4sf*)setup->e); + ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], + setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 
0 : 1); + for (k=0; k < Ncvec; ++k) { + INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]); + } + } + } + + if (buff[ib] != voutput) { + /* extra copy required -- this situation should only happen when finput == foutput */ + assert(finput==foutput); + for (k=0; k < Ncvec; ++k) { + v4sf a = buff[ib][2*k], b = buff[ib][2*k+1]; + voutput[2*k] = a; voutput[2*k+1] = b; + } + ib = !ib; + } + assert(buff[ib] == voutput); +} + +void FUNC_ZCONVOLVE_ACCUMULATE(SETUP_STRUCT *s, const float *a, const float *b, float *ab, float scaling) { + int Ncvec = s->Ncvec; + const v4sf * RESTRICT va = (const v4sf*)a; + const v4sf * RESTRICT vb = (const v4sf*)b; + v4sf * RESTRICT vab = (v4sf*)ab; + +#ifdef __arm__ + __builtin_prefetch(va); + __builtin_prefetch(vb); + __builtin_prefetch(vab); + __builtin_prefetch(va+2); + __builtin_prefetch(vb+2); + __builtin_prefetch(vab+2); + __builtin_prefetch(va+4); + __builtin_prefetch(vb+4); + __builtin_prefetch(vab+4); + __builtin_prefetch(va+6); + __builtin_prefetch(vb+6); + __builtin_prefetch(vab+6); +# ifndef __clang__ +# define ZCONVOLVE_USING_INLINE_NEON_ASM +# endif +#endif + + float ar, ai, br, bi, abr, abi; +#ifndef ZCONVOLVE_USING_INLINE_ASM + v4sf vscal = LD_PS1(scaling); + int i; +#endif + + assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); + ar = ((v4sf_union*)va)[0].f[0]; + ai = ((v4sf_union*)va)[1].f[0]; + br = ((v4sf_union*)vb)[0].f[0]; + bi = ((v4sf_union*)vb)[1].f[0]; + abr = ((v4sf_union*)vab)[0].f[0]; + abi = ((v4sf_union*)vab)[1].f[0]; + +#ifdef ZCONVOLVE_USING_INLINE_ASM + /* inline asm version, unfortunately miscompiled by clang 3.2, + * at least on ubuntu.. so this will be restricted to gcc */ + const float *a_ = a, *b_ = b; float *ab_ = ab; + int N = Ncvec; + asm volatile("mov r8, %2 \n" + "vdup.f32 q15, %4 \n" + "1: \n" + "pld [%0,#64] \n" + "pld [%1,#64] \n" + "pld [%2,#64] \n" + "pld [%0,#96] \n" + "pld [%1,#96] \n" + "pld [%2,#96] \n" + "vld1.f32 {q0,q1}, [%0,:128]! \n" + "vld1.f32 {q4,q5}, [%1,:128]! \n" + "vld1.f32 {q2,q3}, [%0,:128]! \n" + "vld1.f32 {q6,q7}, [%1,:128]! \n" + "vld1.f32 {q8,q9}, [r8,:128]! \n" + + "vmul.f32 q10, q0, q4 \n" + "vmul.f32 q11, q0, q5 \n" + "vmul.f32 q12, q2, q6 \n" + "vmul.f32 q13, q2, q7 \n" + "vmls.f32 q10, q1, q5 \n" + "vmla.f32 q11, q1, q4 \n" + "vld1.f32 {q0,q1}, [r8,:128]! \n" + "vmls.f32 q12, q3, q7 \n" + "vmla.f32 q13, q3, q6 \n" + "vmla.f32 q8, q10, q15 \n" + "vmla.f32 q9, q11, q15 \n" + "vmla.f32 q0, q12, q15 \n" + "vmla.f32 q1, q13, q15 \n" + "vst1.f32 {q8,q9},[%2,:128]! \n" + "vst1.f32 {q0,q1},[%2,:128]! 
\n" + "subs %3, #2 \n" + "bne 1b \n" + : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory"); +#else + /* default routine, works fine for non-arm cpus with current compilers */ + for (i=0; i < Ncvec; i += 2) { + v4sf ar, ai, br, bi; + ar = va[2*i+0]; ai = va[2*i+1]; + br = vb[2*i+0]; bi = vb[2*i+1]; + VCPLXMUL(ar, ai, br, bi); + vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]); + vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]); + ar = va[2*i+2]; ai = va[2*i+3]; + br = vb[2*i+2]; bi = vb[2*i+3]; + VCPLXMUL(ar, ai, br, bi); + vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]); + vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]); + } +#endif + if (s->transform == PFFFT_REAL) { + ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling; + ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling; + } +} + +void FUNC_ZCONVOLVE_NO_ACCU(SETUP_STRUCT *s, const float *a, const float *b, float *ab, float scaling) { + v4sf vscal = LD_PS1(scaling); + const v4sf * RESTRICT va = (const v4sf*)a; + const v4sf * RESTRICT vb = (const v4sf*)b; + v4sf * RESTRICT vab = (v4sf*)ab; + float sar, sai, sbr, sbi; + const int NcvecMulTwo = 2*s->Ncvec; /* int Ncvec = s->Ncvec; */ + int k; /* was i -- but always used "2*i" - except at for() */ + +#ifdef __arm__ + __builtin_prefetch(va); + __builtin_prefetch(vb); + __builtin_prefetch(vab); + __builtin_prefetch(va+2); + __builtin_prefetch(vb+2); + __builtin_prefetch(vab+2); + __builtin_prefetch(va+4); + __builtin_prefetch(vb+4); + __builtin_prefetch(vab+4); + __builtin_prefetch(va+6); + __builtin_prefetch(vb+6); + __builtin_prefetch(vab+6); +# ifndef __clang__ +# define ZCONVOLVE_USING_INLINE_NEON_ASM +# endif +#endif + + assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); + sar = ((v4sf_union*)va)[0].f[0]; + sai = ((v4sf_union*)va)[1].f[0]; + sbr = ((v4sf_union*)vb)[0].f[0]; + sbi = ((v4sf_union*)vb)[1].f[0]; + + /* default routine, works fine for non-arm cpus with current compilers */ + for (k=0; k < NcvecMulTwo; k += 4) { + v4sf var, vai, vbr, vbi; + var = va[k+0]; vai = va[k+1]; + vbr = vb[k+0]; vbi = vb[k+1]; + VCPLXMUL(var, vai, vbr, vbi); + vab[k+0] = VMUL(var, vscal); + vab[k+1] = VMUL(vai, vscal); + var = va[k+2]; vai = va[k+3]; + vbr = vb[k+2]; vbi = vb[k+3]; + VCPLXMUL(var, vai, vbr, vbi); + vab[k+2] = VMUL(var, vscal); + vab[k+3] = VMUL(vai, vscal); + } + + if (s->transform == PFFFT_REAL) { + ((v4sf_union*)vab)[0].f[0] = sar*sbr*scaling; + ((v4sf_union*)vab)[1].f[0] = sai*sbi*scaling; + } +} + + +#else /* #if ( SIMD_SZ == 4 ) * !defined(PFFFT_SIMD_DISABLE) */ + +/* standard routine using scalar floats, without SIMD stuff. 
*/ + +#define pffft_zreorder_nosimd FUNC_ZREORDER +void pffft_zreorder_nosimd(SETUP_STRUCT *setup, const float *in, float *out, pffft_direction_t direction) { + int k, N = setup->N; + if (setup->transform == PFFFT_COMPLEX) { + for (k=0; k < 2*N; ++k) out[k] = in[k]; + return; + } + else if (direction == PFFFT_FORWARD) { + float x_N = in[N-1]; + for (k=N-1; k > 1; --k) out[k] = in[k-1]; + out[0] = in[0]; + out[1] = x_N; + } else { + float x_N = in[1]; + for (k=1; k < N-1; ++k) out[k] = in[k+1]; + out[0] = in[0]; + out[N-1] = x_N; + } +} + +#define pffft_transform_internal_nosimd FUNC_TRANSFORM_INTERNAL +void pffft_transform_internal_nosimd(SETUP_STRUCT *setup, const float *input, float *output, float *scratch, + pffft_direction_t direction, int ordered) { + int Ncvec = setup->Ncvec; + int nf_odd = (setup->ifac[1] & 1); + + /* temporary buffer is allocated on the stack if the scratch pointer is NULL */ + int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); + VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); + float *buff[2]; + int ib; + if (scratch == 0) scratch = scratch_on_stack; + buff[0] = output; buff[1] = scratch; + + if (setup->transform == PFFFT_COMPLEX) ordered = 0; /* it is always ordered. */ + ib = (nf_odd ^ ordered ? 1 : 0); + + if (direction == PFFFT_FORWARD) { + if (setup->transform == PFFFT_REAL) { + ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + } else { + ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1); + } + if (ordered) { + FUNC_ZREORDER(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib; + } + } else { + if (input == buff[ib]) { + ib = !ib; /* may happen when finput == foutput */ + } + if (ordered) { + FUNC_ZREORDER(setup, input, buff[!ib], PFFFT_BACKWARD); + input = buff[!ib]; + } + if (setup->transform == PFFFT_REAL) { + ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + } else { + ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 
0 : 1); + } + } + if (buff[ib] != output) { + int k; + /* extra copy required -- this situation should happens only when finput == foutput */ + assert(input==output); + for (k=0; k < Ncvec; ++k) { + float a = buff[ib][2*k], b = buff[ib][2*k+1]; + output[2*k] = a; output[2*k+1] = b; + } + ib = !ib; + } + assert(buff[ib] == output); +} + +#define pffft_zconvolve_accumulate_nosimd FUNC_ZCONVOLVE_ACCUMULATE +void pffft_zconvolve_accumulate_nosimd(SETUP_STRUCT *s, const float *a, const float *b, + float *ab, float scaling) { + int NcvecMulTwo = 2*s->Ncvec; /* int Ncvec = s->Ncvec; */ + int k; /* was i -- but always used "2*i" - except at for() */ + + if (s->transform == PFFFT_REAL) { + /* take care of the fftpack ordering */ + ab[0] += a[0]*b[0]*scaling; + ab[NcvecMulTwo-1] += a[NcvecMulTwo-1]*b[NcvecMulTwo-1]*scaling; + ++ab; ++a; ++b; NcvecMulTwo -= 2; + } + for (k=0; k < NcvecMulTwo; k += 2) { + float ar, ai, br, bi; + ar = a[k+0]; ai = a[k+1]; + br = b[k+0]; bi = b[k+1]; + VCPLXMUL(ar, ai, br, bi); + ab[k+0] += ar*scaling; + ab[k+1] += ai*scaling; + } +} + +#define pffft_zconvolve_no_accu_nosimd FUNC_ZCONVOLVE_NO_ACCU +void pffft_zconvolve_no_accu_nosimd(SETUP_STRUCT *s, const float *a, const float *b, + float *ab, float scaling) { + int NcvecMulTwo = 2*s->Ncvec; /* int Ncvec = s->Ncvec; */ + int k; /* was i -- but always used "2*i" - except at for() */ + + if (s->transform == PFFFT_REAL) { + /* take care of the fftpack ordering */ + ab[0] += a[0]*b[0]*scaling; + ab[NcvecMulTwo-1] += a[NcvecMulTwo-1]*b[NcvecMulTwo-1]*scaling; + ++ab; ++a; ++b; NcvecMulTwo -= 2; + } + for (k=0; k < NcvecMulTwo; k += 2) { + float ar, ai, br, bi; + ar = a[k+0]; ai = a[k+1]; + br = b[k+0]; bi = b[k+1]; + VCPLXMUL(ar, ai, br, bi); + ab[k+0] = ar*scaling; + ab[k+1] = ai*scaling; + } +} + + +#endif /* #if ( SIMD_SZ == 4 ) * !defined(PFFFT_SIMD_DISABLE) */ + + +void FUNC_TRANSFORM_UNORDRD(SETUP_STRUCT *setup, const float *input, float *output, float *work, pffft_direction_t direction) { + FUNC_TRANSFORM_INTERNAL(setup, input, output, (v4sf*)work, direction, 0); +} + +void FUNC_TRANSFORM_ORDERED(SETUP_STRUCT *setup, const float *input, float *output, float *work, pffft_direction_t direction) { + FUNC_TRANSFORM_INTERNAL(setup, input, output, (v4sf*)work, direction, 1); +} + + +#if ( SIMD_SZ == 4 ) + +#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3)) + +/* detect bugs with the vector support macros */ +void FUNC_VALIDATE_SIMD_A() { + float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 }; + v4sf_union a0, a1, a2, a3, t, u; + memcpy(a0.f, f, 4*sizeof(float)); + memcpy(a1.f, f+4, 4*sizeof(float)); + memcpy(a2.f, f+8, 4*sizeof(float)); + memcpy(a3.f, f+12, 4*sizeof(float)); + + t = a0; u = a1; t.v = VZERO(); + printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0); + t.v = VADD(a1.v, a2.v); + printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18); + t.v = VMUL(a1.v, a2.v); + printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77); + t.v = VMADD(a1.v, a2.v,a0.v); + printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80); + + INTERLEAVE2(a1.v,a2.v,t.v,u.v); + printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); + assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11); + 
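+  /* UNINTERLEAVE2 is the inverse: applied to the original (a1,a2) it
+     separates even- and odd-indexed scalars, so t and u below come out
+     as [4 6 8 10] and [5 7 9 11]. */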
UNINTERLEAVE2(a1.v,a2.v,t.v,u.v); + printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); + assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11); + + t.v=LD_PS1(f[15]); + printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); + assertv4(t, 15, 15, 15, 15); + t.v = VSWAPHL(a1.v, a2.v); + printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); + assertv4(t, 8, 9, 6, 7); + VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v); + printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", + a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3], + a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]); + assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15); +} + + +static void pffft_assert1( float result, float ref, const char * vartxt, const char * functxt, int * numErrs, const char * f, int lineNo ) +{ + if ( !( fabs( result - ref ) < 0.01F ) ) + { + fprintf(stderr, "%s: assert for %s at %s(%d)\n expected %f value %f\n", functxt, vartxt, f, lineNo, ref, result); + ++(*numErrs); + } +} + +static void pffft_assert4( vsfscalar v0, vsfscalar v1, vsfscalar v2, vsfscalar v3, + float a, float b, float c, float d, const char * functxt, int * numErrs, const char * f, int lineNo ) +{ + pffft_assert1( v0, a, "[0]", functxt, numErrs, f, lineNo ); + pffft_assert1( v1, b, "[1]", functxt, numErrs, f, lineNo ); + pffft_assert1( v2, c, "[2]", functxt, numErrs, f, lineNo ); + pffft_assert1( v3, d, "[3]", functxt, numErrs, f, lineNo ); +} + +#define PFFFT_ASSERT4( V, a, b, c, d, FUNCTXT ) pffft_assert4( (V).f[0], (V).f[1], (V).f[2], (V).f[3], a, b, c, d, FUNCTXT, &numErrs, __FILE__, __LINE__ ) + + +int FUNC_VALIDATE_SIMD_EX(FILE * DbgOut) +{ + int numErrs = 0; + + { + v4sf_union C; + int k; + for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; + + if (DbgOut) { + fprintf(DbgOut, "\ninput: { }\n" ); + } + C.v = VZERO(); + if (DbgOut) { + fprintf(DbgOut, "VZERO(a) => C) => {\n" ); + fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); + fprintf(DbgOut, "}\n" ); + } + PFFFT_ASSERT4( C, 0.0F, 0.0F, 0.0F, 0.0F, "VZERO() Out C" ); + } + + { + v4sf_union C; + float a = 42.0F; + int k; + for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; + + if (DbgOut) { + fprintf(DbgOut, "\ninput: a = {\n" ); + fprintf(DbgOut, " Inp a: %f\n", a ); + fprintf(DbgOut, "}\n" ); + } + C.v = LD_PS1(a); + if (DbgOut) { + fprintf(DbgOut, "LD_PS1(a) => C) => {\n" ); + fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); + fprintf(DbgOut, "}\n" ); + } + PFFFT_ASSERT4( C, 42.0F, 42.0F, 42.0F, 42.0F, "LD_PS1() Out C" ); + } + + { + v4sf_union C; + float a[16]; + int numAligned = 0, numUnaligned = 0; + int k; + const char * pUn; + for ( k = 0; k < 16; ++k ) a[k] = k+1; + + for ( k = 0; k + 3 < 16; ++k ) + { + const float * ptr = &a[k]; + if (DbgOut) + fprintf(DbgOut, "\ninput: a = [ %f, %f, %f, %f ]\n", ptr[0], ptr[1], ptr[2], ptr[3] ); + if ( VALIGNED(ptr) ) + { + C.v = VLOAD_ALIGNED( ptr ); + pUn = ""; + ++numAligned; + } + else + { + C.v = VLOAD_UNALIGNED( ptr ); + pUn = "UN"; + ++numUnaligned; + } + if (DbgOut) { + fprintf(DbgOut, "C = VLOAD_%sALIGNED(&a[%d]) => {\n", pUn, k ); + fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); + fprintf(DbgOut, "}\n" ); + } + //PFFFT_ASSERT4( C, 32.0F, 34.0F, 36.0F, 38.0F, "VADD(): Out C" ); + + 
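+      /* stop as soon as both load paths have been exercised:
+         at least one aligned and at least four unaligned addresses */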
if ( numAligned >= 1 && numUnaligned >= 4 ) + break; + } + if ( numAligned < 1 ) { + fprintf(stderr, "VALIGNED() should have found at least 1 occurence!"); + ++numErrs; + } + if ( numUnaligned < 4 ) { + fprintf(stderr, "!VALIGNED() should have found at least 4 occurences!"); + ++numErrs; + } + } + + { + v4sf_union A, B, C; + int k; + for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1; + for ( k = 0; k < 4; ++k ) B.f[k] = 20 + k+1; + for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; + + if (DbgOut) { + fprintf(DbgOut, "\ninput: A,B = {\n" ); + fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); + fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] ); + fprintf(DbgOut, "}\n" ); + } + C.v = VADD(A.v, B.v); + if (DbgOut) { + fprintf(DbgOut, "C = VADD(A,B) => {\n" ); + fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); + fprintf(DbgOut, "}\n" ); + } + PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VADD(): Inp A" ); + PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "VADD(): Inp B" ); + PFFFT_ASSERT4( C, 32.0F, 34.0F, 36.0F, 38.0F, "VADD(): Out C" ); + } + + { + v4sf_union A, B, C; + int k; + for ( k = 0; k < 4; ++k ) A.f[k] = 20 + 2*k+1; + for ( k = 0; k < 4; ++k ) B.f[k] = 10 + k+1; + for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; + + if (DbgOut) { + fprintf(DbgOut, "\ninput: A,B = {\n" ); + fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); + fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] ); + fprintf(DbgOut, "}\n" ); + } + C.v = VSUB(A.v, B.v); + if (DbgOut) { + fprintf(DbgOut, "C = VSUB(A,B) => {\n" ); + fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); + fprintf(DbgOut, "}\n" ); + } + PFFFT_ASSERT4( A, 21.0F, 23.0F, 25.0F, 27.0F, "VSUB(): Inp A" ); + PFFFT_ASSERT4( B, 11.0F, 12.0F, 13.0F, 14.0F, "VSUB(): Inp B" ); + PFFFT_ASSERT4( C, 10.0F, 11.0F, 12.0F, 13.0F, "VSUB(): Out C" ); + } + + { + v4sf_union A, B, C; + int k; + for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1; + for ( k = 0; k < 4; ++k ) B.f[k] = k+1; + for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; + + if (DbgOut) { + fprintf(DbgOut, "\ninput: A,B = {\n" ); + fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); + fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] ); + fprintf(DbgOut, "}\n" ); + } + C.v = VMUL(A.v, B.v); + if (DbgOut) { + fprintf(DbgOut, "C = VMUL(A,B) => {\n" ); + fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); + fprintf(DbgOut, "}\n" ); + } + PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VMUL(): Inp A" ); + PFFFT_ASSERT4( B, 1.0F, 2.0F, 3.0F, 4.0F, "VMUL(): Inp B" ); + PFFFT_ASSERT4( C, 11.0F, 24.0F, 39.0F, 56.0F, "VMUL(): Out C" ); + } + + { + v4sf_union A, B, C, D; + int k; + for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1; + for ( k = 0; k < 4; ++k ) B.f[k] = k+1; + for ( k = 0; k < 4; ++k ) C.f[k] = 10 + k; + for ( k = 0; k < 4; ++k ) D.f[k] = 40 + k+1; + + if (DbgOut) { + fprintf(DbgOut, "\ninput: A,B,C = {\n" ); + fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); + fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] ); + fprintf(DbgOut, " Inp C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); + fprintf(DbgOut, "}\n" ); + } + D.v = VMADD(A.v, B.v, C.v); + if (DbgOut) { + fprintf(DbgOut, "D = VMADD(A,B,C) => {\n" ); + fprintf(DbgOut, " Out D: %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] ); + fprintf(DbgOut, "}\n" ); + } + PFFFT_ASSERT4( A, 
11.0F, 12.0F, 13.0F, 14.0F, "VMADD(): Inp A" ); + PFFFT_ASSERT4( B, 1.0F, 2.0F, 3.0F, 4.0F, "VMADD(): Inp B" ); + PFFFT_ASSERT4( C, 10.0F, 11.0F, 12.0F, 13.0F, "VMADD(): Inp C" ); + PFFFT_ASSERT4( D, 21.0F, 35.0F, 51.0F, 69.0F, "VMADD(): Out D" ); + } + + { + v4sf_union A, B, C, D; + int k; + for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1; + for ( k = 0; k < 4; ++k ) B.f[k] = 20 + k+1; + for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; + for ( k = 0; k < 4; ++k ) D.f[k] = 40 + k+1; + + if (DbgOut) { + fprintf(DbgOut, "\ninput: A,B = {\n" ); + fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); + fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] ); + fprintf(DbgOut, "}\n" ); + } + INTERLEAVE2(A.v, B.v, C.v, D.v); + if (DbgOut) { + fprintf(DbgOut, "INTERLEAVE2(A,B, => C,D) => {\n" ); + fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); + fprintf(DbgOut, " Out D: %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] ); + fprintf(DbgOut, "}\n" ); + } + PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "INTERLEAVE2() Inp A" ); + PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "INTERLEAVE2() Inp B" ); + PFFFT_ASSERT4( C, 11.0F, 21.0F, 12.0F, 22.0F, "INTERLEAVE2() Out C" ); + PFFFT_ASSERT4( D, 13.0F, 23.0F, 14.0F, 24.0F, "INTERLEAVE2() Out D" ); + } + + { + v4sf_union A, B, C, D; + int k; + for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1; + for ( k = 0; k < 4; ++k ) B.f[k] = 20 + k+1; + for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; + for ( k = 0; k < 4; ++k ) D.f[k] = 40 + k+1; + + if (DbgOut) { + fprintf(DbgOut, "\ninput: A,B = {\n" ); + fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); + fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] ); + fprintf(DbgOut, "}\n" ); + } + UNINTERLEAVE2(A.v, B.v, C.v, D.v); + if (DbgOut) { + fprintf(DbgOut, "UNINTERLEAVE2(A,B, => C,D) => {\n" ); + fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); + fprintf(DbgOut, " Out D: %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] ); + fprintf(DbgOut, "}\n" ); + } + PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "UNINTERLEAVE2() Inp A" ); + PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "UNINTERLEAVE2() Inp B" ); + PFFFT_ASSERT4( C, 11.0F, 13.0F, 21.0F, 23.0F, "UNINTERLEAVE2() Out C" ); + PFFFT_ASSERT4( D, 12.0F, 14.0F, 22.0F, 24.0F, "UNINTERLEAVE2() Out D" ); + } + + { + v4sf_union A, B, C, D; + int k; + for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1; + for ( k = 0; k < 4; ++k ) B.f[k] = 20 + k+1; + for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; + for ( k = 0; k < 4; ++k ) D.f[k] = 40 + k+1; + + if (DbgOut) { + fprintf(DbgOut, "\ninput: A,B,C,D = {\n" ); + fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); + fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] ); + fprintf(DbgOut, " Inp C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); + fprintf(DbgOut, " Inp D: %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] ); + fprintf(DbgOut, "}\n" ); + } + VTRANSPOSE4(A.v, B.v, C.v, D.v); + if (DbgOut) { + fprintf(DbgOut, "VTRANSPOSE4(A,B,C,D) => {\n" ); + fprintf(DbgOut, " Out A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); + fprintf(DbgOut, " Out B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] ); + fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); + fprintf(DbgOut, " Out D: %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] ); + fprintf(DbgOut, "}\n" ); + } + PFFFT_ASSERT4( A, 11.0F, 21.0F, 31.0F, 41.0F, 
"VTRANSPOSE4(): Out A" ); + PFFFT_ASSERT4( B, 12.0F, 22.0F, 32.0F, 42.0F, "VTRANSPOSE4(): Out B" ); + PFFFT_ASSERT4( C, 13.0F, 23.0F, 33.0F, 43.0F, "VTRANSPOSE4(): Out C" ); + PFFFT_ASSERT4( D, 14.0F, 24.0F, 34.0F, 44.0F, "VTRANSPOSE4(): Out D" ); + } + + { + v4sf_union A, B, C; + int k; + for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1; + for ( k = 0; k < 4; ++k ) B.f[k] = 20 + k+1; + for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; + + if (DbgOut) { + fprintf(DbgOut, "\ninput: A,B = {\n" ); + fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); + fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] ); + fprintf(DbgOut, "}\n" ); + } + C.v = VSWAPHL(A.v, B.v); + if (DbgOut) { + fprintf(DbgOut, "C = VSWAPHL(A,B) => {\n" ); + fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); + fprintf(DbgOut, "}\n" ); + } + PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VSWAPHL(): Inp A" ); + PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "VSWAPHL(): Inp B" ); + PFFFT_ASSERT4( C, 21.0F, 22.0F, 13.0F, 14.0F, "VSWAPHL(): Out C" ); + } + + { + v4sf_union A, C; + int k; + for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1; + for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; + + if (DbgOut) { + fprintf(DbgOut, "\ninput: A = {\n" ); + fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); + fprintf(DbgOut, "}\n" ); + } + C.v = VREV_S(A.v); + if (DbgOut) { + fprintf(DbgOut, "C = VREV_S(A) => {\n" ); + fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); + fprintf(DbgOut, "}\n" ); + } + PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VREV_S(): Inp A" ); + PFFFT_ASSERT4( C, 14.0F, 13.0F, 12.0F, 11.0F, "VREV_S(): Out C" ); + } + + { + v4sf_union A, C; + int k; + for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1; + + if (DbgOut) { + fprintf(DbgOut, "\ninput: A = {\n" ); + fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); + fprintf(DbgOut, "}\n" ); + } + C.v = VREV_C(A.v); + if (DbgOut) { + fprintf(DbgOut, "C = VREV_C(A) => {\n" ); + fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); + fprintf(DbgOut, "}\n" ); + } + PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VREV_C(): Inp A" ); + PFFFT_ASSERT4( C, 13.0F, 14.0F, 11.0F, 12.0F, "VREV_C(): Out A" ); + } + + return numErrs; +} + +#else /* if ( SIMD_SZ == 4 ) */ + +void FUNC_VALIDATE_SIMD_A() +{ +} + +int FUNC_VALIDATE_SIMD_EX(FILE * DbgOut) +{ + return -1; +} + +#endif /* end if ( SIMD_SZ == 4 ) */ + diff --git a/pffft/plots.sh b/pffft/plots.sh new file mode 100755 index 0000000..c09affe --- /dev/null +++ b/pffft/plots.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +OUTPNG="1" +W="1024" +H="768" +PTS="20" +LWS="20" + +for f in $(ls -1 *-4-*.csv *-6-*.csv); do + b=$(basename "$f" ".csv") + #echo $b + LASTCOL="$(head -n 1 $f |sed 's/,/,\n/g' |grep -c ',')" + echo "${b}: last column is $LASTCOL" + if [ $(echo "$b" |grep -c -- "-1-") -gt 0 ]; then + YL="duration in ms; less is better" + elif [ $(echo "$b" |grep -c -- "-4-") -gt 0 ]; then + YL="duration relative to pffft; less is better" + else + YL="" + fi + + E="" + if [ "${OUTPNG}" = "1" ]; then + E="set terminal png size $W,$H" + E="${E} ; set output '${b}.png'" + fi + if [ -z "${E}" ]; then + E="set key outside" + else + E="${E} ; set key outside" + fi + E="${E} ; set datafile separator ','" + E="${E} ; set title '${b}'" + E="${E} ; set xlabel 'fft order: fft size N = 2\\^order'" + if [ ! 
-z "${YL}" ]; then + #echo " setting Y label to ${YL}" + E="${E} ; set ylabel '${YL}'" + fi + # unfortunately no effect for + #for LNO in $(seq 1 ${LASTCOL}) ; do + # E="${E} ; set style line ${LNO} ps ${PTS} lw ${LWS}" + #done + E="${E} ; plot for [col=3:${LASTCOL}] '${f}' using 2:col with lines title columnhead" + + if [ "${OUTPNG}" = "1" ]; then + gnuplot -e "${E}" + else + gnuplot -e "${E}" --persist + fi +done diff --git a/pffft/simd/pf_altivec_float.h b/pffft/simd/pf_altivec_float.h new file mode 100644 index 0000000..ef2526d --- /dev/null +++ b/pffft/simd/pf_altivec_float.h @@ -0,0 +1,81 @@ + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. 
+*/ + +#ifndef PF_ALTIVEC_FLT_H +#define PF_ALTIVEC_FLT_H + +/* + Altivec support macros +*/ +#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__)) +#pragma message( __FILE__ ": ALTIVEC float macros are defined" ) +typedef vector float v4sf; + +# define SIMD_SZ 4 + +typedef union v4sf_union { + v4sf v; + float f[SIMD_SZ]; +} v4sf_union; + +# define VREQUIRES_ALIGN 1 /* not sure, if really required */ +# define VARCH "ALTIVEC" +# define VZERO() ((vector float) vec_splat_u8(0)) +# define VMUL(a,b) vec_madd(a,b, VZERO()) +# define VADD(a,b) vec_add(a,b) +# define VMADD(a,b,c) vec_madd(a,b,c) +# define VSUB(a,b) vec_sub(a,b) +inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); } +# define LD_PS1(p) ld_ps1(&p) +# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; } +# define UNINTERLEAVE2(in1, in2, out1, out2) { \ + vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \ + vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \ + v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \ + } +# define VTRANSPOSE4(x0,x1,x2,x3) { \ + v4sf y0 = vec_mergeh(x0, x2); \ + v4sf y1 = vec_mergel(x0, x2); \ + v4sf y2 = vec_mergeh(x1, x3); \ + v4sf y3 = vec_mergel(x1, x3); \ + x0 = vec_mergeh(y0, y2); \ + x1 = vec_mergel(y0, y2); \ + x2 = vec_mergeh(y1, y3); \ + x3 = vec_mergel(y1, y3); \ + } +# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15)) +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0) + +#endif + +#endif /* PF_SSE1_FLT_H */ + diff --git a/pffft/simd/pf_avx_double.h b/pffft/simd/pf_avx_double.h new file mode 100644 index 0000000..fe0efa8 --- /dev/null +++ b/pffft/simd/pf_avx_double.h @@ -0,0 +1,145 @@ +/* + Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com ) +*/ + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. 
+*/ + +#ifndef PF_AVX_DBL_H +#define PF_AVX_DBL_H + +/* + vector support macros: the rest of the code is independant of + AVX -- adding support for other platforms with 4-element + vectors should be limited to these macros +*/ + + +/* + AVX support macros +*/ +#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && defined(__AVX__) +#pragma message( __FILE__ ": AVX macros are defined" ) + +#include +typedef __m256d v4sf; + +/* 4 doubles by simd vector */ +# define SIMD_SZ 4 + +typedef union v4sf_union { + v4sf v; + double f[SIMD_SZ]; +} v4sf_union; + +# define VARCH "AVX" +# define VREQUIRES_ALIGN 1 +# define VZERO() _mm256_setzero_pd() +# define VMUL(a,b) _mm256_mul_pd(a,b) +# define VADD(a,b) _mm256_add_pd(a,b) +# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c) +# define VSUB(a,b) _mm256_sub_pd(a,b) +# define LD_PS1(p) _mm256_set1_pd(p) +# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr) +# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr) + +/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code: +out1 = [ in1[0], in2[0], in1[1], in2[1] ] +out2 = [ in1[2], in2[2], in1[3], in2[3] ] +*/ +# define INTERLEAVE2(in1, in2, out1, out2) { \ + __m128d low1__ = _mm256_castpd256_pd128(in1); \ + __m128d low2__ = _mm256_castpd256_pd128(in2); \ + __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ + __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ + __m256d tmp__ = _mm256_insertf128_pd( \ + _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \ + _mm_shuffle_pd(low1__, low2__, 3), \ + 1); \ + out2 = _mm256_insertf128_pd( \ + _mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \ + _mm_shuffle_pd(high1__, high2__, 3), \ + 1); \ + out1 = tmp__; \ +} + +/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code: +out1 = [ in1[0], in1[2], in2[0], in2[2] ] +out2 = [ in1[1], in1[3], in2[1], in2[3] ] +*/ +# define UNINTERLEAVE2(in1, in2, out1, out2) { \ + __m128d low1__ = _mm256_castpd256_pd128(in1); \ + __m128d low2__ = _mm256_castpd256_pd128(in2); \ + __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ + __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ + __m256d tmp__ = _mm256_insertf128_pd( \ + _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \ + _mm_shuffle_pd(low2__, high2__, 0), \ + 1); \ + out2 = _mm256_insertf128_pd( \ + _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \ + _mm_shuffle_pd(low2__, high2__, 3), \ + 1); \ + out1 = tmp__; \ +} + +# define VTRANSPOSE4(row0, row1, row2, row3) { \ + __m256d tmp3, tmp2, tmp1, tmp0; \ + \ + tmp0 = _mm256_shuffle_pd((row0),(row1), 0x0); \ + tmp2 = _mm256_shuffle_pd((row0),(row1), 0xF); \ + tmp1 = _mm256_shuffle_pd((row2),(row3), 0x0); \ + tmp3 = _mm256_shuffle_pd((row2),(row3), 0xF); \ + \ + (row0) = _mm256_permute2f128_pd(tmp0, tmp1, 0x20); \ + (row1) = _mm256_permute2f128_pd(tmp2, tmp3, 0x20); \ + (row2) = _mm256_permute2f128_pd(tmp0, tmp1, 0x31); \ + (row3) = _mm256_permute2f128_pd(tmp2, tmp3, 0x31); \ + } + +/*VSWAPHL(a, b) pseudo code: +return [ b[0], b[1], a[2], a[3] ] +*/ +# define VSWAPHL(a,b) \ + _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1) + +/* reverse/flip all floats */ +# define VREV_S(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_permute_pd(_mm256_extractf128_pd(a, 1),1)), _mm_permute_pd(_mm256_castpd256_pd128(a), 1), 1) + +/* reverse/flip complex floats */ +# define VREV_C(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1) + +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0) + +#endif + 
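+
+/* Editor's sketch (not part of the upstream header, compiled out via #if 0):
+   a minimal illustration of how the 4-double macros above compose into one
+   vectorized complex multiply on pffft's split re/im layout.  The function
+   name pf_avx_cplx_mul_sketch is invented for this example. */
+#if 0
+static inline void pf_avx_cplx_mul_sketch(v4sf ar, v4sf ai, v4sf br, v4sf bi,
+                                          v4sf *out_re, v4sf *out_im)
+{
+  /* (ar + i*ai)*(br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br),
+     done for 4 complex doubles at once; this mirrors what the VCPLXMUL
+     helper built on top of these macros does. */
+  *out_re = VSUB(VMUL(ar, br), VMUL(ai, bi));
+  *out_im = VADD(VMUL(ar, bi), VMUL(ai, br));
+}
+#endif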
+#endif /* PF_AVX_DBL_H */ + diff --git a/pffft/simd/pf_double.h b/pffft/simd/pf_double.h new file mode 100644 index 0000000..1025827 --- /dev/null +++ b/pffft/simd/pf_double.h @@ -0,0 +1,84 @@ + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. +*/ + +#ifndef PF_DBL_H +#define PF_DBL_H + +#include +#include +#include + + +/* + * SIMD reference material: + * + * general SIMD introduction: + * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing + * + * SSE 1: + * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ + * + * ARM NEON: + * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics + * + * Altivec: + * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf + * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html + * better one? + * + */ + +typedef double vsfscalar; + +#include "pf_avx_double.h" +#include "pf_sse2_double.h" +#include "pf_neon_double.h" + +#ifndef SIMD_SZ +# if !defined(PFFFT_SIMD_DISABLE) +# pragma message( "building double with simd disabled !" 
) +# define PFFFT_SIMD_DISABLE /* fallback to scalar code */ +# endif +#endif + +#include "pf_scalar_double.h" + +/* shortcuts for complex multiplcations */ +#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); } +#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); } +#ifndef SVMUL +/* multiply a scalar with a vector */ +#define SVMUL(f,v) VMUL(LD_PS1(f),v) +#endif + +#endif /* PF_DBL_H */ + diff --git a/pffft/simd/pf_float.h b/pffft/simd/pf_float.h new file mode 100644 index 0000000..eab2723 --- /dev/null +++ b/pffft/simd/pf_float.h @@ -0,0 +1,84 @@ + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. +*/ + +#ifndef PF_FLT_H +#define PF_FLT_H + +#include +#include +#include + + +/* + * SIMD reference material: + * + * general SIMD introduction: + * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing + * + * SSE 1: + * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ + * + * ARM NEON: + * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics + * + * Altivec: + * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf + * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html + * better one? + * + */ + +typedef float vsfscalar; + +#include "pf_sse1_float.h" +#include "pf_neon_float.h" +#include "pf_altivec_float.h" + +#ifndef SIMD_SZ +# if !defined(PFFFT_SIMD_DISABLE) +# pragma message( "building float with simd disabled !" 
) +# define PFFFT_SIMD_DISABLE /* fallback to scalar code */ +# endif +#endif + +#include "pf_scalar_float.h" + +/* shortcuts for complex multiplcations */ +#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); } +#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); } +#ifndef SVMUL +/* multiply a scalar with a vector */ +#define SVMUL(f,v) VMUL(LD_PS1(f),v) +#endif + +#endif /* PF_FLT_H */ + diff --git a/pffft/simd/pf_neon_double.h b/pffft/simd/pf_neon_double.h new file mode 100644 index 0000000..e432abc --- /dev/null +++ b/pffft/simd/pf_neon_double.h @@ -0,0 +1,203 @@ +/* + Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com ) +*/ + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. 
+*/ + +#ifndef PF_NEON_DBL_H +#define PF_NEON_DBL_H + +/* + NEON 64bit support macros +*/ +#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__)) + +#pragma message (__FILE__ ": NEON (from AVX) macros are defined" ) + +#include "pf_neon_double_from_avx.h" +typedef __m256d v4sf; + +/* 4 doubles by simd vector */ +# define SIMD_SZ 4 + +typedef union v4sf_union { + v4sf v; + double f[SIMD_SZ]; +} v4sf_union; + +# define VARCH "NEON" +# define VREQUIRES_ALIGN 1 +# define VZERO() _mm256_setzero_pd() +# define VMUL(a,b) _mm256_mul_pd(a,b) +# define VADD(a,b) _mm256_add_pd(a,b) +# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c) +# define VSUB(a,b) _mm256_sub_pd(a,b) +# define LD_PS1(p) _mm256_set1_pd(p) +# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr) +# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr) + +FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b) +{ + __m256d res; + res.vect_f64[0] = a.vect_f64[0]; + res.vect_f64[1] = b; + return res; +} + +FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b) +{ + float64x1_t al = vget_low_f64(a); + float64x1_t bl = vget_low_f64(b); + return vcombine_f64(al, bl); +} + +FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b) +{ + float64x1_t ah = vget_high_f64(a); + float64x1_t bh = vget_high_f64(b); + return vcombine_f64(ah, bh); +} + +FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b) +{ + __m256d res; + res.vect_f64[0] = _mm_shuffle_pd_00(a.vect_f64[0],b.vect_f64[0]); + res.vect_f64[1] = _mm_shuffle_pd_00(a.vect_f64[1],b.vect_f64[1]); + return res; +} + +FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b) +{ + __m256d res; + res.vect_f64[0] = _mm_shuffle_pd_11(a.vect_f64[0],b.vect_f64[0]); + res.vect_f64[1] = _mm_shuffle_pd_11(a.vect_f64[1],b.vect_f64[1]); + return res; +} + +FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b) { + __m256d res; + res.vect_f64[0] = a.vect_f64[0]; + res.vect_f64[1] = b.vect_f64[0]; + return res; +} + + +FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b) +{ + __m256d res; + res.vect_f64[0] = a.vect_f64[1]; + res.vect_f64[1] = b.vect_f64[1]; + return res; +} + +FORCE_INLINE __m256d _mm256_reverse(__m256d x) +{ + __m256d res; + float64x2_t low = x.vect_f64[0]; + float64x2_t high = x.vect_f64[1]; + float64x1_t a = vget_low_f64(low); + float64x1_t b = vget_high_f64(low); + float64x1_t c = vget_low_f64(high); + float64x1_t d = vget_high_f64(high); + res.vect_f64[0] = vcombine_f64(d, c); + res.vect_f64[1] = vcombine_f64(b, a); + return res; +} + +/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code: +out1 = [ in1[0], in2[0], in1[1], in2[1] ] +out2 = [ in1[2], in2[2], in1[3], in2[3] ] +*/ +# define INTERLEAVE2(in1, in2, out1, out2) { \ + __m128d low1__ = _mm256_castpd256_pd128(in1); \ + __m128d low2__ = _mm256_castpd256_pd128(in2); \ + __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ + __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ + __m256d tmp__ = _mm256_insertf128_pd_1( \ + _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)), \ + _mm_shuffle_pd_11(low1__, low2__)); \ + out2 = _mm256_insertf128_pd_1( \ + _mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)), \ + _mm_shuffle_pd_11(high1__, high2__)); \ + out1 = tmp__; \ +} + +/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code: +out1 = [ in1[0], in1[2], in2[0], in2[2] ] +out2 = [ in1[1], in1[3], in2[1], in2[3] ] +*/ +# define UNINTERLEAVE2(in1, in2, out1, out2) { \ + __m128d low1__ = _mm256_castpd256_pd128(in1); 
\ + __m128d low2__ = _mm256_castpd256_pd128(in2); \ + __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ + __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ + __m256d tmp__ = _mm256_insertf128_pd_1( \ + _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)), \ + _mm_shuffle_pd_00(low2__, high2__)); \ + out2 = _mm256_insertf128_pd_1( \ + _mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)), \ + _mm_shuffle_pd_11(low2__, high2__)); \ + out1 = tmp__; \ +} + +# define VTRANSPOSE4(row0, row1, row2, row3) { \ + __m256d tmp3, tmp2, tmp1, tmp0; \ + \ + tmp0 = _mm256_shuffle_pd_00((row0),(row1)); \ + tmp2 = _mm256_shuffle_pd_11((row0),(row1)); \ + tmp1 = _mm256_shuffle_pd_00((row2),(row3)); \ + tmp3 = _mm256_shuffle_pd_11((row2),(row3)); \ + \ + (row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1); \ + (row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3); \ + (row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1); \ + (row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3); \ + } + +/*VSWAPHL(a, b) pseudo code: +return [ b[0], b[1], a[2], a[3] ] +*/ +# define VSWAPHL(a,b) \ + _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1)) + +/* reverse/flip all floats */ +# define VREV_S(a) _mm256_reverse(a) + +/* reverse/flip complex floats */ +# define VREV_C(a) _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a)) + +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0) + +#endif + +#endif /* PF_AVX_DBL_H */ + diff --git a/pffft/simd/pf_neon_double_from_avx.h b/pffft/simd/pf_neon_double_from_avx.h new file mode 100644 index 0000000..5cce17e --- /dev/null +++ b/pffft/simd/pf_neon_double_from_avx.h @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ + */ + +//see https://github.com/kunpengcompute/AvxToNeon + +#ifndef PF_NEON_DBL_FROM_AVX_H +#define PF_NEON_DBL_FROM_AVX_H +#include + + +#if defined(__GNUC__) || defined(__clang__) + +#pragma push_macro("FORCE_INLINE") +#define FORCE_INLINE static inline __attribute__((always_inline)) + +#else + +#error "Macro name collisions may happens with unknown compiler" +#ifdef FORCE_INLINE +#undef FORCE_INLINE +#endif + +#define FORCE_INLINE static inline + +#endif + +typedef struct { + float32x4_t vect_f32[2]; +} __m256; + +typedef struct { + float64x2_t vect_f64[2]; +} __m256d; + +typedef float64x2_t __m128d; + +FORCE_INLINE __m256d _mm256_setzero_pd(void) +{ + __m256d ret; + ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0); + return ret; +} + +FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b) +{ + __m256d res_m256d; + res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]); + return res_m256d; +} + +FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b) +{ + __m256d res_m256d; + res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]); + return res_m256d; +} + +FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b) +{ + __m256d res_m256d; + res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]); + return res_m256d; +} + +FORCE_INLINE __m256d _mm256_set1_pd(double a) +{ + __m256d ret; + ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a); + return ret; +} + +FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr) +{ + __m256d res; + res.vect_f64[0] = vld1q_f64((const double *)mem_addr); + res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2); + return res; +} +FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr) +{ + __m256d res; + res.vect_f64[0] = vld1q_f64((const double *)mem_addr); + res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2); + return res; +} + +FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a) +{ + return a.vect_f64[0]; +} + +FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8) +{ + assert(imm8 >= 0 && imm8 <= 1); + return a.vect_f64[imm8]; +} + +FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a) +{ + __m256d res; + res.vect_f64[0] = a; + return res; +} + +#endif /* PF_AVX_DBL_H */ + diff --git a/pffft/simd/pf_neon_float.h b/pffft/simd/pf_neon_float.h new file mode 100644 index 0000000..c7a547f --- /dev/null +++ b/pffft/simd/pf_neon_float.h @@ -0,0 +1,87 @@ + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. 
+ + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. +*/ + +#ifndef PF_NEON_FLT_H +#define PF_NEON_FLT_H + +/* + ARM NEON support macros +*/ +#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__)) +#pragma message( __FILE__ ": ARM NEON macros are defined" ) + +# include +typedef float32x4_t v4sf; + +# define SIMD_SZ 4 + +typedef union v4sf_union { + v4sf v; + float f[SIMD_SZ]; +} v4sf_union; + +# define VARCH "NEON" +# define VREQUIRES_ALIGN 0 /* usually no alignment required */ +# define VZERO() vdupq_n_f32(0) +# define VMUL(a,b) vmulq_f32(a,b) +# define VADD(a,b) vaddq_f32(a,b) +# define VMADD(a,b,c) vmlaq_f32(c,a,b) +# define VSUB(a,b) vsubq_f32(a,b) +# define LD_PS1(p) vld1q_dup_f32(&(p)) +# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr))) +# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr))) +# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } +# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } +# define VTRANSPOSE4(x0,x1,x2,x3) { \ + float32x4x2_t t0_ = vzipq_f32(x0, x2); \ + float32x4x2_t t1_ = vzipq_f32(x1, x3); \ + float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \ + float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \ + x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \ + } +// marginally faster version +//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } +# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a)) + +/* reverse/flip all floats */ +# define VREV_S(a) vcombine_f32(vrev64_f32(vget_high_f32(a)), vrev64_f32(vget_low_f32(a))) +/* reverse/flip complex floats */ +# define VREV_C(a) vextq_f32(a, a, 2) + +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0) + +#else +/* #pragma message( __FILE__ ": ARM NEON macros are not defined" ) */ +#endif + +#endif /* PF_NEON_FLT_H */ + diff --git a/pffft/simd/pf_scalar_double.h b/pffft/simd/pf_scalar_double.h new file mode 100644 index 0000000..b7a1cae --- /dev/null +++ b/pffft/simd/pf_scalar_double.h @@ -0,0 +1,185 @@ + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de ) + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. 
+ + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. +*/ + +#ifndef PF_SCAL_DBL_H +#define PF_SCAL_DBL_H + +/* + fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead +*/ + +#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED) +#pragma message( __FILE__ ": double SCALAR4 macros are defined" ) + +typedef struct { + vsfscalar a; + vsfscalar b; + vsfscalar c; + vsfscalar d; +} v4sf; + +# define SIMD_SZ 4 + +typedef union v4sf_union { + v4sf v; + vsfscalar f[SIMD_SZ]; +} v4sf_union; + +# define VARCH "4xScalar" +# define VREQUIRES_ALIGN 0 + + static ALWAYS_INLINE(v4sf) VZERO() { + v4sf r = { 0.f, 0.f, 0.f, 0.f }; + return r; + } + + static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) { + v4sf r = { A.a * B.a, A.b * B.b, A.c * B.c, A.d * B.d }; + return r; + } + + static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) { + v4sf r = { A.a + B.a, A.b + B.b, A.c + B.c, A.d + B.d }; + return r; + } + + static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) { + v4sf r = { A.a * B.a + C.a, A.b * B.b + C.b, A.c * B.c + C.c, A.d * B.d + C.d }; + return r; + } + + static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) { + v4sf r = { A.a - B.a, A.b - B.b, A.c - B.c, A.d - B.d }; + return r; + } + + static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) { + v4sf r = { v, v, v, v }; + return r; + } + +# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr))) + +# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr))) + +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0) + + + /* INTERLEAVE2() */ + #define INTERLEAVE2( A, B, C, D) \ + do { \ + v4sf Cr = { A.a, B.a, A.b, B.b }; \ + v4sf Dr = { A.c, B.c, A.d, B.d }; \ + C = Cr; \ + D = Dr; \ + } while (0) + + + /* UNINTERLEAVE2() */ + #define UNINTERLEAVE2(A, B, C, D) \ + do { \ + v4sf Cr = { A.a, A.c, B.a, B.c }; \ + v4sf Dr = { A.b, A.d, B.b, B.d }; \ + C = Cr; \ + D = Dr; \ + } while (0) + + + /* VTRANSPOSE4() */ + #define VTRANSPOSE4(A, B, C, D) \ + do { \ + v4sf Ar = { A.a, B.a, C.a, D.a }; \ + v4sf Br = { A.b, B.b, C.b, D.b }; \ + v4sf Cr = { A.c, B.c, C.c, D.c }; \ + v4sf Dr = { A.d, B.d, C.d, D.d }; \ + A = Ar; \ + B = Br; \ + C = Cr; \ + D = Dr; \ + } while (0) + + + /* VSWAPHL() */ + static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) { + v4sf r = { B.a, B.b, A.c, A.d }; + return r; + } + + + /* reverse/flip all floats */ + static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) { + v4sf r = { A.d, A.c, A.b, A.a }; + return r; + } + + /* reverse/flip complex floats */ + static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) { + v4sf r = { A.c, A.d, A.a, A.b }; + return r; + } + +#else +/* #pragma message( __FILE__ ": double SCALAR4 macros are not defined" ) */ +#endif + + +#if !defined(SIMD_SZ) +#pragma message( __FILE__ ": float SCALAR1 macros are defined" ) +typedef vsfscalar v4sf; + +# define SIMD_SZ 1 + +typedef union v4sf_union { + v4sf v; + vsfscalar 
f[SIMD_SZ]; +} v4sf_union; + +# define VARCH "Scalar" +# define VREQUIRES_ALIGN 0 +# define VZERO() 0.0 +# define VMUL(a,b) ((a)*(b)) +# define VADD(a,b) ((a)+(b)) +# define VMADD(a,b,c) ((a)*(b)+(c)) +# define VSUB(a,b) ((a)-(b)) +# define LD_PS1(p) (p) +# define VLOAD_UNALIGNED(ptr) (*(ptr)) +# define VLOAD_ALIGNED(ptr) (*(ptr)) +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0) + +#else +/* #pragma message( __FILE__ ": double SCALAR1 macros are not defined" ) */ +#endif + + +#endif /* PF_SCAL_DBL_H */ + diff --git a/pffft/simd/pf_scalar_float.h b/pffft/simd/pf_scalar_float.h new file mode 100644 index 0000000..4588588 --- /dev/null +++ b/pffft/simd/pf_scalar_float.h @@ -0,0 +1,185 @@ + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de ) + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. 
+*/ + +#ifndef PF_SCAL_FLT_H +#define PF_SCAL_FLT_H + +/* + fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead +*/ + +#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED) +#pragma message( __FILE__ ": float SCALAR4 macros are defined" ) + +typedef struct { + vsfscalar a; + vsfscalar b; + vsfscalar c; + vsfscalar d; +} v4sf; + +# define SIMD_SZ 4 + +typedef union v4sf_union { + v4sf v; + vsfscalar f[SIMD_SZ]; +} v4sf_union; + +# define VARCH "4xScalar" +# define VREQUIRES_ALIGN 0 + + static ALWAYS_INLINE(v4sf) VZERO() { + v4sf r = { 0.f, 0.f, 0.f, 0.f }; + return r; + } + + static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) { + v4sf r = { A.a * B.a, A.b * B.b, A.c * B.c, A.d * B.d }; + return r; + } + + static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) { + v4sf r = { A.a + B.a, A.b + B.b, A.c + B.c, A.d + B.d }; + return r; + } + + static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) { + v4sf r = { A.a * B.a + C.a, A.b * B.b + C.b, A.c * B.c + C.c, A.d * B.d + C.d }; + return r; + } + + static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) { + v4sf r = { A.a - B.a, A.b - B.b, A.c - B.c, A.d - B.d }; + return r; + } + + static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) { + v4sf r = { v, v, v, v }; + return r; + } + +# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr))) + +# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr))) + +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0) + + + /* INTERLEAVE2() */ + #define INTERLEAVE2( A, B, C, D) \ + do { \ + v4sf Cr = { A.a, B.a, A.b, B.b }; \ + v4sf Dr = { A.c, B.c, A.d, B.d }; \ + C = Cr; \ + D = Dr; \ + } while (0) + + + /* UNINTERLEAVE2() */ + #define UNINTERLEAVE2(A, B, C, D) \ + do { \ + v4sf Cr = { A.a, A.c, B.a, B.c }; \ + v4sf Dr = { A.b, A.d, B.b, B.d }; \ + C = Cr; \ + D = Dr; \ + } while (0) + + + /* VTRANSPOSE4() */ + #define VTRANSPOSE4(A, B, C, D) \ + do { \ + v4sf Ar = { A.a, B.a, C.a, D.a }; \ + v4sf Br = { A.b, B.b, C.b, D.b }; \ + v4sf Cr = { A.c, B.c, C.c, D.c }; \ + v4sf Dr = { A.d, B.d, C.d, D.d }; \ + A = Ar; \ + B = Br; \ + C = Cr; \ + D = Dr; \ + } while (0) + + + /* VSWAPHL() */ + static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) { + v4sf r = { B.a, B.b, A.c, A.d }; + return r; + } + + + /* reverse/flip all floats */ + static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) { + v4sf r = { A.d, A.c, A.b, A.a }; + return r; + } + + /* reverse/flip complex floats */ + static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) { + v4sf r = { A.c, A.d, A.a, A.b }; + return r; + } + +#else +/* #pragma message( __FILE__ ": float SCALAR4 macros are not defined" ) */ +#endif + + +#if !defined(SIMD_SZ) +#pragma message( __FILE__ ": float SCALAR1 macros are defined" ) +typedef vsfscalar v4sf; + +# define SIMD_SZ 1 + +typedef union v4sf_union { + v4sf v; + vsfscalar f[SIMD_SZ]; +} v4sf_union; + +# define VARCH "Scalar" +# define VREQUIRES_ALIGN 0 +# define VZERO() 0.f +# define VMUL(a,b) ((a)*(b)) +# define VADD(a,b) ((a)+(b)) +# define VMADD(a,b,c) ((a)*(b)+(c)) +# define VSUB(a,b) ((a)-(b)) +# define LD_PS1(p) (p) +# define VLOAD_UNALIGNED(ptr) (*(ptr)) +# define VLOAD_ALIGNED(ptr) (*(ptr)) +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0) + +#else +/* #pragma message( __FILE__ ": float SCALAR1 macros are not defined" ) */ +#endif + + +#endif /* PF_SCAL_FLT_H */ + diff --git a/pffft/simd/pf_sse1_float.h b/pffft/simd/pf_sse1_float.h new file mode 100644 index 0000000..df73c2e --- /dev/null +++ b/pffft/simd/pf_sse1_float.h @@ -0,0 +1,82 @@ + +/* Copyright (c) 2013 Julien Pommier ( 
pommier@modartt.com ) + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. +*/ + +#ifndef PF_SSE1_FLT_H +#define PF_SSE1_FLT_H + +/* + SSE1 support macros +*/ +#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86)) +#pragma message( __FILE__ ": SSE1 float macros are defined" ) + +#include +typedef __m128 v4sf; + +/* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions + * anyway so you will have to work if you want to enable AVX with its 256-bit vectors. 
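+ * As an illustrative sketch of the 4-lane convention these headers assume:
+ *   v4sf_union u; u.v = LD_PS1(1.f);   => u.f[0] == u.f[1] == u.f[2] == u.f[3] == 1.f
+ * (v4sf_union and LD_PS1 are defined below in this header.)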
*/ +# define SIMD_SZ 4 + +typedef union v4sf_union { + v4sf v; + float f[SIMD_SZ]; +} v4sf_union; + +# define VARCH "SSE1" +# define VREQUIRES_ALIGN 1 +# define VZERO() _mm_setzero_ps() +# define VMUL(a,b) _mm_mul_ps(a,b) +# define VADD(a,b) _mm_add_ps(a,b) +# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c) +# define VSUB(a,b) _mm_sub_ps(a,b) +# define LD_PS1(p) _mm_set1_ps(p) +# define VLOAD_UNALIGNED(ptr) _mm_loadu_ps(ptr) +# define VLOAD_ALIGNED(ptr) _mm_load_ps(ptr) + +# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; } +# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } +# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3) +# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)) + +/* reverse/flip all floats */ +# define VREV_S(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3)) +/* reverse/flip complex floats */ +# define VREV_C(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(1,0,3,2)) + +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0) + +#else +/* #pragma message( __FILE__ ": SSE1 float macros are not defined" ) */ +#endif + +#endif /* PF_SSE1_FLT_H */ + diff --git a/pffft/simd/pf_sse2_double.h b/pffft/simd/pf_sse2_double.h new file mode 100644 index 0000000..da87951 --- /dev/null +++ b/pffft/simd/pf_sse2_double.h @@ -0,0 +1,281 @@ +/* + Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com ) +*/ + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. 
+*/ + +#ifndef PF_SSE2_DBL_H +#define PF_SSE2_DBL_H + +//detect sse2 support under MSVC +#if defined ( _M_IX86_FP ) +# if _M_IX86_FP == 2 +# if !defined(__SSE2__) +# define __SSE2__ +# endif +# endif +#endif + +/* + SSE2 64bit support macros +*/ +#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined( __SSE4_2__ ) | defined( __SSE4_1__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || defined ( __x86_64__ ) || defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 )) +#pragma message (__FILE__ ": SSE2 double macros are defined" ) + +#include + +typedef struct { + __m128d d128[2]; +} m256d; + +typedef m256d v4sf; + +# define SIMD_SZ 4 + +typedef union v4sf_union { + v4sf v; + double f[SIMD_SZ]; +} v4sf_union; + + +#if defined(__GNUC__) || defined(__clang__) + +#pragma push_macro("FORCE_INLINE") +#define FORCE_INLINE static inline __attribute__((always_inline)) + +#elif defined (_MSC_VER) +#define FORCE_INLINE static __forceinline + +#else +#error "Macro name collisions may happens with unknown compiler" +#ifdef FORCE_INLINE +#undef FORCE_INLINE +#endif +#define FORCE_INLINE static inline +#endif + +FORCE_INLINE m256d mm256_setzero_pd(void) +{ + m256d ret; + ret.d128[0] = ret.d128[1] = _mm_setzero_pd(); + return ret; +} + +FORCE_INLINE m256d mm256_mul_pd(m256d a, m256d b) +{ + m256d ret; + ret.d128[0] = _mm_mul_pd(a.d128[0], b.d128[0]); + ret.d128[1] = _mm_mul_pd(a.d128[1], b.d128[1]); + return ret; +} + +FORCE_INLINE m256d mm256_add_pd(m256d a, m256d b) +{ + m256d ret; + ret.d128[0] = _mm_add_pd(a.d128[0], b.d128[0]); + ret.d128[1] = _mm_add_pd(a.d128[1], b.d128[1]); + return ret; +} + +FORCE_INLINE m256d mm256_sub_pd(m256d a, m256d b) +{ + m256d ret; + ret.d128[0] = _mm_sub_pd(a.d128[0], b.d128[0]); + ret.d128[1] = _mm_sub_pd(a.d128[1], b.d128[1]); + return ret; +} + +FORCE_INLINE m256d mm256_set1_pd(double a) +{ + m256d ret; + ret.d128[0] = ret.d128[1] = _mm_set1_pd(a); + return ret; +} + +FORCE_INLINE m256d mm256_load_pd (double const * mem_addr) +{ + m256d res; + res.d128[0] = _mm_load_pd((const double *)mem_addr); + res.d128[1] = _mm_load_pd((const double *)mem_addr + 2); + return res; +} +FORCE_INLINE m256d mm256_loadu_pd (double const * mem_addr) +{ + m256d res; + res.d128[0] = _mm_loadu_pd((const double *)mem_addr); + res.d128[1] = _mm_loadu_pd((const double *)mem_addr + 2); + return res; +} + + +# define VARCH "SSE2" +# define VREQUIRES_ALIGN 1 +# define VZERO() mm256_setzero_pd() +# define VMUL(a,b) mm256_mul_pd(a,b) +# define VADD(a,b) mm256_add_pd(a,b) +# define VMADD(a,b,c) mm256_add_pd(mm256_mul_pd(a,b), c) +# define VSUB(a,b) mm256_sub_pd(a,b) +# define LD_PS1(p) mm256_set1_pd(p) +# define VLOAD_UNALIGNED(ptr) mm256_loadu_pd(ptr) +# define VLOAD_ALIGNED(ptr) mm256_load_pd(ptr) + + +FORCE_INLINE __m128d mm256_castpd256_pd128(m256d a) +{ + return a.d128[0]; +} + +FORCE_INLINE __m128d mm256_extractf128_pd (m256d a, const int imm8) +{ + assert(imm8 >= 0 && imm8 <= 1); + return a.d128[imm8]; +} +FORCE_INLINE m256d mm256_insertf128_pd_1(m256d a, __m128d b) +{ + m256d res; + res.d128[0] = a.d128[0]; + res.d128[1] = b; + return res; +} +FORCE_INLINE m256d mm256_castpd128_pd256(__m128d a) +{ + m256d res; + res.d128[0] = a; + return res; +} + +FORCE_INLINE m256d mm256_shuffle_pd_00(m256d a, m256d b) +{ + m256d res; + res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0],0); + res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1],0); + return res; +} + +FORCE_INLINE m256d mm256_shuffle_pd_11(m256d a, m256d b) +{ + m256d res; + res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0], 
3); + res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1], 3); + return res; +} + +FORCE_INLINE m256d mm256_permute2f128_pd_0x20(m256d a, m256d b) { + m256d res; + res.d128[0] = a.d128[0]; + res.d128[1] = b.d128[0]; + return res; +} + + +FORCE_INLINE m256d mm256_permute2f128_pd_0x31(m256d a, m256d b) +{ + m256d res; + res.d128[0] = a.d128[1]; + res.d128[1] = b.d128[1]; + return res; +} + +FORCE_INLINE m256d mm256_reverse(m256d x) +{ + m256d res; + res.d128[0] = _mm_shuffle_pd(x.d128[1],x.d128[1],1); + res.d128[1] = _mm_shuffle_pd(x.d128[0],x.d128[0],1); + return res; +} + +/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code: +out1 = [ in1[0], in2[0], in1[1], in2[1] ] +out2 = [ in1[2], in2[2], in1[3], in2[3] ] +*/ +# define INTERLEAVE2(in1, in2, out1, out2) { \ + __m128d low1__ = mm256_castpd256_pd128(in1); \ + __m128d low2__ = mm256_castpd256_pd128(in2); \ + __m128d high1__ = mm256_extractf128_pd(in1, 1); \ + __m128d high2__ = mm256_extractf128_pd(in2, 1); \ + m256d tmp__ = mm256_insertf128_pd_1( \ + mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \ + _mm_shuffle_pd(low1__, low2__, 3)); \ + out2 = mm256_insertf128_pd_1( \ + mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \ + _mm_shuffle_pd(high1__, high2__, 3)); \ + out1 = tmp__; \ +} + +/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code: +out1 = [ in1[0], in1[2], in2[0], in2[2] ] +out2 = [ in1[1], in1[3], in2[1], in2[3] ] +*/ +# define UNINTERLEAVE2(in1, in2, out1, out2) { \ + __m128d low1__ = mm256_castpd256_pd128(in1); \ + __m128d low2__ = mm256_castpd256_pd128(in2); \ + __m128d high1__ = mm256_extractf128_pd(in1, 1); \ + __m128d high2__ = mm256_extractf128_pd(in2, 1); \ + m256d tmp__ = mm256_insertf128_pd_1( \ + mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \ + _mm_shuffle_pd(low2__, high2__, 0)); \ + out2 = mm256_insertf128_pd_1( \ + mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \ + _mm_shuffle_pd(low2__, high2__, 3)); \ + out1 = tmp__; \ +} + +# define VTRANSPOSE4(row0, row1, row2, row3) { \ + m256d tmp3, tmp2, tmp1, tmp0; \ + \ + tmp0 = mm256_shuffle_pd_00((row0),(row1)); \ + tmp2 = mm256_shuffle_pd_11((row0),(row1)); \ + tmp1 = mm256_shuffle_pd_00((row2),(row3)); \ + tmp3 = mm256_shuffle_pd_11((row2),(row3)); \ + \ + (row0) = mm256_permute2f128_pd_0x20(tmp0, tmp1); \ + (row1) = mm256_permute2f128_pd_0x20(tmp2, tmp3); \ + (row2) = mm256_permute2f128_pd_0x31(tmp0, tmp1); \ + (row3) = mm256_permute2f128_pd_0x31(tmp2, tmp3); \ + } + +/*VSWAPHL(a, b) pseudo code: +return [ b[0], b[1], a[2], a[3] ] +*/ +# define VSWAPHL(a,b) \ + mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_castpd256_pd128(b)), mm256_extractf128_pd(a, 1)) + +/* reverse/flip all floats */ +# define VREV_S(a) mm256_reverse(a) + +/* reverse/flip complex floats */ +# define VREV_C(a) mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_extractf128_pd(a, 1)), mm256_castpd256_pd128(a)) + +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0) + +#endif +#endif diff --git a/pffft/sse2neon.h b/pffft/sse2neon.h new file mode 100644 index 0000000..b28a797 --- /dev/null +++ b/pffft/sse2neon.h @@ -0,0 +1,5956 @@ +#ifndef SSE2NEON_H +#define SSE2NEON_H + +// This header file provides a simple API translation layer +// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions +// +// This header file does not yet translate all of the SSE intrinsics. +// +// Contributors to this work are: +// John W. 
Ratcliff +// Brandon Rowlett +// Ken Fast +// Eric van Beurden +// Alexander Potylitsin +// Hasindu Gamaarachchi +// Jim Huang +// Mark Cheng +// Malcolm James MacLeod +// Devin Hussey (easyaspi314) +// Sebastian Pop +// Developer Ecosystem Engineering +// Danila Kutenin +// François Turban (JishinMaster) +// Pei-Hsuan Hung +// Yang-Hao Yuan + +/* + * sse2neon is freely redistributable under the MIT License. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Tunable configurations */ + +/* Enable precise implementation of _mm_min_ps and _mm_max_ps + * This would slow down the computation a bit, but gives consistent result with + * x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result) + */ +#ifndef SSE2NEON_PRECISE_MINMAX +#define SSE2NEON_PRECISE_MINMAX (0) +#endif + +#if defined(__GNUC__) || defined(__clang__) +#pragma push_macro("FORCE_INLINE") +#pragma push_macro("ALIGN_STRUCT") +#define FORCE_INLINE static inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#else +#error "Macro name collisions may happen with unsupported compiler." +#ifdef FORCE_INLINE +#undef FORCE_INLINE +#endif +#define FORCE_INLINE static inline +#ifndef ALIGN_STRUCT +#define ALIGN_STRUCT(x) __declspec(align(x)) +#endif +#endif + +#include +#include + +/* Architecture-specific build options */ +/* FIXME: #pragma GCC push_options is only available on GCC */ +#if defined(__GNUC__) +#if defined(__arm__) && __ARM_ARCH == 7 +/* According to ARM C Language Extensions Architecture specification, + * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) + * architecture supported. + */ +#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) +#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." +#endif +#pragma GCC push_options +#pragma GCC target("fpu=neon") +#elif defined(__aarch64__) +#pragma GCC push_options +#pragma GCC target("+simd") +#else +#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A." +#endif +#endif + +#include + +/* Rounding functions require either Aarch64 instructions or libm failback */ +#if !defined(__aarch64__) +#include +#endif + +/* "__has_builtin" can be used to query support for built-in functions + * provided by gcc/clang and other compilers that support it. 
+ */ +#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */ +/* Compatibility with gcc <= 9 */ +#if __GNUC__ <= 9 +#define __has_builtin(x) HAS##x +#define HAS__builtin_popcount 1 +#define HAS__builtin_popcountll 1 +#else +#define __has_builtin(x) 0 +#endif +#endif + +/** + * MACRO for shuffle parameter for _mm_shuffle_ps(). + * Argument fp3 is a digit[0123] that represents the fp from argument "b" + * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same + * for fp2 in result. fp1 is a digit[0123] that represents the fp from + * argument "a" of mm_shuffle_ps that will be places in fp1 of result. + * fp0 is the same for fp0 of result. + */ +#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) + +/* Rounding mode macros. */ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 +#define _MM_FROUND_NO_EXC 0x08 + +/* indicate immediate constant argument in a given range */ +#define __constrange(a, b) const + +/* A few intrinsics accept traditional data types like ints or floats, but + * most operate on data types that are specific to SSE. + * If a vector type ends in d, it contains doubles, and if it does not have + * a suffix, it contains floats. An integer vector type can contain any type + * of integer, from chars to shorts to unsigned long longs. + */ +typedef int64x1_t __m64; +typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ +// On ARM 32-bit architecture, the float64x2_t is not supported. +// The data type __m128d should be represented in a different way for related +// intrinsic conversion. +#if defined(__aarch64__) +typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ +#else +typedef float32x4_t __m128d; +#endif +typedef int64x2_t __m128i; /* 128-bit vector containing integers */ + +/* type-safe casting between types */ + +#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) +#define vreinterpretq_m128_f32(x) (x) +#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) + +#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) +#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) +#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) +#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) +#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) +#define vreinterpretq_f32_m128(x) (x) +#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) + +#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) +#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) +#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) +#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) +#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) +#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) +#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) +#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) +#define vreinterpretq_m128i_s64(x) (x) + +#define vreinterpretq_m128i_u8(x) 
vreinterpretq_s64_u8(x) +#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) +#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) +#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) + +#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) +#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) +#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) +#define vreinterpretq_s64_m128i(x) (x) + +#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) +#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) +#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) +#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) + +#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) +#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) +#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) +#define vreinterpret_m64_s64(x) (x) + +#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) +#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) +#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) +#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) + +#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) +#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) +#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) + +#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) +#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x) +#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) +#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) + +#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) +#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) +#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) +#define vreinterpret_s64_m64(x) (x) + +#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) + +#if defined(__aarch64__) +#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_m128d_f64(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) + +#define vreinterpretq_f64_m128d(x) (x) +#else +#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_m128d_f32(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_f32_m128d(x) (x) +#endif + +// A struct is defined in this header file called 'SIMDVec' which can be used +// by applications which attempt to access the contents of an _m128 struct +// directly. It is important to note that accessing the __m128 struct directly +// is bad coding practice by Microsoft: @see: +// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx +// +// However, some legacy source code may try to access the contents of an __m128 +// struct directly so the developer can use the SIMDVec as an alias for it. Any +// casting must be done manually by the developer, as you cannot cast or +// otherwise alias the base NEON data type for intrinsic operations. +// +// union intended to allow direct access to an __m128 variable using the names +// that the MSVC compiler provides. This union should really only be used when +// trying to access the members of the vector as integer values. GCC/clang +// allow native access to the float members through a simple array access +// operator (in C since 4.6, in C++ since 4.8). +// +// Ideally direct accesses to SIMD vectors should not be used since it can cause +// a performance hit. 
If it really is needed however, the original __m128 +// variable can be aliased with a pointer to this union and used to access +// individual components. The use of this union should be hidden behind a macro +// that is used throughout the codebase to access the members instead of always +// declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec { + float m128_f32[4]; // as floats - DON'T USE. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. +} SIMDVec; + +// casting using SIMDVec +#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) +#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) +#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) + +/* Backwards compatibility for compilers with lack of specific type support */ + +// Older gcc does not define vld1q_u8_x4 type +#if defined(__GNUC__) && !defined(__clang__) +#if __GNUC__ <= 9 +FORCE_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t *p) +{ + uint8x16x4_t ret; + ret.val[0] = vld1q_u8(p + 0); + ret.val[1] = vld1q_u8(p + 16); + ret.val[2] = vld1q_u8(p + 32); + ret.val[3] = vld1q_u8(p + 48); + return ret; +} +#endif +#endif + +/* Function Naming Conventions + * The naming convention of SSE intrinsics is straightforward. A generic SSE + * intrinsic function is given as follows: + * _mm__ + * + * The parts of this format are given as follows: + * 1. describes the operation performed by the intrinsic + * 2. identifies the data type of the function's primary arguments + * + * This last part, , is a little complicated. It identifies the + * content of the input values, and can be set to any of the following values: + * + ps - vectors contain floats (ps stands for packed single-precision) + * + pd - vectors cantain doubles (pd stands for packed double-precision) + * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit + * signed integers + * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit + * unsigned integers + * + si128 - unspecified 128-bit vector or 256-bit vector + * + m128/m128i/m128d - identifies input vector types when they are different + * than the type of the returned vector + * + * For example, _mm_setzero_ps. The _mm implies that the function returns + * a 128-bit vector. The _ps at the end implies that the argument vectors + * contain floats. + * + * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8) + * // Set packed 16-bit integers. 
128 bits, 8 short, per 16 bits + * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + * // Set packed 8-bit integers + * // 128 bits, 16 chars, per 8 bits + * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11, + * 4, 5, 12, 13, 6, 7, 14, 15); + * // Shuffle packed 8-bit integers + * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb + * + * Data (Number, Binary, Byte Index): + +------+------+-------------+------+------+-------------+ + | 1 | 2 | 3 | 4 | Number + +------+------+------+------+------+------+------+------+ + | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary + +------+------+------+------+------+------+------+------+ + | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 5 | 6 | 7 | 8 | Number + +------+------+------+------+------+------+------+------+ + | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary + +------+------+------+------+------+------+------+------+ + | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index + +------+------+------+------+------+------+------+------+ + * Index (Byte Index): + +------+------+------+------+------+------+------+------+ + | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | + +------+------+------+------+------+------+------+------+ + * Result: + +------+------+------+------+------+------+------+------+ + | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index + +------+------+------+------+------+------+------+------+ + | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary + +------+------+------+------+------+------+------+------+ + | 256 | 2 | 5 | 6 | Number + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index + +------+------+------+------+------+------+------+------+ + | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary + +------+------+------+------+------+------+------+------+ + | 3 | 7 | 4 | 8 | Number + +------+------+------+------+------+------+-------------+ + */ + +/* Set/get methods */ + +/* Constants for use with _mm_prefetch. */ +enum _mm_hint { + _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ + _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */ + _MM_HINT_T1 = 2, /* load data to L2 cache only */ + _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */ + _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */ + _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */ + _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */ + _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */ +}; + +// Loads one cache line of data from address p to a location closer to the +// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx +FORCE_INLINE void _mm_prefetch(const void *p, int i) +{ + (void) i; + __builtin_prefetch(p); +} + +// Copy the lower single-precision (32-bit) floating-point element of a to dst. 
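+// Illustrative usage: _mm_cvtss_f32(_mm_set_ps(4.f, 3.f, 2.f, 1.f)) returns 1.f,
+// since _mm_set_ps takes its arguments from the highest lane down to the lowest.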
+// +// dst[31:0] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32 +FORCE_INLINE float _mm_cvtss_f32(__m128 a) +{ + return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); +} + +// Sets the 128-bit value to zero +// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx +FORCE_INLINE __m128i _mm_setzero_si128(void) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(0)); +} + +// Clears the four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setzero_ps(void) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(0)); +} + +// Sets the four single-precision, floating-point values to w. +// +// r0 := r1 := r2 := r3 := w +// +// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set1_ps(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Sets the four single-precision, floating-point values to w. +// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps1(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Sets the four single-precision, floating-point values to the four inputs. +// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Copy single-precision (32-bit) floating-point element a to the lower element +// of dst, and zero the upper 3 elements. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss +FORCE_INLINE __m128 _mm_set_ss(float a) +{ + float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Sets the four single-precision, floating-point values to the four inputs in +// reverse order. +// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Sets the 8 signed 16-bit integer values in reverse order. +// +// Return Value +// r0 := w0 +// r1 := w1 +// ... +// r7 := w7 +FORCE_INLINE __m128i _mm_setr_epi16(short w0, + short w1, + short w2, + short w3, + short w4, + short w5, + short w6, + short w7) +{ + int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; + return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); +} + +// Sets the 4 signed 32-bit integer values in reverse order +// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx +FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Set packed 64-bit integers in dst with the supplied values in reverse order. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64 +FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) +{ + return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); +} + +// Sets the 16 signed 8-bit integer values to b. +// +// r0 := b +// r1 := b +// ... 
+// r15 := b +// +// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set1_epi8(signed char w) +{ + return vreinterpretq_m128i_s8(vdupq_n_s8(w)); +} + +// Sets the 8 signed 16-bit integer values to w. +// +// r0 := w +// r1 := w +// ... +// r7 := w +// +// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set1_epi16(short w) +{ + return vreinterpretq_m128i_s16(vdupq_n_s16(w)); +} + +// Sets the 16 signed 8-bit integer values. +// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set_epi8(signed char b15, + signed char b14, + signed char b13, + signed char b12, + signed char b11, + signed char b10, + signed char b9, + signed char b8, + signed char b7, + signed char b6, + signed char b5, + signed char b4, + signed char b3, + signed char b2, + signed char b1, + signed char b0) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Sets the 8 signed 16-bit integer values. +// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set_epi16(short i7, + short i6, + short i5, + short i4, + short i3, + short i2, + short i1, + short i0) +{ + int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + return vreinterpretq_m128i_s16(vld1q_s16(data)); +} + +// Sets the 16 signed 8-bit integer values in reverse order. +// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx +FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, + signed char b1, + signed char b2, + signed char b3, + signed char b4, + signed char b5, + signed char b6, + signed char b7, + signed char b8, + signed char b9, + signed char b10, + signed char b11, + signed char b12, + signed char b13, + signed char b14, + signed char b15) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Sets the 4 signed 32-bit integer values to i. +// +// r0 := i +// r1 := i +// r2 := i +// r3 := I +// +// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set1_epi32(int _i) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); +} + +// Sets the 2 signed 64-bit integer values to i. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100) +FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) +{ + return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i)); +} + +// Sets the 2 signed 64-bit integer values to i. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x +FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) +{ + return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); +} + +// Sets the 4 signed 32-bit integer values. 
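+// Illustrative usage: _mm_set_epi32(3, 2, 1, 0) places 0 in the lowest 32-bit
+// lane and 3 in the highest, i.e. the arguments are given from high lane to low.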
+// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Returns the __m128i structure with its two 64-bit integer values +// initialized to the values of the two 64-bit integers passed in. +// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx +FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) +{ + int64_t ALIGN_STRUCT(16) data[2] = {i2, i1}; + return vreinterpretq_m128i_s64(vld1q_s64(data)); +} + +// Returns the __m128i structure with its two 64-bit integer values +// initialized to the values of the two 64-bit integers passed in. +// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx +FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) +{ + return _mm_set_epi64x((int64_t) i1, (int64_t) i2); +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd +FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) +{ + double ALIGN_STRUCT(16) data[2] = {e0, e1}; +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); +#else + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data)); +#endif +} + +// Stores four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx +FORCE_INLINE void _mm_store_ps(float *p, __m128 a) +{ + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Stores four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx +FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) +{ + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Stores four 32-bit integer values as (as a __m128i value) at the address p. +// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx +FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Stores four 32-bit integer values as (as a __m128i value) at the address p. +// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx +FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Stores the lower single - precision, floating - point value. +// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx +FORCE_INLINE void _mm_store_ss(float *p, __m128 a) +{ + vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0); +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary +// or a general-protection exception may be generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd +FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); +#else + vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a)); +#endif +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. 
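+// In this translation layer the unaligned store simply forwards to
+// _mm_store_pd above, because the NEON stores used there do not require
+// 16-byte alignment.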
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd +FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) +{ + _mm_store_pd(mem_addr, a); +} + +// Reads the lower 64 bits of b and stores them into the lower 64 bits of a. +// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx +FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) +{ + uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a)); + uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b)); + *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi)); +} + +// Stores the lower two single-precision floating point values of a to the +// address p. +// +// *p0 := a0 +// *p1 := a1 +// +// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx +FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_low_f32(a)); +} + +// Stores the upper two single-precision, floating-point values of a to the +// address p. +// +// *p0 := a2 +// *p1 := a3 +// +// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx +FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_high_f32(a)); +} + +// Loads a single single-precision, floating-point value, copying it into all +// four words +// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load1_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_dup_f32(p)); +} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. +// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[63:32] := MEM[mem_addr+31:mem_addr] +// dst[95:64] := MEM[mem_addr+31:mem_addr] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1 +#define _mm_load_ps1 _mm_load1_ps + +// Sets the lower two single-precision, floating-point values with 64 +// bits of data loaded from the address p; the upper two values are passed +// through from a. +// +// Return Value +// r0 := *p0 +// r1 := *p1 +// r2 := a2 +// r3 := a3 +// +// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx +FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); +} + +// Load 4 single-precision (32-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[31:0] := MEM[mem_addr+127:mem_addr+96] +// dst[63:32] := MEM[mem_addr+95:mem_addr+64] +// dst[95:64] := MEM[mem_addr+63:mem_addr+32] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps +FORCE_INLINE __m128 _mm_loadr_ps(const float *p) +{ + float32x4_t v = vrev64q_f32(vld1q_f32(p)); + return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); +} + +// Sets the upper two single-precision, floating-point values with 64 +// bits of data loaded from the address p; the lower two values are passed +// through from a. +// +// r0 := a0 +// r1 := a1 +// r2 := *p0 +// r3 := *p1 +// +// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx +FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); +} + +// Loads four single-precision, floating-point values. 
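+// Illustrative usage:
+//   float buf[4] = {1.f, 2.f, 3.f, 4.f};
+//   __m128 v = _mm_load_ps(buf);   /* lanes 0..3 hold 1,2,3,4 */
+// (x86 expects buf to be 16-byte aligned; the vld1q_f32 used here does not
+// enforce that, as noted for _mm_loadu_ps below.)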
+// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Loads four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_loadu_ps(const float *p) +{ + // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are + // equivalent for neon + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Load unaligned 16-bit integer from memory into the first element of dst. +// +// dst[15:0] := MEM[mem_addr+15:mem_addr] +// dst[MAX:16] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16 +FORCE_INLINE __m128i _mm_loadu_si16(const void *p) +{ + return vreinterpretq_m128i_s16( + vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); +} + +// Load unaligned 64-bit integer from memory into the first element of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[MAX:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64 +FORCE_INLINE __m128i _mm_loadu_si64(const void *p) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower of dst, and zero the upper element. mem_addr does not need to be +// aligned on any particular boundary. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd +FORCE_INLINE __m128d _mm_load_sd(const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); +#else + const float *fp = (const float *) p; + float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0}; + return vreinterpretq_m128d_f32(vld1q_f32(data)); +#endif +} + +// Loads two double-precision from 16-byte aligned memory, floating-point +// values. +// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd +FORCE_INLINE __m128d _mm_load_pd(const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vld1q_f64(p)); +#else + const float *fp = (const float *) p; + float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]}; + return vreinterpretq_m128d_f32(vld1q_f32(data)); +#endif +} + +// Loads two double-precision from unaligned memory, floating-point values. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd +FORCE_INLINE __m128d _mm_loadu_pd(const double *p) +{ + return _mm_load_pd(p); +} + +// Loads an single - precision, floating - point value into the low word and +// clears the upper three words. +// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_load_ss(const float *p) +{ + return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); +} + +FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) +{ + /* Load the lower 64 bits of the value pointed to by p into the + * lower 64 bits of the result, zeroing the upper 64 bits of the result. + */ + return vreinterpretq_m128i_s32( + vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower element of dst, and copy the upper element from a to dst. 
mem_addr does +// not need to be aligned on any particular boundary. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd +FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); +#else + return vreinterpretq_m128d_f32( + vcombine_f32(vld1_f32((const float *) p), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +#endif +} + +// Load 2 double-precision (64-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[63:0] := MEM[mem_addr+127:mem_addr+64] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd +FORCE_INLINE __m128d _mm_loadr_pd(const double *p) +{ +#if defined(__aarch64__) + float64x2_t v = vld1q_f64(p); + return vreinterpretq_m128d_f64(vextq_f64(v, v, 1)); +#else + int64x2_t v = vld1q_s64((const int64_t *) p); + return vreinterpretq_m128d_s64(vextq_s64(v, v, 1)); +#endif +} + +// Sets the low word to the single-precision, floating-point value of b +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100) +FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0), + vreinterpretq_f32_m128(a), 0)); +} + +// Copy the lower 64-bit integer in a to the lower element of dst, and zero the +// upper element. +// +// dst[63:0] := a[63:0] +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64 +FORCE_INLINE __m128i _mm_move_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); +} + +// Return vector of type __m128 with undefined elements. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps +FORCE_INLINE __m128 _mm_undefined_ps(void) +{ + __m128 a; + return a; +} + +/* Logic/Binary operations */ + +// Computes the bitwise AND-NOT of the four single-precision, floating-point +// values of a and b. +// +// r0 := ~a0 & b0 +// r1 := ~a1 & b1 +// r2 := ~a2 & b2 +// r3 := ~a3 & b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx +FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vbicq_s32(vreinterpretq_s32_m128(b), + vreinterpretq_s32_m128(a))); // *NOTE* argument swap +} + +// Compute the bitwise NOT of packed double-precision (64-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd +FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) +{ + // *NOTE* argument swap + return vreinterpretq_m128d_s64( + vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); +} + +// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the +// 128-bit value in a. 
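+// Implementation note: NEON's vbicq(x, y) computes x & ~y, complementing its
+// second operand, while the SSE andnot intrinsics complement their first;
+// hence the swapped arguments in the vbicq-based versions above and below.
+// For example, _mm_andnot_si128(_mm_set1_epi32(0xFF), x) clears the low byte
+// of every 32-bit lane, leaving x & 0xFFFFFF00.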
+// +// r := (~a) & b +// +// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vbicq_s32(vreinterpretq_s32_m128i(b), + vreinterpretq_s32_m128i(a))); // *NOTE* argument swap +} + +// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in +// b. +// +// r := a & b +// +// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Computes the bitwise AND of the four single-precision, floating-point values +// of a and b. +// +// r0 := a0 & b0 +// r1 := a1 & b1 +// r2 := a2 & b2 +// r3 := a3 & b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Compute the bitwise AND of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] AND b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd +FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Computes the bitwise OR of the four single-precision, floating-point values +// of a and b. +// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx +FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Computes bitwise EXOR (exclusive-or) of the four single-precision, +// floating-point values of a and b. +// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Compute the bitwise XOR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd +FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. +// +// r := a | b +// +// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx +FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in +// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx +FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Duplicate odd-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. 
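+// For example, a = {a0, a1, a2, a3} yields dst = {a1, a1, a3, a3}; together
+// with _mm_moveldup_ps below this is the usual building block for SSE3-style
+// complex multiplication kernels.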
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps +FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) +{ +#if __has_builtin(__builtin_shufflevector) + return vreinterpretq_m128_f32(__builtin_shufflevector( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3)); +#else + float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3); + float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +// Duplicate even-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps +FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) +{ +#if __has_builtin(__builtin_shufflevector) + return vreinterpretq_m128_f32(__builtin_shufflevector( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2)); +#else + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2); + float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +// Moves the upper two values of B into the lower two values of A. +// +// r3 := a3 +// r2 := a2 +// r1 := b3 +// r0 := b2 +FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B)); + return vreinterpretq_m128_f32(vcombine_f32(b32, a32)); +} + +// Moves the lower two values of B into the upper two values of A. +// +// r3 := b1 +// r2 := b0 +// r1 := a1 +// r0 := a0 +FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// dst[i+31:i] := ABS(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32 +FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); +} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// dst[i+15:i] := ABS(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16 +FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) +{ + return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 15 +// i := j*8 +// dst[i+7:i] := ABS(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8 +FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) +{ + return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. 
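+// This is the 64-bit __m64 (MMX-register) variant: it maps to the D-register
+// vabs_s32, just as the __m128i versions above map to the Q-register vabsq
+// forms.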
+// +// FOR j := 0 to 1 +// i := j*32 +// dst[i+31:i] := ABS(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32 +FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) +{ + return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); +} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := ABS(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16 +FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) +{ + return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := ABS(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8 +FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) +{ + return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); +} + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of b and places it into the high end of the result. +FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a32, b10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in high +// end of result takes the higher two 32 bit values from b and swaps them and +// places in low end of result. +FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) +{ + float32x2_t a21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) +{ + float32x2_t a03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); +} + +// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the +// high +FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) +{ 
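+    // Result lanes: {a0, a1, b2, b3}, i.e. the imm = _MM_SHUFFLE(3, 2, 1, 0)
+    // case handled by the dispatching macro below.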
+ float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) +{ + float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) +{ + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) +{ + float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) +{ + float32x2_t a33 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); + return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); +} + +// NEON does not support a general purpose permute intrinsic +// Selects four specific single-precision, floating-point values from a and b, +// based on the mask i. 
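+// The imm operand is expected to be a compile-time constant (it feeds
+// constant-lane intrinsics); the helpers above cover shuffle patterns with
+// cheaper dedicated NEON sequences, and this generic fallback assembles any
+// remaining pattern lane by lane.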
+// +// C equivalent: +// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, +// __constrange(0, 255) int imm) { +// __m128 ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; +// return ret; +// } +// +// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx +#define _mm_shuffle_ps_default(a, b, imm) \ + __extension__({ \ + float32x4_t ret; \ + ret = vmovq_n_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ + ret, 1); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ + ret, 2); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ + ret, 3); \ + vreinterpretq_m128_f32(ret); \ + }) + +// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) +// int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + float32x4_t _input1 = vreinterpretq_f32_m128(a); \ + float32x4_t _input2 = vreinterpretq_f32_m128(b); \ + float32x4_t _shuf = __builtin_shufflevector( \ + _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128_f32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + __m128 ret; \ + switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_ps_1032((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_ps_2301((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_ps_0321((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_ps_2103((a), (b)); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_movelh_ps((a), (b)); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_ps_1001((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_ps_0101((a), (b)); \ + break; \ + case _MM_SHUFFLE(3, 2, 1, 0): \ + ret = _mm_shuffle_ps_3210((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 0, 1, 1): \ + ret = _mm_shuffle_ps_0011((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 0, 2, 2): \ + ret = _mm_shuffle_ps_0022((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 2, 0, 0): \ + ret = _mm_shuffle_ps_2200((a), (b)); \ + break; \ + case _MM_SHUFFLE(3, 2, 0, 2): \ + ret = _mm_shuffle_ps_3202((a), (b)); \ + break; \ + case _MM_SHUFFLE(3, 2, 3, 2): \ + ret = _mm_movehl_ps((b), (a)); \ + break; \ + case _MM_SHUFFLE(1, 1, 3, 3): \ + ret = _mm_shuffle_ps_1133((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 0, 1, 0): \ + ret = _mm_shuffle_ps_2010((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 0, 0, 1): \ + ret = _mm_shuffle_ps_2001((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 0, 3, 2): \ + ret = _mm_shuffle_ps_2032((a), (b)); \ + break; \ + default: \ + ret = _mm_shuffle_ps_default((a), (b), (imm)); \ + break; \ + } \ + ret; \ + }) +#endif + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of a and places it into the high end of the result. 
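+// Equivalent to _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)); these
+// _mm_shuffle_epi_* helpers back the dispatching macro further below.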
+FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in low end +// of result takes the higher two 32 bit values from a and swaps them and places +// in high end of result. +FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a23)); +} + +// rotates the least significant 32 bits into the most signficant 32 bits, and +// shifts the rest down +FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); +} + +// rotates the most significant 32 bits into the least signficant 32 bits, and +// shifts the rest up +FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); +} + +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of a and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) +{ + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the +// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the +// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and +// places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) +{ + int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) +{ + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); +} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. 
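+// A control byte with its top bit set produces a zero byte in dst, which is
+// why the implementation masks the indices with 0x8F: indices 0x80..0x8F fall
+// outside the 16-byte table and the NEON table lookups return 0 for them.
+// For example, _mm_shuffle_epi8(x, _mm_set1_epi8(0)) broadcasts byte 0 of x.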
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8 +FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) +{ + int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a + uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b + uint8x16_t idx_masked = + vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits +#if defined(__aarch64__) + return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); +#elif defined(__GNUC__) + int8x16_t ret; + // %e and %f represent the even and odd D registers + // respectively. + __asm__ __volatile__( + "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n" + "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n" + : [ret] "=&w"(ret) + : [tbl] "w"(tbl), [idx] "w"(idx_masked)); + return vreinterpretq_m128i_s8(ret); +#else + // use this line if testing on aarch64 + int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)}; + return vreinterpretq_m128i_s8( + vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)), + vtbl2_s8(a_split, vget_high_u8(idx_masked)))); +#endif +} + +// C equivalent: +// __m128i _mm_shuffle_epi32_default(__m128i a, +// __constrange(0, 255) int imm) { +// __m128i ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; +// return ret; +// } +#define _mm_shuffle_epi32_default(a, imm) \ + __extension__({ \ + int32x4_t ret; \ + ret = vmovq_n_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \ + ret, 1); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ + ret, 2); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ + ret, 3); \ + vreinterpretq_m128i_s32(ret); \ + }) + +// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255) +// int imm) +#if defined(__aarch64__) +#define _mm_shuffle_epi32_splat(a, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \ + }) +#else +#define _mm_shuffle_epi32_splat(a, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \ + }) +#endif + +// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. 
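+// imm is decoded like the _mm_shuffle_ps immediate, two bits per destination
+// lane; for example _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)) reverses
+// the four 32-bit lanes.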
+// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx +// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_epi32(a, imm) \ + __extension__({ \ + int32x4_t _input = vreinterpretq_s32_m128i(a); \ + int32x4_t _shuf = __builtin_shufflevector( \ + _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ + vreinterpretq_m128i_s32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_epi32(a, imm) \ + __extension__({ \ + __m128i ret; \ + switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_epi_1032((a)); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_epi_2301((a)); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_epi_0321((a)); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_epi_2103((a)); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_shuffle_epi_1010((a)); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_epi_1001((a)); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_epi_0101((a)); \ + break; \ + case _MM_SHUFFLE(2, 2, 1, 1): \ + ret = _mm_shuffle_epi_2211((a)); \ + break; \ + case _MM_SHUFFLE(0, 1, 2, 2): \ + ret = _mm_shuffle_epi_0122((a)); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 2): \ + ret = _mm_shuffle_epi_3332((a)); \ + break; \ + case _MM_SHUFFLE(0, 0, 0, 0): \ + ret = _mm_shuffle_epi32_splat((a), 0); \ + break; \ + case _MM_SHUFFLE(1, 1, 1, 1): \ + ret = _mm_shuffle_epi32_splat((a), 1); \ + break; \ + case _MM_SHUFFLE(2, 2, 2, 2): \ + ret = _mm_shuffle_epi32_splat((a), 2); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 3): \ + ret = _mm_shuffle_epi32_splat((a), 3); \ + break; \ + default: \ + ret = _mm_shuffle_epi32_default((a), (imm)); \ + break; \ + } \ + ret; \ + }) +#endif + +// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified +// by imm. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100) +// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a, +// __constrange(0,255) int +// imm) +#define _mm_shufflelo_epi16_function(a, imm) \ + __extension__({ \ + int16x8_t ret = vreinterpretq_s16_m128i(a); \ + int16x4_t lowBits = vget_low_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \ + 1); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \ + 2); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \ + 3); \ + vreinterpretq_m128i_s16(ret); \ + }) + +// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shufflelo_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = __builtin_shufflevector( \ + _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ + (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) +#endif + +// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified +// by imm. 
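+// The lower four 16-bit lanes pass through unchanged; only lanes 4..7 are
+// permuted, mirroring _mm_shufflelo_epi16 above.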
+// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx +// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, +// __constrange(0,255) int +// imm) +#define _mm_shufflehi_epi16_function(a, imm) \ + __extension__({ \ + int16x8_t ret = vreinterpretq_s16_m128i(a); \ + int16x4_t highBits = vget_high_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ + 5); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ + 6); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ + 7); \ + vreinterpretq_m128i_s16(ret); \ + }) + +// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shufflehi_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = __builtin_shufflevector( \ + _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ + (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ + (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +#endif + +// Blend packed 16-bit integers from a and b using control mask imm8, and store +// the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF imm8[j] +// dst[i+15:i] := b[i+15:i] +// ELSE +// dst[i+15:i] := a[i+15:i] +// FI +// ENDFOR +// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, +// __constrange(0,255) int imm) +#define _mm_blend_epi16(a, b, imm) \ + __extension__({ \ + const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 1)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 2)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 3)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 4)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 5)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 6)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \ + uint16x8_t _mask_vec = vld1q_u16(_mask); \ + uint16x8_t _a = vreinterpretq_u16_m128i(a); \ + uint16x8_t _b = vreinterpretq_u16_m128i(b); \ + vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \ + }) + +// Blend packed 8-bit integers from a and b using mask, and store the results in +// dst. +// +// FOR j := 0 to 15 +// i := j*8 +// IF mask[i+7] +// dst[i+7:i] := b[i+7:i] +// ELSE +// dst[i+7:i] := a[i+7:i] +// FI +// ENDFOR +FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint8x16_t mask = + vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); + uint8x16_t a = vreinterpretq_u8_m128i(_a); + uint8x16_t b = vreinterpretq_u8_m128i(_b); + return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); +} + +/* Shifts */ + + +// Shift packed 16-bit integers in a right by imm while shifting in sign +// bits, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16 +FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) +{ + const int count = (imm & ~15) ? 15 : imm; + return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); +} + +// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while +// shifting in zeros. +// +// r0 := a0 << count +// r1 := a1 << count +// ... 
+// r7 := a7 << count +// +// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx +#define _mm_slli_epi16(a, imm) \ + __extension__({ \ + __m128i ret; \ + if ((imm) <= 0) { \ + ret = a; \ + } else if ((imm) > 15) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_s16( \ + vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \ + } \ + ret; \ + }) + +// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while +// shifting in zeros. : +// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx +// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm) +FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) +{ + if (imm <= 0) /* TODO: add constant range macro: [0, 255] */ + return a; + if (imm > 31) /* TODO: add unlikely macro */ + return _mm_setzero_si128(); + return vreinterpretq_m128i_s32( + vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); +} + +// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) +{ + if (imm <= 0) /* TODO: add constant range macro: [0, 255] */ + return a; + if (imm > 63) /* TODO: add unlikely macro */ + return _mm_setzero_si128(); + return vreinterpretq_m128i_s64( + vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); +} + +// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF imm8[7:0] > 15 +// dst[i+15:i] := 0 +// ELSE +// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16 +#define _mm_srli_epi16(a, imm) \ + __extension__({ \ + __m128i ret; \ + if ((imm) == 0) { \ + ret = a; \ + } else if (0 < (imm) && (imm) < 16) { \ + ret = vreinterpretq_m128i_u16( \ + vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \ + } else { \ + ret = _mm_setzero_si128(); \ + } \ + ret; \ + }) + +// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// IF imm8[7:0] > 31 +// dst[i+31:i] := 0 +// ELSE +// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32 +// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srli_epi32(a, imm) \ + __extension__({ \ + __m128i ret; \ + if ((imm) == 0) { \ + ret = a; \ + } else if (0 < (imm) && (imm) < 32) { \ + ret = vreinterpretq_m128i_u32( \ + vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \ + } else { \ + ret = _mm_setzero_si128(); \ + } \ + ret; \ + }) + +// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. 
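+// As with the other _mm_srli_* variants, NEON has no variable right-shift
+// instruction, so the shift is expressed as vshlq with a negated count.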
+// +// FOR j := 0 to 1 +// i := j*64 +// IF imm8[7:0] > 63 +// dst[i+63:i] := 0 +// ELSE +// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64 +#define _mm_srli_epi64(a, imm) \ + __extension__({ \ + __m128i ret; \ + if ((imm) == 0) { \ + ret = a; \ + } else if (0 < (imm) && (imm) < 64) { \ + ret = vreinterpretq_m128i_u64( \ + vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \ + } else { \ + ret = _mm_setzero_si128(); \ + } \ + ret; \ + }) + +// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, +// and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// IF imm8[7:0] > 31 +// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) +// ELSE +// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32 +// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srai_epi32(a, imm) \ + __extension__({ \ + __m128i ret; \ + if ((imm) == 0) { \ + ret = a; \ + } else if (0 < (imm) && (imm) < 32) { \ + ret = vreinterpretq_m128i_s32( \ + vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \ + } else { \ + ret = vreinterpretq_m128i_s32( \ + vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \ + } \ + ret; \ + }) + +// Shifts the 128 - bit value in a right by imm bytes while shifting in +// zeros.imm must be an immediate. +// +// r := srl(a, imm*8) +// +// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx +// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm) +#define _mm_srli_si128(a, imm) \ + __extension__({ \ + __m128i ret; \ + if ((imm) <= 0) { \ + ret = a; \ + } else if ((imm) > 15) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_s8( \ + vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \ + } \ + ret; \ + }) + +// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm +// must be an immediate. +// +// r := a << (imm * 8) +// +// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx +// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm) +#define _mm_slli_si128(a, imm) \ + __extension__({ \ + __m128i ret; \ + if ((imm) <= 0) { \ + ret = a; \ + } else if ((imm) > 15) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_s8(vextq_s8( \ + vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \ + } \ + ret; \ + }) + +// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while +// shifting in zeros. +// +// r0 := a0 << count +// r1 := a1 << count +// ... +// r7 := a7 << count +// +// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (c > 15) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16((int16_t) c); + return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); +} + +// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while +// shifting in zeros. 
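+// For the _mm_sll_* / _mm_srl_* family the shift count comes from the low
+// 64 bits of a vector operand rather than an immediate, and any count larger
+// than the element width yields zero, matching the SSE semantics.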
+// +// r0 := a0 << count +// r1 := a1 << count +// r2 := a2 << count +// r3 := a3 << count +// +// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (c > 31) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32((int32_t) c); + return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); +} + +// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while +// shifting in zeros. +// +// r0 := a0 << count +// r1 := a1 << count +// +// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (c > 63) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64((int64_t) c); + return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); +} + +// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits +// while shifting in zeros. +// +// r0 := srl(a0, count) +// r1 := srl(a1, count) +// ... +// r7 := srl(a7, count) +// +// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (c > 15) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16(-(int16_t) c); + return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); +} + +// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits +// while shifting in zeros. +// +// r0 := srl(a0, count) +// r1 := srl(a1, count) +// r2 := srl(a2, count) +// r3 := srl(a3, count) +// +// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (c > 31) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32(-(int32_t) c); + return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); +} + +// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits +// while shifting in zeros. +// +// r0 := srl(a0, count) +// r1 := srl(a1, count) +// +// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (c > 63) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64(-(int64_t) c); + return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); +} + +// NEON does not provide a version of this function. +// Creates a 16-bit mask from the most significant bits of the 16 signed or +// unsigned 8-bit integers in a and zero extends the upper bits. +// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx +FORCE_INLINE int _mm_movemask_epi8(__m128i a) +{ +#if defined(__aarch64__) + uint8x16_t input = vreinterpretq_u8_m128i(a); + const int8_t ALIGN_STRUCT(16) + xr[16] = {-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0}; + const uint8x16_t mask_and = vdupq_n_u8(0x80); + const int8x16_t mask_shift = vld1q_s8(xr); + const uint8x16_t mask_result = + vshlq_u8(vandq_u8(input, mask_and), mask_shift); + uint8x8_t lo = vget_low_u8(mask_result); + uint8x8_t hi = vget_high_u8(mask_result); + + return vaddv_u8(lo) + (vaddv_u8(hi) << 8); +#else + // Use increasingly wide shifts+adds to collect the sign bits + // together. 
+ // Since the widening shifts would be rather confusing to follow in little + // endian, everything will be illustrated in big endian order instead. This + // has a different result - the bits would actually be reversed on a big + // endian machine. + + // Starting input (only half the elements are shown): + // 89 ff 1d c0 00 10 99 33 + uint8x16_t input = vreinterpretq_u8_m128i(a); + + // Shift out everything but the sign bits with an unsigned shift right. + // + // Bytes of the vector:: + // 89 ff 1d c0 00 10 99 33 + // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) + // | | | | | | | | + // 01 01 00 01 00 00 01 00 + // + // Bits of first important lane(s): + // 10001001 (89) + // \______ + // | + // 00000001 (01) + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); + + // Merge the even lanes together with a 16-bit unsigned shift right + add. + // 'xx' represents garbage data which will be ignored in the final result. + // In the important bytes, the add functions like a binary OR. + // + // 01 01 00 01 00 00 01 00 + // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7)) + // \| \| \| \| + // xx 03 xx 01 xx 00 xx 02 + // + // 00000001 00000001 (01 01) + // \_______ | + // \| + // xxxxxxxx xxxxxx11 (xx 03) + uint32x4_t paired16 = + vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + + // Repeat with a wider 32-bit shift + add. + // xx 03 xx 01 xx 00 xx 02 + // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >> + // 14)) + // \| \| + // xx xx xx 0d xx xx xx 02 + // + // 00000011 00000001 (03 01) + // \\_____ || + // '----.\|| + // xxxxxxxx xxxx1101 (xx 0d) + uint64x2_t paired32 = + vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + + // Last, an even wider 64-bit shift + add to get our result in the low 8 bit + // lanes. xx xx xx 0d xx xx xx 02 + // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >> + // 28)) + // \| + // xx xx xx xx xx xx xx d2 + // + // 00001101 00000010 (0d 02) + // \ \___ | | + // '---. \| | + // xxxxxxxx 11010010 (xx d2) + uint8x16_t paired64 = + vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + + // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts. + // xx xx xx xx xx xx xx d2 + // || return paired64[0] + // d2 + // Note: Little endian would return the correct value 4b (01001011) instead. + return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); +#endif +} + +// Copy the lower 64-bit integer in a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64 +FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) +{ + return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a))); +} + +// Copy the 64-bit integer a to the lower element of dst, and zero the upper +// element. +// +// dst[63:0] := a[63:0] +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64 +FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0))); +} + +// NEON does not provide this method +// Creates a 4-bit mask from the most significant bits of the four +// single-precision, floating-point values. 
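+// For example, a = {-1.0f, 2.0f, -3.0f, 4.0f} gives mask 0b0101 = 5, bit i
+// holding the sign bit of lane i.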
+// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx +FORCE_INLINE int _mm_movemask_ps(__m128 a) +{ + uint32x4_t input = vreinterpretq_u32_m128(a); +#if defined(__aarch64__) + static const int32x4_t shift = {0, 1, 2, 3}; + uint32x4_t tmp = vshrq_n_u32(input, 31); + return vaddvq_u32(vshlq_u32(tmp, shift)); +#else + // Uses the exact same method as _mm_movemask_epi8, see that for details. + // Shift out everything but the sign bits with a 32-bit unsigned shift + // right. + uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31)); + // Merge the two pairs together with a 64-bit unsigned shift right + add. + uint8x16_t paired = + vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); + // Extract the result. + return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); +#endif +} + +// Compute the bitwise NOT of a and then AND with a 128-bit vector containing +// all 1's, and return 1 if the result is zero, otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones +FORCE_INLINE int _mm_test_all_ones(__m128i a) +{ + return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == + ~(uint64_t) 0; +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and return 1 if the result is zero, otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros +FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) +{ + int64x2_t a_and_mask = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); + return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0 + : 1; +} + +/* Math operations */ + +// Subtracts the four single-precision, floating-point values of a and b. +// +// r0 := a0 - b0 +// r1 := a1 - b1 +// r2 := a2 - b2 +// r3 := a3 - b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Subtract the lower single-precision (32-bit) floating-point element in b from +// the lower single-precision (32-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper 3 packed elements from +// a to the upper elements of dst. +// +// dst[31:0] := a[31:0] - b[31:0] +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss +FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_sub_ps(a, b)); +} + +// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a, +// and store the results in dst. +// r0 := a0 - b0 +// r1 := a1 - b1 +FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or +// unsigned 32-bit integers of a. 
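+// This is the wrap-around, non-saturating subtraction; the saturating forms
+// are the _mm_subs_* intrinsics further below.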
+// +// r0 := a0 - b0 +// r1 := a1 - b1 +// r2 := a2 - b2 +// r3 := a3 - b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx +FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. +// +// dst[63:0] := a[63:0] - b[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64 +FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} + +// Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit +// integers of a and saturates.. +// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx +FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit +// integers of a and saturates. +// +// r0 := UnsignedSaturate(a0 - b0) +// r1 := UnsignedSaturate(a1 - b1) +// ... +// r15 := UnsignedSaturate(a15 - b15) +// +// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90) +FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers +// of a and saturates. +// +// r0 := SignedSaturate(a0 - b0) +// r1 := SignedSaturate(a1 - b1) +// ... +// r15 := SignedSaturate(a15 - b15) +// +// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90) +FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers +// of a and saturates. +// +// r0 := SignedSaturate(a0 - b0) +// r1 := SignedSaturate(a1 - b1) +// ... +// r7 := SignedSaturate(a7 - b7) +// +// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90) +FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Negate packed 8-bit integers in a when the corresponding signed +// 8-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// +// for i in 0..15 +// if b[i] < 0 +// r[i] := -a[i] +// else if b[i] == 0 +// r[i] := 0 +// else +// r[i] := a[i] +// fi +// done +FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) +{ + int8x16_t a = vreinterpretq_s8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 
0xFF : 0 + uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7)); + + // (b == 0) ? 0xFF : 0 +#if defined(__aarch64__) + int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b)); +#else + int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0))); +#endif + + // bitwise select either a or nagative 'a' (vnegq_s8(a) return nagative 'a') + // based on ltMask + int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a); + // res = masked & (~zeroMask) + int8x16_t res = vbicq_s8(masked, zeroMask); + + return vreinterpretq_m128i_s8(res); +} + +// Negate packed 16-bit integers in a when the corresponding signed +// 16-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// +// for i in 0..7 +// if b[i] < 0 +// r[i] := -a[i] +// else if b[i] == 0 +// r[i] := 0 +// else +// r[i] := a[i] +// fi +// done +FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFF : 0 + uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15)); + // (b == 0) ? 0xFFFF : 0 +#if defined(__aarch64__) + int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b)); +#else + int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0))); +#endif + + // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative + // 'a') based on ltMask + int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a); + // res = masked & (~zeroMask) + int16x8_t res = vbicq_s16(masked, zeroMask); + return vreinterpretq_m128i_s16(res); +} + +// Negate packed 32-bit integers in a when the corresponding signed +// 32-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// +// for i in 0..3 +// if b[i] < 0 +// r[i] := -a[i] +// else if b[i] == 0 +// r[i] := 0 +// else +// r[i] := a[i] +// fi +// done +FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFFFFFF : 0 + uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31)); + + // (b == 0) ? 0xFFFFFFFF : 0 +#if defined(__aarch64__) + int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b)); +#else + int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0))); +#endif + + // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative + // 'a') based on ltMask + int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a); + // res = masked & (~zeroMask) + int32x4_t res = vbicq_s32(masked, zeroMask); + return vreinterpretq_m128i_s32(res); +} + +// Negate packed 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. +// +// FOR j := 0 to 3 +// i := j*16 +// IF b[i+15:i] < 0 +// dst[i+15:i] := -(a[i+15:i]) +// ELSE IF b[i+15:i] == 0 +// dst[i+15:i] := 0 +// ELSE +// dst[i+15:i] := a[i+15:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16 +FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 
0xFFFF : 0 + uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15)); + + // (b == 0) ? 0xFFFF : 0 +#if defined(__aarch64__) + int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b)); +#else + int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0))); +#endif + + // bitwise select either a or nagative 'a' (vneg_s16(a) return nagative 'a') + // based on ltMask + int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a); + // res = masked & (~zeroMask) + int16x4_t res = vbic_s16(masked, zeroMask); + + return vreinterpret_m64_s16(res); +} + +// Negate packed 32-bit integers in a when the corresponding signed 32-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. +// +// FOR j := 0 to 1 +// i := j*32 +// IF b[i+31:i] < 0 +// dst[i+31:i] := -(a[i+31:i]) +// ELSE IF b[i+31:i] == 0 +// dst[i+31:i] := 0 +// ELSE +// dst[i+31:i] := a[i+31:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32 +FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) +{ + int32x2_t a = vreinterpret_s32_m64(_a); + int32x2_t b = vreinterpret_s32_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFFFFFF : 0 + uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31)); + + // (b == 0) ? 0xFFFFFFFF : 0 +#if defined(__aarch64__) + int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b)); +#else + int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0))); +#endif + + // bitwise select either a or nagative 'a' (vneg_s32(a) return nagative 'a') + // based on ltMask + int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a); + // res = masked & (~zeroMask) + int32x2_t res = vbic_s32(masked, zeroMask); + + return vreinterpret_m64_s32(res); +} + +// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer +// in b is negative, and store the results in dst. Element in dst are zeroed out +// when the corresponding element in b is zero. +// +// FOR j := 0 to 7 +// i := j*8 +// IF b[i+7:i] < 0 +// dst[i+7:i] := -(a[i+7:i]) +// ELSE IF b[i+7:i] == 0 +// dst[i+7:i] := 0 +// ELSE +// dst[i+7:i] := a[i+7:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8 +FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) +{ + int8x8_t a = vreinterpret_s8_m64(_a); + int8x8_t b = vreinterpret_s8_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFF : 0 + uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7)); + + // (b == 0) ? 0xFF : 0 +#if defined(__aarch64__) + int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b)); +#else + int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0))); +#endif + + // bitwise select either a or nagative 'a' (vneg_s8(a) return nagative 'a') + // based on ltMask + int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a); + // res = masked & (~zeroMask) + int8x8_t res = vbic_s8(masked, zeroMask); + + return vreinterpret_m64_s8(res); +} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16 +FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16( + vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. 
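+// Rounding average: vrhadd computes (a + b + 1) >> 1 per lane, which matches
+// the SSE pavg semantics, e.g. avg(1, 2) = 2.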
+//
+// FOR j := 0 to 7
+//   i := j*8
+//   dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
+FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+//
+// FOR j := 0 to 7
+//   i := j*8
+//   dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
+#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+//
+// FOR j := 0 to 3
+//   i := j*16
+//   dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
+#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
+
+// Computes the average of the 16 unsigned 8-bit integers in a and the 16
+// unsigned 8-bit integers in b and rounds.
+//
+//   r0 := (a0 + b0) / 2
+//   r1 := (a1 + b1) / 2
+//   ...
+//   r15 := (a15 + b15) / 2
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Computes the average of the 8 unsigned 16-bit integers in a and the 8
+// unsigned 16-bit integers in b and rounds.
+//
+//   r0 := (a0 + b0) / 2
+//   r1 := (a1 + b1) / 2
+//   ...
+//   r7 := (a7 + b7) / 2
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
+{
+    return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
+                                 vreinterpretq_u16_m128i(b));
+}
+
+// Adds the four single-precision, floating-point values of a and b.
+//
+//   r0 := a0 + b0
+//   r1 := a1 + b1
+//   r2 := a2 + b2
+//   r3 := a3 + b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Add packed double-precision (64-bit) floating-point elements in a and b, and
+// store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
+FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] + db[0];
+    c[1] = da[1] + db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add 64-bit integers a and b, and store the result in dst.
+//
+//   dst[63:0] := a[63:0] + b[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
+FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
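+/* Illustrative usage sketch (not part of upstream sse2neon): the wrappers in
+ * this section are drop-in replacements for the x86 intrinsics, so existing
+ * SSE call sites compile unchanged on NEON. The compiled-out, hypothetical
+ * helper below shows a typical caller; it assumes the _mm_loadu_ps/_mm_storeu_ps
+ * and _mm_loadu_si128/_mm_storeu_si128 wrappers defined elsewhere in this
+ * header. */
+#if 0
+static void example_add_and_average(const float *x, const float *y, float *sum,
+                                    const uint8_t *p, const uint8_t *q,
+                                    uint8_t *avg)
+{
+    /* 4 float lanes: sum[i] = x[i] + y[i]; maps to vaddq_f32 on NEON. */
+    __m128 vx = _mm_loadu_ps(x);
+    __m128 vy = _mm_loadu_ps(y);
+    _mm_storeu_ps(sum, _mm_add_ps(vx, vy));
+
+    /* 16 byte lanes: avg[i] = (p[i] + q[i] + 1) >> 1; maps to vrhaddq_u8. */
+    __m128i vp = _mm_loadu_si128((const __m128i *) p);
+    __m128i vq = _mm_loadu_si128((const __m128i *) q);
+    _mm_storeu_si128((__m128i *) avg, _mm_avg_epu8(vp, vq));
+}
+#endif
+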
+// adds the scalar single-precision floating point values of a and b.
+// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
+{
+    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
+    // the upper values in the result must be the remnants of <a>.
+    return vreinterpretq_m128_f32(vaddq_f32(a, value));
+}
+
+// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
+// unsigned 64-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s64(
+        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
+// unsigned 32-bit integers in b.
+//
+//   r0 := a0 + b0
+//   r1 := a1 + b1
+//   r2 := a2 + b2
+//   r3 := a3 + b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
+// unsigned 16-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
+// unsigned 8-bit integers in b.
+// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
+FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
+// and saturates.
+//
+//   r0 := SignedSaturate(a0 + b0)
+//   r1 := SignedSaturate(a1 + b1)
+//   ...
+//   r7 := SignedSaturate(a7 + b7)
+//
+// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Add packed signed 8-bit integers in a and b using saturation, and store the
+// results in dst.
+//
+// FOR j := 0 to 15
+//   i := j*8
+//   dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
+FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
+// b and saturates.
+// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
+// unsigned 16-bit integers from b.
+//
+//   r0 := (a0 * b0)[15:0]
+//   r1 := (a1 * b1)[15:0]
+//   ...
+//   r7 := (a7 * b7)[15:0]
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
+// unsigned 32-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// tmp[31:0] := a[i+15:i] * b[i+15:i] +// dst[i+15:i] := tmp[31:16] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw +#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) + +// Multiplies the four single-precision, floating-point values of a and b. +// +// r0 := a0 * b0 +// r1 := a1 * b1 +// r2 := a2 * b2 +// r3 := a3 * b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx +FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Multiply the lower single-precision (32-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// +// dst[31:0] := a[31:0] * b[31:0] +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss +FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_mul_ps(a, b)); +} + +// Multiply the low unsigned 32-bit integers from each packed 64-bit element in +// a and b, and store the unsigned 64-bit results in dst. +// +// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF) +// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF) +FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) +{ + // vmull_u32 upcasts instead of masking, so we downcast. + uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); + uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); + return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); +} + +// Multiply the low unsigned 32-bit integers from a and b, and store the +// unsigned 64-bit result in dst. +// +// dst[63:0] := a[31:0] * b[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32 +FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) +{ + return vreinterpret_m64_u64(vget_low_u64( + vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); +} + +// Multiply the low signed 32-bit integers from each packed 64-bit element in +// a and b, and store the signed 64-bit results in dst. +// +// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0 +// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2 +FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) +{ + // vmull_s32 upcasts instead of masking, so we downcast. + int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a)); + int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); +} + +// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit +// integers from b. 
+// +// r0 := (a0 * b0) + (a1 * b1) +// r1 := (a2 * b2) + (a3 * b3) +// r2 := (a4 * b4) + (a5 * b5) +// r3 := (a6 * b6) + (a7 * b7) +// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx +FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) +{ + int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), + vget_low_s16(vreinterpretq_s16_m128i(b))); + int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), + vget_high_s16(vreinterpretq_s16_m128i(b))); + + int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low)); + int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high)); + + return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Shift right by 15 bits while rounding up, and store +// the packed 16-bit integers in dst. +// +// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15) +// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15) +// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15) +// ... +// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15) +FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) +{ + // Has issues due to saturation + // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b)); + + // Multiply + int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), + vget_low_s16(vreinterpretq_s16_m128i(b))); + int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), + vget_high_s16(vreinterpretq_s16_m128i(b))); + + // Rounding narrowing shift right + // narrow = (int16_t)((mul + 16384) >> 15); + int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); + int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); + + // Join together + return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi)); +} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, +// and pack the saturated results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + +// a[i+7:i]*b[i+7:i] ) +// ENDFOR +FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) + uint8x16_t a = vreinterpretq_u8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), + vmovl_s8(vget_low_s8(b))); + int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), + vmovl_s8(vget_high_s8(b))); + return vreinterpretq_m128i_s16( + vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th))); +#else + // This would be much simpler if x86 would choose to zero extend OR sign + // extend, not both. This could probably be optimized better. + uint16x8_t a = vreinterpretq_u16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + + // Zero extend a + int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8)); + int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00))); + + // Sign extend by shifting left then shifting right. + int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8); + int16x8_t b_odd = vshrq_n_s16(b, 8); + + // multiply + int16x8_t prod1 = vmulq_s16(a_even, b_even); + int16x8_t prod2 = vmulq_s16(a_odd, b_odd); + + // saturated add + return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2)); +#endif +} + +// Computes the fused multiple add product of 32-bit floating point numbers. 
+// +// Return Value +// Multiplies A and B, and adds C to the temporary result before returning it. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd +FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c), + vreinterpretq_f32_m128(b), + vreinterpretq_f32_m128(a))); +#else + return _mm_add_ps(_mm_mul_ps(a, b), c); +#endif +} + +// Alternatively add and subtract packed single-precision (32-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps +FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) +{ + __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f}; + return _mm_fmadd_ps(b, mask, a); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce two +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of 64-bit elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8 +FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) +{ + uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b)); + uint16_t r0 = t[0] + t[1] + t[2] + t[3]; + uint16_t r4 = t[4] + t[5] + t[6] + t[7]; + uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0); + return (__m128i) vsetq_lane_u16(r4, r, 4); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8 +FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) +{ + uint16x4_t t = + vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); + uint16_t r0 = t[0] + t[1] + t[2] + t[3]; + return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0)); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// +// FOR j := 0 to 7 +// i := j*8 +// tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) +// ENDFOR +// dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + +// tmp[47:40] + tmp[55:48] + tmp[63:56] dst[63:16] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw +#define _m_psadbw(a, b) _mm_sad_pu8(a, b) + +// Divides the four single-precision, floating-point values of a and b. +// +// r0 := a0 / b0 +// r1 := a1 / b1 +// r2 := a2 / b2 +// r3 := a3 / b3 +// +// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx +FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b)); + float32x4_t recip1 = + vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1)); +#endif +} + +// Divides the scalar single-precision floating point value of a by b. 
+// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx +FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) +{ + float32_t value = + vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Computes the approximations of reciprocals of the four single-precision, +// floating-point values of a. +// https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx +FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) +{ + float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in)); + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); + return vreinterpretq_m128_f32(recip); +} + +// Compute the approximate reciprocal of the lower single-precision (32-bit) +// floating-point element in a, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. +// +// dst[31:0] := (1.0 / a[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss +FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) +{ + return _mm_move_ss(a, _mm_rcp_ps(a)); +} + +// Computes the approximations of square roots of the four single-precision, +// floating-point values of a. First computes reciprocal square roots and then +// reciprocals of the four values. +// +// r0 := sqrt(a0) +// r1 := sqrt(a1) +// r2 := sqrt(a2) +// r3 := sqrt(a3) +// +// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); +#else + float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + float32x4_t sq = vrecpeq_f32(recipsq); + // ??? use step versions of both sqrt and recip for better accuracy? + return vreinterpretq_m128_f32(sq); +#endif +} + +// Computes the approximation of the square root of the scalar single-precision +// floating point value of in. +// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) +{ + float32_t value = + vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0)); +} + +// Computes the approximations of the reciprocal square roots of the four +// single-precision floating point values of in. +// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx +FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) +{ + return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in))); +} + +// Compute the approximate reciprocal square root of the lower single-precision +// (32-bit) floating-point element in a, store the result in the lower element +// of dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss +FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) +{ + return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0); +} + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. 
+// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 +FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 +#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) + +// Computes the maximums of the four single-precision, floating-point values of +// a and b. +// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vbslq_f32(vcltq_f32(_b, _a), _a, _b); +#else + return vreinterpretq_m128_f32( + vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 +FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 +#define _m_pmaxub(a, b) _mm_max_pu8(a, b) + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 +FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Computes the minima of the four single-precision, floating-point values of a +// and b. +// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vbslq_f32(vcltq_f32(_a, _b), _a, _b); +#else + return vreinterpretq_m128_f32( + vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. 
+// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 +FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Computes the maximum of the two lower scalar single-precision floating point +// values of a and b. +// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Computes the minimum of the two lower scalar single-precision floating point +// values of a and b. +// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the +// 16 unsigned 8-bit integers from b. +// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the +// 16 unsigned 8-bit integers from b. +// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx +FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 +// signed 16-bit integers from b. +// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx +FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8 +FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8 +// signed 16-bit integers from b. +// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// epi versions of min/max +// Computes the pariwise maximums of the four signed 32-bit integer values of a +// and b. +// +// A 128-bit parameter that can be defined with the following equations: +// r0 := (a0 > b0) ? a0 : b0 +// r1 := (a1 > b1) ? a1 : b1 +// r2 := (a2 > b2) ? a2 : b2 +// r3 := (a3 > b3) ? 
a3 : b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Computes the pairwise minima of the four signed 32-bit integer values of a
+// and b.
+//
+// A 128-bit parameter that can be defined with the following equations:
+//   r0 := (a0 < b0) ? a0 : b0
+//   r1 := (a1 < b1) ? a1 : b1
+//   r2 := (a2 < b2) ? a2 : b2
+//   r3 := (a3 < b3) ? a3 : b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32
+FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
+FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(vshrn_n_u32(
+        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
+}
+
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
+// integers from b.
+//
+//   r0 := (a0 * b0)[31:16]
+//   r1 := (a1 * b1)[31:16]
+//   ...
+//   r7 := (a7 * b7)[31:16]
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
+{
+    /* FIXME: issue with large values because of result saturation */
+    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
+    // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
+    // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
+    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
+    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
+    uint16x8x2_t r =
+        vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
+}
+
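+/* Illustrative usage sketch (not part of upstream sse2neon): the packed
+ * min/max wrappers above combine into a branch-free clamp exactly as they do
+ * on x86. The compiled-out helper below is hypothetical and assumes the
+ * _mm_set1_epi32 wrapper defined elsewhere in this header. */
+#if 0
+/* Clamp each signed 32-bit lane of v into the range [lo, hi]. */
+static __m128i example_clamp_epi32(__m128i v, int lo, int hi)
+{
+    __m128i vlo = _mm_set1_epi32(lo); /* broadcast lower bound to all lanes */
+    __m128i vhi = _mm_set1_epi32(hi); /* broadcast upper bound to all lanes */
+    return _mm_min_epi32(_mm_max_epi32(v, vlo), vhi);
+}
+#endif
+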
+// Computes pairwise add of each argument as single-precision, floating-point
+// values a and b.
+// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
+FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
+#endif
+}
+
+// Computes pairwise add of each argument as 16-bit signed or unsigned integer
+// values a and b.
+FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
+#else
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
+                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of single-precision (32-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
+FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsubq_f32(
+        vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
+        vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
+#else
+    float32x4x2_t c =
+        vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
+    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
+// signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
+FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
+// signed 32-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
+FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s32(
+        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
+}
+
+// Computes pairwise difference of each argument as 16-bit signed or unsigned
+// integer values a and b.
+FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Subtract
+    return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
+}
+
+// Computes saturated pairwise add of each argument as 16-bit signed
+// integer values a and b.
+FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_s64_s16( + vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|a4|a6|b0|b2|b4|b6] + // [a1|a3|a5|a7|b1|b3|b5|b7] + int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); + int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); + // Saturated add + return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357)); +#endif +} + +// Computes saturated pairwise difference of each argument as a 16-bit signed +// integer values a and b. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16 +FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_s64_s16( + vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|a4|a6|b0|b2|b4|b6] + // [a1|a3|a5|a7|b1|b3|b5|b7] + int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); + int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); + // Saturated subtract + return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357)); +#endif +} + +// Computes pairwise add of each argument as a 32-bit signed or unsigned integer +// values a and b. +FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + return vreinterpretq_m128i_s32( + vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)), + vpadd_s32(vget_low_s32(b), vget_high_s32(b)))); +} + +// Computes pairwise difference of each argument as a 32-bit signed or unsigned +// integer values a and b. +FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) +{ + int64x2_t a = vreinterpretq_s64_m128i(_a); + int64x2_t b = vreinterpretq_s64_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|b0|b2] + // [a1|a2|b1|b3] + int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b)); + int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32)); + // Subtract + return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13)); +} + +// Kahan summation for accurate summation of floating-point numbers. +// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html +FORCE_INLINE void sse2neon_kadd_f32(float *sum, float *c, float y) +{ + y -= *c; + float t = *sum + y; + *c = (t - *sum) - y; + *sum = t; +} + +// Conditionally multiply the packed single-precision (32-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, +// and conditionally store the sum in dst using the low 4 bits of imm. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps +FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) +{ +#if defined(__aarch64__) + /* shortcuts */ + if (imm == 0xFF) { + return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b))); + } + if (imm == 0x7F) { + float32x4_t m = _mm_mul_ps(a, b); + m[3] = 0; + return _mm_set1_ps(vaddvq_f32(m)); + } +#endif + + float s = 0, c = 0; + float32x4_t f32a = vreinterpretq_f32_m128(a); + float32x4_t f32b = vreinterpretq_f32_m128(b); + + /* To improve the accuracy of floating-point summation, Kahan algorithm + * is used for each operation. + */ + if (imm & (1 << 4)) + sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]); + if (imm & (1 << 5)) + sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]); + if (imm & (1 << 6)) + sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]); + if (imm & (1 << 7)) + sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]); + s += c; + + float32x4_t res = { + (imm & 0x1) ? s : 0, + (imm & 0x2) ? s : 0, + (imm & 0x4) ? s : 0, + (imm & 0x8) ? s : 0, + }; + return vreinterpretq_m128_f32(res); +} + +/* Compare operations */ + +// Compares for less than +// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for less than +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100) +FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmplt_ps(a, b)); +} + +// Compares for greater than. +// +// r0 := (a0 > b0) ? 0xffffffff : 0x0 +// r1 := (a1 > b1) ? 0xffffffff : 0x0 +// r2 := (a2 > b2) ? 0xffffffff : 0x0 +// r3 := (a3 > b3) ? 0xffffffff : 0x0 +// +// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100) +FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); +} + +// Compares for greater than or equal. +// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100) +FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpge_ps(a, b)); +} + +// Compares for less than or equal. +// +// r0 := (a0 <= b0) ? 0xffffffff : 0x0 +// r1 := (a1 <= b1) ? 0xffffffff : 0x0 +// r2 := (a2 <= b2) ? 0xffffffff : 0x0 +// r3 := (a3 <= b3) ? 0xffffffff : 0x0 +// +// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100) +FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmple_ps(a, b)); +} + +// Compares for equality. 
+// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for equality. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100) +FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); +} + +// Compares for inequality. +// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compares for inequality. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100) +FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); +} + +// Compares for not greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) +{ + return _mm_cmplt_ps(a, b); +} + +// Compares for not greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) +{ + return _mm_cmplt_ss(a, b); +} + +// Compares for not greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100) +FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) +{ + return _mm_cmple_ps(a, b); +} + +// Compares for not greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) +{ + return _mm_cmple_ss(a, b); +} + +// Compares for not less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) +{ + return _mm_cmpgt_ps(a, b); +} + +// Compares for not less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) +{ + return _mm_cmpgt_ss(a, b); +} + +// Compares for not less than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) +{ + return _mm_cmpge_ps(a, b); +} + +// Compares for not less than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) +{ + return _mm_cmpge_ss(a, b); +} + +// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or +// unsigned 8-bit integers in b for equality. +// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx +FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or +// unsigned 16-bit integers in b for equality. 
+// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed 32-bit integers in a and b for equality, and store the results +// in dst +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed 64-bit integers in a and b for equality, and store the results +// in dst +FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_u64( + vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); +#else + // ARMv7 lacks vceqq_u64 + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped)); +#endif +} + +// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers +// in b for lesser than. +// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx +FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers +// in b for greater than. +// +// r0 := (a0 > b0) ? 0xff : 0x0 +// r1 := (a1 > b1) ? 0xff : 0x0 +// ... +// r15 := (a15 > b15) ? 0xff : 0x0 +// +// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers +// in b for less than. +// +// r0 := (a0 < b0) ? 0xffff : 0x0 +// r1 := (a1 < b1) ? 0xffff : 0x0 +// ... +// r7 := (a7 < b7) ? 0xffff : 0x0 +// +// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers +// in b for greater than. +// +// r0 := (a0 > b0) ? 0xffff : 0x0 +// r1 := (a1 > b1) ? 0xffff : 0x0 +// ... +// r7 := (a7 > b7) ? 0xffff : 0x0 +// +// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers +// in b for less than. +// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers +// in b for greater than. 
+// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers +// in b for greater than. +FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_u64( + vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + // ARMv7 lacks vcgtq_s64. + // This is based off of Clang's SSE2 polyfill: + // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi)) + + // Mask the sign bit out since we need a signed AND an unsigned comparison + // and it is ugly to try and split them. + int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull)); + int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask); + int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask); + // Check if a > b + int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask)); + // Copy upper mask to lower mask + // a_hi > b_hi + int64x2_t gt_hi = vshrq_n_s64(greater, 63); + // Copy lower mask to upper mask + // a_lo > b_lo + int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32); + // Compare for equality + int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask)); + // Copy upper mask to lower mask + // a_hi == b_hi + int64x2_t eq_hi = vshrq_n_s64(equal, 63); + // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi) + int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi)); + return vreinterpretq_m128i_s64(ret); +#endif +} + +// Compares the four 32-bit floats in a and b to check if any values are NaN. +// Ordered compare between each value returns true for "orderable" and false for +// "not orderable" (NaN). +// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see +// also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) +{ + // Note: NEON does not have ordered compare builtin + // Need to compare a eq a and b eq b to check for NaN + // Do AND of results to get final + uint32x4_t ceqaa = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t ceqbb = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); +} + +// Compares for ordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100) +FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpord_ps(a, b)); +} + +// Compares for unordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100) +FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) +{ + uint32x4_t f32a = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t f32b = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); +} + +// Compares for unordered. 
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100) +FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a less than operation. : +// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important +// note!! The documentation on MSDN is incorrect! If either of the values is a +// NAN the docs say you will get a one, but in fact, it will return a zero!! +FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) +{ + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_lt_b = + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a greater than operation. : +// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx +FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) +{ + // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_gt_b = + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a less than or equal operation. : +// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx +FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) +{ + // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_le_b = + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a greater than or equal operation. : +// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx +FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) +{ + // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_ge_b = + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using an equality operation. 
: +// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx +FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) +{ + // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_eq_b = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using an inequality operation. : +// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx +FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) +{ + // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); + uint32x4_t a_neq_b = vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); + return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0; +} + +// according to the documentation, these intrinsics behave the same as the +// non-'u' versions. We'll just alias them here. +#define _mm_ucomilt_ss _mm_comilt_ss +#define _mm_ucomile_ss _mm_comile_ss +#define _mm_ucomigt_ss _mm_comigt_ss +#define _mm_ucomige_ss _mm_comige_ss +#define _mm_ucomieq_ss _mm_comieq_ss +#define _mm_ucomineq_ss _mm_comineq_ss + +/* Conversions */ + +// Convert packed signed 32-bit integers in b to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, and copy the upper 2 packed elements from a to the upper elements of +// dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) +// dst[95:64] := a[95:64] +// dst[127:96] := a[127:96] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps +FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss +FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) +{ + __m128 ret = a; + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(ret), 0)); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si +FORCE_INLINE int _mm_cvt_ss2si(__m128 a) +{ +#if defined(__aarch64__) + return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0); +#else + float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t diff = data - floor(data); + if (diff > 0.5) + return (int32_t) ceil(data); + if (diff == 0.5) { + int32_t f = (int32_t) floor(data); + int32_t c = (int32_t) ceil(data); + return c & 1 ? f : c; + } + return (int32_t) floor(data); +#endif +} + +// Convert packed 16-bit integers in a to packed single-precision (32-bit) +// floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// m := j*32 +// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps +FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); +} + +// Convert packed 32-bit integers in b to packed single-precision (32-bit) +// floating-point elements, store the results in the lower 2 elements of dst, +// and copy the upper 2 packed elements from a to the upper elements of dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) +// dst[95:64] := a[95:64] +// dst[127:96] := a[127:96] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps +FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, then covert the packed signed 32-bit integers in b to +// single-precision (32-bit) floating-point element, and store the results in +// the upper 2 elements of dst. +// +// dst[31:0] := Convert_Int32_To_FP32(a[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(a[63:32]) +// dst[95:64] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:96] := Convert_Int32_To_FP32(b[63:32]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps +FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); +} + +// Convert the lower packed 8-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*8 +// m := j*32 +// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps +FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); +} + +// Convert packed unsigned 16-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. 
+// +// FOR j := 0 to 3 +// i := j*16 +// m := j*32 +// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps +FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); +} + +// Convert the lower packed unsigned 8-bit integers in a to packed +// single-precision (32-bit) floating-point elements, and store the results in +// dst. +// +// FOR j := 0 to 3 +// i := j*8 +// m := j*32 +// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps +FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_u32( + vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); +} + +// Converts the four single-precision, floating-point values of a to signed +// 32-bit integer values using truncate. +// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) +{ + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); +} + +// Converts the four signed 32-bit integer values of a to single-precision, +// floating-point values +// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); +} + +// Converts the four unsigned 8-bit integers in the lower 16 bits to four +// unsigned 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + return vreinterpretq_m128i_u16(u16x8); +} + +// Converts the four unsigned 8-bit integers in the lower 32 bits to four +// unsigned 32-bit integers. +// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx +FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_u32(u32x4); +} + +// Converts the two unsigned 8-bit integers in the lower 16 bits to two +// unsigned 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Converts the four unsigned 8-bit integers in the lower 16 bits to four +// unsigned 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + return vreinterpretq_m128i_s16(s16x8); +} + +// Converts the four unsigned 8-bit integers in the lower 32 bits to four +// unsigned 32-bit integers. 
+FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_s32(s32x4); +} + +// Converts the two signed 8-bit integers in the lower 32 bits to four +// signed 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Converts the four signed 16-bit integers in the lower 64 bits to four signed +// 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32( + vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)))); +} + +// Converts the two signed 16-bit integers in the lower 32 bits two signed +// 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) +{ + int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Converts the four unsigned 16-bit integers in the lower 64 bits to four +// unsigned 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) +{ + return vreinterpretq_m128i_u32( + vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); +} + +// Converts the two unsigned 16-bit integers in the lower 32 bits to two +// unsigned 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) +{ + uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Converts the two unsigned 32-bit integers in the lower 64 bits to two +// unsigned 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) +{ + return vreinterpretq_m128i_u64( + vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); +} + +// Converts the two signed 32-bit integers in the lower 64 bits to two signed +// 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))); +} + +// Converts the four single-precision, floating-point values of a to signed +// 32-bit integer values. +// +// r0 := (int) a0 +// r1 := (int) a1 +// r2 := (int) a2 +// r3 := (int) a3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx +// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A +// does not support! It is supported on ARMv8-A however. 
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); +#else + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( + vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = + vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( + vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32( + vreinterpretq_f32_m128(a), + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal)); +#endif +} + +// Copy the lower 32-bit integer in a to dst. +// +// dst[31:0] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32 +FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) +{ + return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64 +FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) +{ + return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, +// zero extending the upper bits. +// +// r0 := a +// r1 := 0x0 +// r2 := 0x0 +// r3 := 0x0 +// +// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) +{ + return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); +} + +// Moves 64-bit integer a to the least significant 64 bits of an __m128 object, +// zero extending the upper bits. +// +// r0 := a +// r1 := 0x0 +FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) +{ + return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); +} + +// Cast vector of type __m128 to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd +FORCE_INLINE __m128d _mm_castps_pd(__m128 a) +{ + return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); +} + +// Applies a type cast to reinterpret four 32-bit floating point values passed +// in as a 128-bit parameter as packed 32-bit integers. +// https://msdn.microsoft.com/en-us/library/bb514099.aspx +FORCE_INLINE __m128i _mm_castps_si128(__m128 a) +{ + return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); +} + +// Applies a type cast to reinterpret four 32-bit integers passed in as a +// 128-bit parameter as packed 32-bit floating point values. +// https://msdn.microsoft.com/en-us/library/bb514029.aspx +FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) +{ + return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a)); +} + +// Loads 128-bit value. 
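+
+// Editor's illustration, not part of the upstream sse2neon header (helper name
+// invented): _mm_cvtps_epi32 above rounds every lane with ties-to-even, so
+// {0.5, 1.5, 2.5, 3.5} becomes {0, 2, 2, 4} and the lane sum is 8 on both the
+// AArch64 vcvtnq path and the ARMv7-A fallback.
+static inline int sse2neon_example_cvtps_epi32_sum(void)
+{
+    __m128 v = _mm_set_ps(3.5f, 2.5f, 1.5f, 0.5f); /* lane 0 holds 0.5f */
+    int32x4_t r = vreinterpretq_s32_m128i(_mm_cvtps_epi32(v));
+    return vgetq_lane_s32(r, 0) + vgetq_lane_s32(r, 1) +
+           vgetq_lane_s32(r, 2) + vgetq_lane_s32(r, 3); /* expected: 8 */
+}
+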
: +// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx +FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd +FORCE_INLINE __m128d _mm_load1_pd(const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); +#else + return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p)); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// upper element of dst, and copy the lower element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// +// dst[63:0] := a[63:0] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd +FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); +#else + return vreinterpretq_m128d_f32(vcombine_f32( + vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p))); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1 +#define _mm_load_pd1 _mm_load1_pd + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd +#define _mm_loaddup_pd _mm_load1_pd + +// Loads 128-bit value. : +// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx +FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); +} + +// Load unaligned 32-bit integer from memory into the first element of dst. +// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[MAX:32] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32 +FORCE_INLINE __m128i _mm_loadu_si32(const void *p) +{ + return vreinterpretq_m128i_s32( + vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0)); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed single-precision (32-bit) floating-point elements, and store the +// results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// k := 64*j +// dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k]) +// ENDFOR +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps +FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) +{ +#if defined(__aarch64__) + float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); + return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); +#else + float a0 = (float) ((double *) &a)[0]; + float a1 = (float) ((double *) &a)[1]; + return _mm_set_ps(0, 0, a1, a0); +#endif +} + +// Copy the lower double-precision (64-bit) floating-point element of a to dst. 
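+
+// Editor's illustration, not part of the upstream sse2neon header (helper name
+// invented): _mm_loadu_si32 above loads one 32-bit value from a possibly
+// unaligned address into lane 0 and zeroes the upper lanes, so reading lane 0
+// back with _mm_cvtsi128_si32 returns the stored value.
+static inline int sse2neon_example_loadu_si32(void)
+{
+    int32_t buf[2] = { 42, -1 };
+    __m128i v = _mm_loadu_si32((const void *) &buf[0]);
+    return _mm_cvtsi128_si32(v); /* expected: 42 */
+}
+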
+// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64 +FORCE_INLINE double _mm_cvtsd_f64(__m128d a) +{ +#if defined(__aarch64__) + return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); +#else + return ((double *) &a)[0]; +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed double-precision (64-bit) floating-point elements, and store the +// results in dst. +// +// FOR j := 0 to 1 +// i := 64*j +// k := 32*j +// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd +FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); +#else + double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Cast vector of type __m128d to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128 +FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) +{ + return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a)); +} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps +FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 mask) +{ + return vreinterpretq_m128_f32(vbslq_f32(vreinterpretq_u32_m128(mask), + vreinterpretq_f32_m128(b), + vreinterpretq_f32_m128(a))); +} + +// Round the packed single-precision (32-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed single-precision +// floating-point elements in dst. 
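+
+// Editor's note and illustration, not part of the upstream sse2neon header
+// (helper name invented): vbslq_f32 selects per *bit*, whereas x86 BLENDVPS
+// inspects only the sign bit of each mask lane, so the _mm_blendv_ps emulation
+// above expects masks whose lanes are all-ones or all-zeros (e.g. compare
+// results). Lane 0 of the mask below is all-ones, so lane 0 comes from b.
+static inline float sse2neon_example_blendv_ps(void)
+{
+    __m128 a    = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
+    __m128 b    = _mm_set_ps(13.0f, 12.0f, 11.0f, 10.0f);
+    __m128 mask = _mm_castsi128_ps(_mm_set_epi32(0, 0, 0, -1));
+    __m128 r    = _mm_blendv_ps(a, b, mask);
+    return vgetq_lane_f32(vreinterpretq_f32_m128(r), 0); /* expected: 10.0f */
+}
+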
+// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps +FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) +{ +#if defined(__aarch64__) + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); + } +#else + float *v_float = (float *) &a; + __m128 zero, neg_inf, pos_inf; + + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return _mm_cvtepi32_ps(_mm_cvtps_epi32(a)); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return (__m128){floorf(v_float[0]), floorf(v_float[1]), + floorf(v_float[2]), floorf(v_float[3])}; + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]), + ceilf(v_float[3])}; + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f); + neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]), + floorf(v_float[2]), floorf(v_float[3])); + pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]), + ceilf(v_float[2]), ceilf(v_float[3])); + return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero)); + default: //_MM_FROUND_CUR_DIRECTION + return (__m128){roundf(v_float[0]), roundf(v_float[1]), + roundf(v_float[2]), roundf(v_float[3])}; + } +#endif +} + +// Round the packed single-precision (32-bit) floating-point elements in a up to +// an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps +FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) +{ + return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +} + +// Round the packed single-precision (32-bit) floating-point elements in a down +// to an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps +FORCE_INLINE __m128 _mm_floor_ps(__m128 a) +{ + return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +} + + +// Load 128-bits of integer data from unaligned memory into dst. This intrinsic +// may perform better than _mm_loadu_si128 when the data crosses a cache line +// boundary. +// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128 +#define _mm_lddqu_si128 _mm_loadu_si128 + +/* Miscellaneous Operations */ + +// Shifts the 8 signed 16-bit integers in a right by count bits while shifting +// in the sign bit. +// +// r0 := a0 >> count +// r1 := a1 >> count +// ... 
+// r7 := a7 >> count +// +// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) +{ + int64_t c = (int64_t) vget_low_s64((int64x2_t) count); + if (c > 15) + return _mm_cmplt_epi16(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c))); +} + +// Shifts the 4 signed 32-bit integers in a right by count bits while shifting +// in the sign bit. +// +// r0 := a0 >> count +// r1 := a1 >> count +// r2 := a2 >> count +// r3 := a3 >> count +// +// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx +FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) +{ + int64_t c = (int64_t) vget_low_s64((int64x2_t) count); + if (c > 31) + return _mm_cmplt_epi32(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c))); +} + +// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and +// saturates. +// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), + vqmovn_s16(vreinterpretq_s16_m128i(b)))); +} + +// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned +// integers and saturates. +// +// r0 := UnsignedSaturate(a0) +// r1 := UnsignedSaturate(a1) +// ... +// r7 := UnsignedSaturate(a7) +// r8 := UnsignedSaturate(b0) +// r9 := UnsignedSaturate(b1) +// ... +// r15 := UnsignedSaturate(b7) +// +// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx +FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) +{ + return vreinterpretq_m128i_u8( + vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), + vqmovun_s16(vreinterpretq_s16_m128i(b)))); +} + +// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers +// and saturates. +// +// r0 := SignedSaturate(a0) +// r1 := SignedSaturate(a1) +// r2 := SignedSaturate(a2) +// r3 := SignedSaturate(a3) +// r4 := SignedSaturate(b0) +// r5 := SignedSaturate(b1) +// r6 := SignedSaturate(b2) +// r7 := SignedSaturate(b3) +// +// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), + vqmovn_s32(vreinterpretq_s32_m128i(b)))); +} + +// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit +// integers and saturates. +// +// r0 := UnsignedSaturate(a0) +// r1 := UnsignedSaturate(a1) +// r2 := UnsignedSaturate(a2) +// r3 := UnsignedSaturate(a3) +// r4 := UnsignedSaturate(b0) +// r5 := UnsignedSaturate(b1) +// r6 := UnsignedSaturate(b2) +// r7 := UnsignedSaturate(b3) +FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), + vqmovun_s32(vreinterpretq_s32_m128i(b)))); +} + +// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower +// 8 signed or unsigned 8-bit integers in b. +// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// ... 
+// r14 := a7 +// r15 := b7 +// +// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s8( + vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the +// lower 4 signed or unsigned 16-bit integers in b. +// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// r4 := a2 +// r5 := b2 +// r6 := a3 +// r7 := b3 +// +// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s16( + vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the +// lower 2 signed or unsigned 32 - bit integers in b. +// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// +// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32( + vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) +{ + int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); +} + +// Selects and interleaves the lower two single-precision, floating-point values +// from a and b. +// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// +// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Selects and interleaves the upper two single-precision, floating-point values +// from a and b. 
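+
+// Editor's illustration, not part of the upstream sse2neon header (helper name
+// invented): _mm_unpacklo_epi32 above interleaves the two low 32-bit lanes of
+// a and b, so {0,1,2,3} and {10,11,12,13} give {0,10,1,11}; lane 1 is 10.
+static inline int sse2neon_example_unpacklo_epi32(void)
+{
+    __m128i a = _mm_set_epi32(3, 2, 1, 0);      /* last argument is lane 0 */
+    __m128i b = _mm_set_epi32(13, 12, 11, 10);
+    __m128i r = _mm_unpacklo_epi32(a, b);
+    return vgetq_lane_s32(vreinterpretq_s32_m128i(r), 1); /* expected: 10 */
+}
+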
+// +// r0 := a2 +// r1 := b2 +// r2 := a3 +// r3 := b3 +// +// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper +// 8 signed or unsigned 8-bit integers in b. +// +// r0 := a8 +// r1 := b8 +// r2 := a9 +// r3 := b9 +// ... +// r14 := a15 +// r15 := b15 +// +// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s8( + vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the +// upper 4 signed or unsigned 16-bit integers in b. +// +// r0 := a4 +// r1 := b4 +// r2 := a5 +// r3 := b5 +// r4 := a6 +// r5 := b6 +// r6 := a7 +// r7 := b7 +// +// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s16( + vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the +// upper 2 signed or unsigned 32-bit integers in b. +// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32( + vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper signed or unsigned 64-bit integer in a with the +// upper signed or unsigned 64-bit integer in b. +// +// r0 := a1 +// r1 := b1 +FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +{ + int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); +} + +// Horizontally compute the minimum amongst the packed unsigned 16-bit integers +// in a, store the minimum and index in dst, and zero the remaining bits in dst. 
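+
+// Editor's illustration, not part of the upstream sse2neon header (helper name
+// invented): _mm_unpackhi_epi64 above concatenates the high 64-bit halves of
+// a and b. With 64-bit lanes a = {1, 2} and b = {3, 4}, the result is {2, 4}
+// and its low half reads back as 2.
+static inline int64_t sse2neon_example_unpackhi_epi64(void)
+{
+    __m128i a = _mm_set_epi32(0, 2, 0, 1); /* 64-bit lanes: low = 1, high = 2 */
+    __m128i b = _mm_set_epi32(0, 4, 0, 3); /* 64-bit lanes: low = 3, high = 4 */
+    __m128i r = _mm_unpackhi_epi64(a, b);
+    return _mm_cvtsi128_si64(r); /* expected: 2 */
+}
+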
+// +// index[2:0] := 0 +// min[15:0] := a[15:0] +// FOR j := 0 to 7 +// i := j*16 +// IF a[i+15:i] < min[15:0] +// index[2:0] := j +// min[15:0] := a[i+15:i] +// FI +// ENDFOR +// dst[15:0] := min[15:0] +// dst[18:16] := index[2:0] +// dst[127:19] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16 +FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) +{ + __m128i dst; + uint16_t min, idx = 0; + // Find the minimum value +#if defined(__aarch64__) + min = vminvq_u16(vreinterpretq_u16_m128i(a)); +#else + __m64 tmp; + tmp = vreinterpret_m64_u16( + vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)), + vget_high_u16(vreinterpretq_u16_m128i(a)))); + tmp = vreinterpret_m64_u16( + vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); + tmp = vreinterpret_m64_u16( + vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); + min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0); +#endif + // Get the index of the minimum value + int i; + for (i = 0; i < 8; i++) { + if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) { + idx = (uint16_t) i; + break; + } + a = _mm_srli_si128(a, 2); + } + // Generate result + dst = _mm_setzero_si128(); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0)); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1)); + return dst; +} + +// shift to right +// https://msdn.microsoft.com/en-us/library/bb514041(v=vs.120).aspx +// http://blog.csdn.net/hemmingway/article/details/44828303 +// Clang requires a macro here, as it is extremely picky about c being a +// literal. +#define _mm_alignr_epi8(a, b, c) \ + ((__m128i) vextq_s8((int8x16_t)(b), (int8x16_t)(a), (c))) + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the CF value. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128 +FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))), + vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the ZF value. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128 +FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +// Extracts the selected signed or unsigned 8-bit integer from a and zero +// extends. +// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm) +#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) + +// Inserts the least significant 8 bits of b into the selected 8-bit integer +// of a. 
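+
+// Editor's illustration, not part of the upstream sse2neon header (helper name
+// invented): _mm_minpos_epu16 above packs the minimum into bits [15:0] and the
+// index of its lowest matching lane into bits [18:16]. Here the minimum 3 sits
+// in lane 5, so the low 32 bits of the result are (5 << 16) | 3 = 0x00050003.
+static inline int sse2neon_example_minpos_epu16(void)
+{
+    __m128i v = _mm_set_epi16(9, 8, 3, 6, 5, 4, 7, 10); /* lane 5 holds 3 */
+    return _mm_cvtsi128_si32(_mm_minpos_epu16(v));      /* expected: 0x00050003 */
+}
+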
+// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, +// __constrange(0,16) int imm) +#define _mm_insert_epi8(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s8( \ + vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \ + }) + +// Extracts the selected signed or unsigned 16-bit integer from a and zero +// extends. +// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx +// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) +#define _mm_extract_epi16(a, imm) \ + vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) + +// Inserts the least significant 16 bits of b into the selected 16-bit integer +// of a. +// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx +// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, +// __constrange(0,8) int imm) +#define _mm_insert_epi16(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s16( \ + vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \ + }) + +// Extracts the selected signed or unsigned 32-bit integer from a and zero +// extends. +// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) +#define _mm_extract_epi32(a, imm) \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) + +// Extracts the selected single-precision (32-bit) floating-point from a. +// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) +#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) + +// Inserts the least significant 32 bits of b into the selected 32-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, +// __constrange(0,4) int imm) +#define _mm_insert_epi32(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \ + }) + +// Extracts the selected signed or unsigned 64-bit integer from a and zero +// extends. +// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) +#define _mm_extract_epi64(a, imm) \ + vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) + +// Inserts the least significant 64 bits of b into the selected 64-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, +// __constrange(0,2) int imm) +#define _mm_insert_epi64(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s64( \ + vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \ + }) + +// Count the number of bits set to 1 in unsigned 32-bit integer a, and +// return that count in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 +FORCE_INLINE int _mm_popcnt_u32(unsigned int a) +{ +#if defined(__aarch64__) +#if __has_builtin(__builtin_popcount) + return __builtin_popcount(a); +#else + return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); +#endif +#else + uint32_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + + vst1_u32(&count, count32x2_val); + return count; +#endif +} + +// Count the number of bits set to 1 in unsigned 64-bit integer a, and +// return that count in dst. 
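+
+// Editor's illustration, not part of the upstream sse2neon header (helper name
+// invented): both the __builtin_popcount path and the NEON vcnt_u8 fallback of
+// _mm_popcnt_u32 above count the set bits of the argument.
+static inline int sse2neon_example_popcnt_u32(void)
+{
+    return _mm_popcnt_u32(0xF0F0F0F0u); /* four bits per byte, four bytes: expected 16 */
+}
+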
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64 +FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) +{ +#if defined(__aarch64__) +#if __has_builtin(__builtin_popcountll) + return __builtin_popcountll(a); +#else + return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); +#endif +#else + uint64_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + uint64x1_t count64x1_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + count64x1_val = vpaddl_u32(count32x2_val); + vst1_u64(&count, count64x1_val); + return count; +#endif +} + +// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision +// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the +// transposed matrix in these vectors (row0 now contains column 0, etc.). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ + float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ + row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ + vget_low_f32(ROW23.val[0])); \ + row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ + vget_low_f32(ROW23.val[1])); \ + row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ + vget_high_f32(ROW23.val[0])); \ + row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ + vget_high_f32(ROW23.val[1])); \ + } while (0) + +/* Crypto Extensions */ + +#if defined(__ARM_FEATURE_CRYPTO) +// Wraps vmull_p64 +FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); + poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); + return vreinterpretq_u64_p128(vmull_p64(a, b)); +} +#else // ARMv7 polyfill +// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. +// +// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a +// 64-bit->128-bit polynomial multiply. +// +// It needs some work and is somewhat slow, but it is still faster than all +// known scalar methods. 
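+//
+// (Editor's note, not part of the upstream comment:) "carry-less" means
+// polynomial multiplication over GF(2), i.e. multiplication where the partial
+// products are combined with XOR instead of ADD. A tiny worked example:
+//
+//   0b101 clmul 0b11 = (0b101 << 1) ^ (0b101 << 0) = 0b1010 ^ 0b0101 = 0b1111
+//
+// so 5 clmul 3 = 15, the same as ordinary multiplication here; the results
+// diverge as soon as a partial-product XOR would have carried, e.g.
+// 3 clmul 3 = 0b110 ^ 0b011 = 0b101 = 5, not 9. This primitive underlies GHASH
+// and CRC folding, which is why _mm_clmulepi64_si128 is emulated below.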
+// +// Algorithm adapted to C from +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted +// from "Fast Software Polynomial Multiplication on ARM Processors Using the +// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab +// (https://hal.inria.fr/hal-01506572) +static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly8x8_t a = vreinterpret_p8_u64(_a); + poly8x8_t b = vreinterpret_p8_u64(_b); + + // Masks + uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), + vcreate_u8(0x00000000ffffffff)); + uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), + vcreate_u8(0x0000000000000000)); + + // Do the multiplies, rotating with vext to get all combinations + uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 + uint8x16_t e = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 + uint8x16_t f = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 + uint8x16_t g = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 + uint8x16_t h = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 + uint8x16_t i = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 + uint8x16_t j = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 + uint8x16_t k = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 + + // Add cross products + uint8x16_t l = veorq_u8(e, f); // L = E + F + uint8x16_t m = veorq_u8(g, h); // M = G + H + uint8x16_t n = veorq_u8(i, j); // N = I + J + + // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL + // instructions. +#if defined(__aarch64__) + uint8x16_t lm_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t lm_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t nk_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); + uint8x16_t nk_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); +#else + uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m)); + uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m)); + uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k)); + uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k)); +#endif + // t0 = (L) (P0 + P1) << 8 + // t1 = (M) (P2 + P3) << 16 + uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); + uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); + uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); + + // t2 = (N) (P4 + P5) << 24 + // t3 = (K) (P6 + P7) << 32 + uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); + uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); + uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); + + // De-interleave +#if defined(__aarch64__) + uint8x16_t t0 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t1 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t2 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); + uint8x16_t t3 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); +#else + uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h)); + uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h)); + uint8x16_t t3 = 
vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h)); + uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h)); +#endif + // Shift the cross products + uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8 + uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 + uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 + uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 + + // Accumulate the products + uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); + uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); + uint8x16_t mix = veorq_u8(d, cross1); + uint8x16_t r = veorq_u8(mix, cross2); + return vreinterpretq_u64_u8(r); +} +#endif // ARMv7 polyfill + +FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) +{ + uint64x2_t a = vreinterpretq_u64_m128i(_a); + uint64x2_t b = vreinterpretq_u64_m128i(_b); + switch (imm & 0x11) { + case 0x00: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); + case 0x01: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); + case 0x10: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); + case 0x11: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); + default: + abort(); + } +} + +#if !defined(__ARM_FEATURE_CRYPTO) +/* clang-format off */ +#define SSE2NEON_AES_DATA(w) \ + { \ + w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ + w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ + w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ + w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ + w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ + w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ + w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ + w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ + w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ + w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ + w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ + w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ + w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ + w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ + w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ + w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ + w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ + w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ + w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ + w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ + w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ + w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ + w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ + w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ + w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ + w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ + w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ + w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ + w(0x35), w(0x57), w(0xb9), w(0x86), 
w(0xc1), w(0x1d), w(0x9e), \ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ + w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ + w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ + w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ + w(0xb0), w(0x54), w(0xbb), w(0x16) \ + } +/* clang-format on */ + +/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */ +#define SSE2NEON_AES_H0(x) (x) +static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0); +#undef SSE2NEON_AES_H0 + +// In the absence of crypto extensions, implement aesenc using regular neon +// intrinsics instead. See: +// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and +// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52 +// for more information Reproduced with permission of the author. +FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey) +{ +#if defined(__aarch64__) + static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9, + 0xe, 0x3, 0x8, 0xd, 0x2, 0x7, + 0xc, 0x1, 0x6, 0xb}; + static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc}; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(EncBlock); + + // shift rows + w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + + // sub bytes + v = vqtbl4q_u8(vld1q_u8_x4(SSE2NEON_sbox), w); + v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0); + + // mix columns + w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b); + w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + + // add round key + return vreinterpretq_m128i_u8(w) ^ RoundKey; + +#else /* ARMv7-A NEON implementation */ +#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \ + (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \ + (b0)) +#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */)) +#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x) +#define SSE2NEON_AES_U0(p) \ + SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p)) +#define SSE2NEON_AES_U1(p) \ + SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p) +#define SSE2NEON_AES_U2(p) \ + SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p) +#define SSE2NEON_AES_U3(p) \ + SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p)) + static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = { + SSE2NEON_AES_DATA(SSE2NEON_AES_U0), + SSE2NEON_AES_DATA(SSE2NEON_AES_U1), + SSE2NEON_AES_DATA(SSE2NEON_AES_U2), + SSE2NEON_AES_DATA(SSE2NEON_AES_U3), + }; +#undef SSE2NEON_AES_B2W +#undef SSE2NEON_AES_F2 +#undef SSE2NEON_AES_F3 +#undef SSE2NEON_AES_U0 +#undef SSE2NEON_AES_U1 +#undef SSE2NEON_AES_U2 +#undef SSE2NEON_AES_U3 + + uint32_t x0 = _mm_cvtsi128_si32(EncBlock); + uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55)); + uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA)); + uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF)); + + __m128i out = _mm_set_epi32( + (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^ + aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]), + (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^ 
+ aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]), + (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^ + aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]), + (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^ + aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24])); + + return _mm_xor_si128(out, RoundKey); +#endif +} + +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + /* FIXME: optimized for NEON */ + uint8_t v[4][4] = { + [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]}, + [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]}, + [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]}, + [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]}, + }; + for (int i = 0; i < 16; i++) + vreinterpretq_nth_u8_m128i(a, i) = + v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i); + return a; +} + +// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. +// This instruction generates a round key for AES encryption. See +// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ +// for details. +// +// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon) +{ + uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)); + uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)); + for (int i = 0; i < 4; ++i) { + ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]]; + ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]]; + } + return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, + ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); +} +#undef SSE2NEON_AES_DATA + +#else /* __ARM_FEATURE_CRYPTO */ +// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and +// AESMC and then manually applying the real key as an xor operation. This +// unfortunately means an additional xor op; the compiler should be able to +// optimize this away for repeated calls however. See +// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a +// for more details. 
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^ + vreinterpretq_u8_m128i(b)); +} + +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( + vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + RoundKey); +} + +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ + // AESE does ShiftRows and SubBytes on A + uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); + + uint8x16_t dest = { + // Undo ShiftRows step from AESE and extract X1 and X3 + u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) + u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) + u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) + u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) + }; + uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon}; + return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); +} +#endif + +/* Streaming Extensions */ + +// Guarantees that every preceding store is globally visible before any +// subsequent store. +// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx +FORCE_INLINE void _mm_sfence(void) +{ + __sync_synchronize(); +} + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating- +// point elements) from a into memory using a non-temporal memory hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps +FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (float32x4_t *) p); +#else + vst1q_f32(p, vreinterpretq_f32_m128(a)); +#endif +} + +// Stores the data in a to the address p without polluting the caches. If the +// cache line containing address p is already in the cache, the cache will be +// updated. +// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx +FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); +#endif +} + +// Load 128-bits of integer data from memory into dst using a non-temporal +// memory hint. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128 +FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) +{ +#if __has_builtin(__builtin_nontemporal_store) + return __builtin_nontemporal_load(p); +#else + return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); +#endif +} + +// Cache line containing p is flushed and invalidated from all caches in the +// coherency domain. : +// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx +FORCE_INLINE void _mm_clflush(void const *p) +{ + (void) p; + // no corollary for Neon? +} + +// Allocate aligned blocks of memory. 
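+
+// Editor's illustration, not part of the upstream sse2neon header (helper name
+// invented): the usual x86 pattern of a non-temporal store followed by a store
+// fence maps onto the emulations above; when the non-temporal builtin is not
+// available this degrades to a plain vst1q_f32 plus __sync_synchronize().
+// dst is assumed to point at >= 4 floats and, for x86 compatibility, to be
+// 16-byte aligned.
+static inline void sse2neon_example_stream_ps(float *dst)
+{
+    _mm_stream_ps(dst, _mm_set1_ps(1.0f)); /* cache-bypassing store of {1,1,1,1} */
+    _mm_sfence();                          /* make the store globally visible */
+}
+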
+// https://software.intel.com/en-us/ +// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks +FORCE_INLINE void *_mm_malloc(size_t size, size_t align) +{ + void *ptr; + if (align == 1) + return malloc(size); + if (align == 2 || (sizeof(void *) == 8 && align == 4)) + align = sizeof(void *); + if (!posix_memalign(&ptr, align, size)) + return ptr; + return NULL; +} + +FORCE_INLINE void _mm_free(void *addr) +{ + free(addr); +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 8-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc ^= v; + for (int bit = 0; bit < 8; bit++) { + if (crc & 1) + crc = (crc >> 1) ^ UINT32_C(0x82f63b78); + else + crc = (crc >> 1); + } +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 16-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u8(crc, v & 0xff); + crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 32-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u16(crc, v & 0xffff); + crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 64-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100) +FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff); + crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); +#endif + return crc; +} + +#if defined(__GNUC__) || defined(__clang__) +#pragma pop_macro("ALIGN_STRUCT") +#pragma pop_macro("FORCE_INLINE") +#endif + +#if defined(__GNUC__) +#pragma GCC pop_options +#endif + +#endif diff --git a/pffft/test_fft_factors.c b/pffft/test_fft_factors.c new file mode 100644 index 0000000..cefb2cc --- /dev/null +++ b/pffft/test_fft_factors.c @@ -0,0 +1,142 @@ + +#ifdef PFFFT_ENABLE_FLOAT +#include "pffft.h" +#endif + + +#ifdef PFFFT_ENABLE_DOUBLE +#include "pffft_double.h" +#endif + +#include +#include +#include + + + +#ifdef PFFFT_ENABLE_FLOAT +int test_float(int TL) +{ + PFFFT_Setup * S; + + for (int dir_i = 0; dir_i <= 1; ++dir_i) + { + for (int cplx_i = 0; cplx_i <= 1; ++cplx_i) + { + const pffft_direction_t dir = (!dir_i) ? PFFFT_FORWARD : PFFFT_BACKWARD; + const pffft_transform_t cplx = (!cplx_i) ? 
PFFFT_REAL : PFFFT_COMPLEX; + const int N_min = pffft_min_fft_size(cplx); + const int N_max = N_min * 11 + N_min; + int NTL = pffft_nearest_transform_size(TL, cplx, (!dir_i)); + double near_off = (NTL - TL) * 100.0 / (double)TL; + + fprintf(stderr, "testing float, %s, %s ..\tminimum transform %d; nearest transform for %d is %d (%.2f%% off)\n", + (!dir_i) ? "FORWARD" : "BACKWARD", (!cplx_i) ? "REAL" : "COMPLEX", N_min, TL, NTL, near_off ); + + for (int N = (N_min/2); N <= N_max; N += (N_min/2)) + { + int R = N, f2 = 0, f3 = 0, f5 = 0, tmp_f; + const int factorizable = pffft_is_valid_size(N, cplx); + while (R >= 5*N_min && (R % 5) == 0) { R /= 5; ++f5; } + while (R >= 3*N_min && (R % 3) == 0) { R /= 3; ++f3; } + while (R >= 2*N_min && (R % 2) == 0) { R /= 2; ++f2; } + tmp_f = (R == N_min) ? 1 : 0; + assert( factorizable == tmp_f ); + + S = pffft_new_setup(N, cplx); + + if ( S && !factorizable ) + { + fprintf(stderr, "fft setup successful, but NOT factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R); + return 1; + } + else if ( !S && factorizable) + { + fprintf(stderr, "fft setup UNsuccessful, but factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R); + return 1; + } + + if (S) + pffft_destroy_setup(S); + } + + } + } + return 0; +} + +#endif + + +#ifdef PFFFT_ENABLE_DOUBLE +int test_double(int TL) +{ + PFFFTD_Setup * S; + for (int dir_i = 0; dir_i <= 1; ++dir_i) + { + for (int cplx_i = 0; cplx_i <= 1; ++cplx_i) + { + const pffft_direction_t dir = (!dir_i) ? PFFFT_FORWARD : PFFFT_BACKWARD; + const pffft_transform_t cplx = (!cplx_i) ? PFFFT_REAL : PFFFT_COMPLEX; + const int N_min = pffftd_min_fft_size(cplx); + const int N_max = N_min * 11 + N_min; + int NTL = pffftd_nearest_transform_size(TL, cplx, (!dir_i)); + double near_off = (NTL - TL) * 100.0 / (double)TL; + + fprintf(stderr, "testing double, %s, %s ..\tminimum transform %d; nearest transform for %d is %d (%.2f%% off)\n", + (!dir_i) ? "FORWARD" : "BACKWARD", (!cplx_i) ? "REAL" : "COMPLEX", N_min, TL, NTL, near_off ); + + for (int N = (N_min/2); N <= N_max; N += (N_min/2)) + { + int R = N, f2 = 0, f3 = 0, f5 = 0, tmp_f; + const int factorizable = pffftd_is_valid_size(N, cplx); + while (R >= 5*N_min && (R % 5) == 0) { R /= 5; ++f5; } + while (R >= 3*N_min && (R % 3) == 0) { R /= 3; ++f3; } + while (R >= 2*N_min && (R % 2) == 0) { R /= 2; ++f2; } + tmp_f = (R == N_min) ? 1 : 0; + assert( factorizable == tmp_f ); + + S = pffftd_new_setup(N, cplx); + + if ( S && !factorizable ) + { + fprintf(stderr, "fft setup successful, but NOT factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R); + return 1; + } + else if ( !S && factorizable) + { + fprintf(stderr, "fft setup UNsuccessful, but factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R); + return 1; + } + + if (S) + pffftd_destroy_setup(S); + } + + } + } + return 0; +} + +#endif + + + +int main(int argc, char *argv[]) +{ + int N = (1 < argc) ? atoi(argv[1]) : 2; + + int r = 0; +#ifdef PFFFT_ENABLE_FLOAT + r = test_float(N); + if (r) + return r; +#endif + +#ifdef PFFFT_ENABLE_DOUBLE + r = test_double(N); +#endif + + return r; +} + diff --git a/pffft/test_pffastconv.c b/pffft/test_pffastconv.c new file mode 100644 index 0000000..4fdd94d --- /dev/null +++ b/pffft/test_pffastconv.c @@ -0,0 +1,991 @@ +/* + Copyright (c) 2013 Julien Pommier. 
+ Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de ) + */ + +#define _WANT_SNAN 1 + +#include "pffft.h" +#include "pffastconv.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_SYS_TIMES +# include +# include +#endif + +/* benchmark duration: 250 ms */ +#define BENCH_TEST_DURATION_IN_SEC 0.5 + +/* + vector support macros: the rest of the code is independant of + SSE/Altivec/NEON -- adding support for other platforms with 4-element + vectors should be limited to these macros +*/ +#if 0 +#include "simd/pf_float.h" +#endif + +#if defined(_MSC_VER) +# define RESTRICT __restrict +#elif defined(__GNUC__) +# define RESTRICT __restrict +#else +# define RESTRICT +#endif + + +#if defined(_MSC_VER) +#pragma warning( disable : 4244 ) +#endif + + +#ifdef SNANF + #define INVALID_FLOAT_VAL SNANF +#elif defined(SNAN) + #define INVALID_FLOAT_VAL SNAN +#elif defined(NAN) + #define INVALID_FLOAT_VAL NAN +#elif defined(INFINITY) + #define INVALID_FLOAT_VAL INFINITY +#else + #define INVALID_FLOAT_VAL FLT_MAX +#endif + + +#if defined(HAVE_SYS_TIMES) + inline double uclock_sec(void) { + static double ttclk = 0.; + struct tms t; + if (ttclk == 0.) + ttclk = sysconf(_SC_CLK_TCK); + times(&t); + /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */ + return ((double)t.tms_utime)) / ttclk; + } +# else + double uclock_sec(void) +{ return (double)clock()/(double)CLOCKS_PER_SEC; } +#endif + + + +typedef int (*pfnConvolution) (void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush); +typedef void* (*pfnConvSetup) (float *Hfwd, int Nf, int * BlkLen, int flags); +typedef pfnConvolution (*pfnGetConvFnPtr) (void * setup); +typedef void (*pfnConvDestroy) (void * setup); + + +struct ConvSetup +{ + pfnConvolution pfn; + int N; + int B; + float * H; + int flags; +}; + + +void * convSetupRev( float * H, int N, int * BlkLen, int flags ) +{ + struct ConvSetup * s = pffastconv_malloc( sizeof(struct ConvSetup) ); + int i, Nr = N; + if (flags & PFFASTCONV_CPLX_INP_OUT) + Nr *= 2; + Nr += 4; + s->pfn = NULL; + s->N = N; + s->B = *BlkLen; + s->H = pffastconv_malloc((unsigned)Nr * sizeof(float)); + s->flags = flags; + memset(s->H, 0, (unsigned)Nr * sizeof(float)); + if (flags & PFFASTCONV_CPLX_INP_OUT) + { + for ( i = 0; i < N; ++i ) { + s->H[2*(N-1 -i) ] = H[i]; + s->H[2*(N-1 -i)+1] = H[i]; + } + /* simpler detection of overruns */ + s->H[ 2*N ] = INVALID_FLOAT_VAL; + s->H[ 2*N +1 ] = INVALID_FLOAT_VAL; + s->H[ 2*N +2 ] = INVALID_FLOAT_VAL; + s->H[ 2*N +3 ] = INVALID_FLOAT_VAL; + } + else + { + for ( i = 0; i < N; ++i ) + s->H[ N-1 -i ] = H[i]; + /* simpler detection of overruns */ + s->H[ N ] = INVALID_FLOAT_VAL; + s->H[ N +1 ] = INVALID_FLOAT_VAL; + s->H[ N +2 ] = INVALID_FLOAT_VAL; + s->H[ N +3 ] = INVALID_FLOAT_VAL; + } + return s; +} + +void convDestroyRev( void * setup ) +{ + struct ConvSetup * s = (struct ConvSetup*)setup; + pffastconv_free(s->H); + pffastconv_free(setup); +} + + +pfnConvolution ConvGetFnPtrRev( void * setup ) +{ + struct ConvSetup * s = (struct ConvSetup*)setup; + if (!s) + return NULL; + return s->pfn; +} + + +void convSimdDestroy( void * setup ) +{ + convDestroyRev(setup); +} + + +void * fastConvSetup( float * H, int N, int * BlkLen, int flags ) +{ + void * p = pffastconv_new_setup( H, N, BlkLen, flags ); + if (!p) + printf("fastConvSetup(N = %d, *BlkLen = %d, flags = %d) = NULL\n", N, *BlkLen, flags); + return p; +} + + +void fastConvDestroy( void * setup ) +{ + 
pffastconv_destroy_setup( (PFFASTCONV_Setup*)setup ); +} + + + +int slow_conv_R(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush) +{ + struct ConvSetup * p = (struct ConvSetup*)setup; + const float * RESTRICT X = input; + const float * RESTRICT Hrev = p->H; + float * RESTRICT Y = output; + const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N; + const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N); + int i, j; + (void)Yref; + (void)applyFlush; + + if (p->flags & PFFASTCONV_CPLX_INP_OUT) + { + for ( i = 0; i <= lenNr; i += 2 ) + { + float sumRe = 0.0F, sumIm = 0.0F; + for ( j = 0; j < Nr; j += 2 ) + { + sumRe += X[i+j ] * Hrev[j]; + sumIm += X[i+j+1] * Hrev[j+1]; + } + Y[i ] = sumRe; + Y[i+1] = sumIm; + } + return i/2; + } + else + { + for ( i = 0; i <= lenNr; ++i ) + { + float sum = 0.0F; + for (j = 0; j < Nr; ++j ) + sum += X[i+j] * Hrev[j]; + Y[i] = sum; + } + return i; + } +} + + + +int slow_conv_A(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush) +{ + float sum[4]; + struct ConvSetup * p = (struct ConvSetup*)setup; + const float * RESTRICT X = input; + const float * RESTRICT Hrev = p->H; + float * RESTRICT Y = output; + const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N; + const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N); + int i, j; + (void)Yref; + (void)applyFlush; + + if (p->flags & PFFASTCONV_CPLX_INP_OUT) + { + if ( (Nr & 3) == 0 ) + { + for ( i = 0; i <= lenNr; i += 2 ) + { + sum[0] = sum[1] = sum[2] = sum[3] = 0.0F; + for (j = 0; j < Nr; j += 4 ) + { + sum[0] += X[i+j] * Hrev[j]; + sum[1] += X[i+j+1] * Hrev[j+1]; + sum[2] += X[i+j+2] * Hrev[j+2]; + sum[3] += X[i+j+3] * Hrev[j+3]; + } + Y[i ] = sum[0] + sum[2]; + Y[i+1] = sum[1] + sum[3]; + } + } + else + { + const int M = Nr & (~3); + for ( i = 0; i <= lenNr; i += 2 ) + { + float tailSumRe = 0.0F, tailSumIm = 0.0F; + sum[0] = sum[1] = sum[2] = sum[3] = 0.0F; + for (j = 0; j < M; j += 4 ) + { + sum[0] += X[i+j ] * Hrev[j ]; + sum[1] += X[i+j+1] * Hrev[j+1]; + sum[2] += X[i+j+2] * Hrev[j+2]; + sum[3] += X[i+j+3] * Hrev[j+3]; + } + for ( ; j < Nr; j += 2 ) { + tailSumRe += X[i+j ] * Hrev[j ]; + tailSumIm += X[i+j+1] * Hrev[j+1]; + } + Y[i ] = ( sum[0] + sum[2] ) + tailSumRe; + Y[i+1] = ( sum[1] + sum[3] ) + tailSumIm; + } + } + return i/2; + } + else + { + if ( (Nr & 3) == 0 ) + { + for ( i = 0; i <= lenNr; ++i ) + { + sum[0] = sum[1] = sum[2] = sum[3] = 0.0F; + for (j = 0; j < Nr; j += 4 ) + { + sum[0] += X[i+j] * Hrev[j]; + sum[1] += X[i+j+1] * Hrev[j+1]; + sum[2] += X[i+j+2] * Hrev[j+2]; + sum[3] += X[i+j+3] * Hrev[j+3]; + } + Y[i] = sum[0] + sum[1] + sum[2] + sum[3]; + } + return i; + } + else + { + const int M = Nr & (~3); + /* printf("A: Nr = %d, M = %d, H[M] = %f, H[M+1] = %f, H[M+2] = %f, H[M+3] = %f\n", Nr, M, Hrev[M], Hrev[M+1], Hrev[M+2], Hrev[M+3] ); */ + for ( i = 0; i <= lenNr; ++i ) + { + float tailSum = 0.0; + sum[0] = sum[1] = sum[2] = sum[3] = 0.0F; + for (j = 0; j < M; j += 4 ) + { + sum[0] += X[i+j] * Hrev[j]; + sum[1] += X[i+j+1] * Hrev[j+1]; + sum[2] += X[i+j+2] * Hrev[j+2]; + sum[3] += X[i+j+3] * Hrev[j+3]; + } + for ( ; j < Nr; ++j ) + tailSum += X[i+j] * Hrev[j]; + Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum; + } + return i; + } + } +} + + +int slow_conv_B(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush) +{ + float sum[4]; + struct ConvSetup * p = (struct ConvSetup*)setup; + 
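+  /* Reference convolution 'B': with PFFASTCONV_SYMMETRIC set it exploits the
+   * coefficient symmetry H[j] == H[N-1-j], multiplying each coefficient once
+   * with the sum of the two matching input samples (roughly halving the
+   * multiplies); without the flag it is a 4-way unrolled FIR like variant 'A'. */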
(void)Yref; + (void)applyFlush; + if (p->flags & PFFASTCONV_SYMMETRIC) + { + const float * RESTRICT X = input; + const float * RESTRICT Hrev = p->H; + float * RESTRICT Y = output; + const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N; + const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N); + const int h = Nr / 2 -4; + const int E = Nr -4; + int i, j; + + if (p->flags & PFFASTCONV_CPLX_INP_OUT) + { + for ( i = 0; i <= lenNr; i += 2 ) + { + const int k = i + E; + sum[0] = sum[1] = sum[2] = sum[3] = 0.0F; + for (j = 0; j <= h; j += 4 ) + { + sum[0] += Hrev[j ] * ( X[i+j ] + X[k-j+2] ); + sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+3] ); + sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j ] ); + sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j+1] ); + } + Y[i ] = sum[0] + sum[2]; + Y[i+1] = sum[1] + sum[3]; + } + return i/2; + } + else + { + for ( i = 0; i <= lenNr; ++i ) + { + const int k = i + E; + sum[0] = sum[1] = sum[2] = sum[3] = 0.0F; + for (j = 0; j <= h; j += 4 ) + { + sum[0] += Hrev[j ] * ( X[i+j ] + X[k-j+3] ); + sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+2] ); + sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j+1] ); + sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j ] ); + } + Y[i] = sum[0] + sum[1] + sum[2] + sum[3]; + } + return i; + } + } + else + { + const float * RESTRICT X = input; + const float * RESTRICT Hrev = p->H; + float * RESTRICT Y = output; + const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N; + const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N); + int i, j; + + if (p->flags & PFFASTCONV_CPLX_INP_OUT) + { + for ( i = 0; i <= lenNr; i += 2 ) + { + sum[0] = sum[1] = sum[2] = sum[3] = 0.0F; + for (j = 0; j < Nr; j += 4 ) + { + sum[0] += X[i+j] * Hrev[j]; + sum[1] += X[i+j+1] * Hrev[j+1]; + sum[2] += X[i+j+2] * Hrev[j+2]; + sum[3] += X[i+j+3] * Hrev[j+3]; + } + Y[i ] = sum[0] + sum[2]; + Y[i+1] = sum[1] + sum[3]; + } + return i/2; + } + else + { + if ( (Nr & 3) == 0 ) + { + for ( i = 0; i <= lenNr; ++i ) + { + sum[0] = sum[1] = sum[2] = sum[3] = 0.0F; + for (j = 0; j < Nr; j += 4 ) + { + sum[0] += X[i+j] * Hrev[j]; + sum[1] += X[i+j+1] * Hrev[j+1]; + sum[2] += X[i+j+2] * Hrev[j+2]; + sum[3] += X[i+j+3] * Hrev[j+3]; + } + Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]); + } + return i; + } + else + { + const int M = Nr & (~3); + /* printf("B: Nr = %d\n", Nr ); */ + for ( i = 0; i <= lenNr; ++i ) + { + float tailSum = 0.0; + sum[0] = sum[1] = sum[2] = sum[3] = 0.0F; + for (j = 0; j < M; j += 4 ) + { + sum[0] += X[i+j] * Hrev[j]; + sum[1] += X[i+j+1] * Hrev[j+1]; + sum[2] += X[i+j+2] * Hrev[j+2]; + sum[3] += X[i+j+3] * Hrev[j+3]; + } + for ( ; j < Nr; ++j ) + tailSum += X[i+j] * Hrev[j]; + Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum; + } + return i; + } + } + } + +} + + +int fast_conv(void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush) +{ + (void)Yref; + return pffastconv_apply( (PFFASTCONV_Setup*)setup, X, len, Y, applyFlush ); +} + + + +void printFirst( const float * V, const char * st, const int N, const int perLine ) +{ + (void)V; (void)st; (void)N; (void)perLine; + return; +#if 0 + int i; + for ( i = 0; i < N; ++i ) + { + if ( (i % perLine) == 0 ) + printf("\n%s[%d]", st, i); + printf("\t%.1f", V[i]); + } + printf("\n"); +#endif +} + + + +#define NUMY 15 + + +int test(int FILTERLEN, int convFlags, const int testOutLen, int printDbg, int printSpeed, int abortFirstFastAlgo, int printErrValues, int printAsCSV, int *pIsFirstFilterLen) { + double t0, t1, tstop, td, tdref; + float 
*X, *H; + float *Y[NUMY]; + int64_t outN[NUMY]; + /* 256 KFloats or 16 MFloats data */ +#if 1 + const int len = testOutLen ? (1 << 18) : (1 << 24); +#elif 0 + const int len = testOutLen ? (1 << 18) : (1 << 13); +#else + const int len = testOutLen ? (1 << 18) : (1024); +#endif + const int cplxFactor = ( convFlags & PFFASTCONV_CPLX_INP_OUT ) ? 2 : 1; + const int lenC = len / cplxFactor; + + int yi, yc, posMaxErr; + float yRangeMin, yRangeMax, yErrLimit, maxErr = 0.0; + int i, j, numErrOverLimit, iter; + int retErr = 0; + + /* 0 1 2 3 4 5 6 7 8 9, 10, 11, 12, 13 */ + pfnConvSetup aSetup[NUMY] = { convSetupRev, convSetupRev, convSetupRev, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, }; + pfnConvDestroy aDestroy[NUMY] = { convDestroyRev, convDestroyRev, convDestroyRev, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, }; + pfnGetConvFnPtr aGetFnPtr[NUMY] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; + pfnConvolution aConv[NUMY] = { slow_conv_R, slow_conv_A, slow_conv_B, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, }; + const char * convText[NUMY] = { "R(non-simd)", "A(non-simd)", "B(non-simd)", "fast_conv_64", "fast_conv_128", "fast_conv_256", "fast_conv_512", "fast_conv_1K", "fast_conv_2K", "fast_conv_4K", "fast_conv_8K", "fast_conv_16K", "fast_conv_32K", "fast_conv_64K", }; + int aFastAlgo[NUMY] = { 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, }; + void * aSetupCfg[NUMY] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; +//int aBlkLen[NUMY] = { 1024, 1024, 1024, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, }; + int aBlkLen[NUMY] = { 8192, 8192, 8192, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, }; +#if 1 + int aRunAlgo[NUMY] = { 1, 1, 1, FILTERLEN<64, FILTERLEN<128, FILTERLEN<256, FILTERLEN<512, FILTERLEN<1024, FILTERLEN<2048, FILTERLEN<4096, FILTERLEN<8192, FILTERLEN<16384, FILTERLEN<32768, FILTERLEN<65536, }; +#elif 0 + int aRunAlgo[NUMY] = { 1, 0, 0, 0 && FILTERLEN<64, 1 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048, 0 && FILTERLEN<4096, 0 && FILTERLEN<8192, 0 && FILTERLEN<16384, 0 && FILTERLEN<32768, 0 && FILTERLEN<65536, }; +#else + int aRunAlgo[NUMY] = { 1, 1, 1, 0 && FILTERLEN<64, 0 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048, 0 && FILTERLEN<4096, 0 && FILTERLEN<8192, 0 && FILTERLEN<16384, 0 && FILTERLEN<32768, 0 && FILTERLEN<65536, }; +#endif + double aSpeedFactor[NUMY], aDuration[NUMY], procSmpPerSec[NUMY]; + int aNumIters[NUMY], aNumLoops[NUMY]; + + X = pffastconv_malloc( (unsigned)(len+4) * sizeof(float) ); + for ( i=0; i < NUMY; ++i) + { + if ( 1 || i < 2 ) + Y[i] = pffastconv_malloc( (unsigned)len * sizeof(float) ); + else + Y[i] = Y[1]; + + Y[i][0] = 123.F; /* test for pffft_zconvolve_no_accu() */ + aSpeedFactor[i] = -1.0; + aDuration[i] = -1.0; + procSmpPerSec[i] = -1.0; + aNumIters[i] = 0; + aNumLoops[i] = 0; + } + + H = pffastconv_malloc((unsigned)FILTERLEN * sizeof(float)); + + /* initialize input */ + if ( convFlags & PFFASTCONV_CPLX_INP_OUT ) + { + for ( i = 0; i < lenC; ++i ) + { + X[2*i ] = (float)(i % 4093); /* 4094 is 
a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */ + X[2*i+1] = (float)((i+2048) % 4093); + } + } + else + { + for ( i = 0; i < len; ++i ) + X[i] = (float)(i % 4093); /* 4094 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */ + } + X[ len ] = INVALID_FLOAT_VAL; + X[ len +1 ] = INVALID_FLOAT_VAL; + X[ len +2 ] = INVALID_FLOAT_VAL; + X[ len +3 ] = INVALID_FLOAT_VAL; + + if (!testOutLen) + printFirst( X, "X", 64, 8 ); + + /* filter coeffs */ + memset( H, 0, FILTERLEN * sizeof(float) ); +#if 1 + if ( convFlags & PFFASTCONV_SYMMETRIC ) + { + const int half = FILTERLEN / 2; + for ( j = 0; j < half; ++j ) { + switch (j % 3) { + case 0: H[j] = H[FILTERLEN-1-j] = -1.0F; break; + case 1: H[j] = H[FILTERLEN-1-j] = 1.0F; break; + case 2: H[j] = H[FILTERLEN-1-j] = 0.5F; break; + } + } + } + else + { + for ( j = 0; j < FILTERLEN; ++j ) { + switch (j % 3) { + case 0: H[j] = -1.0F; break; + case 1: H[j] = 1.0F; break; + case 2: H[j] = 0.5F; break; + } + } + } +#else + H[0] = 1.0F; + H[FILTERLEN -1] = 1.0F; +#endif + if (!testOutLen) + printFirst( H, "H", FILTERLEN, 8 ); + + if (!printAsCSV) + { + printf("\n"); + printf("filterLen = %d\t%s%s\t%s:\n", FILTERLEN, + ((convFlags & PFFASTCONV_CPLX_INP_OUT)?"cplx":"real"), + (convFlags & PFFASTCONV_CPLX_INP_OUT)?((convFlags & PFFASTCONV_CPLX_SINGLE_FFT)?" single":" 2x") : "", + ((convFlags & PFFASTCONV_SYMMETRIC)?"symmetric":"non-sym") ); + } + + int hadFastAlgo = 0; + + while (1) + { + + for ( yi = 0; yi < NUMY; ++yi ) + { + if (!aRunAlgo[yi]) + continue; + + if ( aFastAlgo[yi] && abortFirstFastAlgo && hadFastAlgo ) + { + aRunAlgo[yi] = 0; + continue; + } + + hadFastAlgo = hadFastAlgo | aFastAlgo[yi]; + + aSetupCfg[yi] = aSetup[yi]( H, FILTERLEN, &aBlkLen[yi], convFlags ); + + /* get effective apply function ptr */ + if ( aSetupCfg[yi] && aGetFnPtr[yi] ) + aConv[yi] = aGetFnPtr[yi]( aSetupCfg[yi] ); + + if ( aSetupCfg[yi] && aConv[yi] ) + { + if (testOutLen) + { + t0 = uclock_sec(); + outN[yi] = aConv[yi]( aSetupCfg[yi], X, lenC, Y[yi], Y[0], 1 /* applyFlush */ ); + t1 = uclock_sec(); + td = t1 - t0; + } + else + { + //const int blkLen = 4096; /* required for 'fast_conv_4K' */ + const int blkLen = aBlkLen[yi]; + int64_t offC = 0, offS, Nout; + int k; + iter = 0; + outN[yi] = 0; + aNumLoops[yi] = 1; + t0 = uclock_sec(); + tstop = t0 + BENCH_TEST_DURATION_IN_SEC; + do + { + const int prev_iter = iter; + for ( k = 0; k < 128 && offC +blkLen < lenC; ++k ) + { + offS = cplxFactor * offC; + Nout = aConv[yi]( aSetupCfg[yi], X +offS, blkLen, Y[yi] +offS, Y[0], 0 /* applyFlush */ ); + offC += Nout; + ++iter; + if ( !Nout ) + break; + } + //if ( !Nout ) + // break; + t1 = uclock_sec(); + if ( prev_iter == iter ) // restart from begin of input? 
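+          /* prev_iter == iter means the inner loop could not fit another
+           * block into the remaining input, i.e. X[] is exhausted: rewind to
+           * the start of the input and count one more full loop over it */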
+ { + offC = 0; + ++aNumLoops[yi]; + } + } while ( t1 < tstop ); + outN[yi] = offC; + td = t1 - t0; + procSmpPerSec[yi] = cplxFactor * (double)outN[yi] * (1.0 / td); + aNumIters[yi] = iter; + aDuration[yi] = td; + + //printf("algo '%s':\t%.2f MSmp\tin\t%.1f ms\t= %g kSmpPerSec\t%d iters\t%.1f ms\n", + // convText[yi], (double)outN[yi]/(1000.0 * 1000.0), 1000.0 * aDuration[yi], procSmpPerSec[yi] * 0.001, aNumIters[yi], 1000.0 * td ); + } + } + else + { + outN[yi] = 0; + } + if ( yi == 0 ) { + const float * Yvals = Y[0]; + const int64_t refOutLen = cplxFactor * outN[0]; + tdref = td; + if (printDbg) { + printf("convolution '%s' took: %f ms\n", convText[yi], td*1000.0); + printf(" convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor); + } + aSpeedFactor[yi] = 1.0; + /* */ + yRangeMin = FLT_MAX; + yRangeMax = FLT_MIN; + for ( i = 0; i < refOutLen; ++i ) + { + if ( yRangeMax < Yvals[i] ) yRangeMax = Yvals[i]; + if ( yRangeMin > Yvals[i] ) yRangeMin = Yvals[i]; + } + yErrLimit = fabsf(yRangeMax - yRangeMin) / ( 100.0F * 1000.0F ); + /* yErrLimit = 0.01F; */ + if (testOutLen) { + if (1) { + printf("reference output len = %" PRId64 " smp\n", outN[0]); + printf("reference output range |%.1f ..%.1f| = %.1f ==> err limit = %f\n", yRangeMin, yRangeMax, yRangeMax - yRangeMin, yErrLimit); + } + printFirst( Yvals, "Yref", 64, 8 ); + } + } + else + { + aSpeedFactor[yi] = tdref / td; + if (printDbg) { + printf("\nconvolution '%s' took: %f ms == %f %% == %f X\n", convText[yi], td*1000.0, td * 100 / tdref, tdref / td); + printf(" convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor); + } + } + } + + int iMaxSpeedSlowAlgo = -1; + int iFirstFastAlgo = -1; + int iMaxSpeedFastAlgo = -1; + int iPrintedRefOutLen = 0; + { + for ( yc = 1; yc < NUMY; ++yc ) + { + if (!aRunAlgo[yc]) + continue; + if (aFastAlgo[yc]) { + if ( iMaxSpeedFastAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedFastAlgo] ) + iMaxSpeedFastAlgo = yc; + + if (iFirstFastAlgo < 0) + iFirstFastAlgo = yc; + } + else + { + if ( iMaxSpeedSlowAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedSlowAlgo] ) + iMaxSpeedSlowAlgo = yc; + } + } + + if (printSpeed) + { + if (testOutLen) + { + if (iMaxSpeedSlowAlgo >= 0 ) + printf("fastest slow algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedSlowAlgo], aSpeedFactor[iMaxSpeedSlowAlgo], 1000.0 * aDuration[iMaxSpeedSlowAlgo]); + if (0 != iMaxSpeedSlowAlgo && aRunAlgo[0]) + printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[0], aSpeedFactor[0], 1000.0 * aDuration[0]); + if (1 != iMaxSpeedSlowAlgo && aRunAlgo[1]) + printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[1], aSpeedFactor[1], 1000.0 * aDuration[1]); + + if (iFirstFastAlgo >= 0 && iFirstFastAlgo != iMaxSpeedFastAlgo && aRunAlgo[iFirstFastAlgo]) + printf("first fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo], aSpeedFactor[iFirstFastAlgo], 1000.0 * aDuration[iFirstFastAlgo]); + if (iFirstFastAlgo >= 0 && iFirstFastAlgo+1 != iMaxSpeedFastAlgo && iFirstFastAlgo+1 < NUMY && aRunAlgo[iFirstFastAlgo+1]) + printf("2nd fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo+1], aSpeedFactor[iFirstFastAlgo+1], 1000.0 * aDuration[iFirstFastAlgo+1]); + + if ( 0 <= iMaxSpeedFastAlgo && iMaxSpeedFastAlgo < NUMY && aRunAlgo[iMaxSpeedFastAlgo] 
) + { + printf("fastest fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedFastAlgo], aSpeedFactor[iMaxSpeedFastAlgo], 1000.0 * aDuration[iMaxSpeedFastAlgo]); + if ( 0 <= iMaxSpeedSlowAlgo && iMaxSpeedSlowAlgo < NUMY && aRunAlgo[iMaxSpeedSlowAlgo] ) + printf("fast / slow ratio: %f X\n", aSpeedFactor[iMaxSpeedFastAlgo] / aSpeedFactor[iMaxSpeedSlowAlgo] ); + } + printf("\n"); + } + else + { + // print columns in 1st line + if (printAsCSV && *pIsFirstFilterLen) + { + printf("\n# filterLen, filterOrder, Re/Cx, type, sym, "); + for ( yc = 0; yc < NUMY; ++yc ) + { + if (!aRunAlgo[yc] || procSmpPerSec[yc] <= 0.0) + continue; + if (printAsCSV) + printf("%s, ", convText[yc]); + } + *pIsFirstFilterLen = 0; + } + + for ( yc = 0; yc < NUMY; ++yc ) + { + if (!yc) + { + double filterExp = log10((double)FILTERLEN) / log10(2.0); + printf("\n%5d, %5.1f, %s, %s, %s, ", FILTERLEN, filterExp, + ((convFlags & PFFASTCONV_CPLX_INP_OUT)?"cplx":"real"), + (convFlags & PFFASTCONV_CPLX_INP_OUT)?((convFlags & PFFASTCONV_CPLX_SINGLE_FFT)?" single":" 2x") : "", + ((convFlags & PFFASTCONV_SYMMETRIC)?"symmetric":"non-sym") + ); + } + if (!aRunAlgo[yc] || procSmpPerSec[yc] <= 0.0) + continue; + if (printAsCSV) + printf("%.0f, ", procSmpPerSec[yc] * 0.001); + else + printf("algo '%s':\t%.2f MSmp\tin\t%.1f ms\t= %g kSmpPerSec\t%d iters\t%d loops\n", + convText[yc], (double)outN[yc]/(1000.0 * 1000.0), 1000.0 * aDuration[yc], procSmpPerSec[yc] * 0.001, aNumIters[yc], aNumLoops[yc] ); + } + } + + } + } + + + for ( yc = 1; yc < NUMY; ++yc ) + { + const float * Yref; + const float * Ycurr; + int outMin; + + if (!aRunAlgo[yc]) + continue; + + if (printDbg) + printf("\n"); + + if ( outN[yc] == 0 ) + { + if (!printAsCSV) + printf("output size 0: '%s' not implemented\n", convText[yc]); + } + else if ( outN[0] != outN[yc] /* && aFastAlgo[yc] */ && testOutLen ) + { + if (!iPrintedRefOutLen) + { + printf("reference output size = %" PRId64 ", delta to (cplx) input length = %" PRId64 " smp\n", outN[0], (len / cplxFactor) - outN[0]); + iPrintedRefOutLen = 1; + } + printf("output size doesn't match!: ref (FILTERLEN %d) returned %" PRId64 " smp, '%s' returned %" PRId64 " smp : delta = %" PRId64 " smp\n", + FILTERLEN, outN[0], convText[yc], outN[yc], outN[yc] - outN[0] ); + retErr = 1; + } + + posMaxErr = 0; + maxErr = -1.0; + Yref = Y[0]; + Ycurr = Y[yc]; + outMin = ( outN[yc] < outN[0] ) ? 
outN[yc] : outN[0]; + numErrOverLimit = 0; + for ( i = 0; i < outMin; ++i ) + { + if ( numErrOverLimit < 6 && fabs(Ycurr[i] - Yref[i]) >= yErrLimit && printErrValues ) + { + printf("algo '%s': at %d: ***ERROR*** = %f, errLimit = %f, ref = %f, actual = %f\n", + convText[yc], i, fabs(Ycurr[i] - Yref[i]), yErrLimit, Yref[i], Ycurr[i] ); + ++numErrOverLimit; + } + + if ( fabs(Ycurr[i] - Yref[i]) > maxErr ) + { + maxErr = fabsf(Ycurr[i] - Yref[i]); + posMaxErr = i; + } + } + + if ( printDbg || (iMaxSpeedSlowAlgo == i) || (iMaxSpeedFastAlgo == i) ) + printf("max difference for '%s' is %g at sample idx %d of max inp 4093-1 == %f %%\n", convText[yc], maxErr, posMaxErr, maxErr * 100.0 / 4092.0 ); + } + + break; + } + + pffastconv_free(X); + for ( i=0; i < NUMY; ++i) + { + if ( 1 || i < 2 ) + pffastconv_free( Y[i] ); + if (!aRunAlgo[i]) + continue; + aDestroy[i]( aSetupCfg[i] ); + } + + pffastconv_free(H); + + return retErr; +} + +/* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions */ +void validate_pffft_simd(); +int validate_pffft_simd_ex(FILE * DbgOut); + + +int main(int argc, char **argv) +{ + int result = 0; + int i, k, M, flagsA, flagsB, flagsC, testOutLen, printDbg, printSpeed; + int testOutLens = 1, benchConv = 1, quickTest = 0, slowTest = 0; + int testReal = 1, testCplx = 1, testSymetric = 0, abortFirstFastAlgo = 1, printErrValues = 0, printAsCSV = 1; + int isFirstFilterLen = 1; + + for ( i = 1; i < argc; ++i ) { + + if (!strcmp(argv[i], "--test-simd")) { + int numErrs = validate_pffft_simd_ex(stdout); + fprintf( ( numErrs != 0 ? stderr : stdout ), "validate_pffft_simd_ex() returned %d errors!\n", numErrs); + return ( numErrs > 0 ? 1 : 0 ); + } + + if (!strcmp(argv[i], "--no-len")) { + testOutLens = 0; + } + else if (!strcmp(argv[i], "--no-bench")) { + benchConv = 0; + } + else if (!strcmp(argv[i], "--quick")) { + quickTest = 1; + } + else if (!strcmp(argv[i], "--slow")) { + slowTest = 1; + } + else if (!strcmp(argv[i], "--real")) { + testCplx = 0; + } + else if (!strcmp(argv[i], "--cplx")) { + testReal = 0; + } + else if (!strcmp(argv[i], "--sym")) { + testSymetric = 1; + } + else /* if (!strcmp(argv[i], "--help")) */ { + printf("usage: %s [--test-simd] [--no-len] [--no-bench] [--quick|--slow] [--real|--cplx] [--sym]\n", argv[0]); + exit(1); + } + } + + + if (testOutLens) + { + for ( k = 0; k < 3; ++k ) + { + if ( (k == 0 && !testReal) || (k > 0 && !testCplx) ) + continue; + printf("\n\n==========\n"); + printf("testing %s %s output lengths ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) ); + printf("==========\n"); + flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT; + flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 ); + flagsC = flagsB | PFFASTCONV_CPLX_SINGLE_FFT; + testOutLen = 1; + printDbg = 0; + printSpeed = 0; + for ( M = 128 - 4; M <= (quickTest ? 128+16 : 256); ++M ) + { + if ( (M % 16) != 0 && testSymetric ) + continue; + result |= test(M, flagsB, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, 0, &isFirstFilterLen); + } + } + } + + if (benchConv) + { + printf("quickTest is %d\n", quickTest); + printf("slowTest is %d\n", slowTest); + + for ( k = 0; k < 3; ++k ) + { + if ( (k == 0 && !testReal) || (k > 0 && !testCplx) ) + continue; + if (!printAsCSV) + { + printf("\n\n==========\n"); + printf("starting %s %s benchmark against linear convolutions ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? 
"2x" : "single") ) ); + printf("==========\n"); + } + flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT; + flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 ); + flagsC = flagsB | ( k == 2 ? PFFASTCONV_CPLX_SINGLE_FFT : 0 ); + testOutLen = 0; + printDbg = 0; + printSpeed = 1; + if (!slowTest) { + if (!quickTest) { + result |= test(32, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen); + result |= test(32 + 16, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen); + } + result |= test(64, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen); + if (!quickTest) { + result |= test(64 + 32, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen); + result |= test(128, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen); + } + } + if (!quickTest) { + result |= test(128+ 64, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen); + result |= test(256, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen); + result |= test(256+128, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen); + result |= test(512, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen); + result |= test(1024, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen); + + result |= test(2048, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen); + result |= test(4096, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen); + result |= test(8192, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen); + result |= test(16384, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen); + result |= test(32768, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen); + } + if (printAsCSV) + printf("\n"); + } + } + + return result; +} + diff --git a/pffft/test_pffft.c b/pffft/test_pffft.c new file mode 100644 index 0000000..a86bdb4 --- /dev/null +++ b/pffft/test_pffft.c @@ -0,0 +1,371 @@ +/* + Copyright (c) 2013 Julien Pommier. + + Small test for PFFFT + + How to build: + + on linux, with fftw3: + gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm + + on macos, without fftw3: + clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate + + on macos, with fftw3: + clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate + + as alternative: replace clang by gcc. 
+ + on windows, with visual c++: + cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c + + build without SIMD instructions: + gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm + + */ + +#ifdef PFFFT_ENABLE_FLOAT +#include "pffft.h" + +typedef float pffft_scalar; +#else +/* +Note: adapted for double precision dynamic range version. +*/ +#include "pffft_double.h" + +typedef double pffft_scalar; +#endif + +#include +#include +#include +#include +#include +#include + +/* define own constants required to turn off g++ extensions .. */ +#ifndef M_PI + #define M_PI 3.14159265358979323846 /* pi */ +#endif + +/* EXPECTED_DYN_RANGE in dB: + * single precision float has 24 bits mantissa + * => 24 Bits * 6 dB = 144 dB + * allow a few dB tolerance (even 144 dB looks good on my PC) + */ +#ifdef PFFFT_ENABLE_FLOAT +#define EXPECTED_DYN_RANGE 140.0 +#else +#define EXPECTED_DYN_RANGE 215.0 +#endif + +/* maximum allowed phase error in degree */ +#define DEG_ERR_LIMIT 1E-4 + +/* maximum allowed magnitude error in amplitude (of 1.0 or 1.1) */ +#define MAG_ERR_LIMIT 1E-6 + + +#define PRINT_SPEC 0 + +#define PWR2LOG(PWR) ( (PWR) < 1E-30 ? 10.0*log10(1E-30) : 10.0*log10(PWR) ) + + + +int test(int N, int cplx, int useOrdered) { + int Nfloat = (cplx ? N*2 : N); +#ifdef PFFFT_ENABLE_FLOAT + pffft_scalar *X = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar)); + pffft_scalar *Y = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar)); + pffft_scalar *R = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar)); + pffft_scalar *Z = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar)); + pffft_scalar *W = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar)); +#else + pffft_scalar *X = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar)); + pffft_scalar *Y = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar)); + pffft_scalar *R = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar)); + pffft_scalar *Z = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar)); + pffft_scalar *W = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar)); +#endif + pffft_scalar amp = (pffft_scalar)1.0; + double freq, dPhi, phi, phi0; + double pwr, pwrCar, pwrOther, err, errSum, mag, expextedMag; + int k, j, m, iter, kmaxOther, retError = 0; + +#ifdef PFFFT_ENABLE_FLOAT + assert( pffft_is_power_of_two(N) ); + PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL); +#else + assert( pffftd_is_power_of_two(N) ); + PFFFTD_Setup *s = pffftd_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL); +#endif + assert(s); + if (!s) { + printf("Error setting up PFFFT!\n"); + return 1; + } + + for ( k = m = 0; k < (cplx? N : (1 + N/2) ); k += N/16, ++m ) + { + amp = (pffft_scalar)( ( (m % 3) == 0 ) ? 1.0 : 1.1 ); + freq = (k < N/2) ? ((double)k / N) : ((double)(k-N) / N); + dPhi = 2.0 * M_PI * freq; + if ( dPhi < 0.0 ) + dPhi += 2.0 * M_PI; + + iter = -1; + while (1) + { + ++iter; + + if (iter) + printf("bin %d: dphi = %f for freq %f\n", k, dPhi, freq); + + /* generate cosine carrier as time signal - start at defined phase phi0 */ + phi = phi0 = (m % 4) * 0.125 * M_PI; /* have phi0 < 90 deg to be normalized */ + for ( j = 0; j < N; ++j ) + { + if (cplx) { + X[2*j] = amp * (pffft_scalar)cos(phi); /* real part */ + X[2*j+1] = amp * (pffft_scalar)sin(phi); /* imag part */ + } + else + X[j] = amp * (pffft_scalar)cos(phi); /* only real part */ + + /* phase increment .. stay normalized - cos()/sin() might degrade! 
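+         (keeping phi wrapped around +/- pi prevents the accumulated phase,
+         and with it the generated cos()/sin() carrier, from slowly losing
+         precision as more and more increments are summed up)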
*/ + phi += dPhi; + if ( phi >= M_PI ) + phi -= 2.0 * M_PI; + } + + /* forward transform from X --> Y .. using work buffer W */ +#ifdef PFFFT_ENABLE_FLOAT + if ( useOrdered ) + pffft_transform_ordered(s, X, Y, W, PFFFT_FORWARD ); + else + { + pffft_transform(s, X, R, W, PFFFT_FORWARD ); /* use R for reordering */ + pffft_zreorder(s, R, Y, PFFFT_FORWARD ); /* reorder into Y[] for power calculations */ + } +#else + if ( useOrdered ) + pffftd_transform_ordered(s, X, Y, W, PFFFT_FORWARD ); + else + { + pffftd_transform(s, X, R, W, PFFFT_FORWARD ); /* use R for reordering */ + pffftd_zreorder(s, R, Y, PFFFT_FORWARD ); /* reorder into Y[] for power calculations */ + } +#endif + + pwrOther = -1.0; + pwrCar = 0; + + + /* for positive frequencies: 0 to 0.5 * samplerate */ + /* and also for negative frequencies: -0.5 * samplerate to 0 */ + for ( j = 0; j < ( cplx ? N : (1 + N/2) ); ++j ) + { + if (!cplx && !j) /* special treatment for DC for real input */ + pwr = Y[j]*Y[j]; + else if (!cplx && j == N/2) /* treat 0.5 * samplerate */ + pwr = Y[1] * Y[1]; /* despite j (for freq calculation) we have index 1 */ + else + pwr = Y[2*j] * Y[2*j] + Y[2*j+1] * Y[2*j+1]; + if (iter || PRINT_SPEC) + printf("%s fft %d: pwr[j = %d] = %g == %f dB\n", (cplx ? "cplx":"real"), N, j, pwr, PWR2LOG(pwr) ); + if (k == j) + pwrCar = pwr; + else if ( pwr > pwrOther ) { + pwrOther = pwr; + kmaxOther = j; + } + } + + if ( PWR2LOG(pwrCar) - PWR2LOG(pwrOther) < EXPECTED_DYN_RANGE ) { + printf("%s fft %d amp %f iter %d:\n", (cplx ? "cplx":"real"), N, amp, iter); + printf(" carrier power at bin %d: %g == %f dB\n", k, pwrCar, PWR2LOG(pwrCar) ); + printf(" carrier mag || at bin %d: %g\n", k, sqrt(pwrCar) ); + printf(" max other pwr at bin %d: %g == %f dB\n", kmaxOther, pwrOther, PWR2LOG(pwrOther) ); + printf(" dynamic range: %f dB\n\n", PWR2LOG(pwrCar) - PWR2LOG(pwrOther) ); + retError = 1; + if ( iter == 0 ) + continue; + } + + if ( k > 0 && k != N/2 ) + { + phi = atan2( Y[2*k+1], Y[2*k] ); + if ( fabs( phi - phi0) > DEG_ERR_LIMIT * M_PI / 180.0 ) + { + retError = 1; + printf("%s fft %d bin %d amp %f : phase mismatch! phase = %f deg expected = %f deg\n", + (cplx ? "cplx":"real"), N, k, amp, phi * 180.0 / M_PI, phi0 * 180.0 / M_PI ); + } + } + + expextedMag = cplx ? amp : ( (k == 0 || k == N/2) ? amp : (amp/2) ); + mag = sqrt(pwrCar) / N; + if ( fabs(mag - expextedMag) > MAG_ERR_LIMIT ) + { + retError = 1; + printf("%s fft %d bin %d amp %f : mag = %g expected = %g\n", (cplx ? "cplx":"real"), N, k, amp, mag, expextedMag ); + } + + + /* now convert spectrum back */ +#ifdef PFFFT_ENABLE_FLOAT + if (useOrdered) + pffft_transform_ordered(s, Y, Z, W, PFFFT_BACKWARD); + else + pffft_transform(s, R, Z, W, PFFFT_BACKWARD); +#else + if (useOrdered) + pffftd_transform_ordered(s, Y, Z, W, PFFFT_BACKWARD); + else + pffftd_transform(s, R, Z, W, PFFFT_BACKWARD); +#endif + + errSum = 0.0; + for ( j = 0; j < (cplx ? (2*N) : N); ++j ) + { + /* scale back */ + Z[j] /= N; + /* square sum errors over real (and imag parts) */ + err = (X[j]-Z[j]) * (X[j]-Z[j]); + errSum += err; + } + + if ( errSum > N * 1E-7 ) + { + retError = 1; + printf("%s fft %d bin %d : inverse FFT doesn't match original signal! errSum = %g ; mean err = %g\n", (cplx ? 
"cplx":"real"), N, k, errSum, errSum / N); + } + + break; + } + + } +#ifdef PFFFT_ENABLE_FLOAT + pffft_destroy_setup(s); + pffft_aligned_free(X); + pffft_aligned_free(Y); + pffft_aligned_free(Z); + pffft_aligned_free(R); + pffft_aligned_free(W); +#else + pffftd_destroy_setup(s); + pffftd_aligned_free(X); + pffftd_aligned_free(Y); + pffftd_aligned_free(Z); + pffftd_aligned_free(R); + pffftd_aligned_free(W); +#endif + + return retError; +} + +/* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions */ +void validate_pffft_simd(); +int validate_pffft_simd_ex(FILE * DbgOut); +void validate_pffftd_simd(); +int validate_pffftd_simd_ex(FILE * DbgOut); + + + +int main(int argc, char **argv) +{ + int N, result, resN, resAll, i, k, resNextPw2, resIsPw2, resFFT; + + int inp_power_of_two[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 511, 512, 513 }; + int ref_power_of_two[] = { 1, 2, 4, 4, 8, 8, 8, 8, 16, 512, 512, 1024 }; + + for ( i = 1; i < argc; ++i ) { + + if (!strcmp(argv[i], "--test-simd")) { +#ifdef PFFFT_ENABLE_FLOAT + int numErrs = validate_pffft_simd_ex(stdout); +#else + int numErrs = validate_pffftd_simd_ex(stdout); +#endif + fprintf( ( numErrs != 0 ? stderr : stdout ), "validate_pffft_simd_ex() returned %d errors!\n", numErrs); + return ( numErrs > 0 ? 1 : 0 ); + } + } + + resNextPw2 = 0; + resIsPw2 = 0; + for ( k = 0; k < (sizeof(inp_power_of_two)/sizeof(inp_power_of_two[0])); ++k) { +#ifdef PFFFT_ENABLE_FLOAT + N = pffft_next_power_of_two(inp_power_of_two[k]); +#else + N = pffftd_next_power_of_two(inp_power_of_two[k]); +#endif + if (N != ref_power_of_two[k]) { + resNextPw2 = 1; + printf("pffft_next_power_of_two(%d) does deliver %d, which is not reference result %d!\n", + inp_power_of_two[k], N, ref_power_of_two[k] ); + } + +#ifdef PFFFT_ENABLE_FLOAT + result = pffft_is_power_of_two(inp_power_of_two[k]); +#else + result = pffftd_is_power_of_two(inp_power_of_two[k]); +#endif + if (inp_power_of_two[k] == ref_power_of_two[k]) { + if (!result) { + resIsPw2 = 1; + printf("pffft_is_power_of_two(%d) delivers false; expected true!\n", inp_power_of_two[k]); + } + } else { + if (result) { + resIsPw2 = 1; + printf("pffft_is_power_of_two(%d) delivers true; expected false!\n", inp_power_of_two[k]); + } + } + } + if (!resNextPw2) + printf("tests for pffft_next_power_of_two() succeeded successfully.\n"); + if (!resIsPw2) + printf("tests for pffft_is_power_of_two() succeeded successfully.\n"); + + resFFT = 0; + for ( N = 32; N <= 65536; N *= 2 ) + { + result = test(N, 1 /* cplx fft */, 1 /* useOrdered */); + resN = result; + resFFT |= result; + + result = test(N, 0 /* cplx fft */, 1 /* useOrdered */); + resN |= result; + resFFT |= result; + + result = test(N, 1 /* cplx fft */, 0 /* useOrdered */); + resN |= result; + resFFT |= result; + + result = test(N, 0 /* cplx fft */, 0 /* useOrdered */); + resN |= result; + resFFT |= result; + + if (!resN) + printf("tests for size %d succeeded successfully.\n", N); + } + + if (!resFFT) { +#ifdef PFFFT_ENABLE_FLOAT + printf("all pffft transform tests (FORWARD/BACKWARD, REAL/COMPLEX, float) succeeded successfully.\n"); +#else + printf("all pffft transform tests (FORWARD/BACKWARD, REAL/COMPLEX, double) succeeded successfully.\n"); +#endif + } + + resAll = resNextPw2 | resIsPw2 | resFFT; + if (!resAll) + printf("all tests succeeded successfully.\n"); + else + printf("there are failed tests!\n"); + + return resAll; +} + diff --git a/pffft/test_pffft.cpp b/pffft/test_pffft.cpp new file mode 100644 index 0000000..d388563 --- /dev/null +++ 
b/pffft/test_pffft.cpp @@ -0,0 +1,377 @@ +/* + Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com ) + Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de ) + + Small test & bench for PFFFT, comparing its performance with the scalar + FFTPACK, FFTW, and Apple vDSP + + How to build: + + on linux, with fftw3: + gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c + test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm + + on macos, without fftw3: + clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c + -L/usr/local/lib -I/usr/local/include/ -framework Accelerate + + on macos, with fftw3: + clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c + test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f + -framework Accelerate + + as alternative: replace clang by gcc. + + on windows, with visual c++: + cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c + + build without SIMD instructions: + gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c + fftpack.c -lm + + */ + +#include "pffft.hpp" + +#include +#include +#include +#include +#include +#include + +/* define own constants required to turn off g++ extensions .. */ +#ifndef M_PI + #define M_PI 3.14159265358979323846 /* pi */ +#endif + +/* maximum allowed phase error in degree */ +#define DEG_ERR_LIMIT 1E-4 + +/* maximum allowed magnitude error in amplitude (of 1.0 or 1.1) */ +#define MAG_ERR_LIMIT 1E-6 + +#define PRINT_SPEC 0 + +#define PWR2LOG(PWR) ((PWR) < 1E-30 ? 10.0 * log10(1E-30) : 10.0 * log10(PWR)) + +template +bool +Ttest(int N, bool useOrdered) +{ + typedef pffft::Fft Fft; + typedef typename pffft::Fft::Scalar FftScalar; + typedef typename Fft::Complex FftComplex; + + const bool cplx = pffft::Fft::isComplexTransform(); + const double EXPECTED_DYN_RANGE = Fft::isDoubleScalar() ? 
215.0 : 140.0; + + assert(Fft::isPowerOfTwo(N)); + + Fft fft = Fft(N); // instantiate and prepareLength() for length N + +#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900) + + // possible ways to declare/instatiate aligned vectors with C++11 + // some lines require a typedef of above + auto X = fft.valueVector(); // for X = input vector + pffft::AlignedVector Y = fft.spectrumVector(); // for Y = forward(X) + pffft::AlignedVector R = fft.internalLayoutVector(); // for R = forwardInternalLayout(X) + pffft::AlignedVector Z = fft.valueVector(); // for Z = inverse(Y) = inverse( forward(X) ) + // or Z = inverseInternalLayout(R) +#else + + // possible ways to declare/instatiate aligned vectors with C++98 + pffft::AlignedVector X = fft.valueVector(); // for X = input vector + pffft::AlignedVector Y = fft.spectrumVector(); // for Y = forward(X) + pffft::AlignedVector R = fft.internalLayoutVector(); // for R = forwardInternalLayout(X) + pffft::AlignedVector Z = fft.valueVector(); // for Z = inverse(Y) = inverse( forward(X) ) + // or Z = inverseInternalLayout(R) +#endif + + // work with complex - without the capabilities of a higher c++ standard + FftScalar* Xs = reinterpret_cast(X.data()); // for X = input vector + FftScalar* Ys = reinterpret_cast(Y.data()); // for Y = forward(X) + FftScalar* Zs = reinterpret_cast(Z.data()); // for Z = inverse(Y) = inverse( forward(X) ) + + int k, j, m, iter, kmaxOther; + bool retError = false; + double freq, dPhi, phi, phi0; + double pwr, pwrCar, pwrOther, err, errSum, mag, expextedMag; + double amp = 1.0; + + for (k = m = 0; k < (cplx ? N : (1 + N / 2)); k += N / 16, ++m) { + amp = ((m % 3) == 0) ? 1.0F : 1.1F; + freq = (k < N / 2) ? ((double)k / N) : ((double)(k - N) / N); + dPhi = 2.0 * M_PI * freq; + if (dPhi < 0.0) + dPhi += 2.0 * M_PI; + + iter = -1; + while (1) { + ++iter; + + if (iter) + printf("bin %d: dphi = %f for freq %f\n", k, dPhi, freq); + + /* generate cosine carrier as time signal - start at defined phase phi0 */ + phi = phi0 = + (m % 4) * 0.125 * M_PI; /* have phi0 < 90 deg to be normalized */ + for (j = 0; j < N; ++j) { + if (cplx) { + Xs[2 * j] = (FftScalar)( amp * cos(phi) ); /* real part */ + Xs[2 * j + 1] = (FftScalar)( amp * sin(phi) ); /* imag part */ + } else + Xs[j] = (FftScalar)( amp * cos(phi) ); /* only real part */ + + /* phase increment .. stay normalized - cos()/sin() might degrade! */ + phi += dPhi; + if (phi >= M_PI) + phi -= 2.0 * M_PI; + } + + /* forward transform from X --> Y .. using work buffer W */ + if (useOrdered) + fft.forward(X, Y); + else { + fft.forwardToInternalLayout(X, R); /* use R for reordering */ + fft.reorderSpectrum(R, Y); /* have canonical order in Y[] for power calculations */ + } + + pwrOther = -1.0; + pwrCar = 0; + + /* for positive frequencies: 0 to 0.5 * samplerate */ + /* and also for negative frequencies: -0.5 * samplerate to 0 */ + for (j = 0; j < (cplx ? N : (1 + N / 2)); ++j) { + if (!cplx && !j) /* special treatment for DC for real input */ + pwr = Ys[j] * Ys[j]; + else if (!cplx && j == N / 2) /* treat 0.5 * samplerate */ + pwr = Ys[1] * + Ys[1]; /* despite j (for freq calculation) we have index 1 */ + else + pwr = Ys[2 * j] * Ys[2 * j] + Ys[2 * j + 1] * Ys[2 * j + 1]; + if (iter || PRINT_SPEC) + printf("%s fft %d: pwr[j = %d] = %g == %f dB\n", + (cplx ? 
"cplx" : "real"), + N, + j, + pwr, + PWR2LOG(pwr)); + if (k == j) + pwrCar = pwr; + else if (pwr > pwrOther) { + pwrOther = pwr; + kmaxOther = j; + } + } + + if (PWR2LOG(pwrCar) - PWR2LOG(pwrOther) < EXPECTED_DYN_RANGE) { + printf("%s fft %d amp %f iter %d:\n", + (cplx ? "cplx" : "real"), + N, + amp, + iter); + printf(" carrier power at bin %d: %g == %f dB\n", + k, + pwrCar, + PWR2LOG(pwrCar)); + printf(" carrier mag || at bin %d: %g\n", k, sqrt(pwrCar)); + printf(" max other pwr at bin %d: %g == %f dB\n", + kmaxOther, + pwrOther, + PWR2LOG(pwrOther)); + printf(" dynamic range: %f dB\n\n", + PWR2LOG(pwrCar) - PWR2LOG(pwrOther)); + retError = true; + if (iter == 0) + continue; + } + + if (k > 0 && k != N / 2) { + phi = atan2(Ys[2 * k + 1], Ys[2 * k]); + if (fabs(phi - phi0) > DEG_ERR_LIMIT * M_PI / 180.0) { + retError = true; + printf("%s fft %d bin %d amp %f : phase mismatch! phase = %f deg " + "expected = %f deg\n", + (cplx ? "cplx" : "real"), + N, + k, + amp, + phi * 180.0 / M_PI, + phi0 * 180.0 / M_PI); + } + } + + expextedMag = cplx ? amp : ((k == 0 || k == N / 2) ? amp : (amp / 2)); + mag = sqrt(pwrCar) / N; + if (fabs(mag - expextedMag) > MAG_ERR_LIMIT) { + retError = true; + printf("%s fft %d bin %d amp %f : mag = %g expected = %g\n", + (cplx ? "cplx" : "real"), + N, + k, + amp, + mag, + expextedMag); + } + + /* now convert spectrum back */ + if (useOrdered) + fft.inverse(Y, Z); + else + fft.inverseFromInternalLayout(R, Z); /* inverse() from internal Layout */ + + errSum = 0.0; + for (j = 0; j < (cplx ? (2 * N) : N); ++j) { + /* scale back */ + Zs[j] /= N; + /* square sum errors over real (and imag parts) */ + err = (Xs[j] - Zs[j]) * (Xs[j] - Zs[j]); + errSum += err; + } + + if (errSum > N * 1E-7) { + retError = true; + printf("%s fft %d bin %d : inverse FFT doesn't match original signal! " + "errSum = %g ; mean err = %g\n", + (cplx ? "cplx" : "real"), + N, + k, + errSum, + errSum / N); + } + + break; + } + } + + // using the std::vector<> base classes .. 
no need for alignedFree() for X, Y, Z and R + + return retError; +} + +bool +test(int N, bool useComplex, bool useOrdered) +{ + if (useComplex) { + return +#ifdef PFFFT_ENABLE_FLOAT + Ttest< std::complex >(N, useOrdered) +#endif +#if defined(PFFFT_ENABLE_FLOAT) && defined(PFFFT_ENABLE_DOUBLE) + && +#endif +#ifdef PFFFT_ENABLE_DOUBLE + Ttest< std::complex >(N, useOrdered) +#endif + ; + } else { + return +#ifdef PFFFT_ENABLE_FLOAT + Ttest(N, useOrdered) +#endif +#if defined(PFFFT_ENABLE_FLOAT) && defined(PFFFT_ENABLE_DOUBLE) + && +#endif +#ifdef PFFFT_ENABLE_DOUBLE + Ttest(N, useOrdered) +#endif + ; + } +} + +int +main(int argc, char** argv) +{ + int N, result, resN, resAll, k, resNextPw2, resIsPw2, resFFT; + + int inp_power_of_two[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 511, 512, 513 }; + int ref_power_of_two[] = { 1, 2, 4, 4, 8, 8, 8, 8, 16, 512, 512, 1024 }; + + resNextPw2 = 0; + resIsPw2 = 0; + for (k = 0; k < (sizeof(inp_power_of_two) / sizeof(inp_power_of_two[0])); + ++k) { +#ifdef PFFFT_ENABLE_FLOAT + N = pffft::Fft::nextPowerOfTwo(inp_power_of_two[k]); +#else + N = pffft::Fft::nextPowerOfTwo(inp_power_of_two[k]); +#endif + if (N != ref_power_of_two[k]) { + resNextPw2 = 1; + printf("pffft_next_power_of_two(%d) does deliver %d, which is not " + "reference result %d!\n", + inp_power_of_two[k], + N, + ref_power_of_two[k]); + } + +#ifdef PFFFT_ENABLE_FLOAT + result = pffft::Fft::isPowerOfTwo(inp_power_of_two[k]); +#else + result = pffft::Fft::isPowerOfTwo(inp_power_of_two[k]); +#endif + if (inp_power_of_two[k] == ref_power_of_two[k]) { + if (!result) { + resIsPw2 = 1; + printf("pffft_is_power_of_two(%d) delivers false; expected true!\n", + inp_power_of_two[k]); + } + } else { + if (result) { + resIsPw2 = 1; + printf("pffft_is_power_of_two(%d) delivers true; expected false!\n", + inp_power_of_two[k]); + } + } + } + if (!resNextPw2) + printf("tests for pffft_next_power_of_two() succeeded successfully.\n"); + if (!resIsPw2) + printf("tests for pffft_is_power_of_two() succeeded successfully.\n"); + + resFFT = 0; + for (N = 32; N <= 65536; N *= 2) { + result = test(N, 1 /* cplx fft */, 1 /* useOrdered */); + resN = result; + resFFT |= result; + + result = test(N, 0 /* cplx fft */, 1 /* useOrdered */); + resN |= result; + resFFT |= result; + + result = test(N, 1 /* cplx fft */, 0 /* useOrdered */); + resN |= result; + resFFT |= result; + + result = test(N, 0 /* cplx fft */, 0 /* useOrdered */); + resN |= result; + resFFT |= result; + + if (!resN) + printf("tests for size %d succeeded successfully.\n", N); + } + + if (!resFFT) + printf("all pffft transform tests (FORWARD/BACKWARD, REAL/COMPLEX, " +#ifdef PFFFT_ENABLE_FLOAT + "float" +#endif +#if defined(PFFFT_ENABLE_FLOAT) && defined(PFFFT_ENABLE_DOUBLE) + "/" +#endif +#ifdef PFFFT_ENABLE_DOUBLE + "double" +#endif + ") succeeded successfully.\n"); + + resAll = resNextPw2 | resIsPw2 | resFFT; + if (!resAll) + printf("all tests succeeded successfully.\n"); + else + printf("there are failed tests!\n"); + + return resAll; +} diff --git a/pffft/uninstall.cmake b/pffft/uninstall.cmake new file mode 100644 index 0000000..290d1f1 --- /dev/null +++ b/pffft/uninstall.cmake @@ -0,0 +1,24 @@ +set(MANIFEST "${CMAKE_CURRENT_BINARY_DIR}/install_manifest.txt") + +if(NOT EXISTS ${MANIFEST}) + message(FATAL_ERROR "Cannot find install manifest: '${MANIFEST}'") +endif() + +file(STRINGS ${MANIFEST} files) +foreach(file ${files}) + if(EXISTS ${file}) + message(STATUS "Removing file: '${file}'") + + exec_program( + ${CMAKE_COMMAND} ARGS "-E remove ${file}" + OUTPUT_VARIABLE 
stdout + RETURN_VALUE result + ) + + if(NOT "${result}" STREQUAL 0) + message(FATAL_ERROR "Failed to remove file: '${file}'.") + endif() + else() + MESSAGE(STATUS "File '${file}' does not exist.") + endif() +endforeach(file) diff --git a/pffft/use_gcc8.inc b/pffft/use_gcc8.inc new file mode 100644 index 0000000..c4535f1 --- /dev/null +++ b/pffft/use_gcc8.inc @@ -0,0 +1,2 @@ +export GCC_WITH_CMAKE=$(which gcc-8) +export GPP_WITH_CMAKE=$(which g++-8)
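
Usage notes (a sketch under assumptions, not taken from the patch itself): use_gcc8.inc
only exports the two compiler variables, so it is presumably meant to be sourced before
configuring, with the values handed to CMake; uninstall.cmake deletes whatever
install_manifest.txt lists and is typically registered by the top-level CMakeLists.txt
as a custom 'uninstall' target. Under those assumptions, a run could look like:

    . pffft/use_gcc8.inc
    cmake -S pffft -B build -DCMAKE_C_COMPILER="$GCC_WITH_CMAKE" -DCMAKE_CXX_COMPILER="$GPP_WITH_CMAKE"
    cmake --build build --target install
    cmake --build build --target uninstall    # runs uninstall.cmake on install_manifest.txt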