add pffft
This commit is contained in:
279
pffft/.github/workflows/c-cpp.yml
vendored
Normal file
279
pffft/.github/workflows/c-cpp.yml
vendored
Normal file
@@ -0,0 +1,279 @@
|
|||||||
|
# Workflow identity, triggers and shared environment for the pffft CI builds.
name: C/C++ CI

# Run on pushes to, and pull requests against, the two tracked branches.
on:
  push:
    branches: [master, github_actions]
  pull_request:
    branches: [master, github_actions]

env:
  # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
  # Every job reads this as $BUILD_TYPE in its cmake invocations.
  BUILD_TYPE: Release
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
# Linux build of all precision/SIMD variants with the MIPP headers installed
# under $HOME/.local before pffft itself is configured.
build_w_mipp_ubuntu-amd64:
  runs-on: ubuntu-latest

  steps:
    # Pinned to a release tag instead of the moving `master` ref so that runs
    # are reproducible.
    - name: check out MIPP
      uses: actions/checkout@v4
      with:
        repository: hayguen/MIPP
        path: ./MIPP
    - name: cmake configure MIPP
      run: cmake -S MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$HOME/.local
    - name: cmake install MIPP headers
      run: cmake --build MIPP_build --target install && ls -alh $HOME/.local/ && ls -alh $HOME/.local/include/

    - uses: actions/checkout@v4
    - name: cmake_make_simd_float_double
      run: mkdir build_simd_full && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_full
    - name: cmake_make_simd_float
      run: mkdir build_simd_float && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_DOUBLE=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_float
    - name: cmake_make_simd_double
      run: mkdir build_simd_double && cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_double
    - name: cmake_make_no-simd_float_double
      run: mkdir build_no-simd_full && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_full
    - name: cmake_make_no-simd_scalar_float_double
      run: mkdir build_no-simd_scalar_full && cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_scalar_full
    # Bundle the build trees, leaving out CMake bookkeeping files.
    - name: compress
      run: tar zcvf pffft_w_mipp_ubuntu-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_simd_full build_simd_float build_simd_double build_no-simd_full build_no-simd_scalar_full
    # upload-artifact v4 requires a unique artifact name per upload within a
    # run, so this job gets its own name rather than sharing
    # `pffft_ubuntu_builds` with the non-MIPP Ubuntu job.
    - name: 'Upload Artifact'
      uses: actions/upload-artifact@v4
      with:
        name: pffft_w_mipp_ubuntu_builds
        path: pffft_w_mipp_ubuntu-amd64.tar.gz
|
||||||
|
|
||||||
|
# Linux build of all precision/SIMD variants without MIPP installed.
build_ubuntu-amd64:
  runs-on: ubuntu-latest

  steps:
    - uses: actions/checkout@v4
    - name: cmake_make_simd_float_double
      run: mkdir build_simd_full && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_full
    - name: cmake_make_simd_float
      run: mkdir build_simd_float && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_DOUBLE=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_float
    - name: cmake_make_simd_double
      run: mkdir build_simd_double && cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_double
    - name: cmake_make_no-simd_float_double
      run: mkdir build_no-simd_full && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_full
    - name: cmake_make_no-simd_scalar_float_double
      run: mkdir build_no-simd_scalar_full && cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_scalar_full
    # Bundle the build trees, leaving out CMake bookkeeping files.
    - name: compress
      run: tar zcvf pffft_ubuntu-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_simd_full build_simd_float build_simd_double build_no-simd_full build_no-simd_scalar_full
    # Moved off the retired v2 release of upload-artifact.
    # NOTE(review): v4 needs artifact names to be unique within a run; make
    # sure no other job still uploads under `pffft_ubuntu_builds`.
    - name: 'Upload Artifact'
      uses: actions/upload-artifact@v4
      with:
        name: pffft_ubuntu_builds
        path: pffft_ubuntu-amd64.tar.gz
|
||||||
|
|
||||||
|
# Cross-compiles 32- and 64-bit Windows binaries on Linux with MinGW-w64,
# using the repository's cross_build_mingw32.sh / cross_build_mingw64.sh.
cross_build_win_from_linux:
  # NOTE(review): confirm this runner image is still offered by GitHub.
  runs-on: ubuntu-20.04

  steps:
    - name: prerequisites
      run: sudo apt -qq update && sudo apt -yqq install gcc-mingw-w64 g++-mingw-w64

    # Pinned to a release tag instead of the moving `master` ref.
    - name: check out MIPP
      uses: actions/checkout@v4
      with:
        repository: hayguen/MIPP
        path: ./MIPP
    # MIPP is installed into ${{runner.workspace}} (the parent of the
    # checkout directory); the simd_full steps below read it back via $X.
    # The source path uses $GITHUB_WORKSPACE rather than a literal "pffft/"
    # so the job also works when the repository has a different name.
    - name: cmake configure MIPP
      working-directory: ${{runner.workspace}}
      run: cmake -S $GITHUB_WORKSPACE/MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$(pwd)
    - name: cmake install MIPP headers
      working-directory: ${{runner.workspace}}
      run: cmake --build MIPP_build --target install

    - uses: actions/checkout@v4
    - name: build_w32_no-simd
      working-directory: ${{runner.workspace}}
      run: cd $GITHUB_WORKSPACE && bash ./cross_build_mingw32.sh no-simd -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
    - name: build_w32_simd_full
      working-directory: ${{runner.workspace}}
      run: X=$(pwd) && cd $GITHUB_WORKSPACE && bash ./cross_build_mingw32.sh simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=pentium4 -DTARGET_C_ARCH=pentium4 -DMIPP_INCLUDE_DIRS=$X/include/mipp

    - name: build_w64_no-simd
      working-directory: ${{runner.workspace}}
      run: cd $GITHUB_WORKSPACE && bash ./cross_build_mingw64.sh no-simd -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
    - name: build_w64_simd_full
      working-directory: ${{runner.workspace}}
      run: X=$(pwd) && cd $GITHUB_WORKSPACE && bash ./cross_build_mingw64.sh simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=sandybridge -DTARGET_C_ARCH=sandybridge -DMIPP_INCLUDE_DIRS=$X/include/mipp

    # Bundle the four build trees, leaving out CMake bookkeeping files.
    - name: compress
      run: tar zcvf pffft_cross-build-windows-from-linux-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_w32_no-simd build_w32_simd_full build_w64_no-simd build_w64_simd_full
    # Moved off the retired v2 release of upload-artifact.
    - name: 'Upload Artifact'
      uses: actions/upload-artifact@v4
      with:
        name: pffft_windows_from_cross_builds
        path: pffft_cross-build-windows-from-linux-amd64.tar.gz
|
||||||
|
|
||||||
|
|
||||||
|
# Native Windows builds with the Visual Studio generator: one configure/build
# pair per target architecture setting (none, SSE2, AVX, AVX2).
# All build trees live in ${{runner.workspace}}, the parent of the checkout.
build_win_msvc:
  # The CMake configure and build commands are platform agnostic and should work equally
  # well on Windows or Mac. You can convert this to a matrix build if you need
  # cross-platform coverage.
  # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
  # NOTE(review): confirm this runner image (and the matching
  # "Visual Studio 16 2019" generator used below) is still offered by GitHub.
  runs-on: windows-2019

  steps:
    # Pinned to a release tag instead of the moving `master` ref.
    - name: check out MIPP
      uses: actions/checkout@v4
      with:
        repository: hayguen/MIPP
        path: ./MIPP
    # The source path uses $GITHUB_WORKSPACE rather than a literal "pffft/"
    # so the job also works when the repository has a different name.
    - name: cmake configure MIPP
      shell: bash
      working-directory: ${{runner.workspace}}
      run: cmake -S $GITHUB_WORKSPACE/MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$(pwd)
    - name: cmake install MIPP headers
      working-directory: ${{runner.workspace}}
      run: cmake --build MIPP_build --target install

    - uses: actions/checkout@v4

    - name: Configure CMake No-SIMD
      shell: bash
      working-directory: ${{runner.workspace}}
      run: cmake -S $GITHUB_WORKSPACE -B build_no-simd -G "Visual Studio 16 2019" -A x64 -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DENABLE_PFDSP=ON -DPFFFT_USE_SIMD=OFF -DTARGET_CXX_ARCH=none -DTARGET_C_ARCH=none
    - name: Build No-SIMD
      shell: bash
      working-directory: ${{runner.workspace}}
      # Execute the build. You can specify a specific target with "--target <NAME>"
      run: cmake --build build_no-simd --config $BUILD_TYPE

    - name: Configure CMake SSE2
      shell: bash
      working-directory: ${{runner.workspace}}
      run: cmake -S $GITHUB_WORKSPACE -B build_sse2 -G "Visual Studio 16 2019" -A x64 -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DENABLE_PFDSP=ON -DTARGET_CXX_ARCH=SSE2 -DTARGET_C_ARCH=SSE2 -DMIPP_INCLUDE_DIRS=$(pwd)/include/mipp
    - name: Build SSE2
      shell: bash
      working-directory: ${{runner.workspace}}
      # Execute the build. You can specify a specific target with "--target <NAME>"
      run: cmake --build build_sse2 --config $BUILD_TYPE

    - name: Configure CMake AVX
      # Use a bash shell so we can use the same syntax for environment variable
      # access regardless of the host operating system
      shell: bash
      working-directory: ${{runner.workspace}}
      run: cmake -S $GITHUB_WORKSPACE -B build_avx -G "Visual Studio 16 2019" -A x64 -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DENABLE_PFDSP=ON -DTARGET_CXX_ARCH=AVX -DTARGET_C_ARCH=AVX -DMIPP_INCLUDE_DIRS=$(pwd)/include/mipp
    - name: Build AVX
      working-directory: ${{runner.workspace}}
      shell: bash
      # Execute the build. You can specify a specific target with "--target <NAME>"
      run: cmake --build build_avx --config $BUILD_TYPE

    - name: Configure CMake AVX2
      # Use a bash shell so we can use the same syntax for environment variable
      # access regardless of the host operating system
      shell: bash
      working-directory: ${{runner.workspace}}
      run: cmake -S $GITHUB_WORKSPACE -B build_avx2 -G "Visual Studio 16 2019" -A x64 -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DENABLE_PFDSP=ON -DTARGET_CXX_ARCH=AVX2 -DTARGET_C_ARCH=AVX2 -DMIPP_INCLUDE_DIRS=$(pwd)/include/mipp
    - name: Build AVX2
      working-directory: ${{runner.workspace}}
      shell: bash
      # Execute the build. You can specify a specific target with "--target <NAME>"
      run: cmake --build build_avx2 --config $BUILD_TYPE

    # Bundle the four build trees, leaving out CMake bookkeeping files.
    - name: compress
      working-directory: ${{runner.workspace}}
      run: tar zcvf pffft_windows-msvc-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_no-simd build_sse2 build_avx build_avx2
    # Moved off the retired v2 release of upload-artifact.
    - name: 'Upload Artifact'
      uses: actions/upload-artifact@v4
      with:
        name: pffft_windows_msvc_builds
        path: ${{runner.workspace}}/pffft_windows-msvc-amd64.tar.gz
|
||||||
|
|
||||||
|
|
||||||
|
# Native Windows build inside an MSYS2 MINGW64 environment with gcc.
build_win_mingw:
  # NOTE(review): confirm this runner image is still offered by GitHub.
  runs-on: windows-2019
  strategy:
    matrix:
      # NOTE(review): these matrix values are declared but not referenced by
      # any step below (msystem and CC are hard-coded) - confirm intent.
      compiler: [gcc]
      msystem: [MINGW64]
  defaults:
    run:
      # Every `run:` step executes inside the MSYS2 shell set up below.
      shell: msys2 {0}
  steps:
    - uses: actions/checkout@v4
    - uses: msys2/setup-msys2@v2
      with:
        msystem: MINGW64
        install: gcc cmake make
    - name: Configure cmake
      run: CC=gcc cmake -DMINGW=ON -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native -S . -B build_mgw64
    - name: Build
      run: cmake --build build_mgw64

    # Bundle the build tree, leaving out CMake bookkeeping files.
    - name: compress
      run: tar zcvf pffft_windows-mingw-amd64.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_mgw64
    # Moved off the retired v2 release of upload-artifact.
    - name: 'Upload Artifact'
      uses: actions/upload-artifact@v4
      with:
        name: pffft_windows_mingw_builds
        path: pffft_windows-mingw-amd64.tar.gz
|
||||||
|
|
||||||
|
|
||||||
|
# macOS build of all precision/SIMD variants without MIPP installed.
build_macos11:
  # copied from build_ubuntu-amd64 with minor renaming
  # NOTE(review): confirm this runner image is still offered by GitHub.
  runs-on: macos-11

  steps:
    - uses: actions/checkout@v4
    - name: cmake_make_simd_float_double
      run: mkdir build_simd_full && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_full
    - name: cmake_make_simd_float
      run: mkdir build_simd_float && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_DOUBLE=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_float
    - name: cmake_make_simd_double
      run: mkdir build_simd_double && cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_double
    - name: cmake_make_no-simd_float_double
      run: mkdir build_no-simd_full && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_full
    - name: cmake_make_no-simd_scalar_float_double
      run: mkdir build_no-simd_scalar_full && cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_scalar_full
    # Bundle the build trees, leaving out CMake bookkeeping files.
    - name: compress
      run: tar zcvf pffft_macos-11.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_simd_full build_simd_float build_simd_double build_no-simd_full build_no-simd_scalar_full
    # Moved off the retired v2 release of upload-artifact.
    # NOTE(review): v4 needs artifact names to be unique within a run; make
    # sure no other job still uploads under `pffft_macos_builds`.
    - name: 'Upload Artifact'
      uses: actions/upload-artifact@v4
      with:
        name: pffft_macos_builds
        path: pffft_macos-11.tar.gz
|
||||||
|
|
||||||
|
# macOS build of all precision/SIMD variants with the MIPP headers installed
# under $HOME/.local before pffft itself is configured.
build_w_mipp_macos11:
  # copied from build_w_mipp_ubuntu-amd64 with minor renaming
  # NOTE(review): confirm this runner image is still offered by GitHub.
  runs-on: macos-11

  steps:
    # Pinned to a release tag instead of the moving `master` ref.
    - name: check out MIPP
      uses: actions/checkout@v4
      with:
        repository: hayguen/MIPP
        path: ./MIPP
    - name: cmake configure MIPP
      run: cmake -S MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$HOME/.local
    - name: cmake install MIPP headers
      run: cmake --build MIPP_build --target install && ls -alh $HOME/.local/ && ls -alh $HOME/.local/include/

    - uses: actions/checkout@v4
    - name: cmake_make_simd_float_double
      run: mkdir build_simd_full && cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_full
    - name: cmake_make_simd_float
      run: mkdir build_simd_float && cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_DOUBLE=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_float
    - name: cmake_make_simd_double
      run: mkdir build_simd_double && cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_simd_double
    - name: cmake_make_no-simd_float_double
      run: mkdir build_no-simd_full && cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_full
    - name: cmake_make_no-simd_scalar_float_double
      run: mkdir build_no-simd_scalar_full && cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native && cmake --build build_no-simd_scalar_full
    # Bundle the build trees, leaving out CMake bookkeeping files.
    - name: compress
      run: tar zcvf pffft_w_mipp_macos-11.tar.gz --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt build_simd_full build_simd_float build_simd_double build_no-simd_full build_no-simd_scalar_full
    # upload-artifact v4 requires a unique artifact name per upload within a
    # run, so this job gets its own name rather than sharing
    # `pffft_macos_builds` with the non-MIPP macOS job.
    - name: 'Upload Artifact'
      uses: actions/upload-artifact@v4
      with:
        name: pffft_w_mipp_macos_builds
        path: pffft_w_mipp_macos-11.tar.gz
|
||||||
4
pffft/.gitignore
vendored
Normal file
4
pffft/.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
build
|
||||||
|
build_benches
|
||||||
|
build_*
|
||||||
|
.vscode
|
||||||
9
pffft/.gitmodules
vendored
Normal file
9
pffft/.gitmodules
vendored
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
[submodule "greenffts"]
|
||||||
|
path = greenffts
|
||||||
|
url = https://github.com/hayguen/greenffts.git
|
||||||
|
[submodule "kissfft"]
|
||||||
|
path = kissfft
|
||||||
|
url = https://github.com/hayguen/kissfft.git
|
||||||
|
[submodule "pocketfft"]
|
||||||
|
path = pocketfft
|
||||||
|
url = https://github.com/hayguen/pocketfft.git
|
||||||
663
pffft/CMakeLists.txt
Normal file
663
pffft/CMakeLists.txt
Normal file
@@ -0,0 +1,663 @@
|
|||||||
|
# This file relies on CMAKE_C_STANDARD / CMAKE_CXX_STANDARD (CMake >= 3.1), so
# the previously declared minimum of 2.8 was never accurate. 3.5 is the oldest
# policy level that CMake 4.x still accepts; the "...3.29" upper bound opts in
# to newer policy defaults where available and is ignored by CMake < 3.12.
cmake_minimum_required(VERSION 3.5...3.29)
project(PRETTY_FAST_FFT)
|
||||||
|
|
||||||
|
# ---- User-configurable build switches ---------------------------------------

# Precision selection: switching one off gives a smaller library.
option(PFFFT_USE_TYPE_FLOAT
  "activate single precision 'float'?"
  ON)
option(PFFFT_USE_TYPE_DOUBLE
  "activate 'double' precision float?"
  ON)

# Architecture / optimization switches.
option(PFFFT_USE_SIMD
  "use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? - "
  ON)
option(PFFFT_USE_SCALAR_VECT
  "use 4-element vector scalar operations (if no other SIMD)"
  ON)

# Which libraries (and their headers) take part in `install`.
option(INSTALL_PFFFT
  "install pffft to CMAKE_INSTALL_PREFIX?"
  ON)
option(INSTALL_PFDSP
  "install pfdsp to CMAKE_INSTALL_PREFIX?"
  OFF)
option(INSTALL_PFFASTCONV
  "install pffastconv to CMAKE_INSTALL_PREFIX?"
  OFF)

# Reference FFT implementations for the benchmark / validation programs.
option(PFFFT_USE_BENCH_FFTW
  "use (system-installed) FFTW3 in fft benchmark?"
  OFF)
option(PFFFT_USE_BENCH_GREEN
  "use Green FFT in fft benchmark? - if exists in subdir"
  ON)
option(PFFFT_USE_BENCH_KISS
  "use KissFFT in fft benchmark? - if exists in subdir"
  ON)
option(PFFFT_USE_BENCH_POCKET
  "use PocketFFT in fft benchmark? - if exists in subdir"
  ON)
option(PFFFT_USE_BENCH_MKL
  "use Intel MKL in fft benchmark? needs to be installed"
  OFF)
option(PFFFT_USE_FFTPACK
  "compile and use FFTPACK in fft benchmark & validation?"
  ON)

# Debugging aid: adds -fsanitize=address to the library targets below.
option(PFFFT_USE_DEBUG_ASAN
  "use GCC's address sanitizer?"
  OFF)

# Toolchain workaround: skip linking the math library `m`.
option(PFFFT_DISABLE_LINK_WITH_M
  "Disables linking with m library to build with clangCL from MSVC"
  OFF)
|
||||||
|
|
||||||
|
# Project-wide language levels, compiler extensions switched off.
#
# C: C90 would need the gcc extensions for function attributes such as
# always_inline; C99 provides them without any extension.
set(CMAKE_C_EXTENSIONS OFF)
set(CMAKE_C_STANDARD 99)

# C++: strict C++98 as the default (individual targets may raise it).
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_STANDARD 98)
|
||||||
|
|
||||||
|
# Accumulators for the install() rules; the target sections append to them.
set(INSTALL_HEADERS "")
set(INSTALL_TARGETS "")

# Building with neither precision enabled makes no sense: stop right away.
if (NOT (PFFFT_USE_TYPE_FLOAT OR PFFFT_USE_TYPE_DOUBLE))
  message(FATAL_ERROR "activate at least one of PFFFT_USE_TYPE_FLOAT or PFFFT_USE_TYPE_DOUBLE")
endif()
|
||||||
|
|
||||||
|
# Make the project's own cmake/ directory searchable and pull in the helper
# scripts for architecture flags and compiler warnings.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
include(cmake/target_optimizations.cmake)
include(cmake/compiler_warnings.cmake)

# Optional packages; neither is REQUIRED, so configuration continues if absent.
find_package(PAPI)
find_package(MIPP)

# Report the outcome of the MIPP lookup.
if (NOT MIPP_FOUND)
  message(STATUS "NOT found MIPP")
else()
  message(STATUS "found MIPP")
endif()
|
||||||
|
|
||||||
|
|
||||||
|
# Library name handed to target_link_libraries() when the address sanitizer is
# requested; empty (links nothing extra) otherwise.
set(ASANLIB "")
if (PFFFT_USE_DEBUG_ASAN)
  set(ASANLIB "asan")
endif()
|
||||||
|
|
||||||
|
# Purely informational: log the detected compilers and platform flavour.
message(STATUS "INFO: CMAKE_C_COMPILER_ID is ${CMAKE_C_COMPILER_ID}")
message(STATUS "INFO: CMAKE_CXX_COMPILER_ID is ${CMAKE_CXX_COMPILER_ID}")

if (NOT WIN32)
  message(STATUS "INFO: NOT WIN32")
else()
  message(STATUS "INFO: detected WIN32")
endif()

if (NOT MINGW)
  message(STATUS "INFO: NOT MINGW")
else()
  message(STATUS "INFO: detected MINGW with compiler ${CMAKE_C_COMPILER_ID}")
endif()

if (CMAKE_C_COMPILER_ID MATCHES "MSVC")
  message(STATUS "INFO: detected MSVC with compiler ${CMAKE_C_COMPILER_ID}")
endif()
|
||||||
|
|
||||||
|
|
||||||
|
# Green FFT benchmark reference: added only when the greenffts checkout is
# present; PATH_GREEN is set only in that case.
if (PFFFT_USE_BENCH_GREEN)
  if (NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/greenffts/CMakeLists.txt")
    message(WARNING "GreenFFT not found in subdir greenffts")
  else()
    message(STATUS "found subdir greenffts")
    set(PATH_GREEN "${CMAKE_CURRENT_LIST_DIR}/greenffts")
    add_subdirectory("${PATH_GREEN}")
  endif()
endif()
|
||||||
|
|
||||||
|
# KissFFT benchmark reference: added only when the kissfft checkout is present
# (obtain it with: git submodule add https://github.com/hayguen/kissfft.git).
# PATH_KISS is set only in that case.
if (PFFFT_USE_BENCH_KISS)
  if (NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/kissfft/CMakeLists.txt")
    message(WARNING "KissFFT not found in subdir kissfft")
  else()
    message(STATUS "found subdir kissfft")
    set(PATH_KISS "${CMAKE_CURRENT_LIST_DIR}/kissfft")
    add_subdirectory("${PATH_KISS}")
  endif()
endif()
|
||||||
|
|
||||||
|
# PocketFFT benchmark reference: added only when the pocketfft checkout is
# present (obtain it with: git submodule add https://github.com/hayguen/pocketfft.git).
# Presence is detected via pocketfft_double.c; PATH_POCKET is set only then.
if (PFFFT_USE_BENCH_POCKET)
  if (NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/pocketfft/pocketfft_double.c")
    message(WARNING "PocketFFT not found in subdir pocketfft")
  else()
    message(STATUS "found subdir pocketfft")
    set(PATH_POCKET "${CMAKE_CURRENT_LIST_DIR}/pocketfft")
    add_subdirectory("${PATH_POCKET}")
  endif()
endif()
|
||||||
|
|
||||||
|
|
||||||
|
########################################################################
# select the release build type by default to get optimization flags
########################################################################
# Only single-config generators (Makefiles, Ninja) read CMAKE_BUILD_TYPE.
# Multi-config generators (Visual Studio, Xcode) populate
# CMAKE_CONFIGURATION_TYPES and choose the configuration at build time, so
# forcing a build type there would only be misleading.
if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
  set(CMAKE_BUILD_TYPE "Release")
  message(STATUS "Build type not specified: defaulting to release.")
endif()
|
||||||
|
|
||||||
|
# Decide whether the math library `m` is linked.
# MATHLIB is initialised first so that every path leaves it well defined:
# previously it stayed untouched when PFFFT_DISABLE_LINK_WITH_M was ON, which
# let a MATHLIB value from an enclosing project or the cache reach the
# target_link_libraries() calls below.
set(MATHLIB "")

if (CMAKE_C_COMPILER_ID MATCHES "MSVC")
  # using Visual Studio C++
  message(STATUS "INFO: detected MSVC: will not link math lib m")

  # Silence the CRT "unsafe function" deprecation warnings.
  add_definitions("/D_CRT_SECURE_NO_WARNINGS")

  set(MSVC_DISABLED_WARNINGS_LIST
    "C4996"
  )
elseif (NOT PFFFT_DISABLE_LINK_WITH_M)
  message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m")
  set(MATHLIB "m")
endif()
|
||||||
|
|
||||||
|
# Name of the C++ runtime library to link explicitly: "stdc++" under MinGW,
# empty everywhere else.
if (MINGW)
  set(STDCXXLIB "stdc++")
else()
  set(STDCXXLIB "")
endif()
|
||||||
|
|
||||||
|
|
||||||
|
# Per-precision SIMD abstraction headers.
set(SIMD_FLOAT_HDRS
  simd/pf_float.h
  simd/pf_sse1_float.h
  simd/pf_altivec_float.h
  simd/pf_neon_float.h
  simd/pf_scalar_float.h
)
set(SIMD_DOUBLE_HDRS
  simd/pf_double.h
  simd/pf_avx_double.h
  simd/pf_scalar_double.h
)

# Single-precision sources: left unset when 'float' support is disabled.
set(FLOAT_SOURCES)
if (PFFFT_USE_TYPE_FLOAT)
  set(FLOAT_SOURCES pffft.c pffft.h ${SIMD_FLOAT_HDRS})
  if (INSTALL_PFFFT)
    list(APPEND INSTALL_HEADERS pffft.h)
  endif()
endif()

# Double-precision sources: left unset when 'double' support is disabled.
set(DOUBLE_SOURCES)
if (PFFFT_USE_TYPE_DOUBLE)
  set(DOUBLE_SOURCES pffft_double.c pffft_double.h ${SIMD_DOUBLE_HDRS})
  if (INSTALL_PFFFT)
    list(APPEND INSTALL_HEADERS pffft_double.h)
  endif()
endif()
|
||||||
|
|
||||||
|
######################################################
|
||||||
|
|
||||||
|
# PFFFT: the core FFT static library (output file name "pffft"), built from
# whichever precision source lists are enabled plus the shared sources.
add_library(PFFFT STATIC
  ${FLOAT_SOURCES}
  ${DOUBLE_SOURCES}
  pffft_common.c
  pffft_priv_impl.h
  pffft.hpp
)
set_target_properties(PFFFT PROPERTIES OUTPUT_NAME "pffft")
target_compile_definitions(PFFFT PRIVATE _USE_MATH_DEFINES)
target_activate_c_compiler_warnings(PFFFT)

if (PFFFT_USE_SCALAR_VECT)
  target_compile_definitions(PFFFT PRIVATE PFFFT_SCALVEC_ENABLED=1)
endif()

if (PFFFT_USE_DEBUG_ASAN)
  target_compile_options(PFFFT PRIVATE "-fsanitize=address")
endif()

# Architecture flags are applied regardless of PFFFT_USE_SIMD; turning SIMD
# off only adds the PFFFT_SIMD_DISABLE definition.
target_set_c_arch_flags(PFFFT)
if (NOT PFFFT_USE_SIMD)
  target_compile_definitions(PFFFT PRIVATE PFFFT_SIMD_DISABLE=1)
endif()

target_link_libraries(PFFFT ${ASANLIB} ${MATHLIB})

# Consumers inside the build tree find the headers in this source directory.
target_include_directories(PFFFT INTERFACE
  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
)

if (INSTALL_PFFFT)
  list(APPEND INSTALL_TARGETS PFFFT)
  list(APPEND INSTALL_HEADERS pffft.hpp)
endif()
|
||||||
|
|
||||||
|
######################################################
|
||||||
|
|
||||||
|
# PFDSP: C++11 static library (output file name "pfdsp") with the mixer,
# carrier and CIC sources. Only built when 'float' support is enabled.
if (PFFFT_USE_TYPE_FLOAT)
  add_library(PFDSP STATIC
    pf_mixer.cpp
    pf_mixer.h
    pf_cplx.h
    pf_carrier.cpp
    pf_carrier.h
    pf_cic.cpp
    pf_cic.h
    fmv.h
  )
  # This target overrides the project-wide C++98 default.
  set_target_properties(PFDSP PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED ON
    OUTPUT_NAME "pfdsp"
  )
  target_compile_definitions(PFDSP PRIVATE _USE_MATH_DEFINES)
  target_activate_cxx_compiler_warnings(PFDSP)

  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(PFDSP PRIVATE "-fsanitize=address")
  endif()

  # Without SIMD: no architecture flags, and the code is told to avoid SIMD.
  if (NOT PFFFT_USE_SIMD)
    target_compile_definitions(PFDSP PRIVATE PFFFT_SIMD_DISABLE=1)
  else()
    target_set_cxx_arch_flags(PFDSP)
  endif()

  target_link_libraries(PFDSP ${MATHLIB})

  # Consumers inside the build tree find the headers in this source directory.
  target_include_directories(PFDSP INTERFACE
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
  )

  if (INSTALL_PFDSP)
    list(APPEND INSTALL_TARGETS PFDSP)
    list(APPEND INSTALL_HEADERS pf_mixer.h pf_cplx.h pf_carrier.h pf_cic.h)
  endif()
endif()
|
||||||
|
|
||||||
|
######################################################
|
||||||
|
|
||||||
|
# FFTPACK: fftpack.c is compiled four times - as a float and a double static
# library, and as the two self-test executables selected via TESTING_FFTPACK.
if (PFFFT_USE_FFTPACK)

  # --- library, float / single precision ---
  add_library(FFTPACK_FLOAT STATIC fftpack.c fftpack.h)
  target_compile_definitions(FFTPACK_FLOAT PRIVATE _USE_MATH_DEFINES)
  target_activate_c_compiler_warnings(FFTPACK_FLOAT)
  target_link_libraries(FFTPACK_FLOAT ${MATHLIB})
  target_include_directories(FFTPACK_FLOAT INTERFACE
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
  )

  # --- library, double precision ---
  # FFTPACK_DOUBLE_PRECISION is PUBLIC, so it is also applied to targets that
  # link against this library.
  add_library(FFTPACK_DOUBLE STATIC fftpack.c fftpack.h)
  target_compile_definitions(FFTPACK_DOUBLE PRIVATE _USE_MATH_DEFINES)
  target_compile_definitions(FFTPACK_DOUBLE PUBLIC FFTPACK_DOUBLE_PRECISION)
  target_activate_c_compiler_warnings(FFTPACK_DOUBLE)
  target_link_libraries(FFTPACK_DOUBLE ${MATHLIB})
  target_include_directories(FFTPACK_DOUBLE INTERFACE
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
  )

  # --- builtin test program of fftpack, float ---
  add_executable(test_fftpack_float fftpack.c fftpack.h)
  target_compile_definitions(test_fftpack_float
    PRIVATE _USE_MATH_DEFINES TESTING_FFTPACK
  )
  target_link_libraries(test_fftpack_float ${MATHLIB})

  # --- builtin test program of fftpack, double ---
  add_executable(test_fftpack_double fftpack.c fftpack.h)
  target_compile_definitions(test_fftpack_double
    PRIVATE _USE_MATH_DEFINES FFTPACK_DOUBLE_PRECISION TESTING_FFTPACK
  )
  target_link_libraries(test_fftpack_double ${MATHLIB})

endif()
|
||||||
|
|
||||||
|
######################################################
|
||||||
|
|
||||||
|
# PFFASTCONV: static library (output file name "pffastconv") layered on PFFFT.
# only 'float' supported in PFFASTCONV
if (PFFFT_USE_TYPE_FLOAT)
  add_library(PFFASTCONV STATIC
    pffastconv.c
    pffastconv.h
    pffft.h
  )
  set_target_properties(PFFASTCONV PROPERTIES OUTPUT_NAME "pffastconv")
  target_compile_definitions(PFFASTCONV PRIVATE _USE_MATH_DEFINES)
  target_activate_c_compiler_warnings(PFFASTCONV)

  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(PFFASTCONV PRIVATE "-fsanitize=address")
  endif()

  target_link_libraries(PFFASTCONV PFFFT ${ASANLIB} ${MATHLIB})

  # Consumers inside the build tree find the headers in this source directory.
  target_include_directories(PFFASTCONV INTERFACE
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
  )

  if (INSTALL_PFFASTCONV)
    list(APPEND INSTALL_TARGETS PFFASTCONV)
    list(APPEND INSTALL_HEADERS pffastconv.h)
  endif()
endif()
|
||||||
|
|
||||||
|
|
||||||
|
######################################################
|
||||||
|
|
||||||
|
# Install everything collected in INSTALL_TARGETS / INSTALL_HEADERS.
# Both destinations are relative paths, i.e. relative to the install prefix.
install(
  TARGETS ${INSTALL_TARGETS}
  DESTINATION lib
)
install(
  FILES ${INSTALL_HEADERS}
  DESTINATION include
)
|
||||||
|
|
||||||
|
# Convenience target `uninstall`: runs the uninstall.cmake script in script mode.
#
# - The script is located relative to this CMakeLists.txt
#   (CMAKE_CURRENT_SOURCE_DIR). CMAKE_SOURCE_DIR would point to the top-level
#   project and miss the script when pffft is added with add_subdirectory().
# - Target names are global: skip the definition if an enclosing project
#   already provides its own `uninstall` target, instead of failing to configure.
if (NOT TARGET uninstall)
  add_custom_target(uninstall
    COMMAND "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_SOURCE_DIR}/uninstall.cmake"
    VERBATIM
  )
endif()
|
||||||
|
|
||||||
|
#######################################################
|
||||||
|
|
||||||
|
if (PFFFT_USE_TYPE_FLOAT)
  # test_pffft.c compiled with PFFFT_ENABLE_FLOAT
  add_executable(test_pffft_float test_pffft.c)
  target_compile_definitions(test_pffft_float PRIVATE
    _USE_MATH_DEFINES
    PFFFT_ENABLE_FLOAT
  )
  target_link_libraries(test_pffft_float PFFFT ${ASANLIB})
endif()
|
||||||
|
|
||||||
|
######################################################
|
||||||
|
|
||||||
|
if (PFFFT_USE_TYPE_DOUBLE)
  # test_pffft.c compiled with PFFFT_ENABLE_DOUBLE
  add_executable(test_pffft_double test_pffft.c)
  target_compile_definitions(test_pffft_double PRIVATE
    _USE_MATH_DEFINES
    PFFFT_ENABLE_DOUBLE
  )
  target_link_libraries(test_pffft_double PFFFT ${ASANLIB})
endif()
|
||||||
|
|
||||||
|
######################################################
|
||||||
|
|
||||||
|
# test_fft_factors is always built; the PFFFT_ENABLE_* definitions
# mirror the precisions enabled for this build.
add_executable(test_fft_factors test_fft_factors.c)

if (PFFFT_USE_TYPE_FLOAT)
  target_compile_definitions(test_fft_factors PRIVATE PFFFT_ENABLE_FLOAT)
endif()

if (PFFFT_USE_TYPE_DOUBLE)
  target_compile_definitions(test_fft_factors PRIVATE PFFFT_ENABLE_DOUBLE)
endif()

target_link_libraries(test_fft_factors
  PFFFT
  ${ASANLIB}
  ${MATHLIB}
)
|
||||||
|
|
||||||
|
######################################################
|
||||||
|
|
||||||
|
# C++ test program (test_pffft.cpp), built without requesting a C++ standard
add_executable(test_pffft_cpp test_pffft.cpp)
target_compile_definitions(test_pffft_cpp PRIVATE _USE_MATH_DEFINES)

# enable the same precisions as configured for the library
if (PFFFT_USE_TYPE_FLOAT)
  target_compile_definitions(test_pffft_cpp PRIVATE PFFFT_ENABLE_FLOAT)
endif()

if (PFFFT_USE_TYPE_DOUBLE)
  target_compile_definitions(test_pffft_cpp PRIVATE PFFFT_ENABLE_DOUBLE)
endif()

target_link_libraries(test_pffft_cpp
  PFFFT
  ${STDCXXLIB}
  ${ASANLIB}
)
|
||||||
|
|
||||||
|
######################################################
|
||||||
|
|
||||||
|
# Same source as test_pffft_cpp, but compiled with C++11 explicitly required
add_executable(test_pffft_cpp_11 test_pffft.cpp)
target_compile_definitions(test_pffft_cpp_11 PRIVATE _USE_MATH_DEFINES)

# enable the same precisions as configured for the library
if (PFFFT_USE_TYPE_FLOAT)
  target_compile_definitions(test_pffft_cpp_11 PRIVATE PFFFT_ENABLE_FLOAT)
endif()

if (PFFFT_USE_TYPE_DOUBLE)
  target_compile_definitions(test_pffft_cpp_11 PRIVATE PFFFT_ENABLE_DOUBLE)
endif()

target_link_libraries(test_pffft_cpp_11
  PFFFT
  ${STDCXXLIB}
  ${ASANLIB}
)

set_target_properties(test_pffft_cpp_11 PROPERTIES
  CXX_STANDARD 11
  CXX_STANDARD_REQUIRED ON
)
|
||||||
|
|
||||||
|
######################################################
|
||||||
|
|
||||||
|
if (PFFFT_USE_TYPE_FLOAT)
  # Test/benchmark program for the fast convolution library (float only)
  add_executable(test_pffastconv
    test_pffastconv.c
    ${SIMD_FLOAT_HDRS}
    ${SIMD_DOUBLE_HDRS}
  )
  target_compile_definitions(test_pffastconv PRIVATE _USE_MATH_DEFINES)

  # optional AddressSanitizer instrumentation
  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(test_pffastconv PRIVATE "-fsanitize=address")
  endif()

  target_set_c_arch_flags(test_pffastconv)

  # without SIMD support the program is told so via PFFFT_SIMD_DISABLE
  if (NOT PFFFT_USE_SIMD)
    target_compile_definitions(test_pffastconv PRIVATE PFFFT_SIMD_DISABLE=1)
  endif()

  target_link_libraries(test_pffastconv PFFASTCONV ${ASANLIB} ${MATHLIB})
endif()
|
||||||
|
|
||||||
|
######################################################
|
||||||
|
|
||||||
|
if (PFFFT_USE_TYPE_FLOAT)
  # Single precision FFT benchmark. Each optional competitor library adds a
  # HAVE_<lib>=1 definition and is linked into the benchmark.
  add_executable(bench_pffft_float bench_pffft.c pffft.h)
  target_compile_definitions(bench_pffft_float PRIVATE
    _USE_MATH_DEFINES
    PFFFT_ENABLE_FLOAT
  )

  # optional AddressSanitizer instrumentation
  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(bench_pffft_float PRIVATE "-fsanitize=address")
  endif()

  target_link_libraries(bench_pffft_float PFFFT ${ASANLIB})

  if (PFFFT_USE_FFTPACK)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_FFTPACK=1)
    target_link_libraries(bench_pffft_float FFTPACK_FLOAT)
  endif()

  if (PFFFT_USE_BENCH_FFTW)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_FFTW=1)
    target_link_libraries(bench_pffft_float fftw3f)
  endif()

  if (PATH_GREEN AND PFFFT_USE_BENCH_GREEN)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_GREEN_FFTS=1)
    target_link_libraries(bench_pffft_float GreenFFT)
  endif()

  if (PATH_KISS AND PFFFT_USE_BENCH_KISS)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_KISS_FFT=1)
    target_link_libraries(bench_pffft_float KissFFT)
  endif()

  if (PATH_POCKET AND PFFFT_USE_BENCH_POCKET)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_POCKET_FFT=1)
    target_link_libraries(bench_pffft_float PocketFFT)
  endif()

  if (PFFFT_USE_BENCH_MKL)
    # "i686" and "x86_64" have chances to work; other PROCESSORs could be
    # "ppc", "ppc64", "arm", "aarch64", "armv7l" - or something else?!
    if (NOT ((CMAKE_SYSTEM_PROCESSOR STREQUAL "i686") OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")))
      message(WARNING "using Intel MKL on '${CMAKE_SYSTEM_PROCESSOR}' might fail.")
    endif()
    message(STATUS "In case compiling/linking with Intel MKL fails, check CMakeLists.txt or deactivate PFFFT_USE_BENCH_MKL")
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_MKL=1)
    target_link_libraries(bench_pffft_float mkl_intel_lp64 mkl_sequential -lmkl_core)
  endif()
endif()
|
||||||
|
|
||||||
|
if (PFFFT_USE_TYPE_DOUBLE)
  # Double precision FFT benchmark. Each optional competitor library adds a
  # HAVE_<lib>=1 definition and is linked into the benchmark.
  add_executable(bench_pffft_double bench_pffft.c pffft.h)
  target_compile_definitions(bench_pffft_double PRIVATE
    _USE_MATH_DEFINES
    PFFFT_ENABLE_DOUBLE
  )

  # optional AddressSanitizer instrumentation
  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(bench_pffft_double PRIVATE "-fsanitize=address")
  endif()

  target_link_libraries(bench_pffft_double PFFFT ${ASANLIB})

  if (PFFFT_USE_FFTPACK)
    target_compile_definitions(bench_pffft_double PRIVATE HAVE_FFTPACK=1)
    target_link_libraries(bench_pffft_double FFTPACK_DOUBLE)
  endif()

  if (PFFFT_USE_BENCH_FFTW)
    target_compile_definitions(bench_pffft_double PRIVATE HAVE_FFTW=1)
    target_link_libraries(bench_pffft_double fftw3)
  endif()

  if (PATH_POCKET AND PFFFT_USE_BENCH_POCKET)
    target_compile_definitions(bench_pffft_double PRIVATE HAVE_POCKET_FFT=1)
    target_link_libraries(bench_pffft_double PocketFFT)
  endif()

  if (PFFFT_USE_BENCH_MKL)
    # "i686" and "x86_64" have chances to work; other PROCESSORs could be
    # "ppc", "ppc64", "arm", "aarch64", "armv7l" - or something else?!
    if (NOT ((CMAKE_SYSTEM_PROCESSOR STREQUAL "i686") OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")))
      message(WARNING "using Intel MKL on '${CMAKE_SYSTEM_PROCESSOR}' might fail.")
    endif()
    message(STATUS "In case compiling/linking with Intel MKL fails, check CMakeLists.txt or deactivate PFFFT_USE_BENCH_MKL")
    target_compile_definitions(bench_pffft_double PRIVATE HAVE_MKL=1)
    target_link_libraries(bench_pffft_double mkl_intel_lp64 mkl_sequential -lmkl_core)
  endif()
endif()
|
||||||
|
|
||||||
|
######################################################
|
||||||
|
|
||||||
|
if (PFFFT_USE_TYPE_FLOAT)
|
||||||
|
|
||||||
|
# Benchmark for the mixer functions, linked against PFDSP
add_executable(bench_pf_mixer_float bench_mixers.cpp papi_perf_counter.h)
target_compile_definitions(bench_pf_mixer_float PRIVATE
  _USE_MATH_DEFINES
  PFFFT_ENABLE_FLOAT
)
target_link_libraries(bench_pf_mixer_float ${ASANLIB})

# optional AddressSanitizer instrumentation
if (PFFFT_USE_DEBUG_ASAN)
  target_compile_options(bench_pf_mixer_float PRIVATE "-fsanitize=address")
endif()

# optional PAPI support, announced to the sources via HAVE_PAPI
if (PAPI_FOUND)
  target_compile_definitions(bench_pf_mixer_float PRIVATE HAVE_PAPI=1)
  target_link_libraries(bench_pf_mixer_float ${PAPI_LIBRARIES})
endif()

# stdc++ is added to the link line for the GNU C++ compiler only
target_link_libraries(bench_pf_mixer_float
  PFDSP
  $<$<CXX_COMPILER_ID:GNU>:stdc++>
)
|
||||||
|
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
|
# pf_conv.cpp is compiled several times; CONV_ARCH_POST carries the name of
# the respective variant into the sources.

# variant 'none': compiled with MIPP_NO_INTRINSICS=1
add_library(pf_conv_arch_none pf_conv.cpp pf_conv.h pf_cplx.h)
target_compile_definitions(pf_conv_arch_none PRIVATE
  CONV_ARCH_POST=none
  MIPP_NO_INTRINSICS=1
)
set_target_properties(pf_conv_arch_none PROPERTIES
  CXX_STANDARD 11
  CXX_STANDARD_REQUIRED ON
)
target_activate_cxx_compiler_warnings(pf_conv_arch_none)

# dispatcher library; all pf_conv_arch_* variants get linked into it
add_library(pf_conv_dispatcher pf_conv_dispatcher.cpp pf_conv_dispatcher.h pf_conv.h pf_cplx.h)
set_target_properties(pf_conv_dispatcher PROPERTIES
  CXX_STANDARD 11
  CXX_STANDARD_REQUIRED ON
)
target_activate_cxx_compiler_warnings(pf_conv_dispatcher)

# variant 'dflt': compiled with the flags set by target_set_cxx_arch_flags()
add_library(pf_conv_arch_dflt pf_conv.cpp pf_conv.h pf_cplx.h)
target_compile_definitions(pf_conv_arch_dflt PRIVATE CONV_ARCH_POST=dflt)
set_target_properties(pf_conv_arch_dflt PROPERTIES
  CXX_STANDARD 11
  CXX_STANDARD_REQUIRED ON
)
target_activate_cxx_compiler_warnings(pf_conv_arch_dflt)
target_set_cxx_arch_flags(pf_conv_arch_dflt)

target_link_libraries(pf_conv_dispatcher
  pf_conv_arch_none
  pf_conv_arch_dflt
)
|
||||||
|
|
||||||
|
# Select the architecture specific variants of pf_conv to build.
#
# Result:
#   PF_CONV_ARCHES           list of variant names (empty: no specific variants)
#   PF_CONV_OPT_<variant>    architecture option of the variant  (emulates a map)
#   PF_CONV_EXTRA_<variant>  additional option of the variant    (emulates a map)
# In addition, pf_conv_dispatcher gets a CONV_ARCH_<compiler>_<processor>
# definition matching the detected processor/compiler combination.
#
# Every branch assigns PF_CONV_ARCHES, so the loops further below never
# iterate over a value left behind from elsewhere.
if ((CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64"))

  if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
    set(PF_CONV_ARCHES "sse3;sse4;avx;avx2")
    set(PF_CONV_OPT_sse3 "core2")  # emulate a map
    set(PF_CONV_OPT_sse4 "nehalem")
    set(PF_CONV_OPT_avx  "sandybridge")
    set(PF_CONV_OPT_avx2 "haswell")
    target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_GCC_AMD64)
  elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
    set(PF_CONV_ARCHES "sse2;avx;avx2")
    set(PF_CONV_OPT_sse2 "SSE2")  # emulate a map
    set(PF_CONV_OPT_avx  "AVX")
    set(PF_CONV_OPT_avx2 "AVX2")
    target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_MSVC_AMD64)
  else()
    set(PF_CONV_ARCHES "")
    message(WARNING "unknown compiler ${CMAKE_CXX_COMPILER_ID} on CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
  endif()

elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")

  if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
    set(PF_CONV_ARCHES "armv8a")
    set(PF_CONV_OPT_armv8a "armv8-a")  # emulate a map for arch

    target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_GCC_AARCH64)
  else()
    set(PF_CONV_ARCHES "")
    message(WARNING "unknown compiler ${CMAKE_CXX_COMPILER_ID} on CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
  endif()

elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "armv7l")

  if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
    set(PF_CONV_ARCHES "neon_vfpv4;neon_rpi3_a53;neon_rpi4_a72")
    set(PF_CONV_OPT_neon_vfpv4      "armv7-a")        # emulate a map for arch
    set(PF_CONV_EXTRA_neon_vfpv4    "neon_vfpv4")     # emulate a map for additional options (EXTRA)
    set(PF_CONV_OPT_neon_rpi3_a53   "armv7-a")
    set(PF_CONV_EXTRA_neon_rpi3_a53 "neon_rpi3_a53")
    set(PF_CONV_OPT_neon_rpi4_a72   "armv7-a")
    set(PF_CONV_EXTRA_neon_rpi4_a72 "neon_rpi4_a72")

    target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_GCC_ARM32NEON)
  else()
    set(PF_CONV_ARCHES "")
    message(WARNING "unknown compiler ${CMAKE_CXX_COMPILER_ID} on CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
  endif()

else()
  # no known processor: explicitly build no architecture specific variants,
  # consistent with the 'unknown compiler' branches above
  set(PF_CONV_ARCHES "")
  message(WARNING "this is unforseen CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
endif()
|
||||||
|
|
||||||
|
# One more build of pf_conv.cpp per entry of PF_CONV_ARCHES; each variant is
# named by CONV_ARCH_POST and linked into pf_conv_dispatcher.
foreach (arch_opt ${PF_CONV_ARCHES})
  add_library(pf_conv_arch_${arch_opt} pf_conv.cpp pf_conv.h pf_cplx.h)
  set_target_properties(pf_conv_arch_${arch_opt} PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED ON
  )
  target_activate_cxx_compiler_warnings(pf_conv_arch_${arch_opt})
  target_compile_definitions(pf_conv_arch_${arch_opt} PRIVATE CONV_ARCH_POST=${arch_opt})

  # NOTE(review): the OPT value is passed twice (2nd and 4th argument);
  # presumably one is for GCC/Clang and one for MSVC - confirm against the
  # definition of target_set_cxx_arch_option().
  target_set_cxx_arch_option(pf_conv_arch_${arch_opt}
    "${PF_CONV_OPT_${arch_opt}}"
    "${PF_CONV_EXTRA_${arch_opt}}"
    "${PF_CONV_OPT_${arch_opt}}"
  )
  target_link_libraries(pf_conv_dispatcher pf_conv_arch_${arch_opt})
  message(STATUS "added library pf_conv_arch_${arch_opt} with CONV_ARCH_POST=${arch_opt}")
endforeach()
|
||||||
|
|
||||||
|
# AddressSanitizer instrumentation for all pf_conv libraries.
# Every library that ends up in pf_conv_dispatcher is instrumented - including
# pf_conv_arch_dflt, which was left out before and therefore ran unchecked.
if (PFFFT_USE_DEBUG_ASAN)
  foreach (arch_opt ${PF_CONV_ARCHES})
    target_compile_options(pf_conv_arch_${arch_opt} PRIVATE "-fsanitize=address")
    target_link_libraries(pf_conv_arch_${arch_opt} ${ASANLIB})
  endforeach()

  target_compile_options(pf_conv_arch_none PRIVATE "-fsanitize=address")
  target_link_libraries(pf_conv_arch_none ${ASANLIB})

  target_compile_options(pf_conv_arch_dflt PRIVATE "-fsanitize=address")
  target_link_libraries(pf_conv_arch_dflt ${ASANLIB})

  target_compile_options(pf_conv_dispatcher PRIVATE "-fsanitize=address")
  target_link_libraries(pf_conv_dispatcher ${ASANLIB})
endif()
|
||||||
|
|
||||||
|
# Link the pf_conv variants against MIPP when it was found: first all
# architecture specific variants, then the 'none' variant.
# NOTE(review): pf_conv_arch_dflt is not linked against MIPP here - confirm
# whether that is intended.
if (MIPP_FOUND)
  foreach (mipp_variant ${PF_CONV_ARCHES} none)
    message(STATUS "link pf_conv_arch_${mipp_variant} against MIPP")
    target_link_libraries(pf_conv_arch_${mipp_variant} MIPP)
  endforeach()
endif()
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
|
# Benchmark for the pf_conv variants, linked against the dispatcher and PFDSP
add_executable(bench_pf_conv_float bench_conv.cpp papi_perf_counter.h)
set_target_properties(bench_pf_conv_float PROPERTIES
  CXX_STANDARD 11
  CXX_STANDARD_REQUIRED ON
)
target_compile_definitions(bench_pf_conv_float PRIVATE
  _USE_MATH_DEFINES
  PFFFT_ENABLE_FLOAT
)

# optional AddressSanitizer instrumentation
if (PFFFT_USE_DEBUG_ASAN)
  target_compile_options(bench_pf_conv_float PRIVATE "-fsanitize=address")
endif()
target_link_libraries(bench_pf_conv_float ${ASANLIB})

# optional PAPI support, announced to the sources via HAVE_PAPI
if (PAPI_FOUND)
  target_compile_definitions(bench_pf_conv_float PRIVATE HAVE_PAPI=1)
  target_link_libraries(bench_pf_conv_float ${PAPI_LIBRARIES})
endif()

if (MIPP_FOUND)
  target_link_libraries(bench_pf_conv_float MIPP)
endif()

# stdc++ is added to the link line for the GNU C++ compiler only
target_link_libraries(bench_pf_conv_float
  pf_conv_dispatcher
  PFDSP
  $<$<CXX_COMPILER_ID:GNU>:stdc++>
)
|
||||||
|
|
||||||
|
endif()
|
||||||
|
|
||||||
|
######################################################
|
||||||
|
|
||||||
|
# process the CMakeLists.txt of the 'examples' subdirectory as part of this build
add_subdirectory(examples)
|
||||||
|
|
||||||
|
######################################################
|
||||||
|
|
||||||
|
# CTest registration.
#
# Executables are referenced with $<TARGET_FILE:...> instead of a path below
# CMAKE_CURRENT_BINARY_DIR: multi-config generators (Visual Studio, Xcode,
# Ninja Multi-Config) place the binaries into per-configuration
# subdirectories, so a fixed path does not exist there (`ctest -C Release`).
# Every test is registered under the same condition that creates its target.
enable_testing()

add_test(NAME test_fft_factors
  COMMAND $<TARGET_FILE:test_fft_factors>
  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)

if (PFFFT_USE_FFTPACK)
  add_test(NAME test_fftpack_float
    COMMAND $<TARGET_FILE:test_fftpack_float>
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )

  add_test(NAME test_fftpack_double
    COMMAND $<TARGET_FILE:test_fftpack_double>
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )
endif()

if (PFFFT_USE_TYPE_FLOAT)

  # short benchmark runs: power-of-two and non-power-of-two lengths
  add_test(NAME bench_pffft_pow2
    COMMAND $<TARGET_FILE:bench_pffft_float> "--max-len" "128" "--quick"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )

  add_test(NAME bench_pffft_non2
    COMMAND $<TARGET_FILE:bench_pffft_float> "--non-pow2" "--max-len" "192" "--quick"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )

  # add_test(NAME bench_plots
  #   COMMAND bash "-c" "${CMAKE_CURRENT_SOURCE_DIR}/plots.sh"
  #   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  # )

  # fast convolution: runs with "--no-bench", with and without "--sym"
  add_test(NAME test_pfconv_lens_symetric
    COMMAND $<TARGET_FILE:test_pffastconv> "--no-bench" "--quick" "--sym"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )

  add_test(NAME test_pfconv_lens_non_sym
    COMMAND $<TARGET_FILE:test_pffastconv> "--no-bench" "--quick"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )

  # fast convolution: runs with "--no-len", with and without "--sym"
  add_test(NAME bench_pfconv_symetric
    COMMAND $<TARGET_FILE:test_pffastconv> "--no-len" "--quick" "--sym"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )

  add_test(NAME bench_pfconv_non_sym
    COMMAND $<TARGET_FILE:test_pffastconv> "--no-len" "--quick"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )

endif()
|
||||||
|
|
||||||
38
pffft/LICENSE.txt
Normal file
38
pffft/LICENSE.txt
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
|
||||||
|
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
|
||||||
|
Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de )
|
||||||
|
Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||||
|
|
||||||
|
Copyright (c) 2004 the University Corporation for Atmospheric
|
||||||
|
Research ("UCAR"). All rights reserved. Developed by NCAR's
|
||||||
|
Computational and Information Systems Laboratory, UCAR,
|
||||||
|
www.cisl.ucar.edu.
|
||||||
|
|
||||||
|
Redistribution and use of the Software in source and binary forms,
|
||||||
|
with or without modification, is permitted provided that the
|
||||||
|
following conditions are met:
|
||||||
|
|
||||||
|
- Neither the names of NCAR's Computational and Information Systems
|
||||||
|
Laboratory, the University Corporation for Atmospheric Research,
|
||||||
|
nor the names of its sponsors or contributors may be used to
|
||||||
|
endorse or promote products derived from this Software without
|
||||||
|
specific prior written permission.
|
||||||
|
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notices, this list of conditions, and the disclaimer below.
|
||||||
|
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer below in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||||
|
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||||
|
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||||
|
SOFTWARE.
|
||||||
|
|
||||||
352
pffft/README.md
Normal file
352
pffft/README.md
Normal file
@@ -0,0 +1,352 @@
|
|||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# PFFFT: a pretty fast FFT and fast convolution with PFFASTCONV
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
<!-- toc -->
|
||||||
|
|
||||||
|
- [Brief Description](#brief-description)
|
||||||
|
- [Why does it exist?](#why-does-it-exist)
|
||||||
|
- [CMake](#cmake)
|
||||||
|
- [History / Origin / Changes](#history--origin--changes)
|
||||||
|
- [Comparison with other FFTs](#comparison-with-other-ffts)
|
||||||
|
- [Dependencies / Required Linux packages](#dependencies--required-linux-packages)
|
||||||
|
- [Benchmarks and results](#benchmarks-and-results)
|
||||||
|
|
||||||
|
<!-- tocstop -->
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Brief description:
|
||||||
|
|
||||||
|
PFFFT does 1D Fast Fourier Transforms, of single precision real and
|
||||||
|
complex vectors. It tries to do it fast, it tries to be correct, and it
|
||||||
|
tries to be small. Computations do take advantage of SSE1 instructions
|
||||||
|
on x86 cpus, Altivec on powerpc cpus, and NEON on ARM cpus. The
|
||||||
|
license is BSD-like.
|
||||||
|
|
||||||
|
PFFFT is a fork of [Julien Pommier's library on bitbucket](https://bitbucket.org/jpommier/pffft/)
|
||||||
|
with some changes and additions.
|
||||||
|
|
||||||
|
|
||||||
|
PFFASTCONV does fast convolution (FIR filtering), of single precision
|
||||||
|
real vectors, utilizing the PFFFT library. The license is BSD-like.
|
||||||
|
|
||||||
|
PFDSP contains a few other signal processing functions.
|
||||||
|
Currently, mixing and carrier generation functions are contained.
|
||||||
|
It is work in progress - also the API!
|
||||||
|
The fast convolution from PFFASTCONV might get merged into PFDSP.
|
||||||
|
|
||||||
|
|
||||||
|
## Why does it exist:
|
||||||
|
|
||||||
|
I (Julien Pommier) was in search of a good performing FFT library,
|
||||||
|
preferably very small and with a very liberal license.
|
||||||
|
|
||||||
|
When one says "fft library", FFTW ("Fastest Fourier Transform in the
|
||||||
|
West") is probably the first name that comes to mind -- I guess that
|
||||||
|
99% of open-source projects that need a FFT do use FFTW, and are happy
|
||||||
|
with it. However, it is quite a large library, which does everything
|
||||||
|
fft related (2d transforms, 3d transforms, other transformations such
|
||||||
|
as discrete cosine, or fast hartley). And it is licensed under the
|
||||||
|
GNU GPL, which means that it cannot be used in non open-source
|
||||||
|
products.
|
||||||
|
|
||||||
|
An alternative to FFTW that is really small, is the venerable FFTPACK
|
||||||
|
v4, which is available on NETLIB. A more recent version (v5) exists,
|
||||||
|
but it is larger as it deals with multi-dimensional transforms. This
|
||||||
|
is a library that is written in FORTRAN 77, a language that is now
|
||||||
|
considered as a bit antiquated by many. FFTPACKv4 was written in 1985,
|
||||||
|
by Dr Paul Swarztrauber of NCAR, more than 25 years ago ! And despite
|
||||||
|
its age, benchmarks show that it is still a very good performing FFT
|
||||||
|
library, see for example the 1d single precision benchmarks
|
||||||
|
[here](http://www.fftw.org/speed/opteron-2.2GHz-32bit/). It is however not
|
||||||
|
competitive with the fastest ones, such as FFTW, Intel MKL, AMD ACML,
|
||||||
|
Apple vDSP. The reason for that is that those libraries do take
|
||||||
|
advantage of the SSE SIMD instructions available on Intel CPUs,
|
||||||
|
available since the days of the Pentium III. These instructions deal
|
||||||
|
with small vectors of 4 floats at a time, instead of a single float
|
||||||
|
for a traditional FPU, so when using these instructions one may expect
|
||||||
|
a 4-fold performance improvement.
|
||||||
|
|
||||||
|
The idea was to take this fortran fftpack v4 code, translate to C,
|
||||||
|
modify it to deal with those SSE instructions, and check that the
|
||||||
|
final performance is not completely ridiculous when compared to other
|
||||||
|
SIMD FFT libraries. Translation to C was performed with [f2c](
|
||||||
|
http://www.netlib.org/f2c/). The resulting file was a bit edited in
|
||||||
|
order to remove the thousands of gotos that were introduced by
|
||||||
|
f2c. You will find the fftpack.h and fftpack.c sources in the
|
||||||
|
repository; this is a complete translation of [fftpack](
|
||||||
|
http://www.netlib.org/fftpack/), with the discrete cosine transform
|
||||||
|
and the test program. There is no license information in the netlib
|
||||||
|
repository, but it was confirmed to me by the fftpack v5 curators that
|
||||||
|
the [same terms do apply to fftpack v4](http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html).
|
||||||
|
This is a
|
||||||
|
"BSD-like" license, it is compatible with proprietary projects.
|
||||||
|
|
||||||
|
Adapting fftpack to deal with the SIMD 4-element vectors instead of
|
||||||
|
scalar single precision numbers was more complex than I originally
|
||||||
|
thought, especially with the real transforms, and I ended up writing
|
||||||
|
more code than I planned.
|
||||||
|
|
||||||
|
|
||||||
|
## The code:
|
||||||
|
|
||||||
|
### Good old C:
|
||||||
|
The FFT API is very very simple, just make sure that you read the comments in `pffft.h`.
|
||||||
|
|
||||||
|
The Fast convolution's API is also very simple, just make sure that you read the comments
|
||||||
|
in `pffastconv.h`.
|
||||||
|
|
||||||
|
### C++:
|
||||||
|
A simple C++ wrapper is available in `pffft.hpp`.
|
||||||
|
|
||||||
|
### Git:
|
||||||
|
This archive's source can be downloaded with git (without the submodules):
|
||||||
|
```
|
||||||
|
git clone https://github.com/marton78/pffft.git
|
||||||
|
```
|
||||||
|
|
||||||
|
### Only two files?:
|
||||||
|
_"Only two files, in good old C, pffft.c and pffft.h"_
|
||||||
|
|
||||||
|
This statement does **NO LONGER** hold!
|
||||||
|
|
||||||
|
With new functionality and support for AVX, there was a need to restructure the sources.
|
||||||
|
But you can compile and link **pffft** as a static library.
|
||||||
|
|
||||||
|
|
||||||
|
## CMake:
|
||||||
|
There's now CMake support to build the static libraries `libPFFFT.a`
|
||||||
|
and `libPFFASTCONV.a` from the source files, plus the additional
|
||||||
|
`libFFTPACK.a` library. The latter's sources are there anyway for the benchmark.
|
||||||
|
|
||||||
|
There are several CMake options to modify library size and optimization.
|
||||||
|
You can explore all available options with `cmake-gui` or `ccmake`,
|
||||||
|
the console version - after having installed (on Debian/Ubuntu Linux) one of
|
||||||
|
```
|
||||||
|
sudo apt-get install cmake-qt-gui
|
||||||
|
sudo apt-get install cmake-curses-gui
|
||||||
|
```
|
||||||
|
|
||||||
|
Some of the options:
|
||||||
|
* `PFFFT_USE_TYPE_FLOAT` to activate single precision 'float' (default: ON)
|
||||||
|
* `PFFFT_USE_TYPE_DOUBLE` to activate 'double' precision float (default: ON)
|
||||||
|
* `PFFFT_USE_SIMD` to use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? (default: ON)
|
||||||
|
* `DISABLE_SIMD_AVX` to disable AVX CPU features (default: OFF)
|
||||||
|
* `PFFFT_USE_SIMD_NEON` to force using NEON on ARM (requires PFFFT_USE_SIMD) (default: OFF)
|
||||||
|
* `PFFFT_USE_SCALAR_VECT` to use 4-element vector scalar operations (if no other SIMD) (default: ON)
|
||||||
|
|
||||||
|
Options can be passed to `cmake` at command line, e.g.
|
||||||
|
```
|
||||||
|
cmake -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_TYPE_DOUBLE=ON
|
||||||
|
```
|
||||||
|
|
||||||
|
My Linux distribution defaults to GCC. With installed CLANG and the bash shell, you can use it with
|
||||||
|
```
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
CC=/usr/bin/clang CXX=/usr/bin/clang++ cmake -DCMAKE_BUILD_TYPE=Debug ../
|
||||||
|
cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=~ ../
|
||||||
|
ccmake . # or: cmake-gui .
|
||||||
|
cmake --build . # or simply: make
|
||||||
|
ctest # to execute some tests - including benchmarks
|
||||||
|
cmake --build . --target install # or simply: [sudo] make install
|
||||||
|
```
|
||||||
|
|
||||||
|
With MSVC on Windows, you need some different options. The following ones build a 64-bit Release with Visual Studio 2019:
|
||||||
|
```
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
cmake -G "Visual Studio 16 2019" -A x64 ..
|
||||||
|
cmake --build . --config Release
|
||||||
|
ctest -C Release
|
||||||
|
```
|
||||||
|
|
||||||
|
see [https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators](https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators)
|
||||||
|
|
||||||
|
|
||||||
|
## History / Origin / Changes:
|
||||||
|
Origin for this code/fork is Julien Pommier's pffft on bitbucket:
|
||||||
|
[https://bitbucket.org/jpommier/pffft/](https://bitbucket.org/jpommier/pffft/)
|
||||||
|
|
||||||
|
Git history shows the following first commits of the major contributors:
|
||||||
|
* Julien Pommier: November 19, 2011
|
||||||
|
* Marton Danoczy: September 30, 2015
|
||||||
|
* Hayati Ayguen: December 22, 2019
|
||||||
|
* Dario Mambro: March 24, 2020
|
||||||
|
|
||||||
|
There are a few other contributors not listed here.
|
||||||
|
|
||||||
|
The main changes include:
|
||||||
|
* improved benchmarking, see [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks)
|
||||||
|
* double support
|
||||||
|
* avx(2) support
|
||||||
|
* c++ headers (wrapper)
|
||||||
|
* additional API helper functions
|
||||||
|
* additional library for fast convolution
|
||||||
|
* cmake support
|
||||||
|
* ctest
|
||||||
|
|
||||||
|
|
||||||
|
## Comparison with other FFTs:
|
||||||
|
The idea was not to break speed records, but to get a decently fast
|
||||||
|
fft that is at least 50% as fast as the fastest FFT -- especially on
|
||||||
|
slowest computers. I'm more focused on getting the best performance
|
||||||
|
on slow cpus (Atom, Intel Core 1, old Athlons, ARM Cortex-A9...), than
|
||||||
|
on getting top performance on today's fastest cpus.
|
||||||
|
|
||||||
|
It can be used in a real-time context as the fft functions do not
|
||||||
|
perform any memory allocation -- that is why they accept a 'work'
|
||||||
|
array in their arguments.
|
||||||
|
|
||||||
|
It is also a bit focused on performing 1D convolutions, that is why it
|
||||||
|
provides "unordered" FFTs, and a Fourier-domain convolution
|
||||||
|
operation.
|
||||||
|
|
||||||
|
Very interesting is [https://www.nayuki.io/page/free-small-fft-in-multiple-languages](https://www.nayuki.io/page/free-small-fft-in-multiple-languages).
|
||||||
|
It shows how small an FFT can be - including the Bluestein algorithm, but it is anything but fast.
|
||||||
|
The whole C++ implementation file is 161 lines, including the Copyright header, see
|
||||||
|
[https://github.com/nayuki/Nayuki-web-published-code/blob/master/free-small-fft-in-multiple-languages/FftComplex.cpp](https://github.com/nayuki/Nayuki-web-published-code/blob/master/free-small-fft-in-multiple-languages/FftComplex.cpp)
|
||||||
|
|
||||||
|
## Dependencies / Required Linux packages
|
||||||
|
|
||||||
|
On Debian/Ubuntu Linux the following packages should be installed:
|
||||||
|
|
||||||
|
```
|
||||||
|
sudo apt-get install build-essential gcc g++ cmake
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Benchmarks and results
|
||||||
|
|
||||||
|
#### Quicklink
|
||||||
|
Find results at [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks).
|
||||||
|
|
||||||
|
#### General
|
||||||
|
My (Hayati Ayguen) first look at FFT-benchmarks was with [benchFFT](http://www.fftw.org/benchfft/)
|
||||||
|
and especially the results of the benchmarks [results](http://www.fftw.org/speed/),
|
||||||
|
which demonstrate the performance of the [FFTW](http://www.fftw.org/).
|
||||||
|
Looking at the benchmarked computer systems from today's view (2021), these are quite outdated.
|
||||||
|
|
||||||
|
Having a look into the [benchFFT source code](http://www.fftw.org/benchfft/benchfft-3.1.tar.gz),
|
||||||
|
the latest source changes, including competitive fft implementations, are dated November 2003.
|
||||||
|
|
||||||
|
In 2019, when pffft got my attention at [bitbucket](https://bitbucket.org/jpommier/pffft/src/master/),
|
||||||
|
there were also some benchmark results.
|
||||||
|
Unfortunately the results are tables with numbers - without graphical plots.
|
||||||
|
Without the plots, I could not get an impression. That is why I started
|
||||||
|
[https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks),
|
||||||
|
which includes GnuPlot figures.
|
||||||
|
|
||||||
|
Today in June 2021, I realized the existence of [https://github.com/FFTW/benchfft](https://github.com/FFTW/benchfft).
|
||||||
|
This repository is much more up-to-date with a commit in December 2020.
|
||||||
|
Unfortunately, it does not look simple to get it running - including the generation of plots.
|
||||||
|
|
||||||
|
Is there any website showing benchFFT results of more recent computer systems?
|
||||||
|
|
||||||
|
Of course, it's very important, that a benchmark can be compared with a bunch
|
||||||
|
of different FFT algorithms/implementations.
|
||||||
|
This requires having these compiled/built and usable.
|
||||||
|
|
||||||
|
|
||||||
|
#### Git submodules for Green-, Kiss- and Pocket-FFT
|
||||||
|
Sources for [Green-](https://github.com/hayguen/greenffts),
|
||||||
|
[Kiss-](https://github.com/hayguen/kissfft)
|
||||||
|
and [Pocket-FFT](https://github.com/hayguen/pocketfft)
|
||||||
|
can be downloaded directly with the sources of this repository - using git submodules:
|
||||||
|
```
|
||||||
|
git clone --recursive https://github.com/marton78/pffft.git
|
||||||
|
```
|
||||||
|
|
||||||
|
The `--recursive` option is important: it also fetches the submodules directly.
|
||||||
|
But you might retrieve the submodules later, too:
|
||||||
|
```
|
||||||
|
git submodule update --init
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Fastest Fourier Transform in the West: FFTW
|
||||||
|
To allow comparison with FFTW [http://www.fftw.org/](http://www.fftw.org/),
|
||||||
|
the cmake option `-DPFFFT_USE_BENCH_FFTW=ON` has to be used with the following commands.
|
||||||
|
The cmake option requires prior installation of the following (Debian/Ubuntu) package:
|
||||||
|
```
|
||||||
|
sudo apt-get install libfftw3-dev
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Intel Math Kernel Library: MKL
|
||||||
|
Intel's MKL [https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl.html](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl.html)
|
||||||
|
currently looks even faster than FFTW.
|
||||||
|
|
||||||
|
On Ubuntu-Linux it's easy to setup with the package `intel-mkl`.
|
||||||
|
Similar on Debian: `intel-mkl-full`.
|
||||||
|
|
||||||
|
There are special repositories for the following Linux distributions:
|
||||||
|
* Debian/apt: [https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html](https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html)
|
||||||
|
* RedHat/yum: [https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-yum-repo.html](https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-yum-repo.html)
|
||||||
|
* Gentoo/ebuild: [https://packages.gentoo.org/packages/sci-libs/mkl](https://packages.gentoo.org/packages/sci-libs/mkl)
|
||||||
|
|
||||||
|
#### Performing the benchmarks - with CMake
|
||||||
|
Benchmarks should be prepared by creating a special build folder
|
||||||
|
```
|
||||||
|
mkdir build_benches
|
||||||
|
cd build_benches
|
||||||
|
cmake ../bench
|
||||||
|
```
|
||||||
|
|
||||||
|
There are several CMake options to parametrize, which fft implementations should be benched.
|
||||||
|
You can explore all available options with `cmake-gui` or `ccmake`, see [CMake](#cmake).
|
||||||
|
|
||||||
|
Some of the options:
|
||||||
|
* `BENCH_ID` name the benchmark - used in filename
|
||||||
|
* `BENCH_ARCH` target architecture passed to compiler for code optimization
|
||||||
|
* `PFFFT_USE_BENCH_FFTW` use (system-installed) FFTW3 in fft benchmark? (default: OFF)
|
||||||
|
* `PFFFT_USE_BENCH_GREEN` use Green FFT in fft benchmark? (default: ON)
|
||||||
|
* `PFFFT_USE_BENCH_KISS` use KissFFT in fft benchmark? (default: ON)
|
||||||
|
* `PFFFT_USE_BENCH_POCKET` use PocketFFT in fft benchmark? (default: ON)
|
||||||
|
* `PFFFT_USE_BENCH_MKL` use Intel MKL in fft benchmark? (default: OFF)
|
||||||
|
|
||||||
|
These options can be passed to `cmake` at command line, e.g.
|
||||||
|
```
|
||||||
|
cmake -DBENCH_ARCH=native -DPFFFT_USE_BENCH_FFTW=ON -DPFFFT_USE_BENCH_MKL=ON ../bench
|
||||||
|
```
|
||||||
|
|
||||||
|
The benchmarks are built and executed with
|
||||||
|
```
|
||||||
|
cmake --build .
|
||||||
|
```
|
||||||
|
|
||||||
|
You can also specify to use a different compiler/version with the cmake step, e.g.:
|
||||||
|
|
||||||
|
```
|
||||||
|
CC=/usr/bin/gcc-9 CXX=/usr/bin/g++-9 cmake -DBENCH_ID=gcc9 -DBENCH_ARCH=native -DPFFFT_USE_BENCH_FFTW=ON -DPFFFT_USE_BENCH_MKL=ON ../bench
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
CC=/usr/bin/clang-11 CXX=/usr/bin/clang++-11 cmake -DBENCH_ID=clang11 -DBENCH_ARCH=native -DPFFFT_USE_BENCH_FFTW=ON -DPFFFT_USE_BENCH_MKL=ON ../bench
|
||||||
|
```
|
||||||
|
|
||||||
|
For using MSVC/Windows, the cmake command needs the generator and architecture options and must be called from the VS Developer prompt:
|
||||||
|
```
|
||||||
|
cmake -G "Visual Studio 16 2019" -A x64 ../bench/
|
||||||
|
```
|
||||||
|
|
||||||
|
see [https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators](https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
For running with different compiler version(s):
|
||||||
|
* copy the result file (.tgz), e.g. `cp *.tgz ../`
|
||||||
|
* delete the build directory: `rm -rf *`
|
||||||
|
* then continue with the cmake step
|
||||||
|
|
||||||
|
|
||||||
|
#### Benchmark results and contribution
|
||||||
|
You might contribute by providing us the results of your computer(s).
|
||||||
|
|
||||||
|
The benchmark results are stored in a separate git-repository:
|
||||||
|
See [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks).
|
||||||
|
|
||||||
|
This is to keep this repository's sources small.
|
||||||
|
|
||||||
224
pffft/bench/CMakeLists.txt
Normal file
224
pffft/bench/CMakeLists.txt
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
# Driver project for the pffft benchmarks: it configures and builds the parent
# project once for float and once for double, runs both benchmark executables
# and packs the result directory into an archive.
#
# 2.8 stays the minimum version. The "...3.29" policy range opts in to current
# policy defaults; CMake >= 4.0 refuses projects whose policy version is < 3.5.
# CMake < 3.12 ignores the range part and behaves as before.
cmake_minimum_required(VERSION 2.8...3.29)
project(BENCH_PFFFT)

# Tag that becomes part of the result/build directory names and of the archive name.
set(BENCH_ID "default" CACHE STRING "ID: use single word without spaces. gets part of result filename")

# Forwarded to the nested builds as USE_FAST_MATH.
option(BENCH_FAST_MATH "Build with fast math - non IEEE compliant" ON)

# Default for the target architecture, chosen by C compiler family.
# The value is forwarded to the nested builds as ARCH.
if (CMAKE_C_COMPILER_ID STREQUAL "GNU")
  set(BENCH_ARCH "native" CACHE STRING "target architecture (-march): native/SSE:core2/AVX:sandybridge/ARM-NEON:armv7-a")
elseif (CMAKE_C_COMPILER_ID MATCHES "^(Apple)?Clang$")
  # With policy CMP0025 set to NEW, Apple's compiler reports "AppleClang"
  # instead of "Clang"; accept both so macOS keeps the "native" default.
  set(BENCH_ARCH "native" CACHE STRING "target architecture (-march): native/SSE:core2/AVX:sandybridge")
elseif (CMAKE_C_COMPILER_ID STREQUAL "MSVC")  # others: "Intel"
  set(BENCH_ARCH "AVX" CACHE STRING "target architecture (/arch): SSE2/AVX")
else()
  # Unknown compiler: the user has to supply the complete compiler option.
  set(BENCH_ARCH "" CACHE STRING "target architecture - use full compiler option!")
endif()
|
||||||
|
|
||||||
|
# --- SIMD / vectorisation switches ------------------------------------------
# Each value is written to info.txt and forwarded to the nested builds.
option(PFFFT_USE_SIMD
  "use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? - "
  ON)
option(DISABLE_SIMD_AVX
  "disable AVX CPU features? - "
  OFF)
option(PFFFT_USE_SIMD_NEON
  "force using NEON on ARM? (requires PFFFT_USE_SIMD)"
  OFF)
option(PFFFT_USE_SCALAR_VECT
  "use 4-element vector scalar operations (if no other SIMD)"
  ON)

# --- FFT implementations to compare against ----------------------------------
option(PFFFT_USE_BENCH_FFTW
  "use (system-installed) FFTW3 in fft benchmark?"
  OFF)
option(PFFFT_USE_BENCH_GREEN
  "use Green FFT in fft benchmark? - if exists in subdir"
  ON)
option(PFFFT_USE_BENCH_KISS
  "use KissFFT in fft benchmark? - if exists in subdir"
  ON)
option(PFFFT_USE_BENCH_POCKET
  "use PocketFFT in fft benchmark? - if exists in subdir"
  ON)
option(PFFFT_USE_BENCH_MKL
  "use Intel MKL in fft benchmark? needs to be installed"
  OFF)
|
||||||
|
|
||||||
|
|
||||||
|
# Coarse operating-system label written to info.txt.
# UNIX is tested first because it wins when both UNIX and WIN32 are set.
if (UNIX)
  set(OSSTR "Unix")
elseif (WIN32)
  set(OSSTR "Win32")
else()
  set(OSSTR "")
endif()
|
||||||
|
|
||||||
|
# Helper values for the nested cmake invocations and for locating the built
# benchmark executables. They are only non-empty for MSVC.
if (MSVC)
  # sub-directory (relative to the nested build dir) holding the Release executables
  set(BUILD_DIR_TO_EXE "Release/")
  # hand the platform (-A) and the make program of this build to the nested one
  set(CMAKE_PLATFORM_OPT "-A \"${CMAKE_GENERATOR_PLATFORM}\"")
  set(CMAKE_MAKE_OPT "-DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}")
else()
  set(BUILD_DIR_TO_EXE "")
  set(CMAKE_PLATFORM_OPT "")
  set(CMAKE_MAKE_OPT "")
endif()
|
||||||
|
|
||||||
|
|
||||||
|
# Result directories: benchmark output ends up below ${benchdir},
# split by precision.
set(benchdir     "${CMAKE_BINARY_DIR}/bench_${BENCH_ID}")
set(benchdir_flt "${benchdir}/float")
set(benchdir_dbl "${benchdir}/double")

# Nested build trees, one per precision.
set(builddir_flt "${CMAKE_BINARY_DIR}/build_${BENCH_ID}_float")
set(builddir_dbl "${CMAKE_BINARY_DIR}/build_${BENCH_ID}_double")
|
||||||
|
|
||||||
|
# Build rule for "${benchdir}/info.txt": a plain-text description of the
# benchmark run (CMake version, OS, compilers, architecture and the selected
# options). All values are expanded at configure time.
#
# The first echo creates/truncates the file with `>`, all later ones append
# with `>>`. The redirections are interpreted by the shell that runs the build
# commands.
# NOTE(review): adding VERBATIM would presumably quote `>`/`>>` as literal
# arguments and break this rule - confirm before changing.
add_custom_command(OUTPUT "${benchdir}/info.txt"
  # the result directory must exist before anything is redirected into it
  COMMAND ${CMAKE_COMMAND} -E make_directory "${benchdir}"
  COMMAND ${CMAKE_COMMAND} -E echo "benchmark ${BENCH_ID}" > "${benchdir}/info.txt"
  # build system and host
  COMMAND ${CMAKE_COMMAND} -E echo "CMake major: ${CMAKE_MAJOR_VERSION}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "CMake minor: ${CMAKE_MINOR_VERSION}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "OS: ${OSSTR}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "System: ${CMAKE_SYSTEM_NAME}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "System CPU: ${CMAKE_SYSTEM_PROCESSOR}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "System Version: ${CMAKE_HOST_SYSTEM_VERSION}" >> "${benchdir}/info.txt"
  # toolchain
  COMMAND ${CMAKE_COMMAND} -E echo "C Compiler: ${CMAKE_C_COMPILER_ID}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "C Version: ${CMAKE_C_COMPILER_VERSION}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "C++ Compiler: ${CMAKE_CXX_COMPILER_ID}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "C++ Version: ${CMAKE_CXX_COMPILER_VERSION}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "MSVC Version: ${MSVC_VERSION}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "MSVC Toolset: ${MSVC_TOOLSET_VERSION}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "Exe Suffix: ${CMAKE_EXECUTABLE_SUFFIX}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "C Byte Order: ${CMAKE_C_BYTE_ORDER}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "C++ Byte Order: ${CMAKE_CXX_BYTE_ORDER}" >> "${benchdir}/info.txt"
  # separator line between environment and benchmark configuration
  COMMAND ${CMAKE_COMMAND} -E echo "" >> "${benchdir}/info.txt"
  # benchmark configuration as selected in this project's cache
  COMMAND ${CMAKE_COMMAND} -E echo "Architecture: ${BENCH_ARCH}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "Fast math: ${BENCH_FAST_MATH}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_SIMD=${PFFFT_USE_SIMD}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "config DISABLE_SIMD_AVX=${DISABLE_SIMD_AVX}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_SIMD_NEON=${PFFFT_USE_SIMD_NEON}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_SCALAR_VECT=${PFFFT_USE_SCALAR_VECT}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_FFTW=${PFFFT_USE_BENCH_FFTW}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_GREEN=${PFFFT_USE_BENCH_GREEN}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_KISS=${PFFFT_USE_BENCH_KISS}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_POCKET=${PFFFT_USE_BENCH_POCKET}" >> "${benchdir}/info.txt"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_MKL=${PFFFT_USE_BENCH_MKL}" >> "${benchdir}/info.txt"
)
|
||||||
|
|
||||||
|
# Build rule for "${benchdir}/unix_info.txt".
# The marker file is created on every platform so that dependents can list it
# unconditionally. On UNIX, unix_info.sh is additionally run inside the result
# directory. Both variants depend on info.txt, whose rule creates ${benchdir}.
if (NOT UNIX)
  add_custom_command(OUTPUT "${benchdir}/unix_info.txt"
    COMMAND ${CMAKE_COMMAND} -E touch "${benchdir}/unix_info.txt"
    DEPENDS "${benchdir}/info.txt"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )
else()
  add_custom_command(OUTPUT "${benchdir}/unix_info.txt"
    COMMAND ${CMAKE_COMMAND} -E touch "${benchdir}/unix_info.txt"
    COMMAND bash "-c" "${CMAKE_CURRENT_SOURCE_DIR}/unix_info.sh"
    DEPENDS "${benchdir}/info.txt"
    WORKING_DIRECTORY ${benchdir}
  )
endif()
|
||||||
|
|
||||||
|
|
||||||
|
# One rule per working directory: create the directory and drop a
# "directory.txt" marker file into it, so that targets can depend on the
# directory's existence through the marker.
foreach(marker_dir "${builddir_flt}" "${builddir_dbl}" "${benchdir_flt}" "${benchdir_dbl}")
  add_custom_command(OUTPUT "${marker_dir}/directory.txt"
    COMMAND ${CMAKE_COMMAND} -E make_directory "${marker_dir}"
    COMMAND ${CMAKE_COMMAND} -E touch "${marker_dir}/directory.txt"
  )
endforeach()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Configure and build the float-only variant of the parent project
# ("${CMAKE_SOURCE_DIR}/..") inside ${builddir_flt}.
# The nested configure reuses this build's generator and, for MSVC, its
# platform and make program, and forwards the benchmark options selected here.
# The build step is announced in the same way as in build_double.
add_custom_target(build_float
  COMMAND ${CMAKE_COMMAND} -E echo "start cmake for float in ${builddir_flt}"
  COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" ${CMAKE_PLATFORM_OPT}
          "${CMAKE_MAKE_OPT}"
          -DCMAKE_BUILD_TYPE=Release
          "-DARCH=${BENCH_ARCH}"
          -DUSE_FAST_MATH=${BENCH_FAST_MATH}
          # precision selection: float only
          -DPFFFT_USE_TYPE_FLOAT=ON
          -DPFFFT_USE_TYPE_DOUBLE=OFF
          -DUSE_FLOAT_PREC=ON
          # SIMD switches
          -DPFFFT_USE_SIMD=${PFFFT_USE_SIMD}
          -DDISABLE_SIMD_AVX=${DISABLE_SIMD_AVX}
          -DPFFFT_USE_SIMD_NEON=${PFFFT_USE_SIMD_NEON}
          -DPFFFT_USE_SCALAR_VECT=${PFFFT_USE_SCALAR_VECT}
          # FFT implementations to compare against
          -DPFFFT_USE_BENCH_FFTW=${PFFFT_USE_BENCH_FFTW}
          -DPFFFT_USE_BENCH_GREEN=${PFFFT_USE_BENCH_GREEN}
          -DPFFFT_USE_BENCH_KISS=${PFFFT_USE_BENCH_KISS}
          -DPFFFT_USE_BENCH_POCKET=${PFFFT_USE_BENCH_POCKET}
          -DPFFFT_USE_BENCH_MKL=${PFFFT_USE_BENCH_MKL}
          "${CMAKE_SOURCE_DIR}/.."
  COMMAND ${CMAKE_COMMAND} -E echo "start cmake --build . for float in ${builddir_flt}"
  COMMAND ${CMAKE_COMMAND} --build . --config Release
  # the marker file guarantees that the working directory exists
  DEPENDS "${builddir_flt}/directory.txt"
  WORKING_DIRECTORY "${builddir_flt}"
)
|
||||||
|
|
||||||
|
# Configure and build the double-only variant of the parent project
# ("${CMAKE_SOURCE_DIR}/..") inside ${builddir_dbl}.
# The nested configure reuses this build's generator and, for MSVC, its
# platform and make program, and forwards the benchmark options selected here.
add_custom_target(build_double
  COMMAND ${CMAKE_COMMAND} -E echo "start cmake for double in ${builddir_dbl}"
  COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" ${CMAKE_PLATFORM_OPT}
          "${CMAKE_MAKE_OPT}"
          -DCMAKE_BUILD_TYPE=Release
          "-DARCH=${BENCH_ARCH}"
          -DUSE_FAST_MATH=${BENCH_FAST_MATH}
          # precision selection: double only
          -DPFFFT_USE_TYPE_FLOAT=OFF
          -DPFFFT_USE_TYPE_DOUBLE=ON
          -DUSE_FLOAT_PREC=OFF
          # SIMD switches
          -DPFFFT_USE_SIMD=${PFFFT_USE_SIMD}
          -DDISABLE_SIMD_AVX=${DISABLE_SIMD_AVX}
          -DPFFFT_USE_SIMD_NEON=${PFFFT_USE_SIMD_NEON}
          -DPFFFT_USE_SCALAR_VECT=${PFFFT_USE_SCALAR_VECT}
          # FFT implementations to compare against
          -DPFFFT_USE_BENCH_FFTW=${PFFFT_USE_BENCH_FFTW}
          -DPFFFT_USE_BENCH_GREEN=${PFFFT_USE_BENCH_GREEN}
          -DPFFFT_USE_BENCH_KISS=${PFFFT_USE_BENCH_KISS}
          -DPFFFT_USE_BENCH_POCKET=${PFFFT_USE_BENCH_POCKET}
          -DPFFFT_USE_BENCH_MKL=${PFFFT_USE_BENCH_MKL}
          "${CMAKE_SOURCE_DIR}/.."
  COMMAND ${CMAKE_COMMAND} -E echo "start cmake --build . for double in ${builddir_dbl}"
  COMMAND ${CMAKE_COMMAND} --build . --config Release
  # the marker file guarantees that the working directory exists
  DEPENDS "${builddir_dbl}/directory.txt"
  WORKING_DIRECTORY "${builddir_dbl}"
)
|
||||||
|
|
||||||
|
# Run the float benchmark executable from the nested float build.
# It is started inside ${benchdir_flt}.
set(bench_float_exe
  "${builddir_flt}/${BUILD_DIR_TO_EXE}bench_pffft_float${CMAKE_EXECUTABLE_SUFFIX}")
add_custom_target(bench_float
  COMMAND ${CMAKE_COMMAND} -E echo "start benchmark for float"
  COMMAND "${bench_float_exe}"
  DEPENDS "${benchdir_flt}/directory.txt" build_float
  WORKING_DIRECTORY "${benchdir_flt}"
)
|
||||||
|
|
||||||
|
# Run the double benchmark executable from the nested double build.
# It is started inside ${benchdir_dbl}.
set(bench_double_exe
  "${builddir_dbl}/${BUILD_DIR_TO_EXE}bench_pffft_double${CMAKE_EXECUTABLE_SUFFIX}")
add_custom_target(bench_double
  COMMAND ${CMAKE_COMMAND} -E echo "start benchmark for double"
  COMMAND "${bench_double_exe}"
  DEPENDS "${benchdir_dbl}/directory.txt" build_double
  WORKING_DIRECTORY "${benchdir_dbl}"
)
|
||||||
|
|
||||||
|
# Default target: run both benchmarks, then pack the whole result directory
# into bench_<BENCH_ID>.tgz in the top-level build directory.
set(bench_archive "bench_${BENCH_ID}.tgz")
add_custom_target(bench ALL
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E tar cvz "${bench_archive}" ${benchdir}
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E echo "now mail result file ${bench_archive}"
  DEPENDS
    "${benchdir}/info.txt"
    bench_float
    bench_double
    "${benchdir}/unix_info.txt"
  WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
)
|
||||||
|
|
||||||
|
# Like "bench", but runs the float benchmark only before packing the result
# directory into bench_<BENCH_ID>.tgz.
set(bench_float_archive "bench_${BENCH_ID}.tgz")
add_custom_target(bench_float_tar
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E tar cvz "${bench_float_archive}" ${benchdir}
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E echo "now mail result file ${bench_float_archive}"
  DEPENDS
    "${benchdir}/info.txt"
    bench_float
    "${benchdir}/unix_info.txt"
  WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
)
|
||||||
|
|
||||||
|
# Like "bench", but runs the double benchmark only before packing the result
# directory into bench_<BENCH_ID>.tgz.
set(bench_double_archive "bench_${BENCH_ID}.tgz")
add_custom_target(bench_double_tar
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E tar cvz "${bench_double_archive}" ${benchdir}
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E echo "now mail result file ${bench_double_archive}"
  DEPENDS
    "${benchdir}/info.txt"
    bench_double
    "${benchdir}/unix_info.txt"
  WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
)
|
||||||
|
|
||||||
|
# Remove the two nested build trees (float and double).
# Despite the target's name, the result directory ${benchdir} is not touched.
add_custom_target(clean_results
  COMMAND
    ${CMAKE_COMMAND} -E remove_directory "${builddir_flt}"
  COMMAND
    ${CMAKE_COMMAND} -E remove_directory "${builddir_dbl}"
  WORKING_DIRECTORY
    "${CMAKE_BINARY_DIR}"
)
|
||||||
|
|
||||||
9
pffft/bench/unix_info.sh
Executable file
9
pffft/bench/unix_info.sh
Executable file
@@ -0,0 +1,9 @@
|
|||||||
|
#!/bin/bash
#
# Collect CPU and distribution details of the host into files in the current
# working directory (the caller is expected to cd into the result directory):
#   unix_lscpu.txt        output of lscpu
#   unix_cpuinfo.txt      copy of /proc/cpuinfo
#   unix_lsb_release.txt  output of lsb_release -a
#   *-release             copies of every /etc/*-release file, if any exist

lscpu > unix_lscpu.txt
cat /proc/cpuinfo > unix_cpuinfo.txt
lsb_release -a > unix_lsb_release.txt

# Expand the glob into an array instead of parsing `ls` output. With nullglob
# an unmatched pattern yields an empty array, so a system without any
# /etc/*-release file is skipped silently instead of printing an ls error.
shopt -s nullglob
release_files=(/etc/*-release)
if [ "${#release_files[@]}" -gt 0 ]; then
  cp "${release_files[@]}" ./
fi
|
||||||
345
pffft/bench_conv.cpp
Normal file
345
pffft/bench_conv.cpp
Normal file
@@ -0,0 +1,345 @@
|
|||||||
|
|
||||||
|
#include <math.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <assert.h>
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <random>
|
||||||
|
#include <cstdint>
|
||||||
|
#include <complex>
|
||||||
|
|
||||||
|
#include "papi_perf_counter.h"
|
||||||
|
|
||||||
|
//#if defined(HAVE_MIPP) && !defined(NO_MIPP)
|
||||||
|
#if defined(HAVE_MIPP)
|
||||||
|
#include <mipp.h>
|
||||||
|
|
||||||
|
#define MIPP_VECTOR mipp::vector
|
||||||
|
#else
|
||||||
|
#define MIPP_VECTOR std::vector
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "pf_conv_dispatcher.h"
|
||||||
|
#include "pf_conv.h"
|
||||||
|
|
||||||
|
|
||||||
|
#define TEST_WITH_MIN_LEN 0
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Create a vector of pseudo-random floats, optionally zero-padded.
 *
 * The leading entries are drawn from a std::mt19937 seeded with seed_value
 * and scaled to roughly [-1, 1]; all remaining entries are set to zero.
 *
 * @param M           number of random samples to generate
 * @param N           total length of the vector; a negative value means "use M"
 * @param seed_value  seed of the generator, so that runs are reproducible
 * @return vector of length (N < 0 ? M : N); when 0 <= N < M only N random
 *         samples are produced
 */
MIPP_VECTOR<float> generate_rng_vec(int M, int N = -1, int seed_value = 1)
{
    const int len = (N < 0) ? M : N;
    // Never write past the end when fewer than M entries were requested.
    const int num_random = (M < len) ? M : len;

    MIPP_VECTOR<float> v(len);
    std::mt19937 g;
    g.seed(seed_value);

    // mt19937 delivers 32 random bits per call. Interpret them as a signed
    // 32-bit integer and scale by 2^-31. The previous code used int_fast32_t /
    // INT_FAST32_MAX, which may be 64 bits wide (e.g. with glibc on x86-64);
    // there the samples collapsed to tiny positive numbers.
    constexpr float scale = 1.0F / (1.0F + float(INT32_MAX));
    for (int k = 0; k < num_random; ++k)
    {
        const std::uint32_t bits = static_cast<std::uint32_t>(g());
        v[k] = float(static_cast<std::int32_t>(bits)) * scale;
    }
    for (int k = num_random; k < len; ++k)
        v[k] = 0.0F;
    return v;
}
|
||||||
|
|
||||||
|
|
||||||
|
int bench_oop_core(
|
||||||
|
const conv_f_ptrs & conv_arch,
|
||||||
|
const float * signal, const int sz_signal,
|
||||||
|
const float * filter, const int sz_filter,
|
||||||
|
const int blockLen,
|
||||||
|
float * y
|
||||||
|
)
|
||||||
|
{
|
||||||
|
conv_buffer_state state;
|
||||||
|
const auto conv_oop = conv_arch.fp_conv_float_oop;
|
||||||
|
int n_out_sum = 0;
|
||||||
|
state.offset = 0;
|
||||||
|
state.size = 0;
|
||||||
|
papi_perf_counter perf_counter(1);
|
||||||
|
for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
|
||||||
|
{
|
||||||
|
state.size += blockLen;
|
||||||
|
int n_out = conv_oop(signal, &state, filter, sz_filter, y);
|
||||||
|
n_out_sum += n_out;
|
||||||
|
}
|
||||||
|
return n_out_sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
int bench_inplace_core(
|
||||||
|
const conv_f_ptrs & conv_arch,
|
||||||
|
float * signal, const int sz_signal,
|
||||||
|
const float * filter, const int sz_filter,
|
||||||
|
const int blockLen
|
||||||
|
)
|
||||||
|
{
|
||||||
|
conv_buffer_state state;
|
||||||
|
const auto conv_inplace = conv_arch.fp_conv_float_inplace;
|
||||||
|
int n_out_sum = 0;
|
||||||
|
state.offset = 0;
|
||||||
|
state.size = 0;
|
||||||
|
papi_perf_counter perf_counter(1);
|
||||||
|
for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
|
||||||
|
{
|
||||||
|
state.size += blockLen;
|
||||||
|
int n_out = conv_inplace(signal, &state, filter, sz_filter);
|
||||||
|
n_out_sum += n_out;
|
||||||
|
}
|
||||||
|
return n_out_sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int bench_oop(
|
||||||
|
const conv_f_ptrs & conv_arch,
|
||||||
|
float * buffer,
|
||||||
|
const float * signal, const int sz_signal,
|
||||||
|
const float * filter, const int sz_filter,
|
||||||
|
const int blockLen,
|
||||||
|
float * y
|
||||||
|
)
|
||||||
|
{
|
||||||
|
conv_buffer_state state;
|
||||||
|
const auto conv_oop = conv_arch.fp_conv_float_oop;
|
||||||
|
const auto move_rest = conv_arch.fp_conv_float_move_rest;
|
||||||
|
int n_out_sum = 0;
|
||||||
|
state.offset = 0;
|
||||||
|
state.size = 0;
|
||||||
|
papi_perf_counter perf_counter(1);
|
||||||
|
for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
|
||||||
|
{
|
||||||
|
move_rest(buffer, &state);
|
||||||
|
//memcpy(buffer+state.size, &s[off], B * sizeof(s[0]));
|
||||||
|
std::copy(&signal[off], &signal[off+blockLen], buffer+state.size);
|
||||||
|
state.size += blockLen;
|
||||||
|
int n_out = conv_oop(buffer, &state, filter, sz_filter, &y[n_out_sum]);
|
||||||
|
n_out_sum += n_out;
|
||||||
|
}
|
||||||
|
return n_out_sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
int bench_cx_real_oop(
|
||||||
|
const conv_f_ptrs & conv_arch,
|
||||||
|
complexf * buffer,
|
||||||
|
const float * signal_re, const int sz_signal_re,
|
||||||
|
const float * filter, const int sz_filter,
|
||||||
|
const int blockLen,
|
||||||
|
float * y_re
|
||||||
|
)
|
||||||
|
{
|
||||||
|
conv_buffer_state state;
|
||||||
|
const auto conv_oop = conv_arch.fp_conv_cplx_float_oop;
|
||||||
|
const auto move_rest = conv_arch.fp_conv_cplx_move_rest;
|
||||||
|
// interpret buffer, signal and output vector y as complex data
|
||||||
|
complexf * y = reinterpret_cast<complexf *>(y_re);
|
||||||
|
const complexf * signal = reinterpret_cast<const complexf *>(signal_re);
|
||||||
|
const int sz_signal = sz_signal_re / 2;
|
||||||
|
int n_out_sum = 0;
|
||||||
|
state.offset = 0;
|
||||||
|
state.size = 0;
|
||||||
|
papi_perf_counter perf_counter(1);
|
||||||
|
for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
|
||||||
|
{
|
||||||
|
move_rest(buffer, &state);
|
||||||
|
//memcpy(buffer+state.size, &s[off], B * sizeof(s[0]));
|
||||||
|
std::copy(&signal[off], &signal[off+blockLen], &buffer[state.size]);
|
||||||
|
state.size += blockLen;
|
||||||
|
int n_out = conv_oop(buffer, &state, filter, sz_filter, &y[n_out_sum]);
|
||||||
|
n_out_sum += n_out;
|
||||||
|
}
|
||||||
|
return n_out_sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
// cli defaults:
|
||||||
|
// process up to 64 MSample (512 MByte) in blocks of 1 kSamples (=64 kByte) with filterLen 128
|
||||||
|
int arch = 0, N = 64 * 1024 * 1024;
|
||||||
|
int filterLen = 128, blockLen = 1024;
|
||||||
|
int seed_sig = 1, seed_filter = 2;
|
||||||
|
bool verbose = false, exitFromUsage = false, showUsage = (argc <= 1);
|
||||||
|
|
||||||
|
for (int i = 1; i < argc; ++i)
|
||||||
|
{
|
||||||
|
if (i+1 < argc && !strcmp(argv[i], "-a"))
|
||||||
|
arch = atoi(argv[++i]);
|
||||||
|
else if (i+1 < argc && !strcmp(argv[i], "-n"))
|
||||||
|
N = atoi(argv[++i]) * 1024 * 1024;
|
||||||
|
else if (i+1 < argc && !strcmp(argv[i], "-f"))
|
||||||
|
filterLen = atoi(argv[++i]);
|
||||||
|
else if (i+1 < argc && !strcmp(argv[i], "-b"))
|
||||||
|
blockLen = atoi(argv[++i]);
|
||||||
|
else if (i+1 < argc && !strcmp(argv[i], "-ss"))
|
||||||
|
seed_sig = atoi(argv[++i]);
|
||||||
|
else if (i+1 < argc && !strcmp(argv[i], "-sf"))
|
||||||
|
seed_filter = atoi(argv[++i]);
|
||||||
|
else if (!strcmp(argv[i], "-v"))
|
||||||
|
verbose = true;
|
||||||
|
else if (!strcmp(argv[i], "-h"))
|
||||||
|
showUsage = exitFromUsage = true;
|
||||||
|
else
|
||||||
|
fprintf(stderr, "warning: ignoring/skipping unknown option '%s'\n", argv[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
int num_arch = 0;
|
||||||
|
const ptr_to_conv_f_ptrs * conv_arch_ptrs = get_all_conv_arch_ptrs(&num_arch);
|
||||||
|
|
||||||
|
if (verbose)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "num_arch is %d\n", num_arch);
|
||||||
|
for (int a = 0; a < num_arch; ++a)
|
||||||
|
if (conv_arch_ptrs[a])
|
||||||
|
fprintf(stderr, " arch %d is '%s'\n", a, conv_arch_ptrs[a]->id );
|
||||||
|
else
|
||||||
|
fprintf(stderr, " arch %d is nullptr !!!\n", a );
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( arch < 0 || arch >= num_arch || !blockLen || !N || !filterLen || showUsage )
|
||||||
|
{
|
||||||
|
fprintf(stderr, "%s [-v] [-a <arch>] [-n <total # of MSamples> [-f <filter length>] [-b <blockLength in samples>]\n", argv[0]);
|
||||||
|
fprintf(stderr, " [-ss <random seed for signal>] [-sf <random seed for filter coeffs>]\n");
|
||||||
|
fprintf(stderr, "arch is one of:");
|
||||||
|
for (int a = 0; a < num_arch; ++a)
|
||||||
|
if (conv_arch_ptrs[a])
|
||||||
|
fprintf(stderr, " %d for '%s'%s", a, conv_arch_ptrs[a]->id, (a < num_arch-1 ? ",":"") );
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
if ( exitFromUsage || !blockLen || !N || !filterLen || arch < 0 || arch >= num_arch )
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (verbose)
|
||||||
|
{
|
||||||
|
#ifdef HAVE_PAPI
|
||||||
|
fprintf(stderr, "PAPI is available\n");
|
||||||
|
#else
|
||||||
|
fprintf(stderr, "PAPI is NOT available!\n");
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#if !defined(HAVE_MIPP)
|
||||||
|
fprintf(stderr, "MIPP is NOT available!\n");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//int float_simd_size[num_arch];
|
||||||
|
int max_simd_size = -1;
|
||||||
|
for (int a = 0; a < num_arch; ++a)
|
||||||
|
{
|
||||||
|
if (conv_arch_ptrs[a])
|
||||||
|
{
|
||||||
|
const int sz = conv_arch_ptrs[a]->fp_conv_float_simd_size();
|
||||||
|
//float_simd_size[a] = sz;
|
||||||
|
if (max_simd_size < sz)
|
||||||
|
max_simd_size = sz;
|
||||||
|
if (verbose)
|
||||||
|
fprintf(stderr, "float simd size for '%s': %d\n", conv_arch_ptrs[a]->id, sz);
|
||||||
|
}
|
||||||
|
//else
|
||||||
|
// float_simd_size[a] = 0;
|
||||||
|
}
|
||||||
|
//const int max_simd_size = *std::max_element( &float_simd_size[0], &float_simd_size[num_arch] );
|
||||||
|
if (verbose)
|
||||||
|
fprintf(stderr, "max float simd size: %d\n", max_simd_size);
|
||||||
|
|
||||||
|
#if TEST_WITH_MIN_LEN
|
||||||
|
filterLen = 2;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// round up filter length
|
||||||
|
filterLen = max_simd_size * ( ( filterLen + max_simd_size -1 ) / max_simd_size );
|
||||||
|
|
||||||
|
#if TEST_WITH_MIN_LEN
|
||||||
|
blockLen = 1;
|
||||||
|
N = 2 * (3 + filterLen); // produce 3+1 samples
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (!conv_arch_ptrs[arch])
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Error: architecture %d is NOT available!\n", arch);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
const conv_f_ptrs & conv_arch = *conv_arch_ptrs[arch];
|
||||||
|
if (verbose)
|
||||||
|
fprintf(stderr, "arch is using mipp: %d\n", conv_arch.using_mipp);
|
||||||
|
|
||||||
|
fprintf(stderr, "processing N = %d MSamples with block length of %d samples with filter length %d taps on '%s'\n",
|
||||||
|
N / (1024 * 1024), blockLen, filterLen, conv_arch.id );
|
||||||
|
|
||||||
|
MIPP_VECTOR<float> s = generate_rng_vec(N + 1, N + 1, seed_sig);
|
||||||
|
MIPP_VECTOR<float> y(N + 1, 0.0F);
|
||||||
|
MIPP_VECTOR<float> filter = generate_rng_vec(filterLen, filterLen, seed_filter);
|
||||||
|
MIPP_VECTOR<float> buffer(blockLen + filterLen + 1, 0.0F);
|
||||||
|
MIPP_VECTOR<complexf> buffer_cx(blockLen + filterLen + 1);
|
||||||
|
|
||||||
|
#if 1 && TEST_WITH_MIN_LEN
|
||||||
|
for (int k = 0; k < N; ++k)
|
||||||
|
s[k] = (k+1);
|
||||||
|
for (int k = 0; k < filterLen; ++k)
|
||||||
|
filter[k] = (k+1);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
s[N] = 123.0F;
|
||||||
|
y[N] = 321.0F;
|
||||||
|
buffer[blockLen + filterLen] = 789.0F;
|
||||||
|
buffer_cx[blockLen + filterLen].i = 987.0F;
|
||||||
|
|
||||||
|
fprintf(stderr, "\nrunning out-of-place convolution core for '%s':\n", conv_arch.id);
|
||||||
|
int n_oop_out = bench_oop_core(conv_arch, s.data(), N, filter.data(), filterLen, blockLen, y.data());
|
||||||
|
fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
|
||||||
|
#if TEST_WITH_MIN_LEN
|
||||||
|
for (int k = 0; k < n_oop_out; ++k )
|
||||||
|
fprintf(stderr, "y[%2d] = %g\n", k, y[k]);
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
fprintf(stderr, "\nrunning out-of-place convolution for '%s':\n", conv_arch.id);
|
||||||
|
n_oop_out = bench_oop(conv_arch, buffer.data(), s.data(), N, filter.data(), filterLen, blockLen, y.data());
|
||||||
|
fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
|
||||||
|
assert(s[N] == 123.0F);
|
||||||
|
assert(y[N] == 321.0F);
|
||||||
|
assert(buffer[blockLen + filterLen] == 789.0F);
|
||||||
|
assert(buffer_cx[blockLen + filterLen].i == 987.0F);
|
||||||
|
#if TEST_WITH_MIN_LEN
|
||||||
|
for (int k = 0; k < n_oop_out; ++k )
|
||||||
|
fprintf(stderr, "y[%2d] = %g\n", k, y[k]);
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
fprintf(stderr, "\nrunning out-of-place complex/real convolution for '%s':\n", conv_arch.id);
|
||||||
|
n_oop_out = bench_cx_real_oop(conv_arch, buffer_cx.data(), s.data(), N, filter.data(), filterLen, blockLen, y.data());
|
||||||
|
fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
|
||||||
|
assert(s[N] == 123.0F);
|
||||||
|
assert(y[N] == 321.0F);
|
||||||
|
assert(buffer[blockLen + filterLen] == 789.0F);
|
||||||
|
assert(buffer_cx[blockLen + filterLen].i == 987.0F);
|
||||||
|
#if TEST_WITH_MIN_LEN
|
||||||
|
fprintf(stderr, "complex output (%d complex samples):\n", n_oop_out);
|
||||||
|
for (int k = 0; k < n_oop_out; ++k )
|
||||||
|
fprintf(stderr, "y[%2d] = %g %+g * i\n", k, y[2*k], y[2*k+1]);
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
|
||||||
|
const std::complex<float> * sc = reinterpret_cast< std::complex<float>* >( s.data() );
|
||||||
|
const int Nc = N /2;
|
||||||
|
fprintf(stderr, "reference with std::complex<float>:\n");
|
||||||
|
for (int off = 0; off +filterLen <= Nc; ++off )
|
||||||
|
{
|
||||||
|
std::complex<float> sum(0.0F, 0.0F);
|
||||||
|
for (int k=0; k < filterLen; ++k)
|
||||||
|
sum += sc[off+k] * filter[k];
|
||||||
|
fprintf(stderr, "yv[%2d] = %g %+g * i\n", off, sum.real(), sum.imag() );
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
fprintf(stderr, "\nrunning inplace convolution core for '%s':\n", conv_arch.id);
|
||||||
|
int n_inp_out = bench_inplace_core(conv_arch, s.data(), N, filter.data(), filterLen, blockLen);
|
||||||
|
fprintf(stderr, "inp produced %d output samples\n", n_inp_out);
|
||||||
|
assert(s[N] == 123.0F);
|
||||||
|
assert(y[N] == 321.0F);
|
||||||
|
assert(buffer[blockLen + filterLen] == 789.0F);
|
||||||
|
assert(buffer_cx[blockLen + filterLen].i == 987.0F);
|
||||||
|
#if TEST_WITH_MIN_LEN
|
||||||
|
for (int k = 0; k < n_inp_out; ++k )
|
||||||
|
fprintf(stderr, "y[%2d] = %g\n", k, s[k]);
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
889
pffft/bench_mixers.cpp
Normal file
889
pffft/bench_mixers.cpp
Normal file
@@ -0,0 +1,889 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
|
||||||
|
|
||||||
|
bench for mixer algorithm/implementations
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <pf_mixer.h>
|
||||||
|
|
||||||
|
#include <math.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <time.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include "papi_perf_counter.h"
|
||||||
|
|
||||||
|
#if defined(__linux__)
|
||||||
|
#define HAVE_SYS_TIMES
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef HAVE_SYS_TIMES
|
||||||
|
# include <sys/times.h>
|
||||||
|
# include <unistd.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef WIN32
|
||||||
|
#define WIN32_LEAN_AND_MEAN
|
||||||
|
#define VC_EXTRALEAN
|
||||||
|
#include <windows.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* ---- compile-time benchmark configuration -------------------------------- */

/* switches for the groups of algorithms to benchmark (1 = on, 0 = off) */
#define BENCH_REF_TRIG_FUNC       1
#define BENCH_OUT_OF_PLACE_ALGOS  0
#define BENCH_INPLACE_ALGOS       1

/* save(): write to a default file when no file name is given (1) or skip (0) */
#define SAVE_BY_DEFAULT  0
/* save(): upper limit for the dumped sample count, in units of 1024*1024 samples */
#define SAVE_LIMIT_MSPS  16

/*
 * Per-algorithm dump file names handed to save(); an empty string means
 * "no dedicated file". The first branch holds developer-local paths and is
 * disabled.
 */
#if 0
#  define BENCH_FILE_SHIFT_MATH_CC           "/home/ayguen/WindowsDesktop/mixer_test/A_shift_math_cc.bin"
#  define BENCH_FILE_ADD_FAST_CC             "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_cc.bin"
#  define BENCH_FILE_ADD_FAST_INP_C          "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_inp_c.bin"
#  define BENCH_FILE_UNROLL_INP_C            "/home/ayguen/WindowsDesktop/mixer_test/D_shift_unroll_inp_c.bin"
#  define BENCH_FILE_LTD_UNROLL_INP_C        "/home/ayguen/WindowsDesktop/mixer_test/E_shift_limited_unroll_inp_c.bin"
#  define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C  "/home/ayguen/WindowsDesktop/mixer_test/F_shift_limited_unroll_A_sse_inp_c.bin"
#  define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C  "/home/ayguen/WindowsDesktop/mixer_test/G_shift_limited_unroll_B_sse_inp_c.bin"
#  define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C  "/home/ayguen/WindowsDesktop/mixer_test/H_shift_limited_unroll_C_sse_inp_c.bin"
#  define BENCH_FILE_REC_OSC_CC              ""
#  define BENCH_FILE_REC_OSC_INP_C           "/home/ayguen/WindowsDesktop/mixer_test/I_shift_recursive_osc_inp_c.bin"
#  define BENCH_FILE_REC_OSC_SSE_INP_C       "/home/ayguen/WindowsDesktop/mixer_test/J_shift_recursive_osc_sse_inp_c.bin"
#else
#  define BENCH_FILE_SHIFT_MATH_CC           ""
#  define BENCH_FILE_ADD_FAST_CC             ""
#  define BENCH_FILE_ADD_FAST_INP_C          ""
#  define BENCH_FILE_UNROLL_INP_C            ""
#  define BENCH_FILE_LTD_UNROLL_INP_C        ""
#  define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C  ""
#  define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C  ""
#  define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C  ""
#  define BENCH_FILE_REC_OSC_CC              ""
#  define BENCH_FILE_REC_OSC_INP_C           ""
#  define BENCH_FILE_REC_OSC_SSE_INP_C       ""
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(HAVE_SYS_TIMES)

/* clock ticks per second from sysconf(); 0 means "not queried yet" */
static double ttclk = 0.;

/*
 * Return the user CPU time of this process in seconds (times() / tms_utime).
 *
 * find_start: when non-zero, spin until tms_utime changes, so the returned
 *             value sits right at a tick boundary.
 *
 * The tick rate is queried on the first call and printed to stderr.
 */
static double uclock_sec(int find_start)
{
    struct tms now;

    if (ttclk == 0.)
    {
        ttclk = sysconf(_SC_CLK_TCK);
        fprintf(stderr, "sysconf(_SC_CLK_TCK) => %f\n", ttclk);
    }

    times(&now);
    if (find_start)
    {
        /* wait for the next tick of the user-time counter */
        const struct tms first = now;
        do {
            times(&now);
        } while (first.tms_utime == now.tms_utime);
    }

    /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
    return ((double)now.tms_utime) / ttclk;
}

#elif defined(WIN32)

// https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getprocesstimes
/*
 * Return the total user time of this process in seconds, taken from
 * GetProcessTimes(); returns 0 when the call fails.
 * find_start is not used by this variant.
 */
double uclock_sec(int find_start)
{
    FILETIME creation, exited, kernel, user;
    (void)find_start;

    if (GetProcessTimes(GetCurrentProcess(), &creation, &exited, &kernel, &user) == 0)
    {
        // Handle error
        return 0;
    }

    // Total user time; can be tweaked to include kernel times as well.
    const unsigned long long ticks =
        user.dwLowDateTime | ((unsigned long long)user.dwHighDateTime << 32);
    return (double)ticks * 0.0000001;
}

#else

/*
 * Fallback: processor time from clock(), converted to seconds.
 * find_start is not used by this variant.
 */
double uclock_sec(int find_start)
{
    (void)find_start;
    return (double)clock() / (double)CLOCKS_PER_SEC;
}

#endif
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Dump benchmark samples to a binary file, block by block.
 *
 * d   samples to write
 * B   block length in samples; only whole blocks are written
 * N   number of samples available in d; clamped to SAVE_LIMIT_MSPS * 1024 * 1024
 * fn  output file name; when NULL or empty, nothing is written unless
 *     SAVE_BY_DEFAULT is set, in which case "/dev/shm/bench.bin" is used
 *
 * Errors (open, short write, close) are reported on stderr; the function
 * never aborts the benchmark.
 */
void save(complexf * d, int B, int N, const char * fn)
{
    if (!fn || !fn[0])
    {
        if (! SAVE_BY_DEFAULT)
            return;
        fn = "/dev/shm/bench.bin";
    }

    /* a non-positive block length would never advance the write loop below */
    if (B <= 0)
        return;

    FILE * f = fopen(fn, "wb");
    if (!f) {
        fprintf(stderr, "error writing result to %s\n", fn);
        return;
    }

    if ( N >= SAVE_LIMIT_MSPS * 1024 * 1024 )
        N = SAVE_LIMIT_MSPS * 1024 * 1024;

    for (int off = 0; off + B <= N; off += B)
    {
        /* stop at the first short write instead of silently producing a truncated dump */
        if (fwrite(d+off, sizeof(complexf), B, f) != (size_t)B)
        {
            fprintf(stderr, "error writing result to %s\n", fn);
            break;
        }
    }

    /* buffered data is flushed on close, so a failure here also means lost output */
    if (fclose(f) != 0)
        fprintf(stderr, "error writing result to %s\n", fn);
}
|
||||||
|
|
||||||
|
|
||||||
|
double bench_core_shift_math_cc(
|
||||||
|
const int B, const int N, const bool ignore_time,
|
||||||
|
const complexf *input,
|
||||||
|
complexf *output,
|
||||||
|
int &iters_out, int &off_out
|
||||||
|
)
|
||||||
|
{
|
||||||
|
const double t0 = uclock_sec(1);
|
||||||
|
const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||||
|
double t1;
|
||||||
|
float phase = 0.0F;
|
||||||
|
int off = 0, iter = 0;
|
||||||
|
papi_perf_counter perf_counter(1);
|
||||||
|
|
||||||
|
do {
|
||||||
|
// work
|
||||||
|
phase = shift_math_cc(input+off, output+off, B, -0.0009F, phase);
|
||||||
|
off += B;
|
||||||
|
++iter;
|
||||||
|
t1 = uclock_sec(0);
|
||||||
|
} while ( off + B < N && (ignore_time || t1 < tstop) );
|
||||||
|
|
||||||
|
iters_out = iter;
|
||||||
|
off_out = off;
|
||||||
|
return t1 - t0;
|
||||||
|
}
|
||||||
|
|
||||||
|
double bench_shift_math_cc(const int B, const int N, const bool ignore_time) {
|
||||||
|
int iter, off;
|
||||||
|
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
complexf *output = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
shift_recursive_osc_t gen_state;
|
||||||
|
shift_recursive_osc_conf_t gen_conf;
|
||||||
|
|
||||||
|
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||||
|
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||||
|
|
||||||
|
double T = bench_core_shift_math_cc(B, N, ignore_time, input, output, iter, off);
|
||||||
|
|
||||||
|
save(output, B, off, BENCH_FILE_SHIFT_MATH_CC);
|
||||||
|
|
||||||
|
free(input);
|
||||||
|
free(output);
|
||||||
|
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||||
|
double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||||
|
return (nI / T); /* normalized iterations per second */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
double bench_shift_table_cc(int B, int N) {
|
||||||
|
double t0, t1, tstop, T, nI;
|
||||||
|
int iter, off;
|
||||||
|
int table_size=65536;
|
||||||
|
float phase = 0.0F;
|
||||||
|
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
complexf *output = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
shift_recursive_osc_t gen_state;
|
||||||
|
shift_recursive_osc_conf_t gen_conf;
|
||||||
|
|
||||||
|
shift_table_data_t table_data = shift_table_init(table_size);
|
||||||
|
|
||||||
|
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||||
|
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||||
|
|
||||||
|
iter = 0;
|
||||||
|
off = 0;
|
||||||
|
t0 = uclock_sec(1);
|
||||||
|
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||||
|
do {
|
||||||
|
// work
|
||||||
|
phase = shift_table_cc(input+off, output+off, B, -0.0009F, table_data, phase);
|
||||||
|
|
||||||
|
off += B;
|
||||||
|
++iter;
|
||||||
|
t1 = uclock_sec(0);
|
||||||
|
} while ( t1 < tstop && off + B < N );
|
||||||
|
|
||||||
|
save(output, B, off, NULL);
|
||||||
|
free(input);
|
||||||
|
free(output);
|
||||||
|
T = ( t1 - t0 ); /* duration per fft() */
|
||||||
|
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||||
|
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||||
|
return (nI / T); /* normalized iterations per second */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
double bench_shift_addfast(int B, int N) {
|
||||||
|
double t0, t1, tstop, T, nI;
|
||||||
|
int iter, off;
|
||||||
|
float phase = 0.0F;
|
||||||
|
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
complexf *output = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
shift_recursive_osc_t gen_state;
|
||||||
|
shift_recursive_osc_conf_t gen_conf;
|
||||||
|
shift_addfast_data_t state = shift_addfast_init(-0.0009F);
|
||||||
|
|
||||||
|
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||||
|
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||||
|
|
||||||
|
iter = 0;
|
||||||
|
off = 0;
|
||||||
|
t0 = uclock_sec(1);
|
||||||
|
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||||
|
do {
|
||||||
|
// work
|
||||||
|
phase = shift_addfast_cc(input+off, output+off, B, &state, phase);
|
||||||
|
|
||||||
|
off += B;
|
||||||
|
++iter;
|
||||||
|
t1 = uclock_sec(0);
|
||||||
|
} while ( t1 < tstop && off + B < N );
|
||||||
|
|
||||||
|
save(output, B, off, BENCH_FILE_ADD_FAST_CC);
|
||||||
|
|
||||||
|
free(input);
|
||||||
|
free(output);
|
||||||
|
T = ( t1 - t0 ); /* duration per fft() */
|
||||||
|
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||||
|
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||||
|
return (nI / T); /* normalized iterations per second */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
double bench_core_shift_addfast_inplace(
|
||||||
|
const int B, const int N, const bool ignore_time,
|
||||||
|
complexf *data,
|
||||||
|
shift_addfast_data_t &state,
|
||||||
|
int &iters_out, int &off_out
|
||||||
|
)
|
||||||
|
{
|
||||||
|
const double t0 = uclock_sec(1);
|
||||||
|
const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||||
|
double t1;
|
||||||
|
float phase = 0.0F;
|
||||||
|
int off = 0, iter = 0;
|
||||||
|
papi_perf_counter perf_counter(1);
|
||||||
|
|
||||||
|
do {
|
||||||
|
// work
|
||||||
|
phase = shift_addfast_inp_c(data+off, B, &state, phase);
|
||||||
|
off += B;
|
||||||
|
++iter;
|
||||||
|
t1 = uclock_sec(0);
|
||||||
|
} while ( off + B < N && (ignore_time || t1 < tstop) );
|
||||||
|
|
||||||
|
iters_out = iter;
|
||||||
|
off_out = off;
|
||||||
|
return t1 - t0;
|
||||||
|
}
|
||||||
|
|
||||||
|
double bench_shift_addfast_inp(int B, int N, const bool ignore_time) {
|
||||||
|
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
shift_recursive_osc_t gen_state;
|
||||||
|
shift_recursive_osc_conf_t gen_conf;
|
||||||
|
shift_addfast_data_t state = shift_addfast_init(-0.0009F);
|
||||||
|
int iter, off;
|
||||||
|
|
||||||
|
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||||
|
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||||
|
|
||||||
|
double T = bench_core_shift_addfast_inplace(
|
||||||
|
B, N, ignore_time, input, state,
|
||||||
|
iter, off
|
||||||
|
);
|
||||||
|
|
||||||
|
save(input, B, off, BENCH_FILE_ADD_FAST_INP_C);
|
||||||
|
|
||||||
|
free(input);
|
||||||
|
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||||
|
double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||||
|
return (nI / T); /* normalized iterations per second */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
double bench_shift_unroll_oop(int B, int N) {
|
||||||
|
double t0, t1, tstop, T, nI;
|
||||||
|
int iter, off;
|
||||||
|
float phase = 0.0F;
|
||||||
|
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
complexf *output = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
shift_recursive_osc_t gen_state;
|
||||||
|
shift_recursive_osc_conf_t gen_conf;
|
||||||
|
shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);
|
||||||
|
|
||||||
|
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||||
|
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||||
|
|
||||||
|
iter = 0;
|
||||||
|
off = 0;
|
||||||
|
t0 = uclock_sec(1);
|
||||||
|
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||||
|
do {
|
||||||
|
// work
|
||||||
|
phase = shift_unroll_cc(input+off, output+off, B, &state, phase);
|
||||||
|
|
||||||
|
off += B;
|
||||||
|
++iter;
|
||||||
|
t1 = uclock_sec(0);
|
||||||
|
} while ( t1 < tstop && off + B < N );
|
||||||
|
|
||||||
|
save(output, B, off, NULL);
|
||||||
|
free(input);
|
||||||
|
free(output);
|
||||||
|
T = ( t1 - t0 ); /* duration per fft() */
|
||||||
|
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||||
|
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||||
|
return (nI / T); /* normalized iterations per second */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
double bench_core_shift_unroll_inplace(
|
||||||
|
const int B, const int N, const bool ignore_time,
|
||||||
|
complexf *data,
|
||||||
|
shift_unroll_data_t &state,
|
||||||
|
int &iters_out, int &off_out
|
||||||
|
)
|
||||||
|
{
|
||||||
|
const double t0 = uclock_sec(1);
|
||||||
|
const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||||
|
double t1;
|
||||||
|
float phase = 0.0F;
|
||||||
|
int off = 0, iter = 0;
|
||||||
|
papi_perf_counter perf_counter(1);
|
||||||
|
|
||||||
|
do {
|
||||||
|
// work
|
||||||
|
phase = shift_unroll_inp_c(data+off, B, &state, phase);
|
||||||
|
off += B;
|
||||||
|
++iter;
|
||||||
|
t1 = uclock_sec(0);
|
||||||
|
} while ( off + B < N && (ignore_time || t1 < tstop) );
|
||||||
|
|
||||||
|
iters_out = iter;
|
||||||
|
off_out = off;
|
||||||
|
return t1 - t0;
|
||||||
|
}
|
||||||
|
|
||||||
|
double bench_shift_unroll_inp(const int B, const int N, const bool ignore_time) {
|
||||||
|
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
shift_recursive_osc_t gen_state;
|
||||||
|
shift_recursive_osc_conf_t gen_conf;
|
||||||
|
shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);
|
||||||
|
int iter, off;
|
||||||
|
|
||||||
|
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||||
|
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||||
|
|
||||||
|
double T = bench_core_shift_unroll_inplace(
|
||||||
|
B, N, ignore_time, input, state,
|
||||||
|
iter, off
|
||||||
|
);
|
||||||
|
|
||||||
|
save(input, B, off, BENCH_FILE_UNROLL_INP_C);
|
||||||
|
|
||||||
|
free(input);
|
||||||
|
shift_unroll_deinit(&state);
|
||||||
|
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||||
|
double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||||
|
return (nI / T); /* normalized iterations per second */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
double bench_shift_limited_unroll_oop(int B, int N) {
|
||||||
|
double t0, t1, tstop, T, nI;
|
||||||
|
int iter, off;
|
||||||
|
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
complexf *output = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
shift_recursive_osc_t gen_state;
|
||||||
|
shift_recursive_osc_conf_t gen_conf;
|
||||||
|
shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);
|
||||||
|
|
||||||
|
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||||
|
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||||
|
|
||||||
|
iter = 0;
|
||||||
|
off = 0;
|
||||||
|
t0 = uclock_sec(1);
|
||||||
|
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||||
|
do {
|
||||||
|
// work
|
||||||
|
shift_limited_unroll_cc(input+off, output+off, B, &state);
|
||||||
|
|
||||||
|
off += B;
|
||||||
|
++iter;
|
||||||
|
t1 = uclock_sec(0);
|
||||||
|
} while ( t1 < tstop && off + B < N );
|
||||||
|
|
||||||
|
save(output, B, off, NULL);
|
||||||
|
free(input);
|
||||||
|
free(output);
|
||||||
|
T = ( t1 - t0 ); /* duration per fft() */
|
||||||
|
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||||
|
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||||
|
return (nI / T); /* normalized iterations per second */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
double bench_core_shift_limited_unroll_inplace(
|
||||||
|
const int B, const int N, const bool ignore_time,
|
||||||
|
complexf *data,
|
||||||
|
shift_limited_unroll_data_t &state,
|
||||||
|
int &iters_out, int &off_out
|
||||||
|
)
|
||||||
|
{
|
||||||
|
const double t0 = uclock_sec(1);
|
||||||
|
const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||||
|
double t1;
|
||||||
|
int off = 0, iter = 0;
|
||||||
|
papi_perf_counter perf_counter(1);
|
||||||
|
|
||||||
|
do {
|
||||||
|
// work
|
||||||
|
shift_limited_unroll_inp_c(data+off, B, &state);
|
||||||
|
off += B;
|
||||||
|
++iter;
|
||||||
|
t1 = uclock_sec(0);
|
||||||
|
} while ( off + B < N && (ignore_time || t1 < tstop) );
|
||||||
|
|
||||||
|
iters_out = iter;
|
||||||
|
off_out = off;
|
||||||
|
return t1 - t0;
|
||||||
|
}
|
||||||
|
|
||||||
|
double bench_shift_limited_unroll_inp(const int B, const int N, const bool ignore_time) {
|
||||||
|
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
shift_recursive_osc_t gen_state;
|
||||||
|
shift_recursive_osc_conf_t gen_conf;
|
||||||
|
shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);
|
||||||
|
int iter, off;
|
||||||
|
|
||||||
|
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||||
|
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||||
|
|
||||||
|
double T = bench_core_shift_limited_unroll_inplace(
|
||||||
|
B, N, ignore_time, input, state,
|
||||||
|
iter, off
|
||||||
|
);
|
||||||
|
|
||||||
|
save(input, B, off, BENCH_FILE_LTD_UNROLL_INP_C);
|
||||||
|
|
||||||
|
free(input);
|
||||||
|
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||||
|
double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||||
|
return (nI / T); /* normalized iterations per second */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
double bench_core_shift_limited_unroll_A_sse_inplace(
|
||||||
|
const int B, const int N, const bool ignore_time,
|
||||||
|
complexf *data,
|
||||||
|
shift_limited_unroll_A_sse_data_t &state,
|
||||||
|
int &iters_out, int &off_out
|
||||||
|
)
|
||||||
|
{
|
||||||
|
const double t0 = uclock_sec(1);
|
||||||
|
const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||||
|
double t1;
|
||||||
|
int off = 0, iter = 0;
|
||||||
|
papi_perf_counter perf_counter(1);
|
||||||
|
|
||||||
|
do {
|
||||||
|
// work
|
||||||
|
shift_limited_unroll_A_sse_inp_c(data+off, B, &state);
|
||||||
|
off += B;
|
||||||
|
++iter;
|
||||||
|
t1 = uclock_sec(0);
|
||||||
|
} while ( off + B < N && (ignore_time || t1 < tstop) );
|
||||||
|
|
||||||
|
iters_out = iter;
|
||||||
|
off_out = off;
|
||||||
|
return t1 - t0;
|
||||||
|
}
|
||||||
|
|
||||||
|
double bench_shift_limited_unroll_A_sse_inp(const int B, const int N, const bool ignore_time) {
|
||||||
|
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
shift_recursive_osc_t gen_state;
|
||||||
|
shift_recursive_osc_conf_t gen_conf;
|
||||||
|
shift_limited_unroll_A_sse_data_t *state = (shift_limited_unroll_A_sse_data_t*)malloc(sizeof(shift_limited_unroll_A_sse_data_t));
|
||||||
|
int iter, off;
|
||||||
|
|
||||||
|
*state = shift_limited_unroll_A_sse_init(-0.0009F, 0.0F);
|
||||||
|
|
||||||
|
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||||
|
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||||
|
|
||||||
|
double T = bench_core_shift_limited_unroll_A_sse_inplace(
|
||||||
|
B, N, ignore_time, input, *state,
|
||||||
|
iter, off
|
||||||
|
);
|
||||||
|
|
||||||
|
save(input, B, off, BENCH_FILE_LTD_UNROLL_A_SSE_INP_C);
|
||||||
|
|
||||||
|
free(input);
|
||||||
|
free(state);
|
||||||
|
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||||
|
double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||||
|
return (nI / T); /* normalized iterations per second */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
double bench_core_shift_limited_unroll_B_sse_inplace(
|
||||||
|
const int B, const int N, const bool ignore_time,
|
||||||
|
complexf *data,
|
||||||
|
shift_limited_unroll_B_sse_data_t &state,
|
||||||
|
int &iters_out, int &off_out
|
||||||
|
)
|
||||||
|
{
|
||||||
|
const double t0 = uclock_sec(1);
|
||||||
|
const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||||
|
double t1;
|
||||||
|
int off = 0, iter = 0;
|
||||||
|
papi_perf_counter perf_counter(1);
|
||||||
|
|
||||||
|
do {
|
||||||
|
// work
|
||||||
|
shift_limited_unroll_B_sse_inp_c(data+off, B, &state);
|
||||||
|
off += B;
|
||||||
|
++iter;
|
||||||
|
t1 = uclock_sec(0);
|
||||||
|
} while ( off + B < N && (ignore_time || t1 < tstop) );
|
||||||
|
|
||||||
|
iters_out = iter;
|
||||||
|
off_out = off;
|
||||||
|
return t1 - t0;
|
||||||
|
}
|
||||||
|
|
||||||
|
double bench_shift_limited_unroll_B_sse_inp(const int B, const int N, const bool ignore_time) {
|
||||||
|
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
shift_recursive_osc_t gen_state;
|
||||||
|
shift_recursive_osc_conf_t gen_conf;
|
||||||
|
shift_limited_unroll_B_sse_data_t *state = (shift_limited_unroll_B_sse_data_t*)malloc(sizeof(shift_limited_unroll_B_sse_data_t));
|
||||||
|
int iter, off;
|
||||||
|
|
||||||
|
*state = shift_limited_unroll_B_sse_init(-0.0009F, 0.0F);
|
||||||
|
|
||||||
|
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||||
|
//shift_recursive_osc_init(0.0F, 0.0F, &gen_conf, &gen_state);
|
||||||
|
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||||
|
|
||||||
|
double T = bench_core_shift_limited_unroll_B_sse_inplace(
|
||||||
|
B, N, ignore_time, input, *state,
|
||||||
|
iter, off
|
||||||
|
);
|
||||||
|
|
||||||
|
save(input, B, off, BENCH_FILE_LTD_UNROLL_B_SSE_INP_C);
|
||||||
|
|
||||||
|
free(input);
|
||||||
|
free(state);
|
||||||
|
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||||
|
double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||||
|
return (nI / T); /* normalized iterations per second */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
double bench_core_shift_limited_unroll_C_sse_inplace(
|
||||||
|
const int B, const int N, const bool ignore_time,
|
||||||
|
complexf *data,
|
||||||
|
shift_limited_unroll_C_sse_data_t &state,
|
||||||
|
int &iters_out, int &off_out
|
||||||
|
)
|
||||||
|
{
|
||||||
|
const double t0 = uclock_sec(1);
|
||||||
|
const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||||
|
double t1;
|
||||||
|
int off = 0, iter = 0;
|
||||||
|
papi_perf_counter perf_counter(1);
|
||||||
|
|
||||||
|
do {
|
||||||
|
// work
|
||||||
|
shift_limited_unroll_C_sse_inp_c(data+off, B, &state);
|
||||||
|
off += B;
|
||||||
|
++iter;
|
||||||
|
t1 = uclock_sec(0);
|
||||||
|
} while ( off + B < N && (ignore_time || t1 < tstop) );
|
||||||
|
|
||||||
|
iters_out = iter;
|
||||||
|
off_out = off;
|
||||||
|
return t1 - t0;
|
||||||
|
}
|
||||||
|
|
||||||
|
double bench_shift_limited_unroll_C_sse_inp(const int B, const int N, const bool ignore_time) {
|
||||||
|
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
shift_recursive_osc_t gen_state;
|
||||||
|
shift_recursive_osc_conf_t gen_conf;
|
||||||
|
shift_limited_unroll_C_sse_data_t *state = (shift_limited_unroll_C_sse_data_t*)malloc(sizeof(shift_limited_unroll_C_sse_data_t));
|
||||||
|
int iter, off;
|
||||||
|
|
||||||
|
*state = shift_limited_unroll_C_sse_init(-0.0009F, 0.0F);
|
||||||
|
|
||||||
|
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||||
|
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||||
|
|
||||||
|
double T = bench_core_shift_limited_unroll_C_sse_inplace(
|
||||||
|
B, N, ignore_time, input, *state,
|
||||||
|
iter, off
|
||||||
|
);
|
||||||
|
|
||||||
|
save(input, B, off, BENCH_FILE_LTD_UNROLL_C_SSE_INP_C);
|
||||||
|
|
||||||
|
free(input);
|
||||||
|
free(state);
|
||||||
|
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||||
|
double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||||
|
return (nI / T); /* normalized iterations per second */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
double bench_shift_rec_osc_cc_oop(int B, int N) {
|
||||||
|
double t0, t1, tstop, T, nI;
|
||||||
|
int iter, off;
|
||||||
|
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
complexf *output = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
shift_recursive_osc_t gen_state, shift_state;
|
||||||
|
shift_recursive_osc_conf_t gen_conf, shift_conf;
|
||||||
|
|
||||||
|
shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);
|
||||||
|
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||||
|
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||||
|
|
||||||
|
iter = 0;
|
||||||
|
off = 0;
|
||||||
|
t0 = uclock_sec(1);
|
||||||
|
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||||
|
do {
|
||||||
|
// work
|
||||||
|
shift_recursive_osc_cc(input+off, output+off, B, &shift_conf, &shift_state);
|
||||||
|
|
||||||
|
off += B;
|
||||||
|
++iter;
|
||||||
|
t1 = uclock_sec(0);
|
||||||
|
} while ( t1 < tstop && off + B < N );
|
||||||
|
|
||||||
|
save(input, B, off, BENCH_FILE_REC_OSC_CC);
|
||||||
|
|
||||||
|
save(output, B, off, NULL);
|
||||||
|
free(input);
|
||||||
|
free(output);
|
||||||
|
T = ( t1 - t0 ); /* duration per fft() */
|
||||||
|
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||||
|
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||||
|
return (nI / T); /* normalized iterations per second */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
double bench_core_shift_rec_osc_cc_inplace(
|
||||||
|
const int B, const int N, const bool ignore_time,
|
||||||
|
complexf *data,
|
||||||
|
shift_recursive_osc_conf_t &conf, shift_recursive_osc_t &state,
|
||||||
|
int &iters_out, int &off_out
|
||||||
|
)
|
||||||
|
{
|
||||||
|
const double t0 = uclock_sec(1);
|
||||||
|
const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||||
|
double t1;
|
||||||
|
int off = 0, iter = 0;
|
||||||
|
papi_perf_counter perf_counter(1);
|
||||||
|
|
||||||
|
do {
|
||||||
|
// work
|
||||||
|
shift_recursive_osc_inp_c(data+off, B, &conf, &state);
|
||||||
|
off += B;
|
||||||
|
++iter;
|
||||||
|
t1 = uclock_sec(0);
|
||||||
|
} while ( off + B < N && (ignore_time || t1 < tstop) );
|
||||||
|
|
||||||
|
iters_out = iter;
|
||||||
|
off_out = off;
|
||||||
|
return t1 - t0;
|
||||||
|
}
|
||||||
|
|
||||||
|
double bench_shift_rec_osc_cc_inp(const int B, const int N, const bool ignore_time) {
|
||||||
|
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
shift_recursive_osc_t gen_state, shift_state;
|
||||||
|
shift_recursive_osc_conf_t gen_conf, shift_conf;
|
||||||
|
int iter, off;
|
||||||
|
|
||||||
|
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||||
|
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||||
|
shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);
|
||||||
|
|
||||||
|
double T = bench_core_shift_rec_osc_cc_inplace(
|
||||||
|
B, N, ignore_time, input, shift_conf, shift_state,
|
||||||
|
iter, off
|
||||||
|
);
|
||||||
|
|
||||||
|
save(input, B, off, BENCH_FILE_REC_OSC_INP_C);
|
||||||
|
free(input);
|
||||||
|
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||||
|
double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||||
|
return (nI / T); /* normalized iterations per second */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
double bench_core_shift_rec_osc_sse_c_inplace(
|
||||||
|
const int B, const int N, const bool ignore_time,
|
||||||
|
complexf *data,
|
||||||
|
shift_recursive_osc_sse_conf_t &conf, shift_recursive_osc_sse_t &state,
|
||||||
|
int &iters_out, int &off_out
|
||||||
|
)
|
||||||
|
{
|
||||||
|
const double t0 = uclock_sec(1);
|
||||||
|
const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||||
|
double t1;
|
||||||
|
int off = 0, iter = 0;
|
||||||
|
papi_perf_counter perf_counter(1);
|
||||||
|
|
||||||
|
do {
|
||||||
|
// work
|
||||||
|
shift_recursive_osc_sse_inp_c(data+off, B, &conf, &state);
|
||||||
|
off += B;
|
||||||
|
++iter;
|
||||||
|
t1 = uclock_sec(0);
|
||||||
|
} while ( off + B < N && (ignore_time || t1 < tstop) );
|
||||||
|
|
||||||
|
iters_out = iter;
|
||||||
|
off_out = off;
|
||||||
|
return t1 - t0;
|
||||||
|
}
|
||||||
|
|
||||||
|
double bench_shift_rec_osc_sse_c_inp(const int B, const int N, const bool ignore_time) {
|
||||||
|
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||||
|
shift_recursive_osc_t gen_state;
|
||||||
|
shift_recursive_osc_conf_t gen_conf;
|
||||||
|
|
||||||
|
shift_recursive_osc_sse_t *shift_state = (shift_recursive_osc_sse_t*)malloc(sizeof(shift_recursive_osc_sse_t));
|
||||||
|
shift_recursive_osc_sse_conf_t shift_conf;
|
||||||
|
int iter, off;
|
||||||
|
|
||||||
|
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||||
|
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||||
|
|
||||||
|
shift_recursive_osc_sse_init(-0.0009F, 0.0F, &shift_conf, shift_state);
|
||||||
|
|
||||||
|
double T = bench_core_shift_rec_osc_sse_c_inplace(
|
||||||
|
B, N, ignore_time, input, shift_conf, *shift_state,
|
||||||
|
iter, off
|
||||||
|
);
|
||||||
|
|
||||||
|
save(input, B, off, BENCH_FILE_REC_OSC_SSE_INP_C);
|
||||||
|
free(input);
|
||||||
|
free(shift_state);
|
||||||
|
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||||
|
double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||||
|
return (nI / T); /* normalized iterations per second */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Command line: <program> [<blockLength in samples> [<total # of MSamples>]]
 *
 * Without arguments the usage line is printed and the benchmarks run with the
 * defaults. A block length or sample count that is not a positive number (or
 * a sample count that does not fit into an int) prints the usage line and
 * exits without running anything.
 */
int main(int argc, char **argv)
{
    double rt;

    // process up to 64 MSample (512 MByte) in blocks of 8 kSamples (=64 kByte)
    int B = 8 * 1024;
    int N = 64 * 1024 * 1024;
    int showUsage = 0;
    bool ignore_time = true;

    if (argc == 1)
        showUsage = 1;

    if (1 < argc)
        B = atoi(argv[1]);
    if (2 < argc)
    {
        // largest MSample count whose product with 1024 * 1024 still fits into an int
        // (assumes 8-bit bytes)
        const int maxMSamples = (int)(((1UL << (8 * sizeof(int) - 1)) - 1UL) >> 20);
        const int numMSamples = atoi(argv[2]);
        // out-of-range values are mapped to 0, which is rejected below
        N = (0 < numMSamples && numMSamples <= maxMSamples) ? numMSamples * 1024 * 1024 : 0;
    }

    // negative values are as unusable as 0: they would end up as block size / buffer size
    if ( B <= 0 || N <= 0 || showUsage )
    {
        fprintf(stderr, "%s [<blockLength in samples> [<total # of MSamples>] ]\n", argv[0]);
        if ( B <= 0 || N <= 0 )
            return 0;
    }

    fprintf(stderr, "processing up to N = %d MSamples with block length of %d samples\n",
        N / (1024 * 1024), B );


#if BENCH_REF_TRIG_FUNC
    printf("\nstarting bench of shift_math_cc (out-of-place) with trig functions ..\n");
    rt = bench_shift_math_cc(B, N, ignore_time);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);
#endif

#if BENCH_OUT_OF_PLACE_ALGOS
    printf("starting bench of shift_table_cc (out-of-place) ..\n");
    rt = bench_shift_table_cc(B, N);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);

    printf("starting bench of shift_addfast_cc (out-of-place) ..\n");
    rt = bench_shift_addfast(B, N);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);

    printf("\nstarting bench of shift_unroll_cc (out-of-place) ..\n");
    rt = bench_shift_unroll_oop(B, N);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);

    printf("\nstarting bench of shift_limited_unroll_cc (out-of-place) ..\n");
    rt = bench_shift_limited_unroll_oop(B, N);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);

    printf("\nstarting bench of shift_recursive_osc_cc (out-of-place) ..\n");
    rt = bench_shift_rec_osc_cc_oop(B, N);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);
#endif

#if BENCH_INPLACE_ALGOS

    printf("starting bench of shift_addfast_inp_c in-place ..\n");
    rt = bench_shift_addfast_inp(B, N, ignore_time);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);

    printf("starting bench of shift_unroll_inp_c in-place ..\n");
    rt = bench_shift_unroll_inp(B, N, ignore_time);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);

    printf("starting bench of shift_limited_unroll_inp_c in-place ..\n");
    rt = bench_shift_limited_unroll_inp(B, N, ignore_time);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);

    // the SSE variants are only run when an SSE implementation is reported as available
    if ( have_sse_shift_mixer_impl() )
    {
        printf("starting bench of shift_limited_unroll_A_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_A_sse_inp(B, N, ignore_time);
        printf(" %f MSamples/sec\n\n", rt * 1E-6);

        printf("starting bench of shift_limited_unroll_B_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_B_sse_inp(B, N, ignore_time);
        printf(" %f MSamples/sec\n\n", rt * 1E-6);

        printf("starting bench of shift_limited_unroll_C_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_C_sse_inp(B, N, ignore_time);
        printf(" %f MSamples/sec\n\n", rt * 1E-6);
    }

    printf("starting bench of shift_recursive_osc_cc in-place ..\n");
    rt = bench_shift_rec_osc_cc_inp(B, N, ignore_time);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);

    if ( have_sse_shift_mixer_impl() )
    {
        printf("starting bench of shift_recursive_osc_sse_c in-place ..\n");
        rt = bench_shift_rec_osc_sse_c_inp(B, N, ignore_time);
        printf(" %f MSamples/sec\n\n", rt * 1E-6);
    }
#endif

    return 0;
}
|
||||||
|
|
||||||
1402
pffft/bench_pffft.c
Normal file
1402
pffft/bench_pffft.c
Normal file
File diff suppressed because it is too large
Load Diff
26
pffft/cmake/FindMIPP.cmake
Normal file
26
pffft/cmake/FindMIPP.cmake
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
|
||||||
|
# Find the MIPP headers (mipp.h).
# Once done this will define
#  MIPP_FOUND        - mipp.h was found
#  MIPP_INCLUDE_DIRS - directory containing mipp.h
#  MIPP              - interface library carrying the include directory,
#                      the HAVE_MIPP=1 definition and the C++11 requirement
#
# Search hints: ${MIPP_ROOT} and $ENV{HOME}/.local (suffix include/mipp).

# already in cache: be silent
if(MIPP_INCLUDE_DIRS)
  set(MIPP_FIND_QUIETLY TRUE)
endif()

find_path(MIPP_INCLUDE_DIRS NAMES mipp.h
  HINTS
    ${MIPP_ROOT}
    $ENV{HOME}/.local
  PATH_SUFFIXES include/mipp
)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MIPP DEFAULT_MSG MIPP_INCLUDE_DIRS)

if(NOT MIPP_FOUND)
  message(WARNING "MIPP not found.")
elseif(NOT TARGET MIPP)
  # found and the target was not created yet (e.g. by an earlier find_package() call)
  message(STATUS "MIPP_FOUND -> creating interface library MIPP at ${MIPP_INCLUDE_DIRS}")
  add_library(MIPP INTERFACE)
  target_compile_definitions(MIPP INTERFACE HAVE_MIPP=1)
  target_include_directories(MIPP INTERFACE ${MIPP_INCLUDE_DIRS})
  target_compile_features(MIPP INTERFACE cxx_std_11)
endif()

mark_as_advanced(MIPP_INCLUDE_DIRS)
|
||||||
25
pffft/cmake/FindPAPI.cmake
Normal file
25
pffft/cmake/FindPAPI.cmake
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
# Find PAPI libraries
# Once done this will define
#  PAPI_FOUND - System has PAPI
#  PAPI_INCLUDE_DIRS - The PAPI include directories
#  PAPI_LIBRARIES - The libraries needed to use PAPI
#  PAPI::PAPI - Imported target carrying the include directories and libraries
#
# Search hint: ${PAPI_ROOT}

# already in cache: be silent
if(PAPI_INCLUDE_DIRS AND PAPI_LIBRARIES)
  set(PAPI_FIND_QUIETLY TRUE)
endif()

find_path(PAPI_INCLUDE_DIRS NAMES papi.h HINTS ${PAPI_ROOT} PATH_SUFFIXES include)
find_library(PAPI_LIBRARIES NAMES papi HINTS ${PAPI_ROOT} PATH_SUFFIXES lib lib64)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(PAPI DEFAULT_MSG PAPI_LIBRARIES PAPI_INCLUDE_DIRS)

if(PAPI_FOUND AND NOT TARGET PAPI::PAPI)
  # IMPORTED_LOCATION must be the single library file found above, so the
  # target is created before "rt" is appended to PAPI_LIBRARIES.
  # UNKNOWN: find_library() may return a shared or a static library.
  add_library(PAPI::PAPI UNKNOWN IMPORTED)
  set_target_properties(PAPI::PAPI PROPERTIES
    INTERFACE_INCLUDE_DIRECTORIES "${PAPI_INCLUDE_DIRS}"
    IMPORTED_LOCATION "${PAPI_LIBRARIES}"
    INTERFACE_LINK_LIBRARIES "rt")

  # consumers of the plain variable additionally link librt
  set(PAPI_LIBRARIES ${PAPI_LIBRARIES} rt)
endif()

mark_as_advanced(PAPI_INCLUDE_DIRS PAPI_LIBRARIES)
|
||||||
11
pffft/cmake/compiler_warnings.cmake
Normal file
11
pffft/cmake/compiler_warnings.cmake
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
|
||||||
|
# target_activate_cxx_compiler_warnings(<target>)
#
# Adds -Wall -Wextra -pedantic as PRIVATE compile options of <target> when the
# C++ compiler is GNU or Clang. Nothing is added for other compilers.
function(target_activate_cxx_compiler_warnings target)
  # The generator expressions are quoted and use ';' as separator, so each one
  # reaches target_compile_options() as a single, intact argument.
  target_compile_options(${target} PRIVATE
    "$<$<CXX_COMPILER_ID:GNU>:-Wall;-Wextra;-pedantic>"
    "$<$<CXX_COMPILER_ID:Clang>:-Wall;-Wextra;-pedantic>"
  )
endfunction()
|
||||||
|
|
||||||
|
# target_activate_c_compiler_warnings(<target>)
#
# Adds -Wall -Wextra -pedantic as PRIVATE compile options of <target> when the
# C compiler is GNU or Clang. Nothing is added for other compilers.
function(target_activate_c_compiler_warnings target)
  # The generator expressions are quoted and use ';' as separator, so each one
  # reaches target_compile_options() as a single, intact argument.
  target_compile_options(${target} PRIVATE
    "$<$<C_COMPILER_ID:GNU>:-Wall;-Wextra;-pedantic>"
    "$<$<C_COMPILER_ID:Clang>:-Wall;-Wextra;-pedantic>"
  )
endfunction()
|
||||||
|
|
||||||
197
pffft/cmake/target_optimizations.cmake
Normal file
197
pffft/cmake/target_optimizations.cmake
Normal file
@@ -0,0 +1,197 @@
|
|||||||
|
|
||||||
|
# cmake options: TARGET_C_ARCH / TARGET_CXX_ARCH
|
||||||
|
# and optionally: TARGET_C_EXTRA TARGET_CXX_EXTRA
|
||||||
|
#
|
||||||
|
# provided:
|
||||||
|
# - function: target_set_c_arch_flags(<target>) # uses options TARGET_C_ARCH and TARGET_C_EXTRA
|
||||||
|
# - function: target_set_cxx_arch_flags(<target>) # uses options TARGET_CXX_ARCH and TARGET_CXX_EXTRA
|
||||||
|
# - macro: target_set_cxx_arch_option(<target> <gcc/clang_march> <gcc/clang_extra> <msvc_arch>)
|
||||||
|
#
|
||||||
|
# see https://en.wikichip.org/wiki/x86/extensions
|
||||||
|
# and https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html
|
||||||
|
# for gcc specific architecture options
|
||||||
|
# and https://docs.microsoft.com/en-us/cpp/build/reference/arch-x64
|
||||||
|
# or https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86
|
||||||
|
# for msvc specific architecture options
|
||||||
|
|
||||||
|
# https://en.wikichip.org/wiki/arm/versions
|
||||||
|
# https://en.wikipedia.org/wiki/Raspberry_Pi
|
||||||
|
# https://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html#ARM-Options
|
||||||
|
# https://en.wikipedia.org/wiki/Comparison_of_ARMv7-A_cores
|
||||||
|
# https://en.wikipedia.org/wiki/Comparison_of_ARMv8-A_cores
|
||||||
|
|
||||||
|
# arm32_rpi1 untested
|
||||||
|
# -mcpu=arm1176jzf-s -mfloat-abi=hard -mfpu=vfp -mtune=arm1176jzf-s
|
||||||
|
# arm32_rpi2 untested
|
||||||
|
# "-march=armv7-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
|
||||||
|
# "-march=armv8-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
|
||||||
|
# arm32_rpi3 with "armv7-a" tested on Raspbian GNU/Linux 10 (buster), 32-bit => MIPP test reports: NEONv1, 128 bits
|
||||||
|
# "-march=armv7-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
|
||||||
|
# arm32_rpi3 with "armv8-a" tested on Raspbian GNU/Linux 10 (buster), 32-bit => MIPP test reports: NEONv1, 128 bits
|
||||||
|
# "-march=armv8-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
|
||||||
|
# arm32_rpi3 with "armv8-a" tested on Raspbian GNU/Linux 10 (buster), 32-bit => MIPP test reports: NEONv1, 128 bits
|
||||||
|
# "-march=armv8-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4" "-mtune=cortex-a53"
|
||||||
|
# arm32_rpi4 untested
|
||||||
|
# RPi 4 Model B: Cortex-A72 => "-mtune=cortex-a72" ?
|
||||||
|
# "-mcpu=cortex-a72 -mfloat-abi=hard -mfpu=neon-fp-armv8 -mneon-for-64bits -mtune=cortex-a72"
|
||||||
|
|
||||||
|
# Named presets of additional compiler options. A preset is selected through
# TARGET_C_EXTRA / TARGET_CXX_EXTRA and looked up as GCC_EXTRA_OPT_<preset>.
set(MSVC_EXTRA_OPT_none "")
set(GCC_EXTRA_OPT_none "")
set(GCC_EXTRA_OPT_neon_vfpv4
  "-mfloat-abi=hard"
  "-mfpu=neon-vfpv4"
)
set(GCC_EXTRA_OPT_neon_rpi3_a53
  "-mfloat-abi=hard"
  "-mfpu=neon-vfpv4"
  "-mtune=cortex-a53"
)
set(GCC_EXTRA_OPT_neon_rpi4_a72
  "-mfloat-abi=hard"
  "-mfpu=neon-fp-armv8"
  "-mtune=cortex-a72"
)
|
||||||
|
|
||||||
|
# Choose the -march values and the extra-option presets offered for gcc/clang,
# depending on the target processor. The fallback (unknown processor) offers
# "native" only.
set(pffft_march_desc   "native")
set(pffft_march_values "none;native")
set(pffft_extra_values "")
set(pffft_extra_doc    "List of possible additional options")

if (CMAKE_SYSTEM_PROCESSOR STREQUAL "i686" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
  set(pffft_march_desc   "native/SSE2:pentium4/SSE3:core2/SSE4:nehalem/AVX:sandybridge/AVX2:haswell")
  set(pffft_march_values "none;native;pentium4;core2;nehalem;sandybridge;haswell")
  set(pffft_extra_doc    "List of possible EXTRA options")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
  set(pffft_march_desc   "native/ARMwNEON:armv8-a")
  set(pffft_march_values "none;native;armv8-a")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "armv7l")
  set(pffft_march_desc   "native/ARMwNEON:armv7-a")
  set(pffft_march_values "none;native;armv7-a")
  set(pffft_extra_values "none;neon_vfpv4;neon_rpi3_a53;neon_rpi4_a72")
else()
  message(WARNING "unsupported CMAKE_SYSTEM_PROCESSOR '${CMAKE_SYSTEM_PROCESSOR}'")
  # other PROCESSORs could be "ppc", "ppc64", "arm" - or something else?!
endif()

# publish the selection: description as normal variable, value lists in the cache
set(GCC_MARCH_DESC "${pffft_march_desc}")
set(GCC_MARCH_VALUES "${pffft_march_values}" CACHE INTERNAL "List of possible architectures")
set(GCC_EXTRA_VALUES "${pffft_extra_values}" CACHE INTERNAL "${pffft_extra_doc}")

unset(pffft_march_desc)
unset(pffft_march_values)
unset(pffft_extra_values)
unset(pffft_extra_doc)
|
||||||
|
|
||||||
|
# cmake options - depending on C/C++ compiler
|
||||||
|
# how are chances, that C and C++ compilers are from different vendors?
|
||||||
|
# Cache options for the C compiler:
#   TARGET_C_ARCH  - value for -march (gcc/clang) or /arch (msvc); "none" adds no flag
#   TARGET_C_EXTRA - name of a GCC_EXTRA_OPT_<name> preset; for gcc/clang only
#                    offered when presets exist for the target processor
if (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang")
  # gcc and clang share the option values; only the help texts name the compiler
  if (CMAKE_C_COMPILER_ID STREQUAL "GNU")
    set(pffft_c_compiler_label "gcc")
  else()
    set(pffft_c_compiler_label "clang")
  endif()

  set(TARGET_C_ARCH "none" CACHE STRING "${pffft_c_compiler_label} target C architecture (-march): ${GCC_MARCH_DESC}")
  set_property(CACHE TARGET_C_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES})
  if ( NOT (GCC_EXTRA_VALUES STREQUAL "") )
    set(TARGET_C_EXTRA "none" CACHE STRING "${pffft_c_compiler_label} additional options for C")
    set_property(CACHE TARGET_C_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES})
  endif()

  unset(pffft_c_compiler_label)
elseif (CMAKE_C_COMPILER_ID MATCHES "MSVC")
  set(TARGET_C_ARCH "none" CACHE STRING "msvc target C architecture (/arch): SSE2/AVX/AVX2/AVX512")
  set(TARGET_C_EXTRA "none" CACHE STRING "msvc additional options")
else()
  message(WARNING "unsupported C compiler '${CMAKE_C_COMPILER_ID}', see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
endif()
|
||||||
|
|
||||||
|
# Cache options for the C++ compiler:
#   TARGET_CXX_ARCH  - value for -march (gcc/clang) or /arch (msvc); "none" adds no flag
#   TARGET_CXX_EXTRA - name of a GCC_EXTRA_OPT_<name> preset; for gcc/clang only
#                      offered when presets exist for the target processor
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
  # gcc and clang share the option values; only the help texts name the compiler
  if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
    set(pffft_cxx_compiler_label "gcc")
  else()
    set(pffft_cxx_compiler_label "clang")
  endif()

  set(TARGET_CXX_ARCH "none" CACHE STRING "${pffft_cxx_compiler_label} target C++ architecture (-march): ${GCC_MARCH_DESC}")
  set_property(CACHE TARGET_CXX_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES})
  if ( NOT (GCC_EXTRA_VALUES STREQUAL "") )
    set(TARGET_CXX_EXTRA "none" CACHE STRING "${pffft_cxx_compiler_label} additional options for C++")
    set_property(CACHE TARGET_CXX_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES})
  endif()

  unset(pffft_cxx_compiler_label)
elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
  set(TARGET_CXX_ARCH "none" CACHE STRING "msvc target C++ architecture (/arch): SSE2/AVX/AVX2/AVX512")
  set(TARGET_CXX_EXTRA "none" CACHE STRING "msvc additional options")
else()
  message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}', see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
endif()
|
||||||
|
|
||||||
|
######################################################
|
||||||
|
|
||||||
|
# target_set_c_arch_flags(<target>)
#
# Applies the cache options TARGET_C_ARCH and TARGET_C_EXTRA to <target>:
#  - TARGET_C_ARCH is added as PRIVATE option -march=<arch> (gcc/clang) or
#    /arch:<arch> (msvc); an empty value or "none" adds nothing.
#  - TARGET_C_EXTRA selects the preset GCC_EXTRA_OPT_<extra>, added as PRIVATE
#    options for gcc/clang; an empty value or "none" adds nothing.
#  - PFFFT_ENABLE_NEON=1 is defined (PRIVATE) when an extra preset is selected
#    and either its name starts with "neon_" or the processor matches aarch64.
function(target_set_c_arch_flags target)
  # classify the C compiler once
  set(gnu_like FALSE)
  set(msvc_like FALSE)
  if (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang")
    set(gnu_like TRUE)
  elseif (CMAKE_C_COMPILER_ID MATCHES "MSVC")
    set(msvc_like TRUE)
  endif()
  set(unsupported_msg "unsupported C compiler '${CMAKE_C_COMPILER_ID}' for target_set_c_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")

  # architecture flag
  if ("${TARGET_C_ARCH}" STREQUAL "" OR "${TARGET_C_ARCH}" STREQUAL "none")
    message(STATUS "C ARCH for target ${target} is not set!")
  elseif (gnu_like)
    target_compile_options(${target} PRIVATE "-march=${TARGET_C_ARCH}")
    message(STATUS "C ARCH for target ${target} set: ${TARGET_C_ARCH}")
  elseif (msvc_like)
    target_compile_options(${target} PRIVATE "/arch:${TARGET_C_ARCH}")
    message(STATUS "C ARCH for target ${target} set: ${TARGET_C_ARCH}")
  else()
    message(WARNING "${unsupported_msg}")
  endif()

  # additional options: nothing more to do when no preset is selected
  if ("${TARGET_C_EXTRA}" STREQUAL "" OR "${TARGET_C_EXTRA}" STREQUAL "none")
    message(STATUS "C additional options for target ${target} is not set!")
    return()
  endif()

  set(extra_options "${GCC_EXTRA_OPT_${TARGET_C_EXTRA}}")
  if (gnu_like)
    target_compile_options(${target} PRIVATE "${extra_options}")
    message(STATUS "C additional options for target ${target} set: ${extra_options}")
  elseif (msvc_like)
    message(STATUS "C additional options for target ${target} not usable with MSVC")
  else()
    message(WARNING "${unsupported_msg}")
  endif()

  # NOTE(review): this is only reached when TARGET_C_EXTRA is set, so the aarch64
  # clause has no effect with the default (unset) TARGET_C_EXTRA - confirm intent.
  if ("${TARGET_C_EXTRA}" MATCHES "^neon_.*" OR CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    message(STATUS "additional option contains neon: setting PFFFT_ENABLE_NEON for C target ${target}")
    target_compile_definitions(${target} PRIVATE PFFFT_ENABLE_NEON=1)
  endif()
endfunction()
|
||||||
|
|
||||||
|
# target_set_cxx_arch_flags(<target>)
#
# Applies the cache options TARGET_CXX_ARCH and TARGET_CXX_EXTRA to <target>:
#  - TARGET_CXX_ARCH is added as PRIVATE option -march=<arch> (gcc/clang) or
#    /arch:<arch> (msvc); an empty value or "none" adds nothing.
#  - TARGET_CXX_EXTRA selects the preset GCC_EXTRA_OPT_<extra>, added as PRIVATE
#    options for gcc/clang; an empty value or "none" adds nothing.
#  - PFFFT_ENABLE_NEON=1 is defined (PRIVATE) when an extra preset is selected
#    and either its name starts with "neon_" or the processor matches aarch64.
#
# All compiler checks use the C++ compiler (CMAKE_CXX_COMPILER_ID).
function(target_set_cxx_arch_flags target)
  if ( ("${TARGET_CXX_ARCH}" STREQUAL "") OR ("${TARGET_CXX_ARCH}" STREQUAL "none") )
    message(STATUS "C++ ARCH for target ${target} is not set!")
  else()
    if ( (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") )
      target_compile_options(${target} PRIVATE "-march=${TARGET_CXX_ARCH}")
      message(STATUS "C++ ARCH for target ${target} set: ${TARGET_CXX_ARCH}")
    elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
      target_compile_options(${target} PRIVATE "/arch:${TARGET_CXX_ARCH}")
      message(STATUS "C++ ARCH for target ${target} set: ${TARGET_CXX_ARCH}")
    else()
      message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}' for target_set_cxx_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
    endif()
  endif()

  if ( ("${TARGET_CXX_EXTRA}" STREQUAL "") OR ("${TARGET_CXX_EXTRA}" STREQUAL "none") )
    message(STATUS "C++ additional options for target ${target} is not set!")
  else()
    # decide by the C++ compiler: these options are meant for the C++ sources
    if ( (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") )
      target_compile_options(${target} PRIVATE "${GCC_EXTRA_OPT_${TARGET_CXX_EXTRA}}")
      message(STATUS "C++ additional options for target ${target} set: ${GCC_EXTRA_OPT_${TARGET_CXX_EXTRA}}")
    elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
      message(STATUS "C++ additional options for target ${target} not usable with MSVC")
    else()
      message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}' for target_set_cxx_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
    endif()

    # NOTE(review): this is only reached when TARGET_CXX_EXTRA is set, so the aarch64
    # clause has no effect with the default (unset) TARGET_CXX_EXTRA - confirm intent.
    if ( ("${TARGET_CXX_EXTRA}" MATCHES "^neon_.*") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") )
      message(STATUS "additional option contains 'neon': setting PFFFT_ENABLE_NEON for C++ target ${target}")
      target_compile_definitions(${target} PRIVATE PFFFT_ENABLE_NEON=1)
    endif()
  endif()
endfunction()
|
||||||
|
|
||||||
|
|
||||||
|
# target_set_cxx_arch_option(<target> <gcc/clang_march> <gcc/clang_extra> <msvc_arch>)
#
# Sets architecture options of <target> from explicit arguments instead of the
# TARGET_CXX_* cache options:
#  - gcc/clang: adds -march=<gcc/clang_march> and the options of the preset
#    GCC_EXTRA_OPT_<gcc/clang_extra>
#  - msvc:      adds /arch:<msvc_arch>
# An argument that is empty or "none" adds nothing. All options are PRIVATE.
macro(target_set_cxx_arch_option target gcc_clang_arch gcc_clang_extra msvc_arch )
  if (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")

    if ( (NOT "${msvc_arch}" STREQUAL "") AND (NOT "${msvc_arch}" STREQUAL "none") )
      target_compile_options(${target} PRIVATE "/arch:${msvc_arch}")
      message(STATUS "C++ ARCH for target ${target} set: ${msvc_arch}")
    endif()

  elseif ( (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") )

    if ( (NOT "${gcc_clang_arch}" STREQUAL "") AND (NOT "${gcc_clang_arch}" STREQUAL "none") )
      target_compile_options(${target} PRIVATE "-march=${gcc_clang_arch}")
      message(STATUS "C++ ARCH for target ${target}: ${gcc_clang_arch}")
    endif()

    if ( (NOT "${gcc_clang_extra}" STREQUAL "") AND (NOT "${gcc_clang_extra}" STREQUAL "none") )
      target_compile_options(${target} PRIVATE "${GCC_EXTRA_OPT_${gcc_clang_extra}}")
      message(STATUS "C++ additional options for target ${target}: ${GCC_EXTRA_OPT_${gcc_clang_extra}}")
    endif()

  else()
    message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}' for target_set_cxx_arch_option(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
  endif()
endmacro()
|
||||||
|
|
||||||
25
pffft/cross_build_mingw32.sh
Executable file
25
pffft/cross_build_mingw32.sh
Executable file
@@ -0,0 +1,25 @@
|
|||||||
|
#!/bin/bash

# Cross-build for 32-bit Windows using the toolchain file mingw-w32-i686.cmake.
# requires debian/ubuntu packages: zip gcc-mingw-w64
#
# usage: cross_build_mingw32.sh <zip-post> <any other cmake options>
#   <zip-post>  suffix used for the build and install directory names
#   all further arguments are passed on to the cmake configure step

if [ -z "$1" ]; then
  echo "usage: $0 <zip-post> <any other cmake options>"
  exit 1
fi

ZIP_POST="$1"
shift

CROSS="i686-w64-mingw32"   # not used by the commands below
WN="w32"
TOOLCHAIN="mingw-w32-i686.cmake"
BUILD_DIR="build_${WN}_${ZIP_POST}"

# All expansions are quoted: the suffix comes from the command line and must
# not be word-split or glob-expanded, least of all in 'rm -rf'.
rm -rf -- "${BUILD_DIR}"
echo -e "\n\n********************************************************"
echo "start build of pffft_${WN}_${ZIP_POST}"
mkdir -- "${BUILD_DIR}" && \
cmake -S . -B "${BUILD_DIR}" \
  -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN}" \
  -DCMAKE_INSTALL_PREFIX="pffft_bin-${WN}_${ZIP_POST}" \
  "$@" && \
cmake --build "${BUILD_DIR}"
|
||||||
25
pffft/cross_build_mingw64.sh
Executable file
25
pffft/cross_build_mingw64.sh
Executable file
@@ -0,0 +1,25 @@
|
|||||||
|
#!/bin/bash

# Cross-build for 64-bit Windows using the toolchain file mingw-w64-x64_64.cmake.
# requires debian/ubuntu packages: zip gcc-mingw-w64
#
# usage: cross_build_mingw64.sh <zip-post> <any other cmake options>
#   <zip-post>  suffix used for the build and install directory names
#   all further arguments are passed on to the cmake configure step

if [ -z "$1" ]; then
  echo "usage: $0 <zip-post> <any other cmake options>"
  exit 1
fi

ZIP_POST="$1"
shift

# CROSS="x86_64-w64-mingw32"
WN="w64"
TOOLCHAIN="mingw-w64-x64_64.cmake"
BUILD_DIR="build_${WN}_${ZIP_POST}"

# All expansions are quoted: the suffix comes from the command line and must
# not be word-split or glob-expanded, least of all in 'rm -rf'.
rm -rf -- "${BUILD_DIR}"
echo -e "\n\n********************************************************"
echo "start build of pffft_${WN}_${ZIP_POST}"
mkdir -- "${BUILD_DIR}" && \
cmake -S . -B "${BUILD_DIR}" \
  -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN}" \
  -DCMAKE_INSTALL_PREFIX="pffft_bin-${WN}_${ZIP_POST}" \
  "$@" && \
cmake --build "${BUILD_DIR}"
|
||||||
63
pffft/examples/CMakeLists.txt
Normal file
63
pffft/examples/CMakeLists.txt
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
cmake_minimum_required(VERSION 3.1)
project(examples)

# MATHLIB: math library to link the examples with.
# MSVC gets none; all other compilers get "m" unless PFFFT_DISABLE_LINK_WITH_M is set.
if (NOT CMAKE_C_COMPILER_ID MATCHES "MSVC")
  if (NOT PFFFT_DISABLE_LINK_WITH_M)
    message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m")
    set(MATHLIB "m")
  endif()
else()
  # using Visual Studio C++
  message(STATUS "INFO: detected MSVC: will not link math lib m")
  set(MATHLIB "")
  add_definitions("/D_CRT_SECURE_NO_WARNINGS")
  set(MSVC_DISABLED_WARNINGS_LIST "C4996")
endif()

# STDCXXLIB: C++ runtime that is linked explicitly with MinGW only
if (MINGW)
  set(STDCXXLIB "stdc++")
else()
  set(STDCXXLIB "")
endif()

# request the plain language standard, without compiler extensions
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||||
|
|
||||||
|
|
||||||
|
# Examples using double precision; only built when PFFFT_USE_TYPE_DOUBLE is on.
if (PFFFT_USE_TYPE_DOUBLE)
  # C++11 examples
  foreach(example example_cpp11_real_dbl_fwd example_cpp11_cplx_dbl_fwd)
    add_executable(${example} ${example}.cpp)
    target_compile_definitions(${example} PRIVATE PFFFT_ENABLE_DOUBLE)
    target_link_libraries(${example} PRIVATE PFFFT ${STDCXXLIB} ${MATHLIB})
    set_target_properties(${example} PROPERTIES
      CXX_STANDARD 11
      CXX_STANDARD_REQUIRED ON
    )
  endforeach()

  # C example; uses the double API, so it gets PFFFT_ENABLE_DOUBLE like the
  # C++ examples in this section
  add_executable(example_c_cplx_dbl_fwd example_c_cplx_dbl_fwd.c)
  target_compile_definitions(example_c_cplx_dbl_fwd PRIVATE PFFFT_ENABLE_DOUBLE)
  target_link_libraries(example_c_cplx_dbl_fwd PRIVATE PFFFT ${MATHLIB})
endif()
|
||||||
|
|
||||||
|
|
||||||
|
# Examples using single precision; only built when PFFFT_USE_TYPE_FLOAT is on.
if (PFFFT_USE_TYPE_FLOAT)
  # C++98 examples
  foreach(example example_cpp98_real_flt_fwd example_cpp98_cplx_flt_fwd)
    add_executable(${example} ${example}.cpp)
    target_compile_definitions(${example} PRIVATE PFFFT_ENABLE_FLOAT)
    target_link_libraries(${example} PRIVATE PFFFT ${STDCXXLIB} ${MATHLIB})
    set_target_properties(${example} PROPERTIES
      CXX_STANDARD 98
      CXX_STANDARD_REQUIRED ON
    )
  endforeach()

  # C example
  add_executable(example_c_real_flt_fwd example_c_real_flt_fwd.c)
  target_compile_definitions(example_c_real_flt_fwd PRIVATE PFFFT_ENABLE_FLOAT)
  target_link_libraries(example_c_real_flt_fwd PRIVATE PFFFT ${MATHLIB})
endif()
|
||||||
|
|
||||||
69
pffft/examples/example_c_cplx_dbl_fwd.c
Normal file
69
pffft/examples/example_c_cplx_dbl_fwd.c
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
|
||||||
|
#include "pffft_double.h"
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
|
||||||
|
void c_forward_complex_double(const int transformLen)
|
||||||
|
{
|
||||||
|
printf("running %s()\n", __FUNCTION__);
|
||||||
|
|
||||||
|
/* first check - might be skipped */
|
||||||
|
if (transformLen < pffftd_min_fft_size(PFFFT_COMPLEX))
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Error: minimum FFT transformation length is %d\n", pffftd_min_fft_size(PFFFT_COMPLEX));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* instantiate FFT and prepare transformation for length N */
|
||||||
|
PFFFTD_Setup *ffts = pffftd_new_setup(transformLen, PFFFT_COMPLEX);
|
||||||
|
|
||||||
|
/* one more check */
|
||||||
|
if (!ffts)
|
||||||
|
{
|
||||||
|
fprintf(stderr,
|
||||||
|
"Error: transformation length %d is not decomposable into small prime factors. "
|
||||||
|
"Next valid transform size is: %d ; next power of 2 is: %d\n",
|
||||||
|
transformLen,
|
||||||
|
pffftd_nearest_transform_size(transformLen, PFFFT_COMPLEX, 1),
|
||||||
|
pffftd_next_power_of_two(transformLen) );
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* allocate aligned vectors for input X and output Y */
|
||||||
|
double *X = (double*)pffftd_aligned_malloc(transformLen * 2 * sizeof(double)); /* complex: re/im interleaved */
|
||||||
|
double *Y = (double*)pffftd_aligned_malloc(transformLen * 2 * sizeof(double)); /* complex: re/im interleaved */
|
||||||
|
double *W = (double*)pffftd_aligned_malloc(transformLen * 2 * sizeof(double));
|
||||||
|
|
||||||
|
/* prepare some input data */
|
||||||
|
for (int k = 0; k < 2 * transformLen; k += 4)
|
||||||
|
{
|
||||||
|
X[k] = k / 2; /* real */
|
||||||
|
X[k+1] = (k / 2) & 1; /* imag */
|
||||||
|
|
||||||
|
X[k+2] = -1 - k / 2; /* real */
|
||||||
|
X[k+3] = (k / 2) & 1; /* imag */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* do the forward transform; write complex spectrum result into Y */
|
||||||
|
pffftd_transform_ordered(ffts, X, Y, W, PFFFT_FORWARD);
|
||||||
|
|
||||||
|
/* print spectral output */
|
||||||
|
printf("output should be complex spectrum with %d complex bins\n", transformLen);
|
||||||
|
for (int k = 0; k < 2 * transformLen; k += 2)
|
||||||
|
printf("Y[%d] = %f + i * %f\n", k/2, Y[k], Y[k+1]);
|
||||||
|
|
||||||
|
pffftd_aligned_free(W);
|
||||||
|
pffftd_aligned_free(Y);
|
||||||
|
pffftd_aligned_free(X);
|
||||||
|
pffftd_destroy_setup(ffts);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Runs the example; the transform length is taken from the first command
 * line argument and defaults to 16. */
int main(int argc, char *argv[])
{
  int N = 16;

  if (argc > 1)
    N = atoi(argv[1]);

  c_forward_complex_double(N);
  return 0;
}
|
||||||
66
pffft/examples/example_c_real_flt_fwd.c
Normal file
66
pffft/examples/example_c_real_flt_fwd.c
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
|
||||||
|
#include "pffft.h"
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
|
||||||
|
void c_forward_real_float(const int transformLen)
|
||||||
|
{
|
||||||
|
printf("running %s()\n", __FUNCTION__);
|
||||||
|
|
||||||
|
/* first check - might be skipped */
|
||||||
|
if (transformLen < pffft_min_fft_size(PFFFT_REAL))
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Error: minimum FFT transformation length is %d\n", pffft_min_fft_size(PFFFT_REAL));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* instantiate FFT and prepare transformation for length N */
|
||||||
|
PFFFT_Setup *ffts = pffft_new_setup(transformLen, PFFFT_REAL);
|
||||||
|
|
||||||
|
/* one more check */
|
||||||
|
if (!ffts)
|
||||||
|
{
|
||||||
|
fprintf(stderr,
|
||||||
|
"Error: transformation length %d is not decomposable into small prime factors. "
|
||||||
|
"Next valid transform size is: %d ; next power of 2 is: %d\n",
|
||||||
|
transformLen,
|
||||||
|
pffft_nearest_transform_size(transformLen, PFFFT_REAL, 1),
|
||||||
|
pffft_next_power_of_two(transformLen) );
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* allocate aligned vectors for input X and output Y */
|
||||||
|
float *X = (float*)pffft_aligned_malloc(transformLen * sizeof(float));
|
||||||
|
float *Y = (float*)pffft_aligned_malloc(transformLen * sizeof(float)); /* complex: re/im interleaved */
|
||||||
|
float *W = (float*)pffft_aligned_malloc(transformLen * sizeof(float));
|
||||||
|
|
||||||
|
/* prepare some input data */
|
||||||
|
for (int k = 0; k < transformLen; k += 2)
|
||||||
|
{
|
||||||
|
X[k] = k;
|
||||||
|
X[k+1] = -1-k;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* do the forward transform; write complex spectrum result into Y */
|
||||||
|
pffft_transform_ordered(ffts, X, Y, W, PFFFT_FORWARD);
|
||||||
|
|
||||||
|
/* print spectral output */
|
||||||
|
printf("output should be complex spectrum with %d complex bins\n", transformLen /2);
|
||||||
|
for (int k = 0; k < transformLen; k += 2)
|
||||||
|
printf("Y[%d] = %f + i * %f\n", k/2, Y[k], Y[k+1]);
|
||||||
|
|
||||||
|
pffft_aligned_free(W);
|
||||||
|
pffft_aligned_free(Y);
|
||||||
|
pffft_aligned_free(X);
|
||||||
|
pffft_destroy_setup(ffts);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Program entry point: the transform length is read from the first
 * command-line argument when one is supplied, otherwise 32 is used.
 * The real single-precision forward example is then run once. */
int main(int argc, char *argv[])
{
  int N = 32;  /* default length when no argument is given */

  if (argc > 1)
    N = atoi(argv[1]);

  c_forward_real_float(N);
  return 0;
}
|
||||||
66
pffft/examples/example_cpp11_cplx_dbl_fwd.cpp
Normal file
66
pffft/examples/example_cpp11_cplx_dbl_fwd.cpp
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
|
||||||
|
#include "pffft.hpp"
|
||||||
|
|
||||||
|
#include <complex>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
|
||||||
|
void cxx11_forward_complex_double(const int transformLen)
|
||||||
|
{
|
||||||
|
std::cout << "running " << __FUNCTION__ << "()" << std::endl;
|
||||||
|
|
||||||
|
// first check - might be skipped
|
||||||
|
using FFT_T = pffft::Fft< std::complex<double> >;
|
||||||
|
if (transformLen < FFT_T::minFFtsize())
|
||||||
|
{
|
||||||
|
std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// instantiate FFT and prepare transformation for length N
|
||||||
|
pffft::Fft< std::complex<double> > fft(transformLen);
|
||||||
|
|
||||||
|
// one more check
|
||||||
|
if (!fft.isValid())
|
||||||
|
{
|
||||||
|
std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
|
||||||
|
<< "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
|
||||||
|
<< "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// allocate aligned vectors for input X and output Y
|
||||||
|
auto X = fft.valueVector();
|
||||||
|
auto Y = fft.spectrumVector();
|
||||||
|
|
||||||
|
// alternative access: get raw pointers to aligned vectors
|
||||||
|
std::complex<double> *Xs = X.data();
|
||||||
|
std::complex<double> *Ys = Y.data();
|
||||||
|
|
||||||
|
// prepare some input data
|
||||||
|
for (int k = 0; k < transformLen; k += 2)
|
||||||
|
{
|
||||||
|
X[k] = std::complex<double>(k, k&1); // access through AlignedVector<double>
|
||||||
|
Xs[k+1] = std::complex<double>(-1-k, k&1); // access through raw pointer
|
||||||
|
}
|
||||||
|
|
||||||
|
// do the forward transform; write complex spectrum result into Y
|
||||||
|
fft.forward(X, Y);
|
||||||
|
|
||||||
|
// print spectral output
|
||||||
|
std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
|
||||||
|
std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
|
||||||
|
for (unsigned k = 0; k < Y.size(); k += 2)
|
||||||
|
{
|
||||||
|
std::cout << "Y[" << k << "] = " << Y[k] << std::endl;
|
||||||
|
std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
int N = (1 < argc) ? atoi(argv[1]) : 16;
|
||||||
|
cxx11_forward_complex_double(N);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
66
pffft/examples/example_cpp11_real_dbl_fwd.cpp
Normal file
66
pffft/examples/example_cpp11_real_dbl_fwd.cpp
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
|
||||||
|
#include "pffft.hpp"
|
||||||
|
|
||||||
|
#include <complex>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
|
||||||
|
void cxx11_forward_real_double(const int transformLen)
|
||||||
|
{
|
||||||
|
std::cout << "running " << __FUNCTION__ << "()" << std::endl;
|
||||||
|
|
||||||
|
// first check - might be skipped
|
||||||
|
using FFT_T = pffft::Fft<double>;
|
||||||
|
if (transformLen < FFT_T::minFFtsize())
|
||||||
|
{
|
||||||
|
std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// instantiate FFT and prepare transformation for length N
|
||||||
|
pffft::Fft<double> fft { transformLen };
|
||||||
|
|
||||||
|
// one more check
|
||||||
|
if (!fft.isValid())
|
||||||
|
{
|
||||||
|
std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
|
||||||
|
<< "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
|
||||||
|
<< "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// allocate aligned vectors for (real) input X and (complex) output Y
|
||||||
|
auto X = fft.valueVector(); // input vector; type is AlignedVector<double>
|
||||||
|
auto Y = fft.spectrumVector(); // output vector; type is AlignedVector< std::complex<double> >
|
||||||
|
|
||||||
|
// alternative access: get raw pointers to aligned vectors
|
||||||
|
double *Xs = X.data();
|
||||||
|
std::complex<double> *Ys = Y.data();
|
||||||
|
|
||||||
|
// prepare some input data
|
||||||
|
for (int k = 0; k < transformLen; k += 2)
|
||||||
|
{
|
||||||
|
X[k] = k; // access through AlignedVector<double>
|
||||||
|
Xs[k+1] = -1-k; // access through raw pointer
|
||||||
|
}
|
||||||
|
|
||||||
|
// do the forward transform; write complex spectrum result into Y
|
||||||
|
fft.forward(X, Y);
|
||||||
|
|
||||||
|
// print spectral output
|
||||||
|
std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
|
||||||
|
std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
|
||||||
|
for (unsigned k = 0; k < Y.size(); k += 2)
|
||||||
|
{
|
||||||
|
std::cout << "Y[" << k << "] = " << Y[k] << std::endl;
|
||||||
|
std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
int N = (1 < argc) ? atoi(argv[1]) : 32;
|
||||||
|
cxx11_forward_real_double(N);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
66
pffft/examples/example_cpp98_cplx_flt_fwd.cpp
Normal file
66
pffft/examples/example_cpp98_cplx_flt_fwd.cpp
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
|
||||||
|
#include "pffft.hpp"
|
||||||
|
|
||||||
|
#include <complex>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
|
||||||
|
void cxx98_forward_complex_float(const int transformLen)
|
||||||
|
{
|
||||||
|
std::cout << "running " << __FUNCTION__ << "()" << std::endl;
|
||||||
|
|
||||||
|
// first check - might be skipped
|
||||||
|
typedef pffft::Fft< std::complex<float> > FFT_T;
|
||||||
|
if (transformLen < FFT_T::minFFtsize())
|
||||||
|
{
|
||||||
|
std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// instantiate FFT and prepare transformation for length N
|
||||||
|
pffft::Fft< std::complex<float> > fft(transformLen);
|
||||||
|
|
||||||
|
// one more check
|
||||||
|
if (!fft.isValid())
|
||||||
|
{
|
||||||
|
std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
|
||||||
|
<< "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
|
||||||
|
<< "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// allocate aligned vectors for input X and output Y
|
||||||
|
pffft::AlignedVector< std::complex<float> > X = fft.valueVector();
|
||||||
|
pffft::AlignedVector< std::complex<float> > Y = fft.spectrumVector();
|
||||||
|
|
||||||
|
// alternative access: get raw pointers to aligned vectors
|
||||||
|
std::complex<float> *Xs = X.data();
|
||||||
|
std::complex<float> *Ys = Y.data();
|
||||||
|
|
||||||
|
// prepare some input data
|
||||||
|
for (int k = 0; k < transformLen; k += 2)
|
||||||
|
{
|
||||||
|
X[k] = std::complex<float>(k, k&1); // access through AlignedVector<float>
|
||||||
|
Xs[k+1] = std::complex<float>(-1-k, k&1); // access through raw pointer
|
||||||
|
}
|
||||||
|
|
||||||
|
// do the forward transform; write complex spectrum result into Y
|
||||||
|
fft.forward(X, Y);
|
||||||
|
|
||||||
|
// print spectral output
|
||||||
|
std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
|
||||||
|
std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
|
||||||
|
for (unsigned k = 0; k < Y.size(); k += 2)
|
||||||
|
{
|
||||||
|
std::cout << "Y[" << k << "] = " << Y[k] << std::endl;
|
||||||
|
std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
int N = (1 < argc) ? atoi(argv[1]) : 16;
|
||||||
|
cxx98_forward_complex_float(N);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
66
pffft/examples/example_cpp98_real_flt_fwd.cpp
Normal file
66
pffft/examples/example_cpp98_real_flt_fwd.cpp
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
|
||||||
|
#include "pffft.hpp"
|
||||||
|
|
||||||
|
#include <complex>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
|
||||||
|
void cxx98_forward_real_float(const int transformLen)
|
||||||
|
{
|
||||||
|
std::cout << "running " << __FUNCTION__ << "()" << std::endl;
|
||||||
|
|
||||||
|
// first check - might be skipped
|
||||||
|
typedef pffft::Fft<float> FFT_T;
|
||||||
|
if (transformLen < FFT_T::minFFtsize())
|
||||||
|
{
|
||||||
|
std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// instantiate FFT and prepare transformation for length N
|
||||||
|
pffft::Fft<float> fft(transformLen);
|
||||||
|
|
||||||
|
// one more check
|
||||||
|
if (!fft.isValid())
|
||||||
|
{
|
||||||
|
std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
|
||||||
|
<< "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
|
||||||
|
<< "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// allocate aligned vectors for input X and output Y
|
||||||
|
pffft::AlignedVector<float> X = fft.valueVector();
|
||||||
|
pffft::AlignedVector< std::complex<float> > Y = fft.spectrumVector();
|
||||||
|
|
||||||
|
// alternative access: get raw pointers to aligned vectors
|
||||||
|
float *Xs = X.data();
|
||||||
|
std::complex<float> *Ys = Y.data();
|
||||||
|
|
||||||
|
// prepare some input data
|
||||||
|
for (int k = 0; k < transformLen; k += 2)
|
||||||
|
{
|
||||||
|
X[k] = k; // access through AlignedVector<float>
|
||||||
|
Xs[k+1] = -1-k; // access through raw pointer
|
||||||
|
}
|
||||||
|
|
||||||
|
// do the forward transform; write complex spectrum result into Y
|
||||||
|
fft.forward(X, Y);
|
||||||
|
|
||||||
|
// print spectral output
|
||||||
|
std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
|
||||||
|
std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
|
||||||
|
for (unsigned k = 0; k < Y.size(); k += 2)
|
||||||
|
{
|
||||||
|
std::cout << "Y[" << k << "] = " << Y[k] << std::endl;
|
||||||
|
std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
int N = (1 < argc) ? atoi(argv[1]) : 32;
|
||||||
|
cxx98_forward_real_float(N);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
3130
pffft/fftpack.c
Normal file
3130
pffft/fftpack.c
Normal file
File diff suppressed because it is too large
Load Diff
799
pffft/fftpack.h
Normal file
799
pffft/fftpack.h
Normal file
@@ -0,0 +1,799 @@
|
|||||||
|
/*
|
||||||
|
Interface for the f2c translation of fftpack as found on http://www.netlib.org/fftpack/
|
||||||
|
|
||||||
|
FFTPACK license:
|
||||||
|
|
||||||
|
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
|
||||||
|
|
||||||
|
Copyright (c) 2004 the University Corporation for Atmospheric
|
||||||
|
Research ("UCAR"). All rights reserved. Developed by NCAR's
|
||||||
|
Computational and Information Systems Laboratory, UCAR,
|
||||||
|
www.cisl.ucar.edu.
|
||||||
|
|
||||||
|
Redistribution and use of the Software in source and binary forms,
|
||||||
|
with or without modification, is permitted provided that the
|
||||||
|
following conditions are met:
|
||||||
|
|
||||||
|
- Neither the names of NCAR's Computational and Information Systems
|
||||||
|
Laboratory, the University Corporation for Atmospheric Research,
|
||||||
|
nor the names of its sponsors or contributors may be used to
|
||||||
|
endorse or promote products derived from this Software without
|
||||||
|
specific prior written permission.
|
||||||
|
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notices, this list of conditions, and the disclaimer below.
|
||||||
|
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer below in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||||
|
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||||
|
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||||
|
SOFTWARE.
|
||||||
|
|
||||||
|
ChangeLog:
|
||||||
|
2011/10/02: this is my first release of this file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef FFTPACK_H
#define FFTPACK_H

#ifdef __cplusplus
extern "C" {
#endif

/* Floating point type used throughout the package: define
   FFTPACK_DOUBLE_PRECISION before including this header to build a double
   precision fft; single precision is used otherwise. */
#ifdef FFTPACK_DOUBLE_PRECISION
typedef double fftpack_real;
#else
typedef float fftpack_real;
#endif

/* The integer type is int for either precision. */
typedef int fftpack_int;

/* complex periodic sequences: cffti initializes cfftf (forward transform)
   and cfftb (unnormalized inverse of cfftf) */
void cffti(fftpack_int n, fftpack_real *wsave);
void cfftf(fftpack_int n, fftpack_real *c, fftpack_real *wsave);
void cfftb(fftpack_int n, fftpack_real *c, fftpack_real *wsave);

/* real periodic sequences: rffti initializes rfftf (forward transform)
   and rfftb (backward transform) */
void rffti(fftpack_int n, fftpack_real *wsave);
void rfftf(fftpack_int n, fftpack_real *r, fftpack_real *wsave);
void rfftb(fftpack_int n, fftpack_real *r, fftpack_real *wsave);

/* cosine transform with odd wave numbers: cosqi initializes cosqf (forward)
   and cosqb (unnormalized inverse of cosqf) */
void cosqi(fftpack_int n, fftpack_real *wsave);
void cosqf(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
void cosqb(fftpack_int n, fftpack_real *x, fftpack_real *wsave);

/* cosine transform of a real even sequence: costi initializes cost */
void costi(fftpack_int n, fftpack_real *wsave);
void cost(fftpack_int n, fftpack_real *x, fftpack_real *wsave);

/* sine transform with odd wave numbers: sinqi initializes sinqf (forward)
   and sinqb (unnormalized inverse of sinqf) */
void sinqi(fftpack_int n, fftpack_real *wsave);
void sinqb(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
void sinqf(fftpack_int n, fftpack_real *x, fftpack_real *wsave);

/* sine transform of a real odd sequence: sinti initializes sint */
void sinti(fftpack_int n, fftpack_real *wsave);
void sint(fftpack_int n, fftpack_real *x, fftpack_real *wsave);

#ifdef __cplusplus
}
#endif

#endif /* FFTPACK_H */
|
||||||
|
|
||||||
|
/*
|
||||||
|
|
||||||
|
FFTPACK
|
||||||
|
|
||||||
|
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
|
||||||
|
|
||||||
|
version 4 april 1985
|
||||||
|
|
||||||
|
a package of fortran subprograms for the fast fourier
|
||||||
|
transform of periodic and other symmetric sequences
|
||||||
|
|
||||||
|
by
|
||||||
|
|
||||||
|
paul n swarztrauber
|
||||||
|
|
||||||
|
national center for atmospheric research boulder,colorado 80307
|
||||||
|
|
||||||
|
which is sponsored by the national science foundation
|
||||||
|
|
||||||
|
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
|
||||||
|
|
||||||
|
|
||||||
|
this package consists of programs which perform fast fourier
|
||||||
|
transforms for both complex and real periodic sequences and
|
||||||
|
certain other symmetric sequences that are listed below.
|
||||||
|
|
||||||
|
1. rffti initialize rfftf and rfftb
|
||||||
|
2. rfftf forward transform of a real periodic sequence
|
||||||
|
3. rfftb backward transform of a real coefficient array
|
||||||
|
|
||||||
|
4. ezffti initialize ezfftf and ezfftb
|
||||||
|
5. ezfftf a simplified real periodic forward transform
|
||||||
|
6. ezfftb a simplified real periodic backward transform
|
||||||
|
|
||||||
|
7. sinti initialize sint
|
||||||
|
8. sint sine transform of a real odd sequence
|
||||||
|
|
||||||
|
9. costi initialize cost
|
||||||
|
10. cost cosine transform of a real even sequence
|
||||||
|
|
||||||
|
11. sinqi initialize sinqf and sinqb
|
||||||
|
12. sinqf forward sine transform with odd wave numbers
|
||||||
|
13. sinqb unnormalized inverse of sinqf
|
||||||
|
|
||||||
|
14. cosqi initialize cosqf and cosqb
|
||||||
|
15. cosqf forward cosine transform with odd wave numbers
|
||||||
|
16. cosqb unnormalized inverse of cosqf
|
||||||
|
|
||||||
|
17. cffti initialize cfftf and cfftb
|
||||||
|
18. cfftf forward transform of a complex periodic sequence
|
||||||
|
19. cfftb unnormalized inverse of cfftf
|
||||||
|
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine rffti(n,wsave)
|
||||||
|
|
||||||
|
****************************************************************
|
||||||
|
|
||||||
|
subroutine rffti initializes the array wsave which is used in
|
||||||
|
both rfftf and rfftb. the prime factorization of n together with
|
||||||
|
a tabulation of the trigonometric functions are computed and
|
||||||
|
stored in wsave.
|
||||||
|
|
||||||
|
input parameter
|
||||||
|
|
||||||
|
n the length of the sequence to be transformed.
|
||||||
|
|
||||||
|
output parameter
|
||||||
|
|
||||||
|
wsave a work array which must be dimensioned at least 2*n+15.
|
||||||
|
the same work array can be used for both rfftf and rfftb
|
||||||
|
as long as n remains unchanged. different wsave arrays
|
||||||
|
are required for different values of n. the contents of
|
||||||
|
wsave must not be changed between calls of rfftf or rfftb.
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine rfftf(n,r,wsave)
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine rfftf computes the fourier coefficients of a real
|
||||||
|
periodic sequence (fourier analysis). the transform is defined
|
||||||
|
below at output parameter r.
|
||||||
|
|
||||||
|
input parameters
|
||||||
|
|
||||||
|
n the length of the array r to be transformed. the method
|
||||||
|
is most efficient when n is a product of small primes.
|
||||||
|
n may change so long as different work arrays are provided
|
||||||
|
|
||||||
|
r a real array of length n which contains the sequence
|
||||||
|
to be transformed
|
||||||
|
|
||||||
|
wsave a work array which must be dimensioned at least 2*n+15
|
||||||
|
in the program that calls rfftf. the wsave array must be
|
||||||
|
initialized by calling subroutine rffti(n,wsave) and a
|
||||||
|
different wsave array must be used for each different
|
||||||
|
value of n. this initialization does not have to be
|
||||||
|
repeated so long as n remains unchanged thus subsequent
|
||||||
|
transforms can be obtained faster than the first.
|
||||||
|
the same wsave array can be used by rfftf and rfftb.
|
||||||
|
|
||||||
|
|
||||||
|
output parameters
|
||||||
|
|
||||||
|
r r(1) = the sum from i=1 to i=n of r(i)
|
||||||
|
|
||||||
|
if n is even set l =n/2 , if n is odd set l = (n+1)/2
|
||||||
|
|
||||||
|
then for k = 2,...,l
|
||||||
|
|
||||||
|
r(2*k-2) = the sum from i = 1 to i = n of
|
||||||
|
|
||||||
|
r(i)*cos((k-1)*(i-1)*2*pi/n)
|
||||||
|
|
||||||
|
r(2*k-1) = the sum from i = 1 to i = n of
|
||||||
|
|
||||||
|
-r(i)*sin((k-1)*(i-1)*2*pi/n)
|
||||||
|
|
||||||
|
if n is even
|
||||||
|
|
||||||
|
r(n) = the sum from i = 1 to i = n of
|
||||||
|
|
||||||
|
(-1)**(i-1)*r(i)
|
||||||
|
|
||||||
|
***** note
|
||||||
|
this transform is unnormalized since a call of rfftf
|
||||||
|
followed by a call of rfftb will multiply the input
|
||||||
|
sequence by n.
|
||||||
|
|
||||||
|
wsave contains results which must not be destroyed between
|
||||||
|
calls of rfftf or rfftb.
|
||||||
|
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine rfftb(n,r,wsave)
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine rfftb computes the real periodic sequence from its
|
||||||
|
fourier coefficients (fourier synthesis). the transform is defined
|
||||||
|
below at output parameter r.
|
||||||
|
|
||||||
|
input parameters
|
||||||
|
|
||||||
|
n the length of the array r to be transformed. the method
|
||||||
|
is most efficient when n is a product of small primes.
|
||||||
|
n may change so long as different work arrays are provided
|
||||||
|
|
||||||
|
r a real array of length n which contains the sequence
|
||||||
|
to be transformed
|
||||||
|
|
||||||
|
wsave a work array which must be dimensioned at least 2*n+15
|
||||||
|
in the program that calls rfftb. the wsave array must be
|
||||||
|
initialized by calling subroutine rffti(n,wsave) and a
|
||||||
|
different wsave array must be used for each different
|
||||||
|
value of n. this initialization does not have to be
|
||||||
|
repeated so long as n remains unchanged thus subsequent
|
||||||
|
transforms can be obtained faster than the first.
|
||||||
|
the same wsave array can be used by rfftf and rfftb.
|
||||||
|
|
||||||
|
|
||||||
|
output parameters
|
||||||
|
|
||||||
|
r for n even and for i = 1,...,n
|
||||||
|
|
||||||
|
r(i) = r(1)+(-1)**(i-1)*r(n)
|
||||||
|
|
||||||
|
plus the sum from k=2 to k=n/2 of
|
||||||
|
|
||||||
|
2.*r(2*k-2)*cos((k-1)*(i-1)*2*pi/n)
|
||||||
|
|
||||||
|
-2.*r(2*k-1)*sin((k-1)*(i-1)*2*pi/n)
|
||||||
|
|
||||||
|
for n odd and for i = 1,...,n
|
||||||
|
|
||||||
|
r(i) = r(1) plus the sum from k=2 to k=(n+1)/2 of
|
||||||
|
|
||||||
|
2.*r(2*k-2)*cos((k-1)*(i-1)*2*pi/n)
|
||||||
|
|
||||||
|
-2.*r(2*k-1)*sin((k-1)*(i-1)*2*pi/n)
|
||||||
|
|
||||||
|
***** note
|
||||||
|
this transform is unnormalized since a call of rfftf
|
||||||
|
followed by a call of rfftb will multiply the input
|
||||||
|
sequence by n.
|
||||||
|
|
||||||
|
wsave contains results which must not be destroyed between
|
||||||
|
calls of rfftb or rfftf.
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine sinti(n,wsave)
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine sinti initializes the array wsave which is used in
|
||||||
|
subroutine sint. the prime factorization of n together with
|
||||||
|
a tabulation of the trigonometric functions are computed and
|
||||||
|
stored in wsave.
|
||||||
|
|
||||||
|
input parameter
|
||||||
|
|
||||||
|
n the length of the sequence to be transformed. the method
|
||||||
|
is most efficient when n+1 is a product of small primes.
|
||||||
|
|
||||||
|
output parameter
|
||||||
|
|
||||||
|
wsave a work array with at least int(2.5*n+15) locations.
|
||||||
|
different wsave arrays are required for different values
|
||||||
|
of n. the contents of wsave must not be changed between
|
||||||
|
calls of sint.
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine sint(n,x,wsave)
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine sint computes the discrete fourier sine transform
|
||||||
|
of an odd sequence x(i). the transform is defined below at
|
||||||
|
output parameter x.
|
||||||
|
|
||||||
|
sint is the unnormalized inverse of itself since a call of sint
|
||||||
|
followed by another call of sint will multiply the input sequence
|
||||||
|
x by 2*(n+1).
|
||||||
|
|
||||||
|
the array wsave which is used by subroutine sint must be
|
||||||
|
initialized by calling subroutine sinti(n,wsave).
|
||||||
|
|
||||||
|
input parameters
|
||||||
|
|
||||||
|
n the length of the sequence to be transformed. the method
|
||||||
|
is most efficient when n+1 is the product of small primes.
|
||||||
|
|
||||||
|
x an array which contains the sequence to be transformed
|
||||||
|
|
||||||
|
|
||||||
|
wsave a work array with dimension at least int(2.5*n+15)
|
||||||
|
in the program that calls sint. the wsave array must be
|
||||||
|
initialized by calling subroutine sinti(n,wsave) and a
|
||||||
|
different wsave array must be used for each different
|
||||||
|
value of n. this initialization does not have to be
|
||||||
|
repeated so long as n remains unchanged thus subsequent
|
||||||
|
transforms can be obtained faster than the first.
|
||||||
|
|
||||||
|
output parameters
|
||||||
|
|
||||||
|
x for i=1,...,n
|
||||||
|
|
||||||
|
x(i)= the sum from k=1 to k=n
|
||||||
|
|
||||||
|
2*x(k)*sin(k*i*pi/(n+1))
|
||||||
|
|
||||||
|
a call of sint followed by another call of
|
||||||
|
sint will multiply the sequence x by 2*(n+1).
|
||||||
|
hence sint is the unnormalized inverse
|
||||||
|
of itself.
|
||||||
|
|
||||||
|
wsave contains initialization calculations which must not be
|
||||||
|
destroyed between calls of sint.
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine costi(n,wsave)
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine costi initializes the array wsave which is used in
|
||||||
|
subroutine cost. the prime factorization of n together with
|
||||||
|
a tabulation of the trigonometric functions are computed and
|
||||||
|
stored in wsave.
|
||||||
|
|
||||||
|
input parameter
|
||||||
|
|
||||||
|
n the length of the sequence to be transformed. the method
|
||||||
|
is most efficient when n-1 is a product of small primes.
|
||||||
|
|
||||||
|
output parameter
|
||||||
|
|
||||||
|
wsave a work array which must be dimensioned at least 3*n+15.
|
||||||
|
different wsave arrays are required for different values
|
||||||
|
of n. the contents of wsave must not be changed between
|
||||||
|
calls of cost.
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine cost(n,x,wsave)
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine cost computes the discrete fourier cosine transform
|
||||||
|
of an even sequence x(i). the transform is defined below at output
|
||||||
|
parameter x.
|
||||||
|
|
||||||
|
cost is the unnormalized inverse of itself since a call of cost
|
||||||
|
followed by another call of cost will multiply the input sequence
|
||||||
|
x by 2*(n-1). the transform is defined below at output parameter x
|
||||||
|
|
||||||
|
the array wsave which is used by subroutine cost must be
|
||||||
|
initialized by calling subroutine costi(n,wsave).
|
||||||
|
|
||||||
|
input parameters
|
||||||
|
|
||||||
|
n the length of the sequence x. n must be greater than 1.
|
||||||
|
the method is most efficient when n-1 is a product of
|
||||||
|
small primes.
|
||||||
|
|
||||||
|
x an array which contains the sequence to be transformed
|
||||||
|
|
||||||
|
wsave a work array which must be dimensioned at least 3*n+15
|
||||||
|
in the program that calls cost. the wsave array must be
|
||||||
|
initialized by calling subroutine costi(n,wsave) and a
|
||||||
|
different wsave array must be used for each different
|
||||||
|
value of n. this initialization does not have to be
|
||||||
|
repeated so long as n remains unchanged thus subsequent
|
||||||
|
transforms can be obtained faster than the first.
|
||||||
|
|
||||||
|
output parameters
|
||||||
|
|
||||||
|
x for i=1,...,n
|
||||||
|
|
||||||
|
x(i) = x(1)+(-1)**(i-1)*x(n)
|
||||||
|
|
||||||
|
+ the sum from k=2 to k=n-1
|
||||||
|
|
||||||
|
2*x(k)*cos((k-1)*(i-1)*pi/(n-1))
|
||||||
|
|
||||||
|
a call of cost followed by another call of
|
||||||
|
cost will multiply the sequence x by 2*(n-1)
|
||||||
|
hence cost is the unnormalized inverse
|
||||||
|
of itself.
|
||||||
|
|
||||||
|
wsave contains initialization calculations which must not be
|
||||||
|
destroyed between calls of cost.
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine sinqi(n,wsave)
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine sinqi initializes the array wsave which is used in
|
||||||
|
both sinqf and sinqb. the prime factorization of n together with
|
||||||
|
a tabulation of the trigonometric functions are computed and
|
||||||
|
stored in wsave.
|
||||||
|
|
||||||
|
input parameter
|
||||||
|
|
||||||
|
n the length of the sequence to be transformed. the method
|
||||||
|
is most efficient when n is a product of small primes.
|
||||||
|
|
||||||
|
output parameter
|
||||||
|
|
||||||
|
wsave a work array which must be dimensioned at least 3*n+15.
|
||||||
|
the same work array can be used for both sinqf and sinqb
|
||||||
|
as long as n remains unchanged. different wsave arrays
|
||||||
|
are required for different values of n. the contents of
|
||||||
|
wsave must not be changed between calls of sinqf or sinqb.
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine sinqf(n,x,wsave)
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine sinqf computes the fast fourier transform of quarter
|
||||||
|
wave data. that is , sinqf computes the coefficients in a sine
|
||||||
|
series representation with only odd wave numbers. the transform
|
||||||
|
is defined below at output parameter x.
|
||||||
|
|
||||||
|
sinqb is the unnormalized inverse of sinqf since a call of sinqf
|
||||||
|
followed by a call of sinqb will multiply the input sequence x
|
||||||
|
by 4*n.
|
||||||
|
|
||||||
|
the array wsave which is used by subroutine sinqf must be
|
||||||
|
initialized by calling subroutine sinqi(n,wsave).
|
||||||
|
|
||||||
|
|
||||||
|
input parameters
|
||||||
|
|
||||||
|
n the length of the array x to be transformed. the method
|
||||||
|
is most efficient when n is a product of small primes.
|
||||||
|
|
||||||
|
x an array which contains the sequence to be transformed
|
||||||
|
|
||||||
|
wsave a work array which must be dimensioned at least 3*n+15.
|
||||||
|
in the program that calls sinqf. the wsave array must be
|
||||||
|
initialized by calling subroutine sinqi(n,wsave) and a
|
||||||
|
different wsave array must be used for each different
|
||||||
|
value of n. this initialization does not have to be
|
||||||
|
repeated so long as n remains unchanged thus subsequent
|
||||||
|
transforms can be obtained faster than the first.
|
||||||
|
|
||||||
|
output parameters
|
||||||
|
|
||||||
|
x for i=1,...,n
|
||||||
|
|
||||||
|
x(i) = (-1)**(i-1)*x(n)
|
||||||
|
|
||||||
|
+ the sum from k=1 to k=n-1 of
|
||||||
|
|
||||||
|
2*x(k)*sin((2*i-1)*k*pi/(2*n))
|
||||||
|
|
||||||
|
a call of sinqf followed by a call of
|
||||||
|
sinqb will multiply the sequence x by 4*n.
|
||||||
|
therefore sinqb is the unnormalized inverse
|
||||||
|
of sinqf.
|
||||||
|
|
||||||
|
wsave contains initialization calculations which must not
|
||||||
|
be destroyed between calls of sinqf or sinqb.
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine sinqb(n,x,wsave)
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine sinqb computes the fast fourier transform of quarter
|
||||||
|
wave data. that is , sinqb computes a sequence from its
|
||||||
|
representation in terms of a sine series with odd wave numbers.
|
||||||
|
the transform is defined below at output parameter x.
|
||||||
|
|
||||||
|
sinqf is the unnormalized inverse of sinqb since a call of sinqb
|
||||||
|
followed by a call of sinqf will multiply the input sequence x
|
||||||
|
by 4*n.
|
||||||
|
|
||||||
|
the array wsave which is used by subroutine sinqb must be
|
||||||
|
initialized by calling subroutine sinqi(n,wsave).
|
||||||
|
|
||||||
|
|
||||||
|
input parameters
|
||||||
|
|
||||||
|
n the length of the array x to be transformed. the method
|
||||||
|
is most efficient when n is a product of small primes.
|
||||||
|
|
||||||
|
x an array which contains the sequence to be transformed
|
||||||
|
|
||||||
|
wsave a work array which must be dimensioned at least 3*n+15.
|
||||||
|
in the program that calls sinqb. the wsave array must be
|
||||||
|
initialized by calling subroutine sinqi(n,wsave) and a
|
||||||
|
different wsave array must be used for each different
|
||||||
|
value of n. this initialization does not have to be
|
||||||
|
repeated so long as n remains unchanged thus subsequent
|
||||||
|
transforms can be obtained faster than the first.
|
||||||
|
|
||||||
|
output parameters
|
||||||
|
|
||||||
|
x for i=1,...,n
|
||||||
|
|
||||||
|
x(i)= the sum from k=1 to k=n of
|
||||||
|
|
||||||
|
4*x(k)*sin((2*k-1)*i*pi/(2*n))
|
||||||
|
|
||||||
|
a call of sinqb followed by a call of
|
||||||
|
sinqf will multiply the sequence x by 4*n.
|
||||||
|
therefore sinqf is the unnormalized inverse
|
||||||
|
of sinqb.
|
||||||
|
|
||||||
|
wsave contains initialization calculations which must not
|
||||||
|
be destroyed between calls of sinqb or sinqf.
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine cosqi(n,wsave)
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine cosqi initializes the array wsave which is used in
|
||||||
|
both cosqf and cosqb. the prime factorization of n together with
|
||||||
|
a tabulation of the trigonometric functions are computed and
|
||||||
|
stored in wsave.
|
||||||
|
|
||||||
|
input parameter
|
||||||
|
|
||||||
|
n the length of the array to be transformed. the method
|
||||||
|
is most efficient when n is a product of small primes.
|
||||||
|
|
||||||
|
output parameter
|
||||||
|
|
||||||
|
wsave a work array which must be dimensioned at least 3*n+15.
|
||||||
|
the same work array can be used for both cosqf and cosqb
|
||||||
|
as long as n remains unchanged. different wsave arrays
|
||||||
|
are required for different values of n. the contents of
|
||||||
|
wsave must not be changed between calls of cosqf or cosqb.
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine cosqf(n,x,wsave)
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine cosqf computes the fast fourier transform of quarter
|
||||||
|
wave data. that is , cosqf computes the coefficients in a cosine
|
||||||
|
series representation with only odd wave numbers. the transform
|
||||||
|
is defined below at output parameter x.
|
||||||
|
|
||||||
|
cosqf is the unnormalized inverse of cosqb since a call of cosqf
|
||||||
|
followed by a call of cosqb will multiply the input sequence x
|
||||||
|
by 4*n.
|
||||||
|
|
||||||
|
the array wsave which is used by subroutine cosqf must be
|
||||||
|
initialized by calling subroutine cosqi(n,wsave).
|
||||||
|
|
||||||
|
|
||||||
|
input parameters
|
||||||
|
|
||||||
|
n the length of the array x to be transformed. the method
|
||||||
|
is most efficient when n is a product of small primes.
|
||||||
|
|
||||||
|
x an array which contains the sequence to be transformed
|
||||||
|
|
||||||
|
wsave a work array which must be dimensioned at least 3*n+15
|
||||||
|
in the program that calls cosqf. the wsave array must be
|
||||||
|
initialized by calling subroutine cosqi(n,wsave) and a
|
||||||
|
different wsave array must be used for each different
|
||||||
|
value of n. this initialization does not have to be
|
||||||
|
repeated so long as n remains unchanged thus subsequent
|
||||||
|
transforms can be obtained faster than the first.
|
||||||
|
|
||||||
|
output parameters
|
||||||
|
|
||||||
|
x for i=1,...,n
|
||||||
|
|
||||||
|
x(i) = x(1) plus the sum from k=2 to k=n of
|
||||||
|
|
||||||
|
2*x(k)*cos((2*i-1)*(k-1)*pi/(2*n))
|
||||||
|
|
||||||
|
a call of cosqf followed by a call of
|
||||||
|
cosqb will multiply the sequence x by 4*n.
|
||||||
|
therefore cosqb is the unnormalized inverse
|
||||||
|
of cosqf.
|
||||||
|
|
||||||
|
wsave contains initialization calculations which must not
|
||||||
|
be destroyed between calls of cosqf or cosqb.
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine cosqb(n,x,wsave)
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine cosqb computes the fast fourier transform of quarter
|
||||||
|
wave data. that is , cosqb computes a sequence from its
|
||||||
|
representation in terms of a cosine series with odd wave numbers.
|
||||||
|
the transform is defined below at output parameter x.
|
||||||
|
|
||||||
|
cosqb is the unnormalized inverse of cosqf since a call of cosqb
|
||||||
|
followed by a call of cosqf will multiply the input sequence x
|
||||||
|
by 4*n.
|
||||||
|
|
||||||
|
the array wsave which is used by subroutine cosqb must be
|
||||||
|
initialized by calling subroutine cosqi(n,wsave).
|
||||||
|
|
||||||
|
|
||||||
|
input parameters
|
||||||
|
|
||||||
|
n the length of the array x to be transformed. the method
|
||||||
|
is most efficient when n is a product of small primes.
|
||||||
|
|
||||||
|
x an array which contains the sequence to be transformed
|
||||||
|
|
||||||
|
wsave a work array that must be dimensioned at least 3*n+15
|
||||||
|
in the program that calls cosqb. the wsave array must be
|
||||||
|
initialized by calling subroutine cosqi(n,wsave) and a
|
||||||
|
different wsave array must be used for each different
|
||||||
|
value of n. this initialization does not have to be
|
||||||
|
repeated so long as n remains unchanged thus subsequent
|
||||||
|
transforms can be obtained faster than the first.
|
||||||
|
|
||||||
|
output parameters
|
||||||
|
|
||||||
|
x for i=1,...,n
|
||||||
|
|
||||||
|
x(i)= the sum from k=1 to k=n of
|
||||||
|
|
||||||
|
4*x(k)*cos((2*k-1)*(i-1)*pi/(2*n))
|
||||||
|
|
||||||
|
a call of cosqb followed by a call of
|
||||||
|
cosqf will multiply the sequence x by 4*n.
|
||||||
|
therefore cosqf is the unnormalized inverse
|
||||||
|
of cosqb.
|
||||||
|
|
||||||
|
wsave contains initialization calculations which must not
|
||||||
|
be destroyed between calls of cosqb or cosqf.
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine cffti(n,wsave)
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine cffti initializes the array wsave which is used in
|
||||||
|
both cfftf and cfftb. the prime factorization of n together with
|
||||||
|
a tabulation of the trigonometric functions are computed and
|
||||||
|
stored in wsave.
|
||||||
|
|
||||||
|
input parameter
|
||||||
|
|
||||||
|
n the length of the sequence to be transformed
|
||||||
|
|
||||||
|
output parameter
|
||||||
|
|
||||||
|
wsave a work array which must be dimensioned at least 4*n+15
|
||||||
|
the same work array can be used for both cfftf and cfftb
|
||||||
|
as long as n remains unchanged. different wsave arrays
|
||||||
|
are required for different values of n. the contents of
|
||||||
|
wsave must not be changed between calls of cfftf or cfftb.
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine cfftf(n,c,wsave)
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine cfftf computes the forward complex discrete fourier
|
||||||
|
transform (the fourier analysis). equivalently , cfftf computes
|
||||||
|
the fourier coefficients of a complex periodic sequence.
|
||||||
|
the transform is defined below at output parameter c.
|
||||||
|
|
||||||
|
the transform is not normalized. to obtain a normalized transform
|
||||||
|
the output must be divided by n. otherwise a call of cfftf
|
||||||
|
followed by a call of cfftb will multiply the sequence by n.
|
||||||
|
|
||||||
|
the array wsave which is used by subroutine cfftf must be
|
||||||
|
initialized by calling subroutine cffti(n,wsave).
|
||||||
|
|
||||||
|
input parameters
|
||||||
|
|
||||||
|
|
||||||
|
n the length of the complex sequence c. the method is
|
||||||
|
more efficient when n is the product of small primes.
|
||||||
|
|
||||||
|
c a complex array of length n which contains the sequence
|
||||||
|
|
||||||
|
wsave a real work array which must be dimensioned at least 4*n+15
|
||||||
|
in the program that calls cfftf. the wsave array must be
|
||||||
|
initialized by calling subroutine cffti(n,wsave) and a
|
||||||
|
different wsave array must be used for each different
|
||||||
|
value of n. this initialization does not have to be
|
||||||
|
repeated so long as n remains unchanged thus subsequent
|
||||||
|
transforms can be obtained faster than the first.
|
||||||
|
the same wsave array can be used by cfftf and cfftb.
|
||||||
|
|
||||||
|
output parameters
|
||||||
|
|
||||||
|
c for j=1,...,n
|
||||||
|
|
||||||
|
c(j)=the sum from k=1,...,n of
|
||||||
|
|
||||||
|
c(k)*exp(-i*(j-1)*(k-1)*2*pi/n)
|
||||||
|
|
||||||
|
where i=sqrt(-1)
|
||||||
|
|
||||||
|
wsave contains initialization calculations which must not be
|
||||||
|
destroyed between calls of subroutine cfftf or cfftb
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine cfftb(n,c,wsave)
|
||||||
|
|
||||||
|
******************************************************************
|
||||||
|
|
||||||
|
subroutine cfftb computes the backward complex discrete fourier
|
||||||
|
transform (the fourier synthesis). equivalently , cfftb computes
|
||||||
|
a complex periodic sequence from its fourier coefficients.
|
||||||
|
the transform is defined below at output parameter c.
|
||||||
|
|
||||||
|
a call of cfftf followed by a call of cfftb will multiply the
|
||||||
|
sequence by n.
|
||||||
|
|
||||||
|
the array wsave which is used by subroutine cfftb must be
|
||||||
|
initialized by calling subroutine cffti(n,wsave).
|
||||||
|
|
||||||
|
input parameters
|
||||||
|
|
||||||
|
|
||||||
|
n the length of the complex sequence c. the method is
|
||||||
|
more efficient when n is the product of small primes.
|
||||||
|
|
||||||
|
c a complex array of length n which contains the sequence
|
||||||
|
|
||||||
|
wsave a real work array which must be dimensioned at least 4*n+15
|
||||||
|
in the program that calls cfftb. the wsave array must be
|
||||||
|
initialized by calling subroutine cffti(n,wsave) and a
|
||||||
|
different wsave array must be used for each different
|
||||||
|
value of n. this initialization does not have to be
|
||||||
|
repeated so long as n remains unchanged thus subsequent
|
||||||
|
transforms can be obtained faster than the first.
|
||||||
|
the same wsave array can be used by cfftf and cfftb.
|
||||||
|
|
||||||
|
output parameters
|
||||||
|
|
||||||
|
c for j=1,...,n
|
||||||
|
|
||||||
|
c(j)=the sum from k=1,...,n of
|
||||||
|
|
||||||
|
c(k)*exp(i*(j-1)*(k-1)*2*pi/n)
|
||||||
|
|
||||||
|
where i=sqrt(-1)
|
||||||
|
|
||||||
|
wsave contains initialization calculations which must not be
|
||||||
|
destroyed between calls of subroutine cfftf or cfftb
|
||||||
|
|
||||||
|
*/
|
||||||
20
pffft/fmv.h
Normal file
20
pffft/fmv.h
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
/*
 * fmv.h - function multi-versioning (FMV) helper.
 *
 * Provides the PF_TARGET_CLONES macro, which is placed in front of a function
 * definition. When the build defines HAVE_FUNC_ATTRIBUTE_IFUNC to a non-zero
 * value, the compiler supports __has_attribute(target_clones) and the target
 * is x86-64, the macro expands to a target_clones attribute listing several
 * x86 SIMD levels plus "default"; HAVE_PF_TARGET_CLONES is then defined to 1.
 * In every other configuration PF_TARGET_CLONES expands to nothing, so it is
 * always safe to use.
 */
#ifndef FMV_H
#define FMV_H  /* complete the include guard: without this the #ifndef above never becomes false */

/* 'defined(...) &&' keeps the test quiet under -Wundef when the macro is absent */
#if defined(HAVE_FUNC_ATTRIBUTE_IFUNC) && HAVE_FUNC_ATTRIBUTE_IFUNC
/* __has_attribute must be probed in a separate, nested #if: compilers that do
 * not provide it cannot parse '__has_attribute(...)' inside an expression. */
#if defined(__has_attribute)
#if __has_attribute(target_clones)
#if defined(__x86_64)

/* see https://gcc.gnu.org/wiki/FunctionMultiVersioning */
#define PF_TARGET_CLONES __attribute__((target_clones("avx","sse4.2","sse3","sse2","sse","default")))
#define HAVE_PF_TARGET_CLONES 1
#endif
#endif
#endif
#endif

/* fallback: no multi-versioning available, the macro expands to nothing */
#ifndef PF_TARGET_CLONES
#define PF_TARGET_CLONES
#endif

#endif /* FMV_H */
|
||||||
25
pffft/mingw-w32-i686.cmake
Normal file
25
pffft/mingw-w32-i686.cmake
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
# Sample toolchain file for building for Windows from an Ubuntu Linux system.
#
# Typical usage:
#    *) install cross compiler: `sudo apt-get install mingw-w64`
#    *) cd build
#    *) cmake -DCMAKE_TOOLCHAIN_FILE=~/mingw-w32-i686.cmake ..
#
# build for Windows' 32 bit architecture

set(CMAKE_SYSTEM_NAME Windows)
# This toolchain targets 32-bit x86; the processor must match the
# i686-w64-mingw32 compilers selected below (it was wrongly x86_64 before),
# otherwise project logic keyed on CMAKE_SYSTEM_PROCESSOR picks 64-bit settings.
set(CMAKE_SYSTEM_PROCESSOR i686)
set(TOOLCHAIN_PREFIX i686-w64-mingw32)

# cross compilers to use for C, C++ and Windows resource (.rc) files
set(CMAKE_C_COMPILER   ${TOOLCHAIN_PREFIX}-gcc)
set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}-g++)
set(CMAKE_RC_COMPILER  ${TOOLCHAIN_PREFIX}-windres)

# target environment on the build host system
set(CMAKE_FIND_ROOT_PATH /usr/${TOOLCHAIN_PREFIX})

# modify default behavior of FIND_XXX() commands:
# programs are looked up on the build host only,
# libraries and headers only inside the target environment
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
|
||||||
25
pffft/mingw-w64-x64_64.cmake
Normal file
25
pffft/mingw-w64-x64_64.cmake
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
# Cross-compilation toolchain: build 64-bit Windows binaries on an
# Ubuntu Linux host using the MinGW-w64 compilers.
#
# How to use it:
#    1) install the cross compiler:  `sudo apt-get install mingw-w64`
#    2) change into your build directory
#    3) cmake -DCMAKE_TOOLCHAIN_FILE=<path-to>/mingw-w64-x64_64.cmake ..

# --- target platform: 64-bit Windows ---
set(CMAKE_SYSTEM_NAME      "Windows")
set(CMAKE_SYSTEM_PROCESSOR "x86_64")

# common prefix of all MinGW-w64 tool names for this target
set(TOOLCHAIN_PREFIX "x86_64-w64-mingw32")

# --- cross tools: C compiler, C++ compiler, Windows resource compiler ---
set(CMAKE_C_COMPILER   "${TOOLCHAIN_PREFIX}-gcc")
set(CMAKE_CXX_COMPILER "${TOOLCHAIN_PREFIX}-g++")
set(CMAKE_RC_COMPILER  "${TOOLCHAIN_PREFIX}-windres")

# --- location of the target environment on the build host ---
set(CMAKE_FIND_ROOT_PATH "/usr/${TOOLCHAIN_PREFIX}")

# --- search behavior of the find_*() commands ---
# headers and libraries: only inside the target environment;
# programs: never inside it (host tools are used)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
|
||||||
97
pffft/papi_perf_counter.h
Normal file
97
pffft/papi_perf_counter.h
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
/* for measurement of CPU cycles ..
|
||||||
|
*
|
||||||
|
* requires
|
||||||
|
* sudo apt-get install libpapi-dev papi-tools
|
||||||
|
* on debian/ubuntu linux distributions
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef HAVE_PAPI
|
||||||
|
#include <papi.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
|
||||||
|
/* Small helper that wraps PAPI_ipc() to measure the real time, process time,
 * instruction count and instructions-per-cycle between start() and finish().
 *
 * Without HAVE_PAPI the class still compiles, but start() always fails
 * (after reporting once on stderr), so nothing is ever measured or printed.
 */
struct papi_perf_counter
{
    /* Creates an idle counter: nothing started, nothing printed on destruction. */
    papi_perf_counter()
        : realTime(0.0F), processTime(0.0F), instructions(0LL), ipc(0.0F)
        , started(false), finished(false), print_at_destruction(false)
    { }

    /* Creates a counter and starts it immediately.
     *   _start                 only selects this overload; its value is ignored
     *   print_at_destruction_  when true, the destructor prints to stderr
     *
     * All members are zero-initialized before start() runs, so the measurement
     * fields hold defined values even when start() fails (PAPI error or build
     * without HAVE_PAPI).
     */
    papi_perf_counter(int _start, bool print_at_destruction_ = true)
        : realTime(0.0F), processTime(0.0F), instructions(0LL), ipc(0.0F)
        , started(false), finished(false), print_at_destruction(print_at_destruction_)
    {
        (void)_start;
        start();
    }

    /* Prints the measurement to stderr when requested at construction. */
    ~papi_perf_counter()
    {
        if (print_at_destruction)
            print(stderr);
    }

    /* Takes the starting snapshot.
     * Returns true when the snapshot was taken; false on a PAPI error or when
     * built without HAVE_PAPI. The failure reason is written to stderr only
     * once per process (function-local static flag).
     */
    bool start()
    {
        static bool reported_start_error = false;
#ifdef HAVE_PAPI
        int ret = PAPI_ipc(&realTime, &processTime, &instructions, &ipc);
        if (ret && !reported_start_error)
        {
            reported_start_error = true;
            fprintf(stderr, "papi_perf_counter::start(): PAPI_ipc() returned error %d\n", ret);
        }
#else
        if (!reported_start_error)
        {
            reported_start_error = true;
            fprintf(stderr, "papi_perf_counter::start(): no HAVE_PAPI\n");
        }
        int ret = 1;    /* non-zero: treated like a PAPI error below */
#endif
        started = (!ret);
        finished = false;
        return started;
    }

    /* Takes the ending snapshot and converts the stored start values into
     * differences (ipc is taken as reported by the ending snapshot).
     * Returns true only when this counter was started, not yet finished and
     * the ending snapshot succeeded.
     */
    bool finish()
    {
        /* a second, silent counter provides the ending snapshot */
        papi_perf_counter end(1, false);
        if (started && !finished && end.started)
        {
            realTime = end.realTime - realTime;
            processTime = end.processTime - processTime;
            instructions = end.instructions - instructions;
            ipc = end.ipc;
            finished = true;
            return true;
        }
        return false;
    }

    /* Prints the measurement to f, finishing the measurement first if needed.
     * Does nothing unless the counter was started and could be finished.
     * Clears 'started' afterwards, so a later print() - including the one in
     * the destructor - produces no second report.
     */
    void print(FILE *f = stdout)
    {
        if (started && !finished)
            finish();
        if (!started || !finished)
            return;
        /* NOTE(review): not guarded against ipc == 0 - the division is done in
         * floating point, so the printed cycle count is then inf or nan */
        double cycles = instructions / ipc;
        fprintf(f, "real %g, process %g, instructions %lld, ins/cycle %f => cycles %g\n"
                , realTime, processTime, instructions, ipc, cycles
                );
        started = false;
    }

    float realTime;             /* value from PAPI_ipc(); difference after finish() */
    float processTime;          /* value from PAPI_ipc(); difference after finish() */
    long long instructions;     /* value from PAPI_ipc(); difference after finish() */
    float ipc;                  /* instructions per cycle as reported by PAPI_ipc() */
    bool started;               /* start() succeeded and print() has not run yet */
    bool finished;              /* finish() succeeded since the last start() */
    bool print_at_destruction;  /* destructor calls print(stderr) */
};
|
||||||
|
|
||||||
298
pffft/pf_carrier.cpp
Normal file
298
pffft/pf_carrier.cpp
Normal file
@@ -0,0 +1,298 @@
|
|||||||
|
/*
|
||||||
|
This software is part of pffft/pfdsp, a set of simple DSP routines.
|
||||||
|
|
||||||
|
Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
|
||||||
|
Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
|
||||||
|
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
* Neither the name of the copyright holder nor the
|
||||||
|
names of its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
|
||||||
|
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* include own header first, to see missing includes */
|
||||||
|
#include "pf_carrier.h"
|
||||||
|
#include "fmv.h"
|
||||||
|
|
||||||
|
#include <limits.h>
|
||||||
|
#include <assert.h>
|
||||||
|
|
||||||
|
|
||||||
|
/* Fills 'output' with 'size' identical complex samples (interleaved
 * real/imaginary floats): real part 127/128, imaginary part 0.
 * 'output' must provide room for 2*size floats. Nothing is written for
 * size <= 0.
 */
PF_TARGET_CLONES
void generate_dc_f(float* output, int size)
{
    const float amplitude = 127.0F / 128.0F;
    for (int n = 0; n < size; ++n)
    {
        output[2 * n]     = amplitude;  /* real part */
        output[2 * n + 1] = 0.0F;       /* imaginary part */
    }
}
|
||||||
|
|
||||||
|
/* Fills 'output' with 'size' identical complex samples (interleaved
 * real/imaginary shorts): real part SHRT_MAX, imaginary part 0.
 * 'output' must provide room for 2*size shorts. Nothing is written for
 * size <= 0.
 */
PF_TARGET_CLONES
void generate_dc_s16(short* output, int size)
{
    for (int n = 0; n < size; ++n)
    {
        short* const sample = output + 2 * n;
        sample[0] = SHRT_MAX;   /* real part */
        sample[1] = 0;          /* imaginary part */
    }
}
|
||||||
|
|
||||||
|
/* Writes a complex carrier that advances by +pi/2 per sample (interleaved
 * real/imaginary floats, amplitude 127/128): the 4-sample cycle
 *   (+a, 0)  (0, +a)  (-a, 0)  (0, -a)
 * is repeated to fill 'output'.
 *
 * 'size' is the number of complex samples and must be a multiple of 4
 * (checked by assert). Whole cycles of 8 floats are always written, so with
 * asserts disabled a non-multiple size writes beyond 2*size floats.
 */
PF_TARGET_CLONES
void generate_pos_fs4_f(float* output, int size)
{
    static const float cycle[8] = {
         (127.0F / 128.0F),  0.0F,                 /* exp(i*  0   ) =  1 + i* 0 */
         0.0F,               (127.0F / 128.0F),    /* exp(i* +pi/2) =  0 + i* 1 */
         (-127.0F / 128.0F), 0.0F,                 /* exp(i* +pi  ) = -1 + i* 0 */
         0.0F,               (-127.0F / 128.0F)    /* exp(i* -pi/2) =  0 + i*-1 */
    };

    assert((size & 3) == 0);
    for (int base = 0; base < 2 * size; base += 8)
    {
        for (int k = 0; k < 8; ++k)
            output[base + k] = cycle[k];
    }
}
|
||||||
|
|
||||||
|
/* Writes a complex carrier that advances by +pi/2 per sample (interleaved
 * real/imaginary shorts, amplitude SHRT_MAX): the 4-sample cycle
 *   (+M, 0)  (0, +M)  (-M, 0)  (0, -M)
 * is repeated to fill 'output'.
 *
 * 'size' is the number of complex samples and must be a multiple of 4
 * (checked by assert). Whole cycles of 8 shorts are always written, so with
 * asserts disabled a non-multiple size writes beyond 2*size shorts.
 */
PF_TARGET_CLONES
void generate_pos_fs4_s16(short* output, int size)
{
    static const short cycle[8] = {
         SHRT_MAX,  0,          /* exp(i*  0   ) =  1 + i* 0 */
         0,         SHRT_MAX,   /* exp(i* +pi/2) =  0 + i* 1 */
        -SHRT_MAX,  0,          /* exp(i* +pi  ) = -1 + i* 0 */
         0,        -SHRT_MAX    /* exp(i* -pi/2) =  0 + i*-1 */
    };

    assert((size & 3) == 0);
    for (int base = 0; base < 2 * size; base += 8)
    {
        for (int k = 0; k < 8; ++k)
            output[base + k] = cycle[k];
    }
}
|
||||||
|
|
||||||
|
/* Writes a complex carrier that advances by -pi/2 per sample (interleaved
 * real/imaginary floats, amplitude 127/128): the 4-sample cycle
 *   (+a, 0)  (0, -a)  (-a, 0)  (0, +a)
 * is repeated to fill 'output'.
 *
 * 'size' is the number of complex samples and must be a multiple of 4
 * (checked by assert). Whole cycles of 8 floats are always written, so with
 * asserts disabled a non-multiple size writes beyond 2*size floats.
 */
PF_TARGET_CLONES
void generate_neg_fs4_f(float* output, int size)
{
    static const float cycle[8] = {
         (127.0F / 128.0F),  0.0F,                 /* exp(i*  0   ) =  1 + i* 0 */
         0.0F,               (-127.0F / 128.0F),   /* exp(i* -pi/2) =  0 + i*-1 */
         (-127.0F / 128.0F), 0.0F,                 /* exp(i* +pi  ) = -1 + i* 0 */
         0.0F,               (127.0F / 128.0F)     /* exp(i* +pi/2) =  0 + i* 1 */
    };

    assert((size & 3) == 0);
    int base = 0;
    while (base < 2 * size)
    {
        for (int k = 0; k < 8; ++k)
            output[base + k] = cycle[k];
        base += 8;
    }
}
|
||||||
|
|
||||||
|
/* Writes a complex carrier that advances by -pi/2 per sample (interleaved
 * real/imaginary shorts, amplitude SHRT_MAX): the 4-sample cycle
 *   (+M, 0)  (0, -M)  (-M, 0)  (0, +M)
 * is repeated to fill 'output'.
 *
 * 'size' is the number of complex samples and must be a multiple of 4
 * (checked by assert). Whole cycles of 8 shorts are always written, so with
 * asserts disabled a non-multiple size writes beyond 2*size shorts.
 */
PF_TARGET_CLONES
void generate_neg_fs4_s16(short* output, int size)
{
    static const short cycle[8] = {
         SHRT_MAX,  0,          /* exp(i*  0   ) =  1 + i* 0 */
         0,        -SHRT_MAX,   /* exp(i* -pi/2) =  0 + i*-1 */
        -SHRT_MAX,  0,          /* exp(i* +pi  ) = -1 + i* 0 */
         0,         SHRT_MAX    /* exp(i* +pi/2) =  0 + i* 1 */
    };

    assert((size & 3) == 0);
    int base = 0;
    while (base < 2 * size)
    {
        for (int k = 0; k < 8; ++k)
            output[base + k] = cycle[k];
        base += 8;
    }
}
|
||||||
|
|
||||||
|
/****************************************************/
|
||||||
|
|
||||||
|
/* Writes the sum of a constant (m, 0) and a complex carrier of amplitude m
 * that advances by +pi/2 per sample, with m = SHRT_MAX / 2 (interleaved
 * real/imaginary shorts). The repeated 4-sample cycle is
 *   (2m, 0)  (m, +m)  (0, 0)  (m, -m)
 *
 * 'size' is the number of complex samples and must be a multiple of 4
 * (checked by assert). Whole cycles of 8 shorts are always written, so with
 * asserts disabled a non-multiple size writes beyond 2*size shorts.
 */
PF_TARGET_CLONES
void generate_dc_pos_fs4_s16(short* output, int size)
{
    const int half = SHRT_MAX / 2;
    const short cycle[8] = {
        (short)(half + half), (short)0,         /* dc + exp(i*  0   ) = 1+1 + i* 0 */
        (short)(half + 0),    (short)half,      /* dc + exp(i* +pi/2) = 1+0 + i* 1 */
        (short)(half - half), (short)0,         /* dc + exp(i* +pi  ) = 1-1 + i* 0 */
        (short)half,          (short)(-half)    /* dc + exp(i* -pi/2) = 1+0 + i*-1 */
    };

    assert((size & 3) == 0);
    for (int base = 0; base < 2 * size; base += 8)
    {
        for (int k = 0; k < 8; ++k)
            output[base + k] = cycle[k];
    }
}
|
||||||
|
|
||||||
|
/* Writes the sum of a constant (m, 0) and a complex carrier of amplitude m
 * that advances by -pi/2 per sample, with m = SHRT_MAX / 2 (interleaved
 * real/imaginary shorts). The repeated 4-sample cycle is
 *   (2m, 0)  (m, -m)  (0, 0)  (m, +m)
 *
 * 'size' is the number of complex samples and must be a multiple of 4
 * (checked by assert). Whole cycles of 8 shorts are always written, so with
 * asserts disabled a non-multiple size writes beyond 2*size shorts.
 */
PF_TARGET_CLONES
void generate_dc_neg_fs4_s16(short* output, int size)
{
    const int half = SHRT_MAX / 2;
    const short cycle[8] = {
        (short)(half + half), (short)0,         /* dc + exp(i*  0   ) = 1+1 + i* 0 */
        (short)(half + 0),    (short)(-half),   /* dc + exp(i* -pi/2) = 1+0 + i*-1 */
        (short)(half - half), (short)0,         /* dc + exp(i* +pi  ) = 1-1 + i* 0 */
        (short)(half + 0),    (short)half       /* dc + exp(i* +pi/2) = 1+0 + i* 1 */
    };

    assert((size & 3) == 0);
    for (int base = 0; base < 2 * size; base += 8)
    {
        for (int k = 0; k < 8; ++k)
            output[base + k] = cycle[k];
    }
}
|
||||||
|
|
||||||
|
/* Writes a test signal with a 4-sample period (interleaved real/imaginary
 * shorts), using m = SHRT_MAX / 2. The repeated cycle is
 *   (+m, -m)  (-m, +m)  (-m, +m)  (+m, -m)
 * i.e. the complex constant (m, -m) multiplied by the real sequence
 * +1, -1, -1, +1. Being a real sequence of period 4 with zero mean and no
 * alternating component, that factor consists of a +fs/4 and a -fs/4
 * component only.
 * NOTE(review): these values are not the plain sum of the zero-phase
 * generate_pos_fs4_* and generate_neg_fs4_* cycles (which would be
 * 2, 0, -2, 0 on the real axis) - confirm the phase is intended.
 *
 * 'size' is the number of complex samples and must be a multiple of 4
 * (checked by assert). Whole cycles of 8 shorts are always written, so with
 * asserts disabled a non-multiple size writes beyond 2*size shorts.
 */
PF_TARGET_CLONES
void generate_pos_neg_fs4_s16(short* output, int size)
{
    const int half = SHRT_MAX / 2;
    const short cycle[8] = {
        (short)half,    (short)(-half),   /* sample 0 */
        (short)(-half), (short)half,      /* sample 1 */
        (short)(-half), (short)half,      /* sample 2 */
        (short)half,    (short)(-half)    /* sample 3 */
    };

    assert((size & 3) == 0);
    for (int base = 0; base < 2 * size; base += 8)
    {
        for (int k = 0; k < 8; ++k)
            output[base + k] = cycle[k];
    }
}
|
||||||
|
|
||||||
|
/* Writes a test signal with a 4-sample period (interleaved real/imaginary
 * shorts), using m = SHRT_MAX / 2. The repeated cycle is
 *   (2m, -m)  (0, +m)  (0, +m)  (2m, -m)
 * which equals the cycle (+m,-m) (-m,+m) (-m,+m) (+m,-m) with the constant
 * (m, 0) added to every sample.
 *
 * 'size' is the number of complex samples and must be a multiple of 4
 * (checked by assert). Whole cycles of 8 shorts are always written, so with
 * asserts disabled a non-multiple size writes beyond 2*size shorts.
 */
PF_TARGET_CLONES
void generate_dc_pos_neg_fs4_s16(short* output, int size)
{
    const int half = SHRT_MAX / 2;
    const short cycle[8] = {
        (short)(half + half), (short)(-half),   /* sample 0 */
        (short)0,             (short)half,      /* sample 1 */
        (short)0,             (short)half,      /* sample 2 */
        (short)(half + half), (short)(-half)    /* sample 3 */
    };

    assert((size & 3) == 0);
    for (int base = 0; base < 2 * size; base += 8)
    {
        for (int k = 0; k < 8; ++k)
            output[base + k] = cycle[k];
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Writes a signal that alternates sign every sample, i.e. a carrier at half
 * the sample rate (interleaved real/imaginary shorts), with m = SHRT_MAX / 2:
 *   (+m, 0)  (-m, 0)  (+m, 0)  (-m, 0)  ...
 * The imaginary part is always 0 and no constant offset is added.
 *
 * 'size' is the number of complex samples and must be a multiple of 4
 * (checked by assert). Groups of 4 complex samples (8 shorts) are always
 * written, so with asserts disabled a non-multiple size writes beyond
 * 2*size shorts.
 */
PF_TARGET_CLONES
void generate_pos_neg_fs2_s16(short* output, int size)
{
    const int half = SHRT_MAX / 2;
    const short group[8] = {
        (short)half,    (short)0,   /* exp(i* 0 ) = +1 */
        (short)(-half), (short)0,   /* exp(i* pi) = -1 */
        (short)half,    (short)0,   /* exp(i* 0 ) = +1 */
        (short)(-half), (short)0    /* exp(i* pi) = -1 */
    };

    assert((size & 3) == 0);
    for (int base = 0; base < 2 * size; base += 8)
    {
        for (int k = 0; k < 8; ++k)
            output[base + k] = group[k];
    }
}
|
||||||
|
|
||||||
|
/* Writes a signal whose real part alternates sign every sample (a carrier at
 * half the sample rate) and whose imaginary part carries a constant offset,
 * with m = SHRT_MAX / 2 (interleaved real/imaginary shorts):
 *   (+m, m)  (-m, m)  (+m, m)  (-m, m)  ...
 * The constant part is therefore dc = i*m.
 *
 * 'size' is the number of complex samples and must be a multiple of 4
 * (checked by assert). Groups of 4 complex samples (8 shorts) are always
 * written, so with asserts disabled a non-multiple size writes beyond
 * 2*size shorts.
 */
PF_TARGET_CLONES
void generate_dc_pos_neg_fs2_s16(short* output, int size)
{
    const int half = SHRT_MAX / 2;
    const short group[8] = {
        (short)half,    (short)half,   /* dc + exp(i* 0 ) = i*1 +1 */
        (short)(-half), (short)half,   /* dc + exp(i* pi) = i*1 -1 */
        (short)half,    (short)half,   /* dc + exp(i* 0 ) = i*1 +1 */
        (short)(-half), (short)half    /* dc + exp(i* pi) = i*1 -1 */
    };

    assert((size & 3) == 0);
    for (int base = 0; base < 2 * size; base += 8)
    {
        for (int k = 0; k < 8; ++k)
            output[base + k] = group[k];
    }
}
|
||||||
|
|
||||||
|
|
||||||
75
pffft/pf_carrier.h
Normal file
75
pffft/pf_carrier.h
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
/*
|
||||||
|
This software is part of pffft/pfdsp, a set of simple DSP routines.
|
||||||
|
|
||||||
|
Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
|
||||||
|
Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
* Neither the name of the copyright holder nor the
|
||||||
|
names of its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
|
||||||
|
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
_____ _
|
||||||
|
/ ____| | |
|
||||||
|
| | ___ _ __ ___ _ __ | | _____ __
|
||||||
|
| | / _ \| '_ ` _ \| '_ \| |/ _ \ \/ /
|
||||||
|
| |___| (_) | | | | | | |_) | | __/> <
|
||||||
|
\_____\___/|_| |_| |_| .__/|_|\___/_/\_\
|
||||||
|
| |
|
||||||
|
|_|
|
||||||
|
*/
|
||||||
|
|
||||||
|
typedef struct complexf_s { float i; float q; } complexf;
|
||||||
|
|
||||||
|
|
||||||
|
/* generation functions */
|
||||||
|
void generate_dc_f(float* output, int size);
|
||||||
|
void generate_dc_s16(short* output, int size);
|
||||||
|
void generate_pos_fs4_f(float* output, int size);
|
||||||
|
void generate_pos_fs4_s16(short* output, int size);
|
||||||
|
void generate_neg_fs4_f(float* output, int size);
|
||||||
|
void generate_neg_fs4_s16(short* output, int size);
|
||||||
|
|
||||||
|
void generate_dc_pos_fs4_s16(short* output, int size);
|
||||||
|
void generate_dc_neg_fs4_s16(short* output, int size);
|
||||||
|
void generate_pos_neg_fs4_s16(short* output, int size);
|
||||||
|
void generate_dc_pos_neg_fs4_s16(short* output, int size);
|
||||||
|
|
||||||
|
void generate_pos_neg_fs2_s16(short* output, int size);
|
||||||
|
void generate_dc_pos_neg_fs2_s16(short* output, int size);
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
255
pffft/pf_cic.cpp
Normal file
255
pffft/pf_cic.cpp
Normal file
@@ -0,0 +1,255 @@
|
|||||||
|
/*
|
||||||
|
This software is part of pffft/pfdsp, a set of simple DSP routines.
|
||||||
|
|
||||||
|
Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
|
||||||
|
Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
* Neither the name of the copyright holder nor the
|
||||||
|
names of its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
|
||||||
|
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* gcc requires this for M_PI !? */
|
||||||
|
#undef __STRICT_ANSI__
|
||||||
|
|
||||||
|
/* include own header first, to see missing includes */
|
||||||
|
#include "pf_cic.h"
|
||||||
|
#include "fmv.h"
|
||||||
|
|
||||||
|
#include <math.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <limits.h>
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
____ ___ ____ ____ ____ ____
|
||||||
|
/ ___|_ _/ ___| | _ \| _ \ / ___|
|
||||||
|
| | | | | | | | | | | | |
|
||||||
|
| |___ | | |___ | |_| | |_| | |___
|
||||||
|
\____|___\____| |____/|____/ \____|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define SINESHIFT 12
|
||||||
|
#define SINESIZE (1<<SINESHIFT)
|
||||||
|
typedef int64_t cic_dt; // data type used for integrators and combs
|
||||||
|
typedef struct {
|
||||||
|
int factor;
|
||||||
|
uint64_t phase;
|
||||||
|
float gain;
|
||||||
|
cic_dt ig0a, ig0b, ig1a, ig1b;
|
||||||
|
cic_dt comb0a, comb0b, comb1a, comb1b;
|
||||||
|
int16_t *sinetable;
|
||||||
|
} cicddc_t;
|
||||||
|
|
||||||
|
void *cicddc_init(int factor) {
|
||||||
|
int i;
|
||||||
|
int sinesize2 = SINESIZE * 5/4; // 25% extra to get cosine from the same table
|
||||||
|
cicddc_t *s;
|
||||||
|
s = (cicddc_t *)malloc(sizeof(cicddc_t));
|
||||||
|
memset(s, 0, sizeof(cicddc_t));
|
||||||
|
|
||||||
|
float sineamp = 32767.0f;
|
||||||
|
s->factor = factor;
|
||||||
|
s->gain = 1.0f / SHRT_MAX / sineamp / factor / factor / factor; // compensate for gain of 3 integrators
|
||||||
|
|
||||||
|
s->sinetable = (int16_t *)malloc(sinesize2 * sizeof(*s->sinetable));
|
||||||
|
double f = 2.0 * M_PI / (double)SINESIZE;
|
||||||
|
for(i = 0; i < sinesize2; i++) {
|
||||||
|
s->sinetable[i] = sineamp * cos(f * i);
|
||||||
|
}
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Release a state created by cicddc_init(), including its lookup table.
 * Passing NULL is a no-op, mirroring free().
 */
void cicddc_free(void *state) {
    cicddc_t *s = (cicddc_t *)state;
    if (!s)
        return;
    free(s->sinetable);
    free(s);
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Mix real 16-bit input with the table oscillator and decimate it through
 * the integrator/comb chain, writing complex float output.
 *
 * state:   pointer returned by cicddc_init()
 * input:   outsize * factor real samples are read
 * output:  outsize complex samples are written
 * outsize: number of output samples to produce
 * rate:    oscillator phase increment per input sample, as a fraction of a
 *          full table period; must satisfy |rate| < 1
 *
 * Integrator, comb and phase values are carried over between calls in 'state'.
 */
PF_TARGET_CLONES
void cicddc_s16_c(void *state, int16_t *input, complexf *output, int outsize, float rate) {
    cicddc_t *s = (cicddc_t *)state;
    int k;
    int factor = s->factor;
    cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
    cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
    uint64_t phase = s->phase, freq;
    int16_t *sinetable = s->sinetable;
    float gain = s->gain;

    /* rate scaled to the 64-bit phase range (rate * 2^64) */
    const float scaled_rate = rate * ((float)(1ULL << 63) * 2);
    /* Converting a negative float straight to an unsigned integer is undefined
       behaviour (some targets saturate to 0). Convert the magnitude instead and
       negate it modulo 2^64, which is well defined. */
    if (scaled_rate < 0.0f)
        freq = (uint64_t)0 - (uint64_t)(-scaled_rate);
    else
        freq = (uint64_t)scaled_rate;

    int16_t *inp = input;
    for(k = 0; k < outsize; k++) {
        int i;
        cic_dt out0a, out0b, out1a, out1b;
        cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
        for(i = 0; i < factor; i++) {
            cic_dt in_a, in_b;
            /* top SINESHIFT bits of the phase select the table entry */
            int sinep = phase >> (64-SINESHIFT);
            /* second lookup is shifted by a quarter of the table period */
            in_a = (int32_t)inp[i] * (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];
            in_b = (int32_t)inp[i] * (int32_t)sinetable[sinep];
            phase += freq;
            /* integrators:
               The calculations are ordered so that each integrator
               takes a result from previous loop iteration
               to make the code more "pipeline-friendly". */
            ig2a += ig1a; ig2b += ig1b;
            ig1a += ig0a; ig1b += ig0b;
            ig0a += in_a; ig0b += in_b;
        }
        inp += factor;
        // comb filters:
        out0a = ig2a - comb0a; out0b = ig2b - comb0b;
        comb0a = ig2a; comb0b = ig2b;
        out1a = out0a - comb1a; out1b = out0b - comb1b;
        comb1a = out0a; comb1b = out0b;

        output[k].i = (float)out1a * gain;
        output[k].q = (float)out1b * gain;
    }

    /* persist the running state for the next call */
    s->ig0a = ig0a; s->ig0b = ig0b;
    s->ig1a = ig1a; s->ig1b = ig1b;
    s->comb0a = comb0a; s->comb0b = comb0b;
    s->comb1a = comb1a; s->comb1b = comb1b;
    s->phase = phase;
}
|
||||||
|
|
||||||
|
/*
 * Mix complex 16-bit input (interleaved value pairs) with the table
 * oscillator and decimate it through the integrator/comb chain, writing
 * complex float output.
 *
 * state:   pointer returned by cicddc_init()
 * input:   2 * outsize * factor int16 values are read (pairs per sample)
 * output:  outsize complex samples are written
 * outsize: number of output samples to produce
 * rate:    oscillator phase increment per input sample, as a fraction of a
 *          full table period; must satisfy |rate| < 1
 *
 * Integrator, comb and phase values are carried over between calls in 'state'.
 */
PF_TARGET_CLONES
void cicddc_cs16_c(void *state, int16_t *input, complexf *output, int outsize, float rate) {
    cicddc_t *s = (cicddc_t *)state;
    int k;
    int factor = s->factor;
    cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
    cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
    uint64_t phase = s->phase, freq;
    int16_t *sinetable = s->sinetable;
    float gain = s->gain;

    /* rate scaled to the 64-bit phase range (rate * 2^64) */
    const float scaled_rate = rate * ((float)(1ULL << 63) * 2);
    /* Converting a negative float straight to an unsigned integer is undefined
       behaviour (some targets saturate to 0). Convert the magnitude instead and
       negate it modulo 2^64, which is well defined. */
    if (scaled_rate < 0.0f)
        freq = (uint64_t)0 - (uint64_t)(-scaled_rate);
    else
        freq = (uint64_t)scaled_rate;

    int16_t *inp = input;
    for(k = 0; k < outsize; k++) {
        int i;
        cic_dt out0a, out0b, out1a, out1b;
        cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
        for(i = 0; i < factor; i++) {
            cic_dt in_a, in_b;
            int32_t m_a, m_b, m_c, m_d;
            /* top SINESHIFT bits of the phase select the table entry */
            int sinep = phase >> (64-SINESHIFT);
            m_a = inp[2*i];
            m_b = inp[2*i+1];
            /* second lookup is shifted by a quarter of the table period */
            m_c = (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];
            m_d = (int32_t)sinetable[sinep];
            // complex multiplication:
            in_a = m_a*m_c - m_b*m_d;
            in_b = m_a*m_d + m_b*m_c;
            phase += freq;
            /* integrators:
               The calculations are ordered so that each integrator
               takes a result from previous loop iteration
               to make the code more "pipeline-friendly". */
            ig2a += ig1a; ig2b += ig1b;
            ig1a += ig0a; ig1b += ig0b;
            ig0a += in_a; ig0b += in_b;
        }
        inp += 2*factor;
        // comb filters:
        out0a = ig2a - comb0a; out0b = ig2b - comb0b;
        comb0a = ig2a; comb0b = ig2b;
        out1a = out0a - comb1a; out1b = out0b - comb1b;
        comb1a = out0a; comb1b = out0b;

        output[k].i = (float)out1a * gain;
        output[k].q = (float)out1b * gain;
    }

    /* persist the running state for the next call */
    s->ig0a = ig0a; s->ig0b = ig0b;
    s->ig1a = ig1a; s->ig1b = ig1b;
    s->comb0a = comb0a; s->comb0b = comb0b;
    s->comb1a = comb1a; s->comb1b = comb1b;
    s->phase = phase;
}
|
||||||
|
|
||||||
|
|
||||||
|
/* This is almost copy paste from cicddc_cs16_c.
|
||||||
|
I'm afraid this is going to be annoying to maintain... */
|
||||||
|
/*
 * Mix complex unsigned 8-bit input (interleaved value pairs) with the table
 * oscillator and decimate it through the integrator/comb chain, writing
 * complex float output. Each input byte is shifted left by 8 bits and
 * offset by 32614 before mixing.
 *
 * state:   pointer returned by cicddc_init()
 * input:   2 * outsize * factor bytes are read (pairs per sample)
 * output:  outsize complex samples are written
 * outsize: number of output samples to produce
 * rate:    oscillator phase increment per input sample, as a fraction of a
 *          full table period; must satisfy |rate| < 1
 *
 * Integrator, comb and phase values are carried over between calls in 'state'.
 */
PF_TARGET_CLONES
void cicddc_cu8_c(void *state, uint8_t *input, complexf *output, int outsize, float rate) {
    cicddc_t *s = (cicddc_t *)state;
    int k;
    int factor = s->factor;
    cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
    cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
    uint64_t phase = s->phase, freq;
    int16_t *sinetable = s->sinetable;
    float gain = s->gain;

    /* rate scaled to the 64-bit phase range (rate * 2^64) */
    const float scaled_rate = rate * ((float)(1ULL << 63) * 2);
    /* Converting a negative float straight to an unsigned integer is undefined
       behaviour (some targets saturate to 0). Convert the magnitude instead and
       negate it modulo 2^64, which is well defined. */
    if (scaled_rate < 0.0f)
        freq = (uint64_t)0 - (uint64_t)(-scaled_rate);
    else
        freq = (uint64_t)scaled_rate;

    uint8_t *inp = input;
    for(k = 0; k < outsize; k++) {
        int i;
        cic_dt out0a, out0b, out1a, out1b;
        cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
        for(i = 0; i < factor; i++) {
            cic_dt in_a, in_b;
            int32_t m_a, m_b, m_c, m_d;
            /* top SINESHIFT bits of the phase select the table entry */
            int sinep = phase >> (64-SINESHIFT);
            // subtract 127.4 (good for rtl-sdr)
            m_a = (((int32_t)inp[2*i]) << 8) - 32614;
            m_b = (((int32_t)inp[2*i+1]) << 8) - 32614;
            /* second lookup is shifted by a quarter of the table period */
            m_c = (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];
            m_d = (int32_t)sinetable[sinep];
            // complex multiplication:
            in_a = m_a*m_c - m_b*m_d;
            in_b = m_a*m_d + m_b*m_c;
            phase += freq;
            /* integrators:
               The calculations are ordered so that each integrator
               takes a result from previous loop iteration
               to make the code more "pipeline-friendly". */
            ig2a += ig1a; ig2b += ig1b;
            ig1a += ig0a; ig1b += ig0b;
            ig0a += in_a; ig0b += in_b;
        }
        inp += 2*factor;
        // comb filters:
        out0a = ig2a - comb0a; out0b = ig2b - comb0b;
        comb0a = ig2a; comb0b = ig2b;
        out1a = out0a - comb1a; out1b = out0b - comb1b;
        comb1a = out0a; comb1b = out0b;

        output[k].i = (float)out1a * gain;
        output[k].q = (float)out1b * gain;
    }

    /* persist the running state for the next call */
    s->ig0a = ig0a; s->ig0b = ig0b;
    s->ig1a = ig1a; s->ig1b = ig1b;
    s->comb0a = comb0a; s->comb0b = comb0b;
    s->comb1a = comb1a; s->comb1b = comb1b;
    s->phase = phase;
}
|
||||||
|
|
||||||
58
pffft/pf_cic.h
Normal file
58
pffft/pf_cic.h
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
/*
|
||||||
|
This software is part of pffft/pfdsp, a set of simple DSP routines.
|
||||||
|
|
||||||
|
Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
|
||||||
|
Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
* Neither the name of the copyright holder nor the
|
||||||
|
names of its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
|
||||||
|
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
____ ___ ____ ____ ____ ____
|
||||||
|
/ ___|_ _/ ___| | _ \| _ \ / ___|
|
||||||
|
| | | | | | | | | | | | |
|
||||||
|
| |___ | | |___ | |_| | |_| | |___
|
||||||
|
\____|___\____| |____/|____/ \____|
|
||||||
|
*/
|
||||||
|
|
||||||
|
typedef struct complexf_s { float i; float q; } complexf;
|
||||||
|
|
||||||
|
void *cicddc_init(int factor);
|
||||||
|
void cicddc_free(void *state);
|
||||||
|
void cicddc_s16_c(void *state, int16_t *input, complexf *output, int outsize, float rate);
|
||||||
|
void cicddc_cs16_c(void *state, int16_t *input, complexf *output, int outsize, float rate);
|
||||||
|
void cicddc_cu8_c(void *state, uint8_t *input, complexf *output, int outsize, float rate);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
322
pffft/pf_conv.cpp
Normal file
322
pffft/pf_conv.cpp
Normal file
@@ -0,0 +1,322 @@
|
|||||||
|
|
||||||
|
#include "pf_conv.h"
|
||||||
|
|
||||||
|
#include <string.h>
|
||||||
|
#include <assert.h>
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#define DPRINT(...) fprintf(stderr, __VA_ARGS__)
|
||||||
|
|
||||||
|
#else
|
||||||
|
#define DPRINT(...) do { } while (0)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef HAVE_MIPP
|
||||||
|
#include <mipp.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef CONV_ARCH_POST
|
||||||
|
#error CONV_ARCH_POST not defined
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define PP_STRINGIFY(X) #X
|
||||||
|
#define PP_TOSTRING(X) PP_STRINGIFY(X)
|
||||||
|
#define PP_CONCAT_IMPL(x, y) x##y
|
||||||
|
#define PP_CONCAT(x, y) PP_CONCAT_IMPL( x, y )
|
||||||
|
|
||||||
|
#define ARCHFUNCNAME(X) PP_CONCAT(X##_,CONV_ARCH_POST)
|
||||||
|
|
||||||
|
|
||||||
|
// Returns the architecture suffix (CONV_ARCH_POST, stringified) this
// translation unit was compiled for.
const char * ARCHFUNCNAME(id)()
{
    const char * const arch_name = PP_TOSTRING(CONV_ARCH_POST);
    return arch_name;
}
|
||||||
|
|
||||||
|
|
||||||
|
// Returns the number of floats per SIMD register used by this kernel:
// mipp::N<float>() when MIPP intrinsics are active, otherwise 1.
int ARCHFUNCNAME(conv_float_simd_size)()
{
#if defined(HAVE_MIPP) && !defined(MIPP_NO_INTRINSICS)
    return mipp::N<float>();
#else
    // scalar fallback: completely MIPP independent implementation
    return 1;
#endif
}
|
||||||
|
|
||||||
|
|
||||||
|
// Moves the samples that the previous convolution call left unprocessed
// (indices [state->offset, state->size)) to the start of the buffer 's'
// and updates 'state' so that offset is 0 and size is the number of
// remaining samples (0 when nothing is left).
void ARCHFUNCNAME(conv_float_move_rest)(float * RESTRICT s, conv_buffer_state * RESTRICT state)
{
    int R = state->size - state->offset;    // this many samples from prev conv_float were not processed
    if (R > 0)
    {
        // With offset == 0 the data already sits at the begin; std::copy
        // must not be called with the destination inside [first, last).
        if (state->offset != 0)
            std::copy(&s[state->offset], &s[state->size], s);   // move them to the begin
    }
    else
        R = 0;
    state->offset = 0;      // data - to be processed - is at begin
    state->size = R;        // this many unprocessed samples
}
|
||||||
|
|
||||||
|
|
||||||
|
// Moves the complex samples that the previous convolution call left
// unprocessed (indices [state->offset, state->size)) to the start of the
// buffer 's' and updates 'state' so that offset is 0 and size is the number
// of remaining samples (0 when nothing is left).
void ARCHFUNCNAME(conv_cplx_move_rest)(complexf * RESTRICT s, conv_buffer_state * RESTRICT state)
{
    int R = state->size - state->offset;    // this many samples from prev conv were not processed
    if (R > 0)
    {
        // With offset == 0 the data already sits at the begin; std::copy
        // must not be called with the destination inside [first, last).
        if (state->offset != 0)
            std::copy(&s[state->offset], &s[state->size], s);   // move them to the begin
    }
    else
        R = 0;
    state->offset = 0;      // data - to be processed - is at begin
    state->size = R;        // this many unprocessed samples
}
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(MIPP_NO_INTRINSICS)
|
||||||
|
// have a completely MIPP independent implementation
|
||||||
|
// #error missing HAVE_MIPP: there is no MIPP-independent implementation
|
||||||
|
|
||||||
|
// Scalar in-place convolution: for every start index from state->offset for
// which a full filter window fits before state->size, the dot product of
// the window with 'filter' overwrites the first sample of that window.
// state->offset is advanced past the processed samples.
// Returns the number of samples produced.
int ARCHFUNCNAME(conv_float_inplace)(
    float * RESTRICT s, conv_buffer_state * RESTRICT state,
    const float * RESTRICT filter, const int sz_filter
)
{
    const int first = state->offset;
    const int total = state->size;
    int pos = first;

    while (pos + sz_filter <= total)
    {
        float sum = 0.0F;
        for (int tap = 0; tap < sz_filter; ++tap)
            sum += s[pos+tap] * filter[tap];
        s[pos] = sum;
        ++pos;
    }

    state->offset = pos;
    return pos - first;
}
|
||||||
|
|
||||||
|
|
||||||
|
// Scalar out-of-place convolution: for every start index from state->offset
// for which a full filter window fits before state->size, the dot product of
// the window with 'filter' is stored in 'y' at the same index.
// state->offset is advanced past the processed samples.
// Returns the number of samples produced.
int ARCHFUNCNAME(conv_float_oop)(
    const float * RESTRICT s, conv_buffer_state * RESTRICT state,
    const float * RESTRICT filter, const int sz_filter,
    float * RESTRICT y
)
{
    const int first = state->offset;
    const int total = state->size;
    int pos = first;

    while (pos + sz_filter <= total)
    {
        float sum = 0.0F;
        for (int tap = 0; tap < sz_filter; ++tap)
            sum += s[pos+tap] * filter[tap];
        y[pos] = sum;
        ++pos;
    }

    state->offset = pos;
    return pos - first;
}
|
||||||
|
|
||||||
|
|
||||||
|
// Scalar out-of-place convolution of complex samples with a real filter:
// for every start index from state->offset for which a full filter window
// fits before state->size, the real and imaginary parts of the window are
// each multiplied with 'filter' and summed, and the result is stored in
// 'y_cplx' at the same index. state->offset is advanced past the processed
// samples. Returns the number of samples produced.
int ARCHFUNCNAME(conv_cplx_float_oop)(
    const complexf * RESTRICT s_cplx, conv_buffer_state * RESTRICT state,
    const float * RESTRICT filter, const int sz_filter,
    complexf * RESTRICT y_cplx
)
{
    const int off0 = state->offset;
    const int sz_s = state->size;
    const int sz_f = sz_filter;
    int offset;

    for ( offset = off0; offset + sz_f <= sz_s; ++offset)
    {
        float accu_re = 0.0F;
        float accu_im = 0.0F;
        for (int k = 0; k < sz_filter; ++k)
        {
            // accumulate over all taps (a plain assignment would keep only the last tap)
            accu_re += s_cplx[offset+k].i * filter[k];
            accu_im += s_cplx[offset+k].q * filter[k];
        }
        y_cplx[offset].i = accu_re;   // sum of real parts
        y_cplx[offset].q = accu_im;   // sum of imag parts
    }

    state->offset = offset;
    return offset - off0;
}
|
||||||
|
|
||||||
|
|
||||||
|
#elif defined(HAVE_MIPP)
|
||||||
|
|
||||||
|
|
||||||
|
// MIPP in-place convolution: for every start index from state->offset for
// which a full filter window fits before state->size, the dot product of
// the window with 'filter' overwrites the first sample of that window.
// sz_filter must be a multiple of mipp::N<float>() (checked by assert) and
// 'filter' is read with aligned loads. state->offset is advanced past the
// processed samples. Returns the number of samples produced.
int ARCHFUNCNAME(conv_float_inplace)(
    float * RESTRICT s, conv_buffer_state * RESTRICT state,
    const float * RESTRICT filter, const int sz_filter
)
{
    assert( (sz_filter % mipp::N<float>()) == 0 );  // size of filter must be divisible by conv_float_simd_size()

    mipp::Reg<float> accu, rS, rH;
    const int off0 = state->offset;
    const int sz_s = state->size;
    int offset;

    for ( offset = off0; offset + sz_filter <= sz_s; ++offset)
    {
        accu.set0();
        for (int k = 0; k < sz_filter; k += mipp::N<float>())
        {
            // offset advances one sample at a time, so &s[offset+k] is not
            // register-aligned in general: use the unaligned load
            rS.loadu(&s[offset+k]);
            rH.load(&filter[k]);
            accu = mipp::fmadd(rS, rH, accu);  // accu += rS * rH;
        }
        s[offset] = accu.sum();     // == hadd()
    }

    state->offset = offset;
    return offset - off0;
}
|
||||||
|
|
||||||
|
|
||||||
|
// MIPP out-of-place convolution: for every start index from state->offset
// for which a full filter window fits before state->size, the dot product of
// the window with 'filter' is stored in 'y' at the same index.
// sz_filter must be a multiple of mipp::N<float>() (checked by assert);
// input is read with unaligned loads, 'filter' with aligned loads.
// state->offset is advanced past the processed samples.
// Returns the number of samples produced.
int ARCHFUNCNAME(conv_float_oop)(
    const float * RESTRICT s, conv_buffer_state * RESTRICT state,
    const float * RESTRICT filter, const int sz_filter,
    float * RESTRICT y
)
{
    assert( (sz_filter % mipp::N<float>()) == 0 );  // size of filter must be divisible by conv_float_simd_size()

    const int first = state->offset;
    const int total = state->size;
    mipp::Reg<float> sum, in, taps;
    int pos = first;

    while (pos + sz_filter <= total)
    {
        sum.set0();
        for (int tap = 0; tap < sz_filter; tap += mipp::N<float>())
        {
            in.loadu(&s[pos+tap]);
            taps.load(&filter[tap]);
            sum = mipp::fmadd(in, taps, sum);   // sum += in * taps
        }
        y[pos] = sum.sum();     // horizontal add of all lanes
        ++pos;
    }

    state->offset = pos;
    return pos - first;
}
|
||||||
|
|
||||||
|
|
||||||
|
// MIPP out-of-place convolution of complex samples with a real filter.
// The complex buffers are accessed as interleaved float arrays; every filter
// register is interleaved with itself so each tap multiplies both floats of
// one complex sample. The accumulated pair of registers is de-interleaved
// and each half is summed into the two floats of the output sample.
// sz_filter must be a multiple of mipp::N<float>() (checked by assert).
// state->offset is advanced past the processed samples.
// Returns the number of complex samples produced.
int ARCHFUNCNAME(conv_cplx_float_oop)(
    const complexf * RESTRICT s_cplx, conv_buffer_state * RESTRICT state,
    const float * RESTRICT filter, const int sz_filter,
    complexf * RESTRICT y_cplx
)
{
    assert( (sz_filter % mipp::N<float>()) == 0 );  // size of filter must be divisible by conv_float_simd_size()

    // float views of the complex buffers
    const float * RESTRICT s = &(s_cplx[0].i);
    float * RESTRICT y = &(y_cplx[0].i);

    // all positions below are counted in floats, i.e. twice the complex index
    const int first  = 2 * state->offset;
    const int total  = 2 * state->size;
    const int window = 2 * sz_filter;

    mipp::Regx2<float> sum2, in2, taps2;
    int pos = first;

    while (pos + window <= total)
    {
        sum2.val[0].set0();
        sum2.val[1].set0();
        for (int tap = 0; tap < sz_filter; tap += mipp::N<float>())
        {
            mipp::Reg<float> taps;
            in2.loadu(&s[pos+2*tap]);
            taps.load(&filter[tap]);
            taps2 = mipp::interleave<float>(taps, taps);
            sum2.val[0] = mipp::fmadd(in2.val[0], taps2.val[0], sum2.val[0]);   // sum += in * taps
            sum2.val[1] = mipp::fmadd(in2.val[1], taps2.val[1], sum2.val[1]);   // sum += in * taps
        }
        taps2 = mipp::deinterleave(sum2);
        y[pos]   = taps2.val[0].sum();      // == hadd() == sum of real parts
        y[pos+1] = taps2.val[1].sum();      // == hadd() == sum of imag parts
        pos += 2;
    }

    state->offset = pos /2;
    return (pos - first) / 2;
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
// Function pointer table of this architecture variant.
// Members are initialised positionally in the order declared in conv_f_ptrs:
// id, using_mipp, fp_id, fp_conv_float_simd_size, then the kernels.
static const conv_f_ptrs conv_ptrs =
{
    PP_TOSTRING(CONV_ARCH_POST),
    // using_mipp: only claim MIPP when its header was included (HAVE_MIPP)
    // and it provides intrinsics; without HAVE_MIPP, MIPP_NO_INTRINSICS is
    // normally undefined although no MIPP code is compiled at all
#if defined(HAVE_MIPP) && !defined(MIPP_NO_INTRINSICS)
    1,
#else
    0,
#endif

    ARCHFUNCNAME(id),
    ARCHFUNCNAME(conv_float_simd_size),

#if defined(MIPP_NO_INTRINSICS) || defined(HAVE_MIPP)
    ARCHFUNCNAME(conv_float_move_rest),
    ARCHFUNCNAME(conv_float_inplace),
    ARCHFUNCNAME(conv_float_oop),

    ARCHFUNCNAME(conv_cplx_move_rest),
    ARCHFUNCNAME(conv_cplx_float_oop)
#else
    // no convolution kernels are compiled in this configuration
    nullptr,
    nullptr,
    nullptr,

    nullptr,
    nullptr
#endif
};
|
||||||
|
|
||||||
|
|
||||||
|
// Returns the function pointer table of this architecture variant, or
// nullptr when the variant is not usable:
//  - id "none": always returned
//  - MIPP_NO_INTRINSICS defined: always returned (scalar kernels)
//  - HAVE_MIPP defined: returned only if using_mipp is set and the SIMD
//    width is greater than 1
//  - otherwise: nullptr
const conv_f_ptrs* ARCHFUNCNAME(conv_ptrs)()
{
    const conv_f_ptrs * const self = &conv_ptrs;

    DPRINT("arch pointer for '%s':\n", conv_ptrs.id);
    if (strcmp(conv_ptrs.id, "none") == 0)
        return self;

#if defined(MIPP_NO_INTRINSICS)
    DPRINT("arch pointer for '%s' - BUT defined(MIPP_NO_INTRINSICS)\n", conv_ptrs.id);
    return self;
#else
#  if defined(HAVE_MIPP)
    DPRINT("arch pointer for '%s' - defined(HAVE_MIPP)\n", conv_ptrs.id);
    DPRINT("'%s': conv_ptrs.using_mipp %d\n", conv_ptrs.id, conv_ptrs.using_mipp);
    DPRINT("'%s': simd_size() %d\n", conv_ptrs.id, conv_ptrs.fp_conv_float_simd_size());
    {
        const bool usable = conv_ptrs.using_mipp && conv_ptrs.fp_conv_float_simd_size() > 1;
        if (usable)
            return self;
    }
    DPRINT("arch pointer for '%s': HAVE_MIPP BUT using_mipp %d, float_simd_size %d\n", conv_ptrs.id, conv_ptrs.using_mipp, conv_ptrs.fp_conv_float_simd_size());
#  else
    DPRINT("arch pointer for '%s': neither MIPP_NO_INTRINSICS nor HAVE_MIPP\n", conv_ptrs.id);
#  endif
    DPRINT("arch pointer for '%s' => nullptr\n", conv_ptrs.id);
    return nullptr;
#endif
}
|
||||||
|
|
||||||
|
#if defined(__cplusplus) && (__cplusplus >= 201703L)
|
||||||
|
[[maybe_unused]]
|
||||||
|
#endif
|
||||||
|
static f_conv_ptrs test_f_ptrs = ARCHFUNCNAME(conv_ptrs);
|
||||||
|
|
||||||
109
pffft/pf_conv.h
Normal file
109
pffft/pf_conv.h
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
/* pf_conv.h/.cpp implements linear "slow" convolution.
|
||||||
|
* this code is primarily for test/demonstration of runtime dispatching.
|
||||||
|
* each "kernel" is compiled with different compiler/architecture options,
|
||||||
|
* that activates different implementations in the MIPP headers.
|
||||||
|
*
|
||||||
|
* the dispatcher library 'pf_conv_dispatcher' collects (links against)
|
||||||
|
* all the pf_conv_arch_<opt> libraries ..
|
||||||
|
* and provides the get_all_conv_arch_ptrs() function,
|
||||||
|
* which delivers an array of pointers to the struct (conv_f_ptrs)
|
||||||
|
* containing the function pointers for the different implementations.
|
||||||
|
*
|
||||||
|
* requirement(s):
|
||||||
|
* - installed MIPP headers
|
||||||
|
* - compiler definitions for the different architecture types:
|
||||||
|
* see CMakeLists.txt CONV_ARCH_MSVC_AMD64, CONV_ARCH_GCC_ARM32NEON, ..
|
||||||
|
* - one cmake library target pf_conv_arch_<opt> for each architecture option.
|
||||||
|
* each one gets its specific architecture/compiler options
|
||||||
|
* utilizing the target_set_cxx_arch_option() macro in the CMakeLists.txt
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "pf_cplx.h"
|
||||||
|
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
# define RESTRICT __restrict
|
||||||
|
#elif defined(__GNUC__)
|
||||||
|
# define RESTRICT __restrict
|
||||||
|
#else
|
||||||
|
# define RESTRICT
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
// Bookkeeping for a sample buffer that is processed by the conv functions.
struct conv_buffer_state
{
    // sample index where data (to process) starts
    int offset;

    // actual - or previous - size in amount of samples,
    // counted from buffer start (NOT from offset)
    int size;
};
|
||||||
|
|
||||||
|
// declare provided function pointer types
|
||||||
|
|
||||||
|
typedef const char * (*f_conv_id)();
|
||||||
|
|
||||||
|
typedef int (*f_conv_float_simd_size)();
|
||||||
|
|
||||||
|
typedef void (*f_conv_float_move_rest)(float * RESTRICT s, conv_buffer_state * RESTRICT state);
|
||||||
|
typedef void (*f_conv_cplx_move_rest)(complexf * RESTRICT s, conv_buffer_state * RESTRICT state);
|
||||||
|
|
||||||
|
typedef int (*f_conv_float_inplace)(
|
||||||
|
float * RESTRICT s, conv_buffer_state * RESTRICT state,
|
||||||
|
const float * RESTRICT filter, const int sz_filter
|
||||||
|
);
|
||||||
|
|
||||||
|
typedef int (*f_conv_float_oop)(
|
||||||
|
const float * RESTRICT s, conv_buffer_state * RESTRICT state,
|
||||||
|
const float * RESTRICT filter, const int sz_filter,
|
||||||
|
float * RESTRICT y
|
||||||
|
);
|
||||||
|
|
||||||
|
typedef int (*f_conv_cplx_float_oop)(
|
||||||
|
const complexf * RESTRICT s, conv_buffer_state * RESTRICT state,
|
||||||
|
const float * RESTRICT filter, const int sz_filter,
|
||||||
|
complexf * RESTRICT y
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
// struct with the provided function pointers
|
||||||
|
struct conv_f_ptrs
|
||||||
|
{
|
||||||
|
const char * id;
|
||||||
|
const int using_mipp;
|
||||||
|
f_conv_id fp_id;
|
||||||
|
f_conv_float_simd_size fp_conv_float_simd_size;
|
||||||
|
|
||||||
|
f_conv_float_move_rest fp_conv_float_move_rest;
|
||||||
|
f_conv_float_inplace fp_conv_float_inplace;
|
||||||
|
f_conv_float_oop fp_conv_float_oop;
|
||||||
|
|
||||||
|
f_conv_cplx_move_rest fp_conv_cplx_move_rest;
|
||||||
|
f_conv_cplx_float_oop fp_conv_cplx_float_oop;
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef const conv_f_ptrs * ptr_to_conv_f_ptrs;
|
||||||
|
|
||||||
|
// function pointer type, delivering the struct with the function pointers
|
||||||
|
typedef const conv_f_ptrs* (*f_conv_ptrs)();
|
||||||
|
|
||||||
|
|
||||||
|
// helper for systematic function names
|
||||||
|
#define CONV_FN_ARCH(FN, ARCH) FN##_##ARCH
|
||||||
|
|
||||||
|
// declare all functions - returning the structs with the function pointers
|
||||||
|
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, none)(); // = conv_ptrs_none()
|
||||||
|
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, dflt)(); // simd / mipp is activated
|
||||||
|
|
||||||
|
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, sse3)(); // = conv_ptrs_sse3()
|
||||||
|
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, sse4)();
|
||||||
|
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx)();
|
||||||
|
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx2)();
|
||||||
|
|
||||||
|
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, sse2)();
|
||||||
|
//extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx)(); // already declared
|
||||||
|
//extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx2)(); // already declared
|
||||||
|
|
||||||
|
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, neon_vfpv4)(); // for armv7l / 32-bit ARM
|
||||||
|
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, neon_rpi3_a53)();
|
||||||
|
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, neon_rpi4_a72)();
|
||||||
|
|
||||||
|
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, armv8a)(); // for aarch64
|
||||||
61
pffft/pf_conv_dispatcher.cpp
Normal file
61
pffft/pf_conv_dispatcher.cpp
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
|
||||||
|
#include "pf_conv_dispatcher.h"
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#define DPRINT(...) fprintf(stderr, __VA_ARGS__)
|
||||||
|
|
||||||
|
#else
|
||||||
|
#define DPRINT(...) do { } while (0)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#define N_DEFAULT_ARCHES 2
|
||||||
|
// 0 is "none"
|
||||||
|
// 1 "dflt"
|
||||||
|
|
||||||
|
// Returns the table of per-architecture function-pointer structs.
//
// The table is built once, on the first call, and cached in function-local
// statics; later calls return the same pointer.
//   - entry 0 is always the "none" variant, entry 1 the "dflt" variant
//   - the following entries depend on the CONV_ARCH_* macro selected at
//     compile time (no runtime CPU detection is done here - see @TODO)
//
// p_num_arch: optional (may be null); receives the number of valid entries.
// returns:    pointer to the first entry of the table.
ptr_to_conv_f_ptrs * get_all_conv_arch_ptrs(int * p_num_arch)
{
    static ptr_to_conv_f_ptrs * all_arches = nullptr;
    static int n_arch = 0;
    if (!all_arches)
    {
        // slots 0 and 1 are reserved for "none" and "dflt" - filled in below
        n_arch = N_DEFAULT_ARCHES;
        // @TODO: runtime check if actual CPU supports specific architecture
#if defined(CONV_ARCH_GCC_AMD64)
        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+4] = {0};
        DPRINT("CONV_ARCH_GCC_AMD64: sse3, sse4, avx, avx2\n");
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, sse3)();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, sse4)();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx) ();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx2)();
#elif defined(CONV_ARCH_MSVC_AMD64)
        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+3] = {0};
        DPRINT("CONV_ARCH_MSVC_AMD64: sse2, avx, avx2\n");
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, sse2)();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx) ();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx2)();
#elif defined(CONV_ARCH_GCC_ARM32NEON)
        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+3] = {0};
        // message lists every variant that is actually registered below
        DPRINT("CONV_ARCH_GCC_ARM32NEON: neon_vfpv4, neon_rpi3_a53, neon_rpi4_a72\n");
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, neon_vfpv4)();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, neon_rpi3_a53)();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, neon_rpi4_a72)();
#elif defined(CONV_ARCH_GCC_AARCH64)
        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+1] = {0};
        DPRINT("CONV_ARCH_GCC_AARCH64: armv8a\n");
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, armv8a)();
#else
        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES] = {0};
        DPRINT("unknown CONV_ARCH: -\n");
#endif
        conv_arch_ptrs[0] = CONV_FN_ARCH(conv_ptrs, none)();
        conv_arch_ptrs[1] = CONV_FN_ARCH(conv_ptrs, dflt)();
        all_arches = conv_arch_ptrs;
    }
    if (p_num_arch)
        *p_num_arch = n_arch;
    return all_arches;
}
|
||||||
|
|
||||||
6
pffft/pf_conv_dispatcher.h
Normal file
6
pffft/pf_conv_dispatcher.h
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "pf_conv.h"
|
||||||
|
|
||||||
|
ptr_to_conv_f_ptrs * get_all_conv_arch_ptrs(int * p_num_arch);
|
||||||
|
|
||||||
44
pffft/pf_cplx.h
Normal file
44
pffft/pf_cplx.h
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
/*
|
||||||
|
This software is part of pffft/pfdsp, a set of simple DSP routines.
|
||||||
|
|
||||||
|
Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
* Neither the name of the copyright holder nor the
|
||||||
|
names of its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
|
||||||
|
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
/*
|
||||||
|
_____ _
|
||||||
|
/ ____| | |
|
||||||
|
| | ___ _ __ ___ _ __ | | _____ __
|
||||||
|
| | / _ \| '_ ` _ \| '_ \| |/ _ \ \/ /
|
||||||
|
| |___| (_) | | | | | | |_) | | __/> <
|
||||||
|
\_____\___/|_| |_| |_| .__/|_|\___/_/\_\
|
||||||
|
| |
|
||||||
|
|_|
|
||||||
|
*/
|
||||||
|
|
||||||
|
typedef struct complexf_s { float i; float q; } complexf;
|
||||||
|
|
||||||
1148
pffft/pf_mixer.cpp
Normal file
1148
pffft/pf_mixer.cpp
Normal file
File diff suppressed because it is too large
Load Diff
270
pffft/pf_mixer.h
Normal file
270
pffft/pf_mixer.h
Normal file
@@ -0,0 +1,270 @@
|
|||||||
|
/*
|
||||||
|
This software is part of pffft/pfdsp, a set of simple DSP routines.
|
||||||
|
|
||||||
|
Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
|
||||||
|
Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
* Neither the name of the copyright holder nor the
|
||||||
|
names of its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
|
||||||
|
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#include "pf_cplx.h"
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
// =================================================================================
|
||||||
|
|
||||||
|
int have_sse_shift_mixer_impl();
|
||||||
|
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
/**************/
|
||||||
|
/*** ALGO A ***/
|
||||||
|
/**************/
|
||||||
|
|
||||||
|
float shift_math_cc(const complexf *input, complexf* output, int input_size, float rate, float starting_phase);
|
||||||
|
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
/**************/
|
||||||
|
/*** ALGO B ***/
|
||||||
|
/**************/
|
||||||
|
|
||||||
|
typedef struct shift_table_data_s
|
||||||
|
{
|
||||||
|
float* table;
|
||||||
|
int table_size;
|
||||||
|
} shift_table_data_t;
|
||||||
|
|
||||||
|
void shift_table_deinit(shift_table_data_t table_data);
|
||||||
|
shift_table_data_t shift_table_init(int table_size);
|
||||||
|
float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase);
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
/**************/
|
||||||
|
/*** ALGO C ***/
|
||||||
|
/**************/
|
||||||
|
|
||||||
|
typedef struct shift_addfast_data_s
|
||||||
|
{
|
||||||
|
float dsin[4];
|
||||||
|
float dcos[4];
|
||||||
|
float phase_increment;
|
||||||
|
} shift_addfast_data_t;
|
||||||
|
|
||||||
|
shift_addfast_data_t shift_addfast_init(float rate);
|
||||||
|
float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase);
|
||||||
|
float shift_addfast_inp_c(complexf *in_out, int N_cplx, shift_addfast_data_t* d, float starting_phase);
|
||||||
|
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
/**************/
|
||||||
|
/*** ALGO D ***/
|
||||||
|
/**************/
|
||||||
|
|
||||||
|
typedef struct shift_unroll_data_s
|
||||||
|
{
|
||||||
|
float* dsin;
|
||||||
|
float* dcos;
|
||||||
|
float phase_increment;
|
||||||
|
int size;
|
||||||
|
} shift_unroll_data_t;
|
||||||
|
|
||||||
|
shift_unroll_data_t shift_unroll_init(float rate, int size);
|
||||||
|
void shift_unroll_deinit(shift_unroll_data_t* d);
|
||||||
|
float shift_unroll_cc(complexf *input, complexf* output, int size, shift_unroll_data_t* d, float starting_phase);
|
||||||
|
float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase);
|
||||||
|
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
/**************/
|
||||||
|
/*** ALGO E ***/
|
||||||
|
/**************/
|
||||||
|
|
||||||
|
/* similar to shift_unroll_cc() - but, have fixed and limited precalc size
|
||||||
|
* idea: smaller cache usage by table
|
||||||
|
* size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ (= 4)
|
||||||
|
*/
|
||||||
|
#define PF_SHIFT_LIMITED_UNROLL_SIZE 128
|
||||||
|
#define PF_SHIFT_LIMITED_SIMD_SZ 4
|
||||||
|
|
||||||
|
typedef struct shift_limited_unroll_data_s
|
||||||
|
{
|
||||||
|
float dcos[PF_SHIFT_LIMITED_UNROLL_SIZE];
|
||||||
|
float dsin[PF_SHIFT_LIMITED_UNROLL_SIZE];
|
||||||
|
complexf complex_phase;
|
||||||
|
float phase_increment;
|
||||||
|
} shift_limited_unroll_data_t;
|
||||||
|
|
||||||
|
shift_limited_unroll_data_t shift_limited_unroll_init(float rate);
|
||||||
|
/* size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ */
|
||||||
|
/* starting_phase for next call is kept internal in state */
|
||||||
|
void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d);
|
||||||
|
void shift_limited_unroll_inp_c(complexf* in_out, int size, shift_limited_unroll_data_t* d);
|
||||||
|
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
/**************/
|
||||||
|
/*** ALGO F ***/
|
||||||
|
/**************/
|
||||||
|
|
||||||
|
typedef struct shift_limited_unroll_A_sse_data_s
|
||||||
|
{
|
||||||
|
/* small/limited trig table */
|
||||||
|
float dcos[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
|
||||||
|
float dsin[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
|
||||||
|
/* 4 times complex phase */
|
||||||
|
float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
|
||||||
|
float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
|
||||||
|
/* N_cplx_per_block times increment - for future parallel variants */
|
||||||
|
float dcos_blk;
|
||||||
|
float dsin_blk;
|
||||||
|
/* */
|
||||||
|
float phase_increment;
|
||||||
|
} shift_limited_unroll_A_sse_data_t;
|
||||||
|
|
||||||
|
shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad);
|
||||||
|
void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d);
|
||||||
|
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
/**************/
|
||||||
|
/*** ALGO G ***/
|
||||||
|
/**************/
|
||||||
|
|
||||||
|
typedef struct shift_limited_unroll_B_sse_data_s
|
||||||
|
{
|
||||||
|
/* small/limited trig table */
|
||||||
|
float dtrig[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
|
||||||
|
/* 4 times complex phase */
|
||||||
|
float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
|
||||||
|
float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
|
||||||
|
/* N_cplx_per_block times increment - for future parallel variants */
|
||||||
|
float dcos_blk;
|
||||||
|
float dsin_blk;
|
||||||
|
/* */
|
||||||
|
float phase_increment;
|
||||||
|
} shift_limited_unroll_B_sse_data_t;
|
||||||
|
|
||||||
|
shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad);
|
||||||
|
void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d);
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
/**************/
|
||||||
|
/*** ALGO H ***/
|
||||||
|
/**************/
|
||||||
|
|
||||||
|
typedef struct shift_limited_unroll_C_sse_data_s
|
||||||
|
{
|
||||||
|
/* small/limited trig table - interleaved: 4 cos, 4 sin, 4 cos, .. */
|
||||||
|
float dinterl_trig[2*(PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ)];
|
||||||
|
/* 4 times complex phase */
|
||||||
|
float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
|
||||||
|
float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
|
||||||
|
/* N_cplx_per_block times increment - for future parallel variants */
|
||||||
|
float dcos_blk;
|
||||||
|
float dsin_blk;
|
||||||
|
/* */
|
||||||
|
float phase_increment;
|
||||||
|
} shift_limited_unroll_C_sse_data_t;
|
||||||
|
|
||||||
|
shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad);
|
||||||
|
void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
/**************/
|
||||||
|
/*** ALGO I ***/
|
||||||
|
/**************/
|
||||||
|
|
||||||
|
/* Recursive Quadrature Oscillator functions "recursive_osc"
|
||||||
|
* see https://www.vicanek.de/articles/QuadOsc.pdf
|
||||||
|
*/
|
||||||
|
#define PF_SHIFT_RECURSIVE_SIMD_SZ 8
|
||||||
|
typedef struct shift_recursive_osc_s
|
||||||
|
{
|
||||||
|
float u_cos[PF_SHIFT_RECURSIVE_SIMD_SZ];
|
||||||
|
float v_sin[PF_SHIFT_RECURSIVE_SIMD_SZ];
|
||||||
|
} shift_recursive_osc_t;
|
||||||
|
|
||||||
|
typedef struct shift_recursive_osc_conf_s
|
||||||
|
{
|
||||||
|
float k1;
|
||||||
|
float k2;
|
||||||
|
} shift_recursive_osc_conf_t;
|
||||||
|
|
||||||
|
void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state);
|
||||||
|
void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
|
||||||
|
|
||||||
|
/* size must be multiple of PF_SHIFT_RECURSIVE_SIMD_SZ */
|
||||||
|
/* starting_phase for next call is kept internal in state */
|
||||||
|
void shift_recursive_osc_cc(const complexf *input, complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
|
||||||
|
void shift_recursive_osc_inp_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
|
||||||
|
void gen_recursive_osc_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
/**************/
|
||||||
|
/*** ALGO J ***/
|
||||||
|
/**************/
|
||||||
|
|
||||||
|
#define PF_SHIFT_RECURSIVE_SIMD_SSE_SZ 4
|
||||||
|
typedef struct shift_recursive_osc_sse_s
|
||||||
|
{
|
||||||
|
float u_cos[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ];
|
||||||
|
float v_sin[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ];
|
||||||
|
} shift_recursive_osc_sse_t;
|
||||||
|
|
||||||
|
typedef struct shift_recursive_osc_sse_conf_s
|
||||||
|
{
|
||||||
|
float k1;
|
||||||
|
float k2;
|
||||||
|
} shift_recursive_osc_sse_conf_t;
|
||||||
|
|
||||||
|
void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state);
|
||||||
|
void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state);
|
||||||
|
void shift_recursive_osc_sse_inp_c(complexf* in_out, int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext);
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
264
pffft/pffastconv.c
Normal file
264
pffft/pffastconv.c
Normal file
@@ -0,0 +1,264 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de )
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "pffastconv.h"
|
||||||
|
#include "pffft.h"
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#define FASTCONV_DBG_OUT 0
|
||||||
|
|
||||||
|
|
||||||
|
/* detect compiler flavour */
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
# define RESTRICT __restrict
|
||||||
|
#pragma warning( disable : 4244 4305 4204 4456 )
|
||||||
|
#elif defined(__GNUC__)
|
||||||
|
# define RESTRICT __restrict
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
void *pffastconv_malloc(size_t nb_bytes)
|
||||||
|
{
|
||||||
|
return pffft_aligned_malloc(nb_bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
void pffastconv_free(void *p)
|
||||||
|
{
|
||||||
|
pffft_aligned_free(p);
|
||||||
|
}
|
||||||
|
|
||||||
|
int pffastconv_simd_size()
|
||||||
|
{
|
||||||
|
return pffft_simd_size();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
struct PFFASTCONV_Setup
|
||||||
|
{
|
||||||
|
float * Xt; /* input == x in time domain - copy for alignment */
|
||||||
|
float * Xf; /* input == X in freq domain */
|
||||||
|
float * Hf; /* filterCoeffs == H in freq domain */
|
||||||
|
float * Mf; /* input * filterCoeffs in freq domain */
|
||||||
|
PFFFT_Setup *st;
|
||||||
|
int filterLen; /* convolution length */
|
||||||
|
int Nfft; /* FFT/block length */
|
||||||
|
int flags;
|
||||||
|
float scale;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
PFFASTCONV_Setup * pffastconv_new_setup( const float * filterCoeffs, int filterLen, int * blockLen, int flags )
|
||||||
|
{
|
||||||
|
PFFASTCONV_Setup * s = NULL;
|
||||||
|
const int cplxFactor = ( (flags & PFFASTCONV_CPLX_INP_OUT) && (flags & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 2 : 1;
|
||||||
|
const int minFftLen = 2*pffft_simd_size()*pffft_simd_size();
|
||||||
|
int i, Nfft = 2 * pffft_next_power_of_two(filterLen -1);
|
||||||
|
#if FASTCONV_DBG_OUT
|
||||||
|
const int iOldBlkLen = *blockLen;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if ( Nfft < minFftLen )
|
||||||
|
Nfft = minFftLen;
|
||||||
|
|
||||||
|
if ( flags & PFFASTCONV_CPLX_FILTER )
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
s = pffastconv_malloc( sizeof(struct PFFASTCONV_Setup) );
|
||||||
|
|
||||||
|
if ( *blockLen > Nfft ) {
|
||||||
|
Nfft = *blockLen;
|
||||||
|
Nfft = pffft_next_power_of_two(Nfft);
|
||||||
|
}
|
||||||
|
*blockLen = Nfft; /* this is in (complex) samples */
|
||||||
|
|
||||||
|
Nfft *= cplxFactor;
|
||||||
|
|
||||||
|
if ( (flags & PFFASTCONV_DIRECT_INP) && !(flags & PFFASTCONV_CPLX_INP_OUT) )
|
||||||
|
s->Xt = NULL;
|
||||||
|
else
|
||||||
|
s->Xt = pffastconv_malloc((unsigned)Nfft * sizeof(float));
|
||||||
|
s->Xf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
|
||||||
|
s->Hf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
|
||||||
|
s->Mf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
|
||||||
|
s->st = pffft_new_setup(Nfft, PFFFT_REAL); /* with complex: we do 2 x fft() */
|
||||||
|
s->filterLen = filterLen; /* filterLen == convolution length == length of impulse response */
|
||||||
|
if ( cplxFactor == 2 )
|
||||||
|
s->filterLen = 2 * filterLen - 1;
|
||||||
|
s->Nfft = Nfft; /* FFT/block length */
|
||||||
|
s->flags = flags;
|
||||||
|
s->scale = (float)( 1.0 / Nfft );
|
||||||
|
|
||||||
|
memset( s->Xt, 0, (unsigned)Nfft * sizeof(float) );
|
||||||
|
if ( flags & PFFASTCONV_CORRELATION ) {
|
||||||
|
for ( i = 0; i < filterLen; ++i )
|
||||||
|
s->Xt[ ( Nfft - cplxFactor * i ) & (Nfft -1) ] = filterCoeffs[ i ];
|
||||||
|
} else {
|
||||||
|
for ( i = 0; i < filterLen; ++i )
|
||||||
|
s->Xt[ ( Nfft - cplxFactor * i ) & (Nfft -1) ] = filterCoeffs[ filterLen - 1 - i ];
|
||||||
|
}
|
||||||
|
|
||||||
|
pffft_transform(s->st, s->Xt, s->Hf, /* tmp = */ s->Mf, PFFFT_FORWARD);
|
||||||
|
|
||||||
|
#if FASTCONV_DBG_OUT
|
||||||
|
printf("\n fastConvSetup(filterLen = %d, blockLen %d) --> blockLen %d, OutLen = %d\n"
|
||||||
|
, filterLen, iOldBlkLen, *blockLen, Nfft - filterLen +1 );
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Release a setup and all buffers it owns; a NULL setup is ignored. */
void pffastconv_destroy_setup( PFFASTCONV_Setup * s )
{
  if ( s != NULL )
  {
    pffft_destroy_setup(s->st);
    pffastconv_free(s->Mf);
    pffastconv_free(s->Hf);
    pffastconv_free(s->Xf);
    /* Xt is optional: it is not allocated for every flag combination */
    if ( s->Xt != NULL )
      pffastconv_free(s->Xt);
    pffastconv_free(s);
  }
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Copy chunkLen floats into the setup's Xt buffer, zero-pad up to the FFT
 * length and forward-transform the result into Xf (Mf is the work buffer). */
static void pffastconv_fwd_buffered(PFFASTCONV_Setup * s, const float * chunk, int chunkLen)
{
  memcpy( s->Xt, chunk, (unsigned)chunkLen * sizeof(float) );
  if ( chunkLen < s->Nfft )
    memset( s->Xt + chunkLen, 0, (unsigned)(s->Nfft - chunkLen) * sizeof(float) );

  pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
}

/* Backward-transform Mf into Xf (Xt is the work buffer) and copy the first
 * outLen floats to out. */
static void pffastconv_bwd_buffered(PFFASTCONV_Setup * s, float * out, int outLen)
{
  pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
  memcpy( out, s->Xf, (unsigned)outLen * sizeof(float) );
}

/* Run the block-wise fast convolution described by setup s.
 *
 * input_:       input samples (interleaved re/im when PFFASTCONV_CPLX_INP_OUT is set).
 * cplxInputLen: number of input samples.
 * output_:      receives the produced samples.
 * applyFlush:   non-zero: keep going while at least filterLen samples remain;
 *               zero:     only process complete blocks of Nfft samples.
 *
 * Returns the number of input samples that were consumed
 * (offset reached by the block loop, divided by 2 in single-FFT complex mode).
 */
int pffastconv_apply(PFFASTCONV_Setup * s, const float *input_, int cplxInputLen, float *output_, int applyFlush)
{
  const float * RESTRICT src = input_;
  float * RESTRICT dst = output_;
  const int fftLen = s->Nfft;
  const int convLen = s->filterLen;
  const int opts = s->flags;
  const int cplxFactor = ( (opts & PFFASTCONV_CPLX_INP_OUT) && (opts & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 2 : 1;
  const int inputLen = cplxFactor * cplxInputLen;
  /* flush:    off < inputLen - filterLen + 1  <=>  at least filterLen samples left
   * no flush: off < inputLen - Nfft + 1       <=>  at least one full block left   */
  const int maxOff = inputLen - ( applyFlush ? convLen : fftLen ) + 1;
  int off = 0, blk, nOut = 0, k, comp, base;

  if ( cplxFactor == 2 )
  {
    /* interleaved complex data handled with one real FFT of doubled length */
    while ( off < maxOff )
    {
      blk = inputLen - off;
      if ( blk > fftLen )
        blk = fftLen;
      /* keep the output count even, so it always covers whole complex samples */
      nOut = ( blk - convLen + 1 ) & ( ~1 );
      if ( nOut == 0 )
        break;

      if ( opts & PFFASTCONV_DIRECT_INP )
        pffft_transform(s->st, src + off, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
      else
        pffastconv_fwd_buffered(s, src + off, blk);

      pffft_zconvolve_no_accu(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale);

      if ( opts & PFFASTCONV_DIRECT_OUT )
        pffft_transform(s->st, s->Mf, dst + off, s->Xf, PFFFT_BACKWARD);
      else
        pffastconv_bwd_buffered(s, dst + off, nOut);

      off += nOut;
    }
    return off / cplxFactor;
  }

  {
    const int isCplx = ( opts & PFFASTCONV_CPLX_INP_OUT ) ? 1 : 0;
    const int numParts = isCplx ? 2 : 1;

    while ( off < maxOff )
    {
      blk = inputLen - off;
      if ( blk > fftLen )
        blk = fftLen;
      nOut = blk - convLen + 1;

      for ( comp = 0; comp < numParts; ++comp )  /* iterate per real/imag component */
      {
        /* --- forward transform of the current block into Xf --- */
        if ( isCplx )
        {
          /* de-interleave one component (comp 0 / 1) into Xt */
          base = 2 * off + comp;
          for ( k = 0; k < blk; ++k )
            s->Xt[k] = src[base + 2 * k];
          if ( blk < fftLen )
            memset( s->Xt + blk, 0, (unsigned)(fftLen - blk) * sizeof(float) );

          pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
        }
        else if ( opts & PFFASTCONV_DIRECT_INP )
        {
          pffft_transform(s->st, src + off, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
        }
        else
        {
          pffastconv_fwd_buffered(s, src + off, blk);
        }

        /* --- multiply with the filter spectrum: Mf = Xf * Hf * scale --- */
        pffft_zconvolve_no_accu(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale);

        /* --- backward transform and delivery of nOut samples --- */
        if ( isCplx )
        {
          pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);

          /* re-interleave the component into the output */
          base = 2 * off + comp;
          for ( k = 0; k < nOut; ++k )
            dst[ base + 2 * k ] = s->Xf[k];
        }
        else if ( opts & PFFASTCONV_DIRECT_OUT )
        {
          pffft_transform(s->st, s->Mf, dst + off, s->Xf, PFFFT_BACKWARD);
        }
        else
        {
          pffastconv_bwd_buffered(s, dst + off, nOut);
        }
      }

      off += nOut;
    }
  }

  return off;
}
|
||||||
|
|
||||||
171
pffft/pffastconv.h
Normal file
171
pffft/pffastconv.h
Normal file
@@ -0,0 +1,171 @@
|
|||||||
|
/* Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de )
|
||||||
|
|
||||||
|
Redistribution and use of the Software in source and binary forms,
|
||||||
|
with or without modification, is permitted provided that the
|
||||||
|
following conditions are met:
|
||||||
|
|
||||||
|
- Neither the names of PFFFT, PFFASTCONV, nor the names of its
|
||||||
|
sponsors or contributors may be used to endorse or promote products
|
||||||
|
derived from this Software without specific prior written permission.
|
||||||
|
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notices, this list of conditions, and the disclaimer below.
|
||||||
|
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer below in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||||
|
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||||
|
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||||
|
SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
PFFASTCONV : a Pretty Fast Fast Convolution
|
||||||
|
|
||||||
|
This is basically the implementation of fast convolution,
|
||||||
|
utilizing the FFT (pffft).
|
||||||
|
|
||||||
|
Restrictions:
|
||||||
|
|
||||||
|
- 1D transforms only, with 32-bit single precision.
|
||||||
|
|
||||||
|
- all (float*) pointers in the functions below are expected to
|
||||||
|
have a "simd-compatible" alignment, that is 16 bytes on x86 and
|
||||||
|
powerpc CPUs.
|
||||||
|
|
||||||
|
You can allocate such buffers with the functions
|
||||||
|
pffft_aligned_malloc / pffft_aligned_free (or with stuff like
|
||||||
|
posix_memalign..)
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef PFFASTCONV_H
|
||||||
|
#define PFFASTCONV_H
|
||||||
|
|
||||||
|
#include <stddef.h> /* for size_t */
|
||||||
|
#include "pffft.h"
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* opaque struct holding internal stuff
|
||||||
|
this struct can't be shared by many threads as it contains
|
||||||
|
temporary data, computed within the convolution
|
||||||
|
*/
|
||||||
|
typedef struct PFFASTCONV_Setup PFFASTCONV_Setup;
|
||||||
|
|
||||||
|
typedef enum {
|
||||||
|
PFFASTCONV_CPLX_INP_OUT = 1,
|
||||||
|
/* set when input and output is complex,
|
||||||
|
* with real and imag part interleaved in both vectors.
|
||||||
|
* input[] has inputLen complex values: 2 * inputLen floats,
|
||||||
|
* output[] is also written with complex values.
|
||||||
|
* without this flag, the input is interpreted as real vector
|
||||||
|
*/
|
||||||
|
|
||||||
|
PFFASTCONV_CPLX_FILTER = 2,
|
||||||
|
/* set when filterCoeffs is complex,
|
||||||
|
* with real and imag part interleaved.
|
||||||
|
* filterCoeffs[] has filterLen complex values: 2 * filterLen floats
|
||||||
|
* without this flag, the filter is interpreted as real vector
|
||||||
|
* ATTENTION: this is not implemented yet!
|
||||||
|
*/
|
||||||
|
|
||||||
|
PFFASTCONV_DIRECT_INP = 4,
|
||||||
|
/* set PFFASTCONV_DIRECT_INP only, when following conditions are met:
|
||||||
|
* 1- input vector X must be aligned
|
||||||
|
* 2- (all) inputLen <= output blockLen
|
||||||
|
* 3- X must have minimum length of output BlockLen
|
||||||
|
* 4- the additional samples from inputLen .. BlockLen-1
|
||||||
|
* must contain valid small and non-NAN samples (ideally zero)
|
||||||
|
*
|
||||||
|
* this option is ignored when PFFASTCONV_CPLX_INP_OUT is set
|
||||||
|
*/
|
||||||
|
|
||||||
|
PFFASTCONV_DIRECT_OUT = 8,
|
||||||
|
/* set PFFASTCONV_DIRECT_OUT only when following conditions are met:
|
||||||
|
* 1- output vector Y must be aligned
|
||||||
|
* 2- (all) inputLen <= output blockLen
|
||||||
|
* 3- Y must have minimum length of output blockLen
|
||||||
|
*
|
||||||
|
* this option is ignored when PFFASTCONV_CPLX_INP_OUT is set
|
||||||
|
*/
|
||||||
|
|
||||||
|
PFFASTCONV_CPLX_SINGLE_FFT = 16,
|
||||||
|
/* hint to process complex data with one single FFT;
|
||||||
|
* default is to use 2 FFTs: one for real part, one for imag part
|
||||||
|
* */
|
||||||
|
|
||||||
|
|
||||||
|
PFFASTCONV_SYMMETRIC = 32,
|
||||||
|
/* purely informational: the filter is symmetric .. and filterLen is a multiple of 8 */
|
||||||
|
|
||||||
|
PFFASTCONV_CORRELATION = 64,
|
||||||
|
/* filterCoeffs[] of pffastconv_new_setup are for correlation;
|
||||||
|
* thus, do not flip them for the internal fft calculation
|
||||||
|
* - as necessary for the fast convolution */
|
||||||
|
|
||||||
|
} pffastconv_flags_t;
|
||||||
|
|
||||||
|
/*
|
||||||
|
prepare for performing fast convolution(s) of 'filterLen' with input 'blockLen'.
|
||||||
|
The output 'blockLen' might be bigger to allow the fast convolution.
|
||||||
|
|
||||||
|
'flags' are bitmask over the 'pffastconv_flags_t' enum.
|
||||||
|
|
||||||
|
PFFASTCONV_Setup structure can't be shared across multiple filters
|
||||||
|
or concurrent threads.
|
||||||
|
*/
|
||||||
|
PFFASTCONV_Setup * pffastconv_new_setup( const float * filterCoeffs, int filterLen, int * blockLen, int flags );
|
||||||
|
|
||||||
|
void pffastconv_destroy_setup(PFFASTCONV_Setup *);
|
||||||
|
|
||||||
|
/*
|
||||||
|
Perform the fast convolution.
|
||||||
|
|
||||||
|
'input' and 'output' don't need to be aligned - unless any of
|
||||||
|
PFFASTCONV_DIRECT_INP or PFFASTCONV_DIRECT_OUT is set in 'flags'.
|
||||||
|
|
||||||
|
inputLen > output 'blockLen' (from pffastconv_new_setup()) is allowed.
|
||||||
|
in this case, multiple FFTs are called internally, to process the
|
||||||
|
input[].
|
||||||
|
|
||||||
|
'output' vector must have size >= (inputLen - filterLen + 1)
|
||||||
|
|
||||||
|
set bool option 'applyFlush' to process the full input[].
|
||||||
|
with this option, 'tail samples' of input are also processed.
|
||||||
|
This might be inefficient, because the FFT is called to produce
|
||||||
|
few(er) output samples, than possible.
|
||||||
|
This option is useful to process the last samples of an input (file)
|
||||||
|
or to reduce latency.
|
||||||
|
|
||||||
|
return value is the number of produced samples in output[].
|
||||||
|
the same amount of samples is processed from input[]. to continue
|
||||||
|
processing, the caller must save/move the remaining samples of
|
||||||
|
input[].
|
||||||
|
|
||||||
|
*/
|
||||||
|
int pffastconv_apply(PFFASTCONV_Setup * s, const float *input, int inputLen, float *output, int applyFlush);
|
||||||
|
|
||||||
|
void *pffastconv_malloc(size_t nb_bytes);
|
||||||
|
void pffastconv_free(void *);
|
||||||
|
|
||||||
|
/* return 4 or 1 depending on whether support for SSE/Altivec instructions was enabled when building pffft.c */
|
||||||
|
int pffastconv_simd_size();
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* PFFASTCONV_H */
|
||||||
134
pffft/pffft.c
Normal file
134
pffft/pffft.c
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||||
|
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
|
||||||
|
|
||||||
|
Based on original fortran 77 code from FFTPACKv4 from NETLIB
|
||||||
|
(http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
|
||||||
|
of NCAR, in 1985.
|
||||||
|
|
||||||
|
As confirmed by the NCAR fftpack software curators, the following
|
||||||
|
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
|
||||||
|
released under the same terms.
|
||||||
|
|
||||||
|
FFTPACK license:
|
||||||
|
|
||||||
|
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
|
||||||
|
|
||||||
|
Copyright (c) 2004 the University Corporation for Atmospheric
|
||||||
|
Research ("UCAR"). All rights reserved. Developed by NCAR's
|
||||||
|
Computational and Information Systems Laboratory, UCAR,
|
||||||
|
www.cisl.ucar.edu.
|
||||||
|
|
||||||
|
Redistribution and use of the Software in source and binary forms,
|
||||||
|
with or without modification, is permitted provided that the
|
||||||
|
following conditions are met:
|
||||||
|
|
||||||
|
- Neither the names of NCAR's Computational and Information Systems
|
||||||
|
Laboratory, the University Corporation for Atmospheric Research,
|
||||||
|
nor the names of its sponsors or contributors may be used to
|
||||||
|
endorse or promote products derived from this Software without
|
||||||
|
specific prior written permission.
|
||||||
|
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notices, this list of conditions, and the disclaimer below.
|
||||||
|
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer below in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||||
|
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||||
|
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||||
|
SOFTWARE.
|
||||||
|
|
||||||
|
|
||||||
|
PFFFT : a Pretty Fast FFT.
|
||||||
|
|
||||||
|
This file is largely based on the original FFTPACK implementation, modified in
|
||||||
|
order to take advantage of SIMD instructions of modern CPUs.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
ChangeLog:
|
||||||
|
- 2011/10/02, version 1: This is the very first release of this file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "pffft.h"
|
||||||
|
|
||||||
|
/* detect compiler flavour */
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
# define COMPILER_MSVC
|
||||||
|
#elif defined(__GNUC__)
|
||||||
|
# define COMPILER_GCC
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <assert.h>
|
||||||
|
|
||||||
|
#if defined(COMPILER_GCC)
|
||||||
|
# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
|
||||||
|
# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
|
||||||
|
# define RESTRICT __restrict
|
||||||
|
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
|
||||||
|
#elif defined(COMPILER_MSVC)
|
||||||
|
# define ALWAYS_INLINE(return_type) __forceinline return_type
|
||||||
|
# define NEVER_INLINE(return_type) __declspec(noinline) return_type
|
||||||
|
# define RESTRICT __restrict
|
||||||
|
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef COMPILER_MSVC
|
||||||
|
#pragma warning( disable : 4244 4305 4204 4456 )
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
vector support macros: the rest of the code is independent of
|
||||||
|
SSE/Altivec/NEON -- adding support for other platforms with 4-element
|
||||||
|
vectors should be limited to these macros
|
||||||
|
*/
|
||||||
|
#include "simd/pf_float.h"
|
||||||
|
|
||||||
|
/* have code comparable with this definition */
|
||||||
|
#define SETUP_STRUCT PFFFT_Setup
|
||||||
|
#define FUNC_NEW_SETUP pffft_new_setup
|
||||||
|
#define FUNC_DESTROY pffft_destroy_setup
|
||||||
|
#define FUNC_TRANSFORM_UNORDRD pffft_transform
|
||||||
|
#define FUNC_TRANSFORM_ORDERED pffft_transform_ordered
|
||||||
|
#define FUNC_ZREORDER pffft_zreorder
|
||||||
|
#define FUNC_ZCONVOLVE_ACCUMULATE pffft_zconvolve_accumulate
|
||||||
|
#define FUNC_ZCONVOLVE_NO_ACCU pffft_zconvolve_no_accu
|
||||||
|
|
||||||
|
#define FUNC_ALIGNED_MALLOC pffft_aligned_malloc
|
||||||
|
#define FUNC_ALIGNED_FREE pffft_aligned_free
|
||||||
|
#define FUNC_SIMD_SIZE pffft_simd_size
|
||||||
|
#define FUNC_MIN_FFT_SIZE pffft_min_fft_size
|
||||||
|
#define FUNC_IS_VALID_SIZE pffft_is_valid_size
|
||||||
|
#define FUNC_NEAREST_SIZE pffft_nearest_transform_size
|
||||||
|
#define FUNC_SIMD_ARCH pffft_simd_arch
|
||||||
|
#define FUNC_VALIDATE_SIMD_A validate_pffft_simd
|
||||||
|
#define FUNC_VALIDATE_SIMD_EX validate_pffft_simd_ex
|
||||||
|
|
||||||
|
#define FUNC_CPLX_FINALIZE pffft_cplx_finalize
|
||||||
|
#define FUNC_CPLX_PREPROCESS pffft_cplx_preprocess
|
||||||
|
#define FUNC_REAL_PREPROCESS_4X4 pffft_real_preprocess_4x4
|
||||||
|
#define FUNC_REAL_PREPROCESS pffft_real_preprocess
|
||||||
|
#define FUNC_REAL_FINALIZE_4X4 pffft_real_finalize_4x4
|
||||||
|
#define FUNC_REAL_FINALIZE pffft_real_finalize
|
||||||
|
#define FUNC_TRANSFORM_INTERNAL pffft_transform_internal
|
||||||
|
|
||||||
|
#define FUNC_COS cosf
|
||||||
|
#define FUNC_SIN sinf
|
||||||
|
|
||||||
|
|
||||||
|
#include "pffft_priv_impl.h"
|
||||||
|
|
||||||
|
|
||||||
241
pffft/pffft.h
Normal file
241
pffft/pffft.h
Normal file
@@ -0,0 +1,241 @@
|
|||||||
|
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||||
|
|
||||||
|
Based on original fortran 77 code from FFTPACKv4 from NETLIB,
|
||||||
|
authored by Dr Paul Swarztrauber of NCAR, in 1985.
|
||||||
|
|
||||||
|
As confirmed by the NCAR fftpack software curators, the following
|
||||||
|
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
|
||||||
|
released under the same terms.
|
||||||
|
|
||||||
|
FFTPACK license:
|
||||||
|
|
||||||
|
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
|
||||||
|
|
||||||
|
Copyright (c) 2004 the University Corporation for Atmospheric
|
||||||
|
Research ("UCAR"). All rights reserved. Developed by NCAR's
|
||||||
|
Computational and Information Systems Laboratory, UCAR,
|
||||||
|
www.cisl.ucar.edu.
|
||||||
|
|
||||||
|
Redistribution and use of the Software in source and binary forms,
|
||||||
|
with or without modification, is permitted provided that the
|
||||||
|
following conditions are met:
|
||||||
|
|
||||||
|
- Neither the names of NCAR's Computational and Information Systems
|
||||||
|
Laboratory, the University Corporation for Atmospheric Research,
|
||||||
|
nor the names of its sponsors or contributors may be used to
|
||||||
|
endorse or promote products derived from this Software without
|
||||||
|
specific prior written permission.
|
||||||
|
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notices, this list of conditions, and the disclaimer below.
|
||||||
|
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer below in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||||
|
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||||
|
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||||
|
SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
PFFFT : a Pretty Fast FFT.
|
||||||
|
|
||||||
|
This is basically an adaptation of the single precision fftpack
|
||||||
|
(v4) as found on netlib taking advantage of SIMD instruction found
|
||||||
|
on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
|
||||||
|
|
||||||
|
For architectures where no SIMD instruction is available, the code
|
||||||
|
falls back to a scalar version.
|
||||||
|
|
||||||
|
Restrictions:
|
||||||
|
|
||||||
|
- 1D transforms only, with 32-bit single precision.
|
||||||
|
|
||||||
|
- supports only transforms for inputs of length N of the form
|
||||||
|
N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
|
||||||
|
144, 160, etc are all acceptable lengths). Performance is best for
|
||||||
|
128<=N<=8192.
|
||||||
|
|
||||||
|
- all (float*) pointers in the functions below are expected to
|
||||||
|
have an "simd-compatible" alignment, that is 16 bytes on x86 and
|
||||||
|
powerpc CPUs.
|
||||||
|
|
||||||
|
You can allocate such buffers with the functions
|
||||||
|
pffft_aligned_malloc / pffft_aligned_free (or with stuff like
|
||||||
|
posix_memalign..)
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef PFFFT_H
|
||||||
|
#define PFFFT_H
|
||||||
|
|
||||||
|
#include <stddef.h> /* for size_t */
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* opaque struct holding internal stuff (precomputed twiddle factors)
|
||||||
|
this struct can be shared by many threads as it contains only
|
||||||
|
read-only data.
|
||||||
|
*/
|
||||||
|
typedef struct PFFFT_Setup PFFFT_Setup;
|
||||||
|
|
||||||
|
#ifndef PFFFT_COMMON_ENUMS
|
||||||
|
#define PFFFT_COMMON_ENUMS
|
||||||
|
|
||||||
|
/* direction of the transform */
|
||||||
|
typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
|
||||||
|
|
||||||
|
/* type of transform */
|
||||||
|
typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
prepare for performing transforms of size N -- the returned
|
||||||
|
PFFFT_Setup structure is read-only so it can safely be shared by
|
||||||
|
multiple concurrent threads.
|
||||||
|
*/
|
||||||
|
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
|
||||||
|
void pffft_destroy_setup(PFFFT_Setup *);
|
||||||
|
/*
|
||||||
|
Perform a Fourier transform , The z-domain data is stored in the
|
||||||
|
most efficient order for transforming it back, or using it for
|
||||||
|
convolution. If you need to have its content sorted in the
|
||||||
|
"usual" way, that is as an array of interleaved complex numbers,
|
||||||
|
either use pffft_transform_ordered , or call pffft_zreorder after
|
||||||
|
the forward fft, and before the backward fft.
|
||||||
|
|
||||||
|
Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
|
||||||
|
Typically you will want to scale the backward transform by 1/N.
|
||||||
|
|
||||||
|
The 'work' pointer should point to an area of N (2*N for complex
|
||||||
|
fft) floats, properly aligned. If 'work' is NULL, then stack will
|
||||||
|
be used instead (this is probably the best strategy for small
|
||||||
|
FFTs, say for N < 16384). Threads usually have such a small stack that
|
||||||
|
there's no sufficient amount of memory, usually leading to a crash!
|
||||||
|
Use the heap with pffft_aligned_malloc() in this case.
|
||||||
|
|
||||||
|
For a real forward transform (PFFFT_REAL | PFFFT_FORWARD) with real
|
||||||
|
input with input(=transformation) length N, the output array is
|
||||||
|
'mostly' complex:
|
||||||
|
index k in 1 .. N/2 -1 corresponds to frequency k * Samplerate / N
|
||||||
|
index k == 0 is a special case:
|
||||||
|
the real() part contains the result for the DC frequency 0,
|
||||||
|
the imag() part contains the result for the Nyquist frequency Samplerate/2
|
||||||
|
both 0-frequency and half frequency components, which are real,
|
||||||
|
are assembled in the first entry as F(0)+i*F(N/2).
|
||||||
|
With the output size N/2 complex values (=N real/imag values), it is
|
||||||
|
obvious, that the result for negative frequencies are not output,
|
||||||
|
because of symmetry.
|
||||||
|
|
||||||
|
input and output may alias.
|
||||||
|
*/
|
||||||
|
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
|
||||||
|
|
||||||
|
/*
|
||||||
|
Similar to pffft_transform, but makes sure that the output is
|
||||||
|
ordered as expected (interleaved complex numbers). This is
|
||||||
|
similar to calling pffft_transform and then pffft_zreorder.
|
||||||
|
|
||||||
|
input and output may alias.
|
||||||
|
*/
|
||||||
|
void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
|
||||||
|
|
||||||
|
/*
|
||||||
|
call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
|
||||||
|
PFFFT_FORWARD) if you want to have the frequency components in
|
||||||
|
the correct "canonical" order, as interleaved complex numbers.
|
||||||
|
|
||||||
|
(for real transforms, both 0-frequency and half frequency
|
||||||
|
components, which are real, are assembled in the first entry as
|
||||||
|
F(0)+i*F(n/2+1). Note that the original fftpack did place
|
||||||
|
F(n/2+1) at the end of the arrays).
|
||||||
|
|
||||||
|
input and output should not alias.
|
||||||
|
*/
|
||||||
|
void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
|
||||||
|
|
||||||
|
/*
|
||||||
|
Perform a multiplication of the frequency components of dft_a and
|
||||||
|
dft_b and accumulate them into dft_ab. The arrays should have
|
||||||
|
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
|
||||||
|
*not* have been reordered with pffft_zreorder (otherwise just
|
||||||
|
perform the operation yourself as the dft coefs are stored as
|
||||||
|
interleaved complex numbers).
|
||||||
|
|
||||||
|
the operation performed is: dft_ab += (dft_a * dft_b)*scaling
|
||||||
|
|
||||||
|
The dft_a, dft_b and dft_ab pointers may alias.
|
||||||
|
*/
|
||||||
|
void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
|
||||||
|
|
||||||
|
/*
|
||||||
|
Perform a multiplication of the frequency components of dft_a and
|
||||||
|
dft_b and put result in dft_ab. The arrays should have
|
||||||
|
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
|
||||||
|
*not* have been reordered with pffft_zreorder (otherwise just
|
||||||
|
perform the operation yourself as the dft coefs are stored as
|
||||||
|
interleaved complex numbers).
|
||||||
|
|
||||||
|
the operation performed is: dft_ab = (dft_a * dft_b)*scaling
|
||||||
|
|
||||||
|
The dft_a, dft_b and dft_ab pointers may alias.
|
||||||
|
*/
|
||||||
|
void pffft_zconvolve_no_accu(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
|
||||||
|
|
||||||
|
/* return 4 or 1 depending on whether support for SSE/NEON/Altivec instructions was enabled when building pffft.c */
|
||||||
|
int pffft_simd_size();
|
||||||
|
|
||||||
|
/* return string identifier of used architecture (SSE/NEON/Altivec/..) */
|
||||||
|
const char * pffft_simd_arch();
|
||||||
|
|
||||||
|
|
||||||
|
/* following functions are identical to the pffftd_ functions */
|
||||||
|
|
||||||
|
/* simple helper to get minimum possible fft size */
|
||||||
|
int pffft_min_fft_size(pffft_transform_t transform);
|
||||||
|
|
||||||
|
/* simple helper to determine next power of 2
|
||||||
|
- without inexact/rounding floating point operations
|
||||||
|
*/
|
||||||
|
int pffft_next_power_of_two(int N);
|
||||||
|
|
||||||
|
/* simple helper to determine if power of 2 - returns bool */
|
||||||
|
int pffft_is_power_of_two(int N);
|
||||||
|
|
||||||
|
/* simple helper to determine size N is valid
|
||||||
|
- factorizable to pffft_min_fft_size() with factors 2, 3, 5
|
||||||
|
returns bool
|
||||||
|
*/
|
||||||
|
int pffft_is_valid_size(int N, pffft_transform_t cplx);
|
||||||
|
|
||||||
|
/* determine nearest valid transform size (by brute-force testing)
|
||||||
|
- factorizable to pffft_min_fft_size() with factors 2, 3, 5.
|
||||||
|
higher: bool-flag to find nearest higher value; else lower.
|
||||||
|
*/
|
||||||
|
int pffft_nearest_transform_size(int N, pffft_transform_t cplx, int higher);
|
||||||
|
|
||||||
|
/*
|
||||||
|
the float buffers must have the correct alignment (16-byte boundary
|
||||||
|
on intel and powerpc). This function may be used to obtain such
|
||||||
|
correctly aligned buffers.
|
||||||
|
*/
|
||||||
|
void *pffft_aligned_malloc(size_t nb_bytes);
|
||||||
|
void pffft_aligned_free(void *);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* PFFFT_H */
|
||||||
|
|
||||||
1060
pffft/pffft.hpp
Normal file
1060
pffft/pffft.hpp
Normal file
File diff suppressed because it is too large
Load Diff
53
pffft/pffft_common.c
Normal file
53
pffft/pffft_common.c
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
|
||||||
|
#include "pffft.h"
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
/* SSE and co like 16-bytes aligned pointers
 * with a 64-byte alignment, we are even aligned on L2 cache lines... */
#define MALLOC_V4SF_ALIGNMENT 64

/* Allocate nb_bytes of storage whose address is a multiple of
 * MALLOC_V4SF_ALIGNMENT.
 *
 * The request is padded by MALLOC_V4SF_ALIGNMENT bytes. The pointer obtained
 * from malloc() is stored in the pointer-sized slot directly below the
 * aligned address that is returned, which is where Valigned_free() reads it
 * back from. The result must therefore never be passed to free() directly.
 *
 * Returns a null pointer when the padded size would overflow size_t or when
 * malloc() itself fails.
 */
static void * Valigned_malloc(size_t nb_bytes) {
  void *p, *p0;

  /* Without this guard nb_bytes + MALLOC_V4SF_ALIGNMENT wraps around for
   * huge requests and malloc() would hand back a far too small buffer. */
  if (nb_bytes > (size_t) -1 - MALLOC_V4SF_ALIGNMENT) return (void *) 0;

  p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
  if (!p0) return (void *) 0;

  /* Round (p0 + ALIGNMENT) down to the alignment boundary: the result lies
   * strictly above p0 and at most ALIGNMENT bytes past it. Relies on
   * malloc() returning storage that is at least pointer-aligned, so the
   * slot below p still belongs to the allocation. */
  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));

  /* remember the original allocation for Valigned_free() */
  *((void **) p - 1) = p0;
  return p;
}
|
||||||
|
|
||||||
|
/* Release a buffer obtained from Valigned_malloc().
 * The address originally returned by malloc() is read from the pointer-sized
 * slot directly below p and handed to free(). A null p is ignored. */
static void Valigned_free(void *p) {
  if (!p) return;
  free(((void **) p)[-1]);
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Round N up to the next power of two (N itself when it already is one),
 * using integer operations only.
 * Bit-smearing technique from
 * https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
 * written for a 32-bit value. An input of 0 yields 0. */
static int next_power_of_two(int N) {
  unsigned bits = N;
  unsigned shift;

  bits--;
  /* copy the highest set bit into every lower position: shifts 1, 2, 4, 8, 16 */
  for (shift = 1; shift <= 16; shift <<= 1)
    bits |= bits >> shift;
  bits++;

  return bits;
}
|
||||||
|
|
||||||
|
/* Return 1 when N is a power of two, 0 otherwise (0 is not a power of two).
 * See https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2 */
static int is_power_of_two(int N) {
  if (N == 0)
    return 0;
  /* a power of two has exactly one bit set, so clearing the lowest set bit leaves 0 */
  return (N & (N - 1)) == 0;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* Public pffft_ entry points: thin forwards to the file-local helpers. */

/* allocate an aligned buffer; release it with pffft_aligned_free() */
void *pffft_aligned_malloc(size_t nb_bytes) {
  return Valigned_malloc(nb_bytes);
}

/* release a buffer obtained from pffft_aligned_malloc() */
void pffft_aligned_free(void *p) {
  Valigned_free(p);
}

/* next power of two >= N, computed without floating point */
int pffft_next_power_of_two(int N) {
  return next_power_of_two(N);
}

/* non-zero when N is a power of two */
int pffft_is_power_of_two(int N) {
  return is_power_of_two(N);
}
|
||||||
|
|
||||||
|
/* pffftd_-prefixed entry points: they forward to the same file-local helpers
 * as their pffft_ counterparts. */

/* allocate an aligned buffer; release it with pffftd_aligned_free() */
void *pffftd_aligned_malloc(size_t nb_bytes) {
  return Valigned_malloc(nb_bytes);
}

/* release a buffer obtained from pffftd_aligned_malloc() */
void pffftd_aligned_free(void *p) {
  Valigned_free(p);
}

/* next power of two >= N, computed without floating point */
int pffftd_next_power_of_two(int N) {
  return next_power_of_two(N);
}

/* non-zero when N is a power of two */
int pffftd_is_power_of_two(int N) {
  return is_power_of_two(N);
}
|
||||||
147
pffft/pffft_double.c
Normal file
147
pffft/pffft_double.c
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||||
|
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
|
||||||
|
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
|
||||||
|
|
||||||
|
Based on original fortran 77 code from FFTPACKv4 from NETLIB
|
||||||
|
(http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
|
||||||
|
of NCAR, in 1985.
|
||||||
|
|
||||||
|
As confirmed by the NCAR fftpack software curators, the following
|
||||||
|
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
|
||||||
|
released under the same terms.
|
||||||
|
|
||||||
|
FFTPACK license:
|
||||||
|
|
||||||
|
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
|
||||||
|
|
||||||
|
Copyright (c) 2004 the University Corporation for Atmospheric
|
||||||
|
Research ("UCAR"). All rights reserved. Developed by NCAR's
|
||||||
|
Computational and Information Systems Laboratory, UCAR,
|
||||||
|
www.cisl.ucar.edu.
|
||||||
|
|
||||||
|
Redistribution and use of the Software in source and binary forms,
|
||||||
|
with or without modification, is permitted provided that the
|
||||||
|
following conditions are met:
|
||||||
|
|
||||||
|
- Neither the names of NCAR's Computational and Information Systems
|
||||||
|
Laboratory, the University Corporation for Atmospheric Research,
|
||||||
|
nor the names of its sponsors or contributors may be used to
|
||||||
|
endorse or promote products derived from this Software without
|
||||||
|
specific prior written permission.
|
||||||
|
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notices, this list of conditions, and the disclaimer below.
|
||||||
|
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer below in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||||
|
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||||
|
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||||
|
SOFTWARE.
|
||||||
|
|
||||||
|
|
||||||
|
PFFFT : a Pretty Fast FFT.
|
||||||
|
|
||||||
|
This file is largely based on the original FFTPACK implementation, modified in
|
||||||
|
order to take advantage of SIMD instructions of modern CPUs.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
NOTE: This file is adapted from Julien Pommier's original PFFFT,
|
||||||
|
which works on 32 bit floating point precision using SSE instructions,
|
||||||
|
to work with 64 bit floating point precision using AVX instructions.
|
||||||
|
Author: Dario Mambro @ https://github.com/unevens/pffft
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "pffft_double.h"
|
||||||
|
|
||||||
|
/* detect compiler flavour */
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
# define COMPILER_MSVC
|
||||||
|
#elif defined(__GNUC__)
|
||||||
|
# define COMPILER_GCC
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef COMPILER_MSVC
|
||||||
|
# define _USE_MATH_DEFINES
|
||||||
|
# include <malloc.h>
|
||||||
|
#elif defined(__MINGW32__) || defined(__MINGW64__)
|
||||||
|
# include <malloc.h>
|
||||||
|
#else
|
||||||
|
# include <alloca.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <assert.h>
|
||||||
|
|
||||||
|
#if defined(COMPILER_GCC)
|
||||||
|
# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
|
||||||
|
# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
|
||||||
|
# define RESTRICT __restrict
|
||||||
|
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
|
||||||
|
#elif defined(COMPILER_MSVC)
|
||||||
|
# define ALWAYS_INLINE(return_type) __forceinline return_type
|
||||||
|
# define NEVER_INLINE(return_type) __declspec(noinline) return_type
|
||||||
|
# define RESTRICT __restrict
|
||||||
|
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef COMPILER_MSVC
|
||||||
|
#pragma warning( disable : 4244 4305 4204 4456 )
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
vector support macros: the rest of the code is independent of
|
||||||
|
AVX -- adding support for other platforms with 4-element
|
||||||
|
vectors should be limited to these macros
|
||||||
|
*/
|
||||||
|
#include "simd/pf_double.h"
|
||||||
|
|
||||||
|
/* have code comparable with this definition */
|
||||||
|
#define float double
|
||||||
|
#define SETUP_STRUCT PFFFTD_Setup
|
||||||
|
#define FUNC_NEW_SETUP pffftd_new_setup
|
||||||
|
#define FUNC_DESTROY pffftd_destroy_setup
|
||||||
|
#define FUNC_TRANSFORM_UNORDRD pffftd_transform
|
||||||
|
#define FUNC_TRANSFORM_ORDERED pffftd_transform_ordered
|
||||||
|
#define FUNC_ZREORDER pffftd_zreorder
|
||||||
|
#define FUNC_ZCONVOLVE_ACCUMULATE pffftd_zconvolve_accumulate
|
||||||
|
#define FUNC_ZCONVOLVE_NO_ACCU pffftd_zconvolve_no_accu
|
||||||
|
|
||||||
|
#define FUNC_ALIGNED_MALLOC pffftd_aligned_malloc
|
||||||
|
#define FUNC_ALIGNED_FREE pffftd_aligned_free
|
||||||
|
#define FUNC_SIMD_SIZE pffftd_simd_size
|
||||||
|
#define FUNC_MIN_FFT_SIZE pffftd_min_fft_size
|
||||||
|
#define FUNC_IS_VALID_SIZE pffftd_is_valid_size
|
||||||
|
#define FUNC_NEAREST_SIZE pffftd_nearest_transform_size
|
||||||
|
#define FUNC_SIMD_ARCH pffftd_simd_arch
|
||||||
|
#define FUNC_VALIDATE_SIMD_A validate_pffftd_simd
|
||||||
|
#define FUNC_VALIDATE_SIMD_EX validate_pffftd_simd_ex
|
||||||
|
|
||||||
|
#define FUNC_CPLX_FINALIZE pffftd_cplx_finalize
|
||||||
|
#define FUNC_CPLX_PREPROCESS pffftd_cplx_preprocess
|
||||||
|
#define FUNC_REAL_PREPROCESS_4X4 pffftd_real_preprocess_4x4
|
||||||
|
#define FUNC_REAL_PREPROCESS pffftd_real_preprocess
|
||||||
|
#define FUNC_REAL_FINALIZE_4X4 pffftd_real_finalize_4x4
|
||||||
|
#define FUNC_REAL_FINALIZE pffftd_real_finalize
|
||||||
|
#define FUNC_TRANSFORM_INTERNAL pffftd_transform_internal
|
||||||
|
|
||||||
|
#define FUNC_COS cos
|
||||||
|
#define FUNC_SIN sin
|
||||||
|
|
||||||
|
|
||||||
|
#include "pffft_priv_impl.h"
|
||||||
|
|
||||||
|
|
||||||
236
pffft/pffft_double.h
Normal file
236
pffft/pffft_double.h
Normal file
@@ -0,0 +1,236 @@
|
|||||||
|
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||||
|
|
||||||
|
Based on original fortran 77 code from FFTPACKv4 from NETLIB,
|
||||||
|
authored by Dr Paul Swarztrauber of NCAR, in 1985.
|
||||||
|
|
||||||
|
As confirmed by the NCAR fftpack software curators, the following
|
||||||
|
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
|
||||||
|
released under the same terms.
|
||||||
|
|
||||||
|
FFTPACK license:
|
||||||
|
|
||||||
|
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
|
||||||
|
|
||||||
|
Copyright (c) 2004 the University Corporation for Atmospheric
|
||||||
|
Research ("UCAR"). All rights reserved. Developed by NCAR's
|
||||||
|
Computational and Information Systems Laboratory, UCAR,
|
||||||
|
www.cisl.ucar.edu.
|
||||||
|
|
||||||
|
Redistribution and use of the Software in source and binary forms,
|
||||||
|
with or without modification, is permitted provided that the
|
||||||
|
following conditions are met:
|
||||||
|
|
||||||
|
- Neither the names of NCAR's Computational and Information Systems
|
||||||
|
Laboratory, the University Corporation for Atmospheric Research,
|
||||||
|
nor the names of its sponsors or contributors may be used to
|
||||||
|
endorse or promote products derived from this Software without
|
||||||
|
specific prior written permission.
|
||||||
|
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notices, this list of conditions, and the disclaimer below.
|
||||||
|
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer below in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||||
|
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||||
|
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||||
|
SOFTWARE.
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
NOTE: This file is adapted from Julien Pommier's original PFFFT,
|
||||||
|
which works on 32 bit floating point precision using SSE instructions,
|
||||||
|
to work with 64 bit floating point precision using AVX instructions.
|
||||||
|
Author: Dario Mambro @ https://github.com/unevens/pffft
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
PFFFT : a Pretty Fast FFT.
|
||||||
|
|
||||||
|
This is basically an adaptation of the single precision fftpack
|
||||||
|
(v4) as found on netlib taking advantage of SIMD instruction found
|
||||||
|
on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
|
||||||
|
|
||||||
|
For architectures where no SIMD instruction is available, the code
|
||||||
|
falls back to a scalar version.
|
||||||
|
|
||||||
|
Restrictions:
|
||||||
|
|
||||||
|
- 1D transforms only, with 64-bit double precision.
|
||||||
|
|
||||||
|
- supports only transforms for inputs of length N of the form
|
||||||
|
N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
|
||||||
|
144, 160, etc are all acceptable lengths). Performance is best for
|
||||||
|
128<=N<=8192.
|
||||||
|
|
||||||
|
- all (double*) pointers in the functions below are expected to
|
||||||
|
have a "simd-compatible" alignment, that is 32 bytes on x86 and
|
||||||
|
powerpc CPUs.
|
||||||
|
|
||||||
|
You can allocate such buffers with the functions
|
||||||
|
pffft_aligned_malloc / pffft_aligned_free (or with stuff like
|
||||||
|
posix_memalign..)
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef PFFFT_DOUBLE_H
|
||||||
|
#define PFFFT_DOUBLE_H
|
||||||
|
|
||||||
|
#include <stddef.h> /* for size_t */
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* opaque struct holding internal stuff (precomputed twiddle factors)
|
||||||
|
this struct can be shared by many threads as it contains only
|
||||||
|
read-only data.
|
||||||
|
*/
|
||||||
|
typedef struct PFFFTD_Setup PFFFTD_Setup;
|
||||||
|
|
||||||
|
#ifndef PFFFT_COMMON_ENUMS
|
||||||
|
#define PFFFT_COMMON_ENUMS
|
||||||
|
|
||||||
|
/* direction of the transform */
|
||||||
|
typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
|
||||||
|
|
||||||
|
/* type of transform */
|
||||||
|
typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
prepare for performing transforms of size N -- the returned
|
||||||
|
PFFFTD_Setup structure is read-only so it can safely be shared by
|
||||||
|
multiple concurrent threads.
|
||||||
|
*/
|
||||||
|
PFFFTD_Setup *pffftd_new_setup(int N, pffft_transform_t transform);
|
||||||
|
void pffftd_destroy_setup(PFFFTD_Setup *);
|
||||||
|
/*
|
||||||
|
Perform a Fourier transform , The z-domain data is stored in the
|
||||||
|
most efficient order for transforming it back, or using it for
|
||||||
|
convolution. If you need to have its content sorted in the
|
||||||
|
"usual" way, that is as an array of interleaved complex numbers,
|
||||||
|
either use pffft_transform_ordered , or call pffft_zreorder after
|
||||||
|
the forward fft, and before the backward fft.
|
||||||
|
|
||||||
|
Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
|
||||||
|
Typically you will want to scale the backward transform by 1/N.
|
||||||
|
|
||||||
|
The 'work' pointer should point to an area of N (2*N for complex
|
||||||
|
fft) doubles, properly aligned. If 'work' is NULL, then stack will
|
||||||
|
be used instead (this is probably the best strategy for small
|
||||||
|
FFTs, say for N < 16384). Threads usually have a small stack, that
|
||||||
|
there's no sufficient amount of memory, usually leading to a crash!
|
||||||
|
Use the heap with pffft_aligned_malloc() in this case.
|
||||||
|
|
||||||
|
input and output may alias.
|
||||||
|
*/
|
||||||
|
void pffftd_transform(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction);
|
||||||
|
|
||||||
|
/*
|
||||||
|
Similar to pffft_transform, but makes sure that the output is
|
||||||
|
ordered as expected (interleaved complex numbers). This is
|
||||||
|
similar to calling pffft_transform and then pffft_zreorder.
|
||||||
|
|
||||||
|
input and output may alias.
|
||||||
|
*/
|
||||||
|
void pffftd_transform_ordered(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction);
|
||||||
|
|
||||||
|
/*
|
||||||
|
call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
|
||||||
|
PFFFT_FORWARD) if you want to have the frequency components in
|
||||||
|
the correct "canonical" order, as interleaved complex numbers.
|
||||||
|
|
||||||
|
(for real transforms, both 0-frequency and half frequency
|
||||||
|
components, which are real, are assembled in the first entry as
|
||||||
|
F(0)+i*F(n/2+1). Note that the original fftpack did place
|
||||||
|
F(n/2+1) at the end of the arrays).
|
||||||
|
|
||||||
|
input and output should not alias.
|
||||||
|
*/
|
||||||
|
void pffftd_zreorder(PFFFTD_Setup *setup, const double *input, double *output, pffft_direction_t direction);
|
||||||
|
|
||||||
|
/*
|
||||||
|
Perform a multiplication of the frequency components of dft_a and
|
||||||
|
dft_b and accumulate them into dft_ab. The arrays should have
|
||||||
|
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
|
||||||
|
*not* have been reordered with pffft_zreorder (otherwise just
|
||||||
|
perform the operation yourself as the dft coefs are stored as
|
||||||
|
interleaved complex numbers).
|
||||||
|
|
||||||
|
the operation performed is: dft_ab += (dft_a * dft_b)*scaling
|
||||||
|
|
||||||
|
The dft_a, dft_b and dft_ab pointers may alias.
|
||||||
|
*/
|
||||||
|
void pffftd_zconvolve_accumulate(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double *dft_ab, double scaling);
|
||||||
|
|
||||||
|
/*
|
||||||
|
Perform a multiplication of the frequency components of dft_a and
|
||||||
|
dft_b and put result in dft_ab. The arrays should have
|
||||||
|
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
|
||||||
|
*not* have been reordered with pffft_zreorder (otherwise just
|
||||||
|
perform the operation yourself as the dft coefs are stored as
|
||||||
|
interleaved complex numbers).
|
||||||
|
|
||||||
|
the operation performed is: dft_ab = (dft_a * dft_b)*scaling
|
||||||
|
|
||||||
|
The dft_a, dft_b and dft_ab pointers may alias.
|
||||||
|
*/
|
||||||
|
void pffftd_zconvolve_no_accu(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double*dft_ab, double scaling);
|
||||||
|
|
||||||
|
/* return 4 or 1, depending on whether support for AVX instructions was enabled when building pffft-double.c */
|
||||||
|
int pffftd_simd_size(void); /* (void): explicit empty parameter list, so C compilers see a real prototype */
|
||||||
|
|
||||||
|
/* return string identifier of used architecture (AVX/..) */
|
||||||
|
const char * pffftd_simd_arch(void); /* (void): explicit empty parameter list, so C compilers see a real prototype */
|
||||||
|
|
||||||
|
/* simple helper to get minimum possible fft size */
|
||||||
|
int pffftd_min_fft_size(pffft_transform_t transform);
|
||||||
|
|
||||||
|
/* simple helper to determine size N is valid
|
||||||
|
- factorizable to pffft_min_fft_size() with factors 2, 3, 5
|
||||||
|
*/
|
||||||
|
int pffftd_is_valid_size(int N, pffft_transform_t cplx);
|
||||||
|
|
||||||
|
/* determine nearest valid transform size (by brute-force testing)
|
||||||
|
- factorizable to pffft_min_fft_size() with factors 2, 3, 5.
|
||||||
|
higher: bool-flag to find nearest higher value; else lower.
|
||||||
|
*/
|
||||||
|
int pffftd_nearest_transform_size(int N, pffft_transform_t cplx, int higher);
|
||||||
|
|
||||||
|
|
||||||
|
/* following functions are identical to the pffft_ functions - both declared */
|
||||||
|
|
||||||
|
/* simple helper to determine next power of 2
|
||||||
|
- without inexact/rounding floating point operations
|
||||||
|
*/
|
||||||
|
int pffftd_next_power_of_two(int N);
|
||||||
|
int pffft_next_power_of_two(int N);
|
||||||
|
|
||||||
|
/* simple helper to determine if power of 2 - returns bool */
|
||||||
|
int pffftd_is_power_of_two(int N);
|
||||||
|
int pffft_is_power_of_two(int N);
|
||||||
|
|
||||||
|
/*
|
||||||
|
the double buffers must have the correct alignment (32-byte boundary
|
||||||
|
on intel and powerpc). This function may be used to obtain such
|
||||||
|
correctly aligned buffers.
|
||||||
|
*/
|
||||||
|
void *pffftd_aligned_malloc(size_t nb_bytes);
|
||||||
|
void *pffft_aligned_malloc(size_t nb_bytes);
|
||||||
|
void pffftd_aligned_free(void *);
|
||||||
|
void pffft_aligned_free(void *);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* PFFFT_DOUBLE_H */
|
||||||
|
|
||||||
2233
pffft/pffft_priv_impl.h
Normal file
2233
pffft/pffft_priv_impl.h
Normal file
File diff suppressed because it is too large
Load Diff
50
pffft/plots.sh
Executable file
50
pffft/plots.sh
Executable file
@@ -0,0 +1,50 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
OUTPNG="1"
|
||||||
|
W="1024"
|
||||||
|
H="768"
|
||||||
|
PTS="20"
|
||||||
|
LWS="20"
|
||||||
|
|
||||||
|
for f in $(ls -1 *-4-*.csv *-6-*.csv); do
|
||||||
|
b=$(basename "$f" ".csv")
|
||||||
|
#echo $b
|
||||||
|
# Count the commas in the CSV header line: each ',' is moved onto its own line
# and grep -c counts those lines. "$f" is quoted so the filename is passed to
# head as a single argument.
LASTCOL="$(head -n 1 "$f" |sed 's/,/,\n/g' |grep -c ',')"
|
||||||
|
echo "${b}: last column is $LASTCOL"
|
||||||
|
if [ $(echo "$b" |grep -c -- "-1-") -gt 0 ]; then
|
||||||
|
YL="duration in ms; less is better"
|
||||||
|
elif [ $(echo "$b" |grep -c -- "-4-") -gt 0 ]; then
|
||||||
|
YL="duration relative to pffft; less is better"
|
||||||
|
else
|
||||||
|
YL=""
|
||||||
|
fi
|
||||||
|
|
||||||
|
E=""
|
||||||
|
if [ "${OUTPNG}" = "1" ]; then
|
||||||
|
E="set terminal png size $W,$H"
|
||||||
|
E="${E} ; set output '${b}.png'"
|
||||||
|
fi
|
||||||
|
if [ -z "${E}" ]; then
|
||||||
|
E="set key outside"
|
||||||
|
else
|
||||||
|
E="${E} ; set key outside"
|
||||||
|
fi
|
||||||
|
E="${E} ; set datafile separator ','"
|
||||||
|
E="${E} ; set title '${b}'"
|
||||||
|
E="${E} ; set xlabel 'fft order: fft size N = 2\\^order'"
|
||||||
|
if [ ! -z "${YL}" ]; then
|
||||||
|
#echo " setting Y label to ${YL}"
|
||||||
|
E="${E} ; set ylabel '${YL}'"
|
||||||
|
fi
|
||||||
|
# unfortunately no effect for
|
||||||
|
#for LNO in $(seq 1 ${LASTCOL}) ; do
|
||||||
|
# E="${E} ; set style line ${LNO} ps ${PTS} lw ${LWS}"
|
||||||
|
#done
|
||||||
|
E="${E} ; plot for [col=3:${LASTCOL}] '${f}' using 2:col with lines title columnhead"
|
||||||
|
|
||||||
|
if [ "${OUTPNG}" = "1" ]; then
|
||||||
|
gnuplot -e "${E}"
|
||||||
|
else
|
||||||
|
gnuplot -e "${E}" --persist
|
||||||
|
fi
|
||||||
|
done
|
||||||
81
pffft/simd/pf_altivec_float.h
Normal file
81
pffft/simd/pf_altivec_float.h
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
|
||||||
|
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||||
|
|
||||||
|
Redistribution and use of the Software in source and binary forms,
|
||||||
|
with or without modification, is permitted provided that the
|
||||||
|
following conditions are met:
|
||||||
|
|
||||||
|
- Neither the names of NCAR's Computational and Information Systems
|
||||||
|
Laboratory, the University Corporation for Atmospheric Research,
|
||||||
|
nor the names of its sponsors or contributors may be used to
|
||||||
|
endorse or promote products derived from this Software without
|
||||||
|
specific prior written permission.
|
||||||
|
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notices, this list of conditions, and the disclaimer below.
|
||||||
|
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer below in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||||
|
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||||
|
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||||
|
SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef PF_ALTIVEC_FLT_H
|
||||||
|
#define PF_ALTIVEC_FLT_H
|
||||||
|
|
||||||
|
/*
|
||||||
|
Altivec support macros
|
||||||
|
*/
|
||||||
|
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
|
||||||
|
#pragma message( __FILE__ ": ALTIVEC float macros are defined" )
|
||||||
|
typedef vector float v4sf;
|
||||||
|
|
||||||
|
# define SIMD_SZ 4
|
||||||
|
|
||||||
|
typedef union v4sf_union {
|
||||||
|
v4sf v;
|
||||||
|
float f[SIMD_SZ];
|
||||||
|
} v4sf_union;
|
||||||
|
|
||||||
|
# define VREQUIRES_ALIGN 1 /* not sure, if really required */
|
||||||
|
# define VARCH "ALTIVEC"
|
||||||
|
# define VZERO() ((vector float) vec_splat_u8(0))
|
||||||
|
# define VMUL(a,b) vec_madd(a,b, VZERO())
|
||||||
|
# define VADD(a,b) vec_add(a,b)
|
||||||
|
# define VMADD(a,b,c) vec_madd(a,b,c)
|
||||||
|
# define VSUB(a,b) vec_sub(a,b)
|
||||||
|
/* Load the single float at p and replicate it into all four vector lanes.
   Declared static: a plain C99 `inline` definition in a header emits no
   external symbol, so a call the compiler chooses not to inline would fail
   to link. */
static inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
|
||||||
|
# define LD_PS1(p) ld_ps1(&p)
|
||||||
|
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
|
||||||
|
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
|
||||||
|
vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
|
||||||
|
vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
|
||||||
|
v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
|
||||||
|
}
|
||||||
|
# define VTRANSPOSE4(x0,x1,x2,x3) { \
|
||||||
|
v4sf y0 = vec_mergeh(x0, x2); \
|
||||||
|
v4sf y1 = vec_mergel(x0, x2); \
|
||||||
|
v4sf y2 = vec_mergeh(x1, x3); \
|
||||||
|
v4sf y3 = vec_mergel(x1, x3); \
|
||||||
|
x0 = vec_mergeh(y0, y2); \
|
||||||
|
x1 = vec_mergel(y0, y2); \
|
||||||
|
x2 = vec_mergeh(y1, y3); \
|
||||||
|
x3 = vec_mergel(y1, y3); \
|
||||||
|
}
|
||||||
|
# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
|
||||||
|
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* PF_ALTIVEC_FLT_H */
|
||||||
|
|
||||||
145
pffft/simd/pf_avx_double.h
Normal file
145
pffft/simd/pf_avx_double.h
Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||||
|
|
||||||
|
Redistribution and use of the Software in source and binary forms,
|
||||||
|
with or without modification, is permitted provided that the
|
||||||
|
following conditions are met:
|
||||||
|
|
||||||
|
- Neither the names of NCAR's Computational and Information Systems
|
||||||
|
Laboratory, the University Corporation for Atmospheric Research,
|
||||||
|
nor the names of its sponsors or contributors may be used to
|
||||||
|
endorse or promote products derived from this Software without
|
||||||
|
specific prior written permission.
|
||||||
|
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notices, this list of conditions, and the disclaimer below.
|
||||||
|
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer below in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||||
|
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||||
|
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||||
|
SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef PF_AVX_DBL_H
|
||||||
|
#define PF_AVX_DBL_H
|
||||||
|
|
||||||
|
/*
|
||||||
|
vector support macros: the rest of the code is independent of
|
||||||
|
AVX -- adding support for other platforms with 4-element
|
||||||
|
vectors should be limited to these macros
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
AVX support macros
|
||||||
|
*/
|
||||||
|
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && defined(__AVX__)
|
||||||
|
#pragma message( __FILE__ ": AVX macros are defined" )
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
typedef __m256d v4sf;
|
||||||
|
|
||||||
|
/* 4 doubles by simd vector */
|
||||||
|
# define SIMD_SZ 4
|
||||||
|
|
||||||
|
typedef union v4sf_union {
|
||||||
|
v4sf v;
|
||||||
|
double f[SIMD_SZ];
|
||||||
|
} v4sf_union;
|
||||||
|
|
||||||
|
# define VARCH "AVX"
|
||||||
|
# define VREQUIRES_ALIGN 1
|
||||||
|
# define VZERO() _mm256_setzero_pd()
|
||||||
|
# define VMUL(a,b) _mm256_mul_pd(a,b)
|
||||||
|
# define VADD(a,b) _mm256_add_pd(a,b)
|
||||||
|
# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
|
||||||
|
# define VSUB(a,b) _mm256_sub_pd(a,b)
|
||||||
|
# define LD_PS1(p) _mm256_set1_pd(p)
|
||||||
|
# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr)
|
||||||
|
# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr)
|
||||||
|
|
||||||
|
/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
|
||||||
|
out1 = [ in1[0], in2[0], in1[1], in2[1] ]
|
||||||
|
out2 = [ in1[2], in2[2], in1[3], in2[3] ]
|
||||||
|
*/
|
||||||
|
# define INTERLEAVE2(in1, in2, out1, out2) { \
|
||||||
|
__m128d low1__ = _mm256_castpd256_pd128(in1); \
|
||||||
|
__m128d low2__ = _mm256_castpd256_pd128(in2); \
|
||||||
|
__m128d high1__ = _mm256_extractf128_pd(in1, 1); \
|
||||||
|
__m128d high2__ = _mm256_extractf128_pd(in2, 1); \
|
||||||
|
__m256d tmp__ = _mm256_insertf128_pd( \
|
||||||
|
_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \
|
||||||
|
_mm_shuffle_pd(low1__, low2__, 3), \
|
||||||
|
1); \
|
||||||
|
out2 = _mm256_insertf128_pd( \
|
||||||
|
_mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \
|
||||||
|
_mm_shuffle_pd(high1__, high2__, 3), \
|
||||||
|
1); \
|
||||||
|
out1 = tmp__; \
|
||||||
|
}
|
||||||
|
|
||||||
|
/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
|
||||||
|
out1 = [ in1[0], in1[2], in2[0], in2[2] ]
|
||||||
|
out2 = [ in1[1], in1[3], in2[1], in2[3] ]
|
||||||
|
*/
|
||||||
|
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
|
||||||
|
__m128d low1__ = _mm256_castpd256_pd128(in1); \
|
||||||
|
__m128d low2__ = _mm256_castpd256_pd128(in2); \
|
||||||
|
__m128d high1__ = _mm256_extractf128_pd(in1, 1); \
|
||||||
|
__m128d high2__ = _mm256_extractf128_pd(in2, 1); \
|
||||||
|
__m256d tmp__ = _mm256_insertf128_pd( \
|
||||||
|
_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \
|
||||||
|
_mm_shuffle_pd(low2__, high2__, 0), \
|
||||||
|
1); \
|
||||||
|
out2 = _mm256_insertf128_pd( \
|
||||||
|
_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \
|
||||||
|
_mm_shuffle_pd(low2__, high2__, 3), \
|
||||||
|
1); \
|
||||||
|
out1 = tmp__; \
|
||||||
|
}
|
||||||
|
|
||||||
|
# define VTRANSPOSE4(row0, row1, row2, row3) { \
|
||||||
|
__m256d tmp3, tmp2, tmp1, tmp0; \
|
||||||
|
\
|
||||||
|
tmp0 = _mm256_shuffle_pd((row0),(row1), 0x0); \
|
||||||
|
tmp2 = _mm256_shuffle_pd((row0),(row1), 0xF); \
|
||||||
|
tmp1 = _mm256_shuffle_pd((row2),(row3), 0x0); \
|
||||||
|
tmp3 = _mm256_shuffle_pd((row2),(row3), 0xF); \
|
||||||
|
\
|
||||||
|
(row0) = _mm256_permute2f128_pd(tmp0, tmp1, 0x20); \
|
||||||
|
(row1) = _mm256_permute2f128_pd(tmp2, tmp3, 0x20); \
|
||||||
|
(row2) = _mm256_permute2f128_pd(tmp0, tmp1, 0x31); \
|
||||||
|
(row3) = _mm256_permute2f128_pd(tmp2, tmp3, 0x31); \
|
||||||
|
}
|
||||||
|
|
||||||
|
/*VSWAPHL(a, b) pseudo code:
|
||||||
|
return [ b[0], b[1], a[2], a[3] ]
|
||||||
|
*/
|
||||||
|
# define VSWAPHL(a,b) \
|
||||||
|
_mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1)
|
||||||
|
|
||||||
|
/* reverse/flip all doubles */
|
||||||
|
# define VREV_S(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_permute_pd(_mm256_extractf128_pd(a, 1),1)), _mm_permute_pd(_mm256_castpd256_pd128(a), 1), 1)
|
||||||
|
|
||||||
|
/* reverse/flip complex doubles */
|
||||||
|
# define VREV_C(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1)
|
||||||
|
|
||||||
|
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* PF_AVX_DBL_H */
|
||||||
|
|
||||||
84
pffft/simd/pf_double.h
Normal file
84
pffft/simd/pf_double.h
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
|
||||||
|
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||||
|
|
||||||
|
Redistribution and use of the Software in source and binary forms,
|
||||||
|
with or without modification, is permitted provided that the
|
||||||
|
following conditions are met:
|
||||||
|
|
||||||
|
- Neither the names of NCAR's Computational and Information Systems
|
||||||
|
Laboratory, the University Corporation for Atmospheric Research,
|
||||||
|
nor the names of its sponsors or contributors may be used to
|
||||||
|
endorse or promote products derived from this Software without
|
||||||
|
specific prior written permission.
|
||||||
|
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notices, this list of conditions, and the disclaimer below.
|
||||||
|
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer below in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||||
|
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||||
|
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||||
|
SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef PF_DBL_H
|
||||||
|
#define PF_DBL_H
|
||||||
|
|
||||||
|
#include <assert.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* SIMD reference material:
|
||||||
|
*
|
||||||
|
* general SIMD introduction:
|
||||||
|
* https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
|
||||||
|
*
|
||||||
|
* SSE 1:
|
||||||
|
* https://software.intel.com/sites/landingpage/IntrinsicsGuide/
|
||||||
|
*
|
||||||
|
* ARM NEON:
|
||||||
|
* https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
|
||||||
|
*
|
||||||
|
* Altivec:
|
||||||
|
* https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
|
||||||
|
* https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
|
||||||
|
* better one?
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
typedef double vsfscalar;
|
||||||
|
|
||||||
|
#include "pf_avx_double.h"
|
||||||
|
#include "pf_sse2_double.h"
|
||||||
|
#include "pf_neon_double.h"
|
||||||
|
|
||||||
|
#ifndef SIMD_SZ
|
||||||
|
# if !defined(PFFFT_SIMD_DISABLE)
|
||||||
|
# pragma message( "building double with simd disabled !" )
|
||||||
|
# define PFFFT_SIMD_DISABLE /* fallback to scalar code */
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "pf_scalar_double.h"
|
||||||
|
|
||||||
|
/* shortcuts for complex multiplications */
|
||||||
|
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
|
||||||
|
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
|
||||||
|
#ifndef SVMUL
|
||||||
|
/* multiply a scalar with a vector */
|
||||||
|
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* PF_DBL_H */
|
||||||
|
|
||||||
84
pffft/simd/pf_float.h
Normal file
84
pffft/simd/pf_float.h
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
|
||||||
|
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||||
|
|
||||||
|
Redistribution and use of the Software in source and binary forms,
|
||||||
|
with or without modification, is permitted provided that the
|
||||||
|
following conditions are met:
|
||||||
|
|
||||||
|
- Neither the names of NCAR's Computational and Information Systems
|
||||||
|
Laboratory, the University Corporation for Atmospheric Research,
|
||||||
|
nor the names of its sponsors or contributors may be used to
|
||||||
|
endorse or promote products derived from this Software without
|
||||||
|
specific prior written permission.
|
||||||
|
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notices, this list of conditions, and the disclaimer below.
|
||||||
|
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer below in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||||
|
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||||
|
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||||
|
SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef PF_FLT_H
|
||||||
|
#define PF_FLT_H
|
||||||
|
|
||||||
|
#include <assert.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* SIMD reference material:
|
||||||
|
*
|
||||||
|
* general SIMD introduction:
|
||||||
|
* https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
|
||||||
|
*
|
||||||
|
* SSE 1:
|
||||||
|
* https://software.intel.com/sites/landingpage/IntrinsicsGuide/
|
||||||
|
*
|
||||||
|
* ARM NEON:
|
||||||
|
* https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
|
||||||
|
*
|
||||||
|
* Altivec:
|
||||||
|
* https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
|
||||||
|
* https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
|
||||||
|
* better one?
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
typedef float vsfscalar;
|
||||||
|
|
||||||
|
#include "pf_sse1_float.h"
|
||||||
|
#include "pf_neon_float.h"
|
||||||
|
#include "pf_altivec_float.h"
|
||||||
|
|
||||||
|
#ifndef SIMD_SZ
|
||||||
|
# if !defined(PFFFT_SIMD_DISABLE)
|
||||||
|
# pragma message( "building float with simd disabled !" )
|
||||||
|
# define PFFFT_SIMD_DISABLE /* fallback to scalar code */
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "pf_scalar_float.h"
|
||||||
|
|
||||||
|
/* shortcuts for complex multiplications */
|
||||||
|
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
|
||||||
|
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
|
||||||
|
#ifndef SVMUL
|
||||||
|
/* multiply a scalar with a vector */
|
||||||
|
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* PF_FLT_H */
|
||||||
|
|
||||||
203
pffft/simd/pf_neon_double.h
Normal file
203
pffft/simd/pf_neon_double.h
Normal file
@@ -0,0 +1,203 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||||
|
|
||||||
|
Redistribution and use of the Software in source and binary forms,
|
||||||
|
with or without modification, is permitted provided that the
|
||||||
|
following conditions are met:
|
||||||
|
|
||||||
|
- Neither the names of NCAR's Computational and Information Systems
|
||||||
|
Laboratory, the University Corporation for Atmospheric Research,
|
||||||
|
nor the names of its sponsors or contributors may be used to
|
||||||
|
endorse or promote products derived from this Software without
|
||||||
|
specific prior written permission.
|
||||||
|
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notices, this list of conditions, and the disclaimer below.
|
||||||
|
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer below in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||||
|
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||||
|
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||||
|
SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef PF_NEON_DBL_H
|
||||||
|
#define PF_NEON_DBL_H
|
||||||
|
|
||||||
|
/*
|
||||||
|
NEON 64bit support macros
|
||||||
|
*/
|
||||||
|
#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__))
|
||||||
|
|
||||||
|
#pragma message (__FILE__ ": NEON (from AVX) macros are defined" )
|
||||||
|
|
||||||
|
#include "pf_neon_double_from_avx.h"
|
||||||
|
typedef __m256d v4sf;
|
||||||
|
|
||||||
|
/* 4 doubles by simd vector */
|
||||||
|
# define SIMD_SZ 4
|
||||||
|
|
||||||
|
typedef union v4sf_union {
|
||||||
|
v4sf v;
|
||||||
|
double f[SIMD_SZ];
|
||||||
|
} v4sf_union;
|
||||||
|
|
||||||
|
# define VARCH "NEON"
|
||||||
|
# define VREQUIRES_ALIGN 1
|
||||||
|
# define VZERO() _mm256_setzero_pd()
|
||||||
|
# define VMUL(a,b) _mm256_mul_pd(a,b)
|
||||||
|
# define VADD(a,b) _mm256_add_pd(a,b)
|
||||||
|
# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
|
||||||
|
# define VSUB(a,b) _mm256_sub_pd(a,b)
|
||||||
|
# define LD_PS1(p) _mm256_set1_pd(p)
|
||||||
|
# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr)
|
||||||
|
# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr)
|
||||||
|
|
||||||
|
/* Build a 256-bit vector from the lower 128-bit half of `a` and, as the
 * upper half, the 128-bit vector `b` (insertf128 with the index fixed to 1). */
FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b)
{
    /* `a` is a by-value copy, so its upper half can simply be replaced. */
    a.vect_f64[1] = b;
    return a;
}
|
||||||
|
|
||||||
|
/* Returns [ a[0], b[0] ]: the low lanes of both inputs, `a` first. */
FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
{
    /* ZIP1 on 2-lane vectors interleaves exactly the two low elements. */
    return vzip1q_f64(a, b);
}
|
||||||
|
|
||||||
|
/* Returns [ a[1], b[1] ]: the high lanes of both inputs, `a` first. */
FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b)
{
    /* ZIP2 on 2-lane vectors interleaves exactly the two high elements. */
    return vzip2q_f64(a, b);
}
|
||||||
|
|
||||||
|
/* Applies _mm_shuffle_pd_00 to each 128-bit half independently:
 * result = [ a[0], b[0], a[2], b[2] ]. */
FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b)
{
    __m256d out;
    int half;
    for (half = 0; half < 2; ++half)
        out.vect_f64[half] = _mm_shuffle_pd_00(a.vect_f64[half], b.vect_f64[half]);
    return out;
}
|
||||||
|
|
||||||
|
/* Applies _mm_shuffle_pd_11 to each 128-bit half independently:
 * result = [ a[1], b[1], a[3], b[3] ]. */
FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b)
{
    __m256d out;
    int half;
    for (half = 0; half < 2; ++half)
        out.vect_f64[half] = _mm_shuffle_pd_11(a.vect_f64[half], b.vect_f64[half]);
    return out;
}
|
||||||
|
|
||||||
|
/* Concatenates the lower halves: result = [ low(a), low(b) ]. */
FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b)
{
    /* `a` is a by-value copy and already holds the wanted lower half. */
    a.vect_f64[1] = b.vect_f64[0];
    return a;
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Concatenates the upper halves: result = [ high(a), high(b) ]. */
FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b)
{
    /* `b` is a by-value copy and already holds the wanted upper half. */
    b.vect_f64[0] = a.vect_f64[1];
    return b;
}
|
||||||
|
|
||||||
|
/* Reverses the order of all four lanes: [x0, x1, x2, x3] -> [x3, x2, x1, x0]. */
FORCE_INLINE __m256d _mm256_reverse(__m256d x)
{
    __m256d flipped;
    /* vextq_f64(v, v, 1) yields [v[1], v[0]], i.e. swaps the two lanes of v;
     * the swapped halves are then stored in exchanged positions. */
    flipped.vect_f64[0] = vextq_f64(x.vect_f64[1], x.vect_f64[1], 1);
    flipped.vect_f64[1] = vextq_f64(x.vect_f64[0], x.vect_f64[0], 1);
    return flipped;
}
|
||||||
|
|
||||||
|
/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
|
||||||
|
out1 = [ in1[0], in2[0], in1[1], in2[1] ]
|
||||||
|
out2 = [ in1[2], in2[2], in1[3], in2[3] ]
|
||||||
|
*/
|
||||||
|
# define INTERLEAVE2(in1, in2, out1, out2) { \
|
||||||
|
__m128d low1__ = _mm256_castpd256_pd128(in1); \
|
||||||
|
__m128d low2__ = _mm256_castpd256_pd128(in2); \
|
||||||
|
__m128d high1__ = _mm256_extractf128_pd(in1, 1); \
|
||||||
|
__m128d high2__ = _mm256_extractf128_pd(in2, 1); \
|
||||||
|
__m256d tmp__ = _mm256_insertf128_pd_1( \
|
||||||
|
_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)), \
|
||||||
|
_mm_shuffle_pd_11(low1__, low2__)); \
|
||||||
|
out2 = _mm256_insertf128_pd_1( \
|
||||||
|
_mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)), \
|
||||||
|
_mm_shuffle_pd_11(high1__, high2__)); \
|
||||||
|
out1 = tmp__; \
|
||||||
|
}
|
||||||
|
|
||||||
|
/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
|
||||||
|
out1 = [ in1[0], in1[2], in2[0], in2[2] ]
|
||||||
|
out2 = [ in1[1], in1[3], in2[1], in2[3] ]
|
||||||
|
*/
|
||||||
|
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
|
||||||
|
__m128d low1__ = _mm256_castpd256_pd128(in1); \
|
||||||
|
__m128d low2__ = _mm256_castpd256_pd128(in2); \
|
||||||
|
__m128d high1__ = _mm256_extractf128_pd(in1, 1); \
|
||||||
|
__m128d high2__ = _mm256_extractf128_pd(in2, 1); \
|
||||||
|
__m256d tmp__ = _mm256_insertf128_pd_1( \
|
||||||
|
_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)), \
|
||||||
|
_mm_shuffle_pd_00(low2__, high2__)); \
|
||||||
|
out2 = _mm256_insertf128_pd_1( \
|
||||||
|
_mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)), \
|
||||||
|
_mm_shuffle_pd_11(low2__, high2__)); \
|
||||||
|
out1 = tmp__; \
|
||||||
|
}
|
||||||
|
|
||||||
|
# define VTRANSPOSE4(row0, row1, row2, row3) { \
|
||||||
|
__m256d tmp3, tmp2, tmp1, tmp0; \
|
||||||
|
\
|
||||||
|
tmp0 = _mm256_shuffle_pd_00((row0),(row1)); \
|
||||||
|
tmp2 = _mm256_shuffle_pd_11((row0),(row1)); \
|
||||||
|
tmp1 = _mm256_shuffle_pd_00((row2),(row3)); \
|
||||||
|
tmp3 = _mm256_shuffle_pd_11((row2),(row3)); \
|
||||||
|
\
|
||||||
|
(row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1); \
|
||||||
|
(row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3); \
|
||||||
|
(row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1); \
|
||||||
|
(row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3); \
|
||||||
|
}
|
||||||
|
|
||||||
|
/*VSWAPHL(a, b) pseudo code:
|
||||||
|
return [ b[0], b[1], a[2], a[3] ]
|
||||||
|
*/
|
||||||
|
# define VSWAPHL(a,b) \
|
||||||
|
_mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1))
|
||||||
|
|
||||||
|
/* reverse/flip all doubles */
|
||||||
|
# define VREV_S(a) _mm256_reverse(a)
|
||||||
|
|
||||||
|
/* reverse/flip complex doubles */
|
||||||
|
# define VREV_C(a) _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a))
|
||||||
|
|
||||||
|
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* PF_NEON_DBL_H */
|
||||||
|
|
||||||
123
pffft/simd/pf_neon_double_from_avx.h
Normal file
123
pffft/simd/pf_neon_double_from_avx.h
Normal file
@@ -0,0 +1,123 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
|
||||||
|
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//see https://github.com/kunpengcompute/AvxToNeon
|
||||||
|
|
||||||
|
#ifndef PF_NEON_DBL_FROM_AVX_H
|
||||||
|
#define PF_NEON_DBL_FROM_AVX_H
|
||||||
|
#include <arm_neon.h>
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(__GNUC__) || defined(__clang__)
|
||||||
|
|
||||||
|
#pragma push_macro("FORCE_INLINE")
|
||||||
|
#define FORCE_INLINE static inline __attribute__((always_inline))
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#error "Macro name collisions may happens with unknown compiler"
|
||||||
|
#ifdef FORCE_INLINE
|
||||||
|
#undef FORCE_INLINE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define FORCE_INLINE static inline
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
float32x4_t vect_f32[2];
|
||||||
|
} __m256;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
float64x2_t vect_f64[2];
|
||||||
|
} __m256d;
|
||||||
|
|
||||||
|
typedef float64x2_t __m128d;
|
||||||
|
|
||||||
|
/* Returns a 256-bit vector with all four double lanes set to 0.0. */
FORCE_INLINE __m256d _mm256_setzero_pd(void)
{
    const float64x2_t zeros = vdupq_n_f64(0.0);
    __m256d out;
    out.vect_f64[0] = zeros;
    out.vect_f64[1] = zeros;
    return out;
}
|
||||||
|
|
||||||
|
/* Lane-wise product a*b, computed separately on each 128-bit half. */
FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
{
    __m256d prod;
    int half;
    for (half = 0; half < 2; ++half)
        prod.vect_f64[half] = vmulq_f64(a.vect_f64[half], b.vect_f64[half]);
    return prod;
}
|
||||||
|
|
||||||
|
/* Lane-wise sum a+b, computed separately on each 128-bit half. */
FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
{
    /* Accumulate into the by-value copy of `a`. */
    a.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]);
    a.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]);
    return a;
}
|
||||||
|
|
||||||
|
/* Lane-wise difference a-b, computed separately on each 128-bit half. */
FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
{
    const float64x2_t diff_lo = vsubq_f64(a.vect_f64[0], b.vect_f64[0]);
    const float64x2_t diff_hi = vsubq_f64(a.vect_f64[1], b.vect_f64[1]);
    __m256d diff;
    diff.vect_f64[0] = diff_lo;
    diff.vect_f64[1] = diff_hi;
    return diff;
}
|
||||||
|
|
||||||
|
/* Broadcasts the scalar `a` into all four double lanes. */
FORCE_INLINE __m256d _mm256_set1_pd(double a)
{
    const float64x2_t splat = vdupq_n_f64(a);
    __m256d out;
    out.vect_f64[0] = splat;
    out.vect_f64[1] = splat;
    return out;
}
|
||||||
|
|
||||||
|
/* Loads four consecutive doubles starting at mem_addr as two 128-bit NEON
 * loads. The implementation is the same as _mm256_loadu_pd. */
FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr)
{
    const double *src = mem_addr;
    __m256d loaded;
    loaded.vect_f64[0] = vld1q_f64(src);
    loaded.vect_f64[1] = vld1q_f64(src + 2);   /* doubles 2 and 3 */
    return loaded;
}
|
||||||
|
/* Loads four consecutive doubles starting at mem_addr as two 128-bit NEON
 * loads. The implementation is the same as _mm256_load_pd. */
FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr)
{
    const double *src = mem_addr;
    __m256d loaded;
    loaded.vect_f64[0] = vld1q_f64(src);
    loaded.vect_f64[1] = vld1q_f64(src + 2);   /* doubles 2 and 3 */
    return loaded;
}
|
||||||
|
|
||||||
|
/* Returns the lower 128-bit half (lanes 0 and 1) of `a`. */
FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
{
    const __m128d lower = a.vect_f64[0];
    return lower;
}
|
||||||
|
|
||||||
|
/* Returns the 128-bit half of `a` selected by imm8: 0 = lower, 1 = upper.
 * Any other index is rejected by the assert (in builds without NDEBUG). */
FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
{
    const float64x2_t *halves = a.vect_f64;
    assert(imm8 >= 0 && imm8 <= 1);
    return halves[imm8];
}
|
||||||
|
|
||||||
|
/* Widens a 128-bit vector to 256 bits: `a` becomes the lower half.
 * The x86 intrinsic leaves the upper half undefined; here it is zeroed so
 * that no uninitialized struct member is ever returned or copied (this keeps
 * -Wmaybe-uninitialized and memory sanitizers quiet). Callers that overwrite
 * the upper half right away lose nothing, since the dead store is removed
 * once the function is inlined. */
FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
{
    __m256d res;
    res.vect_f64[0] = a;
    res.vect_f64[1] = vdupq_n_f64(0.0);
    return res;
}
|
||||||
|
|
||||||
|
#endif /* PF_NEON_DBL_FROM_AVX_H */
|
||||||
|
|
||||||
87
pffft/simd/pf_neon_float.h
Normal file
87
pffft/simd/pf_neon_float.h
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
|
||||||
|
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||||
|
|
||||||
|
Redistribution and use of the Software in source and binary forms,
|
||||||
|
with or without modification, is permitted provided that the
|
||||||
|
following conditions are met:
|
||||||
|
|
||||||
|
- Neither the names of NCAR's Computational and Information Systems
|
||||||
|
Laboratory, the University Corporation for Atmospheric Research,
|
||||||
|
nor the names of its sponsors or contributors may be used to
|
||||||
|
endorse or promote products derived from this Software without
|
||||||
|
specific prior written permission.
|
||||||
|
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notices, this list of conditions, and the disclaimer below.
|
||||||
|
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer below in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||||
|
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||||
|
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||||
|
SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef PF_NEON_FLT_H
|
||||||
|
#define PF_NEON_FLT_H
|
||||||
|
|
||||||
|
/*
|
||||||
|
ARM NEON support macros
|
||||||
|
*/
|
||||||
|
#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
|
||||||
|
#pragma message( __FILE__ ": ARM NEON macros are defined" )
|
||||||
|
|
||||||
|
# include <arm_neon.h>
|
||||||
|
typedef float32x4_t v4sf;
|
||||||
|
|
||||||
|
# define SIMD_SZ 4
|
||||||
|
|
||||||
|
typedef union v4sf_union {
|
||||||
|
v4sf v;
|
||||||
|
float f[SIMD_SZ];
|
||||||
|
} v4sf_union;
|
||||||
|
|
||||||
|
# define VARCH "NEON"
|
||||||
|
# define VREQUIRES_ALIGN 0 /* usually no alignment required */
|
||||||
|
# define VZERO() vdupq_n_f32(0)
|
||||||
|
# define VMUL(a,b) vmulq_f32(a,b)
|
||||||
|
# define VADD(a,b) vaddq_f32(a,b)
|
||||||
|
# define VMADD(a,b,c) vmlaq_f32(c,a,b)
|
||||||
|
# define VSUB(a,b) vsubq_f32(a,b)
|
||||||
|
# define LD_PS1(p) vld1q_dup_f32(&(p))
|
||||||
|
/* Load 4 floats from an address that may not be vector-aligned.
 * vld1q_f32 only needs float (element) alignment, whereas dereferencing a
 * v4sf* tells the compiler the address has full vector alignment. This
 * mirrors the SSE variant, which uses _mm_loadu_ps here. */
# define VLOAD_UNALIGNED(ptr)  vld1q_f32((const float*)(ptr))
|
||||||
|
# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr)))
|
||||||
|
# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
|
||||||
|
# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
|
||||||
|
# define VTRANSPOSE4(x0,x1,x2,x3) { \
|
||||||
|
float32x4x2_t t0_ = vzipq_f32(x0, x2); \
|
||||||
|
float32x4x2_t t1_ = vzipq_f32(x1, x3); \
|
||||||
|
float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
|
||||||
|
float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
|
||||||
|
x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
|
||||||
|
}
|
||||||
|
// marginally faster version
|
||||||
|
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
|
||||||
|
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
|
||||||
|
|
||||||
|
/* reverse/flip all floats */
|
||||||
|
# define VREV_S(a) vcombine_f32(vrev64_f32(vget_high_f32(a)), vrev64_f32(vget_low_f32(a)))
|
||||||
|
/* reverse/flip complex floats */
|
||||||
|
# define VREV_C(a) vextq_f32(a, a, 2)
|
||||||
|
|
||||||
|
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
|
||||||
|
|
||||||
|
#else
|
||||||
|
/* #pragma message( __FILE__ ": ARM NEON macros are not defined" ) */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* PF_NEON_FLT_H */
|
||||||
|
|
||||||
185
pffft/simd/pf_scalar_double.h
Normal file
185
pffft/simd/pf_scalar_double.h
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
|
||||||
|
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||||
|
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
|
||||||
|
|
||||||
|
Redistribution and use of the Software in source and binary forms,
|
||||||
|
with or without modification, is permitted provided that the
|
||||||
|
following conditions are met:
|
||||||
|
|
||||||
|
- Neither the names of NCAR's Computational and Information Systems
|
||||||
|
Laboratory, the University Corporation for Atmospheric Research,
|
||||||
|
nor the names of its sponsors or contributors may be used to
|
||||||
|
endorse or promote products derived from this Software without
|
||||||
|
specific prior written permission.
|
||||||
|
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notices, this list of conditions, and the disclaimer below.
|
||||||
|
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer below in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||||
|
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||||
|
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||||
|
SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef PF_SCAL_DBL_H
|
||||||
|
#define PF_SCAL_DBL_H
|
||||||
|
|
||||||
|
/*
|
||||||
|
fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead
|
||||||
|
*/
|
||||||
|
|
||||||
|
#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
|
||||||
|
#pragma message( __FILE__ ": double SCALAR4 macros are defined" )
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
vsfscalar a;
|
||||||
|
vsfscalar b;
|
||||||
|
vsfscalar c;
|
||||||
|
vsfscalar d;
|
||||||
|
} v4sf;
|
||||||
|
|
||||||
|
# define SIMD_SZ 4
|
||||||
|
|
||||||
|
typedef union v4sf_union {
|
||||||
|
v4sf v;
|
||||||
|
vsfscalar f[SIMD_SZ];
|
||||||
|
} v4sf_union;
|
||||||
|
|
||||||
|
# define VARCH "4xScalar"
|
||||||
|
# define VREQUIRES_ALIGN 0
|
||||||
|
|
||||||
|
/* Returns a 4-lane scalar "vector" with every lane set to zero. */
static ALWAYS_INLINE(v4sf) VZERO() {
  v4sf zero;
  zero.a = zero.b = zero.c = zero.d = 0.f;
  return zero;
}
|
||||||
|
|
||||||
|
/* Lane-wise product A*B of two 4-lane scalar "vectors". */
static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) {
  v4sf prod;
  prod.a = A.a * B.a;
  prod.b = A.b * B.b;
  prod.c = A.c * B.c;
  prod.d = A.d * B.d;
  return prod;
}
|
||||||
|
|
||||||
|
/* Lane-wise sum A+B of two 4-lane scalar "vectors". */
static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) {
  v4sf sum;
  sum.a = A.a + B.a;
  sum.b = A.b + B.b;
  sum.c = A.c + B.c;
  sum.d = A.d + B.d;
  return sum;
}
|
||||||
|
|
||||||
|
/* Lane-wise multiply-add A*B + C on 4-lane scalar "vectors". */
static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) {
  v4sf acc;
  /* Each lane keeps multiply and add in one expression, as in a plain a*b+c. */
  acc.a = A.a * B.a + C.a;
  acc.b = A.b * B.b + C.b;
  acc.c = A.c * B.c + C.c;
  acc.d = A.d * B.d + C.d;
  return acc;
}
|
||||||
|
|
||||||
|
/* Lane-wise difference A-B of two 4-lane scalar "vectors". */
static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) {
  v4sf diff;
  diff.a = A.a - B.a;
  diff.b = A.b - B.b;
  diff.c = A.c - B.c;
  diff.d = A.d - B.d;
  return diff;
}
|
||||||
|
|
||||||
|
/* Broadcasts the scalar v into all four lanes. */
static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) {
  v4sf splat;
  splat.a = splat.b = splat.c = splat.d = v;
  return splat;
}
|
||||||
|
|
||||||
|
# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr)))
|
||||||
|
|
||||||
|
# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr)))
|
||||||
|
|
||||||
|
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0)
|
||||||
|
|
||||||
|
|
||||||
|
/* INTERLEAVE2() */
|
||||||
|
#define INTERLEAVE2( A, B, C, D) \
|
||||||
|
do { \
|
||||||
|
v4sf Cr = { A.a, B.a, A.b, B.b }; \
|
||||||
|
v4sf Dr = { A.c, B.c, A.d, B.d }; \
|
||||||
|
C = Cr; \
|
||||||
|
D = Dr; \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
|
/* UNINTERLEAVE2() */
|
||||||
|
#define UNINTERLEAVE2(A, B, C, D) \
|
||||||
|
do { \
|
||||||
|
v4sf Cr = { A.a, A.c, B.a, B.c }; \
|
||||||
|
v4sf Dr = { A.b, A.d, B.b, B.d }; \
|
||||||
|
C = Cr; \
|
||||||
|
D = Dr; \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
|
/* VTRANSPOSE4() */
|
||||||
|
#define VTRANSPOSE4(A, B, C, D) \
|
||||||
|
do { \
|
||||||
|
v4sf Ar = { A.a, B.a, C.a, D.a }; \
|
||||||
|
v4sf Br = { A.b, B.b, C.b, D.b }; \
|
||||||
|
v4sf Cr = { A.c, B.c, C.c, D.c }; \
|
||||||
|
v4sf Dr = { A.d, B.d, C.d, D.d }; \
|
||||||
|
A = Ar; \
|
||||||
|
B = Br; \
|
||||||
|
C = Cr; \
|
||||||
|
D = Dr; \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
|
/* VSWAPHL() */
|
||||||
|
/* Returns [ B.a, B.b, A.c, A.d ]: low pair taken from B, high pair from A. */
static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) {
  /* A is a by-value copy that already carries the wanted high pair. */
  A.a = B.a;
  A.b = B.b;
  return A;
}
|
||||||
|
|
||||||
|
|
||||||
|
/* reverse/flip all doubles */
|
||||||
|
/* Reverses the lane order: [a, b, c, d] -> [d, c, b, a]. */
static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) {
  v4sf rev;
  rev.a = A.d;
  rev.b = A.c;
  rev.c = A.b;
  rev.d = A.a;
  return rev;
}
|
||||||
|
|
||||||
|
/* reverse/flip complex doubles */
|
||||||
|
/* Swaps the two lane pairs: [a, b, c, d] -> [c, d, a, b]. */
static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) {
  v4sf swapped;
  swapped.a = A.c;
  swapped.b = A.d;
  swapped.c = A.a;
  swapped.d = A.b;
  return swapped;
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
/* #pragma message( __FILE__ ": double SCALAR4 macros are not defined" ) */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#if !defined(SIMD_SZ)
|
||||||
|
/* Build-time note that the single-lane scalar fallback was selected. This is
 * the double-precision header, so the message must say "double". */
#pragma message( __FILE__ ": double SCALAR1 macros are defined" )
|
||||||
|
typedef vsfscalar v4sf;
|
||||||
|
|
||||||
|
# define SIMD_SZ 1
|
||||||
|
|
||||||
|
typedef union v4sf_union {
|
||||||
|
v4sf v;
|
||||||
|
vsfscalar f[SIMD_SZ];
|
||||||
|
} v4sf_union;
|
||||||
|
|
||||||
|
# define VARCH "Scalar"
|
||||||
|
# define VREQUIRES_ALIGN 0
|
||||||
|
# define VZERO() 0.0
|
||||||
|
# define VMUL(a,b) ((a)*(b))
|
||||||
|
# define VADD(a,b) ((a)+(b))
|
||||||
|
# define VMADD(a,b,c) ((a)*(b)+(c))
|
||||||
|
# define VSUB(a,b) ((a)-(b))
|
||||||
|
# define LD_PS1(p) (p)
|
||||||
|
# define VLOAD_UNALIGNED(ptr) (*(ptr))
|
||||||
|
# define VLOAD_ALIGNED(ptr) (*(ptr))
|
||||||
|
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)
|
||||||
|
|
||||||
|
#else
|
||||||
|
/* #pragma message( __FILE__ ": double SCALAR1 macros are not defined" ) */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#endif /* PF_SCAL_DBL_H */
|
||||||
|
|
||||||
185
pffft/simd/pf_scalar_float.h
Normal file
185
pffft/simd/pf_scalar_float.h
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
|
||||||
|
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||||
|
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
|
||||||
|
|
||||||
|
Redistribution and use of the Software in source and binary forms,
|
||||||
|
with or without modification, is permitted provided that the
|
||||||
|
following conditions are met:
|
||||||
|
|
||||||
|
- Neither the names of NCAR's Computational and Information Systems
|
||||||
|
Laboratory, the University Corporation for Atmospheric Research,
|
||||||
|
nor the names of its sponsors or contributors may be used to
|
||||||
|
endorse or promote products derived from this Software without
|
||||||
|
specific prior written permission.
|
||||||
|
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notices, this list of conditions, and the disclaimer below.
|
||||||
|
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer below in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||||
|
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||||
|
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||||
|
SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef PF_SCAL_FLT_H
|
||||||
|
#define PF_SCAL_FLT_H
|
||||||
|
|
||||||
|
/*
|
||||||
|
fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead
|
||||||
|
*/
|
||||||
|
|
||||||
|
#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
|
||||||
|
#pragma message( __FILE__ ": float SCALAR4 macros are defined" )
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
vsfscalar a;
|
||||||
|
vsfscalar b;
|
||||||
|
vsfscalar c;
|
||||||
|
vsfscalar d;
|
||||||
|
} v4sf;
|
||||||
|
|
||||||
|
# define SIMD_SZ 4
|
||||||
|
|
||||||
|
typedef union v4sf_union {
|
||||||
|
v4sf v;
|
||||||
|
vsfscalar f[SIMD_SZ];
|
||||||
|
} v4sf_union;
|
||||||
|
|
||||||
|
# define VARCH "4xScalar"
|
||||||
|
# define VREQUIRES_ALIGN 0
|
||||||
|
|
||||||
|
/* Returns a 4-lane scalar "vector" with every lane set to zero. */
static ALWAYS_INLINE(v4sf) VZERO() {
  v4sf zero;
  zero.a = zero.b = zero.c = zero.d = 0.f;
  return zero;
}
|
||||||
|
|
||||||
|
/* Lane-wise product A*B of two 4-lane scalar "vectors". */
static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) {
  v4sf prod;
  prod.a = A.a * B.a;
  prod.b = A.b * B.b;
  prod.c = A.c * B.c;
  prod.d = A.d * B.d;
  return prod;
}
|
||||||
|
|
||||||
|
/* Lane-wise sum A+B of two 4-lane scalar "vectors". */
static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) {
  v4sf sum;
  sum.a = A.a + B.a;
  sum.b = A.b + B.b;
  sum.c = A.c + B.c;
  sum.d = A.d + B.d;
  return sum;
}
|
||||||
|
|
||||||
|
/* Lane-wise multiply-add A*B + C on 4-lane scalar "vectors". */
static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) {
  v4sf acc;
  /* Each lane keeps multiply and add in one expression, as in a plain a*b+c. */
  acc.a = A.a * B.a + C.a;
  acc.b = A.b * B.b + C.b;
  acc.c = A.c * B.c + C.c;
  acc.d = A.d * B.d + C.d;
  return acc;
}
|
||||||
|
|
||||||
|
/* Lane-wise difference A-B of two 4-lane scalar "vectors". */
static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) {
  v4sf diff;
  diff.a = A.a - B.a;
  diff.b = A.b - B.b;
  diff.c = A.c - B.c;
  diff.d = A.d - B.d;
  return diff;
}
|
||||||
|
|
||||||
|
/* Broadcasts the scalar v into all four lanes. */
static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) {
  v4sf splat;
  splat.a = splat.b = splat.c = splat.d = v;
  return splat;
}
|
||||||
|
|
||||||
|
# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr)))
|
||||||
|
|
||||||
|
# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr)))
|
||||||
|
|
||||||
|
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0)
|
||||||
|
|
||||||
|
|
||||||
|
/* INTERLEAVE2() */
|
||||||
|
#define INTERLEAVE2( A, B, C, D) \
|
||||||
|
do { \
|
||||||
|
v4sf Cr = { A.a, B.a, A.b, B.b }; \
|
||||||
|
v4sf Dr = { A.c, B.c, A.d, B.d }; \
|
||||||
|
C = Cr; \
|
||||||
|
D = Dr; \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
|
/* UNINTERLEAVE2() */
|
||||||
|
#define UNINTERLEAVE2(A, B, C, D) \
|
||||||
|
do { \
|
||||||
|
v4sf Cr = { A.a, A.c, B.a, B.c }; \
|
||||||
|
v4sf Dr = { A.b, A.d, B.b, B.d }; \
|
||||||
|
C = Cr; \
|
||||||
|
D = Dr; \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
|
/* VTRANSPOSE4() */
|
||||||
|
#define VTRANSPOSE4(A, B, C, D) \
|
||||||
|
do { \
|
||||||
|
v4sf Ar = { A.a, B.a, C.a, D.a }; \
|
||||||
|
v4sf Br = { A.b, B.b, C.b, D.b }; \
|
||||||
|
v4sf Cr = { A.c, B.c, C.c, D.c }; \
|
||||||
|
v4sf Dr = { A.d, B.d, C.d, D.d }; \
|
||||||
|
A = Ar; \
|
||||||
|
B = Br; \
|
||||||
|
C = Cr; \
|
||||||
|
D = Dr; \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
|
/* VSWAPHL() */
|
||||||
|
/* Returns [ B.a, B.b, A.c, A.d ]: low pair taken from B, high pair from A. */
static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) {
  /* A is a by-value copy that already carries the wanted high pair. */
  A.a = B.a;
  A.b = B.b;
  return A;
}
|
||||||
|
|
||||||
|
|
||||||
|
/* reverse/flip all floats */
|
||||||
|
/* Reverses the lane order: [a, b, c, d] -> [d, c, b, a]. */
static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) {
  v4sf rev;
  rev.a = A.d;
  rev.b = A.c;
  rev.c = A.b;
  rev.d = A.a;
  return rev;
}
|
||||||
|
|
||||||
|
/* reverse/flip complex floats */
|
||||||
|
/* Swaps the two lane pairs: [a, b, c, d] -> [c, d, a, b]. */
static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) {
  v4sf swapped;
  swapped.a = A.c;
  swapped.b = A.d;
  swapped.c = A.a;
  swapped.d = A.b;
  return swapped;
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
/* #pragma message( __FILE__ ": float SCALAR4 macros are not defined" ) */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#if !defined(SIMD_SZ)
|
||||||
|
#pragma message( __FILE__ ": float SCALAR1 macros are defined" )
|
||||||
|
typedef vsfscalar v4sf;
|
||||||
|
|
||||||
|
# define SIMD_SZ 1
|
||||||
|
|
||||||
|
typedef union v4sf_union {
|
||||||
|
v4sf v;
|
||||||
|
vsfscalar f[SIMD_SZ];
|
||||||
|
} v4sf_union;
|
||||||
|
|
||||||
|
# define VARCH "Scalar"
|
||||||
|
# define VREQUIRES_ALIGN 0
|
||||||
|
# define VZERO() 0.f
|
||||||
|
# define VMUL(a,b) ((a)*(b))
|
||||||
|
# define VADD(a,b) ((a)+(b))
|
||||||
|
# define VMADD(a,b,c) ((a)*(b)+(c))
|
||||||
|
# define VSUB(a,b) ((a)-(b))
|
||||||
|
# define LD_PS1(p) (p)
|
||||||
|
# define VLOAD_UNALIGNED(ptr) (*(ptr))
|
||||||
|
# define VLOAD_ALIGNED(ptr) (*(ptr))
|
||||||
|
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)
|
||||||
|
|
||||||
|
#else
|
||||||
|
/* #pragma message( __FILE__ ": float SCALAR1 macros are not defined" ) */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#endif /* PF_SCAL_FLT_H */
|
||||||
|
|
||||||
82
pffft/simd/pf_sse1_float.h
Normal file
82
pffft/simd/pf_sse1_float.h
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
|
||||||
|
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||||
|
|
||||||
|
Redistribution and use of the Software in source and binary forms,
|
||||||
|
with or without modification, is permitted provided that the
|
||||||
|
following conditions are met:
|
||||||
|
|
||||||
|
- Neither the names of NCAR's Computational and Information Systems
|
||||||
|
Laboratory, the University Corporation for Atmospheric Research,
|
||||||
|
nor the names of its sponsors or contributors may be used to
|
||||||
|
endorse or promote products derived from this Software without
|
||||||
|
specific prior written permission.
|
||||||
|
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notices, this list of conditions, and the disclaimer below.
|
||||||
|
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer below in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||||
|
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||||
|
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||||
|
SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef PF_SSE1_FLT_H
#define PF_SSE1_FLT_H

/*
  SSE1 support macros

  Defines the v4sf vector type and the V* macro layer on top of the SSE1
  intrinsics from <xmmintrin.h>. The definitions are skipped when SIMD_SZ is
  already defined, when PFFFT_SIMD_DISABLE is set, or when none of the x86 /
  x86-64 target macros tested below is defined.
*/
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86))
#pragma message( __FILE__ ": SSE1 float macros are defined" )

#include <stdint.h>    /* uintptr_t, needed by VALIGNED() */
#include <xmmintrin.h>
typedef __m128 v4sf;

/* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions
 * anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */
#  define SIMD_SZ 4

/* gives per-lane access to a vector */
typedef union v4sf_union {
  v4sf  v;
  float f[SIMD_SZ];
} v4sf_union;

#  define VARCH "SSE1"
#  define VREQUIRES_ALIGN 1
#  define VZERO() _mm_setzero_ps()
#  define VMUL(a,b) _mm_mul_ps(a,b)
#  define VADD(a,b) _mm_add_ps(a,b)
/* a*b + c, built from a separate multiply and add */
#  define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
#  define VSUB(a,b) _mm_sub_ps(a,b)
/* broadcast the scalar p to all 4 lanes */
#  define LD_PS1(p) _mm_set1_ps(p)
#  define VLOAD_UNALIGNED(ptr)  _mm_loadu_ps(ptr)
#  define VLOAD_ALIGNED(ptr)    _mm_load_ps(ptr)

/* out1 = [ in1[0], in2[0], in1[1], in2[1] ], out2 = [ in1[2], in2[2], in1[3], in2[3] ];
 * out1 is written last, through a temporary, so it may alias an input. */
#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
/* out1 = [ in1[0], in1[2], in2[0], in2[2] ], out2 = [ in1[1], in1[3], in2[1], in2[3] ] */
#  define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
#  define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
/* yields [ b[0], b[1], a[2], a[3] ] */
#  define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))

/* reverse/flip all floats */
#  define VREV_S(a)    _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3))
/* reverse/flip complex floats */
#  define VREV_C(a)    _mm_shuffle_ps(a, a, _MM_SHUFFLE(1,0,3,2))

/* true when ptr is a multiple of 16 bytes */
#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

#else
/* #pragma message( __FILE__ ": SSE1 float macros are not defined" ) */
#endif

#endif /* PF_SSE1_FLT_H */
|
||||||
|
|
||||||
281
pffft/simd/pf_sse2_double.h
Normal file
281
pffft/simd/pf_sse2_double.h
Normal file
@@ -0,0 +1,281 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||||
|
|
||||||
|
Redistribution and use of the Software in source and binary forms,
|
||||||
|
with or without modification, is permitted provided that the
|
||||||
|
following conditions are met:
|
||||||
|
|
||||||
|
- Neither the names of NCAR's Computational and Information Systems
|
||||||
|
Laboratory, the University Corporation for Atmospheric Research,
|
||||||
|
nor the names of its sponsors or contributors may be used to
|
||||||
|
endorse or promote products derived from this Software without
|
||||||
|
specific prior written permission.
|
||||||
|
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notices, this list of conditions, and the disclaimer below.
|
||||||
|
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer below in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||||
|
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||||
|
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||||
|
SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef PF_SSE2_DBL_H
|
||||||
|
#define PF_SSE2_DBL_H
|
||||||
|
|
||||||
|
//detect sse2 support under MSVC
|
||||||
|
#if defined ( _M_IX86_FP )
|
||||||
|
# if _M_IX86_FP == 2
|
||||||
|
# if !defined(__SSE2__)
|
||||||
|
# define __SSE2__
|
||||||
|
# endif
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
SSE2 64bit support macros
|
||||||
|
*/
|
||||||
|
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined( __SSE4_2__ ) | defined( __SSE4_1__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || defined ( __x86_64__ ) || defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 ))
|
||||||
|
#pragma message (__FILE__ ": SSE2 double macros are defined" )
|
||||||
|
|
||||||
|
#include <emmintrin.h>
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
__m128d d128[2];
|
||||||
|
} m256d;
|
||||||
|
|
||||||
|
typedef m256d v4sf;
|
||||||
|
|
||||||
|
# define SIMD_SZ 4
|
||||||
|
|
||||||
|
typedef union v4sf_union {
|
||||||
|
v4sf v;
|
||||||
|
double f[SIMD_SZ];
|
||||||
|
} v4sf_union;
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * FORCE_INLINE: storage class + forced-inline specifier used by the helper
 * functions of this header.
 *
 * A definition that may already exist is saved with push_macro and then
 * removed, so the #define below never redefines a macro with a different body.
 * NOTE(review): no matching pop_macro is visible in this header - confirm
 * whether one is expected at the end of the file or in the including file.
 */
#if defined(__GNUC__) || defined(__clang__)

#pragma push_macro("FORCE_INLINE")
#undef FORCE_INLINE
#define FORCE_INLINE static inline __attribute__((always_inline))

#elif defined (_MSC_VER)

#pragma push_macro("FORCE_INLINE")
#undef FORCE_INLINE
#define FORCE_INLINE static __forceinline

#else
#error "Macro name collisions may happens with unknown compiler"
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif
#define FORCE_INLINE static inline
#endif
|
||||||
|
|
||||||
|
/* Returns a vector whose two 128-bit halves are both _mm_setzero_pd(). */
FORCE_INLINE m256d mm256_setzero_pd(void)
{
  const __m128d zero = _mm_setzero_pd();
  m256d out;
  out.d128[0] = zero;
  out.d128[1] = zero;
  return out;
}
|
||||||
|
|
||||||
|
/* Element-wise product a*b, computed separately on the low and high halves. */
FORCE_INLINE m256d mm256_mul_pd(m256d a, m256d b)
{
  m256d prod;
  int half;
  for (half = 0; half < 2; ++half)
    prod.d128[half] = _mm_mul_pd(a.d128[half], b.d128[half]);
  return prod;
}
|
||||||
|
|
||||||
|
/* Element-wise sum a+b, computed separately on the low and high halves. */
FORCE_INLINE m256d mm256_add_pd(m256d a, m256d b)
{
  m256d sum;
  int half;
  for (half = 0; half < 2; ++half)
    sum.d128[half] = _mm_add_pd(a.d128[half], b.d128[half]);
  return sum;
}
|
||||||
|
|
||||||
|
/* Element-wise difference a-b, computed separately on the low and high halves. */
FORCE_INLINE m256d mm256_sub_pd(m256d a, m256d b)
{
  m256d diff;
  int half;
  for (half = 0; half < 2; ++half)
    diff.d128[half] = _mm_sub_pd(a.d128[half], b.d128[half]);
  return diff;
}
|
||||||
|
|
||||||
|
/* Broadcasts the scalar a into both halves via _mm_set1_pd(). */
FORCE_INLINE m256d mm256_set1_pd(double a)
{
  const __m128d splat = _mm_set1_pd(a);
  m256d out;
  out.d128[0] = splat;
  out.d128[1] = splat;
  return out;
}
|
||||||
|
|
||||||
|
/* Loads 4 consecutive doubles starting at mem_addr with two _mm_load_pd()
 * (aligned-load intrinsic) calls: elements 0..1, then elements 2..3. */
FORCE_INLINE m256d mm256_load_pd (double const * mem_addr)
{
  const double * const lo = mem_addr;
  const double * const hi = mem_addr + 2;
  m256d loaded;
  loaded.d128[0] = _mm_load_pd(lo);
  loaded.d128[1] = _mm_load_pd(hi);
  return loaded;
}
|
||||||
|
/* Loads 4 consecutive doubles starting at mem_addr with two _mm_loadu_pd()
 * (unaligned-load intrinsic) calls: elements 0..1, then elements 2..3. */
FORCE_INLINE m256d mm256_loadu_pd (double const * mem_addr)
{
  const double * const lo = mem_addr;
  const double * const hi = mem_addr + 2;
  m256d loaded;
  loaded.d128[0] = _mm_loadu_pd(lo);
  loaded.d128[1] = _mm_loadu_pd(hi);
  return loaded;
}
|
||||||
|
|
||||||
|
|
||||||
|
/* V* macro layer: arithmetic and load operations mapped onto the mm256_*
 * helper functions defined above. */
#  define VARCH "SSE2"
#  define VREQUIRES_ALIGN 1
#  define VZERO()               mm256_setzero_pd()
#  define VMUL(a,b)             mm256_mul_pd(a,b)
#  define VADD(a,b)             mm256_add_pd(a,b)
/* a*b + c, built from a separate multiply and add */
#  define VMADD(a,b,c)          mm256_add_pd(mm256_mul_pd(a,b), c)
#  define VSUB(a,b)             mm256_sub_pd(a,b)
/* broadcast the scalar p to all 4 elements */
#  define LD_PS1(p)             mm256_set1_pd(p)
#  define VLOAD_UNALIGNED(ptr)  mm256_loadu_pd(ptr)
#  define VLOAD_ALIGNED(ptr)    mm256_load_pd(ptr)
|
||||||
|
|
||||||
|
|
||||||
|
/* Returns the low half (elements 0..1) of a. */
FORCE_INLINE __m128d mm256_castpd256_pd128(m256d a)
{
  const __m128d low = a.d128[0];
  return low;
}
|
||||||
|
|
||||||
|
/* Returns half number imm8 of a: 0 selects elements 0..1, 1 selects
 * elements 2..3. Any other index trips the assertion. */
FORCE_INLINE __m128d mm256_extractf128_pd (m256d a, const int imm8)
{
  assert(imm8 == 0 || imm8 == 1);
  return a.d128[imm8];
}
|
||||||
|
/* Returns a copy of a whose high half (index 1) is replaced by b. */
FORCE_INLINE m256d mm256_insertf128_pd_1(m256d a, __m128d b)
{
  m256d merged = a;
  merged.d128[1] = b;
  return merged;
}
|
||||||
|
/* Widens a into a 4-double vector: a becomes the low half.
 * The high half is zeroed so that the returned struct never carries an
 * uninitialized member (the previous version left d128[1] indeterminate);
 * the callers visible in this header overwrite the high half right away
 * through mm256_insertf128_pd_1(). */
FORCE_INLINE m256d mm256_castpd128_pd256(__m128d a)
{
  m256d res;
  res.d128[0] = a;
  res.d128[1] = _mm_setzero_pd();
  return res;
}
|
||||||
|
|
||||||
|
/* Per half, applies _mm_shuffle_pd(a, b, 0): result is
 * [ a[0], b[0], a[2], b[2] ]. */
FORCE_INLINE m256d mm256_shuffle_pd_00(m256d a, m256d b)
{
  m256d picked;
  int half;
  for (half = 0; half < 2; ++half)
    picked.d128[half] = _mm_shuffle_pd(a.d128[half], b.d128[half], 0);
  return picked;
}
|
||||||
|
|
||||||
|
/* Per half, applies _mm_shuffle_pd(a, b, 3): result is
 * [ a[1], b[1], a[3], b[3] ]. */
FORCE_INLINE m256d mm256_shuffle_pd_11(m256d a, m256d b)
{
  m256d picked;
  int half;
  for (half = 0; half < 2; ++half)
    picked.d128[half] = _mm_shuffle_pd(a.d128[half], b.d128[half], 3);
  return picked;
}
|
||||||
|
|
||||||
|
/* Combines the low halves: result is [ a.low, b.low ]. */
FORCE_INLINE m256d mm256_permute2f128_pd_0x20(m256d a, m256d b)
{
  m256d lows;
  lows.d128[1] = b.d128[0];
  lows.d128[0] = a.d128[0];
  return lows;
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Combines the high halves: result is [ a.high, b.high ]. */
FORCE_INLINE m256d mm256_permute2f128_pd_0x31(m256d a, m256d b)
{
  m256d highs;
  highs.d128[1] = b.d128[1];
  highs.d128[0] = a.d128[1];
  return highs;
}
|
||||||
|
|
||||||
|
/* Reverses the element order: result is [ x[3], x[2], x[1], x[0] ].
 * Each half is swapped internally with _mm_shuffle_pd(h, h, 1) and the two
 * halves trade places. */
FORCE_INLINE m256d mm256_reverse(m256d x)
{
  const __m128d lo = x.d128[0];
  const __m128d hi = x.d128[1];
  m256d flipped;
  flipped.d128[0] = _mm_shuffle_pd(hi, hi, 1);
  flipped.d128[1] = _mm_shuffle_pd(lo, lo, 1);
  return flipped;
}
|
||||||
|
|
||||||
|
/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
out1 = [ in1[0], in2[0], in1[1], in2[1] ]
out2 = [ in1[2], in2[2], in1[3], in2[3] ]

All four halves are read into locals before any output is written, and out1
is assigned last, so the outputs may alias the inputs.
*/
#  define INTERLEAVE2(in1, in2, out1, out2) {                           \
    const __m128d a_lo__ = mm256_castpd256_pd128(in1);                  \
    const __m128d b_lo__ = mm256_castpd256_pd128(in2);                  \
    const __m128d a_hi__ = mm256_extractf128_pd(in1, 1);                \
    const __m128d b_hi__ = mm256_extractf128_pd(in2, 1);                \
    m256d first__, second__;                                            \
    first__.d128[0]  = _mm_shuffle_pd(a_lo__, b_lo__, 0);               \
    first__.d128[1]  = _mm_shuffle_pd(a_lo__, b_lo__, 3);               \
    second__.d128[0] = _mm_shuffle_pd(a_hi__, b_hi__, 0);               \
    second__.d128[1] = _mm_shuffle_pd(a_hi__, b_hi__, 3);               \
    out2 = second__;                                                    \
    out1 = first__;                                                     \
}
|
||||||
|
|
||||||
|
/* UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
out1 = [ in1[0], in1[2], in2[0], in2[2] ]
out2 = [ in1[1], in1[3], in2[1], in2[3] ]

All four halves are read into locals before any output is written, and out1
is assigned last, so the outputs may alias the inputs.
*/
#  define UNINTERLEAVE2(in1, in2, out1, out2) {                         \
    const __m128d a_lo__ = mm256_castpd256_pd128(in1);                  \
    const __m128d b_lo__ = mm256_castpd256_pd128(in2);                  \
    const __m128d a_hi__ = mm256_extractf128_pd(in1, 1);                \
    const __m128d b_hi__ = mm256_extractf128_pd(in2, 1);                \
    m256d evens__, odds__;                                              \
    evens__.d128[0] = _mm_shuffle_pd(a_lo__, a_hi__, 0);                \
    evens__.d128[1] = _mm_shuffle_pd(b_lo__, b_hi__, 0);                \
    odds__.d128[0]  = _mm_shuffle_pd(a_lo__, a_hi__, 3);                \
    odds__.d128[1]  = _mm_shuffle_pd(b_lo__, b_hi__, 3);                \
    out2 = odds__;                                                      \
    out1 = evens__;                                                     \
}
|
||||||
|
|
||||||
|
/* VTRANSPOSE4(row0, row1, row2, row3): in-place 4x4 transpose, afterwards
   rowK = [ old row0[K], old row1[K], old row2[K], old row3[K] ].

   The temporaries carry a vt4_ prefix and a trailing "__" so that they cannot
   shadow a caller variable passed as an argument (plain tmp0..tmp3, as used
   before, would silently break VTRANSPOSE4(tmp0, ...)).
*/
#  define VTRANSPOSE4(row0, row1, row2, row3) {                         \
    m256d vt4_t0__, vt4_t1__, vt4_t2__, vt4_t3__;                       \
                                                                        \
    vt4_t0__ = mm256_shuffle_pd_00((row0),(row1));                      \
    vt4_t2__ = mm256_shuffle_pd_11((row0),(row1));                      \
    vt4_t1__ = mm256_shuffle_pd_00((row2),(row3));                      \
    vt4_t3__ = mm256_shuffle_pd_11((row2),(row3));                      \
                                                                        \
    (row0) = mm256_permute2f128_pd_0x20(vt4_t0__, vt4_t1__);            \
    (row1) = mm256_permute2f128_pd_0x20(vt4_t2__, vt4_t3__);            \
    (row2) = mm256_permute2f128_pd_0x31(vt4_t0__, vt4_t1__);            \
    (row3) = mm256_permute2f128_pd_0x31(vt4_t2__, vt4_t3__);            \
}
|
||||||
|
|
||||||
|
/* VSWAPHL(a, b) pseudo code:
return [ b[0], b[1], a[2], a[3] ]
*/
#  define VSWAPHL(a,b) \
    mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_castpd256_pd128(b)), mm256_extractf128_pd(a, 1))

/* reverse/flip all doubles: [ a[3], a[2], a[1], a[0] ] */
#  define VREV_S(a)    mm256_reverse(a)

/* reverse/flip complex doubles (swap the two halves): [ a[2], a[3], a[0], a[1] ] */
#  define VREV_C(a)    mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_extractf128_pd(a, 1)), mm256_castpd256_pd128(a))

/* true when ptr is a multiple of 32 bytes */
#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
|
||||||
|
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
5956
pffft/sse2neon.h
Normal file
5956
pffft/sse2neon.h
Normal file
File diff suppressed because it is too large
Load Diff
142
pffft/test_fft_factors.c
Normal file
142
pffft/test_fft_factors.c
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
|
||||||
|
#ifdef PFFFT_ENABLE_FLOAT
|
||||||
|
#include "pffft.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef PFFFT_ENABLE_DOUBLE
|
||||||
|
#include "pffft_double.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <assert.h>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef PFFFT_ENABLE_FLOAT
|
||||||
|
int test_float(int TL)
|
||||||
|
{
|
||||||
|
PFFFT_Setup * S;
|
||||||
|
|
||||||
|
for (int dir_i = 0; dir_i <= 1; ++dir_i)
|
||||||
|
{
|
||||||
|
for (int cplx_i = 0; cplx_i <= 1; ++cplx_i)
|
||||||
|
{
|
||||||
|
const pffft_direction_t dir = (!dir_i) ? PFFFT_FORWARD : PFFFT_BACKWARD;
|
||||||
|
const pffft_transform_t cplx = (!cplx_i) ? PFFFT_REAL : PFFFT_COMPLEX;
|
||||||
|
const int N_min = pffft_min_fft_size(cplx);
|
||||||
|
const int N_max = N_min * 11 + N_min;
|
||||||
|
int NTL = pffft_nearest_transform_size(TL, cplx, (!dir_i));
|
||||||
|
double near_off = (NTL - TL) * 100.0 / (double)TL;
|
||||||
|
|
||||||
|
fprintf(stderr, "testing float, %s, %s ..\tminimum transform %d; nearest transform for %d is %d (%.2f%% off)\n",
|
||||||
|
(!dir_i) ? "FORWARD" : "BACKWARD", (!cplx_i) ? "REAL" : "COMPLEX", N_min, TL, NTL, near_off );
|
||||||
|
|
||||||
|
for (int N = (N_min/2); N <= N_max; N += (N_min/2))
|
||||||
|
{
|
||||||
|
int R = N, f2 = 0, f3 = 0, f5 = 0, tmp_f;
|
||||||
|
const int factorizable = pffft_is_valid_size(N, cplx);
|
||||||
|
while (R >= 5*N_min && (R % 5) == 0) { R /= 5; ++f5; }
|
||||||
|
while (R >= 3*N_min && (R % 3) == 0) { R /= 3; ++f3; }
|
||||||
|
while (R >= 2*N_min && (R % 2) == 0) { R /= 2; ++f2; }
|
||||||
|
tmp_f = (R == N_min) ? 1 : 0;
|
||||||
|
assert( factorizable == tmp_f );
|
||||||
|
|
||||||
|
S = pffft_new_setup(N, cplx);
|
||||||
|
|
||||||
|
if ( S && !factorizable )
|
||||||
|
{
|
||||||
|
fprintf(stderr, "fft setup successful, but NOT factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
else if ( !S && factorizable)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "fft setup UNsuccessful, but factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (S)
|
||||||
|
pffft_destroy_setup(S);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef PFFFT_ENABLE_DOUBLE
|
||||||
|
int test_double(int TL)
|
||||||
|
{
|
||||||
|
PFFFTD_Setup * S;
|
||||||
|
for (int dir_i = 0; dir_i <= 1; ++dir_i)
|
||||||
|
{
|
||||||
|
for (int cplx_i = 0; cplx_i <= 1; ++cplx_i)
|
||||||
|
{
|
||||||
|
const pffft_direction_t dir = (!dir_i) ? PFFFT_FORWARD : PFFFT_BACKWARD;
|
||||||
|
const pffft_transform_t cplx = (!cplx_i) ? PFFFT_REAL : PFFFT_COMPLEX;
|
||||||
|
const int N_min = pffftd_min_fft_size(cplx);
|
||||||
|
const int N_max = N_min * 11 + N_min;
|
||||||
|
int NTL = pffftd_nearest_transform_size(TL, cplx, (!dir_i));
|
||||||
|
double near_off = (NTL - TL) * 100.0 / (double)TL;
|
||||||
|
|
||||||
|
fprintf(stderr, "testing double, %s, %s ..\tminimum transform %d; nearest transform for %d is %d (%.2f%% off)\n",
|
||||||
|
(!dir_i) ? "FORWARD" : "BACKWARD", (!cplx_i) ? "REAL" : "COMPLEX", N_min, TL, NTL, near_off );
|
||||||
|
|
||||||
|
for (int N = (N_min/2); N <= N_max; N += (N_min/2))
|
||||||
|
{
|
||||||
|
int R = N, f2 = 0, f3 = 0, f5 = 0, tmp_f;
|
||||||
|
const int factorizable = pffftd_is_valid_size(N, cplx);
|
||||||
|
while (R >= 5*N_min && (R % 5) == 0) { R /= 5; ++f5; }
|
||||||
|
while (R >= 3*N_min && (R % 3) == 0) { R /= 3; ++f3; }
|
||||||
|
while (R >= 2*N_min && (R % 2) == 0) { R /= 2; ++f2; }
|
||||||
|
tmp_f = (R == N_min) ? 1 : 0;
|
||||||
|
assert( factorizable == tmp_f );
|
||||||
|
|
||||||
|
S = pffftd_new_setup(N, cplx);
|
||||||
|
|
||||||
|
if ( S && !factorizable )
|
||||||
|
{
|
||||||
|
fprintf(stderr, "fft setup successful, but NOT factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
else if ( !S && factorizable)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "fft setup UNsuccessful, but factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (S)
|
||||||
|
pffftd_destroy_setup(S);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Entry point: the optional first command line argument is parsed with atoi()
 * and handed to the enabled test functions (2 is used when it is absent).
 * The float test runs first; a non-zero result is returned immediately.
 * Otherwise the result of the double test (or 0) is the exit code.
 */
int main(int argc, char *argv[])
{
  int N = 2;
  int status = 0;

  if (argc > 1)
    N = atoi(argv[1]);

#ifdef PFFFT_ENABLE_FLOAT
  status = test_float(N);
  if (status != 0)
    return status;
#endif

#ifdef PFFFT_ENABLE_DOUBLE
  status = test_double(N);
#endif

  return status;
}
|
||||||
|
|
||||||
991
pffft/test_pffastconv.c
Normal file
991
pffft/test_pffastconv.c
Normal file
@@ -0,0 +1,991 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2013 Julien Pommier.
|
||||||
|
Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de )
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define _WANT_SNAN 1
|
||||||
|
|
||||||
|
#include "pffft.h"
|
||||||
|
#include "pffastconv.h"
|
||||||
|
|
||||||
|
#include <math.h>
|
||||||
|
#include <float.h>
|
||||||
|
#include <limits.h>
|
||||||
|
#include <inttypes.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <time.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#ifdef HAVE_SYS_TIMES
|
||||||
|
# include <sys/times.h>
|
||||||
|
# include <unistd.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* benchmark duration: 500 ms (0.5 s) */
|
||||||
|
#define BENCH_TEST_DURATION_IN_SEC 0.5
|
||||||
|
|
||||||
|
/*
|
||||||
|
vector support macros: the rest of the code is independent of
|
||||||
|
SSE/Altivec/NEON -- adding support for other platforms with 4-element
|
||||||
|
vectors should be limited to these macros
|
||||||
|
*/
|
||||||
|
#if 0
|
||||||
|
#include "simd/pf_float.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* RESTRICT: the __restrict qualifier on MSVC and GNU-compatible compilers,
 * empty on every other compiler. */
#if defined(_MSC_VER) || defined(__GNUC__)
#  define RESTRICT __restrict
#else
#  define RESTRICT
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
#pragma warning( disable : 4244 )
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/* INVALID_FLOAT_VAL: sentinel value written behind the filter coefficients.
 * The first macro that is available wins, in this order:
 * SNANF, SNAN, NAN, INFINITY, FLT_MAX. */
#if defined(SNANF)
#  define INVALID_FLOAT_VAL  SNANF
#elif defined(SNAN)
#  define INVALID_FLOAT_VAL  SNAN
#elif defined(NAN)
#  define INVALID_FLOAT_VAL  NAN
#elif defined(INFINITY)
#  define INVALID_FLOAT_VAL  INFINITY
#else
#  define INVALID_FLOAT_VAL  FLT_MAX
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * uclock_sec(): returns a time stamp in seconds, used to measure durations.
 *
 * With HAVE_SYS_TIMES the user CPU time of this process is taken from
 * times() and scaled by the clock tick rate from sysconf(_SC_CLK_TCK);
 * the tick rate is queried once and cached in a static variable.
 * Without HAVE_SYS_TIMES the value is clock() / CLOCKS_PER_SEC.
 *
 * Both variants are ordinary external functions: a plain C99 'inline'
 * definition would not provide an external definition for the linker.
 */
#if defined(HAVE_SYS_TIMES)
double uclock_sec(void) {
  static double ttclk = 0.;
  struct tms t;
  if (ttclk == 0.)
    ttclk = (double)sysconf(_SC_CLK_TCK);
  times(&t);
  /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
  return ((double)t.tms_utime) / ttclk;
}
#else
double uclock_sec(void)
{ return (double)clock()/(double)CLOCKS_PER_SEC; }
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* Convolution routine: reads len input samples from X and writes the results
 * to Y. The slow_conv_* implementations in this file return the number of
 * output samples they produced and ignore Yref and applyFlush. */
typedef int            (*pfnConvolution)  (void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush);

/* Creates an opaque setup object from the Nf coefficients in Hfwd. */
typedef void*          (*pfnConvSetup)    (float *Hfwd, int Nf, int * BlkLen, int flags);

/* Returns the convolution routine associated with a setup object. */
typedef pfnConvolution (*pfnGetConvFnPtr) (void * setup);

/* Releases a setup object. */
typedef void           (*pfnConvDestroy)  (void * setup);
|
||||||
|
|
||||||
|
|
||||||
|
struct ConvSetup
|
||||||
|
{
|
||||||
|
pfnConvolution pfn;
|
||||||
|
int N;
|
||||||
|
int B;
|
||||||
|
float * H;
|
||||||
|
int flags;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
void * convSetupRev( float * H, int N, int * BlkLen, int flags )
|
||||||
|
{
|
||||||
|
struct ConvSetup * s = pffastconv_malloc( sizeof(struct ConvSetup) );
|
||||||
|
int i, Nr = N;
|
||||||
|
if (flags & PFFASTCONV_CPLX_INP_OUT)
|
||||||
|
Nr *= 2;
|
||||||
|
Nr += 4;
|
||||||
|
s->pfn = NULL;
|
||||||
|
s->N = N;
|
||||||
|
s->B = *BlkLen;
|
||||||
|
s->H = pffastconv_malloc((unsigned)Nr * sizeof(float));
|
||||||
|
s->flags = flags;
|
||||||
|
memset(s->H, 0, (unsigned)Nr * sizeof(float));
|
||||||
|
if (flags & PFFASTCONV_CPLX_INP_OUT)
|
||||||
|
{
|
||||||
|
for ( i = 0; i < N; ++i ) {
|
||||||
|
s->H[2*(N-1 -i) ] = H[i];
|
||||||
|
s->H[2*(N-1 -i)+1] = H[i];
|
||||||
|
}
|
||||||
|
/* simpler detection of overruns */
|
||||||
|
s->H[ 2*N ] = INVALID_FLOAT_VAL;
|
||||||
|
s->H[ 2*N +1 ] = INVALID_FLOAT_VAL;
|
||||||
|
s->H[ 2*N +2 ] = INVALID_FLOAT_VAL;
|
||||||
|
s->H[ 2*N +3 ] = INVALID_FLOAT_VAL;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for ( i = 0; i < N; ++i )
|
||||||
|
s->H[ N-1 -i ] = H[i];
|
||||||
|
/* simpler detection of overruns */
|
||||||
|
s->H[ N ] = INVALID_FLOAT_VAL;
|
||||||
|
s->H[ N +1 ] = INVALID_FLOAT_VAL;
|
||||||
|
s->H[ N +2 ] = INVALID_FLOAT_VAL;
|
||||||
|
s->H[ N +3 ] = INVALID_FLOAT_VAL;
|
||||||
|
}
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
void convDestroyRev( void * setup )
|
||||||
|
{
|
||||||
|
struct ConvSetup * s = (struct ConvSetup*)setup;
|
||||||
|
pffastconv_free(s->H);
|
||||||
|
pffastconv_free(setup);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
pfnConvolution ConvGetFnPtrRev( void * setup )
|
||||||
|
{
|
||||||
|
struct ConvSetup * s = (struct ConvSetup*)setup;
|
||||||
|
if (!s)
|
||||||
|
return NULL;
|
||||||
|
return s->pfn;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Destroy callback that forwards to convDestroyRev(). */
void convSimdDestroy( void * setup )
{
  /* same teardown as for the reversed-coefficient setup */
  convDestroyRev( setup );
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Creates a setup through pffastconv_new_setup() and returns it.
 * When the library returns NULL, the parameters are printed to stdout
 * and NULL is passed on to the caller. */
void * fastConvSetup( float * H, int N, int * BlkLen, int flags )
{
  void * const created = pffastconv_new_setup( H, N, BlkLen, flags );
  if (created == NULL)
  {
    printf("fastConvSetup(N = %d, *BlkLen = %d, flags = %d) = NULL\n", N, *BlkLen, flags);
  }
  return created;
}
|
||||||
|
|
||||||
|
|
||||||
|
void fastConvDestroy( void * setup )
|
||||||
|
{
|
||||||
|
pffastconv_destroy_setup( (PFFASTCONV_Setup*)setup );
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int slow_conv_R(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
|
||||||
|
{
|
||||||
|
struct ConvSetup * p = (struct ConvSetup*)setup;
|
||||||
|
const float * RESTRICT X = input;
|
||||||
|
const float * RESTRICT Hrev = p->H;
|
||||||
|
float * RESTRICT Y = output;
|
||||||
|
const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
|
||||||
|
const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
|
||||||
|
int i, j;
|
||||||
|
(void)Yref;
|
||||||
|
(void)applyFlush;
|
||||||
|
|
||||||
|
if (p->flags & PFFASTCONV_CPLX_INP_OUT)
|
||||||
|
{
|
||||||
|
for ( i = 0; i <= lenNr; i += 2 )
|
||||||
|
{
|
||||||
|
float sumRe = 0.0F, sumIm = 0.0F;
|
||||||
|
for ( j = 0; j < Nr; j += 2 )
|
||||||
|
{
|
||||||
|
sumRe += X[i+j ] * Hrev[j];
|
||||||
|
sumIm += X[i+j+1] * Hrev[j+1];
|
||||||
|
}
|
||||||
|
Y[i ] = sumRe;
|
||||||
|
Y[i+1] = sumIm;
|
||||||
|
}
|
||||||
|
return i/2;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for ( i = 0; i <= lenNr; ++i )
|
||||||
|
{
|
||||||
|
float sum = 0.0F;
|
||||||
|
for (j = 0; j < Nr; ++j )
|
||||||
|
sum += X[i+j] * Hrev[j];
|
||||||
|
Y[i] = sum;
|
||||||
|
}
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int slow_conv_A(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
|
||||||
|
{
|
||||||
|
float sum[4];
|
||||||
|
struct ConvSetup * p = (struct ConvSetup*)setup;
|
||||||
|
const float * RESTRICT X = input;
|
||||||
|
const float * RESTRICT Hrev = p->H;
|
||||||
|
float * RESTRICT Y = output;
|
||||||
|
const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
|
||||||
|
const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
|
||||||
|
int i, j;
|
||||||
|
(void)Yref;
|
||||||
|
(void)applyFlush;
|
||||||
|
|
||||||
|
if (p->flags & PFFASTCONV_CPLX_INP_OUT)
|
||||||
|
{
|
||||||
|
if ( (Nr & 3) == 0 )
|
||||||
|
{
|
||||||
|
for ( i = 0; i <= lenNr; i += 2 )
|
||||||
|
{
|
||||||
|
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||||||
|
for (j = 0; j < Nr; j += 4 )
|
||||||
|
{
|
||||||
|
sum[0] += X[i+j] * Hrev[j];
|
||||||
|
sum[1] += X[i+j+1] * Hrev[j+1];
|
||||||
|
sum[2] += X[i+j+2] * Hrev[j+2];
|
||||||
|
sum[3] += X[i+j+3] * Hrev[j+3];
|
||||||
|
}
|
||||||
|
Y[i ] = sum[0] + sum[2];
|
||||||
|
Y[i+1] = sum[1] + sum[3];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const int M = Nr & (~3);
|
||||||
|
for ( i = 0; i <= lenNr; i += 2 )
|
||||||
|
{
|
||||||
|
float tailSumRe = 0.0F, tailSumIm = 0.0F;
|
||||||
|
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||||||
|
for (j = 0; j < M; j += 4 )
|
||||||
|
{
|
||||||
|
sum[0] += X[i+j ] * Hrev[j ];
|
||||||
|
sum[1] += X[i+j+1] * Hrev[j+1];
|
||||||
|
sum[2] += X[i+j+2] * Hrev[j+2];
|
||||||
|
sum[3] += X[i+j+3] * Hrev[j+3];
|
||||||
|
}
|
||||||
|
for ( ; j < Nr; j += 2 ) {
|
||||||
|
tailSumRe += X[i+j ] * Hrev[j ];
|
||||||
|
tailSumIm += X[i+j+1] * Hrev[j+1];
|
||||||
|
}
|
||||||
|
Y[i ] = ( sum[0] + sum[2] ) + tailSumRe;
|
||||||
|
Y[i+1] = ( sum[1] + sum[3] ) + tailSumIm;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return i/2;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if ( (Nr & 3) == 0 )
|
||||||
|
{
|
||||||
|
for ( i = 0; i <= lenNr; ++i )
|
||||||
|
{
|
||||||
|
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||||||
|
for (j = 0; j < Nr; j += 4 )
|
||||||
|
{
|
||||||
|
sum[0] += X[i+j] * Hrev[j];
|
||||||
|
sum[1] += X[i+j+1] * Hrev[j+1];
|
||||||
|
sum[2] += X[i+j+2] * Hrev[j+2];
|
||||||
|
sum[3] += X[i+j+3] * Hrev[j+3];
|
||||||
|
}
|
||||||
|
Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
|
||||||
|
}
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const int M = Nr & (~3);
|
||||||
|
/* printf("A: Nr = %d, M = %d, H[M] = %f, H[M+1] = %f, H[M+2] = %f, H[M+3] = %f\n", Nr, M, Hrev[M], Hrev[M+1], Hrev[M+2], Hrev[M+3] ); */
|
||||||
|
for ( i = 0; i <= lenNr; ++i )
|
||||||
|
{
|
||||||
|
float tailSum = 0.0;
|
||||||
|
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||||||
|
for (j = 0; j < M; j += 4 )
|
||||||
|
{
|
||||||
|
sum[0] += X[i+j] * Hrev[j];
|
||||||
|
sum[1] += X[i+j+1] * Hrev[j+1];
|
||||||
|
sum[2] += X[i+j+2] * Hrev[j+2];
|
||||||
|
sum[3] += X[i+j+3] * Hrev[j+3];
|
||||||
|
}
|
||||||
|
for ( ; j < Nr; ++j )
|
||||||
|
tailSum += X[i+j] * Hrev[j];
|
||||||
|
Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
|
||||||
|
}
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int slow_conv_B(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
|
||||||
|
{
|
||||||
|
float sum[4];
|
||||||
|
struct ConvSetup * p = (struct ConvSetup*)setup;
|
||||||
|
(void)Yref;
|
||||||
|
(void)applyFlush;
|
||||||
|
if (p->flags & PFFASTCONV_SYMMETRIC)
|
||||||
|
{
|
||||||
|
const float * RESTRICT X = input;
|
||||||
|
const float * RESTRICT Hrev = p->H;
|
||||||
|
float * RESTRICT Y = output;
|
||||||
|
const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
|
||||||
|
const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
|
||||||
|
const int h = Nr / 2 -4;
|
||||||
|
const int E = Nr -4;
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
if (p->flags & PFFASTCONV_CPLX_INP_OUT)
|
||||||
|
{
|
||||||
|
for ( i = 0; i <= lenNr; i += 2 )
|
||||||
|
{
|
||||||
|
const int k = i + E;
|
||||||
|
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||||||
|
for (j = 0; j <= h; j += 4 )
|
||||||
|
{
|
||||||
|
sum[0] += Hrev[j ] * ( X[i+j ] + X[k-j+2] );
|
||||||
|
sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+3] );
|
||||||
|
sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j ] );
|
||||||
|
sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j+1] );
|
||||||
|
}
|
||||||
|
Y[i ] = sum[0] + sum[2];
|
||||||
|
Y[i+1] = sum[1] + sum[3];
|
||||||
|
}
|
||||||
|
return i/2;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for ( i = 0; i <= lenNr; ++i )
|
||||||
|
{
|
||||||
|
const int k = i + E;
|
||||||
|
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||||||
|
for (j = 0; j <= h; j += 4 )
|
||||||
|
{
|
||||||
|
sum[0] += Hrev[j ] * ( X[i+j ] + X[k-j+3] );
|
||||||
|
sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+2] );
|
||||||
|
sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j+1] );
|
||||||
|
sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j ] );
|
||||||
|
}
|
||||||
|
Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
|
||||||
|
}
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const float * RESTRICT X = input;
|
||||||
|
const float * RESTRICT Hrev = p->H;
|
||||||
|
float * RESTRICT Y = output;
|
||||||
|
const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
|
||||||
|
const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
if (p->flags & PFFASTCONV_CPLX_INP_OUT)
|
||||||
|
{
|
||||||
|
for ( i = 0; i <= lenNr; i += 2 )
|
||||||
|
{
|
||||||
|
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||||||
|
for (j = 0; j < Nr; j += 4 )
|
||||||
|
{
|
||||||
|
sum[0] += X[i+j] * Hrev[j];
|
||||||
|
sum[1] += X[i+j+1] * Hrev[j+1];
|
||||||
|
sum[2] += X[i+j+2] * Hrev[j+2];
|
||||||
|
sum[3] += X[i+j+3] * Hrev[j+3];
|
||||||
|
}
|
||||||
|
Y[i ] = sum[0] + sum[2];
|
||||||
|
Y[i+1] = sum[1] + sum[3];
|
||||||
|
}
|
||||||
|
return i/2;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if ( (Nr & 3) == 0 )
|
||||||
|
{
|
||||||
|
for ( i = 0; i <= lenNr; ++i )
|
||||||
|
{
|
||||||
|
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||||||
|
for (j = 0; j < Nr; j += 4 )
|
||||||
|
{
|
||||||
|
sum[0] += X[i+j] * Hrev[j];
|
||||||
|
sum[1] += X[i+j+1] * Hrev[j+1];
|
||||||
|
sum[2] += X[i+j+2] * Hrev[j+2];
|
||||||
|
sum[3] += X[i+j+3] * Hrev[j+3];
|
||||||
|
}
|
||||||
|
Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]);
|
||||||
|
}
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const int M = Nr & (~3);
|
||||||
|
/* printf("B: Nr = %d\n", Nr ); */
|
||||||
|
for ( i = 0; i <= lenNr; ++i )
|
||||||
|
{
|
||||||
|
float tailSum = 0.0;
|
||||||
|
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||||||
|
for (j = 0; j < M; j += 4 )
|
||||||
|
{
|
||||||
|
sum[0] += X[i+j] * Hrev[j];
|
||||||
|
sum[1] += X[i+j+1] * Hrev[j+1];
|
||||||
|
sum[2] += X[i+j+2] * Hrev[j+2];
|
||||||
|
sum[3] += X[i+j+3] * Hrev[j+3];
|
||||||
|
}
|
||||||
|
for ( ; j < Nr; ++j )
|
||||||
|
tailSum += X[i+j] * Hrev[j];
|
||||||
|
Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
|
||||||
|
}
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int fast_conv(void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush)
|
||||||
|
{
|
||||||
|
(void)Yref;
|
||||||
|
return pffastconv_apply( (PFFASTCONV_Setup*)setup, X, len, Y, applyFlush );
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* Debug helper: dump the first N values of V, 'perLine' values per row, each
 * row prefixed with the label 'st' and the index of its first value.
 *
 * The dump is compiled out, so the function currently does nothing.
 * Flip the '#if 0' below to '#if 1' to enable the output while debugging.
 */
void printFirst( const float * V, const char * st, const int N, const int perLine )
{
#if 0
  int i;
  for ( i = 0; i < N; ++i )
  {
    if ( (i % perLine) == 0 )
      printf("\n%s[%d]", st, i);
    printf("\t%.1f", V[i]);
  }
  printf("\n");
#else
  /* disabled: just silence the unused-parameter warnings */
  (void)V;
  (void)st;
  (void)N;
  (void)perLine;
#endif
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* Number of slots in the per-algorithm tables of test() (setup/destroy/apply
 * callbacks, names, block lengths, results).
 * NOTE(review): the table initializers in test() list 14 entries, so the last
 * slot is zero-initialized and never runs - confirm whether 15 is intended. */
#define NUMY 15
|
||||||
|
|
||||||
|
|
||||||
|
int test(int FILTERLEN, int convFlags, const int testOutLen, int printDbg, int printSpeed, int abortFirstFastAlgo, int printErrValues, int printAsCSV, int *pIsFirstFilterLen) {
|
||||||
|
double t0, t1, tstop, td, tdref;
|
||||||
|
float *X, *H;
|
||||||
|
float *Y[NUMY];
|
||||||
|
int64_t outN[NUMY];
|
||||||
|
/* 256 KFloats or 16 MFloats data */
|
||||||
|
#if 1
|
||||||
|
const int len = testOutLen ? (1 << 18) : (1 << 24);
|
||||||
|
#elif 0
|
||||||
|
const int len = testOutLen ? (1 << 18) : (1 << 13);
|
||||||
|
#else
|
||||||
|
const int len = testOutLen ? (1 << 18) : (1024);
|
||||||
|
#endif
|
||||||
|
const int cplxFactor = ( convFlags & PFFASTCONV_CPLX_INP_OUT ) ? 2 : 1;
|
||||||
|
const int lenC = len / cplxFactor;
|
||||||
|
|
||||||
|
int yi, yc, posMaxErr;
|
||||||
|
float yRangeMin, yRangeMax, yErrLimit, maxErr = 0.0;
|
||||||
|
int i, j, numErrOverLimit, iter;
|
||||||
|
int retErr = 0;
|
||||||
|
|
||||||
|
/* 0 1 2 3 4 5 6 7 8 9, 10, 11, 12, 13 */
|
||||||
|
pfnConvSetup aSetup[NUMY] = { convSetupRev, convSetupRev, convSetupRev, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, };
|
||||||
|
pfnConvDestroy aDestroy[NUMY] = { convDestroyRev, convDestroyRev, convDestroyRev, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, };
|
||||||
|
pfnGetConvFnPtr aGetFnPtr[NUMY] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, };
|
||||||
|
pfnConvolution aConv[NUMY] = { slow_conv_R, slow_conv_A, slow_conv_B, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, };
|
||||||
|
const char * convText[NUMY] = { "R(non-simd)", "A(non-simd)", "B(non-simd)", "fast_conv_64", "fast_conv_128", "fast_conv_256", "fast_conv_512", "fast_conv_1K", "fast_conv_2K", "fast_conv_4K", "fast_conv_8K", "fast_conv_16K", "fast_conv_32K", "fast_conv_64K", };
|
||||||
|
int aFastAlgo[NUMY] = { 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, };
|
||||||
|
void * aSetupCfg[NUMY] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, };
|
||||||
|
//int aBlkLen[NUMY] = { 1024, 1024, 1024, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, };
|
||||||
|
int aBlkLen[NUMY] = { 8192, 8192, 8192, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, };
|
||||||
|
#if 1
|
||||||
|
int aRunAlgo[NUMY] = { 1, 1, 1, FILTERLEN<64, FILTERLEN<128, FILTERLEN<256, FILTERLEN<512, FILTERLEN<1024, FILTERLEN<2048, FILTERLEN<4096, FILTERLEN<8192, FILTERLEN<16384, FILTERLEN<32768, FILTERLEN<65536, };
|
||||||
|
#elif 0
|
||||||
|
int aRunAlgo[NUMY] = { 1, 0, 0, 0 && FILTERLEN<64, 1 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048, 0 && FILTERLEN<4096, 0 && FILTERLEN<8192, 0 && FILTERLEN<16384, 0 && FILTERLEN<32768, 0 && FILTERLEN<65536, };
|
||||||
|
#else
|
||||||
|
int aRunAlgo[NUMY] = { 1, 1, 1, 0 && FILTERLEN<64, 0 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048, 0 && FILTERLEN<4096, 0 && FILTERLEN<8192, 0 && FILTERLEN<16384, 0 && FILTERLEN<32768, 0 && FILTERLEN<65536, };
|
||||||
|
#endif
|
||||||
|
double aSpeedFactor[NUMY], aDuration[NUMY], procSmpPerSec[NUMY];
|
||||||
|
int aNumIters[NUMY], aNumLoops[NUMY];
|
||||||
|
|
||||||
|
X = pffastconv_malloc( (unsigned)(len+4) * sizeof(float) );
|
||||||
|
for ( i=0; i < NUMY; ++i)
|
||||||
|
{
|
||||||
|
if ( 1 || i < 2 )
|
||||||
|
Y[i] = pffastconv_malloc( (unsigned)len * sizeof(float) );
|
||||||
|
else
|
||||||
|
Y[i] = Y[1];
|
||||||
|
|
||||||
|
Y[i][0] = 123.F; /* test for pffft_zconvolve_no_accu() */
|
||||||
|
aSpeedFactor[i] = -1.0;
|
||||||
|
aDuration[i] = -1.0;
|
||||||
|
procSmpPerSec[i] = -1.0;
|
||||||
|
aNumIters[i] = 0;
|
||||||
|
aNumLoops[i] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
H = pffastconv_malloc((unsigned)FILTERLEN * sizeof(float));
|
||||||
|
|
||||||
|
/* initialize input */
|
||||||
|
if ( convFlags & PFFASTCONV_CPLX_INP_OUT )
|
||||||
|
{
|
||||||
|
for ( i = 0; i < lenC; ++i )
|
||||||
|
{
|
||||||
|
X[2*i ] = (float)(i % 4093); /* 4094 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
|
||||||
|
X[2*i+1] = (float)((i+2048) % 4093);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for ( i = 0; i < len; ++i )
|
||||||
|
X[i] = (float)(i % 4093); /* 4094 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
|
||||||
|
}
|
||||||
|
X[ len ] = INVALID_FLOAT_VAL;
|
||||||
|
X[ len +1 ] = INVALID_FLOAT_VAL;
|
||||||
|
X[ len +2 ] = INVALID_FLOAT_VAL;
|
||||||
|
X[ len +3 ] = INVALID_FLOAT_VAL;
|
||||||
|
|
||||||
|
if (!testOutLen)
|
||||||
|
printFirst( X, "X", 64, 8 );
|
||||||
|
|
||||||
|
/* filter coeffs */
|
||||||
|
memset( H, 0, FILTERLEN * sizeof(float) );
|
||||||
|
#if 1
|
||||||
|
if ( convFlags & PFFASTCONV_SYMMETRIC )
|
||||||
|
{
|
||||||
|
const int half = FILTERLEN / 2;
|
||||||
|
for ( j = 0; j < half; ++j ) {
|
||||||
|
switch (j % 3) {
|
||||||
|
case 0: H[j] = H[FILTERLEN-1-j] = -1.0F; break;
|
||||||
|
case 1: H[j] = H[FILTERLEN-1-j] = 1.0F; break;
|
||||||
|
case 2: H[j] = H[FILTERLEN-1-j] = 0.5F; break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for ( j = 0; j < FILTERLEN; ++j ) {
|
||||||
|
switch (j % 3) {
|
||||||
|
case 0: H[j] = -1.0F; break;
|
||||||
|
case 1: H[j] = 1.0F; break;
|
||||||
|
case 2: H[j] = 0.5F; break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
H[0] = 1.0F;
|
||||||
|
H[FILTERLEN -1] = 1.0F;
|
||||||
|
#endif
|
||||||
|
if (!testOutLen)
|
||||||
|
printFirst( H, "H", FILTERLEN, 8 );
|
||||||
|
|
||||||
|
if (!printAsCSV)
|
||||||
|
{
|
||||||
|
printf("\n");
|
||||||
|
printf("filterLen = %d\t%s%s\t%s:\n", FILTERLEN,
|
||||||
|
((convFlags & PFFASTCONV_CPLX_INP_OUT)?"cplx":"real"),
|
||||||
|
(convFlags & PFFASTCONV_CPLX_INP_OUT)?((convFlags & PFFASTCONV_CPLX_SINGLE_FFT)?" single":" 2x") : "",
|
||||||
|
((convFlags & PFFASTCONV_SYMMETRIC)?"symmetric":"non-sym") );
|
||||||
|
}
|
||||||
|
|
||||||
|
int hadFastAlgo = 0;
|
||||||
|
|
||||||
|
while (1)
|
||||||
|
{
|
||||||
|
|
||||||
|
for ( yi = 0; yi < NUMY; ++yi )
|
||||||
|
{
|
||||||
|
if (!aRunAlgo[yi])
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if ( aFastAlgo[yi] && abortFirstFastAlgo && hadFastAlgo )
|
||||||
|
{
|
||||||
|
aRunAlgo[yi] = 0;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
hadFastAlgo = hadFastAlgo | aFastAlgo[yi];
|
||||||
|
|
||||||
|
aSetupCfg[yi] = aSetup[yi]( H, FILTERLEN, &aBlkLen[yi], convFlags );
|
||||||
|
|
||||||
|
/* get effective apply function ptr */
|
||||||
|
if ( aSetupCfg[yi] && aGetFnPtr[yi] )
|
||||||
|
aConv[yi] = aGetFnPtr[yi]( aSetupCfg[yi] );
|
||||||
|
|
||||||
|
if ( aSetupCfg[yi] && aConv[yi] )
|
||||||
|
{
|
||||||
|
if (testOutLen)
|
||||||
|
{
|
||||||
|
t0 = uclock_sec();
|
||||||
|
outN[yi] = aConv[yi]( aSetupCfg[yi], X, lenC, Y[yi], Y[0], 1 /* applyFlush */ );
|
||||||
|
t1 = uclock_sec();
|
||||||
|
td = t1 - t0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
//const int blkLen = 4096; /* required for 'fast_conv_4K' */
|
||||||
|
const int blkLen = aBlkLen[yi];
|
||||||
|
int64_t offC = 0, offS, Nout;
|
||||||
|
int k;
|
||||||
|
iter = 0;
|
||||||
|
outN[yi] = 0;
|
||||||
|
aNumLoops[yi] = 1;
|
||||||
|
t0 = uclock_sec();
|
||||||
|
tstop = t0 + BENCH_TEST_DURATION_IN_SEC;
|
||||||
|
do
|
||||||
|
{
|
||||||
|
const int prev_iter = iter;
|
||||||
|
for ( k = 0; k < 128 && offC +blkLen < lenC; ++k )
|
||||||
|
{
|
||||||
|
offS = cplxFactor * offC;
|
||||||
|
Nout = aConv[yi]( aSetupCfg[yi], X +offS, blkLen, Y[yi] +offS, Y[0], 0 /* applyFlush */ );
|
||||||
|
offC += Nout;
|
||||||
|
++iter;
|
||||||
|
if ( !Nout )
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
//if ( !Nout )
|
||||||
|
// break;
|
||||||
|
t1 = uclock_sec();
|
||||||
|
if ( prev_iter == iter ) // restart from begin of input?
|
||||||
|
{
|
||||||
|
offC = 0;
|
||||||
|
++aNumLoops[yi];
|
||||||
|
}
|
||||||
|
} while ( t1 < tstop );
|
||||||
|
outN[yi] = offC;
|
||||||
|
td = t1 - t0;
|
||||||
|
procSmpPerSec[yi] = cplxFactor * (double)outN[yi] * (1.0 / td);
|
||||||
|
aNumIters[yi] = iter;
|
||||||
|
aDuration[yi] = td;
|
||||||
|
|
||||||
|
//printf("algo '%s':\t%.2f MSmp\tin\t%.1f ms\t= %g kSmpPerSec\t%d iters\t%.1f ms\n",
|
||||||
|
// convText[yi], (double)outN[yi]/(1000.0 * 1000.0), 1000.0 * aDuration[yi], procSmpPerSec[yi] * 0.001, aNumIters[yi], 1000.0 * td );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
outN[yi] = 0;
|
||||||
|
}
|
||||||
|
if ( yi == 0 ) {
|
||||||
|
const float * Yvals = Y[0];
|
||||||
|
const int64_t refOutLen = cplxFactor * outN[0];
|
||||||
|
tdref = td;
|
||||||
|
if (printDbg) {
|
||||||
|
printf("convolution '%s' took: %f ms\n", convText[yi], td*1000.0);
|
||||||
|
printf(" convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
|
||||||
|
}
|
||||||
|
aSpeedFactor[yi] = 1.0;
|
||||||
|
/* */
|
||||||
|
yRangeMin = FLT_MAX;
|
||||||
|
yRangeMax = FLT_MIN;
|
||||||
|
for ( i = 0; i < refOutLen; ++i )
|
||||||
|
{
|
||||||
|
if ( yRangeMax < Yvals[i] ) yRangeMax = Yvals[i];
|
||||||
|
if ( yRangeMin > Yvals[i] ) yRangeMin = Yvals[i];
|
||||||
|
}
|
||||||
|
yErrLimit = fabsf(yRangeMax - yRangeMin) / ( 100.0F * 1000.0F );
|
||||||
|
/* yErrLimit = 0.01F; */
|
||||||
|
if (testOutLen) {
|
||||||
|
if (1) {
|
||||||
|
printf("reference output len = %" PRId64 " smp\n", outN[0]);
|
||||||
|
printf("reference output range |%.1f ..%.1f| = %.1f ==> err limit = %f\n", yRangeMin, yRangeMax, yRangeMax - yRangeMin, yErrLimit);
|
||||||
|
}
|
||||||
|
printFirst( Yvals, "Yref", 64, 8 );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
aSpeedFactor[yi] = tdref / td;
|
||||||
|
if (printDbg) {
|
||||||
|
printf("\nconvolution '%s' took: %f ms == %f %% == %f X\n", convText[yi], td*1000.0, td * 100 / tdref, tdref / td);
|
||||||
|
printf(" convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int iMaxSpeedSlowAlgo = -1;
|
||||||
|
int iFirstFastAlgo = -1;
|
||||||
|
int iMaxSpeedFastAlgo = -1;
|
||||||
|
int iPrintedRefOutLen = 0;
|
||||||
|
{
|
||||||
|
for ( yc = 1; yc < NUMY; ++yc )
|
||||||
|
{
|
||||||
|
if (!aRunAlgo[yc])
|
||||||
|
continue;
|
||||||
|
if (aFastAlgo[yc]) {
|
||||||
|
if ( iMaxSpeedFastAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedFastAlgo] )
|
||||||
|
iMaxSpeedFastAlgo = yc;
|
||||||
|
|
||||||
|
if (iFirstFastAlgo < 0)
|
||||||
|
iFirstFastAlgo = yc;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if ( iMaxSpeedSlowAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedSlowAlgo] )
|
||||||
|
iMaxSpeedSlowAlgo = yc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (printSpeed)
|
||||||
|
{
|
||||||
|
if (testOutLen)
|
||||||
|
{
|
||||||
|
if (iMaxSpeedSlowAlgo >= 0 )
|
||||||
|
printf("fastest slow algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedSlowAlgo], aSpeedFactor[iMaxSpeedSlowAlgo], 1000.0 * aDuration[iMaxSpeedSlowAlgo]);
|
||||||
|
if (0 != iMaxSpeedSlowAlgo && aRunAlgo[0])
|
||||||
|
printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[0], aSpeedFactor[0], 1000.0 * aDuration[0]);
|
||||||
|
if (1 != iMaxSpeedSlowAlgo && aRunAlgo[1])
|
||||||
|
printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[1], aSpeedFactor[1], 1000.0 * aDuration[1]);
|
||||||
|
|
||||||
|
if (iFirstFastAlgo >= 0 && iFirstFastAlgo != iMaxSpeedFastAlgo && aRunAlgo[iFirstFastAlgo])
|
||||||
|
printf("first fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo], aSpeedFactor[iFirstFastAlgo], 1000.0 * aDuration[iFirstFastAlgo]);
|
||||||
|
if (iFirstFastAlgo >= 0 && iFirstFastAlgo+1 != iMaxSpeedFastAlgo && iFirstFastAlgo+1 < NUMY && aRunAlgo[iFirstFastAlgo+1])
|
||||||
|
printf("2nd fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo+1], aSpeedFactor[iFirstFastAlgo+1], 1000.0 * aDuration[iFirstFastAlgo+1]);
|
||||||
|
|
||||||
|
if ( 0 <= iMaxSpeedFastAlgo && iMaxSpeedFastAlgo < NUMY && aRunAlgo[iMaxSpeedFastAlgo] )
|
||||||
|
{
|
||||||
|
printf("fastest fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedFastAlgo], aSpeedFactor[iMaxSpeedFastAlgo], 1000.0 * aDuration[iMaxSpeedFastAlgo]);
|
||||||
|
if ( 0 <= iMaxSpeedSlowAlgo && iMaxSpeedSlowAlgo < NUMY && aRunAlgo[iMaxSpeedSlowAlgo] )
|
||||||
|
printf("fast / slow ratio: %f X\n", aSpeedFactor[iMaxSpeedFastAlgo] / aSpeedFactor[iMaxSpeedSlowAlgo] );
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// print columns in 1st line
|
||||||
|
if (printAsCSV && *pIsFirstFilterLen)
|
||||||
|
{
|
||||||
|
printf("\n# filterLen, filterOrder, Re/Cx, type, sym, ");
|
||||||
|
for ( yc = 0; yc < NUMY; ++yc )
|
||||||
|
{
|
||||||
|
if (!aRunAlgo[yc] || procSmpPerSec[yc] <= 0.0)
|
||||||
|
continue;
|
||||||
|
if (printAsCSV)
|
||||||
|
printf("%s, ", convText[yc]);
|
||||||
|
}
|
||||||
|
*pIsFirstFilterLen = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( yc = 0; yc < NUMY; ++yc )
|
||||||
|
{
|
||||||
|
if (!yc)
|
||||||
|
{
|
||||||
|
double filterExp = log10((double)FILTERLEN) / log10(2.0);
|
||||||
|
printf("\n%5d, %5.1f, %s, %s, %s, ", FILTERLEN, filterExp,
|
||||||
|
((convFlags & PFFASTCONV_CPLX_INP_OUT)?"cplx":"real"),
|
||||||
|
(convFlags & PFFASTCONV_CPLX_INP_OUT)?((convFlags & PFFASTCONV_CPLX_SINGLE_FFT)?" single":" 2x") : "",
|
||||||
|
((convFlags & PFFASTCONV_SYMMETRIC)?"symmetric":"non-sym")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if (!aRunAlgo[yc] || procSmpPerSec[yc] <= 0.0)
|
||||||
|
continue;
|
||||||
|
if (printAsCSV)
|
||||||
|
printf("%.0f, ", procSmpPerSec[yc] * 0.001);
|
||||||
|
else
|
||||||
|
printf("algo '%s':\t%.2f MSmp\tin\t%.1f ms\t= %g kSmpPerSec\t%d iters\t%d loops\n",
|
||||||
|
convText[yc], (double)outN[yc]/(1000.0 * 1000.0), 1000.0 * aDuration[yc], procSmpPerSec[yc] * 0.001, aNumIters[yc], aNumLoops[yc] );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
for ( yc = 1; yc < NUMY; ++yc )
|
||||||
|
{
|
||||||
|
const float * Yref;
|
||||||
|
const float * Ycurr;
|
||||||
|
int outMin;
|
||||||
|
|
||||||
|
if (!aRunAlgo[yc])
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (printDbg)
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
if ( outN[yc] == 0 )
|
||||||
|
{
|
||||||
|
if (!printAsCSV)
|
||||||
|
printf("output size 0: '%s' not implemented\n", convText[yc]);
|
||||||
|
}
|
||||||
|
else if ( outN[0] != outN[yc] /* && aFastAlgo[yc] */ && testOutLen )
|
||||||
|
{
|
||||||
|
if (!iPrintedRefOutLen)
|
||||||
|
{
|
||||||
|
printf("reference output size = %" PRId64 ", delta to (cplx) input length = %" PRId64 " smp\n", outN[0], (len / cplxFactor) - outN[0]);
|
||||||
|
iPrintedRefOutLen = 1;
|
||||||
|
}
|
||||||
|
printf("output size doesn't match!: ref (FILTERLEN %d) returned %" PRId64 " smp, '%s' returned %" PRId64 " smp : delta = %" PRId64 " smp\n",
|
||||||
|
FILTERLEN, outN[0], convText[yc], outN[yc], outN[yc] - outN[0] );
|
||||||
|
retErr = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
posMaxErr = 0;
|
||||||
|
maxErr = -1.0;
|
||||||
|
Yref = Y[0];
|
||||||
|
Ycurr = Y[yc];
|
||||||
|
outMin = ( outN[yc] < outN[0] ) ? outN[yc] : outN[0];
|
||||||
|
numErrOverLimit = 0;
|
||||||
|
for ( i = 0; i < outMin; ++i )
|
||||||
|
{
|
||||||
|
if ( numErrOverLimit < 6 && fabs(Ycurr[i] - Yref[i]) >= yErrLimit && printErrValues )
|
||||||
|
{
|
||||||
|
printf("algo '%s': at %d: ***ERROR*** = %f, errLimit = %f, ref = %f, actual = %f\n",
|
||||||
|
convText[yc], i, fabs(Ycurr[i] - Yref[i]), yErrLimit, Yref[i], Ycurr[i] );
|
||||||
|
++numErrOverLimit;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( fabs(Ycurr[i] - Yref[i]) > maxErr )
|
||||||
|
{
|
||||||
|
maxErr = fabsf(Ycurr[i] - Yref[i]);
|
||||||
|
posMaxErr = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( printDbg || (iMaxSpeedSlowAlgo == i) || (iMaxSpeedFastAlgo == i) )
|
||||||
|
printf("max difference for '%s' is %g at sample idx %d of max inp 4093-1 == %f %%\n", convText[yc], maxErr, posMaxErr, maxErr * 100.0 / 4092.0 );
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
pffastconv_free(X);
|
||||||
|
for ( i=0; i < NUMY; ++i)
|
||||||
|
{
|
||||||
|
if ( 1 || i < 2 )
|
||||||
|
pffastconv_free( Y[i] );
|
||||||
|
if (!aRunAlgo[i])
|
||||||
|
continue;
|
||||||
|
aDestroy[i]( aSetupCfg[i] );
|
||||||
|
}
|
||||||
|
|
||||||
|
pffastconv_free(H);
|
||||||
|
|
||||||
|
return retErr;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions */
|
||||||
|
void validate_pffft_simd();
|
||||||
|
int validate_pffft_simd_ex(FILE * DbgOut);
|
||||||
|
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
int result = 0;
|
||||||
|
int i, k, M, flagsA, flagsB, flagsC, testOutLen, printDbg, printSpeed;
|
||||||
|
int testOutLens = 1, benchConv = 1, quickTest = 0, slowTest = 0;
|
||||||
|
int testReal = 1, testCplx = 1, testSymetric = 0, abortFirstFastAlgo = 1, printErrValues = 0, printAsCSV = 1;
|
||||||
|
int isFirstFilterLen = 1;
|
||||||
|
|
||||||
|
for ( i = 1; i < argc; ++i ) {
|
||||||
|
|
||||||
|
if (!strcmp(argv[i], "--test-simd")) {
|
||||||
|
int numErrs = validate_pffft_simd_ex(stdout);
|
||||||
|
fprintf( ( numErrs != 0 ? stderr : stdout ), "validate_pffft_simd_ex() returned %d errors!\n", numErrs);
|
||||||
|
return ( numErrs > 0 ? 1 : 0 );
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!strcmp(argv[i], "--no-len")) {
|
||||||
|
testOutLens = 0;
|
||||||
|
}
|
||||||
|
else if (!strcmp(argv[i], "--no-bench")) {
|
||||||
|
benchConv = 0;
|
||||||
|
}
|
||||||
|
else if (!strcmp(argv[i], "--quick")) {
|
||||||
|
quickTest = 1;
|
||||||
|
}
|
||||||
|
else if (!strcmp(argv[i], "--slow")) {
|
||||||
|
slowTest = 1;
|
||||||
|
}
|
||||||
|
else if (!strcmp(argv[i], "--real")) {
|
||||||
|
testCplx = 0;
|
||||||
|
}
|
||||||
|
else if (!strcmp(argv[i], "--cplx")) {
|
||||||
|
testReal = 0;
|
||||||
|
}
|
||||||
|
else if (!strcmp(argv[i], "--sym")) {
|
||||||
|
testSymetric = 1;
|
||||||
|
}
|
||||||
|
else /* if (!strcmp(argv[i], "--help")) */ {
|
||||||
|
printf("usage: %s [--test-simd] [--no-len] [--no-bench] [--quick|--slow] [--real|--cplx] [--sym]\n", argv[0]);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (testOutLens)
|
||||||
|
{
|
||||||
|
for ( k = 0; k < 3; ++k )
|
||||||
|
{
|
||||||
|
if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
|
||||||
|
continue;
|
||||||
|
printf("\n\n==========\n");
|
||||||
|
printf("testing %s %s output lengths ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
|
||||||
|
printf("==========\n");
|
||||||
|
flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
|
||||||
|
flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
|
||||||
|
flagsC = flagsB | PFFASTCONV_CPLX_SINGLE_FFT;
|
||||||
|
testOutLen = 1;
|
||||||
|
printDbg = 0;
|
||||||
|
printSpeed = 0;
|
||||||
|
for ( M = 128 - 4; M <= (quickTest ? 128+16 : 256); ++M )
|
||||||
|
{
|
||||||
|
if ( (M % 16) != 0 && testSymetric )
|
||||||
|
continue;
|
||||||
|
result |= test(M, flagsB, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, 0, &isFirstFilterLen);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (benchConv)
|
||||||
|
{
|
||||||
|
printf("quickTest is %d\n", quickTest);
|
||||||
|
printf("slowTest is %d\n", slowTest);
|
||||||
|
|
||||||
|
for ( k = 0; k < 3; ++k )
|
||||||
|
{
|
||||||
|
if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
|
||||||
|
continue;
|
||||||
|
if (!printAsCSV)
|
||||||
|
{
|
||||||
|
printf("\n\n==========\n");
|
||||||
|
printf("starting %s %s benchmark against linear convolutions ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
|
||||||
|
printf("==========\n");
|
||||||
|
}
|
||||||
|
flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
|
||||||
|
flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
|
||||||
|
flagsC = flagsB | ( k == 2 ? PFFASTCONV_CPLX_SINGLE_FFT : 0 );
|
||||||
|
testOutLen = 0;
|
||||||
|
printDbg = 0;
|
||||||
|
printSpeed = 1;
|
||||||
|
if (!slowTest) {
|
||||||
|
if (!quickTest) {
|
||||||
|
result |= test(32, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||||
|
result |= test(32 + 16, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||||
|
}
|
||||||
|
result |= test(64, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||||
|
if (!quickTest) {
|
||||||
|
result |= test(64 + 32, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||||
|
result |= test(128, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!quickTest) {
|
||||||
|
result |= test(128+ 64, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||||
|
result |= test(256, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||||
|
result |= test(256+128, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||||
|
result |= test(512, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||||
|
result |= test(1024, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||||
|
|
||||||
|
result |= test(2048, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||||
|
result |= test(4096, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||||
|
result |= test(8192, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||||
|
result |= test(16384, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||||
|
result |= test(32768, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||||
|
}
|
||||||
|
if (printAsCSV)
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
371
pffft/test_pffft.c
Normal file
371
pffft/test_pffft.c
Normal file
@@ -0,0 +1,371 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2013 Julien Pommier.
|
||||||
|
|
||||||
|
Small test for PFFFT
|
||||||
|
|
||||||
|
How to build:
|
||||||
|
|
||||||
|
on linux, with fftw3:
|
||||||
|
gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
|
||||||
|
|
||||||
|
on macos, without fftw3:
|
||||||
|
clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
|
||||||
|
|
||||||
|
on macos, with fftw3:
|
||||||
|
clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate
|
||||||
|
|
||||||
|
as alternative: replace clang by gcc.
|
||||||
|
|
||||||
|
on windows, with visual c++:
|
||||||
|
cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
|
||||||
|
|
||||||
|
build without SIMD instructions:
|
||||||
|
gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef PFFFT_ENABLE_FLOAT
|
||||||
|
#include "pffft.h"
|
||||||
|
|
||||||
|
typedef float pffft_scalar;
|
||||||
|
#else
|
||||||
|
/*
|
||||||
|
Note: adapted for double precision dynamic range version.
|
||||||
|
*/
|
||||||
|
#include "pffft_double.h"
|
||||||
|
|
||||||
|
typedef double pffft_scalar;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <math.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <time.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
/* define own constants required to turn off g++ extensions .. */
|
||||||
|
#ifndef M_PI
|
||||||
|
#define M_PI 3.14159265358979323846 /* pi */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* EXPECTED_DYN_RANGE in dB:
|
||||||
|
* single precision float has 24 bits mantissa
|
||||||
|
* => 24 Bits * 6 dB = 144 dB
|
||||||
|
* allow a few dB tolerance (even 144 dB looks good on my PC)
|
||||||
|
*/
|
||||||
|
#ifdef PFFFT_ENABLE_FLOAT
|
||||||
|
#define EXPECTED_DYN_RANGE 140.0
|
||||||
|
#else
|
||||||
|
#define EXPECTED_DYN_RANGE 215.0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* maximum allowed phase error in degree */
|
||||||
|
#define DEG_ERR_LIMIT 1E-4
|
||||||
|
|
||||||
|
/* maximum allowed magnitude error in amplitude (of 1.0 or 1.1) */
|
||||||
|
#define MAG_ERR_LIMIT 1E-6
|
||||||
|
|
||||||
|
|
||||||
|
#define PRINT_SPEC 0
|
||||||
|
|
||||||
|
#define PWR2LOG(PWR) ( (PWR) < 1E-30 ? 10.0*log10(1E-30) : 10.0*log10(PWR) )
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int test(int N, int cplx, int useOrdered) {
|
||||||
|
int Nfloat = (cplx ? N*2 : N);
|
||||||
|
#ifdef PFFFT_ENABLE_FLOAT
|
||||||
|
pffft_scalar *X = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||||
|
pffft_scalar *Y = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||||
|
pffft_scalar *R = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||||
|
pffft_scalar *Z = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||||
|
pffft_scalar *W = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||||
|
#else
|
||||||
|
pffft_scalar *X = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||||
|
pffft_scalar *Y = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||||
|
pffft_scalar *R = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||||
|
pffft_scalar *Z = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||||
|
pffft_scalar *W = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||||
|
#endif
|
||||||
|
pffft_scalar amp = (pffft_scalar)1.0;
|
||||||
|
double freq, dPhi, phi, phi0;
|
||||||
|
double pwr, pwrCar, pwrOther, err, errSum, mag, expextedMag;
|
||||||
|
int k, j, m, iter, kmaxOther, retError = 0;
|
||||||
|
|
||||||
|
#ifdef PFFFT_ENABLE_FLOAT
|
||||||
|
assert( pffft_is_power_of_two(N) );
|
||||||
|
PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
|
||||||
|
#else
|
||||||
|
assert( pffftd_is_power_of_two(N) );
|
||||||
|
PFFFTD_Setup *s = pffftd_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
|
||||||
|
#endif
|
||||||
|
assert(s);
|
||||||
|
if (!s) {
|
||||||
|
printf("Error setting up PFFFT!\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( k = m = 0; k < (cplx? N : (1 + N/2) ); k += N/16, ++m )
|
||||||
|
{
|
||||||
|
amp = (pffft_scalar)( ( (m % 3) == 0 ) ? 1.0 : 1.1 );
|
||||||
|
freq = (k < N/2) ? ((double)k / N) : ((double)(k-N) / N);
|
||||||
|
dPhi = 2.0 * M_PI * freq;
|
||||||
|
if ( dPhi < 0.0 )
|
||||||
|
dPhi += 2.0 * M_PI;
|
||||||
|
|
||||||
|
iter = -1;
|
||||||
|
while (1)
|
||||||
|
{
|
||||||
|
++iter;
|
||||||
|
|
||||||
|
if (iter)
|
||||||
|
printf("bin %d: dphi = %f for freq %f\n", k, dPhi, freq);
|
||||||
|
|
||||||
|
/* generate cosine carrier as time signal - start at defined phase phi0 */
|
||||||
|
phi = phi0 = (m % 4) * 0.125 * M_PI; /* have phi0 < 90 deg to be normalized */
|
||||||
|
for ( j = 0; j < N; ++j )
|
||||||
|
{
|
||||||
|
if (cplx) {
|
||||||
|
X[2*j] = amp * (pffft_scalar)cos(phi); /* real part */
|
||||||
|
X[2*j+1] = amp * (pffft_scalar)sin(phi); /* imag part */
|
||||||
|
}
|
||||||
|
else
|
||||||
|
X[j] = amp * (pffft_scalar)cos(phi); /* only real part */
|
||||||
|
|
||||||
|
/* phase increment .. stay normalized - cos()/sin() might degrade! */
|
||||||
|
phi += dPhi;
|
||||||
|
if ( phi >= M_PI )
|
||||||
|
phi -= 2.0 * M_PI;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* forward transform from X --> Y .. using work buffer W */
|
||||||
|
#ifdef PFFFT_ENABLE_FLOAT
|
||||||
|
if ( useOrdered )
|
||||||
|
pffft_transform_ordered(s, X, Y, W, PFFFT_FORWARD );
|
||||||
|
else
|
||||||
|
{
|
||||||
|
pffft_transform(s, X, R, W, PFFFT_FORWARD ); /* use R for reordering */
|
||||||
|
pffft_zreorder(s, R, Y, PFFFT_FORWARD ); /* reorder into Y[] for power calculations */
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
if ( useOrdered )
|
||||||
|
pffftd_transform_ordered(s, X, Y, W, PFFFT_FORWARD );
|
||||||
|
else
|
||||||
|
{
|
||||||
|
pffftd_transform(s, X, R, W, PFFFT_FORWARD ); /* use R for reordering */
|
||||||
|
pffftd_zreorder(s, R, Y, PFFFT_FORWARD ); /* reorder into Y[] for power calculations */
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
pwrOther = -1.0;
|
||||||
|
pwrCar = 0;
|
||||||
|
|
||||||
|
|
||||||
|
/* for positive frequencies: 0 to 0.5 * samplerate */
|
||||||
|
/* and also for negative frequencies: -0.5 * samplerate to 0 */
|
||||||
|
for ( j = 0; j < ( cplx ? N : (1 + N/2) ); ++j )
|
||||||
|
{
|
||||||
|
if (!cplx && !j) /* special treatment for DC for real input */
|
||||||
|
pwr = Y[j]*Y[j];
|
||||||
|
else if (!cplx && j == N/2) /* treat 0.5 * samplerate */
|
||||||
|
pwr = Y[1] * Y[1]; /* despite j (for freq calculation) we have index 1 */
|
||||||
|
else
|
||||||
|
pwr = Y[2*j] * Y[2*j] + Y[2*j+1] * Y[2*j+1];
|
||||||
|
if (iter || PRINT_SPEC)
|
||||||
|
printf("%s fft %d: pwr[j = %d] = %g == %f dB\n", (cplx ? "cplx":"real"), N, j, pwr, PWR2LOG(pwr) );
|
||||||
|
if (k == j)
|
||||||
|
pwrCar = pwr;
|
||||||
|
else if ( pwr > pwrOther ) {
|
||||||
|
pwrOther = pwr;
|
||||||
|
kmaxOther = j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( PWR2LOG(pwrCar) - PWR2LOG(pwrOther) < EXPECTED_DYN_RANGE ) {
|
||||||
|
printf("%s fft %d amp %f iter %d:\n", (cplx ? "cplx":"real"), N, amp, iter);
|
||||||
|
printf(" carrier power at bin %d: %g == %f dB\n", k, pwrCar, PWR2LOG(pwrCar) );
|
||||||
|
printf(" carrier mag || at bin %d: %g\n", k, sqrt(pwrCar) );
|
||||||
|
printf(" max other pwr at bin %d: %g == %f dB\n", kmaxOther, pwrOther, PWR2LOG(pwrOther) );
|
||||||
|
printf(" dynamic range: %f dB\n\n", PWR2LOG(pwrCar) - PWR2LOG(pwrOther) );
|
||||||
|
retError = 1;
|
||||||
|
if ( iter == 0 )
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( k > 0 && k != N/2 )
|
||||||
|
{
|
||||||
|
phi = atan2( Y[2*k+1], Y[2*k] );
|
||||||
|
if ( fabs( phi - phi0) > DEG_ERR_LIMIT * M_PI / 180.0 )
|
||||||
|
{
|
||||||
|
retError = 1;
|
||||||
|
printf("%s fft %d bin %d amp %f : phase mismatch! phase = %f deg expected = %f deg\n",
|
||||||
|
(cplx ? "cplx":"real"), N, k, amp, phi * 180.0 / M_PI, phi0 * 180.0 / M_PI );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
expextedMag = cplx ? amp : ( (k == 0 || k == N/2) ? amp : (amp/2) );
|
||||||
|
mag = sqrt(pwrCar) / N;
|
||||||
|
if ( fabs(mag - expextedMag) > MAG_ERR_LIMIT )
|
||||||
|
{
|
||||||
|
retError = 1;
|
||||||
|
printf("%s fft %d bin %d amp %f : mag = %g expected = %g\n", (cplx ? "cplx":"real"), N, k, amp, mag, expextedMag );
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* now convert spectrum back */
|
||||||
|
#ifdef PFFFT_ENABLE_FLOAT
|
||||||
|
if (useOrdered)
|
||||||
|
pffft_transform_ordered(s, Y, Z, W, PFFFT_BACKWARD);
|
||||||
|
else
|
||||||
|
pffft_transform(s, R, Z, W, PFFFT_BACKWARD);
|
||||||
|
#else
|
||||||
|
if (useOrdered)
|
||||||
|
pffftd_transform_ordered(s, Y, Z, W, PFFFT_BACKWARD);
|
||||||
|
else
|
||||||
|
pffftd_transform(s, R, Z, W, PFFFT_BACKWARD);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
errSum = 0.0;
|
||||||
|
for ( j = 0; j < (cplx ? (2*N) : N); ++j )
|
||||||
|
{
|
||||||
|
/* scale back */
|
||||||
|
Z[j] /= N;
|
||||||
|
/* square sum errors over real (and imag parts) */
|
||||||
|
err = (X[j]-Z[j]) * (X[j]-Z[j]);
|
||||||
|
errSum += err;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( errSum > N * 1E-7 )
|
||||||
|
{
|
||||||
|
retError = 1;
|
||||||
|
printf("%s fft %d bin %d : inverse FFT doesn't match original signal! errSum = %g ; mean err = %g\n", (cplx ? "cplx":"real"), N, k, errSum, errSum / N);
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
#ifdef PFFFT_ENABLE_FLOAT
|
||||||
|
pffft_destroy_setup(s);
|
||||||
|
pffft_aligned_free(X);
|
||||||
|
pffft_aligned_free(Y);
|
||||||
|
pffft_aligned_free(Z);
|
||||||
|
pffft_aligned_free(R);
|
||||||
|
pffft_aligned_free(W);
|
||||||
|
#else
|
||||||
|
pffftd_destroy_setup(s);
|
||||||
|
pffftd_aligned_free(X);
|
||||||
|
pffftd_aligned_free(Y);
|
||||||
|
pffftd_aligned_free(Z);
|
||||||
|
pffftd_aligned_free(R);
|
||||||
|
pffftd_aligned_free(W);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return retError;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Forward declarations of the small SIMD self-check routines; per the
 * original note they live inside pffft.c and detect (compiler) bugs with
 * respect to SIMD instructions.
 *
 * The parameterless ones are declared with an explicit (void) so that they
 * are real prototypes in C: an empty parameter list "()" leaves the
 * arguments unspecified and disables argument checking by the compiler.
 *
 * The *_ex variants take a FILE* (main() passes stdout) and return an int
 * that main() reports as the number of errors.
 */
void validate_pffft_simd(void);
int  validate_pffft_simd_ex(FILE * DbgOut);
void validate_pffftd_simd(void);
int  validate_pffftd_simd_ex(FILE * DbgOut);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Test driver.
 *
 * With the command line switch "--test-simd" only the SIMD self-check is
 * executed; the process exit code is 1 if it reported a positive error count
 * and 0 otherwise.
 *
 * Without that switch the driver
 *   1. checks next_power_of_two() / is_power_of_two() against a small table,
 *   2. runs test() for every power-of-two length from 32 to 65536 in all four
 *      combinations of complex/real and ordered/unordered,
 * and returns 0 when everything passed, non-zero otherwise.
 */
int main(int argc, char **argv)
{
  /* probe values and, per probe, the expected next power of two */
  int inp_power_of_two[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 511, 512, 513 };
  int ref_power_of_two[] = { 1, 2, 4, 4, 8, 8, 8, 8, 16, 512, 512, 1024 };
  const int numProbes = (int)( sizeof(inp_power_of_two) / sizeof(inp_power_of_two[0]) );

  /* { cplx, useOrdered } argument pairs, in the order they are exercised */
  static const int modes[4][2] = { { 1, 1 }, { 0, 1 }, { 1, 0 }, { 0, 0 } };

  int argIdx, probeIdx, modeIdx, fftLen;
  int nextPw2, isPw2, expectPw2, outcome;
  int failedNextPw2 = 0;
  int failedIsPw2 = 0;
  int failedFFT = 0;
  int failedThisLen, failedAny;

  /* "--test-simd": run the SIMD validation only, then leave */
  for ( argIdx = 1; argIdx < argc; ++argIdx )
  {
    if ( strcmp(argv[argIdx], "--test-simd") != 0 )
      continue;

    {
#ifdef PFFFT_ENABLE_FLOAT
      int numErrs = validate_pffft_simd_ex(stdout);
#else
      int numErrs = validate_pffftd_simd_ex(stdout);
#endif
      FILE * report = ( numErrs != 0 ) ? stderr : stdout;
      fprintf( report, "validate_pffft_simd_ex() returned %d errors!\n", numErrs);
      return ( numErrs > 0 ) ? 1 : 0;
    }
  }

  /* table driven checks of the power-of-two helpers */
  for ( probeIdx = 0; probeIdx < numProbes; ++probeIdx )
  {
    const int probe = inp_power_of_two[probeIdx];
    const int expectedNext = ref_power_of_two[probeIdx];

#ifdef PFFFT_ENABLE_FLOAT
    nextPw2 = pffft_next_power_of_two(probe);
#else
    nextPw2 = pffftd_next_power_of_two(probe);
#endif
    if ( nextPw2 != expectedNext )
    {
      failedNextPw2 = 1;
      printf("pffft_next_power_of_two(%d) does deliver %d, which is not reference result %d!\n",
        probe, nextPw2, expectedNext );
    }

#ifdef PFFFT_ENABLE_FLOAT
    isPw2 = pffft_is_power_of_two(probe);
#else
    isPw2 = pffftd_is_power_of_two(probe);
#endif
    /* a probe is a power of two exactly when it equals its own reference */
    expectPw2 = ( probe == expectedNext );
    if ( expectPw2 && !isPw2 )
    {
      failedIsPw2 = 1;
      printf("pffft_is_power_of_two(%d) delivers false; expected true!\n", probe);
    }
    else if ( !expectPw2 && isPw2 )
    {
      failedIsPw2 = 1;
      printf("pffft_is_power_of_two(%d) delivers true; expected false!\n", probe);
    }
  }

  if ( !failedNextPw2 )
    printf("tests for pffft_next_power_of_two() succeeded successfully.\n");
  if ( !failedIsPw2 )
    printf("tests for pffft_is_power_of_two() succeeded successfully.\n");

  /* transform tests over all lengths and all four modes */
  for ( fftLen = 32; fftLen <= 65536; fftLen *= 2 )
  {
    failedThisLen = 0;
    for ( modeIdx = 0; modeIdx < 4; ++modeIdx )
    {
      outcome = test(fftLen, modes[modeIdx][0] /* cplx fft */, modes[modeIdx][1] /* useOrdered */);
      failedThisLen |= outcome;
      failedFFT |= outcome;
    }

    if ( !failedThisLen )
      printf("tests for size %d succeeded successfully.\n", fftLen);
  }

  if ( !failedFFT )
  {
#ifdef PFFFT_ENABLE_FLOAT
    printf("all pffft transform tests (FORWARD/BACKWARD, REAL/COMPLEX, float) succeeded successfully.\n");
#else
    printf("all pffft transform tests (FORWARD/BACKWARD, REAL/COMPLEX, double) succeeded successfully.\n");
#endif
  }

  failedAny = failedNextPw2 | failedIsPw2 | failedFFT;
  if ( failedAny )
    printf("there are failed tests!\n");
  else
    printf("all tests succeeded successfully.\n");

  return failedAny;
}
|
||||||
|
|
||||||
377
pffft/test_pffft.cpp
Normal file
377
pffft/test_pffft.cpp
Normal file
@@ -0,0 +1,377 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||||
|
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
|
||||||
|
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
|
||||||
|
|
||||||
|
Small test & bench for PFFFT, comparing its performance with the scalar
|
||||||
|
FFTPACK, FFTW, and Apple vDSP
|
||||||
|
|
||||||
|
How to build:
|
||||||
|
|
||||||
|
on linux, with fftw3:
|
||||||
|
gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c
|
||||||
|
test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
|
||||||
|
|
||||||
|
on macos, without fftw3:
|
||||||
|
clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c
|
||||||
|
-L/usr/local/lib -I/usr/local/include/ -framework Accelerate
|
||||||
|
|
||||||
|
on macos, with fftw3:
|
||||||
|
clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c
|
||||||
|
test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f
|
||||||
|
-framework Accelerate
|
||||||
|
|
||||||
|
as alternative: replace clang by gcc.
|
||||||
|
|
||||||
|
on windows, with visual c++:
|
||||||
|
cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
|
||||||
|
|
||||||
|
build without SIMD instructions:
|
||||||
|
gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c
|
||||||
|
fftpack.c -lm
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "pffft.hpp"
|
||||||
|
|
||||||
|
#include <assert.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <time.h>
|
||||||
|
|
||||||
|
/* define own constants required to turn off g++ extensions .. */
|
||||||
|
#ifndef M_PI
|
||||||
|
#define M_PI 3.14159265358979323846 /* pi */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* maximum allowed phase error in degree */
|
||||||
|
#define DEG_ERR_LIMIT 1E-4
|
||||||
|
|
||||||
|
/* maximum allowed magnitude error in amplitude (of 1.0 or 1.1) */
|
||||||
|
#define MAG_ERR_LIMIT 1E-6
|
||||||
|
|
||||||
|
#define PRINT_SPEC 0
|
||||||
|
|
||||||
|
#define PWR2LOG(PWR) ((PWR) < 1E-30 ? 10.0 * log10(1E-30) : 10.0 * log10(PWR))
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
bool
|
||||||
|
Ttest(int N, bool useOrdered)
|
||||||
|
{
|
||||||
|
typedef pffft::Fft<T> Fft;
|
||||||
|
typedef typename pffft::Fft<T>::Scalar FftScalar;
|
||||||
|
typedef typename Fft::Complex FftComplex;
|
||||||
|
|
||||||
|
const bool cplx = pffft::Fft<T>::isComplexTransform();
|
||||||
|
const double EXPECTED_DYN_RANGE = Fft::isDoubleScalar() ? 215.0 : 140.0;
|
||||||
|
|
||||||
|
assert(Fft::isPowerOfTwo(N));
|
||||||
|
|
||||||
|
Fft fft = Fft(N); // instantiate and prepareLength() for length N
|
||||||
|
|
||||||
|
#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900)
|
||||||
|
|
||||||
|
// possible ways to declare/instatiate aligned vectors with C++11
|
||||||
|
// some lines require a typedef of above
|
||||||
|
auto X = fft.valueVector(); // for X = input vector
|
||||||
|
pffft::AlignedVector<typename Fft::Complex> Y = fft.spectrumVector(); // for Y = forward(X)
|
||||||
|
pffft::AlignedVector<FftScalar> R = fft.internalLayoutVector(); // for R = forwardInternalLayout(X)
|
||||||
|
pffft::AlignedVector<T> Z = fft.valueVector(); // for Z = inverse(Y) = inverse( forward(X) )
|
||||||
|
// or Z = inverseInternalLayout(R)
|
||||||
|
#else
|
||||||
|
|
||||||
|
// possible ways to declare/instatiate aligned vectors with C++98
|
||||||
|
pffft::AlignedVector<T> X = fft.valueVector(); // for X = input vector
|
||||||
|
pffft::AlignedVector<FftComplex> Y = fft.spectrumVector(); // for Y = forward(X)
|
||||||
|
pffft::AlignedVector<typename Fft::Scalar> R = fft.internalLayoutVector(); // for R = forwardInternalLayout(X)
|
||||||
|
pffft::AlignedVector<T> Z = fft.valueVector(); // for Z = inverse(Y) = inverse( forward(X) )
|
||||||
|
// or Z = inverseInternalLayout(R)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// work with complex - without the capabilities of a higher c++ standard
|
||||||
|
FftScalar* Xs = reinterpret_cast<FftScalar*>(X.data()); // for X = input vector
|
||||||
|
FftScalar* Ys = reinterpret_cast<FftScalar*>(Y.data()); // for Y = forward(X)
|
||||||
|
FftScalar* Zs = reinterpret_cast<FftScalar*>(Z.data()); // for Z = inverse(Y) = inverse( forward(X) )
|
||||||
|
|
||||||
|
int k, j, m, iter, kmaxOther;
|
||||||
|
bool retError = false;
|
||||||
|
double freq, dPhi, phi, phi0;
|
||||||
|
double pwr, pwrCar, pwrOther, err, errSum, mag, expextedMag;
|
||||||
|
double amp = 1.0;
|
||||||
|
|
||||||
|
for (k = m = 0; k < (cplx ? N : (1 + N / 2)); k += N / 16, ++m) {
|
||||||
|
amp = ((m % 3) == 0) ? 1.0F : 1.1F;
|
||||||
|
freq = (k < N / 2) ? ((double)k / N) : ((double)(k - N) / N);
|
||||||
|
dPhi = 2.0 * M_PI * freq;
|
||||||
|
if (dPhi < 0.0)
|
||||||
|
dPhi += 2.0 * M_PI;
|
||||||
|
|
||||||
|
iter = -1;
|
||||||
|
while (1) {
|
||||||
|
++iter;
|
||||||
|
|
||||||
|
if (iter)
|
||||||
|
printf("bin %d: dphi = %f for freq %f\n", k, dPhi, freq);
|
||||||
|
|
||||||
|
/* generate cosine carrier as time signal - start at defined phase phi0 */
|
||||||
|
phi = phi0 =
|
||||||
|
(m % 4) * 0.125 * M_PI; /* have phi0 < 90 deg to be normalized */
|
||||||
|
for (j = 0; j < N; ++j) {
|
||||||
|
if (cplx) {
|
||||||
|
Xs[2 * j] = (FftScalar)( amp * cos(phi) ); /* real part */
|
||||||
|
Xs[2 * j + 1] = (FftScalar)( amp * sin(phi) ); /* imag part */
|
||||||
|
} else
|
||||||
|
Xs[j] = (FftScalar)( amp * cos(phi) ); /* only real part */
|
||||||
|
|
||||||
|
/* phase increment .. stay normalized - cos()/sin() might degrade! */
|
||||||
|
phi += dPhi;
|
||||||
|
if (phi >= M_PI)
|
||||||
|
phi -= 2.0 * M_PI;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* forward transform from X --> Y .. using work buffer W */
|
||||||
|
if (useOrdered)
|
||||||
|
fft.forward(X, Y);
|
||||||
|
else {
|
||||||
|
fft.forwardToInternalLayout(X, R); /* use R for reordering */
|
||||||
|
fft.reorderSpectrum(R, Y); /* have canonical order in Y[] for power calculations */
|
||||||
|
}
|
||||||
|
|
||||||
|
pwrOther = -1.0;
|
||||||
|
pwrCar = 0;
|
||||||
|
|
||||||
|
/* for positive frequencies: 0 to 0.5 * samplerate */
|
||||||
|
/* and also for negative frequencies: -0.5 * samplerate to 0 */
|
||||||
|
for (j = 0; j < (cplx ? N : (1 + N / 2)); ++j) {
|
||||||
|
if (!cplx && !j) /* special treatment for DC for real input */
|
||||||
|
pwr = Ys[j] * Ys[j];
|
||||||
|
else if (!cplx && j == N / 2) /* treat 0.5 * samplerate */
|
||||||
|
pwr = Ys[1] *
|
||||||
|
Ys[1]; /* despite j (for freq calculation) we have index 1 */
|
||||||
|
else
|
||||||
|
pwr = Ys[2 * j] * Ys[2 * j] + Ys[2 * j + 1] * Ys[2 * j + 1];
|
||||||
|
if (iter || PRINT_SPEC)
|
||||||
|
printf("%s fft %d: pwr[j = %d] = %g == %f dB\n",
|
||||||
|
(cplx ? "cplx" : "real"),
|
||||||
|
N,
|
||||||
|
j,
|
||||||
|
pwr,
|
||||||
|
PWR2LOG(pwr));
|
||||||
|
if (k == j)
|
||||||
|
pwrCar = pwr;
|
||||||
|
else if (pwr > pwrOther) {
|
||||||
|
pwrOther = pwr;
|
||||||
|
kmaxOther = j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (PWR2LOG(pwrCar) - PWR2LOG(pwrOther) < EXPECTED_DYN_RANGE) {
|
||||||
|
printf("%s fft %d amp %f iter %d:\n",
|
||||||
|
(cplx ? "cplx" : "real"),
|
||||||
|
N,
|
||||||
|
amp,
|
||||||
|
iter);
|
||||||
|
printf(" carrier power at bin %d: %g == %f dB\n",
|
||||||
|
k,
|
||||||
|
pwrCar,
|
||||||
|
PWR2LOG(pwrCar));
|
||||||
|
printf(" carrier mag || at bin %d: %g\n", k, sqrt(pwrCar));
|
||||||
|
printf(" max other pwr at bin %d: %g == %f dB\n",
|
||||||
|
kmaxOther,
|
||||||
|
pwrOther,
|
||||||
|
PWR2LOG(pwrOther));
|
||||||
|
printf(" dynamic range: %f dB\n\n",
|
||||||
|
PWR2LOG(pwrCar) - PWR2LOG(pwrOther));
|
||||||
|
retError = true;
|
||||||
|
if (iter == 0)
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (k > 0 && k != N / 2) {
|
||||||
|
phi = atan2(Ys[2 * k + 1], Ys[2 * k]);
|
||||||
|
if (fabs(phi - phi0) > DEG_ERR_LIMIT * M_PI / 180.0) {
|
||||||
|
retError = true;
|
||||||
|
printf("%s fft %d bin %d amp %f : phase mismatch! phase = %f deg "
|
||||||
|
"expected = %f deg\n",
|
||||||
|
(cplx ? "cplx" : "real"),
|
||||||
|
N,
|
||||||
|
k,
|
||||||
|
amp,
|
||||||
|
phi * 180.0 / M_PI,
|
||||||
|
phi0 * 180.0 / M_PI);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
expextedMag = cplx ? amp : ((k == 0 || k == N / 2) ? amp : (amp / 2));
|
||||||
|
mag = sqrt(pwrCar) / N;
|
||||||
|
if (fabs(mag - expextedMag) > MAG_ERR_LIMIT) {
|
||||||
|
retError = true;
|
||||||
|
printf("%s fft %d bin %d amp %f : mag = %g expected = %g\n",
|
||||||
|
(cplx ? "cplx" : "real"),
|
||||||
|
N,
|
||||||
|
k,
|
||||||
|
amp,
|
||||||
|
mag,
|
||||||
|
expextedMag);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* now convert spectrum back */
|
||||||
|
if (useOrdered)
|
||||||
|
fft.inverse(Y, Z);
|
||||||
|
else
|
||||||
|
fft.inverseFromInternalLayout(R, Z); /* inverse() from internal Layout */
|
||||||
|
|
||||||
|
errSum = 0.0;
|
||||||
|
for (j = 0; j < (cplx ? (2 * N) : N); ++j) {
|
||||||
|
/* scale back */
|
||||||
|
Zs[j] /= N;
|
||||||
|
/* square sum errors over real (and imag parts) */
|
||||||
|
err = (Xs[j] - Zs[j]) * (Xs[j] - Zs[j]);
|
||||||
|
errSum += err;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (errSum > N * 1E-7) {
|
||||||
|
retError = true;
|
||||||
|
printf("%s fft %d bin %d : inverse FFT doesn't match original signal! "
|
||||||
|
"errSum = %g ; mean err = %g\n",
|
||||||
|
(cplx ? "cplx" : "real"),
|
||||||
|
N,
|
||||||
|
k,
|
||||||
|
errSum,
|
||||||
|
errSum / N);
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// using the std::vector<> base classes .. no need for alignedFree() for X, Y, Z and R
|
||||||
|
|
||||||
|
return retError;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Runs Ttest<>() for every scalar precision that is enabled at compile time
 * (PFFFT_ENABLE_FLOAT and/or PFFFT_ENABLE_DOUBLE).
 *
 *   N          : transform length, forwarded to Ttest<>()
 *   useComplex : true  -> test std::complex<float> / std::complex<double>
 *                false -> test float / double
 *   useOrdered : forwarded to Ttest<>()
 *
 * Returns true if ANY of the executed tests reported an error.
 *
 * Ttest<>() returns an error flag (true == failure). The flags therefore
 * have to be OR-ed. Combining them with '&&' reported a failure only when
 * both precisions failed, and - because of short-circuit evaluation - did
 * not even run the double precision test when the float test passed.
 * Every enabled precision is now always executed, so that the diagnostics
 * of all of them get printed.
 */
bool
test(int N, bool useComplex, bool useOrdered)
{
  bool anyError = false;

  if (useComplex) {
#ifdef PFFFT_ENABLE_FLOAT
    if (Ttest< std::complex<float> >(N, useOrdered))
      anyError = true;
#endif
#ifdef PFFFT_ENABLE_DOUBLE
    if (Ttest< std::complex<double> >(N, useOrdered))
      anyError = true;
#endif
  } else {
#ifdef PFFFT_ENABLE_FLOAT
    if (Ttest<float>(N, useOrdered))
      anyError = true;
#endif
#ifdef PFFFT_ENABLE_DOUBLE
    if (Ttest<double>(N, useOrdered))
      anyError = true;
#endif
  }

  return anyError;
}
|
||||||
|
|
||||||
|
int
|
||||||
|
main(int argc, char** argv)
|
||||||
|
{
|
||||||
|
int N, result, resN, resAll, k, resNextPw2, resIsPw2, resFFT;
|
||||||
|
|
||||||
|
int inp_power_of_two[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 511, 512, 513 };
|
||||||
|
int ref_power_of_two[] = { 1, 2, 4, 4, 8, 8, 8, 8, 16, 512, 512, 1024 };
|
||||||
|
|
||||||
|
resNextPw2 = 0;
|
||||||
|
resIsPw2 = 0;
|
||||||
|
for (k = 0; k < (sizeof(inp_power_of_two) / sizeof(inp_power_of_two[0]));
|
||||||
|
++k) {
|
||||||
|
#ifdef PFFFT_ENABLE_FLOAT
|
||||||
|
N = pffft::Fft<float>::nextPowerOfTwo(inp_power_of_two[k]);
|
||||||
|
#else
|
||||||
|
N = pffft::Fft<double>::nextPowerOfTwo(inp_power_of_two[k]);
|
||||||
|
#endif
|
||||||
|
if (N != ref_power_of_two[k]) {
|
||||||
|
resNextPw2 = 1;
|
||||||
|
printf("pffft_next_power_of_two(%d) does deliver %d, which is not "
|
||||||
|
"reference result %d!\n",
|
||||||
|
inp_power_of_two[k],
|
||||||
|
N,
|
||||||
|
ref_power_of_two[k]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef PFFFT_ENABLE_FLOAT
|
||||||
|
result = pffft::Fft<float>::isPowerOfTwo(inp_power_of_two[k]);
|
||||||
|
#else
|
||||||
|
result = pffft::Fft<double>::isPowerOfTwo(inp_power_of_two[k]);
|
||||||
|
#endif
|
||||||
|
if (inp_power_of_two[k] == ref_power_of_two[k]) {
|
||||||
|
if (!result) {
|
||||||
|
resIsPw2 = 1;
|
||||||
|
printf("pffft_is_power_of_two(%d) delivers false; expected true!\n",
|
||||||
|
inp_power_of_two[k]);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (result) {
|
||||||
|
resIsPw2 = 1;
|
||||||
|
printf("pffft_is_power_of_two(%d) delivers true; expected false!\n",
|
||||||
|
inp_power_of_two[k]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!resNextPw2)
|
||||||
|
printf("tests for pffft_next_power_of_two() succeeded successfully.\n");
|
||||||
|
if (!resIsPw2)
|
||||||
|
printf("tests for pffft_is_power_of_two() succeeded successfully.\n");
|
||||||
|
|
||||||
|
resFFT = 0;
|
||||||
|
for (N = 32; N <= 65536; N *= 2) {
|
||||||
|
result = test(N, 1 /* cplx fft */, 1 /* useOrdered */);
|
||||||
|
resN = result;
|
||||||
|
resFFT |= result;
|
||||||
|
|
||||||
|
result = test(N, 0 /* cplx fft */, 1 /* useOrdered */);
|
||||||
|
resN |= result;
|
||||||
|
resFFT |= result;
|
||||||
|
|
||||||
|
result = test(N, 1 /* cplx fft */, 0 /* useOrdered */);
|
||||||
|
resN |= result;
|
||||||
|
resFFT |= result;
|
||||||
|
|
||||||
|
result = test(N, 0 /* cplx fft */, 0 /* useOrdered */);
|
||||||
|
resN |= result;
|
||||||
|
resFFT |= result;
|
||||||
|
|
||||||
|
if (!resN)
|
||||||
|
printf("tests for size %d succeeded successfully.\n", N);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!resFFT)
|
||||||
|
printf("all pffft transform tests (FORWARD/BACKWARD, REAL/COMPLEX, "
|
||||||
|
#ifdef PFFFT_ENABLE_FLOAT
|
||||||
|
"float"
|
||||||
|
#endif
|
||||||
|
#if defined(PFFFT_ENABLE_FLOAT) && defined(PFFFT_ENABLE_DOUBLE)
|
||||||
|
"/"
|
||||||
|
#endif
|
||||||
|
#ifdef PFFFT_ENABLE_DOUBLE
|
||||||
|
"double"
|
||||||
|
#endif
|
||||||
|
") succeeded successfully.\n");
|
||||||
|
|
||||||
|
resAll = resNextPw2 | resIsPw2 | resFFT;
|
||||||
|
if (!resAll)
|
||||||
|
printf("all tests succeeded successfully.\n");
|
||||||
|
else
|
||||||
|
printf("there are failed tests!\n");
|
||||||
|
|
||||||
|
return resAll;
|
||||||
|
}
|
||||||
24
pffft/uninstall.cmake
Normal file
24
pffft/uninstall.cmake
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
# Uninstall script: removes every file that is listed in the install manifest
# written by a previous "install" step.
#
# The manifest is looked up as ${CMAKE_CURRENT_BINARY_DIR}/install_manifest.txt.
# NOTE(review): presumably this file is executed in script mode (cmake -P),
# where CMAKE_CURRENT_BINARY_DIR is the current working directory - so it has
# to be started from the build directory. Confirm against the caller.
#
# The script aborts with FATAL_ERROR if the manifest is missing or if one of
# the listed files cannot be removed. Listed files that are already gone are
# only reported.
set(MANIFEST "${CMAKE_CURRENT_BINARY_DIR}/install_manifest.txt")

if(NOT EXISTS "${MANIFEST}")
  message(FATAL_ERROR "Cannot find install manifest: '${MANIFEST}'")
endif()

# one installed path per manifest line
file(STRINGS "${MANIFEST}" files)

foreach(installed_file IN LISTS files)
  # EXISTS is false for a dangling symbolic link; check IS_SYMLINK as well,
  # so that such links are removed too
  if(EXISTS "${installed_file}" OR IS_SYMLINK "${installed_file}")
    message(STATUS "Removing file: '${installed_file}'")

    # execute_process() replaces the deprecated exec_program(). Every argument
    # is passed on its own, therefore paths containing spaces are handled.
    execute_process(
      COMMAND "${CMAKE_COMMAND}" -E remove "${installed_file}"
      OUTPUT_VARIABLE remove_output
      RESULT_VARIABLE result
    )

    # compare as strings: on a failure to launch the command the result
    # variable holds an error text instead of a number
    if(NOT "${result}" STREQUAL "0")
      message(FATAL_ERROR "Failed to remove file: '${installed_file}'.")
    endif()
  else()
    message(STATUS "File '${installed_file}' does not exist.")
  endif()
endforeach()
|
||||||
2
pffft/use_gcc8.inc
Normal file
2
pffft/use_gcc8.inc
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
# Export the paths of gcc-8 / g++-8, as reported by 'which', in the
# variables GCC_WITH_CMAKE and GPP_WITH_CMAKE.
# NOTE(review): presumably meant to be sourced (". use_gcc8.inc") before the
# build scripts are run - confirm against the scripts reading these variables.
#
# The command substitutions are quoted: shells that treat 'export' like an
# ordinary command would otherwise apply word splitting to the result.
export GCC_WITH_CMAKE="$(which gcc-8)"
export GPP_WITH_CMAKE="$(which g++-8)"
|
||||||
Reference in New Issue
Block a user