add pffft
This commit is contained in:
279
pffft/.github/workflows/c-cpp.yml
vendored
Normal file
279
pffft/.github/workflows/c-cpp.yml
vendored
Normal file
@@ -0,0 +1,279 @@
|
||||
# Continuous-integration workflow: builds pffft in several configurations.
name: C/C++ CI

# Triggered by pushes to, and pull requests against, the listed branches.
on:
  push:
    branches: [master, github_actions]
  pull_request:
    branches: [master, github_actions]

env:
  # CMake build type passed to every configure step
  # (Release, Debug, RelWithDebInfo, etc.)
  BUILD_TYPE: Release

jobs:
|
||||
build_w_mipp_ubuntu-amd64:
  # Ubuntu build with the MIPP headers installed into $HOME/.local beforehand.
  # Builds five configurations (SIMD float+double, SIMD float, SIMD double,
  # no-SIMD, no-SIMD with scalar vectors) and uploads them as one tarball.
  runs-on: ubuntu-latest

  steps:
    # Pinned to a released major version instead of the floating master ref.
    - name: check out MIPP
      uses: actions/checkout@v4
      with:
        repository: hayguen/MIPP
        path: ./MIPP
    - name: cmake configure MIPP
      run: cmake -S MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$HOME/.local
    - name: cmake install MIPP headers
      run: >-
        cmake --build MIPP_build --target install &&
        ls -alh $HOME/.local/ &&
        ls -alh $HOME/.local/include/

    # Check out this repository into the workspace root.
    - uses: actions/checkout@v4
    - name: cmake_make_simd_float_double
      run: >-
        mkdir build_simd_full &&
        cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_simd_full
    - name: cmake_make_simd_float
      run: >-
        mkdir build_simd_float &&
        cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_TYPE_DOUBLE=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_simd_float
    - name: cmake_make_simd_double
      run: >-
        mkdir build_simd_double &&
        cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_TYPE_FLOAT=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_simd_double
    - name: cmake_make_no-simd_float_double
      run: >-
        mkdir build_no-simd_full &&
        cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_no-simd_full
    - name: cmake_make_no-simd_scalar_float_double
      run: >-
        mkdir build_no-simd_scalar_full &&
        cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_no-simd_scalar_full
    - name: compress
      run: >-
        tar zcvf pffft_w_mipp_ubuntu-amd64.tar.gz
        --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt
        build_simd_full build_simd_float build_simd_double
        build_no-simd_full build_no-simd_scalar_full
    # upload-artifact v4 refuses two uploads under one artifact name, so this
    # job no longer shares "pffft_ubuntu_builds" with build_ubuntu-amd64.
    - name: 'Upload Artifact'
      uses: actions/upload-artifact@v4
      with:
        name: pffft_w_mipp_ubuntu_builds
        path: pffft_w_mipp_ubuntu-amd64.tar.gz
|
||||
|
||||
build_ubuntu-amd64:
  # Ubuntu build without MIPP. Builds five configurations (SIMD float+double,
  # SIMD float, SIMD double, no-SIMD, no-SIMD with scalar vectors) and uploads
  # them as one tarball.
  runs-on: ubuntu-latest

  steps:
    # Action versions moved off the deprecated v2 majors.
    - uses: actions/checkout@v4
    - name: cmake_make_simd_float_double
      run: >-
        mkdir build_simd_full &&
        cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_simd_full
    - name: cmake_make_simd_float
      run: >-
        mkdir build_simd_float &&
        cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_TYPE_DOUBLE=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_simd_float
    - name: cmake_make_simd_double
      run: >-
        mkdir build_simd_double &&
        cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_TYPE_FLOAT=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_simd_double
    - name: cmake_make_no-simd_float_double
      run: >-
        mkdir build_no-simd_full &&
        cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_no-simd_full
    - name: cmake_make_no-simd_scalar_float_double
      run: >-
        mkdir build_no-simd_scalar_full &&
        cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_no-simd_scalar_full
    - name: compress
      run: >-
        tar zcvf pffft_ubuntu-amd64.tar.gz
        --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt
        build_simd_full build_simd_float build_simd_double
        build_no-simd_full build_no-simd_scalar_full
    - name: 'Upload Artifact'
      uses: actions/upload-artifact@v4
      with:
        name: pffft_ubuntu_builds
        path: pffft_ubuntu-amd64.tar.gz
|
||||
|
||||
cross_build_win_from_linux:
  # Cross-compiles Windows binaries (32 and 64 bit, with and without SIMD) on
  # Linux using the MinGW-w64 toolchains from apt.
  # The ubuntu-20.04 hosted image has been retired, hence the newer pinned image.
  runs-on: ubuntu-22.04

  steps:
    - name: prerequisites
      run: sudo apt -qq update && sudo apt -yqq install gcc-mingw-w64 g++-mingw-w64

    # Pinned to a released major version instead of the floating master ref.
    - name: check out MIPP
      uses: actions/checkout@v4
      with:
        repository: hayguen/MIPP
        path: ./MIPP
    # MIPP is configured from the workspace checkout but built and installed
    # one level above it (runner.workspace). The source path uses
    # $GITHUB_WORKSPACE rather than a hard-coded "pffft/" directory so the
    # step also works when the repository has a different name.
    - name: cmake configure MIPP
      working-directory: ${{runner.workspace}}
      run: cmake -S "$GITHUB_WORKSPACE/MIPP" -B MIPP_build -DCMAKE_INSTALL_PREFIX=$(pwd)
    - name: cmake install MIPP headers
      working-directory: ${{runner.workspace}}
      run: cmake --build MIPP_build --target install

    - uses: actions/checkout@v4
    - name: build_w32_no-simd
      working-directory: ${{runner.workspace}}
      run: >-
        cd $GITHUB_WORKSPACE &&
        bash ./cross_build_mingw32.sh no-simd -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
    # X remembers the MIPP install prefix (runner.workspace) before changing
    # into the repository checkout.
    - name: build_w32_simd_full
      working-directory: ${{runner.workspace}}
      run: >-
        X=$(pwd) && cd $GITHUB_WORKSPACE &&
        bash ./cross_build_mingw32.sh simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=pentium4 -DTARGET_C_ARCH=pentium4
        -DMIPP_INCLUDE_DIRS=$X/include/mipp

    - name: build_w64_no-simd
      working-directory: ${{runner.workspace}}
      run: >-
        cd $GITHUB_WORKSPACE &&
        bash ./cross_build_mingw64.sh no-simd -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
    - name: build_w64_simd_full
      working-directory: ${{runner.workspace}}
      run: >-
        X=$(pwd) && cd $GITHUB_WORKSPACE &&
        bash ./cross_build_mingw64.sh simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=sandybridge -DTARGET_C_ARCH=sandybridge
        -DMIPP_INCLUDE_DIRS=$X/include/mipp

    - name: compress
      run: >-
        tar zcvf pffft_cross-build-windows-from-linux-amd64.tar.gz
        --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt
        build_w32_no-simd build_w32_simd_full build_w64_no-simd build_w64_simd_full
    - name: 'Upload Artifact'
      uses: actions/upload-artifact@v4
      with:
        name: pffft_windows_from_cross_builds
        path: pffft_cross-build-windows-from-linux-amd64.tar.gz
|
||||
|
||||
|
||||
build_win_msvc:
  # Native Windows build with the Visual Studio generator: one no-SIMD
  # configuration plus SSE2, AVX and AVX2 variants that use the MIPP headers.
  # The CMake configure and build commands are platform agnostic and should
  # work equally well on Windows or Mac. You can convert this to a matrix build
  # if you need cross-platform coverage.
  # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
  #
  # The windows-2019 hosted image has been retired; the generator name below
  # is matched to the Visual Studio release on the windows-2022 image.
  runs-on: windows-2022

  steps:
    # Pinned to a released major version instead of the floating master ref.
    - name: check out MIPP
      uses: actions/checkout@v4
      with:
        repository: hayguen/MIPP
        path: ./MIPP
    # The source path uses $GITHUB_WORKSPACE rather than a hard-coded "pffft/"
    # directory so the step also works when the repository has another name.
    - name: cmake configure MIPP
      shell: bash
      working-directory: ${{runner.workspace}}
      run: cmake -S "$GITHUB_WORKSPACE/MIPP" -B MIPP_build -DCMAKE_INSTALL_PREFIX=$(pwd)
    - name: cmake install MIPP headers
      working-directory: ${{runner.workspace}}
      run: cmake --build MIPP_build --target install

    - uses: actions/checkout@v4

    # Bash is used for the configure/build steps so that environment variables
    # are accessed with the same syntax regardless of the host operating system.
    - name: Configure CMake No-SIMD
      shell: bash
      working-directory: ${{runner.workspace}}
      run: >-
        cmake -S $GITHUB_WORKSPACE -B build_no-simd -G "Visual Studio 17 2022" -A x64
        -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DENABLE_PFDSP=ON -DPFFFT_USE_SIMD=OFF
        -DTARGET_CXX_ARCH=none -DTARGET_C_ARCH=none
    - name: Build No-SIMD
      shell: bash
      working-directory: ${{runner.workspace}}
      # Execute the build. You can specify a specific target with "--target <NAME>"
      run: cmake --build build_no-simd --config $BUILD_TYPE

    - name: Configure CMake SSE2
      shell: bash
      working-directory: ${{runner.workspace}}
      run: >-
        cmake -S $GITHUB_WORKSPACE -B build_sse2 -G "Visual Studio 17 2022" -A x64
        -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DENABLE_PFDSP=ON
        -DTARGET_CXX_ARCH=SSE2 -DTARGET_C_ARCH=SSE2
        -DMIPP_INCLUDE_DIRS=$(pwd)/include/mipp
    - name: Build SSE2
      shell: bash
      working-directory: ${{runner.workspace}}
      run: cmake --build build_sse2 --config $BUILD_TYPE

    - name: Configure CMake AVX
      shell: bash
      working-directory: ${{runner.workspace}}
      run: >-
        cmake -S $GITHUB_WORKSPACE -B build_avx -G "Visual Studio 17 2022" -A x64
        -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DENABLE_PFDSP=ON
        -DTARGET_CXX_ARCH=AVX -DTARGET_C_ARCH=AVX
        -DMIPP_INCLUDE_DIRS=$(pwd)/include/mipp
    - name: Build AVX
      shell: bash
      working-directory: ${{runner.workspace}}
      run: cmake --build build_avx --config $BUILD_TYPE

    - name: Configure CMake AVX2
      shell: bash
      working-directory: ${{runner.workspace}}
      run: >-
        cmake -S $GITHUB_WORKSPACE -B build_avx2 -G "Visual Studio 17 2022" -A x64
        -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DENABLE_PFDSP=ON
        -DTARGET_CXX_ARCH=AVX2 -DTARGET_C_ARCH=AVX2
        -DMIPP_INCLUDE_DIRS=$(pwd)/include/mipp
    - name: Build AVX2
      shell: bash
      working-directory: ${{runner.workspace}}
      run: cmake --build build_avx2 --config $BUILD_TYPE

    # The build trees live in runner.workspace, so the tarball is created there
    # and uploaded by absolute path.
    - name: compress
      working-directory: ${{runner.workspace}}
      run: >-
        tar zcvf pffft_windows-msvc-amd64.tar.gz
        --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt
        build_no-simd build_sse2 build_avx build_avx2
    - name: 'Upload Artifact'
      uses: actions/upload-artifact@v4
      with:
        name: pffft_windows_msvc_builds
        path: ${{runner.workspace}}/pffft_windows-msvc-amd64.tar.gz
|
||||
|
||||
|
||||
build_win_mingw:
  # Native Windows build inside an MSYS2 environment with gcc.
  # The windows-2019 hosted image has been retired, hence the newer image.
  runs-on: windows-2022
  strategy:
    matrix:
      compiler: [gcc]
      msystem: [MINGW64]
  defaults:
    run:
      # Every "run" step executes inside the MSYS2 shell installed below.
      shell: msys2 {0}
  steps:
    - uses: actions/checkout@v4
    # The matrix values are now actually consumed instead of being repeated
    # as literals; with the single-entry matrix above this resolves to the
    # same MINGW64 / gcc combination as before.
    - uses: msys2/setup-msys2@v2
      with:
        msystem: ${{ matrix.msystem }}
        install: gcc cmake make
    - name: Configure cmake
      run: >-
        CC=${{ matrix.compiler }} cmake -DMINGW=ON -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native
        -S . -B build_mgw64
    - name: Build
      run: cmake --build build_mgw64

    - name: compress
      run: >-
        tar zcvf pffft_windows-mingw-amd64.tar.gz
        --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt
        build_mgw64
    - name: 'Upload Artifact'
      uses: actions/upload-artifact@v4
      with:
        name: pffft_windows_mingw_builds
        path: pffft_windows-mingw-amd64.tar.gz
|
||||
|
||||
|
||||
build_macos11:
  # copied from build_ubuntu-amd64 with minor renaming
  #
  # The macos-11 hosted image has been retired. The job key and tarball name
  # keep their "11" suffix so existing references stay valid.
  # NOTE(review): macos-13 was chosen as an Intel image so that
  # -DTARGET_*_ARCH=native keeps targeting x86-64 - confirm that this label
  # is still offered and switch to the current Intel image if it is not.
  runs-on: macos-13

  steps:
    - uses: actions/checkout@v4
    - name: cmake_make_simd_float_double
      run: >-
        mkdir build_simd_full &&
        cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_simd_full
    - name: cmake_make_simd_float
      run: >-
        mkdir build_simd_float &&
        cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_TYPE_DOUBLE=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_simd_float
    - name: cmake_make_simd_double
      run: >-
        mkdir build_simd_double &&
        cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_TYPE_FLOAT=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_simd_double
    - name: cmake_make_no-simd_float_double
      run: >-
        mkdir build_no-simd_full &&
        cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_no-simd_full
    - name: cmake_make_no-simd_scalar_float_double
      run: >-
        mkdir build_no-simd_scalar_full &&
        cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_no-simd_scalar_full
    - name: compress
      run: >-
        tar zcvf pffft_macos-11.tar.gz
        --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt
        build_simd_full build_simd_float build_simd_double
        build_no-simd_full build_no-simd_scalar_full
    - name: 'Upload Artifact'
      uses: actions/upload-artifact@v4
      with:
        name: pffft_macos_builds
        path: pffft_macos-11.tar.gz
|
||||
|
||||
build_w_mipp_macos11:
  # copied from build_w_mipp_ubuntu-amd64 with minor renaming
  #
  # The macos-11 hosted image has been retired. The job key and tarball name
  # keep their "11" suffix so existing references stay valid.
  # NOTE(review): macos-13 was chosen as an Intel image so that
  # -DTARGET_*_ARCH=native keeps targeting x86-64 - confirm that this label
  # is still offered and switch to the current Intel image if it is not.
  runs-on: macos-13

  steps:
    # Pinned to a released major version instead of the floating master ref.
    - name: check out MIPP
      uses: actions/checkout@v4
      with:
        repository: hayguen/MIPP
        path: ./MIPP
    - name: cmake configure MIPP
      run: cmake -S MIPP -B MIPP_build -DCMAKE_INSTALL_PREFIX=$HOME/.local
    - name: cmake install MIPP headers
      run: >-
        cmake --build MIPP_build --target install &&
        ls -alh $HOME/.local/ &&
        ls -alh $HOME/.local/include/

    - uses: actions/checkout@v4
    - name: cmake_make_simd_float_double
      run: >-
        mkdir build_simd_full &&
        cmake -S . -B build_simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_simd_full
    - name: cmake_make_simd_float
      run: >-
        mkdir build_simd_float &&
        cmake -S . -B build_simd_float -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_TYPE_DOUBLE=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_simd_float
    - name: cmake_make_simd_double
      run: >-
        mkdir build_simd_double &&
        cmake -S . -B build_simd_double -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_TYPE_FLOAT=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_simd_double
    - name: cmake_make_no-simd_float_double
      run: >-
        mkdir build_no-simd_full &&
        cmake -S . -B build_no-simd_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_no-simd_full
    - name: cmake_make_no-simd_scalar_float_double
      run: >-
        mkdir build_no-simd_scalar_full &&
        cmake -S . -B build_no-simd_scalar_full -DCMAKE_BUILD_TYPE=$BUILD_TYPE
        -DPFFFT_USE_SIMD=OFF -DPFFFT_USE_SCALAR_VECT=ON
        -DPFFFT_USE_BENCH_GREEN=OFF -DPFFFT_USE_BENCH_KISS=OFF -DPFFFT_USE_BENCH_POCKET=OFF
        -DTARGET_CXX_ARCH=native -DTARGET_C_ARCH=native &&
        cmake --build build_no-simd_scalar_full
    - name: compress
      run: >-
        tar zcvf pffft_w_mipp_macos-11.tar.gz
        --exclude=CMakeFiles --exclude=*.cmake --exclude=Makefile --exclude=CMakeCache.txt
        build_simd_full build_simd_float build_simd_double
        build_no-simd_full build_no-simd_scalar_full
    # upload-artifact v4 refuses two uploads under one artifact name, so this
    # job no longer shares "pffft_macos_builds" with build_macos11.
    - name: 'Upload Artifact'
      uses: actions/upload-artifact@v4
      with:
        name: pffft_w_mipp_macos_builds
        path: pffft_w_mipp_macos-11.tar.gz
|
||||
4
pffft/.gitignore
vendored
Normal file
4
pffft/.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
# CMake build directories
build
build_benches
build_*

# editor settings
.vscode
|
||||
9
pffft/.gitmodules
vendored
Normal file
9
pffft/.gitmodules
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
# Optional FFT implementations; the CMake build adds each one to the
# benchmark only when its subdirectory is present.

[submodule "greenffts"]
    path = greenffts
    url = https://github.com/hayguen/greenffts.git

[submodule "kissfft"]
    path = kissfft
    url = https://github.com/hayguen/kissfft.git

[submodule "pocketfft"]
    path = pocketfft
    url = https://github.com/hayguen/pocketfft.git
|
||||
663
pffft/CMakeLists.txt
Normal file
663
pffft/CMakeLists.txt
Normal file
@@ -0,0 +1,663 @@
|
||||
# The minimum is 2.8.12 because this file calls target_compile_options().
# The <min>...<max> form declares policy compatibility up to 3.10 so that
# current CMake releases, which refuse policy versions older than 3.5, can
# still configure the project. CMake versions older than 3.12 ignore the
# "...<max>" part and use the minimum only.
# NOTE(review): the raised policy level also applies to the helper modules
# included from cmake/ - confirm they do not depend on pre-3.0 policy behavior
# (for example the compiler id reported for Apple's clang).
cmake_minimum_required(VERSION 2.8.12...3.10)
project(PRETTY_FAST_FFT)
|
||||
|
||||
# ---- precision selection (disable one for a smaller library) ----
option(PFFFT_USE_TYPE_FLOAT  "activate single precision 'float'?" ON)
option(PFFFT_USE_TYPE_DOUBLE "activate 'double' precision float?" ON)

# ---- architecture / optimization ----
option(PFFFT_USE_SIMD        "use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? - " ON)
option(PFFFT_USE_SCALAR_VECT "use 4-element vector scalar operations (if no other SIMD)" ON)

# ---- install selection ----
option(INSTALL_PFFFT      "install pffft to CMAKE_INSTALL_PREFIX?" ON)
option(INSTALL_PFDSP      "install pfdsp to CMAKE_INSTALL_PREFIX?" OFF)
option(INSTALL_PFFASTCONV "install pffastconv to CMAKE_INSTALL_PREFIX?" OFF)

# ---- benchmark / validation back-ends ----
option(PFFFT_USE_BENCH_FFTW   "use (system-installed) FFTW3 in fft benchmark?" OFF)
option(PFFFT_USE_BENCH_GREEN  "use Green FFT in fft benchmark? - if exists in subdir" ON)
option(PFFFT_USE_BENCH_KISS   "use KissFFT in fft benchmark? - if exists in subdir" ON)
option(PFFFT_USE_BENCH_POCKET "use PocketFFT in fft benchmark? - if exists in subdir" ON)
option(PFFFT_USE_BENCH_MKL    "use Intel MKL in fft benchmark? needs to be installed" OFF)
option(PFFFT_USE_FFTPACK      "compile and use FFTPACK in fft benchmark & validation?" ON)

# ---- debugging / toolchain quirks ----
option(PFFFT_USE_DEBUG_ASAN "use GCC's address sanitizer?" OFF)

option(PFFFT_DISABLE_LINK_WITH_M "Disables linking with m library to build with clangCL from MSVC" OFF)
|
||||
|
||||
# C is built as C99 without compiler extensions: C90 would need the gcc
# extensions for function attributes like always_inline, C99 provides them.
set(CMAKE_C_STANDARD 99)
set(CMAKE_C_EXTENSIONS OFF)

# C++ defaults to C++98, strictly required, without compiler extensions.
set(CMAKE_CXX_STANDARD 98)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Lists of what to install; they are filled in further below.
set(INSTALL_TARGETS "")
set(INSTALL_HEADERS "")

# At least one precision has to stay enabled.
if (NOT (PFFFT_USE_TYPE_FLOAT OR PFFFT_USE_TYPE_DOUBLE))
  message(FATAL_ERROR "activate at least one of PFFFT_USE_TYPE_FLOAT or PFFFT_USE_TYPE_DOUBLE")
endif()

# Project-local CMake modules and helper functions.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
include(cmake/target_optimizations.cmake)
include(cmake/compiler_warnings.cmake)
|
||||
# Both packages are optional: neither lookup is marked REQUIRED.
find_package(PAPI)
find_package(MIPP)

# Report the outcome of the MIPP lookup.
if (NOT MIPP_FOUND)
  message(STATUS "NOT found MIPP")
else()
  # if (TARGET MIPP)
  message(STATUS "found MIPP")
endif()
|
||||
|
||||
|
||||
# Library name appended to link lines when the address sanitizer is enabled.
set(ASANLIB "")
if (PFFFT_USE_DEBUG_ASAN)
  set(ASANLIB "asan")
endif()

# Informational output about the detected compiler and platform.
message(STATUS "INFO: CMAKE_C_COMPILER_ID is ${CMAKE_C_COMPILER_ID}")
message(STATUS "INFO: CMAKE_CXX_COMPILER_ID is ${CMAKE_CXX_COMPILER_ID}")

if (NOT WIN32)
  message(STATUS "INFO: NOT WIN32")
else()
  message(STATUS "INFO: detected WIN32")
endif()

if (NOT MINGW)
  message(STATUS "INFO: NOT MINGW")
else()
  message(STATUS "INFO: detected MINGW with compiler ${CMAKE_C_COMPILER_ID}")
endif()

if (CMAKE_C_COMPILER_ID MATCHES "MSVC")
  message(STATUS "INFO: detected MSVC with compiler ${CMAKE_C_COMPILER_ID}")
endif()
|
||||
|
||||
|
||||
# Optional third-party FFT libraries for the benchmark. Each one is added only
# when its option is ON and the marker file exists in the subdirectory;
# otherwise a warning is printed and the PATH_* variable stays untouched.

if (PFFFT_USE_BENCH_GREEN)
  if (NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/greenffts/CMakeLists.txt")
    message(WARNING "GreenFFT not found in subdir greenffts")
  else()
    message(STATUS "found subdir greenffts")
    set(PATH_GREEN "${CMAKE_CURRENT_LIST_DIR}/greenffts")
    add_subdirectory("${PATH_GREEN}")
  endif()
endif()

if (PFFFT_USE_BENCH_KISS)
  # git submodule add https://github.com/hayguen/kissfft.git
  if (NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/kissfft/CMakeLists.txt")
    message(WARNING "KissFFT not found in subdir kissfft")
  else()
    message(STATUS "found subdir kissfft")
    set(PATH_KISS "${CMAKE_CURRENT_LIST_DIR}/kissfft")
    add_subdirectory("${PATH_KISS}")
  endif()
endif()

if (PFFFT_USE_BENCH_POCKET)
  # git submodule add https://github.com/hayguen/pocketfft.git
  if (NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/pocketfft/pocketfft_double.c")
    message(WARNING "PocketFFT not found in subdir pocketfft")
  else()
    message(STATUS "found subdir pocketfft")
    set(PATH_POCKET "${CMAKE_CURRENT_LIST_DIR}/pocketfft")
    add_subdirectory("${PATH_POCKET}")
  endif()
endif()
|
||||
|
||||
|
||||
########################################################################
# Default to a Release build, so optimization flags are used, when the
# caller did not choose a build type.
########################################################################
if (NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Release")
  message(STATUS "Build type not specified: defaulting to release.")
endif()

# Math library: never linked with MSVC; linked as "m" elsewhere unless
# PFFFT_DISABLE_LINK_WITH_M is set, in which case MATHLIB is not assigned.
if (CMAKE_C_COMPILER_ID MATCHES "MSVC")
  # using Visual Studio C++
  message(STATUS "INFO: detected MSVC: will not link math lib m")
  set(MATHLIB "")

  add_definitions("/D_CRT_SECURE_NO_WARNINGS")

  set(MSVC_DISABLED_WARNINGS_LIST "C4996")
elseif (NOT PFFFT_DISABLE_LINK_WITH_M)
  message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m")
  set(MATHLIB "m")
endif()

# The C++ runtime is named explicitly on link lines only for MinGW.
if (MINGW)
  set(STDCXXLIB "stdc++")
else()
  set(STDCXXLIB "")
endif()
|
||||
|
||||
|
||||
# SIMD abstraction headers, per precision.
set(SIMD_FLOAT_HDRS
  simd/pf_float.h
  simd/pf_sse1_float.h
  simd/pf_altivec_float.h
  simd/pf_neon_float.h
  simd/pf_scalar_float.h
)
set(SIMD_DOUBLE_HDRS
  simd/pf_double.h
  simd/pf_avx_double.h
  simd/pf_scalar_double.h
)

# Single precision sources; the list is left unset when float is disabled.
set(FLOAT_SOURCES)
if (PFFFT_USE_TYPE_FLOAT)
  set(FLOAT_SOURCES pffft.c pffft.h ${SIMD_FLOAT_HDRS})
  if (INSTALL_PFFFT)
    list(APPEND INSTALL_HEADERS pffft.h)
  endif()
endif()

# Double precision sources; the list is left unset when double is disabled.
set(DOUBLE_SOURCES)
if (PFFFT_USE_TYPE_DOUBLE)
  set(DOUBLE_SOURCES pffft_double.c pffft_double.h ${SIMD_DOUBLE_HDRS})
  if (INSTALL_PFFFT)
    list(APPEND INSTALL_HEADERS pffft_double.h)
  endif()
endif()
|
||||
|
||||
######################################################
|
||||
|
||||
# Core static library "pffft": the enabled precision sources plus the files
# shared by both precisions.
add_library(PFFFT STATIC
  ${FLOAT_SOURCES}
  ${DOUBLE_SOURCES}
  pffft_common.c
  pffft_priv_impl.h
  pffft.hpp
)
set_target_properties(PFFFT PROPERTIES OUTPUT_NAME "pffft")
target_compile_definitions(PFFFT PRIVATE _USE_MATH_DEFINES)
target_activate_c_compiler_warnings(PFFFT)

if (PFFFT_USE_SCALAR_VECT)
  target_compile_definitions(PFFFT PRIVATE PFFFT_SCALVEC_ENABLED=1)
endif()

if (PFFFT_USE_DEBUG_ASAN)
  target_compile_options(PFFFT PRIVATE "-fsanitize=address")
endif()

# Architecture flags are applied regardless of PFFFT_USE_SIMD; turning SIMD
# off only adds the PFFFT_SIMD_DISABLE define.
target_set_c_arch_flags(PFFFT)
if (NOT PFFFT_USE_SIMD)
  target_compile_definitions(PFFFT PRIVATE PFFFT_SIMD_DISABLE=1)
endif()

target_link_libraries(PFFFT ${ASANLIB} ${MATHLIB})

# Targets linking PFFFT from the build tree get this directory as include path.
set_property(TARGET PFFFT APPEND PROPERTY
  INTERFACE_INCLUDE_DIRECTORIES $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
)

if (INSTALL_PFFFT)
  list(APPEND INSTALL_TARGETS PFFFT)
  list(APPEND INSTALL_HEADERS pffft.hpp)
endif()
|
||||
|
||||
######################################################
|
||||
|
||||
# Static library "pfdsp" (mixer, carrier, CIC sources); only built when the
# float type is enabled. It is compiled as C++11.
if (PFFFT_USE_TYPE_FLOAT)
  add_library(PFDSP STATIC
    pf_mixer.cpp pf_mixer.h
    pf_cplx.h
    pf_carrier.cpp pf_carrier.h
    pf_cic.cpp pf_cic.h
    fmv.h
  )
  set_target_properties(PFDSP PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED ON
    OUTPUT_NAME "pfdsp"
  )
  target_compile_definitions(PFDSP PRIVATE _USE_MATH_DEFINES)
  target_activate_cxx_compiler_warnings(PFDSP)

  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(PFDSP PRIVATE "-fsanitize=address")
  endif()

  # Without SIMD the architecture flags are skipped and the disable define
  # is set instead.
  if (NOT PFFFT_USE_SIMD)
    target_compile_definitions(PFDSP PRIVATE PFFFT_SIMD_DISABLE=1)
  else()
    target_set_cxx_arch_flags(PFDSP)
  endif()

  target_link_libraries(PFDSP ${MATHLIB})
  set_property(TARGET PFDSP APPEND PROPERTY
    INTERFACE_INCLUDE_DIRECTORIES $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
  )

  if (INSTALL_PFDSP)
    list(APPEND INSTALL_TARGETS PFDSP)
    list(APPEND INSTALL_HEADERS pf_mixer.h pf_cplx.h pf_carrier.h pf_cic.h)
  endif()
endif()
|
||||
|
||||
######################################################
|
||||
|
||||
# FFTPACK: the same source is compiled twice as a static library (float and
# double) and twice as a stand-alone program using its builtin test code.
if (PFFFT_USE_FFTPACK)

  # float / single precision library
  add_library(FFTPACK_FLOAT STATIC fftpack.c fftpack.h)
  target_compile_definitions(FFTPACK_FLOAT PRIVATE _USE_MATH_DEFINES)
  target_activate_c_compiler_warnings(FFTPACK_FLOAT)
  target_link_libraries(FFTPACK_FLOAT ${MATHLIB})
  set_property(TARGET FFTPACK_FLOAT APPEND PROPERTY
    INTERFACE_INCLUDE_DIRECTORIES $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
  )

  # double precision library; the precision define is PUBLIC so that it is
  # also passed on to targets linking this library
  add_library(FFTPACK_DOUBLE STATIC fftpack.c fftpack.h)
  target_compile_definitions(FFTPACK_DOUBLE
    PRIVATE _USE_MATH_DEFINES
    PUBLIC  FFTPACK_DOUBLE_PRECISION
  )
  target_activate_c_compiler_warnings(FFTPACK_DOUBLE)
  target_link_libraries(FFTPACK_DOUBLE ${MATHLIB})
  set_property(TARGET FFTPACK_DOUBLE APPEND PROPERTY
    INTERFACE_INCLUDE_DIRECTORIES $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
  )

  # builtin test program of fftpack, float variant
  add_executable(test_fftpack_float fftpack.c fftpack.h)
  target_compile_definitions(test_fftpack_float
    PRIVATE _USE_MATH_DEFINES TESTING_FFTPACK
  )
  target_link_libraries(test_fftpack_float ${MATHLIB})

  # builtin test program of fftpack, double variant
  add_executable(test_fftpack_double fftpack.c fftpack.h)
  target_compile_definitions(test_fftpack_double
    PRIVATE _USE_MATH_DEFINES FFTPACK_DOUBLE_PRECISION TESTING_FFTPACK
  )
  target_link_libraries(test_fftpack_double ${MATHLIB})

endif()
|
||||
|
||||
######################################################
|
||||
|
||||
# Static library "pffastconv", built on top of PFFFT.
# only 'float' supported in PFFASTCONV
if (PFFFT_USE_TYPE_FLOAT)
  add_library(PFFASTCONV STATIC
    pffastconv.c
    pffastconv.h
    pffft.h
  )
  set_target_properties(PFFASTCONV PROPERTIES OUTPUT_NAME "pffastconv")
  target_compile_definitions(PFFASTCONV PRIVATE _USE_MATH_DEFINES)
  target_activate_c_compiler_warnings(PFFASTCONV)

  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(PFFASTCONV PRIVATE "-fsanitize=address")
  endif()

  target_link_libraries(PFFASTCONV PFFFT ${ASANLIB} ${MATHLIB})
  set_property(TARGET PFFASTCONV APPEND PROPERTY
    INTERFACE_INCLUDE_DIRECTORIES $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
  )

  if (INSTALL_PFFASTCONV)
    list(APPEND INSTALL_TARGETS PFFASTCONV)
    list(APPEND INSTALL_HEADERS pffastconv.h)
  endif()
endif()
|
||||
|
||||
|
||||
######################################################
|
||||
|
||||
# Install whatever the INSTALL_* options selected above. The rules are skipped
# when a list is empty so that no install() call is issued without items.
if (INSTALL_TARGETS)
  install(TARGETS ${INSTALL_TARGETS} DESTINATION lib)
endif()
if (INSTALL_HEADERS)
  install(FILES ${INSTALL_HEADERS} DESTINATION include)
endif()

# "uninstall" runs the uninstall.cmake script that lives next to this file.
# CMAKE_CURRENT_SOURCE_DIR (instead of CMAKE_SOURCE_DIR) keeps the script path
# correct when this project is pulled in via add_subdirectory() by a parent
# project, and the TARGET guard avoids a clash if the parent already defines
# a target with the same name.
if (NOT TARGET uninstall)
  add_custom_target(uninstall
    "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_SOURCE_DIR}/uninstall.cmake"
  )
endif()
|
||||
|
||||
#######################################################
|
||||
|
||||
# test_pffft.c is compiled once per enabled precision; the precision is
# selected through the PFFFT_ENABLE_* define.
if (PFFFT_USE_TYPE_FLOAT)
  add_executable(test_pffft_float test_pffft.c)
  target_compile_definitions(test_pffft_float
    PRIVATE _USE_MATH_DEFINES PFFFT_ENABLE_FLOAT
  )
  target_link_libraries(test_pffft_float PFFFT ${ASANLIB})
endif()

######################################################

if (PFFFT_USE_TYPE_DOUBLE)
  add_executable(test_pffft_double test_pffft.c)
  target_compile_definitions(test_pffft_double
    PRIVATE _USE_MATH_DEFINES PFFFT_ENABLE_DOUBLE
  )
  target_link_libraries(test_pffft_double PFFFT ${ASANLIB})
endif()
|
||||
|
||||
######################################################
|
||||
|
||||
# test_fft_factors: a single executable that receives one PFFFT_ENABLE_*
# define per enabled precision.
add_executable(test_fft_factors test_fft_factors.c)

if (PFFFT_USE_TYPE_FLOAT)
  target_compile_definitions(test_fft_factors PRIVATE PFFFT_ENABLE_FLOAT)
endif()

if (PFFFT_USE_TYPE_DOUBLE)
  target_compile_definitions(test_fft_factors PRIVATE PFFFT_ENABLE_DOUBLE)
endif()

target_link_libraries(test_fft_factors
  PFFFT
  ${ASANLIB}
  ${MATHLIB}
)
|
||||
|
||||
######################################################
|
||||
|
||||
# C++ test program, built with the project-wide default C++ standard.
add_executable(test_pffft_cpp test_pffft.cpp)
target_compile_definitions(test_pffft_cpp PRIVATE _USE_MATH_DEFINES)

if (PFFFT_USE_TYPE_FLOAT)
  target_compile_definitions(test_pffft_cpp PRIVATE PFFFT_ENABLE_FLOAT)
endif()

if (PFFFT_USE_TYPE_DOUBLE)
  target_compile_definitions(test_pffft_cpp PRIVATE PFFFT_ENABLE_DOUBLE)
endif()

target_link_libraries(test_pffft_cpp
  PFFFT
  ${STDCXXLIB}
  ${ASANLIB}
)
|
||||
|
||||
######################################################
|
||||
|
||||
# test_pffft_cpp_11: the same source as test_pffft_cpp, but with C++11
# requested as a hard requirement (CXX_STANDARD_REQUIRED).
add_executable(test_pffft_cpp_11 test_pffft.cpp)
target_compile_definitions(test_pffft_cpp_11 PRIVATE _USE_MATH_DEFINES)
foreach (precision FLOAT DOUBLE)
  if (PFFFT_USE_TYPE_${precision})
    target_compile_definitions(test_pffft_cpp_11 PRIVATE PFFFT_ENABLE_${precision})
  endif()
endforeach()
target_link_libraries(test_pffft_cpp_11
  PFFFT
  ${STDCXXLIB}
  ${ASANLIB}
)

set_target_properties(test_pffft_cpp_11
  PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED ON
)
|
||||
|
||||
######################################################
|
||||
|
||||
# test_pffastconv: test/benchmark program for the PFFASTCONV library.
# Only built when the float type is enabled.
if (PFFFT_USE_TYPE_FLOAT)
  add_executable(test_pffastconv
    test_pffastconv.c
    ${SIMD_FLOAT_HDRS}
    ${SIMD_DOUBLE_HDRS}
  )
  target_compile_definitions(test_pffastconv PRIVATE _USE_MATH_DEFINES)

  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(test_pffastconv PRIVATE "-fsanitize=address")
  endif()

  # apply the project's C architecture flags to this target
  target_set_c_arch_flags(test_pffastconv)

  # without SIMD, tell the sources to disable their SIMD code paths
  if (NOT PFFFT_USE_SIMD)
    target_compile_definitions(test_pffastconv PRIVATE PFFFT_SIMD_DISABLE=1)
  endif()

  target_link_libraries(test_pffastconv
    PFFASTCONV
    ${ASANLIB}
    ${MATHLIB}
  )
endif()
|
||||
|
||||
######################################################
|
||||
|
||||
# bench_pffft_float: single-precision FFT benchmark. Every optional
# third-party FFT is switched on through a HAVE_* compile definition plus the
# matching library on the link line.
if (PFFFT_USE_TYPE_FLOAT)
  add_executable(bench_pffft_float bench_pffft.c pffft.h)
  target_compile_definitions(bench_pffft_float
    PRIVATE
      _USE_MATH_DEFINES
      PFFFT_ENABLE_FLOAT
  )
  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(bench_pffft_float PRIVATE "-fsanitize=address")
  endif()

  target_link_libraries(bench_pffft_float PFFFT ${ASANLIB})

  # FFTPACK (float build)
  if (PFFFT_USE_FFTPACK)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_FFTPACK=1)
    target_link_libraries(bench_pffft_float FFTPACK_FLOAT)
  endif()

  # FFTW, single precision library
  if (PFFFT_USE_BENCH_FFTW)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_FFTW=1)
    target_link_libraries(bench_pffft_float fftw3f)
  endif()

  # Green FFT - needs both its path and the option
  if (PATH_GREEN AND PFFFT_USE_BENCH_GREEN)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_GREEN_FFTS=1)
    target_link_libraries(bench_pffft_float GreenFFT)
  endif()

  # KissFFT - needs both its path and the option
  if (PATH_KISS AND PFFFT_USE_BENCH_KISS)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_KISS_FFT=1)
    target_link_libraries(bench_pffft_float KissFFT)
  endif()

  # PocketFFT - needs both its path and the option
  if (PATH_POCKET AND PFFFT_USE_BENCH_POCKET)
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_POCKET_FFT=1)
    target_link_libraries(bench_pffft_float PocketFFT)
  endif()

  # Intel MKL
  if (PFFFT_USE_BENCH_MKL)
    # "i686" and "x86_64" have chances to work; warn for everything else.
    # Other PROCESSORs could be "ppc", "ppc64", "arm", "aarch64", "armv7l"
    # - or something else?!
    if (NOT ((CMAKE_SYSTEM_PROCESSOR STREQUAL "i686") OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")))
      message(WARNING "using Intel MKL on '${CMAKE_SYSTEM_PROCESSOR}' might fail.")
    endif()
    message(STATUS "In case compiling/linking with Intel MKL fails, check CMakeLists.txt or deactivate PFFFT_USE_BENCH_MKL")
    target_compile_definitions(bench_pffft_float PRIVATE HAVE_MKL=1)
    target_link_libraries(bench_pffft_float mkl_intel_lp64 mkl_sequential -lmkl_core)
  endif()
endif()
|
||||
|
||||
# bench_pffft_double: double-precision FFT benchmark. Every optional
# third-party FFT is switched on through a HAVE_* compile definition plus the
# matching library on the link line.
if (PFFFT_USE_TYPE_DOUBLE)
  add_executable(bench_pffft_double bench_pffft.c pffft.h)
  target_compile_definitions(bench_pffft_double
    PRIVATE
      _USE_MATH_DEFINES
      PFFFT_ENABLE_DOUBLE
  )
  if (PFFFT_USE_DEBUG_ASAN)
    target_compile_options(bench_pffft_double PRIVATE "-fsanitize=address")
  endif()
  target_link_libraries(bench_pffft_double PFFFT ${ASANLIB})

  # FFTPACK (double build)
  if (PFFFT_USE_FFTPACK)
    target_compile_definitions(bench_pffft_double PRIVATE HAVE_FFTPACK=1)
    target_link_libraries(bench_pffft_double FFTPACK_DOUBLE)
  endif()

  # FFTW, double precision library
  if (PFFFT_USE_BENCH_FFTW)
    target_compile_definitions(bench_pffft_double PRIVATE HAVE_FFTW=1)
    target_link_libraries(bench_pffft_double fftw3)
  endif()

  # PocketFFT - needs both its path and the option
  if (PATH_POCKET AND PFFFT_USE_BENCH_POCKET)
    target_compile_definitions(bench_pffft_double PRIVATE HAVE_POCKET_FFT=1)
    target_link_libraries(bench_pffft_double PocketFFT)
  endif()

  # Intel MKL
  if (PFFFT_USE_BENCH_MKL)
    # "i686" and "x86_64" have chances to work; warn for everything else.
    # Other PROCESSORs could be "ppc", "ppc64", "arm", "aarch64", "armv7l"
    # - or something else?!
    if (NOT ((CMAKE_SYSTEM_PROCESSOR STREQUAL "i686") OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")))
      message(WARNING "using Intel MKL on '${CMAKE_SYSTEM_PROCESSOR}' might fail.")
    endif()
    message(STATUS "In case compiling/linking with Intel MKL fails, check CMakeLists.txt or deactivate PFFFT_USE_BENCH_MKL")
    target_compile_definitions(bench_pffft_double PRIVATE HAVE_MKL=1)
    target_link_libraries(bench_pffft_double mkl_intel_lp64 mkl_sequential -lmkl_core)
  endif()
endif()
|
||||
|
||||
######################################################
|
||||
|
||||
if (PFFFT_USE_TYPE_FLOAT)
|
||||
|
||||
# bench_pf_mixer_float: benchmark built from bench_mixers.cpp and linked
# against PFDSP; PAPI is used when it was found.
add_executable(bench_pf_mixer_float bench_mixers.cpp papi_perf_counter.h)
target_compile_definitions(bench_pf_mixer_float
  PRIVATE
    _USE_MATH_DEFINES
    PFFFT_ENABLE_FLOAT
)
target_link_libraries(bench_pf_mixer_float ${ASANLIB})
if (PFFFT_USE_DEBUG_ASAN)
  target_compile_options(bench_pf_mixer_float PRIVATE "-fsanitize=address")
endif()
if (PAPI_FOUND)
  target_compile_definitions(bench_pf_mixer_float PRIVATE HAVE_PAPI=1)
  target_link_libraries(bench_pf_mixer_float ${PAPI_LIBRARIES})
endif()
# stdc++ is added to the link line only when the C++ compiler is GNU
target_link_libraries(bench_pf_mixer_float
  PFDSP
  $<$<CXX_COMPILER_ID:GNU>:stdc++>
)
|
||||
|
||||
|
||||
############################################################################
|
||||
|
||||
# pf_conv.cpp is compiled several times, each time with a different
# CONV_ARCH_POST value. pf_conv_dispatcher links all variants together.

# variant "none": built with MIPP_NO_INTRINSICS=1
add_library(pf_conv_arch_none pf_conv.cpp pf_conv.h pf_cplx.h)
target_compile_definitions(pf_conv_arch_none
  PRIVATE
    CONV_ARCH_POST=none
    MIPP_NO_INTRINSICS=1
)
set_target_properties(pf_conv_arch_none
  PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED ON
)
target_activate_cxx_compiler_warnings(pf_conv_arch_none)

# the dispatcher itself
add_library(pf_conv_dispatcher
  pf_conv_dispatcher.cpp
  pf_conv_dispatcher.h
  pf_conv.h
  pf_cplx.h
)
set_target_properties(pf_conv_dispatcher
  PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED ON
)
target_activate_cxx_compiler_warnings(pf_conv_dispatcher)

# variant "dflt": built with the project's C++ architecture flags
add_library(pf_conv_arch_dflt pf_conv.cpp pf_conv.h pf_cplx.h)
target_compile_definitions(pf_conv_arch_dflt PRIVATE CONV_ARCH_POST=dflt)
set_target_properties(pf_conv_arch_dflt
  PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED ON
)
target_activate_cxx_compiler_warnings(pf_conv_arch_dflt)
target_set_cxx_arch_flags(pf_conv_arch_dflt)

target_link_libraries(pf_conv_dispatcher
  pf_conv_arch_none
  pf_conv_arch_dflt
)
|
||||
|
||||
# Select the architecture-specific pf_conv variants for the target processor
# and compiler:
#   PF_CONV_ARCHES            list of variant names
#   PF_CONV_OPT_<variant>     architecture option of that variant
#   PF_CONV_EXTRA_<variant>   additional option of that variant (ARM32 only)
# The PF_CONV_OPT_*/PF_CONV_EXTRA_* variables emulate a map keyed by variant.
# pf_conv_dispatcher receives a CONV_ARCH_* definition naming the chosen set.
if ((CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64"))

  if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
    set(PF_CONV_ARCHES "sse3;sse4;avx;avx2")
    set(PF_CONV_OPT_sse3 "core2")
    set(PF_CONV_OPT_sse4 "nehalem")
    set(PF_CONV_OPT_avx  "sandybridge")
    set(PF_CONV_OPT_avx2 "haswell")
    target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_GCC_AMD64)
  elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
    set(PF_CONV_ARCHES "sse2;avx;avx2")
    set(PF_CONV_OPT_sse2 "SSE2")
    set(PF_CONV_OPT_avx  "AVX")
    set(PF_CONV_OPT_avx2 "AVX2")
    target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_MSVC_AMD64)
  else()
    set(PF_CONV_ARCHES "")
    message(WARNING "unknown compiler ${CMAKE_CXX_COMPILER_ID} on CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
  endif()

elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")

  if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
    set(PF_CONV_ARCHES "armv8a")
    set(PF_CONV_OPT_armv8a "armv8-a")

    target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_GCC_AARCH64)
  else()
    set(PF_CONV_ARCHES "")
    message(WARNING "unknown compiler ${CMAKE_CXX_COMPILER_ID} on CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
  endif()

elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "armv7l")

  if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
    set(PF_CONV_ARCHES "neon_vfpv4;neon_rpi3_a53;neon_rpi4_a72")
    # all three variants share the same arch option and differ in the EXTRA one
    set(PF_CONV_OPT_neon_vfpv4      "armv7-a")
    set(PF_CONV_EXTRA_neon_vfpv4    "neon_vfpv4")
    set(PF_CONV_OPT_neon_rpi3_a53   "armv7-a")
    set(PF_CONV_EXTRA_neon_rpi3_a53 "neon_rpi3_a53")
    set(PF_CONV_OPT_neon_rpi4_a72   "armv7-a")
    set(PF_CONV_EXTRA_neon_rpi4_a72 "neon_rpi4_a72")

    target_compile_definitions(pf_conv_dispatcher PRIVATE CONV_ARCH_GCC_ARM32NEON)
  else()
    set(PF_CONV_ARCHES "")
    message(WARNING "unknown compiler ${CMAKE_CXX_COMPILER_ID} on CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
  endif()

else()
  message(WARNING "this is unforseen CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}: can't do architecture specific compilation")
endif()
|
||||
|
||||
# Create one pf_conv_arch_<variant> library per entry of PF_CONV_ARCHES and
# link it into pf_conv_dispatcher. CONV_ARCH_POST carries the variant name
# into the sources.
foreach (arch_opt ${PF_CONV_ARCHES})
  add_library(pf_conv_arch_${arch_opt} pf_conv.cpp pf_conv.h pf_cplx.h)
  set_target_properties(pf_conv_arch_${arch_opt}
    PROPERTIES
      CXX_STANDARD 11
      CXX_STANDARD_REQUIRED ON
  )
  target_activate_cxx_compiler_warnings(pf_conv_arch_${arch_opt})
  target_compile_definitions(pf_conv_arch_${arch_opt} PRIVATE CONV_ARCH_POST=${arch_opt})

  # arguments: target, OPT value, EXTRA value, OPT value again
  target_set_cxx_arch_option(pf_conv_arch_${arch_opt}
    "${PF_CONV_OPT_${arch_opt}}"
    "${PF_CONV_EXTRA_${arch_opt}}"
    "${PF_CONV_OPT_${arch_opt}}"
  )
  target_link_libraries(pf_conv_dispatcher pf_conv_arch_${arch_opt})
  message(STATUS "added library pf_conv_arch_${arch_opt} with CONV_ARCH_POST=${arch_opt}")
endforeach()
|
||||
|
||||
# AddressSanitizer build: compile every pf_conv library with
# -fsanitize=address and link it against ${ASANLIB}.
if (PFFFT_USE_DEBUG_ASAN)
  # architecture-specific variants listed in PF_CONV_ARCHES
  foreach (arch_opt ${PF_CONV_ARCHES})
    target_compile_options(pf_conv_arch_${arch_opt} PRIVATE "-fsanitize=address")
    target_link_libraries(pf_conv_arch_${arch_opt} ${ASANLIB})
  endforeach()

  # generic variants. pf_conv_arch_dflt is linked into pf_conv_dispatcher
  # like the others, so it is instrumented too (it used to be left out).
  foreach (generic_lib pf_conv_arch_none pf_conv_arch_dflt)
    target_compile_options(${generic_lib} PRIVATE "-fsanitize=address")
    target_link_libraries(${generic_lib} ${ASANLIB})
  endforeach()

  target_compile_options(pf_conv_dispatcher PRIVATE "-fsanitize=address")
  target_link_libraries(pf_conv_dispatcher ${ASANLIB})
endif()
|
||||
|
||||
# When MIPP was found, link the architecture-specific variants and the
# "none" variant against it.
if (MIPP_FOUND)
  foreach (arch_opt ${PF_CONV_ARCHES})
    message(STATUS "link pf_conv_arch_${arch_opt} against MIPP")
    target_link_libraries(pf_conv_arch_${arch_opt} MIPP)
  endforeach()

  message(STATUS "link pf_conv_arch_none against MIPP")
  target_link_libraries(pf_conv_arch_none MIPP)
endif()
|
||||
|
||||
############################################################################
|
||||
|
||||
# bench_pf_conv_float: benchmark built from bench_conv.cpp (C++11 required),
# linked against pf_conv_dispatcher and PFDSP; PAPI and MIPP are used when
# they were found.
add_executable(bench_pf_conv_float bench_conv.cpp papi_perf_counter.h)
set_target_properties(bench_pf_conv_float
  PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED ON
)
target_compile_definitions(bench_pf_conv_float
  PRIVATE
    _USE_MATH_DEFINES
    PFFFT_ENABLE_FLOAT
)
if (PFFFT_USE_DEBUG_ASAN)
  target_compile_options(bench_pf_conv_float PRIVATE "-fsanitize=address")
endif()
target_link_libraries(bench_pf_conv_float ${ASANLIB})
if (PAPI_FOUND)
  target_compile_definitions(bench_pf_conv_float PRIVATE HAVE_PAPI=1)
  target_link_libraries(bench_pf_conv_float ${PAPI_LIBRARIES})
endif()
if (MIPP_FOUND)
  target_link_libraries(bench_pf_conv_float MIPP)
endif()
# stdc++ is added to the link line only when the C++ compiler is GNU
target_link_libraries(bench_pf_conv_float
  pf_conv_dispatcher
  PFDSP
  $<$<CXX_COMPILER_ID:GNU>:stdc++>
)
|
||||
|
||||
endif()
|
||||
|
||||
######################################################
|
||||
|
||||
add_subdirectory(examples)
|
||||
|
||||
######################################################
|
||||
|
||||
# Register the tests with CTest. Each test runs from the binary directory.
enable_testing()

add_test(
  NAME test_fft_factors
  COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_fft_factors"
  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
|
||||
|
||||
# FFTPACK tests (float and double) - registered only when FFTPACK is used.
if (PFFFT_USE_FFTPACK)
  add_test(
    NAME test_fftpack_float
    COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_fftpack_float"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )

  add_test(
    NAME test_fftpack_double
    COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_fftpack_double"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )
endif()
|
||||
|
||||
|
||||
# Quick benchmark and fast-convolution tests. They run bench_pffft_float and
# test_pffastconv, which exist only in float builds.
if (PFFFT_USE_TYPE_FLOAT)

  # --- bench_pffft_float: power-of-2 and non-power-of-2 lengths ---
  add_test(
    NAME bench_pffft_pow2
    COMMAND "${CMAKE_CURRENT_BINARY_DIR}/bench_pffft_float" "--max-len" "128" "--quick"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )

  add_test(
    NAME bench_pffft_non2
    COMMAND "${CMAKE_CURRENT_BINARY_DIR}/bench_pffft_float" "--non-pow2" "--max-len" "192" "--quick"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )

  # disabled:
  # add_test(NAME bench_plots
  #   COMMAND bash "-c" "${CMAKE_CURRENT_SOURCE_DIR}/plots.sh"
  #   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  # )

  # --- test_pffastconv with --no-bench, with and without --sym ---
  add_test(
    NAME test_pfconv_lens_symetric
    COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" "--no-bench" "--quick" "--sym"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )

  add_test(
    NAME test_pfconv_lens_non_sym
    COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" "--no-bench" "--quick"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )

  # --- test_pffastconv with --no-len, with and without --sym ---
  add_test(
    NAME bench_pfconv_symetric
    COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" "--no-len" "--quick" "--sym"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )

  add_test(
    NAME bench_pfconv_non_sym
    COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" "--no-len" "--quick"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )

endif()
|
||||
|
||||
38
pffft/LICENSE.txt
Normal file
38
pffft/LICENSE.txt
Normal file
@@ -0,0 +1,38 @@
|
||||
|
||||
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
|
||||
Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de )
|
||||
Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Copyright (c) 2004 the University Corporation for Atmospheric
|
||||
Research ("UCAR"). All rights reserved. Developed by NCAR's
|
||||
Computational and Information Systems Laboratory, UCAR,
|
||||
www.cisl.ucar.edu.
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
|
||||
352
pffft/README.md
Normal file
352
pffft/README.md
Normal file
@@ -0,0 +1,352 @@
|
||||
|
||||
---
|
||||
|
||||
# PFFFT: a pretty fast FFT and fast convolution with PFFASTCONV
|
||||
|
||||
---
|
||||
|
||||
<!-- toc -->
|
||||
|
||||
- [Brief Description](#brief-description)
|
||||
- [Why does it exist?](#why-does-it-exist)
|
||||
- [CMake](#cmake)
|
||||
- [History / Origin / Changes](#history--origin--changes)
|
||||
- [Comparison with other FFTs](#comparison-with-other-ffts)
|
||||
- [Dependencies / Required Linux packages](#dependencies--required-linux-packages)
|
||||
- [Benchmarks and results](#benchmarks-and-results)
|
||||
|
||||
<!-- tocstop -->
|
||||
|
||||
---
|
||||
|
||||
## Brief description:
|
||||
|
||||
PFFFT does 1D Fast Fourier Transforms, of single precision real and
|
||||
complex vectors. It tries to do it fast, it tries to be correct, and it
|
||||
tries to be small. Computations do take advantage of SSE1 instructions
|
||||
on x86 cpus, Altivec on powerpc cpus, and NEON on ARM cpus. The
|
||||
license is BSD-like.
|
||||
|
||||
PFFFT is a fork of [Julien Pommier's library on bitbucket](https://bitbucket.org/jpommier/pffft/)
|
||||
with some changes and additions.
|
||||
|
||||
|
||||
PFFASTCONV does fast convolution (FIR filtering), of single precision
|
||||
real vectors, utilizing the PFFFT library. The license is BSD-like.
|
||||
|
||||
PFDSP contains a few other signal processing functions.
|
||||
Currently, mixing and carrier generation functions are contained.
|
||||
It is work in progress - also the API!
|
||||
The fast convolution from PFFASTCONV might get merged into PFDSP.
|
||||
|
||||
|
||||
## Why does it exist:
|
||||
|
||||
I (Julien Pommier) was in search of a good performing FFT library ,
|
||||
preferably very small and with a very liberal license.
|
||||
|
||||
When one says "fft library", FFTW ("Fastest Fourier Transform in the
|
||||
West") is probably the first name that comes to mind -- I guess that
|
||||
99% of open-source projects that need a FFT do use FFTW, and are happy
|
||||
with it. However, it is quite a large library , which does everything
|
||||
fft related (2d transforms, 3d transforms, other transformations such
|
||||
as discrete cosine , or fast hartley). And it is licensed under the
|
||||
GNU GPL , which means that it cannot be used in non open-source
|
||||
products.
|
||||
|
||||
An alternative to FFTW that is really small, is the venerable FFTPACK
|
||||
v4, which is available on NETLIB. A more recent version (v5) exists,
|
||||
but it is larger as it deals with multi-dimensional transforms. This
|
||||
is a library that is written in FORTRAN 77, a language that is now
|
||||
considered as a bit antiquated by many. FFTPACKv4 was written in 1985,
|
||||
by Dr Paul Swarztrauber of NCAR, more than 25 years ago ! And despite
|
||||
its age, benchmarks show that it is still a very good performing FFT
|
||||
library, see for example the 1d single precision benchmarks
|
||||
[here](http://www.fftw.org/speed/opteron-2.2GHz-32bit/). It is however not
|
||||
competitive with the fastest ones, such as FFTW, Intel MKL, AMD ACML,
|
||||
Apple vDSP. The reason for that is that those libraries do take
|
||||
advantage of the SSE SIMD instructions available on Intel CPUs,
|
||||
available since the days of the Pentium III. These instructions deal
|
||||
with small vectors of 4 floats at a time, instead of a single float
|
||||
for a traditional FPU, so when using these instructions one may expect
|
||||
a 4-fold performance improvement.
|
||||
|
||||
The idea was to take this fortran fftpack v4 code, translate to C,
|
||||
modify it to deal with those SSE instructions, and check that the
|
||||
final performance is not completely ridiculous when compared to other
|
||||
SIMD FFT libraries. Translation to C was performed with [f2c](
|
||||
http://www.netlib.org/f2c/). The resulting file was a bit edited in
|
||||
order to remove the thousands of gotos that were introduced by
|
||||
f2c. You will find the fftpack.h and fftpack.c sources in the
|
||||
repository, this is a complete translation of [fftpack](
|
||||
http://www.netlib.org/fftpack/), with the discrete cosine transform
|
||||
and the test program. There is no license information in the netlib
|
||||
repository, but it was confirmed to me by the fftpack v5 curators that
|
||||
the [same terms do apply to fftpack v4](http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html).
|
||||
This is a
|
||||
"BSD-like" license, it is compatible with proprietary projects.
|
||||
|
||||
Adapting fftpack to deal with the SIMD 4-element vectors instead of
|
||||
scalar single precision numbers was more complex than I originally
|
||||
thought, especially with the real transforms, and I ended up writing
|
||||
more code than I planned..
|
||||
|
||||
|
||||
## The code:
|
||||
|
||||
### Good old C:
|
||||
The FFT API is very very simple, just make sure that you read the comments in `pffft.h`.
|
||||
|
||||
The Fast convolution's API is also very simple, just make sure that you read the comments
|
||||
in `pffastconv.h`.
|
||||
|
||||
### C++:
|
||||
A simple C++ wrapper is available in `pffft.hpp`.
|
||||
|
||||
### Git:
|
||||
This archive's source can be downloaded with git (without the submodules):
|
||||
```
|
||||
git clone https://github.com/marton78/pffft.git
|
||||
```
|
||||
|
||||
### Only two files?:
|
||||
_"Only two files, in good old C, pffft.c and pffft.h"_
|
||||
|
||||
This statement **NO LONGER** holds!
|
||||
|
||||
With new functionality and support for AVX, there was need to restructure the sources.
|
||||
But you can compile and link **pffft** as a static library.
|
||||
|
||||
|
||||
## CMake:
|
||||
There's now CMake support to build the static libraries `libPFFFT.a`
|
||||
and `libPFFASTCONV.a` from the source files, plus the additional
|
||||
`libFFTPACK.a` library. The latter's sources are included anyway for the benchmark.
|
||||
|
||||
There are several CMake options to modify library size and optimization.
|
||||
You can explore all available options with `cmake-gui` or `ccmake`,
|
||||
the console version - after having installed (on Debian/Ubuntu Linux) one of
|
||||
```
|
||||
sudo apt-get install cmake-qt-gui
|
||||
sudo apt-get install cmake-curses-gui
|
||||
```
|
||||
|
||||
Some of the options:
|
||||
* `PFFFT_USE_TYPE_FLOAT` to activate single precision 'float' (default: ON)
|
||||
* `PFFFT_USE_TYPE_DOUBLE` to activate 'double' precision float (default: ON)
|
||||
* `PFFFT_USE_SIMD` to use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? (default: ON)
|
||||
* `DISABLE_SIMD_AVX` to disable AVX CPU features (default: OFF)
|
||||
* `PFFFT_USE_SIMD_NEON` to force using NEON on ARM (requires PFFFT_USE_SIMD) (default: OFF)
|
||||
* `PFFFT_USE_SCALAR_VECT` to use 4-element vector scalar operations (if no other SIMD) (default: ON)
|
||||
|
||||
Options can be passed to `cmake` at command line, e.g.
|
||||
```
|
||||
cmake -DPFFFT_USE_TYPE_FLOAT=OFF -DPFFFT_USE_TYPE_DOUBLE=ON
|
||||
```
|
||||
|
||||
My Linux distribution defaults to GCC. With installed CLANG and the bash shell, you can use it with
|
||||
```
|
||||
mkdir build
|
||||
cd build
|
||||
CC=/usr/bin/clang CXX=/usr/bin/clang++ cmake -DCMAKE_BUILD_TYPE=Debug ../
|
||||
cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=~ ../
|
||||
ccmake . # or: cmake-gui .
|
||||
cmake --build . # or simply: make
|
||||
ctest # to execute some tests - including benchmarks
|
||||
cmake --build . --target install # or simply: [sudo] make install
|
||||
```
|
||||
|
||||
With MSVC on Windows, you need some different options. The following ones build a 64-bit Release with Visual Studio 2019:
|
||||
```
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -G "Visual Studio 16 2019" -A x64 ..
|
||||
cmake --build . --config Release
|
||||
ctest -C Release
|
||||
```
|
||||
|
||||
see [https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators](https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators)
|
||||
|
||||
|
||||
## History / Origin / Changes:
|
||||
Origin for this code/fork is Julien Pommier's pffft on bitbucket:
|
||||
[https://bitbucket.org/jpommier/pffft/](https://bitbucket.org/jpommier/pffft/)
|
||||
|
||||
Git history shows following first commits of the major contributors:
|
||||
* Julien Pommier: November 19, 2011
|
||||
* Marton Danoczy: September 30, 2015
|
||||
* Hayati Ayguen: December 22, 2019
|
||||
* Dario Mambro: March 24, 2020
|
||||
|
||||
There are a few other contributors not listed here.
|
||||
|
||||
The main changes include:
|
||||
* improved benchmarking, see [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks)
|
||||
* double support
|
||||
* avx(2) support
|
||||
* c++ headers (wrapper)
|
||||
* additional API helper functions
|
||||
* additional library for fast convolution
|
||||
* cmake support
|
||||
* ctest
|
||||
|
||||
|
||||
## Comparison with other FFTs:
|
||||
The idea was not to break speed records, but to get a decently fast
|
||||
fft that is at least 50% as fast as the fastest FFT -- especially on
|
||||
slowest computers . I'm more focused on getting the best performance
|
||||
on slow cpus (Atom, Intel Core 1, old Athlons, ARM Cortex-A9...), than
|
||||
on getting top performance on today's fastest cpus.
|
||||
|
||||
It can be used in a real-time context as the fft functions do not
|
||||
perform any memory allocation -- that is why they accept a 'work'
|
||||
array in their arguments.
|
||||
|
||||
It is also a bit focused on performing 1D convolutions, that is why it
|
||||
provides "unordered" FFTs , and a fourier domain convolution
|
||||
operation.
|
||||
|
||||
Very interesting is [https://www.nayuki.io/page/free-small-fft-in-multiple-languages](https://www.nayuki.io/page/free-small-fft-in-multiple-languages).
|
||||
It shows how small an FFT can be - including the Bluestein algorithm, but it's anything but fast.
|
||||
The whole C++ implementation file is 161 lines, including the Copyright header, see
|
||||
[https://github.com/nayuki/Nayuki-web-published-code/blob/master/free-small-fft-in-multiple-languages/FftComplex.cpp](https://github.com/nayuki/Nayuki-web-published-code/blob/master/free-small-fft-in-multiple-languages/FftComplex.cpp)
|
||||
|
||||
## Dependencies / Required Linux packages
|
||||
|
||||
On Debian/Ubuntu Linux, the following packages should be installed:
|
||||
|
||||
```
|
||||
sudo apt-get install build-essential gcc g++ cmake
|
||||
```
|
||||
|
||||
|
||||
## Benchmarks and results
|
||||
|
||||
#### Quicklink
|
||||
Find results at [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks).
|
||||
|
||||
#### General
|
||||
My (Hayati Ayguen) first look at FFT-benchmarks was with [benchFFT](http://www.fftw.org/benchfft/)
|
||||
and especially the results of the benchmarks [results](http://www.fftw.org/speed/),
|
||||
which demonstrate the performance of the [FFTW](http://www.fftw.org/).
|
||||
Looking at the benchmarked computer systems from today's point of view (2021), these are quite outdated.
|
||||
|
||||
Having a look into the [benchFFT source code](http://www.fftw.org/benchfft/benchfft-3.1.tar.gz),
|
||||
the latest source changes, including competitive fft implementations, are dated November 2003.
|
||||
|
||||
In 2019, when pffft got my attention at [bitbucket](https://bitbucket.org/jpommier/pffft/src/master/),
|
||||
there were also some benchmark results.
|
||||
Unfortunately the results are tables with numbers - without graphical plots.
|
||||
Without the plots, I could not get an impression. That is why I started
|
||||
[https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks),
|
||||
which includes GnuPlot figures.
|
||||
|
||||
Today in June 2021, I became aware of [https://github.com/FFTW/benchfft](https://github.com/FFTW/benchfft).
|
||||
This repository is much more up-to-date with a commit in December 2020.
|
||||
Unfortunately, it does not look so simple to get it running - including the generation of plots.
|
||||
|
||||
Is there any website showing benchFFT results of more recent computer systems?
|
||||
|
||||
Of course, it's very important, that a benchmark can be compared with a bunch
|
||||
of different FFT algorithms/implementations.
|
||||
This requires having them compiled/built and usable.
|
||||
|
||||
|
||||
#### Git submodules for Green-, Kiss- and Pocket-FFT
|
||||
Sources for [Green-](https://github.com/hayguen/greenffts),
|
||||
[Kiss-](https://github.com/hayguen/kissfft)
|
||||
and [Pocket-FFT](https://github.com/hayguen/pocketfft)
|
||||
can be downloaded directly with the sources of this repository - using git submodules:
|
||||
```
|
||||
git clone --recursive https://github.com/marton78/pffft.git
|
||||
```
|
||||
|
||||
Important is `--recursive`, that does also fetch the submodules directly.
|
||||
But you might retrieve the submodules later, too:
|
||||
```
|
||||
git submodule update --init
|
||||
```
|
||||
|
||||
#### Fastest Fourier Transform in the West: FFTW
|
||||
To allow comparison with FFTW [http://www.fftw.org/](http://www.fftw.org/),
|
||||
cmake option `-DPFFFT_USE_BENCH_FFTW=ON` has to be used with following commands.
|
||||
The cmake option requires previous setup of following (debian/ubuntu) package:
|
||||
```
|
||||
sudo apt-get install libfftw3-dev
|
||||
```
|
||||
|
||||
#### Intel Math Kernel Library: MKL
|
||||
Intel's MKL [https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl.html](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl.html)
|
||||
currently looks even faster than FFTW.
|
||||
|
||||
On Ubuntu-Linux it's easy to setup with the package `intel-mkl`.
|
||||
Similar on Debian: `intel-mkl-full`.
|
||||
|
||||
There are special repositories for following Linux distributions:
|
||||
* Debian/apt: [https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html](https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html)
|
||||
* RedHat/yum: [https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-yum-repo.html](https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-yum-repo.html)
|
||||
* Gentoo/ebuild: [https://packages.gentoo.org/packages/sci-libs/mkl](https://packages.gentoo.org/packages/sci-libs/mkl)
|
||||
|
||||
#### Performing the benchmarks - with CMake
|
||||
Benchmarks should be prepared by creating a special build folder
|
||||
```
|
||||
mkdir build_benches
|
||||
cd build_benches
|
||||
cmake ../bench
|
||||
```
|
||||
|
||||
There are several CMake options to parametrize, which fft implementations should be benched.
|
||||
You can explore all available options with `cmake-gui` or `ccmake`, see [CMake](#cmake).
|
||||
|
||||
Some of the options:
|
||||
* `BENCH_ID` name the benchmark - used in filename
|
||||
* `BENCH_ARCH` target architecture passed to compiler for code optimization
|
||||
* `PFFFT_USE_BENCH_FFTW` use (system-installed) FFTW3 in fft benchmark? (default: OFF)
|
||||
* `PFFFT_USE_BENCH_GREEN` use Green FFT in fft benchmark? (default: ON)
|
||||
* `PFFFT_USE_BENCH_KISS` use KissFFT in fft benchmark? (default: ON)
|
||||
* `PFFFT_USE_BENCH_POCKET` use PocketFFT in fft benchmark? (default: ON)
|
||||
* `PFFFT_USE_BENCH_MKL` use Intel MKL in fft benchmark? (default: OFF)
|
||||
|
||||
These options can be passed to `cmake` at command line, e.g.
|
||||
```
|
||||
cmake -DBENCH_ARCH=native -DPFFFT_USE_BENCH_FFTW=ON -DPFFFT_USE_BENCH_MKL=ON ../bench
|
||||
```
|
||||
|
||||
The benchmarks are built and executed with
|
||||
```
|
||||
cmake --build .
|
||||
```
|
||||
|
||||
You can also specify to use a different compiler/version with the cmake step, e.g.:
|
||||
|
||||
```
|
||||
CC=/usr/bin/gcc-9 CXX=/usr/bin/g++-9 cmake -DBENCH_ID=gcc9 -DBENCH_ARCH=native -DPFFFT_USE_BENCH_FFTW=ON -DPFFFT_USE_BENCH_MKL=ON ../bench
|
||||
```
|
||||
|
||||
```
|
||||
CC=/usr/bin/clang-11 CXX=/usr/bin/clang++-11 cmake -DBENCH_ID=clang11 -DBENCH_ARCH=native -DPFFFT_USE_BENCH_FFTW=ON -DPFFFT_USE_BENCH_MKL=ON ../bench
|
||||
```
|
||||
|
||||
For using MSVC/Windows, the cmake command requires/needs the generator and architecture options and to be called from the VS Developer prompt:
|
||||
```
|
||||
cmake -G "Visual Studio 16 2019" -A x64 ../bench/
|
||||
```
|
||||
|
||||
see [https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators](https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html#visual-studio-generators)
|
||||
|
||||
|
||||
|
||||
For running with different compiler version(s):
|
||||
* copy the result file (.tgz), e.g. `cp *.tgz ../`
|
||||
* delete the build directory: `rm -rf *`
|
||||
* then continue with the cmake step
|
||||
|
||||
|
||||
#### Benchmark results and contribution
|
||||
You might contribute by providing us the results of your computer(s).
|
||||
|
||||
The benchmark results are stored in a separate git-repository:
|
||||
See [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks).
|
||||
|
||||
This is to keep this repository's sources small.
|
||||
|
||||
224
pffft/bench/CMakeLists.txt
Normal file
224
pffft/bench/CMakeLists.txt
Normal file
@@ -0,0 +1,224 @@
|
||||
# Policy version >= 3.1 is required for CMP0054: with older policy settings the
# quoted "MSVC" in the compiler-ID comparison below is dereferenced as the
# variable MSVC (value 1 when building with Visual C++), so that branch could
# never be selected. 3.5 is also the oldest policy version CMake 4.x accepts.
cmake_minimum_required(VERSION 3.5)
project(BENCH_PFFFT)

set(BENCH_ID "default" CACHE STRING "ID: use single word without spaces. gets part of result filename")

option(BENCH_FAST_MATH "Build with fast math - non IEEE compliant" ON)

# The default of BENCH_ARCH depends on the option syntax of the C compiler.
if (CMAKE_C_COMPILER_ID STREQUAL "GNU")
  set(BENCH_ARCH "native" CACHE STRING "target architecture (-march): native/SSE:core2/AVX:sandybridge/ARM-NEON:armv7-a")
elseif (CMAKE_C_COMPILER_ID MATCHES "^(Apple)?Clang$")
  # With policy CMP0025 (NEW for policy versions >= 3.0) Apple's compiler
  # reports "AppleClang"; match both IDs so macOS keeps the "native" default.
  set(BENCH_ARCH "native" CACHE STRING "target architecture (-march): native/SSE:core2/AVX:sandybridge")
elseif (CMAKE_C_COMPILER_ID STREQUAL "MSVC")  # others: "Intel"
  set(BENCH_ARCH "AVX" CACHE STRING "target architecture (/arch): SSE2/AVX")
else()
  set(BENCH_ARCH "" CACHE STRING "target architecture - use full compiler option!")
endif()

# architecture/optimization options
option(PFFFT_USE_SIMD "use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? - " ON)
option(DISABLE_SIMD_AVX "disable AVX CPU features? - " OFF)
option(PFFFT_USE_SIMD_NEON "force using NEON on ARM? (requires PFFFT_USE_SIMD)" OFF)
option(PFFFT_USE_SCALAR_VECT "use 4-element vector scalar operations (if no other SIMD)" ON)

# FFT libraries to compare against in the benchmark
option(PFFFT_USE_BENCH_FFTW "use (system-installed) FFTW3 in fft benchmark?" OFF)
option(PFFFT_USE_BENCH_GREEN "use Green FFT in fft benchmark? - if exists in subdir" ON)
option(PFFFT_USE_BENCH_KISS "use KissFFT in fft benchmark? - if exists in subdir" ON)
option(PFFFT_USE_BENCH_POCKET "use PocketFFT in fft benchmark? - if exists in subdir" ON)
option(PFFFT_USE_BENCH_MKL "use Intel MKL in fft benchmark? needs to be installed" OFF)

# Coarse OS name written into info.txt.
# When both WIN32 and UNIX are set, "Unix" wins because it is assigned last.
set(OSSTR "")
if (WIN32)
  set(OSSTR "Win32")
endif()
if (UNIX)
  set(OSSTR "Unix")
endif()

# Extra pieces needed to re-invoke cmake for the nested float/double builds.
# They stay empty unless building with MSVC.
set(BUILD_DIR_TO_EXE "")
set(CMAKE_PLATFORM_OPT "")
set(CMAKE_MAKE_OPT "")
if (MSVC)
  # the executable is looked up in the per-configuration sub directory
  set(BUILD_DIR_TO_EXE "Release/")
  set(CMAKE_PLATFORM_OPT "-A \"${CMAKE_GENERATOR_PLATFORM}\"")
  set(CMAKE_MAKE_OPT "-DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}")
endif()
|
||||
|
||||
|
||||
# Result directories (these get packed into the .tgz) and the separate build
# trees for the float and the double configuration.
set(benchdir     "${CMAKE_BINARY_DIR}/bench_${BENCH_ID}")
set(benchdir_flt "${benchdir}/float")
set(benchdir_dbl "${benchdir}/double")
set(builddir_flt "${CMAKE_BINARY_DIR}/build_${BENCH_ID}_float")
set(builddir_dbl "${CMAKE_BINARY_DIR}/build_${BENCH_ID}_double")

# info.txt: records the toolchain and configuration this benchmark was built
# with. The first echo creates/truncates the file, all others append.
set(bench_info_file "${benchdir}/info.txt")
add_custom_command(OUTPUT "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E make_directory "${benchdir}"
  COMMAND ${CMAKE_COMMAND} -E echo "benchmark ${BENCH_ID}" > "${bench_info_file}"
  # host / toolchain description
  COMMAND ${CMAKE_COMMAND} -E echo "CMake major: ${CMAKE_MAJOR_VERSION}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "CMake minor: ${CMAKE_MINOR_VERSION}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "OS: ${OSSTR}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "System: ${CMAKE_SYSTEM_NAME}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "System CPU: ${CMAKE_SYSTEM_PROCESSOR}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "System Version: ${CMAKE_HOST_SYSTEM_VERSION}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "C Compiler: ${CMAKE_C_COMPILER_ID}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "C Version: ${CMAKE_C_COMPILER_VERSION}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "C++ Compiler: ${CMAKE_CXX_COMPILER_ID}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "C++ Version: ${CMAKE_CXX_COMPILER_VERSION}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "MSVC Version: ${MSVC_VERSION}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "MSVC Toolset: ${MSVC_TOOLSET_VERSION}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "Exe Suffix: ${CMAKE_EXECUTABLE_SUFFIX}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "C Byte Order: ${CMAKE_C_BYTE_ORDER}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "C++ Byte Order: ${CMAKE_CXX_BYTE_ORDER}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "" >> "${bench_info_file}"
  # benchmark configuration
  COMMAND ${CMAKE_COMMAND} -E echo "Architecture: ${BENCH_ARCH}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "Fast math: ${BENCH_FAST_MATH}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_SIMD=${PFFFT_USE_SIMD}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "config DISABLE_SIMD_AVX=${DISABLE_SIMD_AVX}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_SIMD_NEON=${PFFFT_USE_SIMD_NEON}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_SCALAR_VECT=${PFFFT_USE_SCALAR_VECT}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_FFTW=${PFFFT_USE_BENCH_FFTW}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_GREEN=${PFFFT_USE_BENCH_GREEN}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_KISS=${PFFFT_USE_BENCH_KISS}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_POCKET=${PFFFT_USE_BENCH_POCKET}" >> "${bench_info_file}"
  COMMAND ${CMAKE_COMMAND} -E echo "config PFFFT_USE_BENCH_MKL=${PFFFT_USE_BENCH_MKL}" >> "${bench_info_file}"
)

# unix_info.txt is only a marker file. On UNIX hosts, unix_info.sh additionally
# runs inside the result directory, where it writes its output files.
if (UNIX)
  add_custom_command(OUTPUT "${benchdir}/unix_info.txt"
    COMMAND ${CMAKE_COMMAND} -E touch "${benchdir}/unix_info.txt"
    COMMAND bash "-c" "${CMAKE_CURRENT_SOURCE_DIR}/unix_info.sh"
    DEPENDS "${benchdir}/info.txt"
    WORKING_DIRECTORY ${benchdir}
  )
else()
  add_custom_command(OUTPUT "${benchdir}/unix_info.txt"
    COMMAND ${CMAKE_COMMAND} -E touch "${benchdir}/unix_info.txt"
    DEPENDS "${benchdir}/info.txt"
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )
endif()

# Create each build/result directory, using directory.txt inside it as the
# marker file that other targets depend on.
foreach(bench_dir "${builddir_flt}" "${builddir_dbl}" "${benchdir_flt}" "${benchdir_dbl}")
  add_custom_command(OUTPUT "${bench_dir}/directory.txt"
    COMMAND ${CMAKE_COMMAND} -E make_directory "${bench_dir}"
    COMMAND ${CMAKE_COMMAND} -E touch "${bench_dir}/directory.txt"
  )
endforeach()
|
||||
|
||||
|
||||
|
||||
# Options handed through unchanged to both nested configurations. They are
# expanded in the same position they previously occupied on the command line.
set(bench_forwarded_options
  -DPFFFT_USE_SIMD=${PFFFT_USE_SIMD}
  -DDISABLE_SIMD_AVX=${DISABLE_SIMD_AVX}
  -DPFFFT_USE_SIMD_NEON=${PFFFT_USE_SIMD_NEON}
  -DPFFFT_USE_SCALAR_VECT=${PFFFT_USE_SCALAR_VECT}
  -DPFFFT_USE_BENCH_FFTW=${PFFFT_USE_BENCH_FFTW}
  -DPFFFT_USE_BENCH_GREEN=${PFFFT_USE_BENCH_GREEN}
  -DPFFFT_USE_BENCH_KISS=${PFFFT_USE_BENCH_KISS}
  -DPFFFT_USE_BENCH_POCKET=${PFFFT_USE_BENCH_POCKET}
  -DPFFFT_USE_BENCH_MKL=${PFFFT_USE_BENCH_MKL}
)

# Configure and build the parent source tree for single precision in its own
# build directory.
add_custom_target(build_float
  COMMAND ${CMAKE_COMMAND} -E echo "start cmake for float in ${builddir_flt}"
  COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" ${CMAKE_PLATFORM_OPT}
          "${CMAKE_MAKE_OPT}"
          -DCMAKE_BUILD_TYPE=Release
          "-DARCH=${BENCH_ARCH}"
          -DUSE_FAST_MATH=${BENCH_FAST_MATH}
          -DPFFFT_USE_TYPE_FLOAT=ON
          -DPFFFT_USE_TYPE_DOUBLE=OFF
          -DUSE_FLOAT_PREC=ON
          ${bench_forwarded_options}
          "${CMAKE_SOURCE_DIR}/.."
  # the build-step announcement is intentionally left disabled for float:
  # COMMAND ${CMAKE_COMMAND} -E echo "start cmake --build . for float in ${builddir_flt}"
  COMMAND ${CMAKE_COMMAND} --build . --config Release
  DEPENDS "${builddir_flt}/directory.txt"
  WORKING_DIRECTORY "${builddir_flt}"
)

# Same as build_float, but for double precision.
add_custom_target(build_double
  COMMAND ${CMAKE_COMMAND} -E echo "start cmake for double in ${builddir_dbl}"
  COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" ${CMAKE_PLATFORM_OPT}
          "${CMAKE_MAKE_OPT}"
          -DCMAKE_BUILD_TYPE=Release
          "-DARCH=${BENCH_ARCH}"
          -DUSE_FAST_MATH=${BENCH_FAST_MATH}
          -DPFFFT_USE_TYPE_FLOAT=OFF
          -DPFFFT_USE_TYPE_DOUBLE=ON
          -DUSE_FLOAT_PREC=OFF
          ${bench_forwarded_options}
          "${CMAKE_SOURCE_DIR}/.."
  COMMAND ${CMAKE_COMMAND} -E echo "start cmake --build . for double in ${builddir_dbl}"
  COMMAND ${CMAKE_COMMAND} --build . --config Release
  DEPENDS "${builddir_dbl}/directory.txt"
  WORKING_DIRECTORY "${builddir_dbl}"
)
|
||||
|
||||
# Name of the archive the result directory is packed into.
set(bench_archive "bench_${BENCH_ID}.tgz")

# Run the single precision benchmark executable inside its result directory.
add_custom_target(bench_float
  COMMAND ${CMAKE_COMMAND} -E echo "start benchmark for float"
  COMMAND "${builddir_flt}/${BUILD_DIR_TO_EXE}bench_pffft_float${CMAKE_EXECUTABLE_SUFFIX}"
  DEPENDS "${benchdir_flt}/directory.txt" build_float
  WORKING_DIRECTORY "${benchdir_flt}"
)

# Run the double precision benchmark executable inside its result directory.
add_custom_target(bench_double
  COMMAND ${CMAKE_COMMAND} -E echo "start benchmark for double"
  COMMAND "${builddir_dbl}/${BUILD_DIR_TO_EXE}bench_pffft_double${CMAKE_EXECUTABLE_SUFFIX}"
  DEPENDS "${benchdir_dbl}/directory.txt" build_double
  WORKING_DIRECTORY "${benchdir_dbl}"
)

# Default target: run both benchmarks, then pack the whole result directory.
add_custom_target(bench ALL
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E tar cvz "${bench_archive}" ${benchdir}
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E echo "now mail result file ${bench_archive}"
  # DEPENDS "${benchdir}/info.txt" "${benchdir}/unix_info.txt"
  DEPENDS "${benchdir}/info.txt" bench_float bench_double "${benchdir}/unix_info.txt"
  WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
)

# Like 'bench', but runs the float benchmark only.
add_custom_target(bench_float_tar
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E tar cvz "${bench_archive}" ${benchdir}
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E echo "now mail result file ${bench_archive}"
  DEPENDS "${benchdir}/info.txt" bench_float "${benchdir}/unix_info.txt"
  WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
)

# Like 'bench', but runs the double benchmark only.
add_custom_target(bench_double_tar
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E tar cvz "${bench_archive}" ${benchdir}
  COMMAND ${CMAKE_COMMAND} -E echo ""
  COMMAND ${CMAKE_COMMAND} -E echo "now mail result file ${bench_archive}"
  DEPENDS "${benchdir}/info.txt" bench_double "${benchdir}/unix_info.txt"
  WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
)

# Removes the two nested build trees. The result directory and the archive
# are left untouched.
add_custom_target(clean_results
  COMMAND ${CMAKE_COMMAND} -E remove_directory "${builddir_flt}"
  COMMAND ${CMAKE_COMMAND} -E remove_directory "${builddir_dbl}"
  WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
)
|
||||
|
||||
9
pffft/bench/unix_info.sh
Executable file
9
pffft/bench/unix_info.sh
Executable file
@@ -0,0 +1,9 @@
|
||||
#!/bin/bash
# Collects CPU and distribution information of the host into files in the
# current working directory.

lscpu > unix_lscpu.txt
cat /proc/cpuinfo > unix_cpuinfo.txt
lsb_release -a > unix_lsb_release.txt

# Copy the distribution release files, if there are any. The glob is expanded
# directly into an array (nullglob: no match => empty array) instead of parsing
# the output of `ls`, which printed an error when nothing matched.
shopt -s nullglob
release_files=(/etc/*-release)
shopt -u nullglob
if [ "${#release_files[@]}" -gt 0 ]; then
  cp "${release_files[@]}" ./
fi
|
||||
345
pffft/bench_conv.cpp
Normal file
345
pffft/bench_conv.cpp
Normal file
@@ -0,0 +1,345 @@
|
||||
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <random>
|
||||
#include <cstdint>
|
||||
#include <complex>
|
||||
|
||||
#include "papi_perf_counter.h"
|
||||
|
||||
//#if defined(HAVE_MIPP) && !defined(NO_MIPP)
|
||||
#if defined(HAVE_MIPP)
|
||||
#include <mipp.h>
|
||||
|
||||
#define MIPP_VECTOR mipp::vector
|
||||
#else
|
||||
#define MIPP_VECTOR std::vector
|
||||
#endif
|
||||
|
||||
#include "pf_conv_dispatcher.h"
|
||||
#include "pf_conv.h"
|
||||
|
||||
|
||||
#define TEST_WITH_MIN_LEN 0
|
||||
|
||||
|
||||
/**
 * Creates a vector with M pseudo random values followed by zero padding.
 *
 * @param M           number of random values, written to the elements [0, M)
 * @param N           total vector length; elements [M, N) are set to zero.
 *                    A negative value (default) means "no padding": length M.
 * @param seed_value  seed for the std::mt19937 generator
 * @return vector of length max(M, N) with random values in [-1, 1)
 */
MIPP_VECTOR<float> generate_rng_vec(int M, int N = -1, int seed_value = 1)
{
    // Never allocate fewer than the M elements written below
    // (previously 0 <= N < M wrote past the end of the vector).
    MIPP_VECTOR<float> v(std::max(M, N));
    std::mt19937 g;
    g.seed(seed_value);
    // std::mt19937 delivers 32 random bits per call. Use exact-width types:
    // int_fast32_t may be 64 bit wide, which scaled all values by 2^-63
    // instead of 2^-31 and collapsed them to (almost) zero.
    constexpr float scale = 1.0F / (1.0F + float(INT32_MAX));
    for (int k = 0; k < M; ++k)
        v[k] = float(int32_t(uint32_t(g()))) * scale;
    for (int k = M; k < N; ++k)
        v[k] = 0.0F;
    return v;
}
|
||||
|
||||
|
||||
int bench_oop_core(
|
||||
const conv_f_ptrs & conv_arch,
|
||||
const float * signal, const int sz_signal,
|
||||
const float * filter, const int sz_filter,
|
||||
const int blockLen,
|
||||
float * y
|
||||
)
|
||||
{
|
||||
conv_buffer_state state;
|
||||
const auto conv_oop = conv_arch.fp_conv_float_oop;
|
||||
int n_out_sum = 0;
|
||||
state.offset = 0;
|
||||
state.size = 0;
|
||||
papi_perf_counter perf_counter(1);
|
||||
for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
|
||||
{
|
||||
state.size += blockLen;
|
||||
int n_out = conv_oop(signal, &state, filter, sz_filter, y);
|
||||
n_out_sum += n_out;
|
||||
}
|
||||
return n_out_sum;
|
||||
}
|
||||
|
||||
int bench_inplace_core(
|
||||
const conv_f_ptrs & conv_arch,
|
||||
float * signal, const int sz_signal,
|
||||
const float * filter, const int sz_filter,
|
||||
const int blockLen
|
||||
)
|
||||
{
|
||||
conv_buffer_state state;
|
||||
const auto conv_inplace = conv_arch.fp_conv_float_inplace;
|
||||
int n_out_sum = 0;
|
||||
state.offset = 0;
|
||||
state.size = 0;
|
||||
papi_perf_counter perf_counter(1);
|
||||
for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
|
||||
{
|
||||
state.size += blockLen;
|
||||
int n_out = conv_inplace(signal, &state, filter, sz_filter);
|
||||
n_out_sum += n_out;
|
||||
}
|
||||
return n_out_sum;
|
||||
}
|
||||
|
||||
|
||||
int bench_oop(
|
||||
const conv_f_ptrs & conv_arch,
|
||||
float * buffer,
|
||||
const float * signal, const int sz_signal,
|
||||
const float * filter, const int sz_filter,
|
||||
const int blockLen,
|
||||
float * y
|
||||
)
|
||||
{
|
||||
conv_buffer_state state;
|
||||
const auto conv_oop = conv_arch.fp_conv_float_oop;
|
||||
const auto move_rest = conv_arch.fp_conv_float_move_rest;
|
||||
int n_out_sum = 0;
|
||||
state.offset = 0;
|
||||
state.size = 0;
|
||||
papi_perf_counter perf_counter(1);
|
||||
for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
|
||||
{
|
||||
move_rest(buffer, &state);
|
||||
//memcpy(buffer+state.size, &s[off], B * sizeof(s[0]));
|
||||
std::copy(&signal[off], &signal[off+blockLen], buffer+state.size);
|
||||
state.size += blockLen;
|
||||
int n_out = conv_oop(buffer, &state, filter, sz_filter, &y[n_out_sum]);
|
||||
n_out_sum += n_out;
|
||||
}
|
||||
return n_out_sum;
|
||||
}
|
||||
|
||||
int bench_cx_real_oop(
|
||||
const conv_f_ptrs & conv_arch,
|
||||
complexf * buffer,
|
||||
const float * signal_re, const int sz_signal_re,
|
||||
const float * filter, const int sz_filter,
|
||||
const int blockLen,
|
||||
float * y_re
|
||||
)
|
||||
{
|
||||
conv_buffer_state state;
|
||||
const auto conv_oop = conv_arch.fp_conv_cplx_float_oop;
|
||||
const auto move_rest = conv_arch.fp_conv_cplx_move_rest;
|
||||
// interpret buffer, signal and output vector y as complex data
|
||||
complexf * y = reinterpret_cast<complexf *>(y_re);
|
||||
const complexf * signal = reinterpret_cast<const complexf *>(signal_re);
|
||||
const int sz_signal = sz_signal_re / 2;
|
||||
int n_out_sum = 0;
|
||||
state.offset = 0;
|
||||
state.size = 0;
|
||||
papi_perf_counter perf_counter(1);
|
||||
for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
|
||||
{
|
||||
move_rest(buffer, &state);
|
||||
//memcpy(buffer+state.size, &s[off], B * sizeof(s[0]));
|
||||
std::copy(&signal[off], &signal[off+blockLen], &buffer[state.size]);
|
||||
state.size += blockLen;
|
||||
int n_out = conv_oop(buffer, &state, filter, sz_filter, &y[n_out_sum]);
|
||||
n_out_sum += n_out;
|
||||
}
|
||||
return n_out_sum;
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
// cli defaults:
|
||||
// process up to 64 MSample (512 MByte) in blocks of 1 kSamples (=64 kByte) with filterLen 128
|
||||
int arch = 0, N = 64 * 1024 * 1024;
|
||||
int filterLen = 128, blockLen = 1024;
|
||||
int seed_sig = 1, seed_filter = 2;
|
||||
bool verbose = false, exitFromUsage = false, showUsage = (argc <= 1);
|
||||
|
||||
for (int i = 1; i < argc; ++i)
|
||||
{
|
||||
if (i+1 < argc && !strcmp(argv[i], "-a"))
|
||||
arch = atoi(argv[++i]);
|
||||
else if (i+1 < argc && !strcmp(argv[i], "-n"))
|
||||
N = atoi(argv[++i]) * 1024 * 1024;
|
||||
else if (i+1 < argc && !strcmp(argv[i], "-f"))
|
||||
filterLen = atoi(argv[++i]);
|
||||
else if (i+1 < argc && !strcmp(argv[i], "-b"))
|
||||
blockLen = atoi(argv[++i]);
|
||||
else if (i+1 < argc && !strcmp(argv[i], "-ss"))
|
||||
seed_sig = atoi(argv[++i]);
|
||||
else if (i+1 < argc && !strcmp(argv[i], "-sf"))
|
||||
seed_filter = atoi(argv[++i]);
|
||||
else if (!strcmp(argv[i], "-v"))
|
||||
verbose = true;
|
||||
else if (!strcmp(argv[i], "-h"))
|
||||
showUsage = exitFromUsage = true;
|
||||
else
|
||||
fprintf(stderr, "warning: ignoring/skipping unknown option '%s'\n", argv[i]);
|
||||
}
|
||||
|
||||
int num_arch = 0;
|
||||
const ptr_to_conv_f_ptrs * conv_arch_ptrs = get_all_conv_arch_ptrs(&num_arch);
|
||||
|
||||
if (verbose)
|
||||
{
|
||||
fprintf(stderr, "num_arch is %d\n", num_arch);
|
||||
for (int a = 0; a < num_arch; ++a)
|
||||
if (conv_arch_ptrs[a])
|
||||
fprintf(stderr, " arch %d is '%s'\n", a, conv_arch_ptrs[a]->id );
|
||||
else
|
||||
fprintf(stderr, " arch %d is nullptr !!!\n", a );
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
if ( arch < 0 || arch >= num_arch || !blockLen || !N || !filterLen || showUsage )
|
||||
{
|
||||
fprintf(stderr, "%s [-v] [-a <arch>] [-n <total # of MSamples> [-f <filter length>] [-b <blockLength in samples>]\n", argv[0]);
|
||||
fprintf(stderr, " [-ss <random seed for signal>] [-sf <random seed for filter coeffs>]\n");
|
||||
fprintf(stderr, "arch is one of:");
|
||||
for (int a = 0; a < num_arch; ++a)
|
||||
if (conv_arch_ptrs[a])
|
||||
fprintf(stderr, " %d for '%s'%s", a, conv_arch_ptrs[a]->id, (a < num_arch-1 ? ",":"") );
|
||||
fprintf(stderr, "\n");
|
||||
if ( exitFromUsage || !blockLen || !N || !filterLen || arch < 0 || arch >= num_arch )
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (verbose)
|
||||
{
|
||||
#ifdef HAVE_PAPI
|
||||
fprintf(stderr, "PAPI is available\n");
|
||||
#else
|
||||
fprintf(stderr, "PAPI is NOT available!\n");
|
||||
#endif
|
||||
}
|
||||
#if !defined(HAVE_MIPP)
|
||||
fprintf(stderr, "MIPP is NOT available!\n");
|
||||
#endif
|
||||
|
||||
//int float_simd_size[num_arch];
|
||||
int max_simd_size = -1;
|
||||
for (int a = 0; a < num_arch; ++a)
|
||||
{
|
||||
if (conv_arch_ptrs[a])
|
||||
{
|
||||
const int sz = conv_arch_ptrs[a]->fp_conv_float_simd_size();
|
||||
//float_simd_size[a] = sz;
|
||||
if (max_simd_size < sz)
|
||||
max_simd_size = sz;
|
||||
if (verbose)
|
||||
fprintf(stderr, "float simd size for '%s': %d\n", conv_arch_ptrs[a]->id, sz);
|
||||
}
|
||||
//else
|
||||
// float_simd_size[a] = 0;
|
||||
}
|
||||
//const int max_simd_size = *std::max_element( &float_simd_size[0], &float_simd_size[num_arch] );
|
||||
if (verbose)
|
||||
fprintf(stderr, "max float simd size: %d\n", max_simd_size);
|
||||
|
||||
#if TEST_WITH_MIN_LEN
|
||||
filterLen = 2;
|
||||
#endif
|
||||
|
||||
// round up filter length
|
||||
filterLen = max_simd_size * ( ( filterLen + max_simd_size -1 ) / max_simd_size );
|
||||
|
||||
#if TEST_WITH_MIN_LEN
|
||||
blockLen = 1;
|
||||
N = 2 * (3 + filterLen); // produce 3+1 samples
|
||||
#endif
|
||||
|
||||
if (!conv_arch_ptrs[arch])
|
||||
{
|
||||
fprintf(stderr, "Error: architecture %d is NOT available!\n", arch);
|
||||
return 1;
|
||||
}
|
||||
const conv_f_ptrs & conv_arch = *conv_arch_ptrs[arch];
|
||||
if (verbose)
|
||||
fprintf(stderr, "arch is using mipp: %d\n", conv_arch.using_mipp);
|
||||
|
||||
fprintf(stderr, "processing N = %d MSamples with block length of %d samples with filter length %d taps on '%s'\n",
|
||||
N / (1024 * 1024), blockLen, filterLen, conv_arch.id );
|
||||
|
||||
MIPP_VECTOR<float> s = generate_rng_vec(N + 1, N + 1, seed_sig);
|
||||
MIPP_VECTOR<float> y(N + 1, 0.0F);
|
||||
MIPP_VECTOR<float> filter = generate_rng_vec(filterLen, filterLen, seed_filter);
|
||||
MIPP_VECTOR<float> buffer(blockLen + filterLen + 1, 0.0F);
|
||||
MIPP_VECTOR<complexf> buffer_cx(blockLen + filterLen + 1);
|
||||
|
||||
#if 1 && TEST_WITH_MIN_LEN
|
||||
for (int k = 0; k < N; ++k)
|
||||
s[k] = (k+1);
|
||||
for (int k = 0; k < filterLen; ++k)
|
||||
filter[k] = (k+1);
|
||||
#endif
|
||||
|
||||
s[N] = 123.0F;
|
||||
y[N] = 321.0F;
|
||||
buffer[blockLen + filterLen] = 789.0F;
|
||||
buffer_cx[blockLen + filterLen].i = 987.0F;
|
||||
|
||||
fprintf(stderr, "\nrunning out-of-place convolution core for '%s':\n", conv_arch.id);
|
||||
int n_oop_out = bench_oop_core(conv_arch, s.data(), N, filter.data(), filterLen, blockLen, y.data());
|
||||
fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
|
||||
#if TEST_WITH_MIN_LEN
|
||||
for (int k = 0; k < n_oop_out; ++k )
|
||||
fprintf(stderr, "y[%2d] = %g\n", k, y[k]);
|
||||
fprintf(stderr, "\n");
|
||||
#endif
|
||||
|
||||
fprintf(stderr, "\nrunning out-of-place convolution for '%s':\n", conv_arch.id);
|
||||
n_oop_out = bench_oop(conv_arch, buffer.data(), s.data(), N, filter.data(), filterLen, blockLen, y.data());
|
||||
fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
|
||||
assert(s[N] == 123.0F);
|
||||
assert(y[N] == 321.0F);
|
||||
assert(buffer[blockLen + filterLen] == 789.0F);
|
||||
assert(buffer_cx[blockLen + filterLen].i == 987.0F);
|
||||
#if TEST_WITH_MIN_LEN
|
||||
for (int k = 0; k < n_oop_out; ++k )
|
||||
fprintf(stderr, "y[%2d] = %g\n", k, y[k]);
|
||||
fprintf(stderr, "\n");
|
||||
#endif
|
||||
|
||||
fprintf(stderr, "\nrunning out-of-place complex/real convolution for '%s':\n", conv_arch.id);
|
||||
n_oop_out = bench_cx_real_oop(conv_arch, buffer_cx.data(), s.data(), N, filter.data(), filterLen, blockLen, y.data());
|
||||
fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
|
||||
assert(s[N] == 123.0F);
|
||||
assert(y[N] == 321.0F);
|
||||
assert(buffer[blockLen + filterLen] == 789.0F);
|
||||
assert(buffer_cx[blockLen + filterLen].i == 987.0F);
|
||||
#if TEST_WITH_MIN_LEN
|
||||
fprintf(stderr, "complex output (%d complex samples):\n", n_oop_out);
|
||||
for (int k = 0; k < n_oop_out; ++k )
|
||||
fprintf(stderr, "y[%2d] = %g %+g * i\n", k, y[2*k], y[2*k+1]);
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
const std::complex<float> * sc = reinterpret_cast< std::complex<float>* >( s.data() );
|
||||
const int Nc = N /2;
|
||||
fprintf(stderr, "reference with std::complex<float>:\n");
|
||||
for (int off = 0; off +filterLen <= Nc; ++off )
|
||||
{
|
||||
std::complex<float> sum(0.0F, 0.0F);
|
||||
for (int k=0; k < filterLen; ++k)
|
||||
sum += sc[off+k] * filter[k];
|
||||
fprintf(stderr, "yv[%2d] = %g %+g * i\n", off, sum.real(), sum.imag() );
|
||||
}
|
||||
#endif
|
||||
|
||||
fprintf(stderr, "\nrunning inplace convolution core for '%s':\n", conv_arch.id);
|
||||
int n_inp_out = bench_inplace_core(conv_arch, s.data(), N, filter.data(), filterLen, blockLen);
|
||||
fprintf(stderr, "inp produced %d output samples\n", n_inp_out);
|
||||
assert(s[N] == 123.0F);
|
||||
assert(y[N] == 321.0F);
|
||||
assert(buffer[blockLen + filterLen] == 789.0F);
|
||||
assert(buffer_cx[blockLen + filterLen].i == 987.0F);
|
||||
#if TEST_WITH_MIN_LEN
|
||||
for (int k = 0; k < n_inp_out; ++k )
|
||||
fprintf(stderr, "y[%2d] = %g\n", k, s[k]);
|
||||
fprintf(stderr, "\n");
|
||||
#endif
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
return 0;
|
||||
}
|
||||
889
pffft/bench_mixers.cpp
Normal file
889
pffft/bench_mixers.cpp
Normal file
@@ -0,0 +1,889 @@
|
||||
/*
|
||||
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
|
||||
|
||||
bench for mixer algorithm/implementations
|
||||
|
||||
*/
|
||||
|
||||
#include <pf_mixer.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "papi_perf_counter.h"
|
||||
|
||||
#if defined(__linux__)
|
||||
#define HAVE_SYS_TIMES
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_SYS_TIMES
|
||||
# include <sys/times.h>
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
|
||||
#ifdef WIN32
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#define VC_EXTRALEAN
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#define BENCH_REF_TRIG_FUNC 1
|
||||
#define BENCH_OUT_OF_PLACE_ALGOS 0
|
||||
#define BENCH_INPLACE_ALGOS 1
|
||||
|
||||
#define SAVE_BY_DEFAULT 0
|
||||
#define SAVE_LIMIT_MSPS 16
|
||||
|
||||
#if 0
|
||||
#define BENCH_FILE_SHIFT_MATH_CC "/home/ayguen/WindowsDesktop/mixer_test/A_shift_math_cc.bin"
|
||||
#define BENCH_FILE_ADD_FAST_CC "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_cc.bin"
|
||||
#define BENCH_FILE_ADD_FAST_INP_C "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_inp_c.bin"
|
||||
#define BENCH_FILE_UNROLL_INP_C "/home/ayguen/WindowsDesktop/mixer_test/D_shift_unroll_inp_c.bin"
|
||||
#define BENCH_FILE_LTD_UNROLL_INP_C "/home/ayguen/WindowsDesktop/mixer_test/E_shift_limited_unroll_inp_c.bin"
|
||||
#define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/F_shift_limited_unroll_A_sse_inp_c.bin"
|
||||
#define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/G_shift_limited_unroll_B_sse_inp_c.bin"
|
||||
#define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/H_shift_limited_unroll_C_sse_inp_c.bin"
|
||||
#define BENCH_FILE_REC_OSC_CC ""
|
||||
#define BENCH_FILE_REC_OSC_INP_C "/home/ayguen/WindowsDesktop/mixer_test/I_shift_recursive_osc_inp_c.bin"
|
||||
#define BENCH_FILE_REC_OSC_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/J_shift_recursive_osc_sse_inp_c.bin"
|
||||
#else
|
||||
#define BENCH_FILE_SHIFT_MATH_CC ""
|
||||
#define BENCH_FILE_ADD_FAST_CC ""
|
||||
#define BENCH_FILE_ADD_FAST_INP_C ""
|
||||
#define BENCH_FILE_UNROLL_INP_C ""
|
||||
#define BENCH_FILE_LTD_UNROLL_INP_C ""
|
||||
#define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C ""
|
||||
#define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C ""
|
||||
#define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C ""
|
||||
#define BENCH_FILE_REC_OSC_CC ""
|
||||
#define BENCH_FILE_REC_OSC_INP_C ""
|
||||
#define BENCH_FILE_REC_OSC_SSE_INP_C ""
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#if defined(HAVE_SYS_TIMES)
/* clock ticks per second as reported by sysconf(); queried on first use */
static double ttclk = 0.;

/* Returns the user CPU time consumed by this process in seconds.
 * With find_start != 0 the function spins until the reported user time
 * changes, so that a measurement begins right at a tick boundary.
 */
static double uclock_sec(int find_start)
{
    struct tms now;
    if (ttclk == 0.)
    {
        ttclk = sysconf(_SC_CLK_TCK);
        fprintf(stderr, "sysconf(_SC_CLK_TCK) => %f\n", ttclk);
    }
    times(&now);
    if (find_start)
    {
        const clock_t first = now.tms_utime;
        do {
            times(&now);
        } while (now.tms_utime == first);
    }
    /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
    return ((double)now.tms_utime) / ttclk;
}

#elif defined(WIN32)
// https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getprocesstimes
/* Returns the total user time of this process in seconds, or 0 if the query
 * fails. find_start is not used by this variant.
 */
double uclock_sec(int find_start)
{
    FILETIME creation_time, exit_time, kernel_time, user_time;
    if (GetProcessTimes(GetCurrentProcess(), &creation_time, &exit_time, &kernel_time, &user_time) == 0)
    {
        // Handle error
        return 0;
    }
    // Only the user time is returned.
    // Can be tweaked to include kernel times as well.
    const unsigned long long ticks =
        user_time.dwLowDateTime | ((unsigned long long)user_time.dwHighDateTime << 32);
    return (double)ticks * 0.0000001;
}

#else
/* Fallback using clock() from <time.h>; find_start is not used. */
double uclock_sec(int find_start)
{
    return (double)clock()/(double)CLOCKS_PER_SEC;
}
#endif
|
||||
|
||||
|
||||
/**
 * Writes benchmark output samples to a binary file.
 *
 * @param d   samples to write
 * @param B   block length: samples are written in whole blocks of B
 * @param N   number of available samples; limited to SAVE_LIMIT_MSPS MSamples
 * @param fn  file name. With NULL or an empty name nothing is saved, unless
 *            SAVE_BY_DEFAULT is set: then "/dev/shm/bench.bin" is used.
 */
void save(complexf * d, int B, int N, const char * fn)
{
    if (!fn || !fn[0])
    {
        if (! SAVE_BY_DEFAULT)
            return;
        fn = "/dev/shm/bench.bin";
    }
    // no whole block can be written; B == 0 would also never end the loop below
    if (B <= 0)
        return;
    FILE * f = fopen(fn, "wb");
    if (!f) {
        fprintf(stderr, "error writing result to %s\n", fn);
        return;
    }
    if ( N >= SAVE_LIMIT_MSPS * 1024 * 1024 )
        N = SAVE_LIMIT_MSPS * 1024 * 1024;
    for (int off = 0; off + B <= N; off += B)
    {
        // report a short write (e.g. disk full) instead of silently truncating
        if (fwrite(d+off, sizeof(complexf), B, f) != (size_t)B)
        {
            fprintf(stderr, "error writing result to %s\n", fn);
            break;
        }
    }
    // buffered data is flushed on close: check that result, too
    if (fclose(f) != 0)
        fprintf(stderr, "error writing result to %s\n", fn);
}
|
||||
|
||||
|
||||
double bench_core_shift_math_cc(
|
||||
const int B, const int N, const bool ignore_time,
|
||||
const complexf *input,
|
||||
complexf *output,
|
||||
int &iters_out, int &off_out
|
||||
)
|
||||
{
|
||||
const double t0 = uclock_sec(1);
|
||||
const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||
double t1;
|
||||
float phase = 0.0F;
|
||||
int off = 0, iter = 0;
|
||||
papi_perf_counter perf_counter(1);
|
||||
|
||||
do {
|
||||
// work
|
||||
phase = shift_math_cc(input+off, output+off, B, -0.0009F, phase);
|
||||
off += B;
|
||||
++iter;
|
||||
t1 = uclock_sec(0);
|
||||
} while ( off + B < N && (ignore_time || t1 < tstop) );
|
||||
|
||||
iters_out = iter;
|
||||
off_out = off;
|
||||
return t1 - t0;
|
||||
}
|
||||
|
||||
double bench_shift_math_cc(const int B, const int N, const bool ignore_time) {
|
||||
int iter, off;
|
||||
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||
complexf *output = (complexf *)malloc(N * sizeof(complexf));
|
||||
shift_recursive_osc_t gen_state;
|
||||
shift_recursive_osc_conf_t gen_conf;
|
||||
|
||||
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||
|
||||
double T = bench_core_shift_math_cc(B, N, ignore_time, input, output, iter, off);
|
||||
|
||||
save(output, B, off, BENCH_FILE_SHIFT_MATH_CC);
|
||||
|
||||
free(input);
|
||||
free(output);
|
||||
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||
double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||
return (nI / T); /* normalized iterations per second */
|
||||
}
|
||||
|
||||
|
||||
double bench_shift_table_cc(int B, int N) {
|
||||
double t0, t1, tstop, T, nI;
|
||||
int iter, off;
|
||||
int table_size=65536;
|
||||
float phase = 0.0F;
|
||||
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||
complexf *output = (complexf *)malloc(N * sizeof(complexf));
|
||||
shift_recursive_osc_t gen_state;
|
||||
shift_recursive_osc_conf_t gen_conf;
|
||||
|
||||
shift_table_data_t table_data = shift_table_init(table_size);
|
||||
|
||||
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||
|
||||
iter = 0;
|
||||
off = 0;
|
||||
t0 = uclock_sec(1);
|
||||
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||
do {
|
||||
// work
|
||||
phase = shift_table_cc(input+off, output+off, B, -0.0009F, table_data, phase);
|
||||
|
||||
off += B;
|
||||
++iter;
|
||||
t1 = uclock_sec(0);
|
||||
} while ( t1 < tstop && off + B < N );
|
||||
|
||||
save(output, B, off, NULL);
|
||||
free(input);
|
||||
free(output);
|
||||
T = ( t1 - t0 ); /* duration per fft() */
|
||||
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||
return (nI / T); /* normalized iterations per second */
|
||||
}
|
||||
|
||||
|
||||
double bench_shift_addfast(int B, int N) {
|
||||
double t0, t1, tstop, T, nI;
|
||||
int iter, off;
|
||||
float phase = 0.0F;
|
||||
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||
complexf *output = (complexf *)malloc(N * sizeof(complexf));
|
||||
shift_recursive_osc_t gen_state;
|
||||
shift_recursive_osc_conf_t gen_conf;
|
||||
shift_addfast_data_t state = shift_addfast_init(-0.0009F);
|
||||
|
||||
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||
|
||||
iter = 0;
|
||||
off = 0;
|
||||
t0 = uclock_sec(1);
|
||||
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||
do {
|
||||
// work
|
||||
phase = shift_addfast_cc(input+off, output+off, B, &state, phase);
|
||||
|
||||
off += B;
|
||||
++iter;
|
||||
t1 = uclock_sec(0);
|
||||
} while ( t1 < tstop && off + B < N );
|
||||
|
||||
save(output, B, off, BENCH_FILE_ADD_FAST_CC);
|
||||
|
||||
free(input);
|
||||
free(output);
|
||||
T = ( t1 - t0 ); /* duration per fft() */
|
||||
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||
return (nI / T); /* normalized iterations per second */
|
||||
}
|
||||
|
||||
|
||||
double bench_core_shift_addfast_inplace(
|
||||
const int B, const int N, const bool ignore_time,
|
||||
complexf *data,
|
||||
shift_addfast_data_t &state,
|
||||
int &iters_out, int &off_out
|
||||
)
|
||||
{
|
||||
const double t0 = uclock_sec(1);
|
||||
const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||
double t1;
|
||||
float phase = 0.0F;
|
||||
int off = 0, iter = 0;
|
||||
papi_perf_counter perf_counter(1);
|
||||
|
||||
do {
|
||||
// work
|
||||
phase = shift_addfast_inp_c(data+off, B, &state, phase);
|
||||
off += B;
|
||||
++iter;
|
||||
t1 = uclock_sec(0);
|
||||
} while ( off + B < N && (ignore_time || t1 < tstop) );
|
||||
|
||||
iters_out = iter;
|
||||
off_out = off;
|
||||
return t1 - t0;
|
||||
}
|
||||
|
||||
double bench_shift_addfast_inp(int B, int N, const bool ignore_time) {
|
||||
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||
shift_recursive_osc_t gen_state;
|
||||
shift_recursive_osc_conf_t gen_conf;
|
||||
shift_addfast_data_t state = shift_addfast_init(-0.0009F);
|
||||
int iter, off;
|
||||
|
||||
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||
|
||||
double T = bench_core_shift_addfast_inplace(
|
||||
B, N, ignore_time, input, state,
|
||||
iter, off
|
||||
);
|
||||
|
||||
save(input, B, off, BENCH_FILE_ADD_FAST_INP_C);
|
||||
|
||||
free(input);
|
||||
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||
double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||
return (nI / T); /* normalized iterations per second */
|
||||
}
|
||||
|
||||
|
||||
double bench_shift_unroll_oop(int B, int N) {
|
||||
double t0, t1, tstop, T, nI;
|
||||
int iter, off;
|
||||
float phase = 0.0F;
|
||||
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||
complexf *output = (complexf *)malloc(N * sizeof(complexf));
|
||||
shift_recursive_osc_t gen_state;
|
||||
shift_recursive_osc_conf_t gen_conf;
|
||||
shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);
|
||||
|
||||
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||
|
||||
iter = 0;
|
||||
off = 0;
|
||||
t0 = uclock_sec(1);
|
||||
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||
do {
|
||||
// work
|
||||
phase = shift_unroll_cc(input+off, output+off, B, &state, phase);
|
||||
|
||||
off += B;
|
||||
++iter;
|
||||
t1 = uclock_sec(0);
|
||||
} while ( t1 < tstop && off + B < N );
|
||||
|
||||
save(output, B, off, NULL);
|
||||
free(input);
|
||||
free(output);
|
||||
T = ( t1 - t0 ); /* duration per fft() */
|
||||
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||
return (nI / T); /* normalized iterations per second */
|
||||
}
|
||||
|
||||
|
||||
double bench_core_shift_unroll_inplace(
|
||||
const int B, const int N, const bool ignore_time,
|
||||
complexf *data,
|
||||
shift_unroll_data_t &state,
|
||||
int &iters_out, int &off_out
|
||||
)
|
||||
{
|
||||
const double t0 = uclock_sec(1);
|
||||
const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||
double t1;
|
||||
float phase = 0.0F;
|
||||
int off = 0, iter = 0;
|
||||
papi_perf_counter perf_counter(1);
|
||||
|
||||
do {
|
||||
// work
|
||||
phase = shift_unroll_inp_c(data+off, B, &state, phase);
|
||||
off += B;
|
||||
++iter;
|
||||
t1 = uclock_sec(0);
|
||||
} while ( off + B < N && (ignore_time || t1 < tstop) );
|
||||
|
||||
iters_out = iter;
|
||||
off_out = off;
|
||||
return t1 - t0;
|
||||
}
|
||||
|
||||
double bench_shift_unroll_inp(const int B, const int N, const bool ignore_time) {
|
||||
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||
shift_recursive_osc_t gen_state;
|
||||
shift_recursive_osc_conf_t gen_conf;
|
||||
shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);
|
||||
int iter, off;
|
||||
|
||||
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||
|
||||
double T = bench_core_shift_unroll_inplace(
|
||||
B, N, ignore_time, input, state,
|
||||
iter, off
|
||||
);
|
||||
|
||||
save(input, B, off, BENCH_FILE_UNROLL_INP_C);
|
||||
|
||||
free(input);
|
||||
shift_unroll_deinit(&state);
|
||||
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||
double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||
return (nI / T); /* normalized iterations per second */
|
||||
}
|
||||
|
||||
|
||||
|
||||
double bench_shift_limited_unroll_oop(int B, int N) {
|
||||
double t0, t1, tstop, T, nI;
|
||||
int iter, off;
|
||||
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||
complexf *output = (complexf *)malloc(N * sizeof(complexf));
|
||||
shift_recursive_osc_t gen_state;
|
||||
shift_recursive_osc_conf_t gen_conf;
|
||||
shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);
|
||||
|
||||
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||
|
||||
iter = 0;
|
||||
off = 0;
|
||||
t0 = uclock_sec(1);
|
||||
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||
do {
|
||||
// work
|
||||
shift_limited_unroll_cc(input+off, output+off, B, &state);
|
||||
|
||||
off += B;
|
||||
++iter;
|
||||
t1 = uclock_sec(0);
|
||||
} while ( t1 < tstop && off + B < N );
|
||||
|
||||
save(output, B, off, NULL);
|
||||
free(input);
|
||||
free(output);
|
||||
T = ( t1 - t0 ); /* duration per fft() */
|
||||
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||
return (nI / T); /* normalized iterations per second */
|
||||
}
|
||||
|
||||
|
||||
double bench_core_shift_limited_unroll_inplace(
|
||||
const int B, const int N, const bool ignore_time,
|
||||
complexf *data,
|
||||
shift_limited_unroll_data_t &state,
|
||||
int &iters_out, int &off_out
|
||||
)
|
||||
{
|
||||
const double t0 = uclock_sec(1);
|
||||
const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||
double t1;
|
||||
int off = 0, iter = 0;
|
||||
papi_perf_counter perf_counter(1);
|
||||
|
||||
do {
|
||||
// work
|
||||
shift_limited_unroll_inp_c(data+off, B, &state);
|
||||
off += B;
|
||||
++iter;
|
||||
t1 = uclock_sec(0);
|
||||
} while ( off + B < N && (ignore_time || t1 < tstop) );
|
||||
|
||||
iters_out = iter;
|
||||
off_out = off;
|
||||
return t1 - t0;
|
||||
}
|
||||
|
||||
double bench_shift_limited_unroll_inp(const int B, const int N, const bool ignore_time) {
|
||||
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||
shift_recursive_osc_t gen_state;
|
||||
shift_recursive_osc_conf_t gen_conf;
|
||||
shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);
|
||||
int iter, off;
|
||||
|
||||
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||
|
||||
double T = bench_core_shift_limited_unroll_inplace(
|
||||
B, N, ignore_time, input, state,
|
||||
iter, off
|
||||
);
|
||||
|
||||
save(input, B, off, BENCH_FILE_LTD_UNROLL_INP_C);
|
||||
|
||||
free(input);
|
||||
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||
double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||
return (nI / T); /* normalized iterations per second */
|
||||
}
|
||||
|
||||
|
||||
double bench_core_shift_limited_unroll_A_sse_inplace(
|
||||
const int B, const int N, const bool ignore_time,
|
||||
complexf *data,
|
||||
shift_limited_unroll_A_sse_data_t &state,
|
||||
int &iters_out, int &off_out
|
||||
)
|
||||
{
|
||||
const double t0 = uclock_sec(1);
|
||||
const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||
double t1;
|
||||
int off = 0, iter = 0;
|
||||
papi_perf_counter perf_counter(1);
|
||||
|
||||
do {
|
||||
// work
|
||||
shift_limited_unroll_A_sse_inp_c(data+off, B, &state);
|
||||
off += B;
|
||||
++iter;
|
||||
t1 = uclock_sec(0);
|
||||
} while ( off + B < N && (ignore_time || t1 < tstop) );
|
||||
|
||||
iters_out = iter;
|
||||
off_out = off;
|
||||
return t1 - t0;
|
||||
}
|
||||
|
||||
double bench_shift_limited_unroll_A_sse_inp(const int B, const int N, const bool ignore_time) {
|
||||
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||
shift_recursive_osc_t gen_state;
|
||||
shift_recursive_osc_conf_t gen_conf;
|
||||
shift_limited_unroll_A_sse_data_t *state = (shift_limited_unroll_A_sse_data_t*)malloc(sizeof(shift_limited_unroll_A_sse_data_t));
|
||||
int iter, off;
|
||||
|
||||
*state = shift_limited_unroll_A_sse_init(-0.0009F, 0.0F);
|
||||
|
||||
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||
|
||||
double T = bench_core_shift_limited_unroll_A_sse_inplace(
|
||||
B, N, ignore_time, input, *state,
|
||||
iter, off
|
||||
);
|
||||
|
||||
save(input, B, off, BENCH_FILE_LTD_UNROLL_A_SSE_INP_C);
|
||||
|
||||
free(input);
|
||||
free(state);
|
||||
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||
double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||
return (nI / T); /* normalized iterations per second */
|
||||
}
|
||||
|
||||
|
||||
double bench_core_shift_limited_unroll_B_sse_inplace(
|
||||
const int B, const int N, const bool ignore_time,
|
||||
complexf *data,
|
||||
shift_limited_unroll_B_sse_data_t &state,
|
||||
int &iters_out, int &off_out
|
||||
)
|
||||
{
|
||||
const double t0 = uclock_sec(1);
|
||||
const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||
double t1;
|
||||
int off = 0, iter = 0;
|
||||
papi_perf_counter perf_counter(1);
|
||||
|
||||
do {
|
||||
// work
|
||||
shift_limited_unroll_B_sse_inp_c(data+off, B, &state);
|
||||
off += B;
|
||||
++iter;
|
||||
t1 = uclock_sec(0);
|
||||
} while ( off + B < N && (ignore_time || t1 < tstop) );
|
||||
|
||||
iters_out = iter;
|
||||
off_out = off;
|
||||
return t1 - t0;
|
||||
}
|
||||
|
||||
double bench_shift_limited_unroll_B_sse_inp(const int B, const int N, const bool ignore_time) {
|
||||
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||
shift_recursive_osc_t gen_state;
|
||||
shift_recursive_osc_conf_t gen_conf;
|
||||
shift_limited_unroll_B_sse_data_t *state = (shift_limited_unroll_B_sse_data_t*)malloc(sizeof(shift_limited_unroll_B_sse_data_t));
|
||||
int iter, off;
|
||||
|
||||
*state = shift_limited_unroll_B_sse_init(-0.0009F, 0.0F);
|
||||
|
||||
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||
//shift_recursive_osc_init(0.0F, 0.0F, &gen_conf, &gen_state);
|
||||
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||
|
||||
double T = bench_core_shift_limited_unroll_B_sse_inplace(
|
||||
B, N, ignore_time, input, *state,
|
||||
iter, off
|
||||
);
|
||||
|
||||
save(input, B, off, BENCH_FILE_LTD_UNROLL_B_SSE_INP_C);
|
||||
|
||||
free(input);
|
||||
free(state);
|
||||
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||
double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||
return (nI / T); /* normalized iterations per second */
|
||||
}
|
||||
|
||||
|
||||
double bench_core_shift_limited_unroll_C_sse_inplace(
|
||||
const int B, const int N, const bool ignore_time,
|
||||
complexf *data,
|
||||
shift_limited_unroll_C_sse_data_t &state,
|
||||
int &iters_out, int &off_out
|
||||
)
|
||||
{
|
||||
const double t0 = uclock_sec(1);
|
||||
const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||
double t1;
|
||||
int off = 0, iter = 0;
|
||||
papi_perf_counter perf_counter(1);
|
||||
|
||||
do {
|
||||
// work
|
||||
shift_limited_unroll_C_sse_inp_c(data+off, B, &state);
|
||||
off += B;
|
||||
++iter;
|
||||
t1 = uclock_sec(0);
|
||||
} while ( off + B < N && (ignore_time || t1 < tstop) );
|
||||
|
||||
iters_out = iter;
|
||||
off_out = off;
|
||||
return t1 - t0;
|
||||
}
|
||||
|
||||
double bench_shift_limited_unroll_C_sse_inp(const int B, const int N, const bool ignore_time) {
|
||||
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||
shift_recursive_osc_t gen_state;
|
||||
shift_recursive_osc_conf_t gen_conf;
|
||||
shift_limited_unroll_C_sse_data_t *state = (shift_limited_unroll_C_sse_data_t*)malloc(sizeof(shift_limited_unroll_C_sse_data_t));
|
||||
int iter, off;
|
||||
|
||||
*state = shift_limited_unroll_C_sse_init(-0.0009F, 0.0F);
|
||||
|
||||
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||
|
||||
double T = bench_core_shift_limited_unroll_C_sse_inplace(
|
||||
B, N, ignore_time, input, *state,
|
||||
iter, off
|
||||
);
|
||||
|
||||
save(input, B, off, BENCH_FILE_LTD_UNROLL_C_SSE_INP_C);
|
||||
|
||||
free(input);
|
||||
free(state);
|
||||
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||
double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||
return (nI / T); /* normalized iterations per second */
|
||||
}
|
||||
|
||||
|
||||
double bench_shift_rec_osc_cc_oop(int B, int N) {
|
||||
double t0, t1, tstop, T, nI;
|
||||
int iter, off;
|
||||
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||
complexf *output = (complexf *)malloc(N * sizeof(complexf));
|
||||
shift_recursive_osc_t gen_state, shift_state;
|
||||
shift_recursive_osc_conf_t gen_conf, shift_conf;
|
||||
|
||||
shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);
|
||||
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||
|
||||
iter = 0;
|
||||
off = 0;
|
||||
t0 = uclock_sec(1);
|
||||
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||
do {
|
||||
// work
|
||||
shift_recursive_osc_cc(input+off, output+off, B, &shift_conf, &shift_state);
|
||||
|
||||
off += B;
|
||||
++iter;
|
||||
t1 = uclock_sec(0);
|
||||
} while ( t1 < tstop && off + B < N );
|
||||
|
||||
save(input, B, off, BENCH_FILE_REC_OSC_CC);
|
||||
|
||||
save(output, B, off, NULL);
|
||||
free(input);
|
||||
free(output);
|
||||
T = ( t1 - t0 ); /* duration per fft() */
|
||||
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||
return (nI / T); /* normalized iterations per second */
|
||||
}
|
||||
|
||||
|
||||
double bench_core_shift_rec_osc_cc_inplace(
|
||||
const int B, const int N, const bool ignore_time,
|
||||
complexf *data,
|
||||
shift_recursive_osc_conf_t &conf, shift_recursive_osc_t &state,
|
||||
int &iters_out, int &off_out
|
||||
)
|
||||
{
|
||||
const double t0 = uclock_sec(1);
|
||||
const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||
double t1;
|
||||
int off = 0, iter = 0;
|
||||
papi_perf_counter perf_counter(1);
|
||||
|
||||
do {
|
||||
// work
|
||||
shift_recursive_osc_inp_c(data+off, B, &conf, &state);
|
||||
off += B;
|
||||
++iter;
|
||||
t1 = uclock_sec(0);
|
||||
} while ( off + B < N && (ignore_time || t1 < tstop) );
|
||||
|
||||
iters_out = iter;
|
||||
off_out = off;
|
||||
return t1 - t0;
|
||||
}
|
||||
|
||||
double bench_shift_rec_osc_cc_inp(const int B, const int N, const bool ignore_time) {
|
||||
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||
shift_recursive_osc_t gen_state, shift_state;
|
||||
shift_recursive_osc_conf_t gen_conf, shift_conf;
|
||||
int iter, off;
|
||||
|
||||
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||
shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);
|
||||
|
||||
double T = bench_core_shift_rec_osc_cc_inplace(
|
||||
B, N, ignore_time, input, shift_conf, shift_state,
|
||||
iter, off
|
||||
);
|
||||
|
||||
save(input, B, off, BENCH_FILE_REC_OSC_INP_C);
|
||||
free(input);
|
||||
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||
double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||
return (nI / T); /* normalized iterations per second */
|
||||
}
|
||||
|
||||
|
||||
double bench_core_shift_rec_osc_sse_c_inplace(
|
||||
const int B, const int N, const bool ignore_time,
|
||||
complexf *data,
|
||||
shift_recursive_osc_sse_conf_t &conf, shift_recursive_osc_sse_t &state,
|
||||
int &iters_out, int &off_out
|
||||
)
|
||||
{
|
||||
const double t0 = uclock_sec(1);
|
||||
const double tstop = t0 + 0.5; /* benchmark duration: 500 ms */
|
||||
double t1;
|
||||
int off = 0, iter = 0;
|
||||
papi_perf_counter perf_counter(1);
|
||||
|
||||
do {
|
||||
// work
|
||||
shift_recursive_osc_sse_inp_c(data+off, B, &conf, &state);
|
||||
off += B;
|
||||
++iter;
|
||||
t1 = uclock_sec(0);
|
||||
} while ( off + B < N && (ignore_time || t1 < tstop) );
|
||||
|
||||
iters_out = iter;
|
||||
off_out = off;
|
||||
return t1 - t0;
|
||||
}
|
||||
|
||||
double bench_shift_rec_osc_sse_c_inp(const int B, const int N, const bool ignore_time) {
|
||||
complexf *input = (complexf *)malloc(N * sizeof(complexf));
|
||||
shift_recursive_osc_t gen_state;
|
||||
shift_recursive_osc_conf_t gen_conf;
|
||||
|
||||
shift_recursive_osc_sse_t *shift_state = (shift_recursive_osc_sse_t*)malloc(sizeof(shift_recursive_osc_sse_t));
|
||||
shift_recursive_osc_sse_conf_t shift_conf;
|
||||
int iter, off;
|
||||
|
||||
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
|
||||
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
|
||||
|
||||
shift_recursive_osc_sse_init(-0.0009F, 0.0F, &shift_conf, shift_state);
|
||||
|
||||
double T = bench_core_shift_rec_osc_sse_c_inplace(
|
||||
B, N, ignore_time, input, shift_conf, *shift_state,
|
||||
iter, off
|
||||
);
|
||||
|
||||
save(input, B, off, BENCH_FILE_REC_OSC_SSE_INP_C);
|
||||
free(input);
|
||||
free(shift_state);
|
||||
printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
|
||||
double nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
|
||||
return (nI / T); /* normalized iterations per second */
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* Entry point of the mixer benchmark.
 *
 * Usage: <program> [<blockLength in samples> [<total # of MSamples>] ]
 *
 * Without arguments the usage line is printed and the benchmarks run with the
 * defaults (8192 samples per block, 64 MSamples in total). Invalid sizes
 * (non-positive values, a total that does not fit into an int, or a block
 * longer than the total) print the usage line and end the program with
 * return value 0 without running any benchmark.
 */
int main(int argc, char **argv)
{
    double rt;

    // process up to 64 MSample (512 MByte) in blocks of 8 kSamples (=64 kByte)
    int B = 8 * 1024;
    int N = 64 * 1024 * 1024;
    int showUsage = 0;
    bool ignore_time = true;

    if (argc == 1)
        showUsage = 1;

    if (1 < argc)
        B = atoi(argv[1]);
    if (2 < argc)
    {
        /* scale in 64 bit: an int product overflows for 2048 MSamples and more */
        const long long n_samples = (long long)atoi(argv[2]) * 1024 * 1024;
        N = (int)n_samples;
        if ( (long long)N != n_samples )
            N = 0;  /* total does not fit into an int: treat as invalid */
    }

    /* every benchmark loop processes at least one block of B samples,
     * so a block longer than the buffers would run out of bounds */
    const bool bad_args = ( B <= 0 || N <= 0 || N < B );

    if ( bad_args || showUsage )
    {
        fprintf(stderr, "%s [<blockLength in samples> [<total # of MSamples>] ]\n", argv[0]);
        if ( bad_args )
            return 0;
    }

    fprintf(stderr, "processing up to N = %d MSamples with block length of %d samples\n",
        N / (1024 * 1024), B );


#if BENCH_REF_TRIG_FUNC
    printf("\nstarting bench of shift_math_cc (out-of-place) with trig functions ..\n");
    rt = bench_shift_math_cc(B, N, ignore_time);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);
#endif

#if BENCH_OUT_OF_PLACE_ALGOS
    printf("starting bench of shift_table_cc (out-of-place) ..\n");
    rt = bench_shift_table_cc(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("starting bench of shift_addfast_cc (out-of-place) ..\n");
    rt = bench_shift_addfast(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("\nstarting bench of shift_unroll_cc (out-of-place) ..\n");
    rt = bench_shift_unroll_oop(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("\nstarting bench of shift_limited_unroll_cc (out-of-place) ..\n");
    rt = bench_shift_limited_unroll_oop(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("\nstarting bench of shift_recursive_osc_cc (out-of-place) ..\n");
    rt = bench_shift_rec_osc_cc_oop(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);
#endif

#if BENCH_INPLACE_ALGOS

    printf("starting bench of shift_addfast_inp_c in-place ..\n");
    rt = bench_shift_addfast_inp(B, N, ignore_time);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("starting bench of shift_unroll_inp_c in-place ..\n");
    rt = bench_shift_unroll_inp(B, N, ignore_time);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("starting bench of shift_limited_unroll_inp_c in-place ..\n");
    rt = bench_shift_limited_unroll_inp(B, N, ignore_time);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    /* the SSE variants are only run when have_sse_shift_mixer_impl() reports them */
    if ( have_sse_shift_mixer_impl() )
    {
        printf("starting bench of shift_limited_unroll_A_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_A_sse_inp(B, N, ignore_time);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);

        printf("starting bench of shift_limited_unroll_B_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_B_sse_inp(B, N, ignore_time);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);

        printf("starting bench of shift_limited_unroll_C_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_C_sse_inp(B, N, ignore_time);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);
    }

    printf("starting bench of shift_recursive_osc_cc in-place ..\n");
    rt = bench_shift_rec_osc_cc_inp(B, N, ignore_time);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    if ( have_sse_shift_mixer_impl() )
    {
        printf("starting bench of shift_recursive_osc_sse_c in-place ..\n");
        rt = bench_shift_rec_osc_sse_c_inp(B, N, ignore_time);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);
    }
#endif

    return 0;
}
|
||||
|
||||
1402
pffft/bench_pffft.c
Normal file
1402
pffft/bench_pffft.c
Normal file
File diff suppressed because it is too large
Load Diff
26
pffft/cmake/FindMIPP.cmake
Normal file
26
pffft/cmake/FindMIPP.cmake
Normal file
@@ -0,0 +1,26 @@
|
||||
|
||||
# Find the MIPP headers.
# Once done this will define
#  MIPP_FOUND        - mipp.h was found
#  MIPP_INCLUDE_DIRS - directory containing mipp.h
# and create the interface library target MIPP (once).
# Search hints: ${MIPP_ROOT} and $ENV{HOME}/.local, each with suffix include/mipp.

if(MIPP_INCLUDE_DIRS)
  # already located on a previous run: stay quiet
  set(MIPP_FIND_QUIETLY TRUE)
endif()

find_path(MIPP_INCLUDE_DIRS NAMES mipp.h
  HINTS
    ${MIPP_ROOT}
    $ENV{HOME}/.local
  PATH_SUFFIXES include/mipp
)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MIPP DEFAULT_MSG MIPP_INCLUDE_DIRS)

if(NOT MIPP_FOUND)
  # warn only when MIPP is really missing, not when the target already exists
  message(WARNING "MIPP not found.")
elseif(NOT TARGET MIPP)
  message(STATUS "MIPP_FOUND -> creating interface library MIPP at ${MIPP_INCLUDE_DIRS}")
  add_library(MIPP INTERFACE)
  target_compile_definitions(MIPP INTERFACE HAVE_MIPP=1)
  target_include_directories(MIPP INTERFACE ${MIPP_INCLUDE_DIRS})
  target_compile_features(MIPP INTERFACE cxx_std_11)
endif()

mark_as_advanced(MIPP_INCLUDE_DIRS)
|
||||
25
pffft/cmake/FindPAPI.cmake
Normal file
25
pffft/cmake/FindPAPI.cmake
Normal file
@@ -0,0 +1,25 @@
|
||||
# Find PAPI libraries
# Once done this will define
#  PAPI_FOUND - System has PAPI
#  PAPI_INCLUDE_DIRS - The PAPI include directories
#  PAPI_LIBRARIES - The libraries needed to use PAPI
# and create the imported target PAPI::PAPI (once).
# Search hint: ${PAPI_ROOT}

if(PAPI_INCLUDE_DIRS AND PAPI_LIBRARIES)
  # already located on a previous run: stay quiet
  set(PAPI_FIND_QUIETLY TRUE)
endif()

find_path(PAPI_INCLUDE_DIRS NAMES papi.h HINTS ${PAPI_ROOT} PATH_SUFFIXES include)
find_library(PAPI_LIBRARIES NAMES papi HINTS ${PAPI_ROOT} PATH_SUFFIXES lib lib64)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(PAPI DEFAULT_MSG PAPI_LIBRARIES PAPI_INCLUDE_DIRS)
if(PAPI_FOUND AND NOT TARGET PAPI::PAPI)
  # IMPORTED_LOCATION has to be a single file path: remember the library
  # itself before 'rt' is appended to the PAPI_LIBRARIES list
  set(_papi_library_file "${PAPI_LIBRARIES}")
  set(PAPI_LIBRARIES ${PAPI_LIBRARIES} rt)

  add_library(PAPI::PAPI SHARED IMPORTED)
  set_target_properties(PAPI::PAPI PROPERTIES
    INTERFACE_INCLUDE_DIRECTORIES "${PAPI_INCLUDE_DIRS}"
    IMPORTED_LOCATION "${_papi_library_file}"
    INTERFACE_LINK_LIBRARIES rt)
  unset(_papi_library_file)
endif()

mark_as_advanced(PAPI_INCLUDE_DIRS PAPI_LIBRARIES)
|
||||
11
pffft/cmake/compiler_warnings.cmake
Normal file
11
pffft/cmake/compiler_warnings.cmake
Normal file
@@ -0,0 +1,11 @@
|
||||
|
||||
# Adds -Wall -Wextra -pedantic as PRIVATE compile options of <target>
# when the C++ compiler id is GNU or Clang; other compilers get nothing.
function(target_activate_cxx_compiler_warnings target)
  set(_is_gnu_or_clang "$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>>")
  target_compile_options(${target} PRIVATE
    "$<${_is_gnu_or_clang}:-Wall;-Wextra;-pedantic>")
endfunction()
|
||||
|
||||
# Adds -Wall -Wextra -pedantic as PRIVATE compile options of <target>
# when the C compiler id is GNU or Clang; other compilers get nothing.
function(target_activate_c_compiler_warnings target)
  set(_is_gnu_or_clang "$<OR:$<C_COMPILER_ID:GNU>,$<C_COMPILER_ID:Clang>>")
  target_compile_options(${target} PRIVATE
    "$<${_is_gnu_or_clang}:-Wall;-Wextra;-pedantic>")
endfunction()
|
||||
|
||||
197
pffft/cmake/target_optimizations.cmake
Normal file
197
pffft/cmake/target_optimizations.cmake
Normal file
@@ -0,0 +1,197 @@
|
||||
|
||||
# cmake options: TARGET_C_ARCH / TARGET_CXX_ARCH
|
||||
# and optionally: TARGET_C_EXTRA TARGET_CXX_EXTRA
|
||||
#
|
||||
# provided:
|
||||
# - function: target_set_c_arch_flags(<target>) # uses options TARGET_C_ARCH and TARGET_C_EXTRA
|
||||
# - function: target_set_cxx_arch_flags(<target>) # uses options TARGET_CXX_ARCH and TARGET_CXX_EXTRA
|
||||
# - macro: target_set_cxx_arch_option(<target> <gcc/clang_march> <gcc/clang_extra> <msvc_arch>)
|
||||
#
|
||||
# see https://en.wikichip.org/wiki/x86/extensions
|
||||
# and https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html
|
||||
# for gcc specific architecture options
|
||||
# and https://docs.microsoft.com/en-us/cpp/build/reference/arch-x64
|
||||
# or https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86
|
||||
# for msvc specific architecture options
|
||||
|
||||
# https://en.wikichip.org/wiki/arm/versions
|
||||
# https://en.wikipedia.org/wiki/Raspberry_Pi
|
||||
# https://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html#ARM-Options
|
||||
# https://en.wikipedia.org/wiki/Comparison_of_ARMv7-A_cores
|
||||
# https://en.wikipedia.org/wiki/Comparison_of_ARMv8-A_cores
|
||||
|
||||
# arm32_rpi1 untested
|
||||
# -mcpu=arm1176jzf-s -mfloat-abi=hard -mfpu=vfp -mtune=arm1176jzf-s
|
||||
# arm32_rpi2 untested
|
||||
# "-march=armv7-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
|
||||
# "-march=armv8-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
|
||||
# arm32_rpi3 with "armv7-a" tested on Raspbian GNU/Linux 10 (buster), 32-bit => MIPP test reports: NEONv1, 128 bits
|
||||
# "-march=armv7-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
|
||||
# arm32_rpi3 with "armv8-a" tested on Raspbian GNU/Linux 10 (buster), 32-bit => MIPP test reports: NEONv1, 128 bits
|
||||
# "-march=armv8-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4"
|
||||
# arm32_rpi3 with "armv8-a" tested on Raspbian GNU/Linux 10 (buster), 32-bit => MIPP test reports: NEONv1, 128 bits
|
||||
# "-march=armv8-a" "-mfloat-abi=hard" "-mfpu=neon-vfpv4" "-mtune=cortex-a53"
|
||||
# arm32_rpi4 untested
|
||||
# RPi 4 Model B: Cortex-A72 => "-mtune=cortex-a72" ?
|
||||
# "-mcpu=cortex-a72 -mfloat-abi=hard -mfpu=neon-fp-armv8 -mneon-for-64bits -mtune=cortex-a72"
|
||||
|
||||
# Named sets of extra compiler options. A set is selected with TARGET_C_EXTRA /
# TARGET_CXX_EXTRA and looked up through the variable GCC_EXTRA_OPT_<name>.
set(MSVC_EXTRA_OPT_none "")
set(GCC_EXTRA_OPT_none "")
set(GCC_EXTRA_OPT_neon_vfpv4    "-mfloat-abi=hard" "-mfpu=neon-vfpv4")
set(GCC_EXTRA_OPT_neon_rpi3_a53 "-mfloat-abi=hard" "-mfpu=neon-vfpv4" "-mtune=cortex-a53")
set(GCC_EXTRA_OPT_neon_rpi4_a72 "-mfloat-abi=hard" "-mfpu=neon-fp-armv8" "-mtune=cortex-a72")

# Offered -march values and extra option sets depend on the target processor.
# "AMD64" is the name Windows reports for 64-bit x86, so it is treated like "x86_64".
if ( (CMAKE_SYSTEM_PROCESSOR STREQUAL "i686") OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64") )
  set(GCC_MARCH_DESC "native/SSE2:pentium4/SSE3:core2/SSE4:nehalem/AVX:sandybridge/AVX2:haswell")
  set(GCC_MARCH_VALUES "none;native;pentium4;core2;nehalem;sandybridge;haswell" CACHE INTERNAL "List of possible architectures")
  set(GCC_EXTRA_VALUES "" CACHE INTERNAL "List of possible EXTRA options")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
  set(GCC_MARCH_DESC "native/ARMwNEON:armv8-a")
  set(GCC_MARCH_VALUES "none;native;armv8-a" CACHE INTERNAL "List of possible architectures")
  set(GCC_EXTRA_VALUES "" CACHE INTERNAL "List of possible additional options")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "armv7l")
  set(GCC_MARCH_DESC "native/ARMwNEON:armv7-a")
  set(GCC_MARCH_VALUES "none;native;armv7-a" CACHE INTERNAL "List of possible architectures")
  set(GCC_EXTRA_VALUES "none;neon_vfpv4;neon_rpi3_a53;neon_rpi4_a72" CACHE INTERNAL "List of possible additional options")
else()
  message(WARNING "unsupported CMAKE_SYSTEM_PROCESSOR '${CMAKE_SYSTEM_PROCESSOR}'")
  # other PROCESSORs could be "ppc", "ppc64", "arm" - or something else?!
  set(GCC_MARCH_DESC "native")
  set(GCC_MARCH_VALUES "none;native" CACHE INTERNAL "List of possible architectures")
  set(GCC_EXTRA_VALUES "" CACHE INTERNAL "List of possible additional options")
endif()

# cmake options - depending on C/C++ compiler
# The C and the C++ compiler are handled separately, as they might come from
# different vendors.
if (CMAKE_C_COMPILER_ID STREQUAL "GNU")
  set(TARGET_C_ARCH "none" CACHE STRING "gcc target C architecture (-march): ${GCC_MARCH_DESC}")
  set_property(CACHE TARGET_C_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES})
  # the EXTRA option is only offered when option sets exist for this processor
  if ( NOT (GCC_EXTRA_VALUES STREQUAL "") )
    set(TARGET_C_EXTRA "none" CACHE STRING "gcc additional options for C")
    set_property(CACHE TARGET_C_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES})
  endif()
elseif (CMAKE_C_COMPILER_ID STREQUAL "Clang")
  set(TARGET_C_ARCH "none" CACHE STRING "clang target C architecture (-march): ${GCC_MARCH_DESC}")
  set_property(CACHE TARGET_C_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES})
  if ( NOT (GCC_EXTRA_VALUES STREQUAL "") )
    set(TARGET_C_EXTRA "none" CACHE STRING "clang additional options for C")
    set_property(CACHE TARGET_C_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES})
  endif()
elseif (CMAKE_C_COMPILER_ID MATCHES "MSVC")
  set(TARGET_C_ARCH "none" CACHE STRING "msvc target C architecture (/arch): SSE2/AVX/AVX2/AVX512")
  set(TARGET_C_EXTRA "none" CACHE STRING "msvc additional options")
else()
  message(WARNING "unsupported C compiler '${CMAKE_C_COMPILER_ID}', see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
endif()

if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
  set(TARGET_CXX_ARCH "none" CACHE STRING "gcc target C++ architecture (-march): ${GCC_MARCH_DESC}")
  set_property(CACHE TARGET_CXX_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES})
  if ( NOT (GCC_EXTRA_VALUES STREQUAL "") )
    set(TARGET_CXX_EXTRA "none" CACHE STRING "gcc additional options for C++")
    set_property(CACHE TARGET_CXX_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES})
  endif()
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
  set(TARGET_CXX_ARCH "none" CACHE STRING "clang target C++ architecture (-march): ${GCC_MARCH_DESC}")
  set_property(CACHE TARGET_CXX_ARCH PROPERTY STRINGS ${GCC_MARCH_VALUES})
  if ( NOT (GCC_EXTRA_VALUES STREQUAL "") )
    set(TARGET_CXX_EXTRA "none" CACHE STRING "clang additional options for C++")
    set_property(CACHE TARGET_CXX_EXTRA PROPERTY STRINGS ${GCC_EXTRA_VALUES})
  endif()
elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
  set(TARGET_CXX_ARCH "none" CACHE STRING "msvc target C++ architecture (/arch): SSE2/AVX/AVX2/AVX512")
  set(TARGET_CXX_EXTRA "none" CACHE STRING "msvc additional options")
else()
  message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}', see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
endif()
|
||||
|
||||
######################################################
|
||||
|
||||
# target_set_c_arch_flags(<target>)
#   Applies the cache options TARGET_C_ARCH (-march / /arch) and TARGET_C_EXTRA
#   (a named option set, see GCC_EXTRA_OPT_<name>) as PRIVATE compile options
#   of <target>. Additionally defines PFFFT_ENABLE_NEON=1 for NEON builds.
function(target_set_c_arch_flags target)
  # --- architecture option ---
  if ( ("${TARGET_C_ARCH}" STREQUAL "") OR ("${TARGET_C_ARCH}" STREQUAL "none") )
    message(STATUS "C ARCH for target ${target} is not set!")
  elseif ( (CMAKE_C_COMPILER_ID STREQUAL "GNU") OR (CMAKE_C_COMPILER_ID STREQUAL "Clang") )
    target_compile_options(${target} PRIVATE "-march=${TARGET_C_ARCH}")
    message(STATUS "C ARCH for target ${target} set: ${TARGET_C_ARCH}")
  elseif (CMAKE_C_COMPILER_ID MATCHES "MSVC")
    target_compile_options(${target} PRIVATE "/arch:${TARGET_C_ARCH}")
    message(STATUS "C ARCH for target ${target} set: ${TARGET_C_ARCH}")
  else()
    message(WARNING "unsupported C compiler '${CMAKE_C_COMPILER_ID}' for target_set_c_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
  endif()

  # --- additional option set ---
  if ( ("${TARGET_C_EXTRA}" STREQUAL "") OR ("${TARGET_C_EXTRA}" STREQUAL "none") )
    message(STATUS "C additional options for target ${target} is not set!")
  elseif ( (CMAKE_C_COMPILER_ID STREQUAL "GNU") OR (CMAKE_C_COMPILER_ID STREQUAL "Clang") )
    target_compile_options(${target} PRIVATE "${GCC_EXTRA_OPT_${TARGET_C_EXTRA}}")
    message(STATUS "C additional options for target ${target} set: ${GCC_EXTRA_OPT_${TARGET_C_EXTRA}}")
  elseif (CMAKE_C_COMPILER_ID MATCHES "MSVC")
    # target_compile_options(${target} PRIVATE "${MSVC_EXTRA_OPT_${TARGET_C_EXTRA}}")
    message(STATUS "C additional options for target ${target} not usable with MSVC")
  else()
    message(WARNING "unsupported C compiler '${CMAKE_C_COMPILER_ID}' for target_set_c_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
  endif()

  # --- NEON ---
  # Evaluated independently of the branch above: no extra option set is offered
  # for aarch64, so nesting this check there made the aarch64 case unreachable
  # in a regular configuration.
  if ( ("${TARGET_C_EXTRA}" MATCHES "^neon_.*") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") )
    message(STATUS "neon option set or aarch64 processor: setting PFFFT_ENABLE_NEON for C target ${target}")
    target_compile_definitions(${target} PRIVATE PFFFT_ENABLE_NEON=1)
  endif()
endfunction()
|
||||
|
||||
# target_set_cxx_arch_flags(<target>)
#   Applies the cache options TARGET_CXX_ARCH (-march / /arch) and TARGET_CXX_EXTRA
#   (a named option set, see GCC_EXTRA_OPT_<name>) as PRIVATE compile options
#   of <target>. Additionally defines PFFFT_ENABLE_NEON=1 for NEON builds.
function(target_set_cxx_arch_flags target)
  # --- architecture option ---
  if ( ("${TARGET_CXX_ARCH}" STREQUAL "") OR ("${TARGET_CXX_ARCH}" STREQUAL "none") )
    message(STATUS "C++ ARCH for target ${target} is not set!")
  elseif ( (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") )
    target_compile_options(${target} PRIVATE "-march=${TARGET_CXX_ARCH}")
    message(STATUS "C++ ARCH for target ${target} set: ${TARGET_CXX_ARCH}")
  elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
    target_compile_options(${target} PRIVATE "/arch:${TARGET_CXX_ARCH}")
    message(STATUS "C++ ARCH for target ${target} set: ${TARGET_CXX_ARCH}")
  else()
    message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}' for target_set_cxx_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
  endif()

  # --- additional option set ---
  # The decision is based on the C++ compiler; the C compiler may be a different one.
  if ( ("${TARGET_CXX_EXTRA}" STREQUAL "") OR ("${TARGET_CXX_EXTRA}" STREQUAL "none") )
    message(STATUS "C++ additional options for target ${target} is not set!")
  elseif ( (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") )
    target_compile_options(${target} PRIVATE "${GCC_EXTRA_OPT_${TARGET_CXX_EXTRA}}")
    message(STATUS "C++ additional options for target ${target} set: ${GCC_EXTRA_OPT_${TARGET_CXX_EXTRA}}")
  elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
    # target_compile_options(${target} PRIVATE "${MSVC_EXTRA_OPT_${TARGET_CXX_EXTRA}}")
    message(STATUS "C++ additional options for target ${target} not usable with MSVC")
  else()
    message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}' for target_set_cxx_arch_flags(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
  endif()

  # --- NEON ---
  # Evaluated independently of the branch above: no extra option set is offered
  # for aarch64, so nesting this check there made the aarch64 case unreachable
  # in a regular configuration.
  if ( ("${TARGET_CXX_EXTRA}" MATCHES "^neon_.*") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") )
    message(STATUS "neon option set or aarch64 processor: setting PFFFT_ENABLE_NEON for C++ target ${target}")
    target_compile_definitions(${target} PRIVATE PFFFT_ENABLE_NEON=1)
  endif()
endfunction()
|
||||
|
||||
|
||||
# target_set_cxx_arch_option(<target> <gcc/clang_march> <gcc/clang_extra> <msvc_arch>)
#   Sets per-target C++ architecture options from explicit arguments instead of
#   the TARGET_CXX_* cache options. An empty string or "none" skips an argument.
#   <gcc/clang_extra> is the name of an option set (GCC_EXTRA_OPT_<name>).
macro(target_set_cxx_arch_option target gcc_clang_arch gcc_clang_extra msvc_arch )
  if (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
    # MSVC only knows the /arch option; there are no extra option sets
    if ( (NOT ("${msvc_arch}" STREQUAL "")) AND (NOT ("${msvc_arch}" STREQUAL "none")) )
      target_compile_options(${target} PRIVATE "/arch:${msvc_arch}")
      message(STATUS "C++ ARCH for target ${target} set: ${msvc_arch}")
    endif()
  elseif ( (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") )
    if ( (NOT ("${gcc_clang_arch}" STREQUAL "")) AND (NOT ("${gcc_clang_arch}" STREQUAL "none")) )
      target_compile_options(${target} PRIVATE "-march=${gcc_clang_arch}")
      message(STATUS "C++ ARCH for target ${target}: ${gcc_clang_arch}")
    endif()
    if ( (NOT ("${gcc_clang_extra}" STREQUAL "")) AND (NOT ("${gcc_clang_extra}" STREQUAL "none")) )
      target_compile_options(${target} PRIVATE "${GCC_EXTRA_OPT_${gcc_clang_extra}}")
      message(STATUS "C++ additional options for target ${target}: ${GCC_EXTRA_OPT_${gcc_clang_extra}}")
    endif()
  else()
    message(WARNING "unsupported C++ compiler '${CMAKE_CXX_COMPILER_ID}' for target_set_cxx_arch_option(), see https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html")
  endif()
endmacro()
|
||||
|
||||
25
pffft/cross_build_mingw32.sh
Executable file
25
pffft/cross_build_mingw32.sh
Executable file
@@ -0,0 +1,25 @@
|
||||
#!/bin/bash

# Cross-builds pffft for 32-bit Windows with MinGW-w64.
# requires debian/ubuntu packages: zip gcc-mingw-w64
#
# usage: cross_build_mingw32.sh <zip-post> <any other cmake options>
#   <zip-post>  suffix used for the build and install directory names
#   remaining arguments are passed to the cmake configure step unchanged

if [ -z "$1" ]; then
  echo "usage: $0 <zip-post> <any other cmake options>"
  exit 1
fi

ZIP_POST="$1"
shift

# CROSS="i686-w64-mingw32"
WN="w32"
TOOLCHAIN="mingw-w32-i686.cmake"
BUILD_DIR="build_${WN}_${ZIP_POST}"

# All expansions are quoted: the suffix comes from the command line, and an
# unquoted 'rm -rf' would word-split and glob-expand it.
rm -rf "${BUILD_DIR}"
echo -e "\n\n********************************************************"
echo "start build of pffft_${WN}_${ZIP_POST}"
mkdir "${BUILD_DIR}" && \
cmake -S . -B "${BUILD_DIR}" \
  -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN}" \
  -DCMAKE_INSTALL_PREFIX="pffft_bin-${WN}_${ZIP_POST}" \
  "$@" && \
cmake --build "${BUILD_DIR}"
|
||||
25
pffft/cross_build_mingw64.sh
Executable file
25
pffft/cross_build_mingw64.sh
Executable file
@@ -0,0 +1,25 @@
|
||||
#!/bin/bash

# Cross-builds pffft for 64-bit Windows with MinGW-w64.
# requires debian/ubuntu packages: zip gcc-mingw-w64
#
# usage: cross_build_mingw64.sh <zip-post> <any other cmake options>
#   <zip-post>  suffix used for the build and install directory names
#   remaining arguments are passed to the cmake configure step unchanged

if [ -z "$1" ]; then
  echo "usage: $0 <zip-post> <any other cmake options>"
  exit 1
fi

ZIP_POST="$1"
shift

# CROSS="x86_64-w64-mingw32"
WN="w64"
TOOLCHAIN="mingw-w64-x64_64.cmake"
BUILD_DIR="build_${WN}_${ZIP_POST}"

# All expansions are quoted: the suffix comes from the command line, and an
# unquoted 'rm -rf' would word-split and glob-expand it.
rm -rf "${BUILD_DIR}"
echo -e "\n\n********************************************************"
echo "start build of pffft_${WN}_${ZIP_POST}"
mkdir "${BUILD_DIR}" && \
cmake -S . -B "${BUILD_DIR}" \
  -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN}" \
  -DCMAKE_INSTALL_PREFIX="pffft_bin-${WN}_${ZIP_POST}" \
  "$@" && \
cmake --build "${BUILD_DIR}"
|
||||
63
pffft/examples/CMakeLists.txt
Normal file
63
pffft/examples/CMakeLists.txt
Normal file
@@ -0,0 +1,63 @@
|
||||
cmake_minimum_required(VERSION 3.1)
project(examples)

# Builds the small usage examples. Each example links against the PFFFT target
# and is only built when the matching PFFFT_USE_TYPE_<DOUBLE|FLOAT> option is on.

if ( CMAKE_C_COMPILER_ID MATCHES "MSVC" )
  # using Visual Studio C++
  message(STATUS "INFO: detected MSVC: will not link math lib m")
  set(MATHLIB "")
  add_definitions("/D_CRT_SECURE_NO_WARNINGS")
  set(MSVC_DISABLED_WARNINGS_LIST "C4996")
elseif(PFFFT_DISABLE_LINK_WITH_M)
  # linking with the math library was disabled explicitly
  set(MATHLIB "")
else()
  message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m")
  set(MATHLIB "m")
endif()

# MinGW builds link the C++ runtime explicitly
set(STDCXXLIB "")
if (MINGW)
  set(STDCXXLIB "stdc++")
endif()


# use plain ISO C++ (-std=c++NN), not the GNU dialect
set(CMAKE_CXX_EXTENSIONS OFF)


if (PFFFT_USE_TYPE_DOUBLE)
  add_executable(example_cpp11_real_dbl_fwd example_cpp11_real_dbl_fwd.cpp)
  target_compile_definitions(example_cpp11_real_dbl_fwd PRIVATE PFFFT_ENABLE_DOUBLE)
  target_link_libraries(example_cpp11_real_dbl_fwd PFFFT ${STDCXXLIB} ${MATHLIB})
  set_property(TARGET example_cpp11_real_dbl_fwd PROPERTY CXX_STANDARD 11)
  set_property(TARGET example_cpp11_real_dbl_fwd PROPERTY CXX_STANDARD_REQUIRED ON)

  add_executable(example_cpp11_cplx_dbl_fwd example_cpp11_cplx_dbl_fwd.cpp)
  target_compile_definitions(example_cpp11_cplx_dbl_fwd PRIVATE PFFFT_ENABLE_DOUBLE)
  target_link_libraries(example_cpp11_cplx_dbl_fwd PFFFT ${STDCXXLIB} ${MATHLIB})
  set_property(TARGET example_cpp11_cplx_dbl_fwd PROPERTY CXX_STANDARD 11)
  set_property(TARGET example_cpp11_cplx_dbl_fwd PROPERTY CXX_STANDARD_REQUIRED ON)

  # double precision example: gets the DOUBLE define like its C++ siblings above
  add_executable(example_c_cplx_dbl_fwd example_c_cplx_dbl_fwd.c)
  target_compile_definitions(example_c_cplx_dbl_fwd PRIVATE PFFFT_ENABLE_DOUBLE)
  target_link_libraries(example_c_cplx_dbl_fwd PFFFT ${MATHLIB})
endif()


if (PFFFT_USE_TYPE_FLOAT)
  add_executable(example_cpp98_real_flt_fwd example_cpp98_real_flt_fwd.cpp)
  target_compile_definitions(example_cpp98_real_flt_fwd PRIVATE PFFFT_ENABLE_FLOAT)
  target_link_libraries(example_cpp98_real_flt_fwd PFFFT ${STDCXXLIB} ${MATHLIB})
  set_property(TARGET example_cpp98_real_flt_fwd PROPERTY CXX_STANDARD 98)
  set_property(TARGET example_cpp98_real_flt_fwd PROPERTY CXX_STANDARD_REQUIRED ON)

  add_executable(example_cpp98_cplx_flt_fwd example_cpp98_cplx_flt_fwd.cpp)
  target_compile_definitions(example_cpp98_cplx_flt_fwd PRIVATE PFFFT_ENABLE_FLOAT)
  target_link_libraries(example_cpp98_cplx_flt_fwd PFFFT ${STDCXXLIB} ${MATHLIB})
  set_property(TARGET example_cpp98_cplx_flt_fwd PROPERTY CXX_STANDARD 98)
  set_property(TARGET example_cpp98_cplx_flt_fwd PROPERTY CXX_STANDARD_REQUIRED ON)

  add_executable(example_c_real_flt_fwd example_c_real_flt_fwd.c)
  target_compile_definitions(example_c_real_flt_fwd PRIVATE PFFFT_ENABLE_FLOAT)
  target_link_libraries(example_c_real_flt_fwd PFFFT ${MATHLIB})
endif()
|
||||
|
||||
69
pffft/examples/example_c_cplx_dbl_fwd.c
Normal file
69
pffft/examples/example_c_cplx_dbl_fwd.c
Normal file
@@ -0,0 +1,69 @@
|
||||
|
||||
#include "pffft_double.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
|
||||
void c_forward_complex_double(const int transformLen)
|
||||
{
|
||||
printf("running %s()\n", __FUNCTION__);
|
||||
|
||||
/* first check - might be skipped */
|
||||
if (transformLen < pffftd_min_fft_size(PFFFT_COMPLEX))
|
||||
{
|
||||
fprintf(stderr, "Error: minimum FFT transformation length is %d\n", pffftd_min_fft_size(PFFFT_COMPLEX));
|
||||
return;
|
||||
}
|
||||
|
||||
/* instantiate FFT and prepare transformation for length N */
|
||||
PFFFTD_Setup *ffts = pffftd_new_setup(transformLen, PFFFT_COMPLEX);
|
||||
|
||||
/* one more check */
|
||||
if (!ffts)
|
||||
{
|
||||
fprintf(stderr,
|
||||
"Error: transformation length %d is not decomposable into small prime factors. "
|
||||
"Next valid transform size is: %d ; next power of 2 is: %d\n",
|
||||
transformLen,
|
||||
pffftd_nearest_transform_size(transformLen, PFFFT_COMPLEX, 1),
|
||||
pffftd_next_power_of_two(transformLen) );
|
||||
return;
|
||||
}
|
||||
|
||||
/* allocate aligned vectors for input X and output Y */
|
||||
double *X = (double*)pffftd_aligned_malloc(transformLen * 2 * sizeof(double)); /* complex: re/im interleaved */
|
||||
double *Y = (double*)pffftd_aligned_malloc(transformLen * 2 * sizeof(double)); /* complex: re/im interleaved */
|
||||
double *W = (double*)pffftd_aligned_malloc(transformLen * 2 * sizeof(double));
|
||||
|
||||
/* prepare some input data */
|
||||
for (int k = 0; k < 2 * transformLen; k += 4)
|
||||
{
|
||||
X[k] = k / 2; /* real */
|
||||
X[k+1] = (k / 2) & 1; /* imag */
|
||||
|
||||
X[k+2] = -1 - k / 2; /* real */
|
||||
X[k+3] = (k / 2) & 1; /* imag */
|
||||
}
|
||||
|
||||
/* do the forward transform; write complex spectrum result into Y */
|
||||
pffftd_transform_ordered(ffts, X, Y, W, PFFFT_FORWARD);
|
||||
|
||||
/* print spectral output */
|
||||
printf("output should be complex spectrum with %d complex bins\n", transformLen);
|
||||
for (int k = 0; k < 2 * transformLen; k += 2)
|
||||
printf("Y[%d] = %f + i * %f\n", k/2, Y[k], Y[k+1]);
|
||||
|
||||
pffftd_aligned_free(W);
|
||||
pffftd_aligned_free(Y);
|
||||
pffftd_aligned_free(X);
|
||||
pffftd_destroy_setup(ffts);
|
||||
}
|
||||
|
||||
|
||||
/* Entry point: the optional first argument is the transform length (default 16). */
int main(int argc, char *argv[])
{
  int N = 16;
  if (argc > 1)
    N = atoi(argv[1]);

  c_forward_complex_double(N);
  return 0;
}
|
||||
66
pffft/examples/example_c_real_flt_fwd.c
Normal file
66
pffft/examples/example_c_real_flt_fwd.c
Normal file
@@ -0,0 +1,66 @@
|
||||
|
||||
#include "pffft.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
|
||||
void c_forward_real_float(const int transformLen)
|
||||
{
|
||||
printf("running %s()\n", __FUNCTION__);
|
||||
|
||||
/* first check - might be skipped */
|
||||
if (transformLen < pffft_min_fft_size(PFFFT_REAL))
|
||||
{
|
||||
fprintf(stderr, "Error: minimum FFT transformation length is %d\n", pffft_min_fft_size(PFFFT_REAL));
|
||||
return;
|
||||
}
|
||||
|
||||
/* instantiate FFT and prepare transformation for length N */
|
||||
PFFFT_Setup *ffts = pffft_new_setup(transformLen, PFFFT_REAL);
|
||||
|
||||
/* one more check */
|
||||
if (!ffts)
|
||||
{
|
||||
fprintf(stderr,
|
||||
"Error: transformation length %d is not decomposable into small prime factors. "
|
||||
"Next valid transform size is: %d ; next power of 2 is: %d\n",
|
||||
transformLen,
|
||||
pffft_nearest_transform_size(transformLen, PFFFT_REAL, 1),
|
||||
pffft_next_power_of_two(transformLen) );
|
||||
return;
|
||||
}
|
||||
|
||||
/* allocate aligned vectors for input X and output Y */
|
||||
float *X = (float*)pffft_aligned_malloc(transformLen * sizeof(float));
|
||||
float *Y = (float*)pffft_aligned_malloc(transformLen * sizeof(float)); /* complex: re/im interleaved */
|
||||
float *W = (float*)pffft_aligned_malloc(transformLen * sizeof(float));
|
||||
|
||||
/* prepare some input data */
|
||||
for (int k = 0; k < transformLen; k += 2)
|
||||
{
|
||||
X[k] = k;
|
||||
X[k+1] = -1-k;
|
||||
}
|
||||
|
||||
/* do the forward transform; write complex spectrum result into Y */
|
||||
pffft_transform_ordered(ffts, X, Y, W, PFFFT_FORWARD);
|
||||
|
||||
/* print spectral output */
|
||||
printf("output should be complex spectrum with %d complex bins\n", transformLen /2);
|
||||
for (int k = 0; k < transformLen; k += 2)
|
||||
printf("Y[%d] = %f + i * %f\n", k/2, Y[k], Y[k+1]);
|
||||
|
||||
pffft_aligned_free(W);
|
||||
pffft_aligned_free(Y);
|
||||
pffft_aligned_free(X);
|
||||
pffft_destroy_setup(ffts);
|
||||
}
|
||||
|
||||
|
||||
/* Entry point: the optional first argument is the transform length (default 32). */
int main(int argc, char *argv[])
{
  int N = 32;
  if (argc > 1)
    N = atoi(argv[1]);

  c_forward_real_float(N);
  return 0;
}
|
||||
66
pffft/examples/example_cpp11_cplx_dbl_fwd.cpp
Normal file
66
pffft/examples/example_cpp11_cplx_dbl_fwd.cpp
Normal file
@@ -0,0 +1,66 @@
|
||||
|
||||
#include "pffft.hpp"
|
||||
|
||||
#include <complex>
|
||||
#include <iostream>
|
||||
|
||||
|
||||
void cxx11_forward_complex_double(const int transformLen)
|
||||
{
|
||||
std::cout << "running " << __FUNCTION__ << "()" << std::endl;
|
||||
|
||||
// first check - might be skipped
|
||||
using FFT_T = pffft::Fft< std::complex<double> >;
|
||||
if (transformLen < FFT_T::minFFtsize())
|
||||
{
|
||||
std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
// instantiate FFT and prepare transformation for length N
|
||||
pffft::Fft< std::complex<double> > fft(transformLen);
|
||||
|
||||
// one more check
|
||||
if (!fft.isValid())
|
||||
{
|
||||
std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
|
||||
<< "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
|
||||
<< "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
// allocate aligned vectors for input X and output Y
|
||||
auto X = fft.valueVector();
|
||||
auto Y = fft.spectrumVector();
|
||||
|
||||
// alternative access: get raw pointers to aligned vectors
|
||||
std::complex<double> *Xs = X.data();
|
||||
std::complex<double> *Ys = Y.data();
|
||||
|
||||
// prepare some input data
|
||||
for (int k = 0; k < transformLen; k += 2)
|
||||
{
|
||||
X[k] = std::complex<double>(k, k&1); // access through AlignedVector<double>
|
||||
Xs[k+1] = std::complex<double>(-1-k, k&1); // access through raw pointer
|
||||
}
|
||||
|
||||
// do the forward transform; write complex spectrum result into Y
|
||||
fft.forward(X, Y);
|
||||
|
||||
// print spectral output
|
||||
std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
|
||||
std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
|
||||
for (unsigned k = 0; k < Y.size(); k += 2)
|
||||
{
|
||||
std::cout << "Y[" << k << "] = " << Y[k] << std::endl;
|
||||
std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int N = (1 < argc) ? atoi(argv[1]) : 16;
|
||||
cxx11_forward_complex_double(N);
|
||||
return 0;
|
||||
}
|
||||
66
pffft/examples/example_cpp11_real_dbl_fwd.cpp
Normal file
66
pffft/examples/example_cpp11_real_dbl_fwd.cpp
Normal file
@@ -0,0 +1,66 @@
|
||||
|
||||
#include "pffft.hpp"
|
||||
|
||||
#include <complex>
|
||||
#include <iostream>
|
||||
|
||||
|
||||
void cxx11_forward_real_double(const int transformLen)
|
||||
{
|
||||
std::cout << "running " << __FUNCTION__ << "()" << std::endl;
|
||||
|
||||
// first check - might be skipped
|
||||
using FFT_T = pffft::Fft<double>;
|
||||
if (transformLen < FFT_T::minFFtsize())
|
||||
{
|
||||
std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
// instantiate FFT and prepare transformation for length N
|
||||
pffft::Fft<double> fft { transformLen };
|
||||
|
||||
// one more check
|
||||
if (!fft.isValid())
|
||||
{
|
||||
std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
|
||||
<< "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
|
||||
<< "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
// allocate aligned vectors for (real) input X and (complex) output Y
|
||||
auto X = fft.valueVector(); // input vector; type is AlignedVector<double>
|
||||
auto Y = fft.spectrumVector(); // output vector; type is AlignedVector< std::complex<double> >
|
||||
|
||||
// alternative access: get raw pointers to aligned vectors
|
||||
double *Xs = X.data();
|
||||
std::complex<double> *Ys = Y.data();
|
||||
|
||||
// prepare some input data
|
||||
for (int k = 0; k < transformLen; k += 2)
|
||||
{
|
||||
X[k] = k; // access through AlignedVector<double>
|
||||
Xs[k+1] = -1-k; // access through raw pointer
|
||||
}
|
||||
|
||||
// do the forward transform; write complex spectrum result into Y
|
||||
fft.forward(X, Y);
|
||||
|
||||
// print spectral output
|
||||
std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
|
||||
std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
|
||||
for (unsigned k = 0; k < Y.size(); k += 2)
|
||||
{
|
||||
std::cout << "Y[" << k << "] = " << Y[k] << std::endl;
|
||||
std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int N = (1 < argc) ? atoi(argv[1]) : 32;
|
||||
cxx11_forward_real_double(N);
|
||||
return 0;
|
||||
}
|
||||
66
pffft/examples/example_cpp98_cplx_flt_fwd.cpp
Normal file
66
pffft/examples/example_cpp98_cplx_flt_fwd.cpp
Normal file
@@ -0,0 +1,66 @@
|
||||
|
||||
#include "pffft.hpp"
|
||||
|
||||
#include <complex>
|
||||
#include <iostream>
|
||||
|
||||
|
||||
void cxx98_forward_complex_float(const int transformLen)
|
||||
{
|
||||
std::cout << "running " << __FUNCTION__ << "()" << std::endl;
|
||||
|
||||
// first check - might be skipped
|
||||
typedef pffft::Fft< std::complex<float> > FFT_T;
|
||||
if (transformLen < FFT_T::minFFtsize())
|
||||
{
|
||||
std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
// instantiate FFT and prepare transformation for length N
|
||||
pffft::Fft< std::complex<float> > fft(transformLen);
|
||||
|
||||
// one more check
|
||||
if (!fft.isValid())
|
||||
{
|
||||
std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
|
||||
<< "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
|
||||
<< "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
// allocate aligned vectors for input X and output Y
|
||||
pffft::AlignedVector< std::complex<float> > X = fft.valueVector();
|
||||
pffft::AlignedVector< std::complex<float> > Y = fft.spectrumVector();
|
||||
|
||||
// alternative access: get raw pointers to aligned vectors
|
||||
std::complex<float> *Xs = X.data();
|
||||
std::complex<float> *Ys = Y.data();
|
||||
|
||||
// prepare some input data
|
||||
for (int k = 0; k < transformLen; k += 2)
|
||||
{
|
||||
X[k] = std::complex<float>(k, k&1); // access through AlignedVector<float>
|
||||
Xs[k+1] = std::complex<float>(-1-k, k&1); // access through raw pointer
|
||||
}
|
||||
|
||||
// do the forward transform; write complex spectrum result into Y
|
||||
fft.forward(X, Y);
|
||||
|
||||
// print spectral output
|
||||
std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
|
||||
std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
|
||||
for (unsigned k = 0; k < Y.size(); k += 2)
|
||||
{
|
||||
std::cout << "Y[" << k << "] = " << Y[k] << std::endl;
|
||||
std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int N = (1 < argc) ? atoi(argv[1]) : 16;
|
||||
cxx98_forward_complex_float(N);
|
||||
return 0;
|
||||
}
|
||||
66
pffft/examples/example_cpp98_real_flt_fwd.cpp
Normal file
66
pffft/examples/example_cpp98_real_flt_fwd.cpp
Normal file
@@ -0,0 +1,66 @@
|
||||
|
||||
#include "pffft.hpp"
|
||||
|
||||
#include <complex>
|
||||
#include <iostream>
|
||||
|
||||
|
||||
void cxx98_forward_real_float(const int transformLen)
|
||||
{
|
||||
std::cout << "running " << __FUNCTION__ << "()" << std::endl;
|
||||
|
||||
// first check - might be skipped
|
||||
typedef pffft::Fft<float> FFT_T;
|
||||
if (transformLen < FFT_T::minFFtsize())
|
||||
{
|
||||
std::cerr << "Error: minimum FFT transformation length is " << FFT_T::minFFtsize() << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
// instantiate FFT and prepare transformation for length N
|
||||
pffft::Fft<float> fft(transformLen);
|
||||
|
||||
// one more check
|
||||
if (!fft.isValid())
|
||||
{
|
||||
std::cerr << "Error: transformation length " << transformLen << " is not decomposable into small prime factors. "
|
||||
<< "Next valid transform size is: " << FFT_T::nearestTransformSize(transformLen)
|
||||
<< "; next power of 2 is: " << FFT_T::nextPowerOfTwo(transformLen) << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
// allocate aligned vectors for input X and output Y
|
||||
pffft::AlignedVector<float> X = fft.valueVector();
|
||||
pffft::AlignedVector< std::complex<float> > Y = fft.spectrumVector();
|
||||
|
||||
// alternative access: get raw pointers to aligned vectors
|
||||
float *Xs = X.data();
|
||||
std::complex<float> *Ys = Y.data();
|
||||
|
||||
// prepare some input data
|
||||
for (int k = 0; k < transformLen; k += 2)
|
||||
{
|
||||
X[k] = k; // access through AlignedVector<float>
|
||||
Xs[k+1] = -1-k; // access through raw pointer
|
||||
}
|
||||
|
||||
// do the forward transform; write complex spectrum result into Y
|
||||
fft.forward(X, Y);
|
||||
|
||||
// print spectral output
|
||||
std::cout << "output should be complex spectrum with " << fft.getSpectrumSize() << " bins" << std::endl;
|
||||
std::cout << "output vector has size " << Y.size() << " (complex bins):" << std::endl;
|
||||
for (unsigned k = 0; k < Y.size(); k += 2)
|
||||
{
|
||||
std::cout << "Y[" << k << "] = " << Y[k] << std::endl;
|
||||
std::cout << "Y[" << k+1 << "] = " << Ys[k+1] << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int N = (1 < argc) ? atoi(argv[1]) : 32;
|
||||
cxx98_forward_real_float(N);
|
||||
return 0;
|
||||
}
|
||||
3130
pffft/fftpack.c
Normal file
3130
pffft/fftpack.c
Normal file
File diff suppressed because it is too large
Load Diff
799
pffft/fftpack.h
Normal file
799
pffft/fftpack.h
Normal file
@@ -0,0 +1,799 @@
|
||||
/*
|
||||
Interface for the f2c translation of fftpack as found on http://www.netlib.org/fftpack/
|
||||
|
||||
FFTPACK license:
|
||||
|
||||
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
|
||||
|
||||
Copyright (c) 2004 the University Corporation for Atmospheric
|
||||
Research ("UCAR"). All rights reserved. Developed by NCAR's
|
||||
Computational and Information Systems Laboratory, UCAR,
|
||||
www.cisl.ucar.edu.
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
|
||||
ChangeLog:
|
||||
2011/10/02: this is my first release of this file.
|
||||
*/
|
||||
|
||||
#ifndef FFTPACK_H
#define FFTPACK_H

#ifdef __cplusplus
extern "C" {
#endif

/* Precision selection: define FFTPACK_DOUBLE_PRECISION before including this
   header to build/use the double precision fft; the default is float. */
#ifdef FFTPACK_DOUBLE_PRECISION
typedef double fftpack_real;
#else
typedef float fftpack_real;
#endif

/* integer type used for the transform length; int for both precisions */
typedef int fftpack_int;

/* complex periodic sequences: cffti initializes wsave for cfftf (forward)
   and cfftb (backward); see the documentation at the end of this file */
void cffti(fftpack_int n, fftpack_real *wsave);
void cfftf(fftpack_int n, fftpack_real *c, fftpack_real *wsave);
void cfftb(fftpack_int n, fftpack_real *c, fftpack_real *wsave);

/* real periodic sequences: rffti initializes wsave for rfftf (forward)
   and rfftb (backward) */
void rffti(fftpack_int n, fftpack_real *wsave);
void rfftf(fftpack_int n, fftpack_real *r, fftpack_real *wsave);
void rfftb(fftpack_int n, fftpack_real *r, fftpack_real *wsave);

/* cosine transform with odd wave numbers: cosqi initializes wsave for
   cosqf (forward) and cosqb (its unnormalized inverse) */
void cosqi(fftpack_int n, fftpack_real *wsave);
void cosqf(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
void cosqb(fftpack_int n, fftpack_real *x, fftpack_real *wsave);

/* cosine transform of a real even sequence: costi initializes wsave for cost */
void costi(fftpack_int n, fftpack_real *wsave);
void cost(fftpack_int n, fftpack_real *x, fftpack_real *wsave);

/* sine transform with odd wave numbers: sinqi initializes wsave for
   sinqf (forward) and sinqb (its unnormalized inverse) */
void sinqi(fftpack_int n, fftpack_real *wsave);
void sinqf(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
void sinqb(fftpack_int n, fftpack_real *x, fftpack_real *wsave);

/* sine transform of a real odd sequence: sinti initializes wsave for sint */
void sinti(fftpack_int n, fftpack_real *wsave);
void sint(fftpack_int n, fftpack_real *x, fftpack_real *wsave);

#ifdef __cplusplus
}
#endif

#endif /* FFTPACK_H */
|
||||
|
||||
/*
|
||||
|
||||
FFTPACK
|
||||
|
||||
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
|
||||
|
||||
version 4 april 1985
|
||||
|
||||
a package of fortran subprograms for the fast fourier
|
||||
transform of periodic and other symmetric sequences
|
||||
|
||||
by
|
||||
|
||||
paul n swarztrauber
|
||||
|
||||
national center for atmospheric research boulder,colorado 80307
|
||||
|
||||
which is sponsored by the national science foundation
|
||||
|
||||
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
|
||||
|
||||
|
||||
this package consists of programs which perform fast fourier
|
||||
transforms for both complex and real periodic sequences and
|
||||
certain other symmetric sequences that are listed below.
|
||||
|
||||
1. rffti initialize rfftf and rfftb
|
||||
2. rfftf forward transform of a real periodic sequence
|
||||
3. rfftb backward transform of a real coefficient array
|
||||
|
||||
4. ezffti initialize ezfftf and ezfftb
|
||||
5. ezfftf a simplified real periodic forward transform
|
||||
6. ezfftb a simplified real periodic backward transform
|
||||
|
||||
7. sinti initialize sint
|
||||
8. sint sine transform of a real odd sequence
|
||||
|
||||
9. costi initialize cost
|
||||
10. cost cosine transform of a real even sequence
|
||||
|
||||
11. sinqi initialize sinqf and sinqb
|
||||
12. sinqf forward sine transform with odd wave numbers
|
||||
13. sinqb unnormalized inverse of sinqf
|
||||
|
||||
14. cosqi initialize cosqf and cosqb
|
||||
15. cosqf forward cosine transform with odd wave numbers
|
||||
16. cosqb unnormalized inverse of cosqf
|
||||
|
||||
17. cffti initialize cfftf and cfftb
|
||||
18. cfftf forward transform of a complex periodic sequence
|
||||
19. cfftb unnormalized inverse of cfftf
|
||||
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine rffti(n,wsave)
|
||||
|
||||
****************************************************************
|
||||
|
||||
subroutine rffti initializes the array wsave which is used in
|
||||
both rfftf and rfftb. the prime factorization of n together with
|
||||
a tabulation of the trigonometric functions are computed and
|
||||
stored in wsave.
|
||||
|
||||
input parameter
|
||||
|
||||
n the length of the sequence to be transformed.
|
||||
|
||||
output parameter
|
||||
|
||||
wsave a work array which must be dimensioned at least 2*n+15.
|
||||
the same work array can be used for both rfftf and rfftb
|
||||
as long as n remains unchanged. different wsave arrays
|
||||
are required for different values of n. the contents of
|
||||
wsave must not be changed between calls of rfftf or rfftb.
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine rfftf(n,r,wsave)
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine rfftf computes the fourier coefficients of a real
|
||||
periodic sequence (fourier analysis). the transform is defined
|
||||
below at output parameter r.
|
||||
|
||||
input parameters
|
||||
|
||||
n the length of the array r to be transformed. the method
|
||||
is most efficient when n is a product of small primes.
|
||||
n may change so long as different work arrays are provided
|
||||
|
||||
r a real array of length n which contains the sequence
|
||||
to be transformed
|
||||
|
||||
wsave a work array which must be dimensioned at least 2*n+15.
|
||||
in the program that calls rfftf. the wsave array must be
|
||||
initialized by calling subroutine rffti(n,wsave) and a
|
||||
different wsave array must be used for each different
|
||||
value of n. this initialization does not have to be
|
||||
repeated so long as n remains unchanged thus subsequent
|
||||
transforms can be obtained faster than the first.
|
||||
the same wsave array can be used by rfftf and rfftb.
|
||||
|
||||
|
||||
output parameters
|
||||
|
||||
r r(1) = the sum from i=1 to i=n of r(i)
|
||||
|
||||
if n is even set l =n/2 , if n is odd set l = (n+1)/2
|
||||
|
||||
then for k = 2,...,l
|
||||
|
||||
r(2*k-2) = the sum from i = 1 to i = n of
|
||||
|
||||
r(i)*cos((k-1)*(i-1)*2*pi/n)
|
||||
|
||||
r(2*k-1) = the sum from i = 1 to i = n of
|
||||
|
||||
-r(i)*sin((k-1)*(i-1)*2*pi/n)
|
||||
|
||||
if n is even
|
||||
|
||||
r(n) = the sum from i = 1 to i = n of
|
||||
|
||||
(-1)**(i-1)*r(i)
|
||||
|
||||
***** note
|
||||
this transform is unnormalized since a call of rfftf
|
||||
followed by a call of rfftb will multiply the input
|
||||
sequence by n.
|
||||
|
||||
wsave contains results which must not be destroyed between
|
||||
calls of rfftf or rfftb.
|
||||
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine rfftb(n,r,wsave)
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine rfftb computes the real periodic sequence from its
|
||||
fourier coefficients (fourier synthesis). the transform is defined
|
||||
below at output parameter r.
|
||||
|
||||
input parameters
|
||||
|
||||
n the length of the array r to be transformed. the method
|
||||
is most efficient when n is a product of small primes.
|
||||
n may change so long as different work arrays are provided
|
||||
|
||||
r a real array of length n which contains the sequence
|
||||
to be transformed
|
||||
|
||||
wsave a work array which must be dimensioned at least 2*n+15.
|
||||
in the program that calls rfftb. the wsave array must be
|
||||
initialized by calling subroutine rffti(n,wsave) and a
|
||||
different wsave array must be used for each different
|
||||
value of n. this initialization does not have to be
|
||||
repeated so long as n remains unchanged thus subsequent
|
||||
transforms can be obtained faster than the first.
|
||||
the same wsave array can be used by rfftf and rfftb.
|
||||
|
||||
|
||||
output parameters
|
||||
|
||||
r for n even and for i = 1,...,n
|
||||
|
||||
r(i) = r(1)+(-1)**(i-1)*r(n)
|
||||
|
||||
plus the sum from k=2 to k=n/2 of
|
||||
|
||||
2.*r(2*k-2)*cos((k-1)*(i-1)*2*pi/n)
|
||||
|
||||
-2.*r(2*k-1)*sin((k-1)*(i-1)*2*pi/n)
|
||||
|
||||
for n odd and for i = 1,...,n
|
||||
|
||||
r(i) = r(1) plus the sum from k=2 to k=(n+1)/2 of
|
||||
|
||||
2.*r(2*k-2)*cos((k-1)*(i-1)*2*pi/n)
|
||||
|
||||
-2.*r(2*k-1)*sin((k-1)*(i-1)*2*pi/n)
|
||||
|
||||
***** note
|
||||
this transform is unnormalized since a call of rfftf
|
||||
followed by a call of rfftb will multiply the input
|
||||
sequence by n.
|
||||
|
||||
wsave contains results which must not be destroyed between
|
||||
calls of rfftb or rfftf.
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine sinti(n,wsave)
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine sinti initializes the array wsave which is used in
|
||||
subroutine sint. the prime factorization of n together with
|
||||
a tabulation of the trigonometric functions are computed and
|
||||
stored in wsave.
|
||||
|
||||
input parameter
|
||||
|
||||
n the length of the sequence to be transformed. the method
|
||||
is most efficient when n+1 is a product of small primes.
|
||||
|
||||
output parameter
|
||||
|
||||
wsave a work array with at least int(2.5*n+15) locations.
|
||||
different wsave arrays are required for different values
|
||||
of n. the contents of wsave must not be changed between
|
||||
calls of sint.
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine sint(n,x,wsave)
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine sint computes the discrete fourier sine transform
|
||||
of an odd sequence x(i). the transform is defined below at
|
||||
output parameter x.
|
||||
|
||||
sint is the unnormalized inverse of itself since a call of sint
|
||||
followed by another call of sint will multiply the input sequence
|
||||
x by 2*(n+1).
|
||||
|
||||
the array wsave which is used by subroutine sint must be
|
||||
initialized by calling subroutine sinti(n,wsave).
|
||||
|
||||
input parameters
|
||||
|
||||
n the length of the sequence to be transformed. the method
|
||||
is most efficient when n+1 is the product of small primes.
|
||||
|
||||
x an array which contains the sequence to be transformed
|
||||
|
||||
|
||||
wsave a work array with dimension at least int(2.5*n+15)
|
||||
in the program that calls sint. the wsave array must be
|
||||
initialized by calling subroutine sinti(n,wsave) and a
|
||||
different wsave array must be used for each different
|
||||
value of n. this initialization does not have to be
|
||||
repeated so long as n remains unchanged thus subsequent
|
||||
transforms can be obtained faster than the first.
|
||||
|
||||
output parameters
|
||||
|
||||
x for i=1,...,n
|
||||
|
||||
x(i)= the sum from k=1 to k=n
|
||||
|
||||
2*x(k)*sin(k*i*pi/(n+1))
|
||||
|
||||
a call of sint followed by another call of
|
||||
sint will multiply the sequence x by 2*(n+1).
|
||||
hence sint is the unnormalized inverse
|
||||
of itself.
|
||||
|
||||
wsave contains initialization calculations which must not be
|
||||
destroyed between calls of sint.
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine costi(n,wsave)
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine costi initializes the array wsave which is used in
|
||||
subroutine cost. the prime factorization of n together with
|
||||
a tabulation of the trigonometric functions are computed and
|
||||
stored in wsave.
|
||||
|
||||
input parameter
|
||||
|
||||
n the length of the sequence to be transformed. the method
|
||||
is most efficient when n-1 is a product of small primes.
|
||||
|
||||
output parameter
|
||||
|
||||
wsave a work array which must be dimensioned at least 3*n+15.
|
||||
different wsave arrays are required for different values
|
||||
of n. the contents of wsave must not be changed between
|
||||
calls of cost.
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine cost(n,x,wsave)
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine cost computes the discrete fourier cosine transform
|
||||
of an even sequence x(i). the transform is defined below at output
|
||||
parameter x.
|
||||
|
||||
cost is the unnormalized inverse of itself since a call of cost
|
||||
followed by another call of cost will multiply the input sequence
|
||||
x by 2*(n-1). the transform is defined below at output parameter x
|
||||
|
||||
the array wsave which is used by subroutine cost must be
|
||||
initialized by calling subroutine costi(n,wsave).
|
||||
|
||||
input parameters
|
||||
|
||||
n the length of the sequence x. n must be greater than 1.
|
||||
the method is most efficient when n-1 is a product of
|
||||
small primes.
|
||||
|
||||
x an array which contains the sequence to be transformed
|
||||
|
||||
wsave a work array which must be dimensioned at least 3*n+15
|
||||
in the program that calls cost. the wsave array must be
|
||||
initialized by calling subroutine costi(n,wsave) and a
|
||||
different wsave array must be used for each different
|
||||
value of n. this initialization does not have to be
|
||||
repeated so long as n remains unchanged thus subsequent
|
||||
transforms can be obtained faster than the first.
|
||||
|
||||
output parameters
|
||||
|
||||
x for i=1,...,n
|
||||
|
||||
x(i) = x(1)+(-1)**(i-1)*x(n)
|
||||
|
||||
+ the sum from k=2 to k=n-1
|
||||
|
||||
2*x(k)*cos((k-1)*(i-1)*pi/(n-1))
|
||||
|
||||
a call of cost followed by another call of
|
||||
cost will multiply the sequence x by 2*(n-1)
|
||||
hence cost is the unnormalized inverse
|
||||
of itself.
|
||||
|
||||
wsave contains initialization calculations which must not be
|
||||
destroyed between calls of cost.
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine sinqi(n,wsave)
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine sinqi initializes the array wsave which is used in
|
||||
both sinqf and sinqb. the prime factorization of n together with
|
||||
a tabulation of the trigonometric functions are computed and
|
||||
stored in wsave.
|
||||
|
||||
input parameter
|
||||
|
||||
n the length of the sequence to be transformed. the method
|
||||
is most efficient when n is a product of small primes.
|
||||
|
||||
output parameter
|
||||
|
||||
wsave a work array which must be dimensioned at least 3*n+15.
|
||||
the same work array can be used for both sinqf and sinqb
|
||||
as long as n remains unchanged. different wsave arrays
|
||||
are required for different values of n. the contents of
|
||||
wsave must not be changed between calls of sinqf or sinqb.
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine sinqf(n,x,wsave)
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine sinqf computes the fast fourier transform of quarter
|
||||
wave data. that is , sinqf computes the coefficients in a sine
|
||||
series representation with only odd wave numbers. the transform
|
||||
is defined below at output parameter x.
|
||||
|
||||
sinqb is the unnormalized inverse of sinqf since a call of sinqf
|
||||
followed by a call of sinqb will multiply the input sequence x
|
||||
by 4*n.
|
||||
|
||||
the array wsave which is used by subroutine sinqf must be
|
||||
initialized by calling subroutine sinqi(n,wsave).
|
||||
|
||||
|
||||
input parameters
|
||||
|
||||
n the length of the array x to be transformed. the method
|
||||
is most efficient when n is a product of small primes.
|
||||
|
||||
x an array which contains the sequence to be transformed
|
||||
|
||||
wsave a work array which must be dimensioned at least 3*n+15.
|
||||
in the program that calls sinqf. the wsave array must be
|
||||
initialized by calling subroutine sinqi(n,wsave) and a
|
||||
different wsave array must be used for each different
|
||||
value of n. this initialization does not have to be
|
||||
repeated so long as n remains unchanged thus subsequent
|
||||
transforms can be obtained faster than the first.
|
||||
|
||||
output parameters
|
||||
|
||||
x for i=1,...,n
|
||||
|
||||
x(i) = (-1)**(i-1)*x(n)
|
||||
|
||||
+ the sum from k=1 to k=n-1 of
|
||||
|
||||
2*x(k)*sin((2*i-1)*k*pi/(2*n))
|
||||
|
||||
a call of sinqf followed by a call of
|
||||
sinqb will multiply the sequence x by 4*n.
|
||||
therefore sinqb is the unnormalized inverse
|
||||
of sinqf.
|
||||
|
||||
wsave contains initialization calculations which must not
|
||||
be destroyed between calls of sinqf or sinqb.
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine sinqb(n,x,wsave)
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine sinqb computes the fast fourier transform of quarter
|
||||
wave data. that is , sinqb computes a sequence from its
|
||||
representation in terms of a sine series with odd wave numbers.
|
||||
the transform is defined below at output parameter x.
|
||||
|
||||
sinqf is the unnormalized inverse of sinqb since a call of sinqb
|
||||
followed by a call of sinqf will multiply the input sequence x
|
||||
by 4*n.
|
||||
|
||||
the array wsave which is used by subroutine sinqb must be
|
||||
initialized by calling subroutine sinqi(n,wsave).
|
||||
|
||||
|
||||
input parameters
|
||||
|
||||
n the length of the array x to be transformed. the method
|
||||
is most efficient when n is a product of small primes.
|
||||
|
||||
x an array which contains the sequence to be transformed
|
||||
|
||||
wsave a work array which must be dimensioned at least 3*n+15.
|
||||
in the program that calls sinqb. the wsave array must be
|
||||
initialized by calling subroutine sinqi(n,wsave) and a
|
||||
different wsave array must be used for each different
|
||||
value of n. this initialization does not have to be
|
||||
repeated so long as n remains unchanged thus subsequent
|
||||
transforms can be obtained faster than the first.
|
||||
|
||||
output parameters
|
||||
|
||||
x for i=1,...,n
|
||||
|
||||
x(i)= the sum from k=1 to k=n of
|
||||
|
||||
4*x(k)*sin((2*k-1)*i*pi/(2*n))
|
||||
|
||||
a call of sinqb followed by a call of
|
||||
sinqf will multiply the sequence x by 4*n.
|
||||
therefore sinqf is the unnormalized inverse
|
||||
of sinqb.
|
||||
|
||||
wsave contains initialization calculations which must not
|
||||
be destroyed between calls of sinqb or sinqf.
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine cosqi(n,wsave)
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine cosqi initializes the array wsave which is used in
|
||||
both cosqf and cosqb. the prime factorization of n together with
|
||||
a tabulation of the trigonometric functions are computed and
|
||||
stored in wsave.
|
||||
|
||||
input parameter
|
||||
|
||||
n the length of the array to be transformed. the method
|
||||
is most efficient when n is a product of small primes.
|
||||
|
||||
output parameter
|
||||
|
||||
wsave a work array which must be dimensioned at least 3*n+15.
|
||||
the same work array can be used for both cosqf and cosqb
|
||||
as long as n remains unchanged. different wsave arrays
|
||||
are required for different values of n. the contents of
|
||||
wsave must not be changed between calls of cosqf or cosqb.
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine cosqf(n,x,wsave)
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine cosqf computes the fast fourier transform of quarter
|
||||
wave data. that is , cosqf computes the coefficients in a cosine
|
||||
series representation with only odd wave numbers. the transform
|
||||
is defined below at output parameter x
|
||||
|
||||
cosqf is the unnormalized inverse of cosqb since a call of cosqf
|
||||
followed by a call of cosqb will multiply the input sequence x
|
||||
by 4*n.
|
||||
|
||||
the array wsave which is used by subroutine cosqf must be
|
||||
initialized by calling subroutine cosqi(n,wsave).
|
||||
|
||||
|
||||
input parameters
|
||||
|
||||
n the length of the array x to be transformed. the method
|
||||
is most efficient when n is a product of small primes.
|
||||
|
||||
x an array which contains the sequence to be transformed
|
||||
|
||||
wsave a work array which must be dimensioned at least 3*n+15
|
||||
in the program that calls cosqf. the wsave array must be
|
||||
initialized by calling subroutine cosqi(n,wsave) and a
|
||||
different wsave array must be used for each different
|
||||
value of n. this initialization does not have to be
|
||||
repeated so long as n remains unchanged thus subsequent
|
||||
transforms can be obtained faster than the first.
|
||||
|
||||
output parameters
|
||||
|
||||
x for i=1,...,n
|
||||
|
||||
x(i) = x(1) plus the sum from k=2 to k=n of
|
||||
|
||||
2*x(k)*cos((2*i-1)*(k-1)*pi/(2*n))
|
||||
|
||||
a call of cosqf followed by a call of
|
||||
cosqb will multiply the sequence x by 4*n.
|
||||
therefore cosqb is the unnormalized inverse
|
||||
of cosqf.
|
||||
|
||||
wsave contains initialization calculations which must not
|
||||
be destroyed between calls of cosqf or cosqb.
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine cosqb(n,x,wsave)
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine cosqb computes the fast fourier transform of quarter
|
||||
wave data. that is , cosqb computes a sequence from its
|
||||
representation in terms of a cosine series with odd wave numbers.
|
||||
the transform is defined below at output parameter x.
|
||||
|
||||
cosqb is the unnormalized inverse of cosqf since a call of cosqb
|
||||
followed by a call of cosqf will multiply the input sequence x
|
||||
by 4*n.
|
||||
|
||||
the array wsave which is used by subroutine cosqb must be
|
||||
initialized by calling subroutine cosqi(n,wsave).
|
||||
|
||||
|
||||
input parameters
|
||||
|
||||
n the length of the array x to be transformed. the method
|
||||
is most efficient when n is a product of small primes.
|
||||
|
||||
x an array which contains the sequence to be transformed
|
||||
|
||||
wsave a work array that must be dimensioned at least 3*n+15
|
||||
in the program that calls cosqb. the wsave array must be
|
||||
initialized by calling subroutine cosqi(n,wsave) and a
|
||||
different wsave array must be used for each different
|
||||
value of n. this initialization does not have to be
|
||||
repeated so long as n remains unchanged thus subsequent
|
||||
transforms can be obtained faster than the first.
|
||||
|
||||
output parameters
|
||||
|
||||
x for i=1,...,n
|
||||
|
||||
x(i)= the sum from k=1 to k=n of
|
||||
|
||||
4*x(k)*cos((2*k-1)*(i-1)*pi/(2*n))
|
||||
|
||||
a call of cosqb followed by a call of
|
||||
cosqf will multiply the sequence x by 4*n.
|
||||
therefore cosqf is the unnormalized inverse
|
||||
of cosqb.
|
||||
|
||||
wsave contains initialization calculations which must not
|
||||
be destroyed between calls of cosqb or cosqf.
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine cffti(n,wsave)
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine cffti initializes the array wsave which is used in
|
||||
both cfftf and cfftb. the prime factorization of n together with
|
||||
a tabulation of the trigonometric functions are computed and
|
||||
stored in wsave.
|
||||
|
||||
input parameter
|
||||
|
||||
n the length of the sequence to be transformed
|
||||
|
||||
output parameter
|
||||
|
||||
wsave a work array which must be dimensioned at least 4*n+15
|
||||
the same work array can be used for both cfftf and cfftb
|
||||
as long as n remains unchanged. different wsave arrays
|
||||
are required for different values of n. the contents of
|
||||
wsave must not be changed between calls of cfftf or cfftb.
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine cfftf(n,c,wsave)
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine cfftf computes the forward complex discrete fourier
|
||||
transform (the fourier analysis). equivalently , cfftf computes
|
||||
the fourier coefficients of a complex periodic sequence.
|
||||
the transform is defined below at output parameter c.
|
||||
|
||||
the transform is not normalized. to obtain a normalized transform
|
||||
the output must be divided by n. otherwise a call of cfftf
|
||||
followed by a call of cfftb will multiply the sequence by n.
|
||||
|
||||
the array wsave which is used by subroutine cfftf must be
|
||||
initialized by calling subroutine cffti(n,wsave).
|
||||
|
||||
input parameters
|
||||
|
||||
|
||||
n the length of the complex sequence c. the method is
|
||||
more efficient when n is the product of small primes.
|
||||
|
||||
c a complex array of length n which contains the sequence
|
||||
|
||||
wsave a real work array which must be dimensioned at least 4*n+15
|
||||
in the program that calls cfftf. the wsave array must be
|
||||
initialized by calling subroutine cffti(n,wsave) and a
|
||||
different wsave array must be used for each different
|
||||
value of n. this initialization does not have to be
|
||||
repeated so long as n remains unchanged thus subsequent
|
||||
transforms can be obtained faster than the first.
|
||||
the same wsave array can be used by cfftf and cfftb.
|
||||
|
||||
output parameters
|
||||
|
||||
c for j=1,...,n
|
||||
|
||||
c(j)=the sum from k=1,...,n of
|
||||
|
||||
c(k)*exp(-i*(j-1)*(k-1)*2*pi/n)
|
||||
|
||||
where i=sqrt(-1)
|
||||
|
||||
wsave contains initialization calculations which must not be
|
||||
destroyed between calls of subroutine cfftf or cfftb
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine cfftb(n,c,wsave)
|
||||
|
||||
******************************************************************
|
||||
|
||||
subroutine cfftb computes the backward complex discrete fourier
|
||||
transform (the fourier synthesis). equivalently , cfftb computes
|
||||
a complex periodic sequence from its fourier coefficients.
|
||||
the transform is defined below at output parameter c.
|
||||
|
||||
a call of cfftf followed by a call of cfftb will multiply the
|
||||
sequence by n.
|
||||
|
||||
the array wsave which is used by subroutine cfftb must be
|
||||
initialized by calling subroutine cffti(n,wsave).
|
||||
|
||||
input parameters
|
||||
|
||||
|
||||
n the length of the complex sequence c. the method is
|
||||
more efficient when n is the product of small primes.
|
||||
|
||||
c a complex array of length n which contains the sequence
|
||||
|
||||
wsave a real work array which must be dimensioned at least 4*n+15
|
||||
in the program that calls cfftb. the wsave array must be
|
||||
initialized by calling subroutine cffti(n,wsave) and a
|
||||
different wsave array must be used for each different
|
||||
value of n. this initialization does not have to be
|
||||
repeated so long as n remains unchanged thus subsequent
|
||||
transforms can be obtained faster than the first.
|
||||
the same wsave array can be used by cfftf and cfftb.
|
||||
|
||||
output parameters
|
||||
|
||||
c for j=1,...,n
|
||||
|
||||
c(j)=the sum from k=1,...,n of
|
||||
|
||||
c(k)*exp(i*(j-1)*(k-1)*2*pi/n)
|
||||
|
||||
where i=sqrt(-1)
|
||||
|
||||
wsave contains initialization calculations which must not be
|
||||
destroyed between calls of subroutine cfftf or cfftb
|
||||
|
||||
*/
|
||||
20
pffft/fmv.h
Normal file
20
pffft/fmv.h
Normal file
@@ -0,0 +1,20 @@
|
||||
// Function multi-versioning (FMV) helper.
//
// Provides the macro PF_TARGET_CLONES, to be placed in front of a function
// definition. When HAVE_FUNC_ATTRIBUTE_IFUNC is set, the compiler offers
// __has_attribute(target_clones) and the target is x86_64, the macro expands
// to a target_clones attribute and HAVE_PF_TARGET_CLONES is defined as 1.
// In every other case PF_TARGET_CLONES expands to nothing.
#ifndef FMV_H
#define FMV_H  // was missing: without it the guard above never took effect

// defined() test first, so an unset macro does not trigger -Wundef
#if defined(HAVE_FUNC_ATTRIBUTE_IFUNC) && HAVE_FUNC_ATTRIBUTE_IFUNC
#if defined(__has_attribute)
#if __has_attribute(target_clones)
#if defined(__x86_64)

// see https://gcc.gnu.org/wiki/FunctionMultiVersioning
#define PF_TARGET_CLONES __attribute__((target_clones("avx","sse4.2","sse3","sse2","sse","default")))
#define HAVE_PF_TARGET_CLONES 1
#endif
#endif
#endif
#endif

// fallback: no multi-versioning available, the macro expands to nothing
#ifndef PF_TARGET_CLONES
#define PF_TARGET_CLONES
#endif

#endif
|
||||
25
pffft/mingw-w32-i686.cmake
Normal file
25
pffft/mingw-w32-i686.cmake
Normal file
@@ -0,0 +1,25 @@
|
||||
# Sample toolchain file for building for Windows from an Ubuntu Linux system.
#
# Typical usage:
#    *) install cross compiler: `sudo apt-get install mingw-w64`
#    *) cd build
#    *) cmake -DCMAKE_TOOLCHAIN_FILE=~/mingw-w32-i686.cmake ..
#
# build for Windows' 32 bit architecture

set(CMAKE_SYSTEM_NAME Windows)
# target processor of this toolchain is 32 bit x86 (see TOOLCHAIN_PREFIX),
# not x86_64
set(CMAKE_SYSTEM_PROCESSOR i686)
set(TOOLCHAIN_PREFIX i686-w64-mingw32)

# cross compilers to use for C and C++, plus the Windows resource compiler
set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}-gcc)
set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}-g++)
set(CMAKE_RC_COMPILER ${TOOLCHAIN_PREFIX}-windres)

# target environment on the build host system
set(CMAKE_FIND_ROOT_PATH /usr/${TOOLCHAIN_PREFIX})

# modify default behavior of FIND_XXX() commands:
# programs are never searched in the target root, libraries and headers
# are searched only there
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
|
||||
25
pffft/mingw-w64-x64_64.cmake
Normal file
25
pffft/mingw-w64-x64_64.cmake
Normal file
@@ -0,0 +1,25 @@
|
||||
# Example CMake toolchain file: cross-compile 64 bit Windows binaries on an
# Ubuntu Linux host using the mingw-w64 tool chain.
#
# How to use it:
#    1) install the cross compiler: `sudo apt-get install mingw-w64`
#    2) change into the build directory: `cd build`
#    3) configure: `cmake -DCMAKE_TOOLCHAIN_FILE=~/mingw-w64-x64_64.cmake ..`

# --- target platform: 64 bit Windows ---------------------------------------
set(CMAKE_SYSTEM_NAME "Windows")
set(CMAKE_SYSTEM_PROCESSOR "x86_64")

# --- tool chain: C / C++ compilers and the Windows resource compiler -------
set(TOOLCHAIN_PREFIX "x86_64-w64-mingw32")
set(CMAKE_C_COMPILER   "${TOOLCHAIN_PREFIX}-gcc")
set(CMAKE_CXX_COMPILER "${TOOLCHAIN_PREFIX}-g++")
set(CMAKE_RC_COMPILER  "${TOOLCHAIN_PREFIX}-windres")

# --- where the target environment lives on the build host ------------------
set(CMAKE_FIND_ROOT_PATH "/usr/${TOOLCHAIN_PREFIX}")

# --- search behaviour of the find_*() commands ------------------------------
# headers and libraries come only from the target environment,
# programs are never looked up there
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
|
||||
97
pffft/papi_perf_counter.h
Normal file
97
pffft/papi_perf_counter.h
Normal file
@@ -0,0 +1,97 @@
|
||||
#pragma once
|
||||
|
||||
/* for measurement of CPU cycles ..
|
||||
*
|
||||
* requires
|
||||
* sudo apt-get install libpapi-dev papi-tools
|
||||
* on debian/ubuntu linux distributions
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef HAVE_PAPI
|
||||
#include <papi.h>
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
|
||||
/*
 * Small helper around PAPI_ipc() for measuring a code section.
 *
 * Usage: construct with papi_perf_counter(1) to start measuring, then call
 * finish()/print() - or let the destructor print to stderr.
 * When HAVE_PAPI is not defined, start() always fails (after reporting this
 * once on stderr) and nothing is ever printed.
 */
struct papi_perf_counter
{
    /* Creates an idle counter; nothing is measured until start() is called. */
    papi_perf_counter()
        : realTime(0.0F), processTime(0.0F), instructions(0LL), ipc(0.0F)
        , started(false), finished(false), print_at_destruction(false)
    { }

    /* Creates a counter and immediately calls start().
     * _start only selects this overload; its value is ignored.
     * All members are initialized here as well, so they hold defined values
     * even when start() fails (e.g. without HAVE_PAPI). */
    papi_perf_counter(int _start, bool print_at_destruction_ = true)
        : realTime(0.0F), processTime(0.0F), instructions(0LL), ipc(0.0F)
        , started(false), finished(false), print_at_destruction(print_at_destruction_)
    {
        (void)_start;
        start();
    }

    /* Prints the result to stderr if requested at construction. */
    ~papi_perf_counter()
    {
        if (print_at_destruction)
            print(stderr);
    }

    /* Takes the start snapshot.
     * Returns true when PAPI_ipc() succeeded; an error is reported on stderr
     * only once per program run (static flag). */
    bool start()
    {
        static bool reported_start_error = false;
#ifdef HAVE_PAPI
        int ret = PAPI_ipc(&realTime, &processTime, &instructions, &ipc);
        if (ret && !reported_start_error)
        {
            reported_start_error = true;
            fprintf(stderr, "papi_perf_counter::start(): PAPI_ipc() returned error %d\n", ret);
        }
#else
        if (!reported_start_error)
        {
            reported_start_error = true;
            fprintf(stderr, "papi_perf_counter::start(): no HAVE_PAPI\n");
        }
        int ret = 1;
#endif
        started = (!ret);
        finished = false;
        return started;
    }

    /* Takes the end snapshot with a temporary counter and replaces the stored
     * start values by the differences (ipc is taken from the end snapshot).
     * Returns true only if this counter was started, not yet finished and the
     * end snapshot succeeded. */
    bool finish()
    {
        papi_perf_counter end(1, false);
        if (started && !finished && end.started)
        {
            realTime = end.realTime - realTime;
            processTime = end.processTime - processTime;
            instructions = end.instructions - instructions;
            ipc = end.ipc;
            finished = true;
            return true;
        }
        return false;
    }

    /* Prints the measurement to f, finishing it first if necessary.
     * Does nothing when there is no completed measurement. Afterwards the
     * counter is marked as not started, so the result is printed only once. */
    void print(FILE *f = stdout)
    {
        if (started && !finished)
            finish();
        if (!started || !finished)
            return;
        /* guard the division: an ipc of 0 would otherwise print inf/nan cycles */
        double cycles = (ipc != 0.0F) ? (instructions / ipc) : 0.0;
        fprintf(f, "real %g, process %g, instructions %lld, ins/cycle %f => cycles %g\n"
            , realTime, processTime, instructions, ipc, cycles
            );
        started = false;
    }

    float realTime;             /* 1st value of PAPI_ipc(); difference after finish() */
    float processTime;          /* 2nd value of PAPI_ipc(); difference after finish() */
    long long instructions;     /* 3rd value of PAPI_ipc(); difference after finish() */
    float ipc;                  /* instructions per cycle as reported by PAPI_ipc() */
    bool started;               /* start() succeeded and result not yet printed */
    bool finished;              /* finish() succeeded since the last start() */
    bool print_at_destruction;  /* destructor calls print(stderr) */
};
|
||||
|
||||
298
pffft/pf_carrier.cpp
Normal file
298
pffft/pf_carrier.cpp
Normal file
@@ -0,0 +1,298 @@
|
||||
/*
|
||||
This software is part of pffft/pfdsp, a set of simple DSP routines.
|
||||
|
||||
Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
|
||||
Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/* include own header first, to see missing includes */
|
||||
#include "pf_carrier.h"
|
||||
#include "fmv.h"
|
||||
|
||||
#include <limits.h>
|
||||
#include <assert.h>
|
||||
|
||||
|
||||
PF_TARGET_CLONES
|
||||
/* Writes `size` complex samples of a constant (DC) carrier, exp(i*0) = 1+i*0,
   as interleaved (re, im) floats. The real part is scaled to 127/128. */
void generate_dc_f(float* output, int size)
{
    const float level = 127.0F / 128.0F;
    const int total = 2 * size;     /* two floats per complex sample */
    for (int n = 0; n < total; n += 2)
    {
        output[n]     = level;      /* real part */
        output[n + 1] = 0.0F;       /* imaginary part */
    }
}
|
||||
|
||||
PF_TARGET_CLONES
|
||||
/* Writes `size` complex samples of a constant (DC) carrier, exp(i*0) = 1+i*0,
   as interleaved (re, im) 16 bit values with full-scale amplitude SHRT_MAX. */
void generate_dc_s16(short* output, int size)
{
    const int total = 2 * size;     /* two shorts per complex sample */
    for (int n = 0; n < total; n += 2)
    {
        output[n]     = SHRT_MAX;   /* real part */
        output[n + 1] = 0;          /* imaginary part */
    }
}
|
||||
|
||||
PF_TARGET_CLONES
|
||||
/* Writes a complex carrier at +fs/4 as interleaved (re, im) floats.
   One period is 4 complex samples with phases 0, +pi/2, +pi, -pi/2;
   the amplitude is 127/128. */
void generate_pos_fs4_f(float* output, int size)
{
    const float a = 127.0F / 128.0F;
    const float period[8] = {
           a, 0.0F,     /* exp(i*  0   ) =  1 + i* 0 */
        0.0F,    a,     /* exp(i* +pi/2) =  0 + i* 1 */
          -a, 0.0F,     /* exp(i* +pi  ) = -1 + i* 0 */
        0.0F,   -a      /* exp(i* -pi/2) =  0 + i*-1 */
    };
    /* size must be multiple of 4 */
    assert(!(size&3));
    const int total = 2 * size;
    /* copy whole periods (8 floats) until `total` values are covered */
    for (int base = 0; base < total; base += 8)
        for (int j = 0; j < 8; ++j)
            output[base + j] = period[j];
}
|
||||
|
||||
PF_TARGET_CLONES
|
||||
/* Writes a complex carrier at +fs/4 as interleaved (re, im) 16 bit values.
   One period is 4 complex samples with phases 0, +pi/2, +pi, -pi/2;
   the amplitude is SHRT_MAX. */
void generate_pos_fs4_s16(short* output, int size)
{
    const short period[8] = {
         SHRT_MAX, 0,           /* exp(i*  0   ) =  1 + i* 0 */
         0,  SHRT_MAX,          /* exp(i* +pi/2) =  0 + i* 1 */
        -SHRT_MAX, 0,           /* exp(i* +pi  ) = -1 + i* 0 */
         0, -SHRT_MAX           /* exp(i* -pi/2) =  0 + i*-1 */
    };
    /* size must be multiple of 4 */
    assert(!(size&3));
    const int total = 2 * size;
    /* copy whole periods (8 values) until `total` values are covered */
    for (int base = 0; base < total; base += 8)
        for (int j = 0; j < 8; ++j)
            output[base + j] = period[j];
}
|
||||
|
||||
PF_TARGET_CLONES
|
||||
/* Writes a complex carrier at -fs/4 as interleaved (re, im) floats.
   One period is 4 complex samples with phases 0, -pi/2, +pi, +pi/2;
   the amplitude is 127/128. */
void generate_neg_fs4_f(float* output, int size)
{
    const float a = 127.0F / 128.0F;
    const float period[8] = {
           a, 0.0F,     /* exp(i*  0   ) =  1 + i* 0 */
        0.0F,   -a,     /* exp(i* -pi/2) =  0 + i*-1 */
          -a, 0.0F,     /* exp(i* +pi  ) = -1 + i* 0 */
        0.0F,    a      /* exp(i* +pi/2) =  0 + i* 1 */
    };
    /* size must be multiple of 4 */
    assert(!(size&3));
    const int total = 2 * size;
    /* copy whole periods (8 floats) until `total` values are covered */
    for (int base = 0; base < total; base += 8)
        for (int j = 0; j < 8; ++j)
            output[base + j] = period[j];
}
|
||||
|
||||
PF_TARGET_CLONES
|
||||
/* Writes a complex carrier at -fs/4 as interleaved (re, im) 16 bit values.
   One period is 4 complex samples with phases 0, -pi/2, +pi, +pi/2;
   the amplitude is SHRT_MAX. */
void generate_neg_fs4_s16(short* output, int size)
{
    const short period[8] = {
         SHRT_MAX, 0,           /* exp(i*  0   ) =  1 + i* 0 */
         0, -SHRT_MAX,          /* exp(i* -pi/2) =  0 + i*-1 */
        -SHRT_MAX, 0,           /* exp(i* +pi  ) = -1 + i* 0 */
         0,  SHRT_MAX           /* exp(i* +pi/2) =  0 + i* 1 */
    };
    /* size must be multiple of 4 */
    assert(!(size&3));
    const int total = 2 * size;
    /* copy whole periods (8 values) until `total` values are covered */
    for (int base = 0; base < total; base += 8)
        for (int j = 0; j < 8; ++j)
            output[base + j] = period[j];
}
|
||||
|
||||
/****************************************************/
|
||||
|
||||
PF_TARGET_CLONES
|
||||
/* Writes the sum of a DC component and a +fs/4 carrier as interleaved
   (re, im) 16 bit values. Both components have amplitude SHRT_MAX/2, so the
   sum stays within the 16 bit range. One period is 4 complex samples. */
void generate_dc_pos_fs4_s16(short* output, int size)
{
    const int half = SHRT_MAX / 2;
    const short period[8] = {
        (short)(half + half), 0,            /* dc + exp(i*  0   ) = 1 + 1 + i* 0 */
        (short)half, (short)half,           /* dc + exp(i* +pi/2) = 1 + 0 + i* 1 */
        0, 0,                               /* dc + exp(i* +pi  ) = 1 - 1 + i* 0 */
        (short)half, (short)(-half)         /* dc + exp(i* -pi/2) = 1 + 0 + i*-1 */
    };
    /* size must be multiple of 4 */
    assert(!(size&3));
    const int total = 2 * size;
    /* copy whole periods (8 values) until `total` values are covered */
    for (int base = 0; base < total; base += 8)
        for (int j = 0; j < 8; ++j)
            output[base + j] = period[j];
}
|
||||
|
||||
PF_TARGET_CLONES
|
||||
/* Writes the sum of a DC component and a -fs/4 carrier as interleaved
   (re, im) 16 bit values. Both components have amplitude SHRT_MAX/2, so the
   sum stays within the 16 bit range. One period is 4 complex samples. */
void generate_dc_neg_fs4_s16(short* output, int size)
{
    const int half = SHRT_MAX / 2;
    const short period[8] = {
        (short)(half + half), 0,            /* dc + exp(i*  0   ) = 1 + 1 + i* 0 */
        (short)half, (short)(-half),        /* dc + exp(i* -pi/2) = 1 + 0 + i*-1 */
        0, 0,                               /* dc + exp(i* +pi  ) = 1 - 1 + i* 0 */
        (short)half, (short)half            /* dc + exp(i* +pi/2) = 1 + 0 + i* 1 */
    };
    /* size must be multiple of 4 */
    assert(!(size&3));
    const int total = 2 * size;
    /* copy whole periods (8 values) until `total` values are covered */
    for (int base = 0; base < total; base += 8)
        for (int j = 0; j < 8; ++j)
            output[base + j] = period[j];
}
|
||||
|
||||
PF_TARGET_CLONES
|
||||
/* Writes a two-carrier test signal (+fs/4 and -fs/4) as interleaved (re, im)
   16 bit values, with m = SHRT_MAX/2. One period of 4 complex samples is
       (m,-m), (-m,m), (-m,m), (m,-m)
   which equals  m*exp(i*n*pi/2) + m*exp(-i*(n+1)*pi/2)  for n = 0..3,
   i.e. the negative carrier starts at phase -pi/2.
   NOTE(review): with both carriers starting at phase 0 the sum would be
   purely real (2m, 0, -2m, 0) - confirm the phase offset is intended. */
void generate_pos_neg_fs4_s16(short* output, int size)
{
    const int half = SHRT_MAX / 2;
    const short period[8] = {
        (short)half,    (short)(-half),     /* n = 0 */
        (short)(-half), (short)half,        /* n = 1 */
        (short)(-half), (short)half,        /* n = 2 */
        (short)half,    (short)(-half)      /* n = 3 */
    };
    /* size must be multiple of 4 */
    assert(!(size&3));
    const int total = 2 * size;
    /* copy whole periods (8 values) until `total` values are covered */
    for (int base = 0; base < total; base += 8)
        for (int j = 0; j < 8; ++j)
            output[base + j] = period[j];
}
|
||||
|
||||
PF_TARGET_CLONES
|
||||
/* Writes a DC component plus a +fs/4 and a -fs/4 carrier as interleaved
   (re, im) 16 bit values, with m = SHRT_MAX/2. One period of 4 complex
   samples is
       (2m,-m), (0,m), (0,m), (2m,-m)
   i.e. a real DC offset of m added to the sequence
       (m,-m), (-m,m), (-m,m), (m,-m). */
void generate_dc_pos_neg_fs4_s16(short* output, int size)
{
    const int half = SHRT_MAX / 2;
    const short period[8] = {
        (short)(half + half), (short)(-half),   /* n = 0 */
        0,                    (short)half,      /* n = 1 */
        0,                    (short)half,      /* n = 2 */
        (short)(half + half), (short)(-half)    /* n = 3 */
    };
    /* size must be multiple of 4 */
    assert(!(size&3));
    const int total = 2 * size;
    /* copy whole periods (8 values) until `total` values are covered */
    for (int base = 0; base < total; base += 8)
        for (int j = 0; j < 8; ++j)
            output[base + j] = period[j];
}
|
||||
|
||||
|
||||
PF_TARGET_CLONES
|
||||
/* Writes a carrier at fs/2 as interleaved (re, im) 16 bit values:
   the real part alternates +m, -m (m = SHRT_MAX/2), the imaginary part is 0.
   No DC component is added here. */
void generate_pos_neg_fs2_s16(short* output, int size)
{
    const int half = SHRT_MAX / 2;
    const short period[8] = {
        (short)half,    0,      /* exp(i* 0 ) = +1 */
        (short)(-half), 0,      /* exp(i* pi) = -1 */
        (short)half,    0,      /* exp(i* 0 ) = +1 */
        (short)(-half), 0       /* exp(i* pi) = -1 */
    };
    /* size must be multiple of 4 */
    assert(!(size&3));
    const int total = 2 * size;
    /* copy 4 complex samples (8 values) per step until `total` is covered */
    for (int base = 0; base < total; base += 8)
        for (int j = 0; j < 8; ++j)
            output[base + j] = period[j];
}
|
||||
|
||||
PF_TARGET_CLONES
|
||||
/* Writes a carrier at fs/2 plus an imaginary DC component (dc = i*1) as
   interleaved (re, im) 16 bit values: the real part alternates +m, -m and the
   imaginary part is constantly m, with m = SHRT_MAX/2. */
void generate_dc_pos_neg_fs2_s16(short* output, int size)
{
    const int half = SHRT_MAX / 2;
    const short period[8] = {
        (short)half,    (short)half,    /* dc + exp(i* 0 ) = i*1 + 1 */
        (short)(-half), (short)half,    /* dc + exp(i* pi) = i*1 - 1 */
        (short)half,    (short)half,    /* dc + exp(i* 0 ) = i*1 + 1 */
        (short)(-half), (short)half     /* dc + exp(i* pi) = i*1 - 1 */
    };
    /* size must be multiple of 4 */
    assert(!(size&3));
    const int total = 2 * size;
    /* copy 4 complex samples (8 values) per step until `total` is covered */
    for (int base = 0; base < total; base += 8)
        for (int j = 0; j < 8; ++j)
            output[base + j] = period[j];
}
|
||||
|
||||
|
||||
75
pffft/pf_carrier.h
Normal file
75
pffft/pf_carrier.h
Normal file
@@ -0,0 +1,75 @@
|
||||
/*
|
||||
This software is part of pffft/pfdsp, a set of simple DSP routines.
|
||||
|
||||
Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
|
||||
Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
_____ _
|
||||
/ ____| | |
|
||||
| | ___ _ __ ___ _ __ | | _____ __
|
||||
| | / _ \| '_ ` _ \| '_ \| |/ _ \ \/ /
|
||||
| |___| (_) | | | | | | |_) | | __/> <
|
||||
\_____\___/|_| |_| |_| .__/|_|\___/_/\_\
|
||||
| |
|
||||
|_|
|
||||
*/
|
||||
|
||||
/* Complex float sample: i holds the real part, q the imaginary part.
   NOTE(review): pf_cic.h defines the identical struct/typedef; including both
   headers in one translation unit would define complexf_s twice - confirm
   whether a shared guard is wanted. */
typedef struct complexf_s { float i; float q; } complexf;


/* generation functions */
/* All generators write `size` complex samples as interleaved (re, im) values,
   i.e. `output` must provide room for 2*size elements.
   The fs4 and fs2 variants assert that `size` is a multiple of 4.
   Float variants use an amplitude of 127/128, the single-carrier s16 variants
   use SHRT_MAX. */
void generate_dc_f(float* output, int size);            /* constant (DC) carrier */
void generate_dc_s16(short* output, int size);          /* constant (DC) carrier */
void generate_pos_fs4_f(float* output, int size);       /* carrier at +fs/4 */
void generate_pos_fs4_s16(short* output, int size);     /* carrier at +fs/4 */
void generate_neg_fs4_f(float* output, int size);       /* carrier at -fs/4 */
void generate_neg_fs4_s16(short* output, int size);     /* carrier at -fs/4 */

/* combined signals: every component has amplitude SHRT_MAX/2 */
void generate_dc_pos_fs4_s16(short* output, int size);      /* DC + carrier at +fs/4 */
void generate_dc_neg_fs4_s16(short* output, int size);      /* DC + carrier at -fs/4 */
void generate_pos_neg_fs4_s16(short* output, int size);     /* carriers at +fs/4 and -fs/4 */
void generate_dc_pos_neg_fs4_s16(short* output, int size);  /* DC + carriers at +fs/4 and -fs/4 */

void generate_pos_neg_fs2_s16(short* output, int size);     /* carrier at fs/2 (real part alternates +/-) */
void generate_dc_pos_neg_fs2_s16(short* output, int size);  /* imaginary DC + carrier at fs/2 */
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
255
pffft/pf_cic.cpp
Normal file
255
pffft/pf_cic.cpp
Normal file
@@ -0,0 +1,255 @@
|
||||
/*
|
||||
This software is part of pffft/pfdsp, a set of simple DSP routines.
|
||||
|
||||
Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
|
||||
Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/* gcc requires this for M_PI !? */
|
||||
#undef __STRICT_ANSI__
|
||||
|
||||
/* include own header first, to see missing includes */
|
||||
#include "pf_cic.h"
|
||||
#include "fmv.h"
|
||||
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
|
||||
|
||||
/*
|
||||
____ ___ ____ ____ ____ ____
|
||||
/ ___|_ _/ ___| | _ \| _ \ / ___|
|
||||
| | | | | | | | | | | | |
|
||||
| |___ | | |___ | |_| | |_| | |___
|
||||
\____|___\____| |____/|____/ \____|
|
||||
*/
|
||||
|
||||
// log2 of the number of table entries per oscillator period
#define SINESHIFT 12
// number of table entries per oscillator period (4096)
#define SINESIZE (1<<SINESHIFT)
typedef int64_t cic_dt; // data type used for integrators and combs
// State of one CIC down-converter, allocated by cicddc_init() and released
// by cicddc_free(). The "a" members belong to the output component stored in
// complexf::i, the "b" members to the one stored in complexf::q.
typedef struct {
    // number of input samples consumed per output sample
    int factor;
    // oscillator phase accumulator; its top SINESHIFT bits index sinetable
    uint64_t phase;
    // scale factor applied to the final comb output
    float gain;
    // integrator states, carried over between calls
    cic_dt ig0a, ig0b, ig1a, ig1b;
    // comb delay elements, carried over between calls
    cic_dt comb0a, comb0b, comb1a, comb1b;
    // heap-allocated oscillator table with SINESIZE * 5/4 entries
    int16_t *sinetable;
} cicddc_t;
|
||||
|
||||
void *cicddc_init(int factor) {
|
||||
int i;
|
||||
int sinesize2 = SINESIZE * 5/4; // 25% extra to get cosine from the same table
|
||||
cicddc_t *s;
|
||||
s = (cicddc_t *)malloc(sizeof(cicddc_t));
|
||||
memset(s, 0, sizeof(cicddc_t));
|
||||
|
||||
float sineamp = 32767.0f;
|
||||
s->factor = factor;
|
||||
s->gain = 1.0f / SHRT_MAX / sineamp / factor / factor / factor; // compensate for gain of 3 integrators
|
||||
|
||||
s->sinetable = (int16_t *)malloc(sinesize2 * sizeof(*s->sinetable));
|
||||
double f = 2.0 * M_PI / (double)SINESIZE;
|
||||
for(i = 0; i < sinesize2; i++) {
|
||||
s->sinetable[i] = sineamp * cos(f * i);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
/* Releases a state created by cicddc_init().
 * Passing NULL is allowed and does nothing (like free()). */
void cicddc_free(void *state) {
    cicddc_t *s = (cicddc_t *)state;
    if (!s)
        return;
    free(s->sinetable);
    free(s);
}
|
||||
|
||||
|
||||
PF_TARGET_CLONES
|
||||
void cicddc_s16_c(void *state, int16_t *input, complexf *output, int outsize, float rate) {
|
||||
cicddc_t *s = (cicddc_t *)state;
|
||||
int k;
|
||||
int factor = s->factor;
|
||||
cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
|
||||
cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
|
||||
uint64_t phase = s->phase, freq;
|
||||
int16_t *sinetable = s->sinetable;
|
||||
float gain = s->gain;
|
||||
|
||||
freq = rate * ((float)(1ULL << 63) * 2);
|
||||
|
||||
int16_t *inp = input;
|
||||
for(k = 0; k < outsize; k++) {
|
||||
int i;
|
||||
cic_dt out0a, out0b, out1a, out1b;
|
||||
cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
|
||||
for(i = 0; i < factor; i++) {
|
||||
cic_dt in_a, in_b;
|
||||
int sinep = phase >> (64-SINESHIFT);
|
||||
in_a = (int32_t)inp[i] * (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];
|
||||
in_b = (int32_t)inp[i] * (int32_t)sinetable[sinep];
|
||||
phase += freq;
|
||||
/* integrators:
|
||||
The calculations are ordered so that each integrator
|
||||
takes a result from previous loop iteration
|
||||
to make the code more "pipeline-friendly". */
|
||||
ig2a += ig1a; ig2b += ig1b;
|
||||
ig1a += ig0a; ig1b += ig0b;
|
||||
ig0a += in_a; ig0b += in_b;
|
||||
}
|
||||
inp += factor;
|
||||
// comb filters:
|
||||
out0a = ig2a - comb0a; out0b = ig2b - comb0b;
|
||||
comb0a = ig2a; comb0b = ig2b;
|
||||
out1a = out0a - comb1a; out1b = out0b - comb1b;
|
||||
comb1a = out0a; comb1b = out0b;
|
||||
|
||||
output[k].i = (float)out1a * gain;
|
||||
output[k].q = (float)out1b * gain;
|
||||
}
|
||||
|
||||
s->ig0a = ig0a; s->ig0b = ig0b;
|
||||
s->ig1a = ig1a; s->ig1b = ig1b;
|
||||
s->comb0a = comb0a; s->comb0b = comb0b;
|
||||
s->comb1a = comb1a; s->comb1b = comb1b;
|
||||
s->phase = phase;
|
||||
}
|
||||
|
||||
PF_TARGET_CLONES
|
||||
void cicddc_cs16_c(void *state, int16_t *input, complexf *output, int outsize, float rate) {
|
||||
cicddc_t *s = (cicddc_t *)state;
|
||||
int k;
|
||||
int factor = s->factor;
|
||||
cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
|
||||
cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
|
||||
uint64_t phase = s->phase, freq;
|
||||
int16_t *sinetable = s->sinetable;
|
||||
float gain = s->gain;
|
||||
|
||||
freq = rate * ((float)(1ULL << 63) * 2);
|
||||
|
||||
int16_t *inp = input;
|
||||
for(k = 0; k < outsize; k++) {
|
||||
int i;
|
||||
cic_dt out0a, out0b, out1a, out1b;
|
||||
cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
|
||||
for(i = 0; i < factor; i++) {
|
||||
cic_dt in_a, in_b;
|
||||
int32_t m_a, m_b, m_c, m_d;
|
||||
int sinep = phase >> (64-SINESHIFT);
|
||||
m_a = inp[2*i];
|
||||
m_b = inp[2*i+1];
|
||||
m_c = (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];
|
||||
m_d = (int32_t)sinetable[sinep];
|
||||
// complex multiplication:
|
||||
in_a = m_a*m_c - m_b*m_d;
|
||||
in_b = m_a*m_d + m_b*m_c;
|
||||
phase += freq;
|
||||
/* integrators:
|
||||
The calculations are ordered so that each integrator
|
||||
takes a result from previous loop iteration
|
||||
to make the code more "pipeline-friendly". */
|
||||
ig2a += ig1a; ig2b += ig1b;
|
||||
ig1a += ig0a; ig1b += ig0b;
|
||||
ig0a += in_a; ig0b += in_b;
|
||||
}
|
||||
inp += 2*factor;
|
||||
// comb filters:
|
||||
out0a = ig2a - comb0a; out0b = ig2b - comb0b;
|
||||
comb0a = ig2a; comb0b = ig2b;
|
||||
out1a = out0a - comb1a; out1b = out0b - comb1b;
|
||||
comb1a = out0a; comb1b = out0b;
|
||||
|
||||
output[k].i = (float)out1a * gain;
|
||||
output[k].q = (float)out1b * gain;
|
||||
}
|
||||
|
||||
s->ig0a = ig0a; s->ig0b = ig0b;
|
||||
s->ig1a = ig1a; s->ig1b = ig1b;
|
||||
s->comb0a = comb0a; s->comb0b = comb0b;
|
||||
s->comb1a = comb1a; s->comb1b = comb1b;
|
||||
s->phase = phase;
|
||||
}
|
||||
|
||||
|
||||
/* This is almost copy paste from cicddc_cs16_c.
|
||||
I'm afraid this is going to be annoying to maintain... */
|
||||
PF_TARGET_CLONES
|
||||
void cicddc_cu8_c(void *state, uint8_t *input, complexf *output, int outsize, float rate) {
|
||||
cicddc_t *s = (cicddc_t *)state;
|
||||
int k;
|
||||
int factor = s->factor;
|
||||
cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
|
||||
cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
|
||||
uint64_t phase = s->phase, freq;
|
||||
int16_t *sinetable = s->sinetable;
|
||||
float gain = s->gain;
|
||||
|
||||
freq = rate * ((float)(1ULL << 63) * 2);
|
||||
|
||||
uint8_t *inp = input;
|
||||
for(k = 0; k < outsize; k++) {
|
||||
int i;
|
||||
cic_dt out0a, out0b, out1a, out1b;
|
||||
cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
|
||||
for(i = 0; i < factor; i++) {
|
||||
cic_dt in_a, in_b;
|
||||
int32_t m_a, m_b, m_c, m_d;
|
||||
int sinep = phase >> (64-SINESHIFT);
|
||||
// subtract 127.4 (good for rtl-sdr)
|
||||
m_a = (((int32_t)inp[2*i]) << 8) - 32614;
|
||||
m_b = (((int32_t)inp[2*i+1]) << 8) - 32614;
|
||||
m_c = (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];
|
||||
m_d = (int32_t)sinetable[sinep];
|
||||
// complex multiplication:
|
||||
in_a = m_a*m_c - m_b*m_d;
|
||||
in_b = m_a*m_d + m_b*m_c;
|
||||
phase += freq;
|
||||
/* integrators:
|
||||
The calculations are ordered so that each integrator
|
||||
takes a result from previous loop iteration
|
||||
to make the code more "pipeline-friendly". */
|
||||
ig2a += ig1a; ig2b += ig1b;
|
||||
ig1a += ig0a; ig1b += ig0b;
|
||||
ig0a += in_a; ig0b += in_b;
|
||||
}
|
||||
inp += 2*factor;
|
||||
// comb filters:
|
||||
out0a = ig2a - comb0a; out0b = ig2b - comb0b;
|
||||
comb0a = ig2a; comb0b = ig2b;
|
||||
out1a = out0a - comb1a; out1b = out0b - comb1b;
|
||||
comb1a = out0a; comb1b = out0b;
|
||||
|
||||
output[k].i = (float)out1a * gain;
|
||||
output[k].q = (float)out1b * gain;
|
||||
}
|
||||
|
||||
s->ig0a = ig0a; s->ig0b = ig0b;
|
||||
s->ig1a = ig1a; s->ig1b = ig1b;
|
||||
s->comb0a = comb0a; s->comb0b = comb0b;
|
||||
s->comb1a = comb1a; s->comb1b = comb1b;
|
||||
s->phase = phase;
|
||||
}
|
||||
|
||||
58
pffft/pf_cic.h
Normal file
58
pffft/pf_cic.h
Normal file
@@ -0,0 +1,58 @@
|
||||
/*
|
||||
This software is part of pffft/pfdsp, a set of simple DSP routines.
|
||||
|
||||
Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
|
||||
Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
____ ___ ____ ____ ____ ____
|
||||
/ ___|_ _/ ___| | _ \| _ \ / ___|
|
||||
| | | | | | | | | | | | |
|
||||
| |___ | | |___ | |_| | |_| | |___
|
||||
\____|___\____| |____/|____/ \____|
|
||||
*/
|
||||
|
||||
/* Complex float sample: i holds the real part, q the imaginary part.
   NOTE(review): pf_carrier.h defines the identical struct/typedef; including
   both headers in one translation unit would define complexf_s twice -
   confirm whether a shared guard is wanted. */
typedef struct complexf_s { float i; float q; } complexf;

/* Creates the state of a CIC down-converter that consumes `factor` input
   samples per output sample. Release it with cicddc_free(). */
void *cicddc_init(int factor);
/* Releases a state returned by cicddc_init(). */
void cicddc_free(void *state);
/* Processing functions: each call writes `outsize` complex samples to
   `output` and consumes outsize * factor input samples. `rate` is the
   oscillator frequency as a fraction of the input sample rate. The state is
   updated so that consecutive calls continue the stream. */
/* real 16 bit input */
void cicddc_s16_c(void *state, int16_t *input, complexf *output, int outsize, float rate);
/* complex 16 bit input, interleaved value pairs */
void cicddc_cs16_c(void *state, int16_t *input, complexf *output, int outsize, float rate);
/* complex unsigned 8 bit input, interleaved value pairs */
void cicddc_cu8_c(void *state, uint8_t *input, complexf *output, int outsize, float rate);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
322
pffft/pf_conv.cpp
Normal file
322
pffft/pf_conv.cpp
Normal file
@@ -0,0 +1,322 @@
|
||||
|
||||
#include "pf_conv.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#if 0
|
||||
#include <stdio.h>
|
||||
|
||||
#define DPRINT(...) fprintf(stderr, __VA_ARGS__)
|
||||
|
||||
#else
|
||||
#define DPRINT(...) do { } while (0)
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef HAVE_MIPP
|
||||
#include <mipp.h>
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef CONV_ARCH_POST
|
||||
#error CONV_ARCH_POST not defined
|
||||
#endif
|
||||
|
||||
#define PP_STRINGIFY(X) #X
|
||||
#define PP_TOSTRING(X) PP_STRINGIFY(X)
|
||||
#define PP_CONCAT_IMPL(x, y) x##y
|
||||
#define PP_CONCAT(x, y) PP_CONCAT_IMPL( x, y )
|
||||
|
||||
#define ARCHFUNCNAME(X) PP_CONCAT(X##_,CONV_ARCH_POST)
|
||||
|
||||
|
||||
// Returns the architecture suffix (CONV_ARCH_POST) this translation unit was
// compiled with, as a string.
const char * ARCHFUNCNAME(id)()
{
    const char * const arch_name = PP_TOSTRING(CONV_ARCH_POST);
    return arch_name;
}
|
||||
|
||||
|
||||
// Returns the number of floats processed per SIMD register:
// mipp::N<float>() when MIPP is available with intrinsics, otherwise 1.
int ARCHFUNCNAME(conv_float_simd_size)()
{
#if defined(HAVE_MIPP) && !defined(MIPP_NO_INTRINSICS)
    return mipp::N<float>();
#else
    // scalar fallback: the implementation is completely independent of MIPP
    return 1;
#endif
}
|
||||
|
||||
|
||||
// Moves the samples s[state->offset .. state->size) to the begin of the
// buffer and updates the state accordingly: offset becomes 0, size becomes
// the number of remaining samples (0 if there are none).
void ARCHFUNCNAME(conv_float_move_rest)(float * RESTRICT s, conv_buffer_state * RESTRICT state)
{
    int R = state->size - state->offset;  // this many samples from prev conv_float were not processed
    if (R > 0)
    {
        // With offset == 0 the data already is at the begin. Skip the copy:
        // std::copy() must not be called with the destination inside the
        // source range.
        if (state->offset != 0)
            std::copy(&s[state->offset], &s[state->size], s);   // move them to the begin
    }
    else
        R = 0;
    state->offset = 0;      // data - to be processed - is at begin
    state->size = R;        // this many unprocessed samples
}
|
||||
|
||||
|
||||
// Moves the complex samples s[state->offset .. state->size) to the begin of
// the buffer and updates the state accordingly: offset becomes 0, size
// becomes the number of remaining samples (0 if there are none).
void ARCHFUNCNAME(conv_cplx_move_rest)(complexf * RESTRICT s, conv_buffer_state * RESTRICT state)
{
    int R = state->size - state->offset;  // this many samples from prev conv_float were not processed
    if (R > 0)
    {
        // With offset == 0 the data already is at the begin. Skip the copy:
        // std::copy() must not be called with the destination inside the
        // source range.
        if (state->offset != 0)
            std::copy(&s[state->offset], &s[state->size], s);   // move them to the begin
    }
    else
        R = 0;
    state->offset = 0;      // data - to be processed - is at begin
    state->size = R;        // this many unprocessed samples
}
|
||||
|
||||
|
||||
#if defined(MIPP_NO_INTRINSICS)
|
||||
// have a completely MIPP independent implementation
|
||||
// #error missing HAVE_MIPP: there is no MIPP-independent implementation
|
||||
|
||||
// Scalar in-place filtering: for every position with a complete window of
// sz_filter samples, s[pos] is replaced by sum(s[pos+k] * filter[k]).
// Starts at state->offset, stores the first unprocessed position back into
// state->offset and returns the number of produced output samples.
int ARCHFUNCNAME(conv_float_inplace)(
    float * RESTRICT s, conv_buffer_state * RESTRICT state,
    const float * RESTRICT filter, const int sz_filter
    )
{
    const int first = state->offset;
    const int avail = state->size;
    int pos = first;

    // a result is only produced while the whole window fits into the buffer
    while (pos + sz_filter <= avail)
    {
        float sum = 0.0F;
        for (int tap = 0; tap < sz_filter; ++tap)
            sum += s[pos+tap] * filter[tap];
        // overwriting is safe: later windows start behind this position
        s[pos] = sum;
        ++pos;
    }

    state->offset = pos;
    return pos - first;
}
|
||||
|
||||
|
||||
// Scalar out-of-place filtering: for every position with a complete window
// of sz_filter samples, y[pos] = sum(s[pos+k] * filter[k]).
// Starts at state->offset, stores the first unprocessed position back into
// state->offset and returns the number of produced output samples.
int ARCHFUNCNAME(conv_float_oop)(
    const float * RESTRICT s, conv_buffer_state * RESTRICT state,
    const float * RESTRICT filter, const int sz_filter,
    float * RESTRICT y
    )
{
    const int first = state->offset;
    const int avail = state->size;
    int pos = first;

    // a result is only produced while the whole window fits into the buffer
    while (pos + sz_filter <= avail)
    {
        float sum = 0.0F;
        for (int tap = 0; tap < sz_filter; ++tap)
            sum += s[pos+tap] * filter[tap];
        // the output is written at the same index as the window start
        y[pos] = sum;
        ++pos;
    }

    state->offset = pos;
    return pos - first;
}
|
||||
|
||||
|
||||
/* Scalar (MIPP independent) convolution of complex samples with a real filter,
 * out of place.
 *
 * For every position p, starting at state->offset, for which a full window of
 * sz_filter samples is available below state->size, the real (.i) and the
 * imaginary (.q) parts of s_cplx[p .. p+sz_filter-1] are each weighted with
 * filter[] and summed up; the result is stored to y_cplx[p].
 * state->offset is advanced to the first position that was not computed.
 * Returns the number of produced (complex) output samples.
 */
int ARCHFUNCNAME(conv_cplx_float_oop)(
    const complexf * RESTRICT s_cplx, conv_buffer_state * RESTRICT state,
    const float * RESTRICT filter, const int sz_filter,
    complexf * RESTRICT y_cplx
    )
{
    const int off0 = state->offset;
    const int sz_s = state->size;
    int offset;

    for ( offset = off0; offset + sz_filter <= sz_s; ++offset)
    {
        float accu_re = 0.0F;
        float accu_im = 0.0F;
        for (int k = 0; k < sz_filter; ++k)
        {
            // accumulate over all taps: a plain assignment would keep only
            // the contribution of the last tap
            accu_re += s_cplx[offset+k].i * filter[k];   // accu += rS * rH;
            accu_im += s_cplx[offset+k].q * filter[k];   // accu += rS * rH;
        }
        y_cplx[offset].i = accu_re;  // sum of real parts
        y_cplx[offset].q = accu_im;  // sum of imag parts
    }

    state->offset = offset;
    return offset - off0;
}
|
||||
|
||||
|
||||
#elif defined(HAVE_MIPP)
|
||||
|
||||
|
||||
/* MIPP (SIMD) direct-form convolution, written back in place.
 *
 * For every position p, starting at state->offset, for which a full window of
 * sz_filter samples is available below state->size, the dot product of
 * s[p .. p+sz_filter-1] with filter[] replaces s[p].
 * sz_filter must be a multiple of mipp::N<float>() (asserted).
 * state->offset is advanced to the first position that was not computed.
 * Returns the number of produced output samples.
 */
int ARCHFUNCNAME(conv_float_inplace)(
    float * RESTRICT s, conv_buffer_state * RESTRICT state,
    const float * RESTRICT filter, const int sz_filter
    )
{
    assert( (sz_filter % mipp::N<float>()) == 0 );  // size of filter must be divisible by conv_float_simd_size()

    mipp::Reg<float> accu, rS, rH;
    const int off0 = state->offset;
    const int sz_s = state->size;
    int offset;

    for ( offset = off0; offset + sz_filter <= sz_s; ++offset)
    {
        accu.set0();
        for (int k = 0; k < sz_filter; k += mipp::N<float>())
        {
            // offset advances by single samples, so &s[offset+k] is generally
            // not aligned to the SIMD width: use the unaligned load,
            // as the out-of-place variants do
            rS.loadu(&s[offset+k]);
            rH.load(&filter[k]);  // k is a multiple of the SIMD width; assumes filter itself is suitably aligned - TODO confirm
            accu = mipp::fmadd(rS, rH, accu);  // accu += rS * rH;
        }
        s[offset] = accu.sum();  // == hadd()
    }

    state->offset = offset;
    return offset - off0;
}
|
||||
|
||||
|
||||
/* MIPP (SIMD) direct-form convolution, out of place.
 *
 * For every position p, starting at state->offset, for which a full window of
 * sz_filter samples is available below state->size, the dot product of
 * s[p .. p+sz_filter-1] with filter[] is stored to y[p].
 * sz_filter must be a multiple of mipp::N<float>() (asserted).
 * state->offset is advanced to the first position that was not computed.
 * Returns the number of produced output samples.
 */
int ARCHFUNCNAME(conv_float_oop)(
    const float * RESTRICT s, conv_buffer_state * RESTRICT state,
    const float * RESTRICT filter, const int sz_filter,
    float * RESTRICT y
    )
{
    assert( (sz_filter % mipp::N<float>()) == 0 );  // size of filter must be divisible by conv_float_simd_size()

    const int first = state->offset;
    const int n_samples = state->size;
    int pos = first;

    while (pos + sz_filter <= n_samples)
    {
        mipp::Reg<float> acc;
        acc.set0();
        for (int k = 0; k < sz_filter; k += mipp::N<float>())
        {
            mipp::Reg<float> sig, coef;
            sig.loadu(&s[pos+k]);   // signal window starts at an arbitrary sample
            coef.load(&filter[k]);
            acc = mipp::fmadd(sig, coef, acc);  // acc += sig * coef
        }
        y[pos] = acc.sum();  // horizontal add over all lanes
        ++pos;
    }

    state->offset = pos;
    return pos - first;
}
|
||||
|
||||
|
||||
/* MIPP (SIMD) convolution of complex samples with a real filter, out of place.
 *
 * The complex buffers are accessed as arrays of floats starting at the .i
 * member of their first element, so all internal indices count floats
 * (two per complex sample). Each filter register is interleaved with itself
 * so that every coefficient multiplies both floats of one complex sample.
 * sz_filter must be a multiple of mipp::N<float>() (asserted).
 * state->offset (in complex samples) is advanced to the first position that
 * was not computed. Returns the number of produced complex output samples.
 */
int ARCHFUNCNAME(conv_cplx_float_oop)(
    const complexf * RESTRICT s_cplx, conv_buffer_state * RESTRICT state,
    const float * RESTRICT filter, const int sz_filter,
    complexf * RESTRICT y_cplx
    )
{
    assert( (sz_filter % mipp::N<float>()) == 0 );  // size of filter must be divisible by conv_float_simd_size()
    const float * RESTRICT s = &(s_cplx[0].i);
    float * RESTRICT y = &(y_cplx[0].i);

    const int first = 2 * state->offset;    // all three values are counted in floats
    const int n_floats = 2 * state->size;
    const int span = 2 * sz_filter;
    int pos = first;

    while (pos + span <= n_floats)
    {
        mipp::Regx2<float> acc, sig, coef2, parts;
        acc.val[0].set0();
        acc.val[1].set0();
        for (int k = 0; k < sz_filter; k += mipp::N<float>())
        {
            mipp::Reg<float> coef;
            sig.loadu(&s[pos+2*k]);
            coef.load(&filter[k]);
            coef2 = mipp::interleave<float>(coef, coef);
            acc.val[0] = mipp::fmadd(sig.val[0], coef2.val[0], acc.val[0]);  // acc += sig * coef
            acc.val[1] = mipp::fmadd(sig.val[1], coef2.val[1], acc.val[1]);  // acc += sig * coef
        }
        parts = mipp::deinterleave(acc);
        y[pos]   = parts.val[0].sum();  // == hadd() == sum of real parts
        y[pos+1] = parts.val[1].sum();  // == hadd() == sum of imag parts
        pos += 2;
    }

    state->offset = pos / 2;
    return (pos - first) / 2;
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
// Function pointer table of this architecture specific translation unit.
// The initializers follow the member order of struct conv_f_ptrs.
static const conv_f_ptrs conv_ptrs =
{
    PP_TOSTRING(CONV_ARCH_POST),            // id
#if defined(MIPP_NO_INTRINSICS)
    0,                                      // using_mipp
#else
    1,                                      // using_mipp
#endif

    ARCHFUNCNAME(id),                       // fp_id
    ARCHFUNCNAME(conv_float_simd_size),     // fp_conv_float_simd_size

#if !defined(MIPP_NO_INTRINSICS) && !defined(HAVE_MIPP)
    // no implementation was compiled for this configuration
    nullptr,                                // fp_conv_float_move_rest
    nullptr,                                // fp_conv_float_inplace
    nullptr,                                // fp_conv_float_oop

    nullptr,                                // fp_conv_cplx_move_rest
    nullptr                                 // fp_conv_cplx_float_oop
#else
    ARCHFUNCNAME(conv_float_move_rest),     // fp_conv_float_move_rest
    ARCHFUNCNAME(conv_float_inplace),       // fp_conv_float_inplace
    ARCHFUNCNAME(conv_float_oop),           // fp_conv_float_oop

    ARCHFUNCNAME(conv_cplx_move_rest),      // fp_conv_cplx_move_rest
    ARCHFUNCNAME(conv_cplx_float_oop)       // fp_conv_cplx_float_oop
#endif
};
|
||||
|
||||
|
||||
/* Deliver the function pointer table of this architecture variant.
 *
 * Returns the address of the static table when
 *  - its id is "none", or
 *  - MIPP_NO_INTRINSICS is defined, or
 *  - HAVE_MIPP is defined, using_mipp is set and the reported SIMD size
 *    is greater than 1.
 * Returns nullptr in every other case.
 */
const conv_f_ptrs* ARCHFUNCNAME(conv_ptrs)()
{
    DPRINT("arch pointer for '%s':\n", conv_ptrs.id);
    if (strcmp(conv_ptrs.id, "none") == 0)
        return &conv_ptrs;

#if defined(MIPP_NO_INTRINSICS)
    DPRINT("arch pointer for '%s' - BUT defined(MIPP_NO_INTRINSICS)\n", conv_ptrs.id);
    return &conv_ptrs;
#elif defined(HAVE_MIPP)
    DPRINT("arch pointer for '%s' - defined(HAVE_MIPP)\n", conv_ptrs.id);
    DPRINT("'%s': conv_ptrs.using_mipp %d\n", conv_ptrs.id, conv_ptrs.using_mipp);
    DPRINT("'%s': simd_size() %d\n", conv_ptrs.id, conv_ptrs.fp_conv_float_simd_size());
    if (conv_ptrs.using_mipp && conv_ptrs.fp_conv_float_simd_size() > 1)
    {
        return &conv_ptrs;
    }
    // vectorization is not effective for this variant
    DPRINT("arch pointer for '%s': HAVE_MIPP BUT using_mipp %d, float_simd_size %d\n", conv_ptrs.id, conv_ptrs.using_mipp, conv_ptrs.fp_conv_float_simd_size());
#else
    DPRINT("arch pointer for '%s': neither MIPP_NO_INTRINSICS nor HAVE_MIPP\n", conv_ptrs.id);
#endif
    DPRINT("arch pointer for '%s' => nullptr\n", conv_ptrs.id);
    return nullptr;
}
|
||||
|
||||
#if defined(__cplusplus) && (__cplusplus >= 201703L)
|
||||
[[maybe_unused]]
|
||||
#endif
|
||||
static f_conv_ptrs test_f_ptrs = ARCHFUNCNAME(conv_ptrs);
|
||||
|
||||
109
pffft/pf_conv.h
Normal file
109
pffft/pf_conv.h
Normal file
@@ -0,0 +1,109 @@
|
||||
#pragma once
|
||||
|
||||
/* pf_conv.h/.cpp implements linear "slow" convolution.
|
||||
* this code is primarily for test/demonstration of runtime dispatching.
|
||||
* each "kernel" is compiled with different compiler/architecture options,
|
||||
* that activates different implementations in the MIPP headers.
|
||||
*
|
||||
* the dispatcher library 'pf_conv_dispatcher' collects (links against)
|
||||
* all the pf_conv_arch_<opt> libraries ..
|
||||
* and provides the get_all_conv_arch_ptrs() function,
|
||||
* which delivers an array of pointers to the struct (conv_f_ptrs)
|
||||
* containing the function pointers for the different implementations.
|
||||
*
|
||||
* requirement(s):
|
||||
* - installed MIPP headers
|
||||
* - compiler definitions for the different architecture types:
|
||||
* see CMakeLists.txt CONV_ARCH_MSVC_AMD64, CONV_ARCH_GCC_ARM32NEON, ..
|
||||
* - one cmake library target pf_conv_arch_<opt> for each architecture option.
|
||||
* each one gets its specific architecture/compiler options
|
||||
* utilizing the target_set_cxx_arch_option() macro in the CMakeLists.txt
|
||||
*/
|
||||
|
||||
#include "pf_cplx.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
# define RESTRICT __restrict
|
||||
#elif defined(__GNUC__)
|
||||
# define RESTRICT __restrict
|
||||
#else
|
||||
# define RESTRICT
|
||||
#endif
|
||||
|
||||
|
||||
struct conv_buffer_state
|
||||
{
|
||||
int offset; // sample index where data (to process) starts
|
||||
int size; // actual - or previous - size in amount of samples from buffer start (NOT offset)
|
||||
};
|
||||
|
||||
// declare provided function pointer types
|
||||
|
||||
typedef const char * (*f_conv_id)();
|
||||
|
||||
typedef int (*f_conv_float_simd_size)();
|
||||
|
||||
typedef void (*f_conv_float_move_rest)(float * RESTRICT s, conv_buffer_state * RESTRICT state);
|
||||
typedef void (*f_conv_cplx_move_rest)(complexf * RESTRICT s, conv_buffer_state * RESTRICT state);
|
||||
|
||||
typedef int (*f_conv_float_inplace)(
|
||||
float * RESTRICT s, conv_buffer_state * RESTRICT state,
|
||||
const float * RESTRICT filter, const int sz_filter
|
||||
);
|
||||
|
||||
typedef int (*f_conv_float_oop)(
|
||||
const float * RESTRICT s, conv_buffer_state * RESTRICT state,
|
||||
const float * RESTRICT filter, const int sz_filter,
|
||||
float * RESTRICT y
|
||||
);
|
||||
|
||||
typedef int (*f_conv_cplx_float_oop)(
|
||||
const complexf * RESTRICT s, conv_buffer_state * RESTRICT state,
|
||||
const float * RESTRICT filter, const int sz_filter,
|
||||
complexf * RESTRICT y
|
||||
);
|
||||
|
||||
|
||||
// struct with the provided function pointers
|
||||
struct conv_f_ptrs
|
||||
{
|
||||
const char * id;
|
||||
const int using_mipp;
|
||||
f_conv_id fp_id;
|
||||
f_conv_float_simd_size fp_conv_float_simd_size;
|
||||
|
||||
f_conv_float_move_rest fp_conv_float_move_rest;
|
||||
f_conv_float_inplace fp_conv_float_inplace;
|
||||
f_conv_float_oop fp_conv_float_oop;
|
||||
|
||||
f_conv_cplx_move_rest fp_conv_cplx_move_rest;
|
||||
f_conv_cplx_float_oop fp_conv_cplx_float_oop;
|
||||
};
|
||||
|
||||
typedef const conv_f_ptrs * ptr_to_conv_f_ptrs;
|
||||
|
||||
// function pointer type, delivering the struct with the function pointers
|
||||
typedef const conv_f_ptrs* (*f_conv_ptrs)();
|
||||
|
||||
|
||||
// helper for systematic function names
|
||||
#define CONV_FN_ARCH(FN, ARCH) FN##_##ARCH
|
||||
|
||||
// declare all functions - returning the structs with the function pointers
|
||||
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, none)(); // = conv_ptrs_none()
|
||||
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, dflt)(); // simd / mipp is activated
|
||||
|
||||
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, sse3)(); // = conv_ptrs_sse3()
|
||||
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, sse4)();
|
||||
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx)();
|
||||
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx2)();
|
||||
|
||||
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, sse2)();
|
||||
//extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx)(); // already declared
|
||||
//extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, avx2)(); // already declared
|
||||
|
||||
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, neon_vfpv4)(); // for armv7l / 32-bit ARM
|
||||
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, neon_rpi3_a53)();
|
||||
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, neon_rpi4_a72)();
|
||||
|
||||
extern const conv_f_ptrs* CONV_FN_ARCH(conv_ptrs, armv8a)(); // for aarch64
|
||||
61
pffft/pf_conv_dispatcher.cpp
Normal file
61
pffft/pf_conv_dispatcher.cpp
Normal file
@@ -0,0 +1,61 @@
|
||||
|
||||
#include "pf_conv_dispatcher.h"
|
||||
|
||||
#if 0
|
||||
#include <stdio.h>
|
||||
|
||||
#define DPRINT(...) fprintf(stderr, __VA_ARGS__)
|
||||
|
||||
#else
|
||||
#define DPRINT(...) do { } while (0)
|
||||
#endif
|
||||
|
||||
|
||||
#define N_DEFAULT_ARCHES 2
|
||||
// 0 is "none"
|
||||
// 1 "dflt"
|
||||
|
||||
/* Deliver the table of all convolution kernels that were compiled in.
 *
 * The table is built on the first call and kept in static storage.
 * Entry 0 is the "none" variant and entry 1 the "dflt" variant; the
 * architecture specific variants selected by the CONV_ARCH_* definition
 * follow. Each entry is whatever the respective conv_ptrs_<arch>() getter
 * returned.
 * NOTE(review): the getters may return nullptr, so callers should check the
 * entries before use - confirm against the callers.
 *
 * p_num_arch  optional; when non-null it receives the number of entries
 * returns     pointer to the first table entry
 */
ptr_to_conv_f_ptrs * get_all_conv_arch_ptrs(int * p_num_arch)
{
    static ptr_to_conv_f_ptrs * all_arches = nullptr;
    static int n_arch = 0;
    if (!all_arches)
    {
        // the architecture specific entries are appended behind the defaults
        n_arch = N_DEFAULT_ARCHES;
        // @TODO: runtime check if actual CPU supports specific architecture
#if defined(CONV_ARCH_GCC_AMD64)
        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+4] = {0};
        DPRINT("CONV_ARCH_GCC_AMD64: sse3, sse4, avx, avx2\n");
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, sse3)();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, sse4)();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx) ();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx2)();
#elif defined(CONV_ARCH_MSVC_AMD64)
        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+3] = {0};
        DPRINT("CONV_ARCH_MSVC_AMD64: sse2, avx, avx2\n");
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, sse2)();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx) ();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, avx2)();
#elif defined(CONV_ARCH_GCC_ARM32NEON)
        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+3] = {0};
        // the message names all three kernels registered below
        DPRINT("CONV_ARCH_GCC_ARM32NEON: neon_vfpv4, neon_rpi3_a53, neon_rpi4_a72\n");
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, neon_vfpv4)();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, neon_rpi3_a53)();
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, neon_rpi4_a72)();
#elif defined(CONV_ARCH_GCC_AARCH64)
        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES+1] = {0};
        DPRINT("CONV_ARCH_GCC_AARCH64: -\n");
        conv_arch_ptrs[n_arch++] = CONV_FN_ARCH(conv_ptrs, armv8a)();
#else
        static const conv_f_ptrs *conv_arch_ptrs[N_DEFAULT_ARCHES] = {0};
        DPRINT("unknown CONV_ARCH: -\n");
#endif
        // the two default entries exist for every architecture
        conv_arch_ptrs[0] = CONV_FN_ARCH(conv_ptrs, none)();
        conv_arch_ptrs[1] = CONV_FN_ARCH(conv_ptrs, dflt)();
        all_arches = conv_arch_ptrs;
    }
    if (p_num_arch)
        *p_num_arch = n_arch;
    return all_arches;
}
|
||||
|
||||
6
pffft/pf_conv_dispatcher.h
Normal file
6
pffft/pf_conv_dispatcher.h
Normal file
@@ -0,0 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include "pf_conv.h"
|
||||
|
||||
ptr_to_conv_f_ptrs * get_all_conv_arch_ptrs(int * p_num_arch);
|
||||
|
||||
44
pffft/pf_cplx.h
Normal file
44
pffft/pf_cplx.h
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
This software is part of pffft/pfdsp, a set of simple DSP routines.
|
||||
|
||||
Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
/*
|
||||
_____ _
|
||||
/ ____| | |
|
||||
| | ___ _ __ ___ _ __ | | _____ __
|
||||
| | / _ \| '_ ` _ \| '_ \| |/ _ \ \/ /
|
||||
| |___| (_) | | | | | | |_) | | __/> <
|
||||
\_____\___/|_| |_| |_| .__/|_|\___/_/\_\
|
||||
| |
|
||||
|_|
|
||||
*/
|
||||
|
||||
typedef struct complexf_s { float i; float q; } complexf;
|
||||
|
||||
1148
pffft/pf_mixer.cpp
Normal file
1148
pffft/pf_mixer.cpp
Normal file
File diff suppressed because it is too large
Load Diff
270
pffft/pf_mixer.h
Normal file
270
pffft/pf_mixer.h
Normal file
@@ -0,0 +1,270 @@
|
||||
/*
|
||||
This software is part of pffft/pfdsp, a set of simple DSP routines.
|
||||
|
||||
Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
|
||||
Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "pf_cplx.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
// =================================================================================
|
||||
|
||||
int have_sse_shift_mixer_impl();
|
||||
|
||||
|
||||
/*********************************************************************/
|
||||
|
||||
/**************/
|
||||
/*** ALGO A ***/
|
||||
/**************/
|
||||
|
||||
float shift_math_cc(const complexf *input, complexf* output, int input_size, float rate, float starting_phase);
|
||||
|
||||
|
||||
/*********************************************************************/
|
||||
|
||||
/**************/
|
||||
/*** ALGO B ***/
|
||||
/**************/
|
||||
|
||||
typedef struct shift_table_data_s
|
||||
{
|
||||
float* table;
|
||||
int table_size;
|
||||
} shift_table_data_t;
|
||||
|
||||
void shift_table_deinit(shift_table_data_t table_data);
|
||||
shift_table_data_t shift_table_init(int table_size);
|
||||
float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase);
|
||||
|
||||
/*********************************************************************/
|
||||
|
||||
/**************/
|
||||
/*** ALGO C ***/
|
||||
/**************/
|
||||
|
||||
typedef struct shift_addfast_data_s
|
||||
{
|
||||
float dsin[4];
|
||||
float dcos[4];
|
||||
float phase_increment;
|
||||
} shift_addfast_data_t;
|
||||
|
||||
shift_addfast_data_t shift_addfast_init(float rate);
|
||||
float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase);
|
||||
float shift_addfast_inp_c(complexf *in_out, int N_cplx, shift_addfast_data_t* d, float starting_phase);
|
||||
|
||||
|
||||
/*********************************************************************/
|
||||
|
||||
/**************/
|
||||
/*** ALGO D ***/
|
||||
/**************/
|
||||
|
||||
typedef struct shift_unroll_data_s
|
||||
{
|
||||
float* dsin;
|
||||
float* dcos;
|
||||
float phase_increment;
|
||||
int size;
|
||||
} shift_unroll_data_t;
|
||||
|
||||
shift_unroll_data_t shift_unroll_init(float rate, int size);
|
||||
void shift_unroll_deinit(shift_unroll_data_t* d);
|
||||
float shift_unroll_cc(complexf *input, complexf* output, int size, shift_unroll_data_t* d, float starting_phase);
|
||||
float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase);
|
||||
|
||||
|
||||
/*********************************************************************/
|
||||
|
||||
/**************/
|
||||
/*** ALGO E ***/
|
||||
/**************/
|
||||
|
||||
/* similar to shift_unroll_cc() - but, have fixed and limited precalc size
|
||||
* idea: smaller cache usage by table
|
||||
* size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ (= 4)
|
||||
*/
|
||||
#define PF_SHIFT_LIMITED_UNROLL_SIZE 128
|
||||
#define PF_SHIFT_LIMITED_SIMD_SZ 4
|
||||
|
||||
typedef struct shift_limited_unroll_data_s
|
||||
{
|
||||
float dcos[PF_SHIFT_LIMITED_UNROLL_SIZE];
|
||||
float dsin[PF_SHIFT_LIMITED_UNROLL_SIZE];
|
||||
complexf complex_phase;
|
||||
float phase_increment;
|
||||
} shift_limited_unroll_data_t;
|
||||
|
||||
shift_limited_unroll_data_t shift_limited_unroll_init(float rate);
|
||||
/* size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ */
|
||||
/* starting_phase for next call is kept internal in state */
|
||||
void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d);
|
||||
void shift_limited_unroll_inp_c(complexf* in_out, int size, shift_limited_unroll_data_t* d);
|
||||
|
||||
|
||||
/*********************************************************************/
|
||||
|
||||
/**************/
|
||||
/*** ALGO F ***/
|
||||
/**************/
|
||||
|
||||
typedef struct shift_limited_unroll_A_sse_data_s
|
||||
{
|
||||
/* small/limited trig table */
|
||||
float dcos[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
|
||||
float dsin[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
|
||||
/* 4 times complex phase */
|
||||
float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
|
||||
float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
|
||||
/* N_cplx_per_block times increment - for future parallel variants */
|
||||
float dcos_blk;
|
||||
float dsin_blk;
|
||||
/* */
|
||||
float phase_increment;
|
||||
} shift_limited_unroll_A_sse_data_t;
|
||||
|
||||
shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad);
|
||||
void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d);
|
||||
|
||||
|
||||
/*********************************************************************/
|
||||
|
||||
/**************/
|
||||
/*** ALGO G ***/
|
||||
/**************/
|
||||
|
||||
typedef struct shift_limited_unroll_B_sse_data_s
|
||||
{
|
||||
/* small/limited trig table */
|
||||
float dtrig[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
|
||||
/* 4 times complex phase */
|
||||
float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
|
||||
float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
|
||||
/* N_cplx_per_block times increment - for future parallel variants */
|
||||
float dcos_blk;
|
||||
float dsin_blk;
|
||||
/* */
|
||||
float phase_increment;
|
||||
} shift_limited_unroll_B_sse_data_t;
|
||||
|
||||
shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad);
|
||||
void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d);
|
||||
|
||||
/*********************************************************************/
|
||||
|
||||
/**************/
|
||||
/*** ALGO H ***/
|
||||
/**************/
|
||||
|
||||
typedef struct shift_limited_unroll_C_sse_data_s
|
||||
{
|
||||
/* small/limited trig table - interleaved: 4 cos, 4 sin, 4 cos, .. */
|
||||
float dinterl_trig[2*(PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ)];
|
||||
/* 4 times complex phase */
|
||||
float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
|
||||
float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
|
||||
/* N_cplx_per_block times increment - for future parallel variants */
|
||||
float dcos_blk;
|
||||
float dsin_blk;
|
||||
/* */
|
||||
float phase_increment;
|
||||
} shift_limited_unroll_C_sse_data_t;
|
||||
|
||||
shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad);
|
||||
void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d);
|
||||
|
||||
|
||||
|
||||
/*********************************************************************/
|
||||
|
||||
/**************/
|
||||
/*** ALGO I ***/
|
||||
/**************/
|
||||
|
||||
/* Recursive Quadrature Oscillator functions "recursive_osc"
|
||||
* see https://www.vicanek.de/articles/QuadOsc.pdf
|
||||
*/
|
||||
#define PF_SHIFT_RECURSIVE_SIMD_SZ 8
|
||||
typedef struct shift_recursive_osc_s
|
||||
{
|
||||
float u_cos[PF_SHIFT_RECURSIVE_SIMD_SZ];
|
||||
float v_sin[PF_SHIFT_RECURSIVE_SIMD_SZ];
|
||||
} shift_recursive_osc_t;
|
||||
|
||||
typedef struct shift_recursive_osc_conf_s
|
||||
{
|
||||
float k1;
|
||||
float k2;
|
||||
} shift_recursive_osc_conf_t;
|
||||
|
||||
void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state);
|
||||
void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
|
||||
|
||||
/* size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ */
|
||||
/* starting_phase for next call is kept internal in state */
|
||||
void shift_recursive_osc_cc(const complexf *input, complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
|
||||
void shift_recursive_osc_inp_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
|
||||
void gen_recursive_osc_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
|
||||
|
||||
/*********************************************************************/
|
||||
|
||||
/**************/
|
||||
/*** ALGO J ***/
|
||||
/**************/
|
||||
|
||||
#define PF_SHIFT_RECURSIVE_SIMD_SSE_SZ 4
|
||||
typedef struct shift_recursive_osc_sse_s
|
||||
{
|
||||
float u_cos[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ];
|
||||
float v_sin[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ];
|
||||
} shift_recursive_osc_sse_t;
|
||||
|
||||
typedef struct shift_recursive_osc_sse_conf_s
|
||||
{
|
||||
float k1;
|
||||
float k2;
|
||||
} shift_recursive_osc_sse_conf_t;
|
||||
|
||||
void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state);
|
||||
void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state);
|
||||
void shift_recursive_osc_sse_inp_c(complexf* in_out, int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
264
pffft/pffastconv.c
Normal file
264
pffft/pffastconv.c
Normal file
@@ -0,0 +1,264 @@
|
||||
/*
|
||||
Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de )
|
||||
*/
|
||||
|
||||
#include "pffastconv.h"
|
||||
#include "pffft.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
#define FASTCONV_DBG_OUT 0
|
||||
|
||||
|
||||
/* detect compiler flavour */
|
||||
#if defined(_MSC_VER)
#  define RESTRICT __restrict
#pragma warning( disable : 4244 4305 4204 4456 )
#elif defined(__GNUC__)
#  define RESTRICT __restrict
#else
/* unknown compiler: compile without the restrict qualifier instead of failing
 * on an undefined RESTRICT macro */
#  define RESTRICT
#endif
|
||||
|
||||
|
||||
/* Allocate nb_bytes via pffft_aligned_malloc(); release with pffastconv_free(). */
void *pffastconv_malloc(size_t nb_bytes)
{
  void * mem = pffft_aligned_malloc(nb_bytes);
  return mem;
}
|
||||
|
||||
/* Release memory obtained from pffastconv_malloc(); forwards to pffft_aligned_free(). */
void pffastconv_free(void *p)
{
  pffft_aligned_free( p );
  return;
}
|
||||
|
||||
/* Report the SIMD size as delivered by pffft_simd_size(). */
int pffastconv_simd_size()
{
  const int simdSz = pffft_simd_size();
  return simdSz;
}
|
||||
|
||||
|
||||
|
||||
struct PFFASTCONV_Setup
|
||||
{
|
||||
float * Xt; /* input == x in time domain - copy for alignment */
|
||||
float * Xf; /* input == X in freq domain */
|
||||
float * Hf; /* filterCoeffs == H in freq domain */
|
||||
float * Mf; /* input * filterCoeffs in freq domain */
|
||||
PFFFT_Setup *st;
|
||||
int filterLen; /* convolution length */
|
||||
int Nfft; /* FFT/block length */
|
||||
int flags;
|
||||
float scale;
|
||||
};
|
||||
|
||||
|
||||
PFFASTCONV_Setup * pffastconv_new_setup( const float * filterCoeffs, int filterLen, int * blockLen, int flags )
|
||||
{
|
||||
PFFASTCONV_Setup * s = NULL;
|
||||
const int cplxFactor = ( (flags & PFFASTCONV_CPLX_INP_OUT) && (flags & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 2 : 1;
|
||||
const int minFftLen = 2*pffft_simd_size()*pffft_simd_size();
|
||||
int i, Nfft = 2 * pffft_next_power_of_two(filterLen -1);
|
||||
#if FASTCONV_DBG_OUT
|
||||
const int iOldBlkLen = *blockLen;
|
||||
#endif
|
||||
|
||||
if ( Nfft < minFftLen )
|
||||
Nfft = minFftLen;
|
||||
|
||||
if ( flags & PFFASTCONV_CPLX_FILTER )
|
||||
return NULL;
|
||||
|
||||
s = pffastconv_malloc( sizeof(struct PFFASTCONV_Setup) );
|
||||
|
||||
if ( *blockLen > Nfft ) {
|
||||
Nfft = *blockLen;
|
||||
Nfft = pffft_next_power_of_two(Nfft);
|
||||
}
|
||||
*blockLen = Nfft; /* this is in (complex) samples */
|
||||
|
||||
Nfft *= cplxFactor;
|
||||
|
||||
if ( (flags & PFFASTCONV_DIRECT_INP) && !(flags & PFFASTCONV_CPLX_INP_OUT) )
|
||||
s->Xt = NULL;
|
||||
else
|
||||
s->Xt = pffastconv_malloc((unsigned)Nfft * sizeof(float));
|
||||
s->Xf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
|
||||
s->Hf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
|
||||
s->Mf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
|
||||
s->st = pffft_new_setup(Nfft, PFFFT_REAL); /* with complex: we do 2 x fft() */
|
||||
s->filterLen = filterLen; /* filterLen == convolution length == length of impulse response */
|
||||
if ( cplxFactor == 2 )
|
||||
s->filterLen = 2 * filterLen - 1;
|
||||
s->Nfft = Nfft; /* FFT/block length */
|
||||
s->flags = flags;
|
||||
s->scale = (float)( 1.0 / Nfft );
|
||||
|
||||
memset( s->Xt, 0, (unsigned)Nfft * sizeof(float) );
|
||||
if ( flags & PFFASTCONV_CORRELATION ) {
|
||||
for ( i = 0; i < filterLen; ++i )
|
||||
s->Xt[ ( Nfft - cplxFactor * i ) & (Nfft -1) ] = filterCoeffs[ i ];
|
||||
} else {
|
||||
for ( i = 0; i < filterLen; ++i )
|
||||
s->Xt[ ( Nfft - cplxFactor * i ) & (Nfft -1) ] = filterCoeffs[ filterLen - 1 - i ];
|
||||
}
|
||||
|
||||
pffft_transform(s->st, s->Xt, s->Hf, /* tmp = */ s->Mf, PFFFT_FORWARD);
|
||||
|
||||
#if FASTCONV_DBG_OUT
|
||||
printf("\n fastConvSetup(filterLen = %d, blockLen %d) --> blockLen %d, OutLen = %d\n"
|
||||
, filterLen, iOldBlkLen, *blockLen, Nfft - filterLen +1 );
|
||||
#endif
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
|
||||
/* Release a setup created by pffastconv_new_setup(): the FFT setup, all
 * work buffers and the structure itself. A NULL setup is ignored. */
void pffastconv_destroy_setup( PFFASTCONV_Setup * s )
{
  if ( s != NULL )
  {
    pffft_destroy_setup(s->st);
    pffastconv_free(s->Mf);
    pffastconv_free(s->Hf);
    pffastconv_free(s->Xf);
    /* the time domain buffer is optional */
    if ( s->Xt != NULL )
      pffastconv_free(s->Xt);
    pffastconv_free(s);
  }
}
|
||||
|
||||
|
||||
/*
  Apply the prepared filter to input_ block by block and write the
  result to output_.

  s             setup holding the FFT setup, the transformed filter (Hf)
                and the work buffers Xt / Xf / Mf
  input_        input samples; interleaved re/im floats when
                PFFASTCONV_CPLX_INP_OUT is set
  cplxInputLen  number of input samples (complex samples for complex input)
  output_       destination for the produced samples
  applyFlush    non-zero: also process a trailing block shorter than the
                FFT length (zero padded, unless the input is used directly)

  Returns the value of the block offset when the loop ends, in units of
  input samples: the number of samples consumed == produced.

  Loop bound, per mode:
    applyFlush != 0:  inputLen - off - filterLen + 1 > 0
                 <=>  off < inputLen - filterLen + 1
    applyFlush == 0:  inputLen - off >= Nfft
                 <=>  off < inputLen - Nfft + 1
*/
int pffastconv_apply(PFFASTCONV_Setup * s, const float *input_, int cplxInputLen, float *output_, int applyFlush)
{
  const float * RESTRICT src = input_;
  float * RESTRICT dst = output_;
  const int fftLen = s->Nfft;
  const int firLen = s->filterLen;
  const int flags = s->flags;
  const int isCplx = (flags & PFFASTCONV_CPLX_INP_OUT) ? 1 : 0;
  /* 2: interleaved complex data is pushed through one single real FFT */
  const int cplxFactor = ( isCplx && (flags & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 2 : 1;
  const int inputLen = cplxFactor * cplxInputLen;
  int maxOff;
  int off, chunk, produced = 0, k, part, base;

  if ( applyFlush )
    maxOff = inputLen - firLen + 1;
  else
    maxOff = inputLen - fftLen + 1;

  if ( cplxFactor == 2 )
  {
#if 0
    printf( "*** inputLen %d, filterLen %d, Nfft %d => maxOff %d\n", inputLen, firLen, fftLen, maxOff);
#endif
    for ( off = 0; off < maxOff; off += produced )
    {
      /* process at most one FFT block per iteration */
      chunk = inputLen - off;
      if ( chunk > fftLen )
        chunk = fftLen;

      /* clear the lowest bit: only an even number of floats is emitted */
      produced = ( chunk - firLen + 1 ) & ( ~1 );
      if ( produced == 0 )
        break;
#if 0
      if (!off)
        printf("*** inpOff = %d, numOut = %d\n", off, produced);
      if (off + firLen + 2 >= maxOff )
        printf("*** inpOff = %d, inpOff + numOut = %d\n", off, off + produced);
#endif

      if ( !(flags & PFFASTCONV_DIRECT_INP) )
      {
        /* stage the block in Xt and zero pad it up to the FFT length */
        memcpy( s->Xt, src + off, (unsigned)chunk * sizeof(float) );
        if ( chunk < fftLen )
          memset( s->Xt + chunk, 0, (unsigned)(fftLen - chunk) * sizeof(float) );

        pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
      }
      else
      {
        /* transform straight from the caller's buffer */
        pffft_transform(s->st, src + off, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
      }

      /* Mf = Xf * Hf * scale */
      pffft_zconvolve_no_accu(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale);

      if ( !(flags & PFFASTCONV_DIRECT_OUT) )
      {
        pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
        memcpy( dst + off, s->Xf, (unsigned)produced * sizeof(float) );
      }
      else
      {
        /* transform straight into the caller's buffer */
        pffft_transform(s->st, s->Mf, dst + off, s->Xf, PFFFT_BACKWARD);
      }
    }
    return off / cplxFactor;
  }
  else
  {
    /* complex data without the single-FFT hint: one pass per component */
    const int numParts = isCplx ? 2 : 1;

    for ( off = 0; off < maxOff; off += produced )
    {
      chunk = inputLen - off;
      if ( chunk > fftLen )
        chunk = fftLen;
      produced = chunk - firLen + 1;

      for ( part = 0; part < numParts; ++part )  /* iterate per real/imag component */
      {
        if ( isCplx )
        {
          /* de-interleave the selected component into Xt */
          base = 2 * off + part;
          for ( k = 0; k < chunk; ++k )
            s->Xt[k] = src[base + 2 * k];
          if ( chunk < fftLen )
            memset( s->Xt + chunk, 0, (unsigned)(fftLen - chunk) * sizeof(float) );

          pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);

          pffft_zconvolve_no_accu(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale);

          pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);

          /* re-interleave the component into the output */
          base = 2 * off + part;
          for ( k = 0; k < produced; ++k )
            dst[ base + 2 * k ] = s->Xf[k];
        }
        else
        {
          if ( flags & PFFASTCONV_DIRECT_INP )
          {
            pffft_transform(s->st, src + off, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
          }
          else
          {
            memcpy( s->Xt, src + off, (unsigned)chunk * sizeof(float) );
            if ( chunk < fftLen )
              memset( s->Xt + chunk, 0, (unsigned)(fftLen - chunk) * sizeof(float) );

            pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
          }

          pffft_zconvolve_no_accu(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale);

          if ( flags & PFFASTCONV_DIRECT_OUT )
          {
            pffft_transform(s->st, s->Mf, dst + off, s->Xf, PFFFT_BACKWARD);
          }
          else
          {
            pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
            memcpy( dst + off, s->Xf, (unsigned)produced * sizeof(float) );
          }
        }
      }
    }

    return off;
  }
}
|
||||
|
||||
171
pffft/pffastconv.h
Normal file
171
pffft/pffastconv.h
Normal file
@@ -0,0 +1,171 @@
|
||||
/* Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of PFFFT, PFFASTCONV, nor the names of its
|
||||
sponsors or contributors may be used to endorse or promote products
|
||||
derived from this Software without specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
PFFASTCONV : a Pretty Fast Fast Convolution
|
||||
|
||||
This is basically the implementation of fast convolution,
|
||||
utilizing the FFT (pffft).
|
||||
|
||||
Restrictions:
|
||||
|
||||
- 1D transforms only, with 32-bit single precision.
|
||||
|
||||
- all (float*) pointers in the functions below are expected to
|
||||
have an "simd-compatible" alignment, that is 16 bytes on x86 and
|
||||
powerpc CPUs.
|
||||
|
||||
You can allocate such buffers with the functions
|
||||
pffft_aligned_malloc / pffft_aligned_free (or with stuff like
|
||||
posix_memalign..)
|
||||
|
||||
*/
|
||||
|
||||
#ifndef PFFASTCONV_H
|
||||
#define PFFASTCONV_H
|
||||
|
||||
#include <stddef.h> /* for size_t */
|
||||
#include "pffft.h"
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* opaque struct holding internal stuff
|
||||
this struct can't be shared by many threads as it contains
|
||||
temporary data, computed within the convolution
|
||||
*/
|
||||
typedef struct PFFASTCONV_Setup PFFASTCONV_Setup;
|
||||
|
||||
typedef enum {
|
||||
PFFASTCONV_CPLX_INP_OUT = 1,
|
||||
/* set when input and output is complex,
|
||||
* with real and imag part interleaved in both vectors.
|
||||
* input[] has inputLen complex values: 2 * inputLen floats,
|
||||
* output[] is also written with complex values.
|
||||
* without this flag, the input is interpreted as real vector
|
||||
*/
|
||||
|
||||
PFFASTCONV_CPLX_FILTER = 2,
|
||||
/* set when filterCoeffs is complex,
|
||||
* with real and imag part interleaved.
|
||||
* filterCoeffs[] has filterLen complex values: 2 * filterLen floats
|
||||
* without this flag, the filter is interpreted as real vector
|
||||
* ATTENTION: this is not implemented yet!
|
||||
*/
|
||||
|
||||
PFFASTCONV_DIRECT_INP = 4,
|
||||
/* set PFFASTCONV_DIRECT_INP only, when following conditions are met:
|
||||
* 1- input vector X must be aligned
|
||||
* 2- (all) inputLen <= output blockLen
|
||||
* 3- X must have minimum length of output BlockLen
|
||||
* 4- the additional samples from inputLen .. BlockLen-1
|
||||
* must contain valid small and non-NAN samples (ideally zero)
|
||||
*
|
||||
* this option is ignored when PFFASTCONV_CPLX_INP_OUT is set
|
||||
*/
|
||||
|
||||
PFFASTCONV_DIRECT_OUT = 8,
|
||||
/* set PFFASTCONV_DIRECT_OUT only when following conditions are met:
|
||||
* 1- output vector Y must be aligned
|
||||
* 2- (all) inputLen <= output blockLen
|
||||
* 3- Y must have minimum length of output blockLen
|
||||
*
|
||||
* this option is ignored when PFFASTCONV_CPLX_INP_OUT is set
|
||||
*/
|
||||
|
||||
PFFASTCONV_CPLX_SINGLE_FFT = 16,
|
||||
/* hint to process complex data with one single FFT;
|
||||
* default is to use 2 FFTs: one for real part, one for imag part
|
||||
* */
|
||||
|
||||
|
||||
PFFASTCONV_SYMMETRIC = 32,
|
||||
/* just informal, that filter is symmetric .. and filterLen is multiple of 8 */
|
||||
|
||||
PFFASTCONV_CORRELATION = 64,
|
||||
/* filterCoeffs[] of pffastconv_new_setup are for correlation;
|
||||
* thus, do not flip them for the internal fft calculation
|
||||
* - as necessary for the fast convolution */
|
||||
|
||||
} pffastconv_flags_t;
|
||||
|
||||
/*
|
||||
prepare for performing fast convolution(s) of 'filterLen' with input 'blockLen'.
|
||||
The output 'blockLen' might be bigger to allow the fast convolution.
|
||||
|
||||
'flags' are bitmask over the 'pffastconv_flags_t' enum.
|
||||
|
||||
PFFASTCONV_Setup structure can't be shared across multiple filters
|
||||
or concurrent threads.
|
||||
*/
|
||||
PFFASTCONV_Setup * pffastconv_new_setup( const float * filterCoeffs, int filterLen, int * blockLen, int flags );
|
||||
|
||||
void pffastconv_destroy_setup(PFFASTCONV_Setup *);
|
||||
|
||||
/*
|
||||
Perform the fast convolution.
|
||||
|
||||
'input' and 'output' don't need to be aligned - unless any of
|
||||
PFFASTCONV_DIRECT_INP or PFFASTCONV_DIRECT_OUT is set in 'flags'.
|
||||
|
||||
inputLen > output 'blockLen' (from pffastconv_new_setup()) is allowed.
|
||||
in this case, multiple FFTs are called internally, to process the
|
||||
input[].
|
||||
|
||||
'output' vector must have size >= (inputLen - filterLen + 1)
|
||||
|
||||
set bool option 'applyFlush' to process the full input[].
|
||||
with this option, 'tail samples' of input are also processed.
|
||||
This might be inefficient, because the FFT is called to produce
|
||||
few(er) output samples, than possible.
|
||||
This option is useful to process the last samples of an input (file)
|
||||
or to reduce latency.
|
||||
|
||||
return value is the number of produced samples in output[].
|
||||
the same amount of samples is processed from input[]. to continue
|
||||
processing, the caller must save/move the remaining samples of
|
||||
input[].
|
||||
|
||||
*/
|
||||
int pffastconv_apply(PFFASTCONV_Setup * s, const float *input, int inputLen, float *output, int applyFlush);
|
||||
|
||||
void *pffastconv_malloc(size_t nb_bytes);
|
||||
void pffastconv_free(void *);
|
||||
|
||||
/* return 4 or 1, depending on whether support for SSE/Altivec instructions was enabled when building pffft.c */
|
||||
int pffastconv_simd_size();
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* PFFASTCONV_H */
|
||||
134
pffft/pffft.c
Normal file
134
pffft/pffft.c
Normal file
@@ -0,0 +1,134 @@
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
|
||||
|
||||
Based on original fortran 77 code from FFTPACKv4 from NETLIB
|
||||
(http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
|
||||
of NCAR, in 1985.
|
||||
|
||||
As confirmed by the NCAR fftpack software curators, the following
|
||||
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
|
||||
released under the same terms.
|
||||
|
||||
FFTPACK license:
|
||||
|
||||
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
|
||||
|
||||
Copyright (c) 2004 the University Corporation for Atmospheric
|
||||
Research ("UCAR"). All rights reserved. Developed by NCAR's
|
||||
Computational and Information Systems Laboratory, UCAR,
|
||||
www.cisl.ucar.edu.
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
|
||||
|
||||
PFFFT : a Pretty Fast FFT.
|
||||
|
||||
This file is largely based on the original FFTPACK implementation, modified in
|
||||
order to take advantage of SIMD instructions of modern CPUs.
|
||||
*/
|
||||
|
||||
/*
|
||||
ChangeLog:
|
||||
- 2011/10/02, version 1: This is the very first release of this file.
|
||||
*/
|
||||
|
||||
#include "pffft.h"
|
||||
|
||||
/* detect compiler flavour */
|
||||
#if defined(_MSC_VER)
|
||||
# define COMPILER_MSVC
|
||||
#elif defined(__GNUC__)
|
||||
# define COMPILER_GCC
|
||||
#endif
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
|
||||
#if defined(COMPILER_GCC)
|
||||
# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
|
||||
# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
|
||||
# define RESTRICT __restrict
|
||||
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
|
||||
#elif defined(COMPILER_MSVC)
|
||||
# define ALWAYS_INLINE(return_type) __forceinline return_type
|
||||
# define NEVER_INLINE(return_type) __declspec(noinline) return_type
|
||||
# define RESTRICT __restrict
|
||||
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef COMPILER_MSVC
|
||||
#pragma warning( disable : 4244 4305 4204 4456 )
|
||||
#endif
|
||||
|
||||
/*
|
||||
vector support macros: the rest of the code is independent of
|
||||
SSE/Altivec/NEON -- adding support for other platforms with 4-element
|
||||
vectors should be limited to these macros
|
||||
*/
|
||||
#include "simd/pf_float.h"
|
||||
|
||||
/* have code comparable with this definition */
|
||||
#define SETUP_STRUCT PFFFT_Setup
|
||||
#define FUNC_NEW_SETUP pffft_new_setup
|
||||
#define FUNC_DESTROY pffft_destroy_setup
|
||||
#define FUNC_TRANSFORM_UNORDRD pffft_transform
|
||||
#define FUNC_TRANSFORM_ORDERED pffft_transform_ordered
|
||||
#define FUNC_ZREORDER pffft_zreorder
|
||||
#define FUNC_ZCONVOLVE_ACCUMULATE pffft_zconvolve_accumulate
|
||||
#define FUNC_ZCONVOLVE_NO_ACCU pffft_zconvolve_no_accu
|
||||
|
||||
#define FUNC_ALIGNED_MALLOC pffft_aligned_malloc
|
||||
#define FUNC_ALIGNED_FREE pffft_aligned_free
|
||||
#define FUNC_SIMD_SIZE pffft_simd_size
|
||||
#define FUNC_MIN_FFT_SIZE pffft_min_fft_size
|
||||
#define FUNC_IS_VALID_SIZE pffft_is_valid_size
|
||||
#define FUNC_NEAREST_SIZE pffft_nearest_transform_size
|
||||
#define FUNC_SIMD_ARCH pffft_simd_arch
|
||||
#define FUNC_VALIDATE_SIMD_A validate_pffft_simd
|
||||
#define FUNC_VALIDATE_SIMD_EX validate_pffft_simd_ex
|
||||
|
||||
#define FUNC_CPLX_FINALIZE pffft_cplx_finalize
|
||||
#define FUNC_CPLX_PREPROCESS pffft_cplx_preprocess
|
||||
#define FUNC_REAL_PREPROCESS_4X4 pffft_real_preprocess_4x4
|
||||
#define FUNC_REAL_PREPROCESS pffft_real_preprocess
|
||||
#define FUNC_REAL_FINALIZE_4X4 pffft_real_finalize_4x4
|
||||
#define FUNC_REAL_FINALIZE pffft_real_finalize
|
||||
#define FUNC_TRANSFORM_INTERNAL pffft_transform_internal
|
||||
|
||||
#define FUNC_COS cosf
|
||||
#define FUNC_SIN sinf
|
||||
|
||||
|
||||
#include "pffft_priv_impl.h"
|
||||
|
||||
|
||||
241
pffft/pffft.h
Normal file
241
pffft/pffft.h
Normal file
@@ -0,0 +1,241 @@
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Based on original fortran 77 code from FFTPACKv4 from NETLIB,
|
||||
authored by Dr Paul Swarztrauber of NCAR, in 1985.
|
||||
|
||||
As confirmed by the NCAR fftpack software curators, the following
|
||||
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
|
||||
released under the same terms.
|
||||
|
||||
FFTPACK license:
|
||||
|
||||
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
|
||||
|
||||
Copyright (c) 2004 the University Corporation for Atmospheric
|
||||
Research ("UCAR"). All rights reserved. Developed by NCAR's
|
||||
Computational and Information Systems Laboratory, UCAR,
|
||||
www.cisl.ucar.edu.
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
PFFFT : a Pretty Fast FFT.
|
||||
|
||||
This is basically an adaptation of the single precision fftpack
|
||||
(v4) as found on netlib taking advantage of SIMD instruction found
|
||||
on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
|
||||
|
||||
For architectures where no SIMD instruction is available, the code
|
||||
falls back to a scalar version.
|
||||
|
||||
Restrictions:
|
||||
|
||||
- 1D transforms only, with 32-bit single precision.
|
||||
|
||||
- supports only transforms for inputs of length N of the form
|
||||
N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
|
||||
144, 160, etc are all acceptable lengths). Performance is best for
|
||||
128<=N<=8192.
|
||||
|
||||
- all (float*) pointers in the functions below are expected to
|
||||
have an "simd-compatible" alignment, that is 16 bytes on x86 and
|
||||
powerpc CPUs.
|
||||
|
||||
You can allocate such buffers with the functions
|
||||
pffft_aligned_malloc / pffft_aligned_free (or with stuff like
|
||||
posix_memalign..)
|
||||
|
||||
*/
|
||||
|
||||
#ifndef PFFFT_H
|
||||
#define PFFFT_H
|
||||
|
||||
#include <stddef.h> /* for size_t */
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* opaque struct holding internal stuff (precomputed twiddle factors)
|
||||
this struct can be shared by many threads as it contains only
|
||||
read-only data.
|
||||
*/
|
||||
typedef struct PFFFT_Setup PFFFT_Setup;
|
||||
|
||||
#ifndef PFFFT_COMMON_ENUMS
|
||||
#define PFFFT_COMMON_ENUMS
|
||||
|
||||
/* direction of the transform */
|
||||
typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
|
||||
|
||||
/* type of transform */
|
||||
typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
prepare for performing transforms of size N -- the returned
|
||||
PFFFT_Setup structure is read-only so it can safely be shared by
|
||||
multiple concurrent threads.
|
||||
*/
|
||||
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
|
||||
void pffft_destroy_setup(PFFFT_Setup *);
|
||||
/*
|
||||
Perform a Fourier transform , The z-domain data is stored in the
|
||||
most efficient order for transforming it back, or using it for
|
||||
convolution. If you need to have its content sorted in the
|
||||
"usual" way, that is as an array of interleaved complex numbers,
|
||||
either use pffft_transform_ordered , or call pffft_zreorder after
|
||||
the forward fft, and before the backward fft.
|
||||
|
||||
Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
|
||||
Typically you will want to scale the backward transform by 1/N.
|
||||
|
||||
The 'work' pointer should point to an area of N (2*N for complex
|
||||
fft) floats, properly aligned. If 'work' is NULL, then stack will
|
||||
be used instead (this is probably the best strategy for small
|
||||
FFTs, say for N < 16384). Threads usually have a small stack, so
|
||||
there may not be a sufficient amount of memory, usually leading to a crash!
|
||||
Use the heap with pffft_aligned_malloc() in this case.
|
||||
|
||||
For a real forward transform (PFFFT_REAL | PFFFT_FORWARD) with real
|
||||
input with input(=transformation) length N, the output array is
|
||||
'mostly' complex:
|
||||
index k in 1 .. N/2 -1 corresponds to frequency k * Samplerate / N
|
||||
index k == 0 is a special case:
|
||||
the real() part contains the result for the DC frequency 0,
|
||||
the imag() part contains the result for the Nyquist frequency Samplerate/2
|
||||
both 0-frequency and half frequency components, which are real,
|
||||
are assembled in the first entry as F(0)+i*F(N/2).
|
||||
With the output size N/2 complex values (=N real/imag values), it is
|
||||
obvious, that the result for negative frequencies are not output,
|
||||
because of symmetry.
|
||||
|
||||
input and output may alias.
|
||||
*/
|
||||
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
|
||||
|
||||
/*
|
||||
Similar to pffft_transform, but makes sure that the output is
|
||||
ordered as expected (interleaved complex numbers). This is
|
||||
similar to calling pffft_transform and then pffft_zreorder.
|
||||
|
||||
input and output may alias.
|
||||
*/
|
||||
void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
|
||||
|
||||
/*
|
||||
call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
|
||||
PFFFT_FORWARD) if you want to have the frequency components in
|
||||
the correct "canonical" order, as interleaved complex numbers.
|
||||
|
||||
(for real transforms, both 0-frequency and half frequency
|
||||
components, which are real, are assembled in the first entry as
|
||||
F(0)+i*F(n/2+1). Note that the original fftpack did place
|
||||
F(n/2+1) at the end of the arrays).
|
||||
|
||||
input and output should not alias.
|
||||
*/
|
||||
void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
|
||||
|
||||
/*
|
||||
Perform a multiplication of the frequency components of dft_a and
|
||||
dft_b and accumulate them into dft_ab. The arrays should have
|
||||
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
|
||||
*not* have been reordered with pffft_zreorder (otherwise just
|
||||
perform the operation yourself as the dft coefs are stored as
|
||||
interleaved complex numbers).
|
||||
|
||||
the operation performed is: dft_ab += (dft_a * dft_b)*scaling
|
||||
|
||||
The dft_a, dft_b and dft_ab pointers may alias.
|
||||
*/
|
||||
void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
|
||||
|
||||
/*
|
||||
Perform a multiplication of the frequency components of dft_a and
|
||||
dft_b and put result in dft_ab. The arrays should have
|
||||
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
|
||||
*not* have been reordered with pffft_zreorder (otherwise just
|
||||
perform the operation yourself as the dft coefs are stored as
|
||||
interleaved complex numbers).
|
||||
|
||||
the operation performed is: dft_ab = (dft_a * dft_b)*scaling
|
||||
|
||||
The dft_a, dft_b and dft_ab pointers may alias.
|
||||
*/
|
||||
void pffft_zconvolve_no_accu(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
|
||||
|
||||
/* return 4 or 1, depending on whether support for SSE/NEON/Altivec instructions was enabled when building pffft.c */
|
||||
int pffft_simd_size();
|
||||
|
||||
/* return string identifier of used architecture (SSE/NEON/Altivec/..) */
|
||||
const char * pffft_simd_arch();
|
||||
|
||||
|
||||
/* following functions are identical to the pffftd_ functions */
|
||||
|
||||
/* simple helper to get minimum possible fft size */
|
||||
int pffft_min_fft_size(pffft_transform_t transform);
|
||||
|
||||
/* simple helper to determine next power of 2
|
||||
- without inexact/rounding floating point operations
|
||||
*/
|
||||
int pffft_next_power_of_two(int N);
|
||||
|
||||
/* simple helper to determine if power of 2 - returns bool */
|
||||
int pffft_is_power_of_two(int N);
|
||||
|
||||
/* simple helper to determine size N is valid
|
||||
- factorizable to pffft_min_fft_size() with factors 2, 3, 5
|
||||
returns bool
|
||||
*/
|
||||
int pffft_is_valid_size(int N, pffft_transform_t cplx);
|
||||
|
||||
/* determine nearest valid transform size (by brute-force testing)
|
||||
- factorizable to pffft_min_fft_size() with factors 2, 3, 5.
|
||||
higher: bool-flag to find nearest higher value; else lower.
|
||||
*/
|
||||
int pffft_nearest_transform_size(int N, pffft_transform_t cplx, int higher);
|
||||
|
||||
/*
|
||||
the float buffers must have the correct alignment (16-byte boundary
|
||||
on intel and powerpc). This function may be used to obtain such
|
||||
correctly aligned buffers.
|
||||
*/
|
||||
void *pffft_aligned_malloc(size_t nb_bytes);
|
||||
void pffft_aligned_free(void *);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* PFFFT_H */
|
||||
|
||||
1060
pffft/pffft.hpp
Normal file
1060
pffft/pffft.hpp
Normal file
File diff suppressed because it is too large
Load Diff
53
pffft/pffft_common.c
Normal file
53
pffft/pffft_common.c
Normal file
@@ -0,0 +1,53 @@
|
||||
|
||||
#include "pffft.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
/* SSE and co like 16-bytes aligned pointers
 * with a 64-byte alignment, we are even aligned on L2 cache lines... */
#define MALLOC_V4SF_ALIGNMENT 64

/*
  Allocate nb_bytes of memory aligned to MALLOC_V4SF_ALIGNMENT.

  The request is over-allocated by MALLOC_V4SF_ALIGNMENT bytes; the
  pointer returned by malloc() is stored in the slot directly in front
  of the aligned address, where Valigned_free() reads it back.

  Returns NULL when malloc() fails or when nb_bytes is so large that
  adding the alignment padding would wrap around size_t.
*/
static void * Valigned_malloc(size_t nb_bytes) {
  void *p, *p0;
  /* guard the padding addition: a wrapped size would yield a tiny
     block that is handed out as if it were nb_bytes long */
  if (nb_bytes > (size_t) -1 - MALLOC_V4SF_ALIGNMENT) return (void *) 0;
  p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
  if (!p0) return (void *) 0;
  /* step past the padding, then round down to the alignment boundary */
  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
  /* remember the original pointer for the matching free */
  *((void **) p - 1) = p0;
  return p;
}
|
||||
|
||||
/*
  Free a buffer obtained from Valigned_malloc(). The pointer originally
  returned by malloc() is read from the slot directly in front of p.
  A NULL pointer is ignored.
*/
static void Valigned_free(void *p) {
  void **slot;
  if (!p)
    return;
  slot = (void **) p - 1;
  free(*slot);
}
|
||||
|
||||
|
||||
/*
  Round N up to the next power of two; a value that already is a power
  of two is returned unchanged.
  https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
*/
static int next_power_of_two(int N) {
  /* compute the next highest power of 2 of 32-bit v */
  unsigned v = (unsigned) N - 1u;
  unsigned shift;
  /* smear the highest set bit into every lower position */
  for (shift = 1u; shift <= 16u; shift <<= 1)
    v |= v >> shift;
  return (int) (v + 1u);
}
|
||||
|
||||
/*
  Return 1 when N is a power of two, 0 otherwise (including N == 0).
  https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2
*/
static int is_power_of_two(int N) {
  if (N == 0)
    return 0;
  /* a power of two has a single bit set, which N - 1 clears */
  return (N & (N - 1)) == 0;
}
|
||||
|
||||
|
||||
|
||||
/* public single-precision entry points: thin forwards to the shared static helpers */

void *pffft_aligned_malloc(size_t nb_bytes)
{
  return Valigned_malloc(nb_bytes);
}

void pffft_aligned_free(void *p)
{
  Valigned_free(p);
}

int pffft_next_power_of_two(int N)
{
  return next_power_of_two(N);
}

int pffft_is_power_of_two(int N)
{
  return is_power_of_two(N);
}
|
||||
|
||||
/* public double-precision (pffftd_) entry points: same shared static helpers */

void *pffftd_aligned_malloc(size_t nb_bytes)
{
  return Valigned_malloc(nb_bytes);
}

void pffftd_aligned_free(void *p)
{
  Valigned_free(p);
}

int pffftd_next_power_of_two(int N)
{
  return next_power_of_two(N);
}

int pffftd_is_power_of_two(int N)
{
  return is_power_of_two(N);
}
|
||||
147
pffft/pffft_double.c
Normal file
147
pffft/pffft_double.c
Normal file
@@ -0,0 +1,147 @@
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
|
||||
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
|
||||
|
||||
Based on original fortran 77 code from FFTPACKv4 from NETLIB
|
||||
(http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
|
||||
of NCAR, in 1985.
|
||||
|
||||
As confirmed by the NCAR fftpack software curators, the following
|
||||
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
|
||||
released under the same terms.
|
||||
|
||||
FFTPACK license:
|
||||
|
||||
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
|
||||
|
||||
Copyright (c) 2004 the University Corporation for Atmospheric
|
||||
Research ("UCAR"). All rights reserved. Developed by NCAR's
|
||||
Computational and Information Systems Laboratory, UCAR,
|
||||
www.cisl.ucar.edu.
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
|
||||
|
||||
PFFFT : a Pretty Fast FFT.
|
||||
|
||||
This file is largely based on the original FFTPACK implementation, modified in
|
||||
order to take advantage of SIMD instructions of modern CPUs.
|
||||
*/
|
||||
|
||||
/*
|
||||
NOTE: This file is adapted from Julien Pommier's original PFFFT,
|
||||
which works on 32 bit floating point precision using SSE instructions,
|
||||
to work with 64 bit floating point precision using AVX instructions.
|
||||
Author: Dario Mambro @ https://github.com/unevens/pffft
|
||||
*/
|
||||
|
||||
#include "pffft_double.h"
|
||||
|
||||
/* detect compiler flavour */
|
||||
#if defined(_MSC_VER)
|
||||
# define COMPILER_MSVC
|
||||
#elif defined(__GNUC__)
|
||||
# define COMPILER_GCC
|
||||
#endif
|
||||
|
||||
#ifdef COMPILER_MSVC
|
||||
# define _USE_MATH_DEFINES
|
||||
# include <malloc.h>
|
||||
#elif defined(__MINGW32__) || defined(__MINGW64__)
|
||||
# include <malloc.h>
|
||||
#else
|
||||
# include <alloca.h>
|
||||
#endif
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
|
||||
#if defined(COMPILER_GCC)
|
||||
# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
|
||||
# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
|
||||
# define RESTRICT __restrict
|
||||
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
|
||||
#elif defined(COMPILER_MSVC)
|
||||
# define ALWAYS_INLINE(return_type) __forceinline return_type
|
||||
# define NEVER_INLINE(return_type) __declspec(noinline) return_type
|
||||
# define RESTRICT __restrict
|
||||
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef COMPILER_MSVC
|
||||
#pragma warning( disable : 4244 4305 4204 4456 )
|
||||
#endif
|
||||
|
||||
/*
|
||||
vector support macros: the rest of the code is independent of
|
||||
AVX -- adding support for other platforms with 4-element
|
||||
vectors should be limited to these macros
|
||||
*/
|
||||
#include "simd/pf_double.h"
|
||||
|
||||
/* have code comparable with this definition */
|
||||
#define float double
|
||||
#define SETUP_STRUCT PFFFTD_Setup
|
||||
#define FUNC_NEW_SETUP pffftd_new_setup
|
||||
#define FUNC_DESTROY pffftd_destroy_setup
|
||||
#define FUNC_TRANSFORM_UNORDRD pffftd_transform
|
||||
#define FUNC_TRANSFORM_ORDERED pffftd_transform_ordered
|
||||
#define FUNC_ZREORDER pffftd_zreorder
|
||||
#define FUNC_ZCONVOLVE_ACCUMULATE pffftd_zconvolve_accumulate
|
||||
#define FUNC_ZCONVOLVE_NO_ACCU pffftd_zconvolve_no_accu
|
||||
|
||||
#define FUNC_ALIGNED_MALLOC pffftd_aligned_malloc
|
||||
#define FUNC_ALIGNED_FREE pffftd_aligned_free
|
||||
#define FUNC_SIMD_SIZE pffftd_simd_size
|
||||
#define FUNC_MIN_FFT_SIZE pffftd_min_fft_size
|
||||
#define FUNC_IS_VALID_SIZE pffftd_is_valid_size
|
||||
#define FUNC_NEAREST_SIZE pffftd_nearest_transform_size
|
||||
#define FUNC_SIMD_ARCH pffftd_simd_arch
|
||||
#define FUNC_VALIDATE_SIMD_A validate_pffftd_simd
|
||||
#define FUNC_VALIDATE_SIMD_EX validate_pffftd_simd_ex
|
||||
|
||||
#define FUNC_CPLX_FINALIZE pffftd_cplx_finalize
|
||||
#define FUNC_CPLX_PREPROCESS pffftd_cplx_preprocess
|
||||
#define FUNC_REAL_PREPROCESS_4X4 pffftd_real_preprocess_4x4
|
||||
#define FUNC_REAL_PREPROCESS pffftd_real_preprocess
|
||||
#define FUNC_REAL_FINALIZE_4X4 pffftd_real_finalize_4x4
|
||||
#define FUNC_REAL_FINALIZE pffftd_real_finalize
|
||||
#define FUNC_TRANSFORM_INTERNAL pffftd_transform_internal
|
||||
|
||||
#define FUNC_COS cos
|
||||
#define FUNC_SIN sin
|
||||
|
||||
|
||||
#include "pffft_priv_impl.h"
|
||||
|
||||
|
||||
236
pffft/pffft_double.h
Normal file
236
pffft/pffft_double.h
Normal file
@@ -0,0 +1,236 @@
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Based on original fortran 77 code from FFTPACKv4 from NETLIB,
|
||||
authored by Dr Paul Swarztrauber of NCAR, in 1985.
|
||||
|
||||
As confirmed by the NCAR fftpack software curators, the following
|
||||
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
|
||||
released under the same terms.
|
||||
|
||||
FFTPACK license:
|
||||
|
||||
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
|
||||
|
||||
Copyright (c) 2004 the University Corporation for Atmospheric
|
||||
Research ("UCAR"). All rights reserved. Developed by NCAR's
|
||||
Computational and Information Systems Laboratory, UCAR,
|
||||
www.cisl.ucar.edu.
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
/*
|
||||
NOTE: This file is adapted from Julien Pommier's original PFFFT,
|
||||
which works on 32 bit floating point precision using SSE instructions,
|
||||
to work with 64 bit floating point precision using AVX instructions.
|
||||
Author: Dario Mambro @ https://github.com/unevens/pffft
|
||||
*/
|
||||
/*
|
||||
PFFFT : a Pretty Fast FFT.
|
||||
|
||||
This is basically an adaptation of the single precision fftpack
|
||||
(v4) as found on netlib taking advantage of SIMD instruction found
|
||||
on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
|
||||
|
||||
For architectures where no SIMD instruction is available, the code
|
||||
falls back to a scalar version.
|
||||
|
||||
Restrictions:
|
||||
|
||||
- 1D transforms only, with 64-bit double precision.
|
||||
|
||||
- supports only transforms for inputs of length N of the form
|
||||
N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
|
||||
144, 160, etc are all acceptable lengths). Performance is best for
|
||||
128<=N<=8192.
|
||||
|
||||
- all (double*) pointers in the functions below are expected to
|
||||
have an "simd-compatible" alignment, that is 32 bytes on x86 and
|
||||
powerpc CPUs.
|
||||
|
||||
You can allocate such buffers with the functions
|
||||
pffft_aligned_malloc / pffft_aligned_free (or with stuff like
|
||||
posix_memalign..)
|
||||
|
||||
*/
|
||||
|
||||
#ifndef PFFFT_DOUBLE_H
|
||||
#define PFFFT_DOUBLE_H
|
||||
|
||||
#include <stddef.h> /* for size_t */
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* opaque struct holding internal stuff (precomputed twiddle factors)
|
||||
this struct can be shared by many threads as it contains only
|
||||
read-only data.
|
||||
*/
|
||||
typedef struct PFFFTD_Setup PFFFTD_Setup;
|
||||
|
||||
#ifndef PFFFT_COMMON_ENUMS
|
||||
#define PFFFT_COMMON_ENUMS
|
||||
|
||||
/* direction of the transform */
|
||||
typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
|
||||
|
||||
/* type of transform */
|
||||
typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
prepare for performing transforms of size N -- the returned
|
||||
PFFFTD_Setup structure is read-only so it can safely be shared by
|
||||
multiple concurrent threads.
|
||||
*/
|
||||
PFFFTD_Setup *pffftd_new_setup(int N, pffft_transform_t transform);
|
||||
void pffftd_destroy_setup(PFFFTD_Setup *);
|
||||
/*
|
||||
Perform a Fourier transform. The z-domain data is stored in the
|
||||
most efficient order for transforming it back, or using it for
|
||||
convolution. If you need to have its content sorted in the
|
||||
"usual" way, that is as an array of interleaved complex numbers,
|
||||
either use pffft_transform_ordered , or call pffft_zreorder after
|
||||
the forward fft, and before the backward fft.
|
||||
|
||||
Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
|
||||
Typically you will want to scale the backward transform by 1/N.
|
||||
|
||||
The 'work' pointer should point to an area of N (2*N for complex
|
||||
fft) doubles, properly aligned. If 'work' is NULL, then stack will
|
||||
be used instead (this is probably the best strategy for small
|
||||
FFTs, say for N < 16384). Threads usually have a small stack, so
|
||||
there may not be enough memory, usually leading to a crash!
|
||||
Use the heap with pffft_aligned_malloc() in this case.
|
||||
|
||||
input and output may alias.
|
||||
*/
|
||||
void pffftd_transform(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction);
|
||||
|
||||
/*
|
||||
Similar to pffft_transform, but makes sure that the output is
|
||||
ordered as expected (interleaved complex numbers). This is
|
||||
similar to calling pffft_transform and then pffft_zreorder.
|
||||
|
||||
input and output may alias.
|
||||
*/
|
||||
void pffftd_transform_ordered(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction);
|
||||
|
||||
/*
|
||||
call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
|
||||
PFFFT_FORWARD) if you want to have the frequency components in
|
||||
the correct "canonical" order, as interleaved complex numbers.
|
||||
|
||||
(for real transforms, both 0-frequency and half frequency
|
||||
components, which are real, are assembled in the first entry as
|
||||
F(0)+i*F(n/2+1). Note that the original fftpack did place
|
||||
F(n/2+1) at the end of the arrays).
|
||||
|
||||
input and output should not alias.
|
||||
*/
|
||||
void pffftd_zreorder(PFFFTD_Setup *setup, const double *input, double *output, pffft_direction_t direction);
|
||||
|
||||
/*
|
||||
Perform a multiplication of the frequency components of dft_a and
|
||||
dft_b and accumulate them into dft_ab. The arrays should have
|
||||
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
|
||||
*not* have been reordered with pffft_zreorder (otherwise just
|
||||
perform the operation yourself as the dft coefs are stored as
|
||||
interleaved complex numbers).
|
||||
|
||||
the operation performed is: dft_ab += (dft_a * dft_b)*scaling
|
||||
|
||||
The dft_a, dft_b and dft_ab pointers may alias.
|
||||
*/
|
||||
void pffftd_zconvolve_accumulate(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double *dft_ab, double scaling);
|
||||
|
||||
/*
|
||||
Perform a multiplication of the frequency components of dft_a and
|
||||
dft_b and put result in dft_ab. The arrays should have
|
||||
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
|
||||
*not* have been reordered with pffft_zreorder (otherwise just
|
||||
perform the operation yourself as the dft coefs are stored as
|
||||
interleaved complex numbers).
|
||||
|
||||
the operation performed is: dft_ab = (dft_a * dft_b)*scaling
|
||||
|
||||
The dft_a, dft_b and dft_ab pointers may alias.
|
||||
*/
|
||||
void pffftd_zconvolve_no_accu(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double*dft_ab, double scaling);
|
||||
|
||||
/* return 4 or 1 depending on whether AVX instruction support was enabled when building pffft-double.c */
|
||||
int pffftd_simd_size(void);
|
||||
|
||||
/* return string identifier of used architecture (AVX/..) */
|
||||
const char * pffftd_simd_arch(void);
|
||||
|
||||
/* simple helper to get minimum possible fft size */
|
||||
int pffftd_min_fft_size(pffft_transform_t transform);
|
||||
|
||||
/* simple helper to determine size N is valid
|
||||
- factorizable to pffft_min_fft_size() with factors 2, 3, 5
|
||||
*/
|
||||
int pffftd_is_valid_size(int N, pffft_transform_t cplx);
|
||||
|
||||
/* determine nearest valid transform size (by brute-force testing)
|
||||
- factorizable to pffft_min_fft_size() with factors 2, 3, 5.
|
||||
higher: bool-flag to find nearest higher value; else lower.
|
||||
*/
|
||||
int pffftd_nearest_transform_size(int N, pffft_transform_t cplx, int higher);
|
||||
|
||||
|
||||
/* following functions are identical to the pffft_ functions - both declared */
|
||||
|
||||
/* simple helper to determine next power of 2
|
||||
- without inexact/rounding floating point operations
|
||||
*/
|
||||
int pffftd_next_power_of_two(int N);
|
||||
int pffft_next_power_of_two(int N);
|
||||
|
||||
/* simple helper to determine if power of 2 - returns bool */
|
||||
int pffftd_is_power_of_two(int N);
|
||||
int pffft_is_power_of_two(int N);
|
||||
|
||||
/*
|
||||
the double buffers must have the correct alignment (32-byte boundary
|
||||
on intel and powerpc). This function may be used to obtain such
|
||||
correctly aligned buffers.
|
||||
*/
|
||||
void *pffftd_aligned_malloc(size_t nb_bytes);
|
||||
void *pffft_aligned_malloc(size_t nb_bytes);
|
||||
void pffftd_aligned_free(void *);
|
||||
void pffft_aligned_free(void *);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* PFFFT_DOUBLE_H */
|
||||
|
||||
2233
pffft/pffft_priv_impl.h
Normal file
2233
pffft/pffft_priv_impl.h
Normal file
File diff suppressed because it is too large
Load Diff
50
pffft/plots.sh
Executable file
50
pffft/plots.sh
Executable file
@@ -0,0 +1,50 @@
|
||||
#!/bin/bash
#
# Plot the benchmark CSV files of the current directory with gnuplot.
#
# Every file matching *-4-*.csv or *-6-*.csv is plotted on its own:
# column 2 is used for the x axis and columns 3..LASTCOL are drawn as one
# line each, titled with the column header of the CSV.
# With OUTPNG=1 the plot is written to <basename>.png; otherwise gnuplot is
# started with --persist.

OUTPNG="1"
W="1024"
H="768"
# only referenced by the commented-out 'set style line' loop below
PTS="20"
LWS="20"

# Iterate over the globs directly instead of parsing the output of `ls`:
# file names containing whitespace stay intact, and no `ls` error is printed
# when a pattern matches nothing.
for f in *-4-*.csv *-6-*.csv; do
  # a pattern without any match is left as the literal pattern text: skip it
  [ -e "$f" ] || continue

  b=$(basename "$f" ".csv")
  #echo $b

  # number of commas in the header line; used as index of the last plotted column
  LASTCOL="$(head -n 1 "$f" | sed 's/,/,\n/g' | grep -c ',')"
  echo "${b}: last column is $LASTCOL"

  # the y axis label depends on the benchmark kind encoded in the file name
  case "$b" in
    *-1-*) YL="duration in ms; less is better" ;;
    *-4-*) YL="duration relative to pffft; less is better" ;;
    *)     YL="" ;;
  esac

  # E accumulates the gnuplot commands, separated by ' ; '
  if [ "${OUTPNG}" = "1" ]; then
    E="set terminal png size $W,$H ; set output '${b}.png' ; set key outside"
  else
    E="set key outside"
  fi
  E="${E} ; set datafile separator ','"
  E="${E} ; set title '${b}'"
  E="${E} ; set xlabel 'fft order: fft size N = 2\\^order'"
  if [ -n "${YL}" ]; then
    #echo " setting Y label to ${YL}"
    E="${E} ; set ylabel '${YL}'"
  fi
  # unfortunately no effect for
  #for LNO in $(seq 1 ${LASTCOL}) ; do
  #  E="${E} ; set style line ${LNO} ps ${PTS} lw ${LWS}"
  #done
  E="${E} ; plot for [col=3:${LASTCOL}] '${f}' using 2:col with lines title columnhead"

  if [ "${OUTPNG}" = "1" ]; then
    gnuplot -e "${E}"
  else
    gnuplot -e "${E}" --persist
  fi
done
|
||||
81
pffft/simd/pf_altivec_float.h
Normal file
81
pffft/simd/pf_altivec_float.h
Normal file
@@ -0,0 +1,81 @@
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_ALTIVEC_FLT_H
|
||||
#define PF_ALTIVEC_FLT_H
|
||||
|
||||
/*
|
||||
Altivec support macros
|
||||
*/
|
||||
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
|
||||
#pragma message( __FILE__ ": ALTIVEC float macros are defined" )
|
||||
typedef vector float v4sf;
|
||||
|
||||
# define SIMD_SZ 4
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
float f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
# define VREQUIRES_ALIGN 1 /* not sure, if really required */
|
||||
# define VARCH "ALTIVEC"
|
||||
# define VZERO() ((vector float) vec_splat_u8(0))
|
||||
# define VMUL(a,b) vec_madd(a,b, VZERO())
|
||||
# define VADD(a,b) vec_add(a,b)
|
||||
# define VMADD(a,b,c) vec_madd(a,b,c)
|
||||
# define VSUB(a,b) vec_sub(a,b)
|
||||
/* Load the single float at p and replicate it into all lanes of a vector.
   Declared 'static inline': a plain 'inline' definition in a header provides
   no external definition under C99 rules, so a call the compiler chooses not
   to inline would fail to link. */
static inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
|
||||
# define LD_PS1(p) ld_ps1(&p)
|
||||
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
|
||||
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
|
||||
vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
|
||||
vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
|
||||
v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
|
||||
}
|
||||
# define VTRANSPOSE4(x0,x1,x2,x3) { \
|
||||
v4sf y0 = vec_mergeh(x0, x2); \
|
||||
v4sf y1 = vec_mergel(x0, x2); \
|
||||
v4sf y2 = vec_mergeh(x1, x3); \
|
||||
v4sf y3 = vec_mergel(x1, x3); \
|
||||
x0 = vec_mergeh(y0, y2); \
|
||||
x1 = vec_mergel(y0, y2); \
|
||||
x2 = vec_mergeh(y1, y3); \
|
||||
x3 = vec_mergel(y1, y3); \
|
||||
}
|
||||
# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
|
||||
|
||||
#endif
|
||||
|
||||
#endif /* PF_ALTIVEC_FLT_H */
|
||||
|
||||
145
pffft/simd/pf_avx_double.h
Normal file
145
pffft/simd/pf_avx_double.h
Normal file
@@ -0,0 +1,145 @@
|
||||
/*
|
||||
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
|
||||
*/
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_AVX_DBL_H
|
||||
#define PF_AVX_DBL_H
|
||||
|
||||
/*
|
||||
vector support macros: the rest of the code is independent of
|
||||
AVX -- adding support for other platforms with 4-element
|
||||
vectors should be limited to these macros
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
AVX support macros
|
||||
*/
|
||||
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && defined(__AVX__)
|
||||
#pragma message( __FILE__ ": AVX macros are defined" )
|
||||
|
||||
#include <immintrin.h>
|
||||
typedef __m256d v4sf;
|
||||
|
||||
/* 4 doubles by simd vector */
|
||||
# define SIMD_SZ 4
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
double f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
# define VARCH "AVX"
|
||||
# define VREQUIRES_ALIGN 1
|
||||
# define VZERO() _mm256_setzero_pd()
|
||||
# define VMUL(a,b) _mm256_mul_pd(a,b)
|
||||
# define VADD(a,b) _mm256_add_pd(a,b)
|
||||
# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
|
||||
# define VSUB(a,b) _mm256_sub_pd(a,b)
|
||||
# define LD_PS1(p) _mm256_set1_pd(p)
|
||||
# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr)
|
||||
# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr)
|
||||
|
||||
/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
|
||||
out1 = [ in1[0], in2[0], in1[1], in2[1] ]
|
||||
out2 = [ in1[2], in2[2], in1[3], in2[3] ]
|
||||
*/
|
||||
# define INTERLEAVE2(in1, in2, out1, out2) { \
|
||||
__m128d low1__ = _mm256_castpd256_pd128(in1); \
|
||||
__m128d low2__ = _mm256_castpd256_pd128(in2); \
|
||||
__m128d high1__ = _mm256_extractf128_pd(in1, 1); \
|
||||
__m128d high2__ = _mm256_extractf128_pd(in2, 1); \
|
||||
__m256d tmp__ = _mm256_insertf128_pd( \
|
||||
_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \
|
||||
_mm_shuffle_pd(low1__, low2__, 3), \
|
||||
1); \
|
||||
out2 = _mm256_insertf128_pd( \
|
||||
_mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \
|
||||
_mm_shuffle_pd(high1__, high2__, 3), \
|
||||
1); \
|
||||
out1 = tmp__; \
|
||||
}
|
||||
|
||||
/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
|
||||
out1 = [ in1[0], in1[2], in2[0], in2[2] ]
|
||||
out2 = [ in1[1], in1[3], in2[1], in2[3] ]
|
||||
*/
|
||||
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
|
||||
__m128d low1__ = _mm256_castpd256_pd128(in1); \
|
||||
__m128d low2__ = _mm256_castpd256_pd128(in2); \
|
||||
__m128d high1__ = _mm256_extractf128_pd(in1, 1); \
|
||||
__m128d high2__ = _mm256_extractf128_pd(in2, 1); \
|
||||
__m256d tmp__ = _mm256_insertf128_pd( \
|
||||
_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \
|
||||
_mm_shuffle_pd(low2__, high2__, 0), \
|
||||
1); \
|
||||
out2 = _mm256_insertf128_pd( \
|
||||
_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \
|
||||
_mm_shuffle_pd(low2__, high2__, 3), \
|
||||
1); \
|
||||
out1 = tmp__; \
|
||||
}
|
||||
|
||||
# define VTRANSPOSE4(row0, row1, row2, row3) { \
|
||||
__m256d tmp3, tmp2, tmp1, tmp0; \
|
||||
\
|
||||
tmp0 = _mm256_shuffle_pd((row0),(row1), 0x0); \
|
||||
tmp2 = _mm256_shuffle_pd((row0),(row1), 0xF); \
|
||||
tmp1 = _mm256_shuffle_pd((row2),(row3), 0x0); \
|
||||
tmp3 = _mm256_shuffle_pd((row2),(row3), 0xF); \
|
||||
\
|
||||
(row0) = _mm256_permute2f128_pd(tmp0, tmp1, 0x20); \
|
||||
(row1) = _mm256_permute2f128_pd(tmp2, tmp3, 0x20); \
|
||||
(row2) = _mm256_permute2f128_pd(tmp0, tmp1, 0x31); \
|
||||
(row3) = _mm256_permute2f128_pd(tmp2, tmp3, 0x31); \
|
||||
}
|
||||
|
||||
/*VSWAPHL(a, b) pseudo code:
|
||||
return [ b[0], b[1], a[2], a[3] ]
|
||||
*/
|
||||
# define VSWAPHL(a,b) \
|
||||
_mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1)
|
||||
|
||||
/* reverse/flip all doubles */
|
||||
# define VREV_S(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_permute_pd(_mm256_extractf128_pd(a, 1),1)), _mm_permute_pd(_mm256_castpd256_pd128(a), 1), 1)
|
||||
|
||||
/* reverse/flip complex doubles */
|
||||
# define VREV_C(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1)
|
||||
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
|
||||
|
||||
#endif
|
||||
|
||||
#endif /* PF_AVX_DBL_H */
|
||||
|
||||
84
pffft/simd/pf_double.h
Normal file
84
pffft/simd/pf_double.h
Normal file
@@ -0,0 +1,84 @@
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_DBL_H
|
||||
#define PF_DBL_H
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
|
||||
/*
|
||||
* SIMD reference material:
|
||||
*
|
||||
* general SIMD introduction:
|
||||
* https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
|
||||
*
|
||||
* SSE 1:
|
||||
* https://software.intel.com/sites/landingpage/IntrinsicsGuide/
|
||||
*
|
||||
* ARM NEON:
|
||||
* https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
|
||||
*
|
||||
* Altivec:
|
||||
* https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
|
||||
* https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
|
||||
* better one?
|
||||
*
|
||||
*/
|
||||
|
||||
typedef double vsfscalar;
|
||||
|
||||
#include "pf_avx_double.h"
|
||||
#include "pf_sse2_double.h"
|
||||
#include "pf_neon_double.h"
|
||||
|
||||
#ifndef SIMD_SZ
|
||||
# if !defined(PFFFT_SIMD_DISABLE)
|
||||
# pragma message( "building double with simd disabled !" )
|
||||
# define PFFFT_SIMD_DISABLE /* fallback to scalar code */
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#include "pf_scalar_double.h"
|
||||
|
||||
/* shortcuts for complex multiplications */
|
||||
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
|
||||
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
|
||||
#ifndef SVMUL
|
||||
/* multiply a scalar with a vector */
|
||||
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
|
||||
#endif
|
||||
|
||||
#endif /* PF_DBL_H */
|
||||
|
||||
84
pffft/simd/pf_float.h
Normal file
84
pffft/simd/pf_float.h
Normal file
@@ -0,0 +1,84 @@
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_FLT_H
|
||||
#define PF_FLT_H
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
|
||||
/*
|
||||
* SIMD reference material:
|
||||
*
|
||||
* general SIMD introduction:
|
||||
* https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
|
||||
*
|
||||
* SSE 1:
|
||||
* https://software.intel.com/sites/landingpage/IntrinsicsGuide/
|
||||
*
|
||||
* ARM NEON:
|
||||
* https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
|
||||
*
|
||||
* Altivec:
|
||||
* https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
|
||||
* https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
|
||||
* better one?
|
||||
*
|
||||
*/
|
||||
|
||||
typedef float vsfscalar;
|
||||
|
||||
#include "pf_sse1_float.h"
|
||||
#include "pf_neon_float.h"
|
||||
#include "pf_altivec_float.h"
|
||||
|
||||
#ifndef SIMD_SZ
|
||||
# if !defined(PFFFT_SIMD_DISABLE)
|
||||
# pragma message( "building float with simd disabled !" )
|
||||
# define PFFFT_SIMD_DISABLE /* fallback to scalar code */
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#include "pf_scalar_float.h"
|
||||
|
||||
/* shortcuts for complex multiplications */
|
||||
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
|
||||
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
|
||||
#ifndef SVMUL
|
||||
/* multiply a scalar with a vector */
|
||||
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
|
||||
#endif
|
||||
|
||||
#endif /* PF_FLT_H */
|
||||
|
||||
203
pffft/simd/pf_neon_double.h
Normal file
203
pffft/simd/pf_neon_double.h
Normal file
@@ -0,0 +1,203 @@
|
||||
/*
|
||||
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
|
||||
*/
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_NEON_DBL_H
|
||||
#define PF_NEON_DBL_H
|
||||
|
||||
/*
|
||||
NEON 64bit support macros
|
||||
*/
|
||||
#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__))
|
||||
|
||||
#pragma message (__FILE__ ": NEON (from AVX) macros are defined" )
|
||||
|
||||
#include "pf_neon_double_from_avx.h"
|
||||
typedef __m256d v4sf;
|
||||
|
||||
/* 4 doubles by simd vector */
|
||||
# define SIMD_SZ 4
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
double f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
# define VARCH "NEON"
|
||||
# define VREQUIRES_ALIGN 1
|
||||
# define VZERO() _mm256_setzero_pd()
|
||||
# define VMUL(a,b) _mm256_mul_pd(a,b)
|
||||
# define VADD(a,b) _mm256_add_pd(a,b)
|
||||
# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
|
||||
# define VSUB(a,b) _mm256_sub_pd(a,b)
|
||||
# define LD_PS1(p) _mm256_set1_pd(p)
|
||||
# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr)
|
||||
# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr)
|
||||
|
||||
/* Returns a vector whose low 128-bit lane is taken from a and whose high
   lane is b. */
FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b)
{
    /* a is a by-value copy: overwrite its high lane and hand it back */
    a.vect_f64[1] = b;
    return a;
}
|
||||
|
||||
/* Builds [ a[0], b[0] ] from the low elements of both inputs. */
FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
{
    return vcombine_f64(vget_low_f64(a), vget_low_f64(b));
}
|
||||
|
||||
/* Builds [ a[1], b[1] ] from the high elements of both inputs. */
FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b)
{
    return vcombine_f64(vget_high_f64(a), vget_high_f64(b));
}
|
||||
|
||||
/* Applies _mm_shuffle_pd_00 to each 128-bit lane independently:
   result = [ a[0], b[0], a[2], b[2] ]. */
FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b)
{
    __m256d out;
    int lane;
    for (lane = 0; lane < 2; ++lane)
        out.vect_f64[lane] = _mm_shuffle_pd_00(a.vect_f64[lane], b.vect_f64[lane]);
    return out;
}
|
||||
|
||||
/* Applies _mm_shuffle_pd_11 to each 128-bit lane independently:
   result = [ a[1], b[1], a[3], b[3] ]. */
FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b)
{
    __m256d out;
    int lane;
    for (lane = 0; lane < 2; ++lane)
        out.vect_f64[lane] = _mm_shuffle_pd_11(a.vect_f64[lane], b.vect_f64[lane]);
    return out;
}
|
||||
|
||||
/* Result = [ low lane of a, low lane of b ]. */
FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b)
{
    /* a's low lane is already in place; only the high lane changes */
    a.vect_f64[1] = b.vect_f64[0];
    return a;
}
|
||||
|
||||
|
||||
/* Result = [ high lane of a, high lane of b ]. */
FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b)
{
    /* b's high lane is already in place; only the low lane changes */
    b.vect_f64[0] = a.vect_f64[1];
    return b;
}
|
||||
|
||||
/* Reverses the order of the four doubles:
   [ x0, x1, x2, x3 ] -> [ x3, x2, x1, x0 ]. */
FORCE_INLINE __m256d _mm256_reverse(__m256d x)
{
    const float64x2_t lo = x.vect_f64[0];
    const float64x2_t hi = x.vect_f64[1];
    __m256d out;

    /* swap the two elements inside each lane, then swap the lanes */
    out.vect_f64[0] = vcombine_f64(vget_high_f64(hi), vget_low_f64(hi));
    out.vect_f64[1] = vcombine_f64(vget_high_f64(lo), vget_low_f64(lo));
    return out;
}
|
||||
|
||||
/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
|
||||
out1 = [ in1[0], in2[0], in1[1], in2[1] ]
|
||||
out2 = [ in1[2], in2[2], in1[3], in2[3] ]
|
||||
*/
|
||||
# define INTERLEAVE2(in1, in2, out1, out2) { \
|
||||
__m128d low1__ = _mm256_castpd256_pd128(in1); \
|
||||
__m128d low2__ = _mm256_castpd256_pd128(in2); \
|
||||
__m128d high1__ = _mm256_extractf128_pd(in1, 1); \
|
||||
__m128d high2__ = _mm256_extractf128_pd(in2, 1); \
|
||||
__m256d tmp__ = _mm256_insertf128_pd_1( \
|
||||
_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)), \
|
||||
_mm_shuffle_pd_11(low1__, low2__)); \
|
||||
out2 = _mm256_insertf128_pd_1( \
|
||||
_mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)), \
|
||||
_mm_shuffle_pd_11(high1__, high2__)); \
|
||||
out1 = tmp__; \
|
||||
}
|
||||
|
||||
/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
|
||||
out1 = [ in1[0], in1[2], in2[0], in2[2] ]
|
||||
out2 = [ in1[1], in1[3], in2[1], in2[3] ]
|
||||
*/
|
||||
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
|
||||
__m128d low1__ = _mm256_castpd256_pd128(in1); \
|
||||
__m128d low2__ = _mm256_castpd256_pd128(in2); \
|
||||
__m128d high1__ = _mm256_extractf128_pd(in1, 1); \
|
||||
__m128d high2__ = _mm256_extractf128_pd(in2, 1); \
|
||||
__m256d tmp__ = _mm256_insertf128_pd_1( \
|
||||
_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)), \
|
||||
_mm_shuffle_pd_00(low2__, high2__)); \
|
||||
out2 = _mm256_insertf128_pd_1( \
|
||||
_mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)), \
|
||||
_mm_shuffle_pd_11(low2__, high2__)); \
|
||||
out1 = tmp__; \
|
||||
}
|
||||
|
||||
# define VTRANSPOSE4(row0, row1, row2, row3) { \
|
||||
__m256d tmp3, tmp2, tmp1, tmp0; \
|
||||
\
|
||||
tmp0 = _mm256_shuffle_pd_00((row0),(row1)); \
|
||||
tmp2 = _mm256_shuffle_pd_11((row0),(row1)); \
|
||||
tmp1 = _mm256_shuffle_pd_00((row2),(row3)); \
|
||||
tmp3 = _mm256_shuffle_pd_11((row2),(row3)); \
|
||||
\
|
||||
(row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1); \
|
||||
(row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3); \
|
||||
(row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1); \
|
||||
(row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3); \
|
||||
}
|
||||
|
||||
/*VSWAPHL(a, b) pseudo code:
|
||||
return [ b[0], b[1], a[2], a[3] ]
|
||||
*/
|
||||
# define VSWAPHL(a,b) \
|
||||
_mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1))
|
||||
|
||||
/* reverse/flip all doubles */
|
||||
# define VREV_S(a) _mm256_reverse(a)
|
||||
|
||||
/* reverse/flip complex doubles */
|
||||
# define VREV_C(a) _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a))
|
||||
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
|
||||
|
||||
#endif
|
||||
|
||||
#endif /* PF_NEON_DBL_H */
|
||||
|
||||
123
pffft/simd/pf_neon_double_from_avx.h
Normal file
123
pffft/simd/pf_neon_double_from_avx.h
Normal file
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
|
||||
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
|
||||
*/
|
||||
|
||||
//see https://github.com/kunpengcompute/AvxToNeon
|
||||
|
||||
#ifndef PF_NEON_DBL_FROM_AVX_H
|
||||
#define PF_NEON_DBL_FROM_AVX_H
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
|
||||
#pragma push_macro("FORCE_INLINE")
|
||||
#define FORCE_INLINE static inline __attribute__((always_inline))
|
||||
|
||||
#else
|
||||
|
||||
#error "Macro name collisions may happens with unknown compiler"
|
||||
#ifdef FORCE_INLINE
|
||||
#undef FORCE_INLINE
|
||||
#endif
|
||||
|
||||
#define FORCE_INLINE static inline
|
||||
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
float32x4_t vect_f32[2];
|
||||
} __m256;
|
||||
|
||||
typedef struct {
|
||||
float64x2_t vect_f64[2];
|
||||
} __m256d;
|
||||
|
||||
typedef float64x2_t __m128d;
|
||||
|
||||
/* Returns a vector with all four doubles set to 0.0. */
FORCE_INLINE __m256d _mm256_setzero_pd(void)
{
    const float64x2_t zeros = vdupq_n_f64(0.0);
    __m256d out;
    out.vect_f64[0] = zeros;
    out.vect_f64[1] = zeros;
    return out;
}
|
||||
|
||||
/* Element-wise product a * b, computed one 128-bit lane at a time. */
FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
{
    __m256d prod;
    int lane;
    for (lane = 0; lane < 2; ++lane)
        prod.vect_f64[lane] = vmulq_f64(a.vect_f64[lane], b.vect_f64[lane]);
    return prod;
}
|
||||
|
||||
/* Element-wise sum a + b, computed one 128-bit lane at a time. */
FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
{
    __m256d sum;
    int lane;
    for (lane = 0; lane < 2; ++lane)
        sum.vect_f64[lane] = vaddq_f64(a.vect_f64[lane], b.vect_f64[lane]);
    return sum;
}
|
||||
|
||||
/* Element-wise difference a - b, computed one 128-bit lane at a time. */
FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
{
    __m256d diff;
    int lane;
    for (lane = 0; lane < 2; ++lane)
        diff.vect_f64[lane] = vsubq_f64(a.vect_f64[lane], b.vect_f64[lane]);
    return diff;
}
|
||||
|
||||
/* Broadcasts the scalar a into all four doubles of the result. */
FORCE_INLINE __m256d _mm256_set1_pd(double a)
{
    const float64x2_t splat = vdupq_n_f64(a);
    __m256d out;
    out.vect_f64[0] = splat;
    out.vect_f64[1] = splat;
    return out;
}
|
||||
|
||||
/* Loads four consecutive doubles starting at mem_addr:
   elements 0..1 go to the low lane, elements 2..3 to the high lane. */
FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr)
{
    __m256d out;
    int lane;
    for (lane = 0; lane < 2; ++lane)
        out.vect_f64[lane] = vld1q_f64(mem_addr + 2 * lane);
    return out;
}
|
||||
/* Loads four consecutive doubles starting at mem_addr. Implemented with the
   same vld1q_f64 loads as _mm256_load_pd in this header. */
FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr)
{
    __m256d out;
    int lane;
    for (lane = 0; lane < 2; ++lane)
        out.vect_f64[lane] = vld1q_f64(mem_addr + 2 * lane);
    return out;
}
|
||||
|
||||
/* Returns the low 128-bit lane (elements 0 and 1) of a. */
FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
{
    const __m128d low_lane = a.vect_f64[0];
    return low_lane;
}
|
||||
|
||||
/* Returns the 128-bit lane of a selected by imm8 (0 = low, 1 = high).
   Any other index is rejected by the assert in debug builds. */
FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
{
    const float64x2_t *lanes = a.vect_f64;
    assert(imm8 >= 0 && imm8 <= 1);
    return lanes[imm8];
}
|
||||
|
||||
/* Widens a 128-bit vector to 256 bits: a becomes the low lane.
   The AVX intrinsic leaves the upper half undefined; here it is zeroed so
   the returned struct never carries an indeterminate value (this also
   avoids "maybe uninitialized" diagnostics). Callers that overwrite the
   high lane right away are unaffected. */
FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
{
    __m256d res;
    res.vect_f64[0] = a;
    res.vect_f64[1] = vdupq_n_f64(0.0);
    return res;
}
|
||||
|
||||
#endif /* PF_NEON_DBL_FROM_AVX_H */
|
||||
|
||||
87
pffft/simd/pf_neon_float.h
Normal file
87
pffft/simd/pf_neon_float.h
Normal file
@@ -0,0 +1,87 @@
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_NEON_FLT_H
|
||||
#define PF_NEON_FLT_H
|
||||
|
||||
/*
|
||||
ARM NEON support macros
|
||||
*/
|
||||
#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
|
||||
#pragma message( __FILE__ ": ARM NEON macros are defined" )
|
||||
|
||||
# include <arm_neon.h>
|
||||
typedef float32x4_t v4sf;
|
||||
|
||||
# define SIMD_SZ 4
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
float f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
# define VARCH "NEON"
|
||||
# define VREQUIRES_ALIGN 0 /* usually no alignment required */
|
||||
# define VZERO() vdupq_n_f32(0)
|
||||
# define VMUL(a,b) vmulq_f32(a,b)
|
||||
# define VADD(a,b) vaddq_f32(a,b)
|
||||
# define VMADD(a,b,c) vmlaq_f32(c,a,b)
|
||||
# define VSUB(a,b) vsubq_f32(a,b)
|
||||
# define LD_PS1(p) vld1q_dup_f32(&(p))
|
||||
# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr)))
|
||||
# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr)))
|
||||
# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
|
||||
# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
|
||||
# define VTRANSPOSE4(x0,x1,x2,x3) { \
|
||||
float32x4x2_t t0_ = vzipq_f32(x0, x2); \
|
||||
float32x4x2_t t1_ = vzipq_f32(x1, x3); \
|
||||
float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
|
||||
float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
|
||||
x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
|
||||
}
|
||||
// marginally faster version
|
||||
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
|
||||
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
|
||||
|
||||
/* reverse/flip all floats */
|
||||
# define VREV_S(a) vcombine_f32(vrev64_f32(vget_high_f32(a)), vrev64_f32(vget_low_f32(a)))
|
||||
/* reverse/flip complex floats */
|
||||
# define VREV_C(a) vextq_f32(a, a, 2)
|
||||
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
|
||||
|
||||
#else
|
||||
/* #pragma message( __FILE__ ": ARM NEON macros are not defined" ) */
|
||||
#endif
|
||||
|
||||
#endif /* PF_NEON_FLT_H */
|
||||
|
||||
185
pffft/simd/pf_scalar_double.h
Normal file
185
pffft/simd/pf_scalar_double.h
Normal file
@@ -0,0 +1,185 @@
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_SCAL_DBL_H
|
||||
#define PF_SCAL_DBL_H
|
||||
|
||||
/*
|
||||
fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead
|
||||
*/
|
||||
|
||||
#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
|
||||
#pragma message( __FILE__ ": double SCALAR4 macros are defined" )
|
||||
|
||||
typedef struct {
|
||||
vsfscalar a;
|
||||
vsfscalar b;
|
||||
vsfscalar c;
|
||||
vsfscalar d;
|
||||
} v4sf;
|
||||
|
||||
# define SIMD_SZ 4
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
vsfscalar f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
# define VARCH "4xScalar"
|
||||
# define VREQUIRES_ALIGN 0
|
||||
|
||||
/* Returns a 4-element scalar vector with every member set to zero. */
static ALWAYS_INLINE(v4sf) VZERO() {
  v4sf zero;
  zero.a = 0.f;
  zero.b = 0.f;
  zero.c = 0.f;
  zero.d = 0.f;
  return zero;
}
|
||||
|
||||
/* Member-wise product of A and B. */
static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) {
  v4sf prod;
  prod.a = A.a * B.a;
  prod.b = A.b * B.b;
  prod.c = A.c * B.c;
  prod.d = A.d * B.d;
  return prod;
}
|
||||
|
||||
/* Member-wise sum of A and B. */
static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) {
  v4sf sum;
  sum.a = A.a + B.a;
  sum.b = A.b + B.b;
  sum.c = A.c + B.c;
  sum.d = A.d + B.d;
  return sum;
}
|
||||
|
||||
/* Member-wise multiply-add: A * B + C. */
static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) {
  v4sf acc;
  acc.a = A.a * B.a + C.a;
  acc.b = A.b * B.b + C.b;
  acc.c = A.c * B.c + C.c;
  acc.d = A.d * B.d + C.d;
  return acc;
}
|
||||
|
||||
/* Member-wise difference A - B. */
static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) {
  v4sf diff;
  diff.a = A.a - B.a;
  diff.b = A.b - B.b;
  diff.c = A.c - B.c;
  diff.d = A.d - B.d;
  return diff;
}
|
||||
|
||||
/* Broadcasts the scalar v into all four members. */
static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) {
  v4sf splat;
  splat.a = v;
  splat.b = v;
  splat.c = v;
  splat.d = v;
  return splat;
}
|
||||
|
||||
# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr)))
|
||||
|
||||
# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr)))
|
||||
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0)
|
||||
|
||||
|
||||
/* INTERLEAVE2() */
|
||||
#define INTERLEAVE2( A, B, C, D) \
|
||||
do { \
|
||||
v4sf Cr = { A.a, B.a, A.b, B.b }; \
|
||||
v4sf Dr = { A.c, B.c, A.d, B.d }; \
|
||||
C = Cr; \
|
||||
D = Dr; \
|
||||
} while (0)
|
||||
|
||||
|
||||
/* UNINTERLEAVE2() */
|
||||
#define UNINTERLEAVE2(A, B, C, D) \
|
||||
do { \
|
||||
v4sf Cr = { A.a, A.c, B.a, B.c }; \
|
||||
v4sf Dr = { A.b, A.d, B.b, B.d }; \
|
||||
C = Cr; \
|
||||
D = Dr; \
|
||||
} while (0)
|
||||
|
||||
|
||||
/* VTRANSPOSE4() */
|
||||
#define VTRANSPOSE4(A, B, C, D) \
|
||||
do { \
|
||||
v4sf Ar = { A.a, B.a, C.a, D.a }; \
|
||||
v4sf Br = { A.b, B.b, C.b, D.b }; \
|
||||
v4sf Cr = { A.c, B.c, C.c, D.c }; \
|
||||
v4sf Dr = { A.d, B.d, C.d, D.d }; \
|
||||
A = Ar; \
|
||||
B = Br; \
|
||||
C = Cr; \
|
||||
D = Dr; \
|
||||
} while (0)
|
||||
|
||||
|
||||
/* VSWAPHL() */
|
||||
static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) {
  /* result = [ B.a, B.b, A.c, A.d ]: low half from B, high half from A.
     A is a by-value copy, so its low half can be overwritten directly. */
  A.a = B.a;
  A.b = B.b;
  return A;
}
|
||||
|
||||
|
||||
/* reverse/flip all floats */
|
||||
static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) {
  /* full reversal: [ a, b, c, d ] -> [ d, c, b, a ] */
  v4sf rev;
  rev.a = A.d;
  rev.b = A.c;
  rev.c = A.b;
  rev.d = A.a;
  return rev;
}
|
||||
|
||||
/* reverse/flip complex floats */
|
||||
static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) {
  /* swap the two pairs: [ a, b, c, d ] -> [ c, d, a, b ] */
  v4sf swapped;
  swapped.a = A.c;
  swapped.b = A.d;
  swapped.c = A.a;
  swapped.d = A.b;
  return swapped;
}
|
||||
|
||||
#else
|
||||
/* #pragma message( __FILE__ ": double SCALAR4 macros are not defined" ) */
|
||||
#endif
|
||||
|
||||
|
||||
#if !defined(SIMD_SZ)
|
||||
/* SIMD_SZ is still undefined here, so the one-element scalar fallback is
   used; this is the double header, so report it as "double". */
#pragma message( __FILE__ ": double SCALAR1 macros are defined" )
|
||||
typedef vsfscalar v4sf;
|
||||
|
||||
# define SIMD_SZ 1
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
vsfscalar f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
# define VARCH "Scalar"
|
||||
# define VREQUIRES_ALIGN 0
|
||||
# define VZERO() 0.0
|
||||
# define VMUL(a,b) ((a)*(b))
|
||||
# define VADD(a,b) ((a)+(b))
|
||||
# define VMADD(a,b,c) ((a)*(b)+(c))
|
||||
# define VSUB(a,b) ((a)-(b))
|
||||
# define LD_PS1(p) (p)
|
||||
# define VLOAD_UNALIGNED(ptr) (*(ptr))
|
||||
# define VLOAD_ALIGNED(ptr) (*(ptr))
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)
|
||||
|
||||
#else
|
||||
/* #pragma message( __FILE__ ": double SCALAR1 macros are not defined" ) */
|
||||
#endif
|
||||
|
||||
|
||||
#endif /* PF_SCAL_DBL_H */
|
||||
|
||||
185
pffft/simd/pf_scalar_float.h
Normal file
185
pffft/simd/pf_scalar_float.h
Normal file
@@ -0,0 +1,185 @@
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_SCAL_FLT_H
|
||||
#define PF_SCAL_FLT_H
|
||||
|
||||
/*
|
||||
fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead
|
||||
*/
|
||||
|
||||
#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
|
||||
#pragma message( __FILE__ ": float SCALAR4 macros are defined" )
|
||||
|
||||
typedef struct {
|
||||
vsfscalar a;
|
||||
vsfscalar b;
|
||||
vsfscalar c;
|
||||
vsfscalar d;
|
||||
} v4sf;
|
||||
|
||||
# define SIMD_SZ 4
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
vsfscalar f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
# define VARCH "4xScalar"
|
||||
# define VREQUIRES_ALIGN 0
|
||||
|
||||
/* Returns a 4-element scalar vector with every member set to zero. */
static ALWAYS_INLINE(v4sf) VZERO() {
  v4sf zero;
  zero.a = 0.f;
  zero.b = 0.f;
  zero.c = 0.f;
  zero.d = 0.f;
  return zero;
}
|
||||
|
||||
/* Member-wise product of A and B. */
static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) {
  v4sf prod;
  prod.a = A.a * B.a;
  prod.b = A.b * B.b;
  prod.c = A.c * B.c;
  prod.d = A.d * B.d;
  return prod;
}
|
||||
|
||||
/* Member-wise sum of A and B. */
static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) {
  v4sf sum;
  sum.a = A.a + B.a;
  sum.b = A.b + B.b;
  sum.c = A.c + B.c;
  sum.d = A.d + B.d;
  return sum;
}
|
||||
|
||||
/* Member-wise multiply-add: A * B + C. */
static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) {
  v4sf acc;
  acc.a = A.a * B.a + C.a;
  acc.b = A.b * B.b + C.b;
  acc.c = A.c * B.c + C.c;
  acc.d = A.d * B.d + C.d;
  return acc;
}
|
||||
|
||||
/* Member-wise difference A - B. */
static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) {
  v4sf diff;
  diff.a = A.a - B.a;
  diff.b = A.b - B.b;
  diff.c = A.c - B.c;
  diff.d = A.d - B.d;
  return diff;
}
|
||||
|
||||
/* Broadcasts the scalar v into all four members. */
static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) {
  v4sf splat;
  splat.a = v;
  splat.b = v;
  splat.c = v;
  splat.d = v;
  return splat;
}
|
||||
|
||||
# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr)))
|
||||
|
||||
# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr)))
|
||||
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0)
|
||||
|
||||
|
||||
/* INTERLEAVE2() */
|
||||
#define INTERLEAVE2( A, B, C, D) \
|
||||
do { \
|
||||
v4sf Cr = { A.a, B.a, A.b, B.b }; \
|
||||
v4sf Dr = { A.c, B.c, A.d, B.d }; \
|
||||
C = Cr; \
|
||||
D = Dr; \
|
||||
} while (0)
|
||||
|
||||
|
||||
/* UNINTERLEAVE2() */
|
||||
#define UNINTERLEAVE2(A, B, C, D) \
|
||||
do { \
|
||||
v4sf Cr = { A.a, A.c, B.a, B.c }; \
|
||||
v4sf Dr = { A.b, A.d, B.b, B.d }; \
|
||||
C = Cr; \
|
||||
D = Dr; \
|
||||
} while (0)
|
||||
|
||||
|
||||
/* VTRANSPOSE4() */
|
||||
#define VTRANSPOSE4(A, B, C, D) \
|
||||
do { \
|
||||
v4sf Ar = { A.a, B.a, C.a, D.a }; \
|
||||
v4sf Br = { A.b, B.b, C.b, D.b }; \
|
||||
v4sf Cr = { A.c, B.c, C.c, D.c }; \
|
||||
v4sf Dr = { A.d, B.d, C.d, D.d }; \
|
||||
A = Ar; \
|
||||
B = Br; \
|
||||
C = Cr; \
|
||||
D = Dr; \
|
||||
} while (0)
|
||||
|
||||
|
||||
/* VSWAPHL() */
|
||||
static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) {
  /* result = [ B.a, B.b, A.c, A.d ]: low half from B, high half from A.
     A is a by-value copy, so its low half can be overwritten directly. */
  A.a = B.a;
  A.b = B.b;
  return A;
}
|
||||
|
||||
|
||||
/* reverse/flip all floats */
|
||||
static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) {
  /* full reversal: [ a, b, c, d ] -> [ d, c, b, a ] */
  v4sf rev;
  rev.a = A.d;
  rev.b = A.c;
  rev.c = A.b;
  rev.d = A.a;
  return rev;
}
|
||||
|
||||
/* reverse/flip complex floats */
|
||||
static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) {
  /* swap the two pairs: [ a, b, c, d ] -> [ c, d, a, b ] */
  v4sf swapped;
  swapped.a = A.c;
  swapped.b = A.d;
  swapped.c = A.a;
  swapped.d = A.b;
  return swapped;
}
|
||||
|
||||
#else
|
||||
/* #pragma message( __FILE__ ": float SCALAR4 macros are not defined" ) */
|
||||
#endif
|
||||
|
||||
|
||||
#if !defined(SIMD_SZ)
|
||||
#pragma message( __FILE__ ": float SCALAR1 macros are defined" )
|
||||
typedef vsfscalar v4sf;
|
||||
|
||||
# define SIMD_SZ 1
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
vsfscalar f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
# define VARCH "Scalar"
|
||||
# define VREQUIRES_ALIGN 0
|
||||
# define VZERO() 0.f
|
||||
# define VMUL(a,b) ((a)*(b))
|
||||
# define VADD(a,b) ((a)+(b))
|
||||
# define VMADD(a,b,c) ((a)*(b)+(c))
|
||||
# define VSUB(a,b) ((a)-(b))
|
||||
# define LD_PS1(p) (p)
|
||||
# define VLOAD_UNALIGNED(ptr) (*(ptr))
|
||||
# define VLOAD_ALIGNED(ptr) (*(ptr))
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)
|
||||
|
||||
#else
|
||||
/* #pragma message( __FILE__ ": float SCALAR1 macros are not defined" ) */
|
||||
#endif
|
||||
|
||||
|
||||
#endif /* PF_SCAL_FLT_H */
|
||||
|
||||
82
pffft/simd/pf_sse1_float.h
Normal file
82
pffft/simd/pf_sse1_float.h
Normal file
@@ -0,0 +1,82 @@
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_SSE1_FLT_H
|
||||
#define PF_SSE1_FLT_H
|
||||
|
||||
/*
|
||||
SSE1 support macros
|
||||
*/
|
||||
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86))
|
||||
#pragma message( __FILE__ ": SSE1 float macros are defined" )
|
||||
|
||||
#include <xmmintrin.h>
|
||||
typedef __m128 v4sf;
|
||||
|
||||
/* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions
|
||||
* anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */
|
||||
# define SIMD_SZ 4
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
float f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
# define VARCH "SSE1"
|
||||
# define VREQUIRES_ALIGN 1
|
||||
# define VZERO() _mm_setzero_ps()
|
||||
# define VMUL(a,b) _mm_mul_ps(a,b)
|
||||
# define VADD(a,b) _mm_add_ps(a,b)
|
||||
# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
|
||||
# define VSUB(a,b) _mm_sub_ps(a,b)
|
||||
# define LD_PS1(p) _mm_set1_ps(p)
|
||||
# define VLOAD_UNALIGNED(ptr) _mm_loadu_ps(ptr)
|
||||
# define VLOAD_ALIGNED(ptr) _mm_load_ps(ptr)
|
||||
|
||||
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
|
||||
# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
|
||||
# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
|
||||
# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
|
||||
|
||||
/* reverse/flip all floats */
|
||||
# define VREV_S(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3))
|
||||
/* reverse/flip complex floats */
|
||||
# define VREV_C(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(1,0,3,2))
|
||||
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
|
||||
|
||||
#else
|
||||
/* #pragma message( __FILE__ ": SSE1 float macros are not defined" ) */
|
||||
#endif
|
||||
|
||||
#endif /* PF_SSE1_FLT_H */
|
||||
|
||||
281
pffft/simd/pf_sse2_double.h
Normal file
281
pffft/simd/pf_sse2_double.h
Normal file
@@ -0,0 +1,281 @@
|
||||
/*
|
||||
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
|
||||
*/
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_SSE2_DBL_H
#define PF_SSE2_DBL_H

/* detect sse2 support under MSVC: map _M_IX86_FP == 2 onto the __SSE2__ macro
 * that is tested below */
#if defined ( _M_IX86_FP )
#  if _M_IX86_FP == 2
#    if !defined(__SSE2__)
#      define __SSE2__
#    endif
#  endif
#endif

/*
  SSE2 64bit support macros

  A vector of 4 doubles is emulated with a pair of 128-bit SSE2 registers
  (struct m256d). The helper functions below are named after the AVX
  intrinsics they stand in for, without the leading underscore.
*/
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined( __SSE4_2__ ) || defined( __SSE4_1__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || defined ( __x86_64__ ) || defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 ))
#pragma message (__FILE__ ": SSE2 double macros are defined" )

#include <assert.h>     /* assert() in mm256_extractf128_pd() */
#include <stdint.h>     /* uintptr_t in VALIGNED() */
#include <emmintrin.h>

/* two 128-bit halves: d128[0] holds elements 0..1, d128[1] holds elements 2..3 */
typedef struct {
  __m128d d128[2];
} m256d;

typedef m256d v4sf;

#  define SIMD_SZ 4

/* gives per-element access to a vector */
typedef union v4sf_union {
  v4sf    v;
  double  f[SIMD_SZ];
} v4sf_union;


#if defined(__GNUC__) || defined(__clang__)

/* save any previous definition, then drop it to avoid a redefinition diagnostic */
#pragma push_macro("FORCE_INLINE")
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif
#define FORCE_INLINE static inline __attribute__((always_inline))

#elif defined (_MSC_VER)
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif
#define FORCE_INLINE static __forceinline

#else
#error "Macro name collisions may happens with unknown compiler"
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif
#define FORCE_INLINE static inline
#endif

/* returns a vector with all 4 elements set to 0.0 */
FORCE_INLINE m256d mm256_setzero_pd(void)
{
  m256d ret;
  ret.d128[0] = ret.d128[1] = _mm_setzero_pd();
  return ret;
}

/* element-wise a * b */
FORCE_INLINE m256d mm256_mul_pd(m256d a, m256d b)
{
  m256d ret;
  ret.d128[0] = _mm_mul_pd(a.d128[0], b.d128[0]);
  ret.d128[1] = _mm_mul_pd(a.d128[1], b.d128[1]);
  return ret;
}

/* element-wise a + b */
FORCE_INLINE m256d mm256_add_pd(m256d a, m256d b)
{
  m256d ret;
  ret.d128[0] = _mm_add_pd(a.d128[0], b.d128[0]);
  ret.d128[1] = _mm_add_pd(a.d128[1], b.d128[1]);
  return ret;
}

/* element-wise a - b */
FORCE_INLINE m256d mm256_sub_pd(m256d a, m256d b)
{
  m256d ret;
  ret.d128[0] = _mm_sub_pd(a.d128[0], b.d128[0]);
  ret.d128[1] = _mm_sub_pd(a.d128[1], b.d128[1]);
  return ret;
}

/* broadcasts the scalar a into all 4 elements */
FORCE_INLINE m256d mm256_set1_pd(double a)
{
  m256d ret;
  ret.d128[0] = ret.d128[1] = _mm_set1_pd(a);
  return ret;
}

/* loads 4 doubles with two aligned 128-bit loads (_mm_load_pd) */
FORCE_INLINE m256d mm256_load_pd (double const * mem_addr)
{
  m256d res;
  res.d128[0] = _mm_load_pd(mem_addr);
  res.d128[1] = _mm_load_pd(mem_addr + 2);
  return res;
}

/* loads 4 doubles with two unaligned 128-bit loads (_mm_loadu_pd) */
FORCE_INLINE m256d mm256_loadu_pd (double const * mem_addr)
{
  m256d res;
  res.d128[0] = _mm_loadu_pd(mem_addr);
  res.d128[1] = _mm_loadu_pd(mem_addr + 2);
  return res;
}


#  define VARCH "SSE2"
#  define VREQUIRES_ALIGN 1
#  define VZERO() mm256_setzero_pd()
#  define VMUL(a,b) mm256_mul_pd(a,b)
#  define VADD(a,b) mm256_add_pd(a,b)
#  define VMADD(a,b,c) mm256_add_pd(mm256_mul_pd(a,b), c)
#  define VSUB(a,b) mm256_sub_pd(a,b)
#  define LD_PS1(p) mm256_set1_pd(p)
#  define VLOAD_UNALIGNED(ptr)  mm256_loadu_pd(ptr)
#  define VLOAD_ALIGNED(ptr)    mm256_load_pd(ptr)


/* returns the lower half (elements 0..1) */
FORCE_INLINE __m128d mm256_castpd256_pd128(m256d a)
{
  return a.d128[0];
}

/* returns half number imm8: 0 = lower (elements 0..1), 1 = upper (elements 2..3) */
FORCE_INLINE __m128d mm256_extractf128_pd (m256d a, const int imm8)
{
  assert(imm8 >= 0 && imm8 <= 1);
  return a.d128[imm8];
}

/* returns a with its upper half replaced by b */
FORCE_INLINE m256d mm256_insertf128_pd_1(m256d a, __m128d b)
{
  m256d res;
  res.d128[0] = a.d128[0];
  res.d128[1] = b;
  return res;
}

/* widens a to 4 elements: a becomes the lower half.
 * The upper half is zeroed so that no indeterminate value is ever returned;
 * the macros in this file overwrite it right away via mm256_insertf128_pd_1(). */
FORCE_INLINE m256d mm256_castpd128_pd256(__m128d a)
{
  m256d res;
  res.d128[0] = a;
  res.d128[1] = _mm_setzero_pd();
  return res;
}

/* returns [ a[0], b[0], a[2], b[2] ] */
FORCE_INLINE m256d mm256_shuffle_pd_00(m256d a, m256d b)
{
  m256d res;
  res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0],0);
  res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1],0);
  return res;
}

/* returns [ a[1], b[1], a[3], b[3] ] */
FORCE_INLINE m256d mm256_shuffle_pd_11(m256d a, m256d b)
{
  m256d res;
  res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0], 3);
  res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1], 3);
  return res;
}

/* returns [ lower half of a, lower half of b ] */
FORCE_INLINE m256d mm256_permute2f128_pd_0x20(m256d a, m256d b)
{
  m256d res;
  res.d128[0] = a.d128[0];
  res.d128[1] = b.d128[0];
  return res;
}

/* returns [ upper half of a, upper half of b ] */
FORCE_INLINE m256d mm256_permute2f128_pd_0x31(m256d a, m256d b)
{
  m256d res;
  res.d128[0] = a.d128[1];
  res.d128[1] = b.d128[1];
  return res;
}

/* returns [ x[3], x[2], x[1], x[0] ] */
FORCE_INLINE m256d mm256_reverse(m256d x)
{
  m256d res;
  res.d128[0] = _mm_shuffle_pd(x.d128[1],x.d128[1],1);
  res.d128[1] = _mm_shuffle_pd(x.d128[0],x.d128[0],1);
  return res;
}

/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
out1 = [ in1[0], in2[0], in1[1], in2[1] ]
out2 = [ in1[2], in2[2], in1[3], in2[3] ]
*/
#  define INTERLEAVE2(in1, in2, out1, out2) {							\
	__m128d low1__ = mm256_castpd256_pd128(in1);						\
	__m128d low2__ = mm256_castpd256_pd128(in2);						\
	__m128d high1__ = mm256_extractf128_pd(in1, 1);					\
	__m128d high2__ = mm256_extractf128_pd(in2, 1);					\
	m256d tmp__ = mm256_insertf128_pd_1(								\
		mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)),		\
		_mm_shuffle_pd(low1__, low2__, 3));								\
	out2 = mm256_insertf128_pd_1(										\
		mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)),	\
		_mm_shuffle_pd(high1__, high2__, 3));							\
	out1 = tmp__;														\
}

/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
out1 = [ in1[0], in1[2], in2[0], in2[2] ]
out2 = [ in1[1], in1[3], in2[1], in2[3] ]
*/
#  define UNINTERLEAVE2(in1, in2, out1, out2) {							\
	__m128d low1__ = mm256_castpd256_pd128(in1);						\
	__m128d low2__ = mm256_castpd256_pd128(in2);						\
	__m128d high1__ = mm256_extractf128_pd(in1, 1);					\
	__m128d high2__ = mm256_extractf128_pd(in2, 1); 					\
	m256d tmp__ = mm256_insertf128_pd_1(								\
		mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)),		\
		_mm_shuffle_pd(low2__, high2__, 0));							\
	out2 = mm256_insertf128_pd_1(										\
		mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)),		\
		_mm_shuffle_pd(low2__, high2__, 3));							\
	out1 = tmp__;														\
}

/* in-place transpose of the 4x4 matrix formed by the four row vectors */
#  define VTRANSPOSE4(row0, row1, row2, row3) {				\
        m256d tmp3, tmp2, tmp1, tmp0;                     \
                                                            \
        tmp0 = mm256_shuffle_pd_00((row0),(row1));       \
        tmp2 = mm256_shuffle_pd_11((row0),(row1));       \
        tmp1 = mm256_shuffle_pd_00((row2),(row3));       \
        tmp3 = mm256_shuffle_pd_11((row2),(row3));       \
                                                            \
        (row0) = mm256_permute2f128_pd_0x20(tmp0, tmp1);	    \
        (row1) = mm256_permute2f128_pd_0x20(tmp2, tmp3); 		\
        (row2) = mm256_permute2f128_pd_0x31(tmp0, tmp1); 		\
        (row3) = mm256_permute2f128_pd_0x31(tmp2, tmp3); 		\
    }

/*VSWAPHL(a, b) pseudo code:
return [ b[0], b[1], a[2], a[3] ]
*/
#  define VSWAPHL(a,b)	\
   mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_castpd256_pd128(b)), mm256_extractf128_pd(a, 1))

/* reverse/flip all floats */
#  define VREV_S(a)   mm256_reverse(a)

/* reverse/flip complex floats */
#  define VREV_C(a)    mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_extractf128_pd(a, 1)), mm256_castpd256_pd128(a))

/* true if ptr is aligned on a 32 byte boundary */
#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)

#endif
#endif
|
||||
5956
pffft/sse2neon.h
Normal file
5956
pffft/sse2neon.h
Normal file
File diff suppressed because it is too large
Load Diff
142
pffft/test_fft_factors.c
Normal file
142
pffft/test_fft_factors.c
Normal file
@@ -0,0 +1,142 @@
|
||||
|
||||
#ifdef PFFFT_ENABLE_FLOAT
|
||||
#include "pffft.h"
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef PFFFT_ENABLE_DOUBLE
|
||||
#include "pffft_double.h"
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
|
||||
|
||||
|
||||
#ifdef PFFFT_ENABLE_FLOAT
|
||||
int test_float(int TL)
|
||||
{
|
||||
PFFFT_Setup * S;
|
||||
|
||||
for (int dir_i = 0; dir_i <= 1; ++dir_i)
|
||||
{
|
||||
for (int cplx_i = 0; cplx_i <= 1; ++cplx_i)
|
||||
{
|
||||
const pffft_direction_t dir = (!dir_i) ? PFFFT_FORWARD : PFFFT_BACKWARD;
|
||||
const pffft_transform_t cplx = (!cplx_i) ? PFFFT_REAL : PFFFT_COMPLEX;
|
||||
const int N_min = pffft_min_fft_size(cplx);
|
||||
const int N_max = N_min * 11 + N_min;
|
||||
int NTL = pffft_nearest_transform_size(TL, cplx, (!dir_i));
|
||||
double near_off = (NTL - TL) * 100.0 / (double)TL;
|
||||
|
||||
fprintf(stderr, "testing float, %s, %s ..\tminimum transform %d; nearest transform for %d is %d (%.2f%% off)\n",
|
||||
(!dir_i) ? "FORWARD" : "BACKWARD", (!cplx_i) ? "REAL" : "COMPLEX", N_min, TL, NTL, near_off );
|
||||
|
||||
for (int N = (N_min/2); N <= N_max; N += (N_min/2))
|
||||
{
|
||||
int R = N, f2 = 0, f3 = 0, f5 = 0, tmp_f;
|
||||
const int factorizable = pffft_is_valid_size(N, cplx);
|
||||
while (R >= 5*N_min && (R % 5) == 0) { R /= 5; ++f5; }
|
||||
while (R >= 3*N_min && (R % 3) == 0) { R /= 3; ++f3; }
|
||||
while (R >= 2*N_min && (R % 2) == 0) { R /= 2; ++f2; }
|
||||
tmp_f = (R == N_min) ? 1 : 0;
|
||||
assert( factorizable == tmp_f );
|
||||
|
||||
S = pffft_new_setup(N, cplx);
|
||||
|
||||
if ( S && !factorizable )
|
||||
{
|
||||
fprintf(stderr, "fft setup successful, but NOT factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
|
||||
return 1;
|
||||
}
|
||||
else if ( !S && factorizable)
|
||||
{
|
||||
fprintf(stderr, "fft setup UNsuccessful, but factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (S)
|
||||
pffft_destroy_setup(S);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef PFFFT_ENABLE_DOUBLE
|
||||
int test_double(int TL)
|
||||
{
|
||||
PFFFTD_Setup * S;
|
||||
for (int dir_i = 0; dir_i <= 1; ++dir_i)
|
||||
{
|
||||
for (int cplx_i = 0; cplx_i <= 1; ++cplx_i)
|
||||
{
|
||||
const pffft_direction_t dir = (!dir_i) ? PFFFT_FORWARD : PFFFT_BACKWARD;
|
||||
const pffft_transform_t cplx = (!cplx_i) ? PFFFT_REAL : PFFFT_COMPLEX;
|
||||
const int N_min = pffftd_min_fft_size(cplx);
|
||||
const int N_max = N_min * 11 + N_min;
|
||||
int NTL = pffftd_nearest_transform_size(TL, cplx, (!dir_i));
|
||||
double near_off = (NTL - TL) * 100.0 / (double)TL;
|
||||
|
||||
fprintf(stderr, "testing double, %s, %s ..\tminimum transform %d; nearest transform for %d is %d (%.2f%% off)\n",
|
||||
(!dir_i) ? "FORWARD" : "BACKWARD", (!cplx_i) ? "REAL" : "COMPLEX", N_min, TL, NTL, near_off );
|
||||
|
||||
for (int N = (N_min/2); N <= N_max; N += (N_min/2))
|
||||
{
|
||||
int R = N, f2 = 0, f3 = 0, f5 = 0, tmp_f;
|
||||
const int factorizable = pffftd_is_valid_size(N, cplx);
|
||||
while (R >= 5*N_min && (R % 5) == 0) { R /= 5; ++f5; }
|
||||
while (R >= 3*N_min && (R % 3) == 0) { R /= 3; ++f3; }
|
||||
while (R >= 2*N_min && (R % 2) == 0) { R /= 2; ++f2; }
|
||||
tmp_f = (R == N_min) ? 1 : 0;
|
||||
assert( factorizable == tmp_f );
|
||||
|
||||
S = pffftd_new_setup(N, cplx);
|
||||
|
||||
if ( S && !factorizable )
|
||||
{
|
||||
fprintf(stderr, "fft setup successful, but NOT factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
|
||||
return 1;
|
||||
}
|
||||
else if ( !S && factorizable)
|
||||
{
|
||||
fprintf(stderr, "fft setup UNsuccessful, but factorizable into min(=%d), 2^%d, 3^%d, 5^%d for N = %d (R = %d)\n", N_min, f2, f3, f5, N, R);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (S)
|
||||
pffftd_destroy_setup(S);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*
   Entry point: runs the float and/or double factorization test, whichever is
   compiled in. The optional first argument is the length for which the
   nearest transform size is reported (default 2).
   Returns the first non-zero test result, else 0.
*/
int main(int argc, char *argv[])
{
  int N = 2;
  int r = 0;

  if (argc > 1)
    N = atoi(argv[1]);

#ifdef PFFFT_ENABLE_FLOAT
  r = test_float(N);
  if (r != 0)
    return r;
#endif

#ifdef PFFFT_ENABLE_DOUBLE
  r = test_double(N);
#endif

  return r;
}
|
||||
|
||||
991
pffft/test_pffastconv.c
Normal file
991
pffft/test_pffastconv.c
Normal file
@@ -0,0 +1,991 @@
|
||||
/*
|
||||
Copyright (c) 2013 Julien Pommier.
|
||||
Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de )
|
||||
*/
|
||||
|
||||
#define _WANT_SNAN 1
|
||||
|
||||
#include "pffft.h"
|
||||
#include "pffastconv.h"
|
||||
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include <limits.h>
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef HAVE_SYS_TIMES
|
||||
# include <sys/times.h>
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
|
||||
/* benchmark duration: 500 ms */
|
||||
#define BENCH_TEST_DURATION_IN_SEC 0.5
|
||||
|
||||
/*
|
||||
   vector support macros: the rest of the code is independent of
|
||||
SSE/Altivec/NEON -- adding support for other platforms with 4-element
|
||||
vectors should be limited to these macros
|
||||
*/
|
||||
#if 0
|
||||
#include "simd/pf_float.h"
|
||||
#endif
|
||||
|
||||
/* RESTRICT: the compiler's no-alias pointer qualifier, or nothing if unknown */
#if defined(_MSC_VER) || defined(__GNUC__)
#  define RESTRICT __restrict
#else
#  define RESTRICT
#endif

/* silence MSVC warning C4244 for this file */
#ifdef _MSC_VER
#pragma warning( disable : 4244 )
#endif
|
||||
|
||||
|
||||
/* INVALID_FLOAT_VAL: marker value written behind the used part of buffers.
 * Picks the first of SNANF, SNAN, NAN, INFINITY that is defined,
 * falling back to FLT_MAX. */
#if defined(SNANF)
#  define INVALID_FLOAT_VAL  SNANF
#elif defined(SNAN)
#  define INVALID_FLOAT_VAL  SNAN
#elif defined(NAN)
#  define INVALID_FLOAT_VAL  NAN
#elif defined(INFINITY)
#  define INVALID_FLOAT_VAL  INFINITY
#else
#  define INVALID_FLOAT_VAL  FLT_MAX
#endif
|
||||
|
||||
|
||||
/*
   uclock_sec(): returns a timestamp in seconds for measuring CPU time.
   With HAVE_SYS_TIMES it is the user time reported by times(), scaled by
   sysconf(_SC_CLK_TCK); otherwise it is clock() / CLOCKS_PER_SEC.

   Both variants are plain external functions: a bare C99 `inline` definition
   provides no external definition and can fail to link when not inlined.
*/
#if defined(HAVE_SYS_TIMES)
  double uclock_sec(void) {
    static double ttclk = 0.;
    struct tms t;
    /* query the tick rate only once */
    if (ttclk == 0.)
      ttclk = sysconf(_SC_CLK_TCK);
    times(&t);
    /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
    return ((double)t.tms_utime) / ttclk;
  }
#else
  double uclock_sec(void)
  { return (double)clock()/(double)CLOCKS_PER_SEC; }
#endif
|
||||
|
||||
|
||||
|
||||
/* applies a convolution to len input samples X, writing to Y;
 * returns the number of produced output samples */
typedef int            (*pfnConvolution)  (void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush);

/* creates a setup for the Nf filter coefficients in Hfwd */
typedef void*          (*pfnConvSetup)    (float *Hfwd, int Nf, int * BlkLen, int flags);

/* returns the convolution function to use with the given setup */
typedef pfnConvolution (*pfnGetConvFnPtr) (void * setup);

/* releases a setup */
typedef void           (*pfnConvDestroy)  (void * setup);


/* setup of the non-FFT reference convolutions */
struct ConvSetup
{
  pfnConvolution pfn;   /* set to NULL by convSetupRev() */
  int N;                /* number of filter coefficients */
  int B;                /* block length as passed in via *BlkLen */
  float * H;            /* coefficients in reversed order, followed by 4 marker values */
  int flags;            /* PFFASTCONV_* flags */
};
|
||||
|
||||
|
||||
void * convSetupRev( float * H, int N, int * BlkLen, int flags )
|
||||
{
|
||||
struct ConvSetup * s = pffastconv_malloc( sizeof(struct ConvSetup) );
|
||||
int i, Nr = N;
|
||||
if (flags & PFFASTCONV_CPLX_INP_OUT)
|
||||
Nr *= 2;
|
||||
Nr += 4;
|
||||
s->pfn = NULL;
|
||||
s->N = N;
|
||||
s->B = *BlkLen;
|
||||
s->H = pffastconv_malloc((unsigned)Nr * sizeof(float));
|
||||
s->flags = flags;
|
||||
memset(s->H, 0, (unsigned)Nr * sizeof(float));
|
||||
if (flags & PFFASTCONV_CPLX_INP_OUT)
|
||||
{
|
||||
for ( i = 0; i < N; ++i ) {
|
||||
s->H[2*(N-1 -i) ] = H[i];
|
||||
s->H[2*(N-1 -i)+1] = H[i];
|
||||
}
|
||||
/* simpler detection of overruns */
|
||||
s->H[ 2*N ] = INVALID_FLOAT_VAL;
|
||||
s->H[ 2*N +1 ] = INVALID_FLOAT_VAL;
|
||||
s->H[ 2*N +2 ] = INVALID_FLOAT_VAL;
|
||||
s->H[ 2*N +3 ] = INVALID_FLOAT_VAL;
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < N; ++i )
|
||||
s->H[ N-1 -i ] = H[i];
|
||||
/* simpler detection of overruns */
|
||||
s->H[ N ] = INVALID_FLOAT_VAL;
|
||||
s->H[ N +1 ] = INVALID_FLOAT_VAL;
|
||||
s->H[ N +2 ] = INVALID_FLOAT_VAL;
|
||||
s->H[ N +3 ] = INVALID_FLOAT_VAL;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
void convDestroyRev( void * setup )
|
||||
{
|
||||
struct ConvSetup * s = (struct ConvSetup*)setup;
|
||||
pffastconv_free(s->H);
|
||||
pffastconv_free(setup);
|
||||
}
|
||||
|
||||
|
||||
pfnConvolution ConvGetFnPtrRev( void * setup )
|
||||
{
|
||||
struct ConvSetup * s = (struct ConvSetup*)setup;
|
||||
if (!s)
|
||||
return NULL;
|
||||
return s->pfn;
|
||||
}
|
||||
|
||||
|
||||
/* Releases the setup; simply forwards to convDestroyRev(). */
void convSimdDestroy( void * setup )
{
  void * const cfg = setup;
  convDestroyRev(cfg);
}
|
||||
|
||||
|
||||
/* Creates a setup with pffastconv_new_setup(), forwarding all arguments.
 * Prints a message and returns NULL if the setup could not be created. */
void * fastConvSetup( float * H, int N, int * BlkLen, int flags )
{
  void * cfg = pffastconv_new_setup( H, N, BlkLen, flags );
  if (cfg)
    return cfg;

  printf("fastConvSetup(N = %d, *BlkLen = %d, flags = %d) = NULL\n", N, *BlkLen, flags);
  return NULL;
}
|
||||
|
||||
|
||||
void fastConvDestroy( void * setup )
|
||||
{
|
||||
pffastconv_destroy_setup( (PFFASTCONV_Setup*)setup );
|
||||
}
|
||||
|
||||
|
||||
|
||||
int slow_conv_R(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
|
||||
{
|
||||
struct ConvSetup * p = (struct ConvSetup*)setup;
|
||||
const float * RESTRICT X = input;
|
||||
const float * RESTRICT Hrev = p->H;
|
||||
float * RESTRICT Y = output;
|
||||
const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
|
||||
const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
|
||||
int i, j;
|
||||
(void)Yref;
|
||||
(void)applyFlush;
|
||||
|
||||
if (p->flags & PFFASTCONV_CPLX_INP_OUT)
|
||||
{
|
||||
for ( i = 0; i <= lenNr; i += 2 )
|
||||
{
|
||||
float sumRe = 0.0F, sumIm = 0.0F;
|
||||
for ( j = 0; j < Nr; j += 2 )
|
||||
{
|
||||
sumRe += X[i+j ] * Hrev[j];
|
||||
sumIm += X[i+j+1] * Hrev[j+1];
|
||||
}
|
||||
Y[i ] = sumRe;
|
||||
Y[i+1] = sumIm;
|
||||
}
|
||||
return i/2;
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i <= lenNr; ++i )
|
||||
{
|
||||
float sum = 0.0F;
|
||||
for (j = 0; j < Nr; ++j )
|
||||
sum += X[i+j] * Hrev[j];
|
||||
Y[i] = sum;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
int slow_conv_A(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
|
||||
{
|
||||
float sum[4];
|
||||
struct ConvSetup * p = (struct ConvSetup*)setup;
|
||||
const float * RESTRICT X = input;
|
||||
const float * RESTRICT Hrev = p->H;
|
||||
float * RESTRICT Y = output;
|
||||
const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
|
||||
const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
|
||||
int i, j;
|
||||
(void)Yref;
|
||||
(void)applyFlush;
|
||||
|
||||
if (p->flags & PFFASTCONV_CPLX_INP_OUT)
|
||||
{
|
||||
if ( (Nr & 3) == 0 )
|
||||
{
|
||||
for ( i = 0; i <= lenNr; i += 2 )
|
||||
{
|
||||
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||||
for (j = 0; j < Nr; j += 4 )
|
||||
{
|
||||
sum[0] += X[i+j] * Hrev[j];
|
||||
sum[1] += X[i+j+1] * Hrev[j+1];
|
||||
sum[2] += X[i+j+2] * Hrev[j+2];
|
||||
sum[3] += X[i+j+3] * Hrev[j+3];
|
||||
}
|
||||
Y[i ] = sum[0] + sum[2];
|
||||
Y[i+1] = sum[1] + sum[3];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const int M = Nr & (~3);
|
||||
for ( i = 0; i <= lenNr; i += 2 )
|
||||
{
|
||||
float tailSumRe = 0.0F, tailSumIm = 0.0F;
|
||||
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||||
for (j = 0; j < M; j += 4 )
|
||||
{
|
||||
sum[0] += X[i+j ] * Hrev[j ];
|
||||
sum[1] += X[i+j+1] * Hrev[j+1];
|
||||
sum[2] += X[i+j+2] * Hrev[j+2];
|
||||
sum[3] += X[i+j+3] * Hrev[j+3];
|
||||
}
|
||||
for ( ; j < Nr; j += 2 ) {
|
||||
tailSumRe += X[i+j ] * Hrev[j ];
|
||||
tailSumIm += X[i+j+1] * Hrev[j+1];
|
||||
}
|
||||
Y[i ] = ( sum[0] + sum[2] ) + tailSumRe;
|
||||
Y[i+1] = ( sum[1] + sum[3] ) + tailSumIm;
|
||||
}
|
||||
}
|
||||
return i/2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( (Nr & 3) == 0 )
|
||||
{
|
||||
for ( i = 0; i <= lenNr; ++i )
|
||||
{
|
||||
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||||
for (j = 0; j < Nr; j += 4 )
|
||||
{
|
||||
sum[0] += X[i+j] * Hrev[j];
|
||||
sum[1] += X[i+j+1] * Hrev[j+1];
|
||||
sum[2] += X[i+j+2] * Hrev[j+2];
|
||||
sum[3] += X[i+j+3] * Hrev[j+3];
|
||||
}
|
||||
Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
|
||||
}
|
||||
return i;
|
||||
}
|
||||
else
|
||||
{
|
||||
const int M = Nr & (~3);
|
||||
/* printf("A: Nr = %d, M = %d, H[M] = %f, H[M+1] = %f, H[M+2] = %f, H[M+3] = %f\n", Nr, M, Hrev[M], Hrev[M+1], Hrev[M+2], Hrev[M+3] ); */
|
||||
for ( i = 0; i <= lenNr; ++i )
|
||||
{
|
||||
float tailSum = 0.0;
|
||||
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||||
for (j = 0; j < M; j += 4 )
|
||||
{
|
||||
sum[0] += X[i+j] * Hrev[j];
|
||||
sum[1] += X[i+j+1] * Hrev[j+1];
|
||||
sum[2] += X[i+j+2] * Hrev[j+2];
|
||||
sum[3] += X[i+j+3] * Hrev[j+3];
|
||||
}
|
||||
for ( ; j < Nr; ++j )
|
||||
tailSum += X[i+j] * Hrev[j];
|
||||
Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int slow_conv_B(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
|
||||
{
|
||||
float sum[4];
|
||||
struct ConvSetup * p = (struct ConvSetup*)setup;
|
||||
(void)Yref;
|
||||
(void)applyFlush;
|
||||
if (p->flags & PFFASTCONV_SYMMETRIC)
|
||||
{
|
||||
const float * RESTRICT X = input;
|
||||
const float * RESTRICT Hrev = p->H;
|
||||
float * RESTRICT Y = output;
|
||||
const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
|
||||
const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
|
||||
const int h = Nr / 2 -4;
|
||||
const int E = Nr -4;
|
||||
int i, j;
|
||||
|
||||
if (p->flags & PFFASTCONV_CPLX_INP_OUT)
|
||||
{
|
||||
for ( i = 0; i <= lenNr; i += 2 )
|
||||
{
|
||||
const int k = i + E;
|
||||
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||||
for (j = 0; j <= h; j += 4 )
|
||||
{
|
||||
sum[0] += Hrev[j ] * ( X[i+j ] + X[k-j+2] );
|
||||
sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+3] );
|
||||
sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j ] );
|
||||
sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j+1] );
|
||||
}
|
||||
Y[i ] = sum[0] + sum[2];
|
||||
Y[i+1] = sum[1] + sum[3];
|
||||
}
|
||||
return i/2;
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i <= lenNr; ++i )
|
||||
{
|
||||
const int k = i + E;
|
||||
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||||
for (j = 0; j <= h; j += 4 )
|
||||
{
|
||||
sum[0] += Hrev[j ] * ( X[i+j ] + X[k-j+3] );
|
||||
sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+2] );
|
||||
sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j+1] );
|
||||
sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j ] );
|
||||
}
|
||||
Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
|
||||
}
|
||||
return i;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const float * RESTRICT X = input;
|
||||
const float * RESTRICT Hrev = p->H;
|
||||
float * RESTRICT Y = output;
|
||||
const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
|
||||
const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
|
||||
int i, j;
|
||||
|
||||
if (p->flags & PFFASTCONV_CPLX_INP_OUT)
|
||||
{
|
||||
for ( i = 0; i <= lenNr; i += 2 )
|
||||
{
|
||||
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||||
for (j = 0; j < Nr; j += 4 )
|
||||
{
|
||||
sum[0] += X[i+j] * Hrev[j];
|
||||
sum[1] += X[i+j+1] * Hrev[j+1];
|
||||
sum[2] += X[i+j+2] * Hrev[j+2];
|
||||
sum[3] += X[i+j+3] * Hrev[j+3];
|
||||
}
|
||||
Y[i ] = sum[0] + sum[2];
|
||||
Y[i+1] = sum[1] + sum[3];
|
||||
}
|
||||
return i/2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( (Nr & 3) == 0 )
|
||||
{
|
||||
for ( i = 0; i <= lenNr; ++i )
|
||||
{
|
||||
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||||
for (j = 0; j < Nr; j += 4 )
|
||||
{
|
||||
sum[0] += X[i+j] * Hrev[j];
|
||||
sum[1] += X[i+j+1] * Hrev[j+1];
|
||||
sum[2] += X[i+j+2] * Hrev[j+2];
|
||||
sum[3] += X[i+j+3] * Hrev[j+3];
|
||||
}
|
||||
Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]);
|
||||
}
|
||||
return i;
|
||||
}
|
||||
else
|
||||
{
|
||||
const int M = Nr & (~3);
|
||||
/* printf("B: Nr = %d\n", Nr ); */
|
||||
for ( i = 0; i <= lenNr; ++i )
|
||||
{
|
||||
float tailSum = 0.0;
|
||||
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||||
for (j = 0; j < M; j += 4 )
|
||||
{
|
||||
sum[0] += X[i+j] * Hrev[j];
|
||||
sum[1] += X[i+j+1] * Hrev[j+1];
|
||||
sum[2] += X[i+j+2] * Hrev[j+2];
|
||||
sum[3] += X[i+j+3] * Hrev[j+3];
|
||||
}
|
||||
for ( ; j < Nr; ++j )
|
||||
tailSum += X[i+j] * Hrev[j];
|
||||
Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
int fast_conv(void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush)
|
||||
{
|
||||
(void)Yref;
|
||||
return pffastconv_apply( (PFFASTCONV_Setup*)setup, X, len, Y, applyFlush );
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* Debug helper, currently a no-op: the code printing the first N values of V,
 * perLine values per line and prefixed with st, is compiled out. */
void printFirst( const float * V, const char * st, const int N, const int perLine )
{
#if 0
  int i;
  for ( i = 0; i < N; ++i )
  {
    if ( (i % perLine) == 0 )
      printf("\n%s[%d]", st, i);
    printf("\t%.1f", V[i]);
  }
  printf("\n");
#else
  /* nothing to do, only avoid warnings on unused parameters */
  (void)V;
  (void)st;
  (void)N;
  (void)perLine;
#endif
}
|
||||
|
||||
|
||||
|
||||
/* size of the per-algorithm tables in test().
 * NOTE(review): the table initializers visible in test() list 14 entries, so
 * the last slot appears to stay zero / NULL - confirm that 15 is intended */
#define NUMY  15
|
||||
|
||||
|
||||
int test(int FILTERLEN, int convFlags, const int testOutLen, int printDbg, int printSpeed, int abortFirstFastAlgo, int printErrValues, int printAsCSV, int *pIsFirstFilterLen) {
|
||||
double t0, t1, tstop, td, tdref;
|
||||
float *X, *H;
|
||||
float *Y[NUMY];
|
||||
int64_t outN[NUMY];
|
||||
/* 256 KFloats or 16 MFloats data */
|
||||
#if 1
|
||||
const int len = testOutLen ? (1 << 18) : (1 << 24);
|
||||
#elif 0
|
||||
const int len = testOutLen ? (1 << 18) : (1 << 13);
|
||||
#else
|
||||
const int len = testOutLen ? (1 << 18) : (1024);
|
||||
#endif
|
||||
const int cplxFactor = ( convFlags & PFFASTCONV_CPLX_INP_OUT ) ? 2 : 1;
|
||||
const int lenC = len / cplxFactor;
|
||||
|
||||
int yi, yc, posMaxErr;
|
||||
float yRangeMin, yRangeMax, yErrLimit, maxErr = 0.0;
|
||||
int i, j, numErrOverLimit, iter;
|
||||
int retErr = 0;
|
||||
|
||||
/* 0 1 2 3 4 5 6 7 8 9, 10, 11, 12, 13 */
|
||||
pfnConvSetup aSetup[NUMY] = { convSetupRev, convSetupRev, convSetupRev, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, };
|
||||
pfnConvDestroy aDestroy[NUMY] = { convDestroyRev, convDestroyRev, convDestroyRev, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, };
|
||||
pfnGetConvFnPtr aGetFnPtr[NUMY] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, };
|
||||
pfnConvolution aConv[NUMY] = { slow_conv_R, slow_conv_A, slow_conv_B, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, };
|
||||
const char * convText[NUMY] = { "R(non-simd)", "A(non-simd)", "B(non-simd)", "fast_conv_64", "fast_conv_128", "fast_conv_256", "fast_conv_512", "fast_conv_1K", "fast_conv_2K", "fast_conv_4K", "fast_conv_8K", "fast_conv_16K", "fast_conv_32K", "fast_conv_64K", };
|
||||
int aFastAlgo[NUMY] = { 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, };
|
||||
void * aSetupCfg[NUMY] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, };
|
||||
//int aBlkLen[NUMY] = { 1024, 1024, 1024, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, };
|
||||
int aBlkLen[NUMY] = { 8192, 8192, 8192, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, };
|
||||
#if 1
|
||||
int aRunAlgo[NUMY] = { 1, 1, 1, FILTERLEN<64, FILTERLEN<128, FILTERLEN<256, FILTERLEN<512, FILTERLEN<1024, FILTERLEN<2048, FILTERLEN<4096, FILTERLEN<8192, FILTERLEN<16384, FILTERLEN<32768, FILTERLEN<65536, };
|
||||
#elif 0
|
||||
int aRunAlgo[NUMY] = { 1, 0, 0, 0 && FILTERLEN<64, 1 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048, 0 && FILTERLEN<4096, 0 && FILTERLEN<8192, 0 && FILTERLEN<16384, 0 && FILTERLEN<32768, 0 && FILTERLEN<65536, };
|
||||
#else
|
||||
int aRunAlgo[NUMY] = { 1, 1, 1, 0 && FILTERLEN<64, 0 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048, 0 && FILTERLEN<4096, 0 && FILTERLEN<8192, 0 && FILTERLEN<16384, 0 && FILTERLEN<32768, 0 && FILTERLEN<65536, };
|
||||
#endif
|
||||
double aSpeedFactor[NUMY], aDuration[NUMY], procSmpPerSec[NUMY];
|
||||
int aNumIters[NUMY], aNumLoops[NUMY];
|
||||
|
||||
X = pffastconv_malloc( (unsigned)(len+4) * sizeof(float) );
|
||||
for ( i=0; i < NUMY; ++i)
|
||||
{
|
||||
if ( 1 || i < 2 )
|
||||
Y[i] = pffastconv_malloc( (unsigned)len * sizeof(float) );
|
||||
else
|
||||
Y[i] = Y[1];
|
||||
|
||||
Y[i][0] = 123.F; /* test for pffft_zconvolve_no_accu() */
|
||||
aSpeedFactor[i] = -1.0;
|
||||
aDuration[i] = -1.0;
|
||||
procSmpPerSec[i] = -1.0;
|
||||
aNumIters[i] = 0;
|
||||
aNumLoops[i] = 0;
|
||||
}
|
||||
|
||||
H = pffastconv_malloc((unsigned)FILTERLEN * sizeof(float));
|
||||
|
||||
/* initialize input */
|
||||
if ( convFlags & PFFASTCONV_CPLX_INP_OUT )
|
||||
{
|
||||
for ( i = 0; i < lenC; ++i )
|
||||
{
|
||||
X[2*i ] = (float)(i % 4093); /* 4094 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
|
||||
X[2*i+1] = (float)((i+2048) % 4093);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < len; ++i )
|
||||
X[i] = (float)(i % 4093); /* 4094 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
|
||||
}
|
||||
X[ len ] = INVALID_FLOAT_VAL;
|
||||
X[ len +1 ] = INVALID_FLOAT_VAL;
|
||||
X[ len +2 ] = INVALID_FLOAT_VAL;
|
||||
X[ len +3 ] = INVALID_FLOAT_VAL;
|
||||
|
||||
if (!testOutLen)
|
||||
printFirst( X, "X", 64, 8 );
|
||||
|
||||
/* filter coeffs */
|
||||
memset( H, 0, FILTERLEN * sizeof(float) );
|
||||
#if 1
|
||||
if ( convFlags & PFFASTCONV_SYMMETRIC )
|
||||
{
|
||||
const int half = FILTERLEN / 2;
|
||||
for ( j = 0; j < half; ++j ) {
|
||||
switch (j % 3) {
|
||||
case 0: H[j] = H[FILTERLEN-1-j] = -1.0F; break;
|
||||
case 1: H[j] = H[FILTERLEN-1-j] = 1.0F; break;
|
||||
case 2: H[j] = H[FILTERLEN-1-j] = 0.5F; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( j = 0; j < FILTERLEN; ++j ) {
|
||||
switch (j % 3) {
|
||||
case 0: H[j] = -1.0F; break;
|
||||
case 1: H[j] = 1.0F; break;
|
||||
case 2: H[j] = 0.5F; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
H[0] = 1.0F;
|
||||
H[FILTERLEN -1] = 1.0F;
|
||||
#endif
|
||||
if (!testOutLen)
|
||||
printFirst( H, "H", FILTERLEN, 8 );
|
||||
|
||||
if (!printAsCSV)
|
||||
{
|
||||
printf("\n");
|
||||
printf("filterLen = %d\t%s%s\t%s:\n", FILTERLEN,
|
||||
((convFlags & PFFASTCONV_CPLX_INP_OUT)?"cplx":"real"),
|
||||
(convFlags & PFFASTCONV_CPLX_INP_OUT)?((convFlags & PFFASTCONV_CPLX_SINGLE_FFT)?" single":" 2x") : "",
|
||||
((convFlags & PFFASTCONV_SYMMETRIC)?"symmetric":"non-sym") );
|
||||
}
|
||||
|
||||
int hadFastAlgo = 0;
|
||||
|
||||
while (1)
|
||||
{
|
||||
|
||||
for ( yi = 0; yi < NUMY; ++yi )
|
||||
{
|
||||
if (!aRunAlgo[yi])
|
||||
continue;
|
||||
|
||||
if ( aFastAlgo[yi] && abortFirstFastAlgo && hadFastAlgo )
|
||||
{
|
||||
aRunAlgo[yi] = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
hadFastAlgo = hadFastAlgo | aFastAlgo[yi];
|
||||
|
||||
aSetupCfg[yi] = aSetup[yi]( H, FILTERLEN, &aBlkLen[yi], convFlags );
|
||||
|
||||
/* get effective apply function ptr */
|
||||
if ( aSetupCfg[yi] && aGetFnPtr[yi] )
|
||||
aConv[yi] = aGetFnPtr[yi]( aSetupCfg[yi] );
|
||||
|
||||
if ( aSetupCfg[yi] && aConv[yi] )
|
||||
{
|
||||
if (testOutLen)
|
||||
{
|
||||
t0 = uclock_sec();
|
||||
outN[yi] = aConv[yi]( aSetupCfg[yi], X, lenC, Y[yi], Y[0], 1 /* applyFlush */ );
|
||||
t1 = uclock_sec();
|
||||
td = t1 - t0;
|
||||
}
|
||||
else
|
||||
{
|
||||
//const int blkLen = 4096; /* required for 'fast_conv_4K' */
|
||||
const int blkLen = aBlkLen[yi];
|
||||
int64_t offC = 0, offS, Nout;
|
||||
int k;
|
||||
iter = 0;
|
||||
outN[yi] = 0;
|
||||
aNumLoops[yi] = 1;
|
||||
t0 = uclock_sec();
|
||||
tstop = t0 + BENCH_TEST_DURATION_IN_SEC;
|
||||
do
|
||||
{
|
||||
const int prev_iter = iter;
|
||||
for ( k = 0; k < 128 && offC +blkLen < lenC; ++k )
|
||||
{
|
||||
offS = cplxFactor * offC;
|
||||
Nout = aConv[yi]( aSetupCfg[yi], X +offS, blkLen, Y[yi] +offS, Y[0], 0 /* applyFlush */ );
|
||||
offC += Nout;
|
||||
++iter;
|
||||
if ( !Nout )
|
||||
break;
|
||||
}
|
||||
//if ( !Nout )
|
||||
// break;
|
||||
t1 = uclock_sec();
|
||||
if ( prev_iter == iter ) // restart from begin of input?
|
||||
{
|
||||
offC = 0;
|
||||
++aNumLoops[yi];
|
||||
}
|
||||
} while ( t1 < tstop );
|
||||
outN[yi] = offC;
|
||||
td = t1 - t0;
|
||||
procSmpPerSec[yi] = cplxFactor * (double)outN[yi] * (1.0 / td);
|
||||
aNumIters[yi] = iter;
|
||||
aDuration[yi] = td;
|
||||
|
||||
//printf("algo '%s':\t%.2f MSmp\tin\t%.1f ms\t= %g kSmpPerSec\t%d iters\t%.1f ms\n",
|
||||
// convText[yi], (double)outN[yi]/(1000.0 * 1000.0), 1000.0 * aDuration[yi], procSmpPerSec[yi] * 0.001, aNumIters[yi], 1000.0 * td );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
outN[yi] = 0;
|
||||
}
|
||||
if ( yi == 0 ) {
|
||||
const float * Yvals = Y[0];
|
||||
const int64_t refOutLen = cplxFactor * outN[0];
|
||||
tdref = td;
|
||||
if (printDbg) {
|
||||
printf("convolution '%s' took: %f ms\n", convText[yi], td*1000.0);
|
||||
printf(" convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
|
||||
}
|
||||
aSpeedFactor[yi] = 1.0;
|
||||
/* */
|
||||
yRangeMin = FLT_MAX;
|
||||
yRangeMax = FLT_MIN;
|
||||
for ( i = 0; i < refOutLen; ++i )
|
||||
{
|
||||
if ( yRangeMax < Yvals[i] ) yRangeMax = Yvals[i];
|
||||
if ( yRangeMin > Yvals[i] ) yRangeMin = Yvals[i];
|
||||
}
|
||||
yErrLimit = fabsf(yRangeMax - yRangeMin) / ( 100.0F * 1000.0F );
|
||||
/* yErrLimit = 0.01F; */
|
||||
if (testOutLen) {
|
||||
if (1) {
|
||||
printf("reference output len = %" PRId64 " smp\n", outN[0]);
|
||||
printf("reference output range |%.1f ..%.1f| = %.1f ==> err limit = %f\n", yRangeMin, yRangeMax, yRangeMax - yRangeMin, yErrLimit);
|
||||
}
|
||||
printFirst( Yvals, "Yref", 64, 8 );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
aSpeedFactor[yi] = tdref / td;
|
||||
if (printDbg) {
|
||||
printf("\nconvolution '%s' took: %f ms == %f %% == %f X\n", convText[yi], td*1000.0, td * 100 / tdref, tdref / td);
|
||||
printf(" convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int iMaxSpeedSlowAlgo = -1;
|
||||
int iFirstFastAlgo = -1;
|
||||
int iMaxSpeedFastAlgo = -1;
|
||||
int iPrintedRefOutLen = 0;
|
||||
{
|
||||
for ( yc = 1; yc < NUMY; ++yc )
|
||||
{
|
||||
if (!aRunAlgo[yc])
|
||||
continue;
|
||||
if (aFastAlgo[yc]) {
|
||||
if ( iMaxSpeedFastAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedFastAlgo] )
|
||||
iMaxSpeedFastAlgo = yc;
|
||||
|
||||
if (iFirstFastAlgo < 0)
|
||||
iFirstFastAlgo = yc;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( iMaxSpeedSlowAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedSlowAlgo] )
|
||||
iMaxSpeedSlowAlgo = yc;
|
||||
}
|
||||
}
|
||||
|
||||
if (printSpeed)
|
||||
{
|
||||
if (testOutLen)
|
||||
{
|
||||
if (iMaxSpeedSlowAlgo >= 0 )
|
||||
printf("fastest slow algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedSlowAlgo], aSpeedFactor[iMaxSpeedSlowAlgo], 1000.0 * aDuration[iMaxSpeedSlowAlgo]);
|
||||
if (0 != iMaxSpeedSlowAlgo && aRunAlgo[0])
|
||||
printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[0], aSpeedFactor[0], 1000.0 * aDuration[0]);
|
||||
if (1 != iMaxSpeedSlowAlgo && aRunAlgo[1])
|
||||
printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[1], aSpeedFactor[1], 1000.0 * aDuration[1]);
|
||||
|
||||
if (iFirstFastAlgo >= 0 && iFirstFastAlgo != iMaxSpeedFastAlgo && aRunAlgo[iFirstFastAlgo])
|
||||
printf("first fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo], aSpeedFactor[iFirstFastAlgo], 1000.0 * aDuration[iFirstFastAlgo]);
|
||||
if (iFirstFastAlgo >= 0 && iFirstFastAlgo+1 != iMaxSpeedFastAlgo && iFirstFastAlgo+1 < NUMY && aRunAlgo[iFirstFastAlgo+1])
|
||||
printf("2nd fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo+1], aSpeedFactor[iFirstFastAlgo+1], 1000.0 * aDuration[iFirstFastAlgo+1]);
|
||||
|
||||
if ( 0 <= iMaxSpeedFastAlgo && iMaxSpeedFastAlgo < NUMY && aRunAlgo[iMaxSpeedFastAlgo] )
|
||||
{
|
||||
printf("fastest fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedFastAlgo], aSpeedFactor[iMaxSpeedFastAlgo], 1000.0 * aDuration[iMaxSpeedFastAlgo]);
|
||||
if ( 0 <= iMaxSpeedSlowAlgo && iMaxSpeedSlowAlgo < NUMY && aRunAlgo[iMaxSpeedSlowAlgo] )
|
||||
printf("fast / slow ratio: %f X\n", aSpeedFactor[iMaxSpeedFastAlgo] / aSpeedFactor[iMaxSpeedSlowAlgo] );
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
else
|
||||
{
|
||||
// print columns in 1st line
|
||||
if (printAsCSV && *pIsFirstFilterLen)
|
||||
{
|
||||
printf("\n# filterLen, filterOrder, Re/Cx, type, sym, ");
|
||||
for ( yc = 0; yc < NUMY; ++yc )
|
||||
{
|
||||
if (!aRunAlgo[yc] || procSmpPerSec[yc] <= 0.0)
|
||||
continue;
|
||||
if (printAsCSV)
|
||||
printf("%s, ", convText[yc]);
|
||||
}
|
||||
*pIsFirstFilterLen = 0;
|
||||
}
|
||||
|
||||
for ( yc = 0; yc < NUMY; ++yc )
|
||||
{
|
||||
if (!yc)
|
||||
{
|
||||
double filterExp = log10((double)FILTERLEN) / log10(2.0);
|
||||
printf("\n%5d, %5.1f, %s, %s, %s, ", FILTERLEN, filterExp,
|
||||
((convFlags & PFFASTCONV_CPLX_INP_OUT)?"cplx":"real"),
|
||||
(convFlags & PFFASTCONV_CPLX_INP_OUT)?((convFlags & PFFASTCONV_CPLX_SINGLE_FFT)?" single":" 2x") : "",
|
||||
((convFlags & PFFASTCONV_SYMMETRIC)?"symmetric":"non-sym")
|
||||
);
|
||||
}
|
||||
if (!aRunAlgo[yc] || procSmpPerSec[yc] <= 0.0)
|
||||
continue;
|
||||
if (printAsCSV)
|
||||
printf("%.0f, ", procSmpPerSec[yc] * 0.001);
|
||||
else
|
||||
printf("algo '%s':\t%.2f MSmp\tin\t%.1f ms\t= %g kSmpPerSec\t%d iters\t%d loops\n",
|
||||
convText[yc], (double)outN[yc]/(1000.0 * 1000.0), 1000.0 * aDuration[yc], procSmpPerSec[yc] * 0.001, aNumIters[yc], aNumLoops[yc] );
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for ( yc = 1; yc < NUMY; ++yc )
|
||||
{
|
||||
const float * Yref;
|
||||
const float * Ycurr;
|
||||
int outMin;
|
||||
|
||||
if (!aRunAlgo[yc])
|
||||
continue;
|
||||
|
||||
if (printDbg)
|
||||
printf("\n");
|
||||
|
||||
if ( outN[yc] == 0 )
|
||||
{
|
||||
if (!printAsCSV)
|
||||
printf("output size 0: '%s' not implemented\n", convText[yc]);
|
||||
}
|
||||
else if ( outN[0] != outN[yc] /* && aFastAlgo[yc] */ && testOutLen )
|
||||
{
|
||||
if (!iPrintedRefOutLen)
|
||||
{
|
||||
printf("reference output size = %" PRId64 ", delta to (cplx) input length = %" PRId64 " smp\n", outN[0], (len / cplxFactor) - outN[0]);
|
||||
iPrintedRefOutLen = 1;
|
||||
}
|
||||
printf("output size doesn't match!: ref (FILTERLEN %d) returned %" PRId64 " smp, '%s' returned %" PRId64 " smp : delta = %" PRId64 " smp\n",
|
||||
FILTERLEN, outN[0], convText[yc], outN[yc], outN[yc] - outN[0] );
|
||||
retErr = 1;
|
||||
}
|
||||
|
||||
posMaxErr = 0;
|
||||
maxErr = -1.0;
|
||||
Yref = Y[0];
|
||||
Ycurr = Y[yc];
|
||||
outMin = ( outN[yc] < outN[0] ) ? outN[yc] : outN[0];
|
||||
numErrOverLimit = 0;
|
||||
for ( i = 0; i < outMin; ++i )
|
||||
{
|
||||
if ( numErrOverLimit < 6 && fabs(Ycurr[i] - Yref[i]) >= yErrLimit && printErrValues )
|
||||
{
|
||||
printf("algo '%s': at %d: ***ERROR*** = %f, errLimit = %f, ref = %f, actual = %f\n",
|
||||
convText[yc], i, fabs(Ycurr[i] - Yref[i]), yErrLimit, Yref[i], Ycurr[i] );
|
||||
++numErrOverLimit;
|
||||
}
|
||||
|
||||
if ( fabs(Ycurr[i] - Yref[i]) > maxErr )
|
||||
{
|
||||
maxErr = fabsf(Ycurr[i] - Yref[i]);
|
||||
posMaxErr = i;
|
||||
}
|
||||
}
|
||||
|
||||
if ( printDbg || (iMaxSpeedSlowAlgo == i) || (iMaxSpeedFastAlgo == i) )
|
||||
printf("max difference for '%s' is %g at sample idx %d of max inp 4093-1 == %f %%\n", convText[yc], maxErr, posMaxErr, maxErr * 100.0 / 4092.0 );
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
pffastconv_free(X);
|
||||
for ( i=0; i < NUMY; ++i)
|
||||
{
|
||||
if ( 1 || i < 2 )
|
||||
pffastconv_free( Y[i] );
|
||||
if (!aRunAlgo[i])
|
||||
continue;
|
||||
aDestroy[i]( aSetupCfg[i] );
|
||||
}
|
||||
|
||||
pffastconv_free(H);
|
||||
|
||||
return retErr;
|
||||
}
|
||||
|
||||
/* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions */
|
||||
void validate_pffft_simd();
|
||||
int validate_pffft_simd_ex(FILE * DbgOut);
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int result = 0;
|
||||
int i, k, M, flagsA, flagsB, flagsC, testOutLen, printDbg, printSpeed;
|
||||
int testOutLens = 1, benchConv = 1, quickTest = 0, slowTest = 0;
|
||||
int testReal = 1, testCplx = 1, testSymetric = 0, abortFirstFastAlgo = 1, printErrValues = 0, printAsCSV = 1;
|
||||
int isFirstFilterLen = 1;
|
||||
|
||||
for ( i = 1; i < argc; ++i ) {
|
||||
|
||||
if (!strcmp(argv[i], "--test-simd")) {
|
||||
int numErrs = validate_pffft_simd_ex(stdout);
|
||||
fprintf( ( numErrs != 0 ? stderr : stdout ), "validate_pffft_simd_ex() returned %d errors!\n", numErrs);
|
||||
return ( numErrs > 0 ? 1 : 0 );
|
||||
}
|
||||
|
||||
if (!strcmp(argv[i], "--no-len")) {
|
||||
testOutLens = 0;
|
||||
}
|
||||
else if (!strcmp(argv[i], "--no-bench")) {
|
||||
benchConv = 0;
|
||||
}
|
||||
else if (!strcmp(argv[i], "--quick")) {
|
||||
quickTest = 1;
|
||||
}
|
||||
else if (!strcmp(argv[i], "--slow")) {
|
||||
slowTest = 1;
|
||||
}
|
||||
else if (!strcmp(argv[i], "--real")) {
|
||||
testCplx = 0;
|
||||
}
|
||||
else if (!strcmp(argv[i], "--cplx")) {
|
||||
testReal = 0;
|
||||
}
|
||||
else if (!strcmp(argv[i], "--sym")) {
|
||||
testSymetric = 1;
|
||||
}
|
||||
else /* if (!strcmp(argv[i], "--help")) */ {
|
||||
printf("usage: %s [--test-simd] [--no-len] [--no-bench] [--quick|--slow] [--real|--cplx] [--sym]\n", argv[0]);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (testOutLens)
|
||||
{
|
||||
for ( k = 0; k < 3; ++k )
|
||||
{
|
||||
if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
|
||||
continue;
|
||||
printf("\n\n==========\n");
|
||||
printf("testing %s %s output lengths ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
|
||||
printf("==========\n");
|
||||
flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
|
||||
flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
|
||||
flagsC = flagsB | PFFASTCONV_CPLX_SINGLE_FFT;
|
||||
testOutLen = 1;
|
||||
printDbg = 0;
|
||||
printSpeed = 0;
|
||||
for ( M = 128 - 4; M <= (quickTest ? 128+16 : 256); ++M )
|
||||
{
|
||||
if ( (M % 16) != 0 && testSymetric )
|
||||
continue;
|
||||
result |= test(M, flagsB, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, 0, &isFirstFilterLen);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (benchConv)
|
||||
{
|
||||
printf("quickTest is %d\n", quickTest);
|
||||
printf("slowTest is %d\n", slowTest);
|
||||
|
||||
for ( k = 0; k < 3; ++k )
|
||||
{
|
||||
if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
|
||||
continue;
|
||||
if (!printAsCSV)
|
||||
{
|
||||
printf("\n\n==========\n");
|
||||
printf("starting %s %s benchmark against linear convolutions ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
|
||||
printf("==========\n");
|
||||
}
|
||||
flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
|
||||
flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
|
||||
flagsC = flagsB | ( k == 2 ? PFFASTCONV_CPLX_SINGLE_FFT : 0 );
|
||||
testOutLen = 0;
|
||||
printDbg = 0;
|
||||
printSpeed = 1;
|
||||
if (!slowTest) {
|
||||
if (!quickTest) {
|
||||
result |= test(32, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||
result |= test(32 + 16, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||
}
|
||||
result |= test(64, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||
if (!quickTest) {
|
||||
result |= test(64 + 32, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||
result |= test(128, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||
}
|
||||
}
|
||||
if (!quickTest) {
|
||||
result |= test(128+ 64, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||
result |= test(256, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||
result |= test(256+128, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||
result |= test(512, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||
result |= test(1024, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||
|
||||
result |= test(2048, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||
result |= test(4096, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||
result |= test(8192, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||
result |= test(16384, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||
result |= test(32768, flagsC, testOutLen, printDbg, printSpeed, abortFirstFastAlgo, printErrValues, printAsCSV, &isFirstFilterLen);
|
||||
}
|
||||
if (printAsCSV)
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
371
pffft/test_pffft.c
Normal file
371
pffft/test_pffft.c
Normal file
@@ -0,0 +1,371 @@
|
||||
/*
|
||||
Copyright (c) 2013 Julien Pommier.
|
||||
|
||||
Small test for PFFFT
|
||||
|
||||
How to build:
|
||||
|
||||
on linux, with fftw3:
|
||||
gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
|
||||
|
||||
on macos, without fftw3:
|
||||
clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
|
||||
|
||||
on macos, with fftw3:
|
||||
clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate
|
||||
|
||||
as alternative: replace clang by gcc.
|
||||
|
||||
on windows, with visual c++:
|
||||
cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
|
||||
|
||||
build without SIMD instructions:
|
||||
gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm
|
||||
|
||||
*/
|
||||
|
||||
#ifdef PFFFT_ENABLE_FLOAT
|
||||
#include "pffft.h"
|
||||
|
||||
typedef float pffft_scalar;
|
||||
#else
|
||||
/*
|
||||
Note: adapted for double precision dynamic range version.
|
||||
*/
|
||||
#include "pffft_double.h"
|
||||
|
||||
typedef double pffft_scalar;
|
||||
#endif
|
||||
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
/* define own constants required to turn off g++ extensions .. */
|
||||
#ifndef M_PI
|
||||
#define M_PI 3.14159265358979323846 /* pi */
|
||||
#endif
|
||||
|
||||
/* EXPECTED_DYN_RANGE in dB:
|
||||
* single precision float has 24 bits mantissa
|
||||
* => 24 Bits * 6 dB = 144 dB
|
||||
* allow a few dB tolerance (even 144 dB looks good on my PC)
|
||||
*/
|
||||
#ifdef PFFFT_ENABLE_FLOAT
|
||||
#define EXPECTED_DYN_RANGE 140.0
|
||||
#else
|
||||
#define EXPECTED_DYN_RANGE 215.0
|
||||
#endif
|
||||
|
||||
/* maximum allowed phase error in degree */
|
||||
#define DEG_ERR_LIMIT 1E-4
|
||||
|
||||
/* maximum allowed magnitude error in amplitude (of 1.0 or 1.1) */
|
||||
#define MAG_ERR_LIMIT 1E-6
|
||||
|
||||
|
||||
#define PRINT_SPEC 0
|
||||
|
||||
#define PWR2LOG(PWR) ( (PWR) < 1E-30 ? 10.0*log10(1E-30) : 10.0*log10(PWR) )
|
||||
|
||||
|
||||
|
||||
int test(int N, int cplx, int useOrdered) {
|
||||
int Nfloat = (cplx ? N*2 : N);
|
||||
#ifdef PFFFT_ENABLE_FLOAT
|
||||
pffft_scalar *X = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||
pffft_scalar *Y = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||
pffft_scalar *R = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||
pffft_scalar *Z = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||
pffft_scalar *W = pffft_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||
#else
|
||||
pffft_scalar *X = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||
pffft_scalar *Y = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||
pffft_scalar *R = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||
pffft_scalar *Z = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||
pffft_scalar *W = pffftd_aligned_malloc((unsigned)Nfloat * sizeof(pffft_scalar));
|
||||
#endif
|
||||
pffft_scalar amp = (pffft_scalar)1.0;
|
||||
double freq, dPhi, phi, phi0;
|
||||
double pwr, pwrCar, pwrOther, err, errSum, mag, expextedMag;
|
||||
int k, j, m, iter, kmaxOther, retError = 0;
|
||||
|
||||
#ifdef PFFFT_ENABLE_FLOAT
|
||||
assert( pffft_is_power_of_two(N) );
|
||||
PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
|
||||
#else
|
||||
assert( pffftd_is_power_of_two(N) );
|
||||
PFFFTD_Setup *s = pffftd_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
|
||||
#endif
|
||||
assert(s);
|
||||
if (!s) {
|
||||
printf("Error setting up PFFFT!\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
for ( k = m = 0; k < (cplx? N : (1 + N/2) ); k += N/16, ++m )
|
||||
{
|
||||
amp = (pffft_scalar)( ( (m % 3) == 0 ) ? 1.0 : 1.1 );
|
||||
freq = (k < N/2) ? ((double)k / N) : ((double)(k-N) / N);
|
||||
dPhi = 2.0 * M_PI * freq;
|
||||
if ( dPhi < 0.0 )
|
||||
dPhi += 2.0 * M_PI;
|
||||
|
||||
iter = -1;
|
||||
while (1)
|
||||
{
|
||||
++iter;
|
||||
|
||||
if (iter)
|
||||
printf("bin %d: dphi = %f for freq %f\n", k, dPhi, freq);
|
||||
|
||||
/* generate cosine carrier as time signal - start at defined phase phi0 */
|
||||
phi = phi0 = (m % 4) * 0.125 * M_PI; /* have phi0 < 90 deg to be normalized */
|
||||
for ( j = 0; j < N; ++j )
|
||||
{
|
||||
if (cplx) {
|
||||
X[2*j] = amp * (pffft_scalar)cos(phi); /* real part */
|
||||
X[2*j+1] = amp * (pffft_scalar)sin(phi); /* imag part */
|
||||
}
|
||||
else
|
||||
X[j] = amp * (pffft_scalar)cos(phi); /* only real part */
|
||||
|
||||
/* phase increment .. stay normalized - cos()/sin() might degrade! */
|
||||
phi += dPhi;
|
||||
if ( phi >= M_PI )
|
||||
phi -= 2.0 * M_PI;
|
||||
}
|
||||
|
||||
/* forward transform from X --> Y .. using work buffer W */
|
||||
#ifdef PFFFT_ENABLE_FLOAT
|
||||
if ( useOrdered )
|
||||
pffft_transform_ordered(s, X, Y, W, PFFFT_FORWARD );
|
||||
else
|
||||
{
|
||||
pffft_transform(s, X, R, W, PFFFT_FORWARD ); /* use R for reordering */
|
||||
pffft_zreorder(s, R, Y, PFFFT_FORWARD ); /* reorder into Y[] for power calculations */
|
||||
}
|
||||
#else
|
||||
if ( useOrdered )
|
||||
pffftd_transform_ordered(s, X, Y, W, PFFFT_FORWARD );
|
||||
else
|
||||
{
|
||||
pffftd_transform(s, X, R, W, PFFFT_FORWARD ); /* use R for reordering */
|
||||
pffftd_zreorder(s, R, Y, PFFFT_FORWARD ); /* reorder into Y[] for power calculations */
|
||||
}
|
||||
#endif
|
||||
|
||||
pwrOther = -1.0;
|
||||
pwrCar = 0;
|
||||
|
||||
|
||||
/* for positive frequencies: 0 to 0.5 * samplerate */
|
||||
/* and also for negative frequencies: -0.5 * samplerate to 0 */
|
||||
for ( j = 0; j < ( cplx ? N : (1 + N/2) ); ++j )
|
||||
{
|
||||
if (!cplx && !j) /* special treatment for DC for real input */
|
||||
pwr = Y[j]*Y[j];
|
||||
else if (!cplx && j == N/2) /* treat 0.5 * samplerate */
|
||||
pwr = Y[1] * Y[1]; /* despite j (for freq calculation) we have index 1 */
|
||||
else
|
||||
pwr = Y[2*j] * Y[2*j] + Y[2*j+1] * Y[2*j+1];
|
||||
if (iter || PRINT_SPEC)
|
||||
printf("%s fft %d: pwr[j = %d] = %g == %f dB\n", (cplx ? "cplx":"real"), N, j, pwr, PWR2LOG(pwr) );
|
||||
if (k == j)
|
||||
pwrCar = pwr;
|
||||
else if ( pwr > pwrOther ) {
|
||||
pwrOther = pwr;
|
||||
kmaxOther = j;
|
||||
}
|
||||
}
|
||||
|
||||
if ( PWR2LOG(pwrCar) - PWR2LOG(pwrOther) < EXPECTED_DYN_RANGE ) {
|
||||
printf("%s fft %d amp %f iter %d:\n", (cplx ? "cplx":"real"), N, amp, iter);
|
||||
printf(" carrier power at bin %d: %g == %f dB\n", k, pwrCar, PWR2LOG(pwrCar) );
|
||||
printf(" carrier mag || at bin %d: %g\n", k, sqrt(pwrCar) );
|
||||
printf(" max other pwr at bin %d: %g == %f dB\n", kmaxOther, pwrOther, PWR2LOG(pwrOther) );
|
||||
printf(" dynamic range: %f dB\n\n", PWR2LOG(pwrCar) - PWR2LOG(pwrOther) );
|
||||
retError = 1;
|
||||
if ( iter == 0 )
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( k > 0 && k != N/2 )
|
||||
{
|
||||
phi = atan2( Y[2*k+1], Y[2*k] );
|
||||
if ( fabs( phi - phi0) > DEG_ERR_LIMIT * M_PI / 180.0 )
|
||||
{
|
||||
retError = 1;
|
||||
printf("%s fft %d bin %d amp %f : phase mismatch! phase = %f deg expected = %f deg\n",
|
||||
(cplx ? "cplx":"real"), N, k, amp, phi * 180.0 / M_PI, phi0 * 180.0 / M_PI );
|
||||
}
|
||||
}
|
||||
|
||||
expextedMag = cplx ? amp : ( (k == 0 || k == N/2) ? amp : (amp/2) );
|
||||
mag = sqrt(pwrCar) / N;
|
||||
if ( fabs(mag - expextedMag) > MAG_ERR_LIMIT )
|
||||
{
|
||||
retError = 1;
|
||||
printf("%s fft %d bin %d amp %f : mag = %g expected = %g\n", (cplx ? "cplx":"real"), N, k, amp, mag, expextedMag );
|
||||
}
|
||||
|
||||
|
||||
/* now convert spectrum back */
|
||||
#ifdef PFFFT_ENABLE_FLOAT
|
||||
if (useOrdered)
|
||||
pffft_transform_ordered(s, Y, Z, W, PFFFT_BACKWARD);
|
||||
else
|
||||
pffft_transform(s, R, Z, W, PFFFT_BACKWARD);
|
||||
#else
|
||||
if (useOrdered)
|
||||
pffftd_transform_ordered(s, Y, Z, W, PFFFT_BACKWARD);
|
||||
else
|
||||
pffftd_transform(s, R, Z, W, PFFFT_BACKWARD);
|
||||
#endif
|
||||
|
||||
errSum = 0.0;
|
||||
for ( j = 0; j < (cplx ? (2*N) : N); ++j )
|
||||
{
|
||||
/* scale back */
|
||||
Z[j] /= N;
|
||||
/* square sum errors over real (and imag parts) */
|
||||
err = (X[j]-Z[j]) * (X[j]-Z[j]);
|
||||
errSum += err;
|
||||
}
|
||||
|
||||
if ( errSum > N * 1E-7 )
|
||||
{
|
||||
retError = 1;
|
||||
printf("%s fft %d bin %d : inverse FFT doesn't match original signal! errSum = %g ; mean err = %g\n", (cplx ? "cplx":"real"), N, k, errSum, errSum / N);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
#ifdef PFFFT_ENABLE_FLOAT
|
||||
pffft_destroy_setup(s);
|
||||
pffft_aligned_free(X);
|
||||
pffft_aligned_free(Y);
|
||||
pffft_aligned_free(Z);
|
||||
pffft_aligned_free(R);
|
||||
pffft_aligned_free(W);
|
||||
#else
|
||||
pffftd_destroy_setup(s);
|
||||
pffftd_aligned_free(X);
|
||||
pffftd_aligned_free(Y);
|
||||
pffftd_aligned_free(Z);
|
||||
pffftd_aligned_free(R);
|
||||
pffftd_aligned_free(W);
|
||||
#endif
|
||||
|
||||
return retError;
|
||||
}
|
||||
|
||||
/* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions */
void validate_pffft_simd();
int validate_pffft_simd_ex(FILE * DbgOut);
void validate_pffftd_simd();
int validate_pffftd_simd_ex(FILE * DbgOut);


/*
 * Test driver.
 *
 * With "--test-simd" among the arguments only the SIMD validation of the
 * compiled-in precision is run; the exit status is 1 when it reported errors.
 *
 * Otherwise the power-of-two helper functions are checked against a reference
 * table, followed by test() for all power-of-two lengths from 32 to 65536 in
 * the four combinations of complex/real and ordered/unordered transform.
 *
 * Returns 0 when all tests passed, non-zero otherwise.
 */
int main(int argc, char **argv)
{
  int N, result, resN, resAll, i, k, resNextPw2, resIsPw2, resFFT;
  int cplx, useOrdered;

  /* inputs and the expected next power of two for each of them;
   * an input equal to its reference value is itself a power of two */
  int inp_power_of_two[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 511, 512, 513 };
  int ref_power_of_two[] = { 1, 2, 4, 4, 8, 8, 8, 8, 16, 512, 512, 1024 };
  /* element count as int: keeps the loop condition free of signed/unsigned comparison */
  const int numPw2Cases = (int)( sizeof(inp_power_of_two) / sizeof(inp_power_of_two[0]) );

  for ( i = 1; i < argc; ++i ) {

    if (!strcmp(argv[i], "--test-simd")) {
#ifdef PFFFT_ENABLE_FLOAT
      int numErrs = validate_pffft_simd_ex(stdout);
#else
      int numErrs = validate_pffftd_simd_ex(stdout);
#endif
      fprintf( ( numErrs != 0 ? stderr : stdout ), "validate_pffft_simd_ex() returned %d errors!\n", numErrs);
      return ( numErrs > 0 ? 1 : 0 );
    }
  }

  resNextPw2 = 0;
  resIsPw2 = 0;
  for ( k = 0; k < numPw2Cases; ++k ) {
    const int inp = inp_power_of_two[k];
    const int ref = ref_power_of_two[k];

#ifdef PFFFT_ENABLE_FLOAT
    N = pffft_next_power_of_two(inp);
#else
    N = pffftd_next_power_of_two(inp);
#endif
    if (N != ref) {
      resNextPw2 = 1;
      printf("pffft_next_power_of_two(%d) does deliver %d, which is not reference result %d!\n",
        inp, N, ref );
    }

#ifdef PFFFT_ENABLE_FLOAT
    result = pffft_is_power_of_two(inp);
#else
    result = pffftd_is_power_of_two(inp);
#endif
    if (inp == ref) {
      if (!result) {
        resIsPw2 = 1;
        printf("pffft_is_power_of_two(%d) delivers false; expected true!\n", inp);
      }
    } else {
      if (result) {
        resIsPw2 = 1;
        printf("pffft_is_power_of_two(%d) delivers true; expected false!\n", inp);
      }
    }
  }
  if (!resNextPw2)
    printf("tests for pffft_next_power_of_two() succeeded successfully.\n");
  if (!resIsPw2)
    printf("tests for pffft_is_power_of_two() succeeded successfully.\n");

  resFFT = 0;
  for ( N = 32; N <= 65536; N *= 2 )
  {
    /* call order: ordered cplx, ordered real, unordered cplx, unordered real */
    resN = 0;
    for ( useOrdered = 1; useOrdered >= 0; --useOrdered )
    {
      for ( cplx = 1; cplx >= 0; --cplx )
      {
        result = test(N, cplx, useOrdered);
        resN |= result;
      }
    }
    resFFT |= resN;

    if (!resN)
      printf("tests for size %d succeeded successfully.\n", N);
  }

  if (!resFFT) {
#ifdef PFFFT_ENABLE_FLOAT
    printf("all pffft transform tests (FORWARD/BACKWARD, REAL/COMPLEX, float) succeeded successfully.\n");
#else
    printf("all pffft transform tests (FORWARD/BACKWARD, REAL/COMPLEX, double) succeeded successfully.\n");
#endif
  }

  resAll = resNextPw2 | resIsPw2 | resFFT;
  if (!resAll)
    printf("all tests succeeded successfully.\n");
  else
    printf("there are failed tests!\n");

  return resAll;
}
|
||||
|
||||
377
pffft/test_pffft.cpp
Normal file
377
pffft/test_pffft.cpp
Normal file
@@ -0,0 +1,377 @@
|
||||
/*
|
||||
Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
|
||||
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
|
||||
|
||||
Small test & bench for PFFFT, comparing its performance with the scalar
|
||||
FFTPACK, FFTW, and Apple vDSP
|
||||
|
||||
How to build:
|
||||
|
||||
on linux, with fftw3:
|
||||
gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c
|
||||
test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
|
||||
|
||||
on macos, without fftw3:
|
||||
clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c
|
||||
-L/usr/local/lib -I/usr/local/include/ -framework Accelerate
|
||||
|
||||
on macos, with fftw3:
|
||||
clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c
|
||||
test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f
|
||||
-framework Accelerate
|
||||
|
||||
as alternative: replace clang by gcc.
|
||||
|
||||
on windows, with visual c++:
|
||||
cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
|
||||
|
||||
build without SIMD instructions:
|
||||
gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c
|
||||
fftpack.c -lm
|
||||
|
||||
*/
|
||||
|
||||
#include "pffft.hpp"
|
||||
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
/* define own constants required to turn off g++ extensions .. */
|
||||
#ifndef M_PI
|
||||
#define M_PI 3.14159265358979323846 /* pi */
|
||||
#endif
|
||||
|
||||
/* maximum allowed phase error in degree */
|
||||
#define DEG_ERR_LIMIT 1E-4
|
||||
|
||||
/* maximum allowed magnitude error in amplitude (of 1.0 or 1.1) */
|
||||
#define MAG_ERR_LIMIT 1E-6
|
||||
|
||||
#define PRINT_SPEC 0
|
||||
|
||||
#define PWR2LOG(PWR) ((PWR) < 1E-30 ? 10.0 * log10(1E-30) : 10.0 * log10(PWR))
|
||||
|
||||
template<typename T>
|
||||
bool
|
||||
Ttest(int N, bool useOrdered)
|
||||
{
|
||||
typedef pffft::Fft<T> Fft;
|
||||
typedef typename pffft::Fft<T>::Scalar FftScalar;
|
||||
typedef typename Fft::Complex FftComplex;
|
||||
|
||||
const bool cplx = pffft::Fft<T>::isComplexTransform();
|
||||
const double EXPECTED_DYN_RANGE = Fft::isDoubleScalar() ? 215.0 : 140.0;
|
||||
|
||||
assert(Fft::isPowerOfTwo(N));
|
||||
|
||||
Fft fft = Fft(N); // instantiate and prepareLength() for length N
|
||||
|
||||
#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900)
|
||||
|
||||
// possible ways to declare/instatiate aligned vectors with C++11
|
||||
// some lines require a typedef of above
|
||||
auto X = fft.valueVector(); // for X = input vector
|
||||
pffft::AlignedVector<typename Fft::Complex> Y = fft.spectrumVector(); // for Y = forward(X)
|
||||
pffft::AlignedVector<FftScalar> R = fft.internalLayoutVector(); // for R = forwardInternalLayout(X)
|
||||
pffft::AlignedVector<T> Z = fft.valueVector(); // for Z = inverse(Y) = inverse( forward(X) )
|
||||
// or Z = inverseInternalLayout(R)
|
||||
#else
|
||||
|
||||
// possible ways to declare/instatiate aligned vectors with C++98
|
||||
pffft::AlignedVector<T> X = fft.valueVector(); // for X = input vector
|
||||
pffft::AlignedVector<FftComplex> Y = fft.spectrumVector(); // for Y = forward(X)
|
||||
pffft::AlignedVector<typename Fft::Scalar> R = fft.internalLayoutVector(); // for R = forwardInternalLayout(X)
|
||||
pffft::AlignedVector<T> Z = fft.valueVector(); // for Z = inverse(Y) = inverse( forward(X) )
|
||||
// or Z = inverseInternalLayout(R)
|
||||
#endif
|
||||
|
||||
// work with complex - without the capabilities of a higher c++ standard
|
||||
FftScalar* Xs = reinterpret_cast<FftScalar*>(X.data()); // for X = input vector
|
||||
FftScalar* Ys = reinterpret_cast<FftScalar*>(Y.data()); // for Y = forward(X)
|
||||
FftScalar* Zs = reinterpret_cast<FftScalar*>(Z.data()); // for Z = inverse(Y) = inverse( forward(X) )
|
||||
|
||||
int k, j, m, iter, kmaxOther;
|
||||
bool retError = false;
|
||||
double freq, dPhi, phi, phi0;
|
||||
double pwr, pwrCar, pwrOther, err, errSum, mag, expextedMag;
|
||||
double amp = 1.0;
|
||||
|
||||
for (k = m = 0; k < (cplx ? N : (1 + N / 2)); k += N / 16, ++m) {
|
||||
amp = ((m % 3) == 0) ? 1.0F : 1.1F;
|
||||
freq = (k < N / 2) ? ((double)k / N) : ((double)(k - N) / N);
|
||||
dPhi = 2.0 * M_PI * freq;
|
||||
if (dPhi < 0.0)
|
||||
dPhi += 2.0 * M_PI;
|
||||
|
||||
iter = -1;
|
||||
while (1) {
|
||||
++iter;
|
||||
|
||||
if (iter)
|
||||
printf("bin %d: dphi = %f for freq %f\n", k, dPhi, freq);
|
||||
|
||||
/* generate cosine carrier as time signal - start at defined phase phi0 */
|
||||
phi = phi0 =
|
||||
(m % 4) * 0.125 * M_PI; /* have phi0 < 90 deg to be normalized */
|
||||
for (j = 0; j < N; ++j) {
|
||||
if (cplx) {
|
||||
Xs[2 * j] = (FftScalar)( amp * cos(phi) ); /* real part */
|
||||
Xs[2 * j + 1] = (FftScalar)( amp * sin(phi) ); /* imag part */
|
||||
} else
|
||||
Xs[j] = (FftScalar)( amp * cos(phi) ); /* only real part */
|
||||
|
||||
/* phase increment .. stay normalized - cos()/sin() might degrade! */
|
||||
phi += dPhi;
|
||||
if (phi >= M_PI)
|
||||
phi -= 2.0 * M_PI;
|
||||
}
|
||||
|
||||
/* forward transform from X --> Y .. using work buffer W */
|
||||
if (useOrdered)
|
||||
fft.forward(X, Y);
|
||||
else {
|
||||
fft.forwardToInternalLayout(X, R); /* use R for reordering */
|
||||
fft.reorderSpectrum(R, Y); /* have canonical order in Y[] for power calculations */
|
||||
}
|
||||
|
||||
pwrOther = -1.0;
|
||||
pwrCar = 0;
|
||||
|
||||
/* for positive frequencies: 0 to 0.5 * samplerate */
|
||||
/* and also for negative frequencies: -0.5 * samplerate to 0 */
|
||||
for (j = 0; j < (cplx ? N : (1 + N / 2)); ++j) {
|
||||
if (!cplx && !j) /* special treatment for DC for real input */
|
||||
pwr = Ys[j] * Ys[j];
|
||||
else if (!cplx && j == N / 2) /* treat 0.5 * samplerate */
|
||||
pwr = Ys[1] *
|
||||
Ys[1]; /* despite j (for freq calculation) we have index 1 */
|
||||
else
|
||||
pwr = Ys[2 * j] * Ys[2 * j] + Ys[2 * j + 1] * Ys[2 * j + 1];
|
||||
if (iter || PRINT_SPEC)
|
||||
printf("%s fft %d: pwr[j = %d] = %g == %f dB\n",
|
||||
(cplx ? "cplx" : "real"),
|
||||
N,
|
||||
j,
|
||||
pwr,
|
||||
PWR2LOG(pwr));
|
||||
if (k == j)
|
||||
pwrCar = pwr;
|
||||
else if (pwr > pwrOther) {
|
||||
pwrOther = pwr;
|
||||
kmaxOther = j;
|
||||
}
|
||||
}
|
||||
|
||||
if (PWR2LOG(pwrCar) - PWR2LOG(pwrOther) < EXPECTED_DYN_RANGE) {
|
||||
printf("%s fft %d amp %f iter %d:\n",
|
||||
(cplx ? "cplx" : "real"),
|
||||
N,
|
||||
amp,
|
||||
iter);
|
||||
printf(" carrier power at bin %d: %g == %f dB\n",
|
||||
k,
|
||||
pwrCar,
|
||||
PWR2LOG(pwrCar));
|
||||
printf(" carrier mag || at bin %d: %g\n", k, sqrt(pwrCar));
|
||||
printf(" max other pwr at bin %d: %g == %f dB\n",
|
||||
kmaxOther,
|
||||
pwrOther,
|
||||
PWR2LOG(pwrOther));
|
||||
printf(" dynamic range: %f dB\n\n",
|
||||
PWR2LOG(pwrCar) - PWR2LOG(pwrOther));
|
||||
retError = true;
|
||||
if (iter == 0)
|
||||
continue;
|
||||
}
|
||||
|
||||
if (k > 0 && k != N / 2) {
|
||||
phi = atan2(Ys[2 * k + 1], Ys[2 * k]);
|
||||
if (fabs(phi - phi0) > DEG_ERR_LIMIT * M_PI / 180.0) {
|
||||
retError = true;
|
||||
printf("%s fft %d bin %d amp %f : phase mismatch! phase = %f deg "
|
||||
"expected = %f deg\n",
|
||||
(cplx ? "cplx" : "real"),
|
||||
N,
|
||||
k,
|
||||
amp,
|
||||
phi * 180.0 / M_PI,
|
||||
phi0 * 180.0 / M_PI);
|
||||
}
|
||||
}
|
||||
|
||||
expextedMag = cplx ? amp : ((k == 0 || k == N / 2) ? amp : (amp / 2));
|
||||
mag = sqrt(pwrCar) / N;
|
||||
if (fabs(mag - expextedMag) > MAG_ERR_LIMIT) {
|
||||
retError = true;
|
||||
printf("%s fft %d bin %d amp %f : mag = %g expected = %g\n",
|
||||
(cplx ? "cplx" : "real"),
|
||||
N,
|
||||
k,
|
||||
amp,
|
||||
mag,
|
||||
expextedMag);
|
||||
}
|
||||
|
||||
/* now convert spectrum back */
|
||||
if (useOrdered)
|
||||
fft.inverse(Y, Z);
|
||||
else
|
||||
fft.inverseFromInternalLayout(R, Z); /* inverse() from internal Layout */
|
||||
|
||||
errSum = 0.0;
|
||||
for (j = 0; j < (cplx ? (2 * N) : N); ++j) {
|
||||
/* scale back */
|
||||
Zs[j] /= N;
|
||||
/* square sum errors over real (and imag parts) */
|
||||
err = (Xs[j] - Zs[j]) * (Xs[j] - Zs[j]);
|
||||
errSum += err;
|
||||
}
|
||||
|
||||
if (errSum > N * 1E-7) {
|
||||
retError = true;
|
||||
printf("%s fft %d bin %d : inverse FFT doesn't match original signal! "
|
||||
"errSum = %g ; mean err = %g\n",
|
||||
(cplx ? "cplx" : "real"),
|
||||
N,
|
||||
k,
|
||||
errSum,
|
||||
errSum / N);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// using the std::vector<> base classes .. no need for alignedFree() for X, Y, Z and R
|
||||
|
||||
return retError;
|
||||
}
|
||||
|
||||
/**
 * Runs Ttest<>() for every enabled scalar precision
 * (PFFFT_ENABLE_FLOAT / PFFFT_ENABLE_DOUBLE) and combines the results.
 *
 * Ttest<>() returns true on FAILURE, so the per-precision results are
 * combined with a logical OR. Each enabled precision is always executed;
 * a short-circuiting operator is deliberately avoided, since it would
 * skip the second precision depending on the outcome of the first.
 *
 * @param N           transform length, passed through to Ttest<>()
 * @param useComplex  true: test complex transforms; false: real transforms
 * @param useOrdered  passed through to Ttest<>()
 * @return true if any enabled precision reported a failure, else false
 */
bool
test(int N, bool useComplex, bool useOrdered)
{
  bool anyFailed = false;

  if (useComplex) {
#ifdef PFFFT_ENABLE_FLOAT
    if (Ttest< std::complex<float> >(N, useOrdered))
      anyFailed = true;
#endif
#ifdef PFFFT_ENABLE_DOUBLE
    if (Ttest< std::complex<double> >(N, useOrdered))
      anyFailed = true;
#endif
  } else {
#ifdef PFFFT_ENABLE_FLOAT
    if (Ttest<float>(N, useOrdered))
      anyFailed = true;
#endif
#ifdef PFFFT_ENABLE_DOUBLE
    if (Ttest<double>(N, useOrdered))
      anyFailed = true;
#endif
  }

  return anyFailed;
}
|
||||
|
||||
int
|
||||
main(int argc, char** argv)
|
||||
{
|
||||
int N, result, resN, resAll, k, resNextPw2, resIsPw2, resFFT;
|
||||
|
||||
int inp_power_of_two[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 511, 512, 513 };
|
||||
int ref_power_of_two[] = { 1, 2, 4, 4, 8, 8, 8, 8, 16, 512, 512, 1024 };
|
||||
|
||||
resNextPw2 = 0;
|
||||
resIsPw2 = 0;
|
||||
for (k = 0; k < (sizeof(inp_power_of_two) / sizeof(inp_power_of_two[0]));
|
||||
++k) {
|
||||
#ifdef PFFFT_ENABLE_FLOAT
|
||||
N = pffft::Fft<float>::nextPowerOfTwo(inp_power_of_two[k]);
|
||||
#else
|
||||
N = pffft::Fft<double>::nextPowerOfTwo(inp_power_of_two[k]);
|
||||
#endif
|
||||
if (N != ref_power_of_two[k]) {
|
||||
resNextPw2 = 1;
|
||||
printf("pffft_next_power_of_two(%d) does deliver %d, which is not "
|
||||
"reference result %d!\n",
|
||||
inp_power_of_two[k],
|
||||
N,
|
||||
ref_power_of_two[k]);
|
||||
}
|
||||
|
||||
#ifdef PFFFT_ENABLE_FLOAT
|
||||
result = pffft::Fft<float>::isPowerOfTwo(inp_power_of_two[k]);
|
||||
#else
|
||||
result = pffft::Fft<double>::isPowerOfTwo(inp_power_of_two[k]);
|
||||
#endif
|
||||
if (inp_power_of_two[k] == ref_power_of_two[k]) {
|
||||
if (!result) {
|
||||
resIsPw2 = 1;
|
||||
printf("pffft_is_power_of_two(%d) delivers false; expected true!\n",
|
||||
inp_power_of_two[k]);
|
||||
}
|
||||
} else {
|
||||
if (result) {
|
||||
resIsPw2 = 1;
|
||||
printf("pffft_is_power_of_two(%d) delivers true; expected false!\n",
|
||||
inp_power_of_two[k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!resNextPw2)
|
||||
printf("tests for pffft_next_power_of_two() succeeded successfully.\n");
|
||||
if (!resIsPw2)
|
||||
printf("tests for pffft_is_power_of_two() succeeded successfully.\n");
|
||||
|
||||
resFFT = 0;
|
||||
for (N = 32; N <= 65536; N *= 2) {
|
||||
result = test(N, 1 /* cplx fft */, 1 /* useOrdered */);
|
||||
resN = result;
|
||||
resFFT |= result;
|
||||
|
||||
result = test(N, 0 /* cplx fft */, 1 /* useOrdered */);
|
||||
resN |= result;
|
||||
resFFT |= result;
|
||||
|
||||
result = test(N, 1 /* cplx fft */, 0 /* useOrdered */);
|
||||
resN |= result;
|
||||
resFFT |= result;
|
||||
|
||||
result = test(N, 0 /* cplx fft */, 0 /* useOrdered */);
|
||||
resN |= result;
|
||||
resFFT |= result;
|
||||
|
||||
if (!resN)
|
||||
printf("tests for size %d succeeded successfully.\n", N);
|
||||
}
|
||||
|
||||
if (!resFFT)
|
||||
printf("all pffft transform tests (FORWARD/BACKWARD, REAL/COMPLEX, "
|
||||
#ifdef PFFFT_ENABLE_FLOAT
|
||||
"float"
|
||||
#endif
|
||||
#if defined(PFFFT_ENABLE_FLOAT) && defined(PFFFT_ENABLE_DOUBLE)
|
||||
"/"
|
||||
#endif
|
||||
#ifdef PFFFT_ENABLE_DOUBLE
|
||||
"double"
|
||||
#endif
|
||||
") succeeded successfully.\n");
|
||||
|
||||
resAll = resNextPw2 | resIsPw2 | resFFT;
|
||||
if (!resAll)
|
||||
printf("all tests succeeded successfully.\n");
|
||||
else
|
||||
printf("there are failed tests!\n");
|
||||
|
||||
return resAll;
|
||||
}
|
||||
24
pffft/uninstall.cmake
Normal file
24
pffft/uninstall.cmake
Normal file
@@ -0,0 +1,24 @@
|
||||
# Uninstall script: removes every file listed in the install manifest that
# the install step wrote to the current binary directory.
#
# Aborts with FATAL_ERROR if the manifest is missing or a removal fails;
# files that no longer exist are only reported.
set(MANIFEST "${CMAKE_CURRENT_BINARY_DIR}/install_manifest.txt")

if(NOT EXISTS "${MANIFEST}")
  message(FATAL_ERROR "Cannot find install manifest: '${MANIFEST}'")
endif()

# One installed path per manifest line.
file(STRINGS "${MANIFEST}" files)
foreach(file ${files})
  if(EXISTS "${file}")
    message(STATUS "Removing file: '${file}'")

    # execute_process() replaces the deprecated exec_program(); passing the
    # path as its own quoted argument keeps paths with spaces intact.
    execute_process(
      COMMAND "${CMAKE_COMMAND}" -E remove "${file}"
      OUTPUT_VARIABLE stdout
      RESULT_VARIABLE result
    )

    if(NOT "${result}" STREQUAL "0")
      message(FATAL_ERROR "Failed to remove file: '${file}'.")
    endif()
  else()
    message(STATUS "File '${file}' does not exist.")
  endif()
endforeach()
|
||||
2
pffft/use_gcc8.inc
Normal file
2
pffft/use_gcc8.inc
Normal file
@@ -0,0 +1,2 @@
|
||||
# Export the gcc-8 / g++-8 executables found on PATH (via `which`).
# The command substitutions are quoted so the path is not word-split by
# shells that treat `export` as an ordinary command.
export GCC_WITH_CMAKE="$(which gcc-8)"
export GPP_WITH_CMAKE="$(which g++-8)"
|
||||
Reference in New Issue
Block a user