initial commit

2026-01-01 19:24:32 -06:00 · 2024-10-30 04:07:12 -05:00
commit bb505acce1
28 changed files with 5273 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,8 @@
+.DS_Store
+/.build
+/Packages
+xcuserdata/
+DerivedData/
+.swiftpm/configuration/registries.json
+.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
+.netrc
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -0,0 +1,37 @@
+Copyright (c) 2024  John K. Luebs
+Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
+Copyright (c) 2019  Hayati Ayguen ( h_ayguen@web.de )
+Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+Copyright (c) 2004 the University Corporation for Atmospheric
+Research ("UCAR"). All rights reserved. Developed by NCAR's
+Computational and Information Systems Laboratory, UCAR,
+www.cisl.ucar.edu.
+
+Redistribution and use of the Software in source and binary forms,
+with or without modification, is permitted provided that the
+following conditions are met:
+
+- Neither the names of NCAR's Computational and Information Systems
+Laboratory, the University Corporation for Atmospheric Research,
+nor the names of its sponsors or contributors may be used to
+endorse or promote products derived from this Software without
+specific prior written permission.
+
+- Redistributions of source code must retain the above copyright
+notices, this list of conditions, and the disclaimer below.
+
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions, and the disclaimer below in the
+documentation and/or other materials provided with the
+distribution.
+
+THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+SOFTWARE.
--- a/Package.resolved
+++ b/Package.resolved
@@ -0,0 +1,15 @@
+{
+  "originHash" : "d50b9049eb671b1ad14bc8ba592c78735f9e357a6b59835343af6e20e8be4701",
+  "pins" : [
+    {
+      "identity" : "swift-numerics",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-numerics",
+      "state" : {
+        "revision" : "0a5bc04095a675662cf24757cc0640aa2204253b",
+        "version" : "1.0.2"
+      }
+    }
+  ],
+  "version" : 3
+}
--- a/Package.swift
+++ b/Package.swift
@@ -0,0 +1,38 @@
+// swift-tools-version: 6.0
+// The swift-tools-version declares the minimum version of Swift required to build this package.
+
+import PackageDescription
+
+let package = Package(
+    name: "SwiftPFFFT",
+    products: [
+        .library(
+            name: "PFFFT",
+            targets: ["PFFFT", "PFFFTLib"]
+        ),
+    ],
+    dependencies: [
+        .package(url: "https://github.com/apple/swift-numerics", from: "1.0.0"),
+    ],
+    targets: [
+        .target(
+            name: "PFFFTLib",
+            publicHeadersPath: "include",
+            cSettings: [
+                .define("PFFFT_SCALVEC_ENABLED", to: "1"),
+                .define("_USE_MATH_DEFINES"),
+                .define("NDEBUG"),
+                .unsafeFlags(["-O3"]),
+            ]
+        ),
+        .target(
+            name: "PFFFT",
+            dependencies: ["PFFFTLib", .product(name: "Numerics", package: "swift-numerics")]
+        ),
+        .testTarget(
+            name: "PFFFTTests",
+            dependencies: ["PFFFT"]
+        ),
+    ],
+    cLanguageStandard: .c99
+)
--- a/README.md
+++ b/README.md
@@ -0,0 +1,33 @@
+# SwiftPFFFT
+
+Swift package providing a PFFFT (Pretty Fast, Fast Fourier Transform) library with wrapper.
+
+This code is based on the marton78 fork of PFFFT that was forked from the original PFFFT implementation
+by Julien Pommier. This fork provides support for doubles in addition to floats and provides additional SIMD implementations:
+
+The origin for the C implementation in this package is https://github.com/marton78/pffft.
+
+PFFFT provides substantial performance improvement over KissFFT. The advantage over FFTW is reasonable
+performance with much simpler usage and a permissive 3 clause BSD license.
+
+## Example
+
+``` swift
+
+// construct an interface for FFT, IFFT and convolutions. The interface is parameterized on the
+// type of element in the signal (time) domain. The spectrum (frequency) domain type will always be
+// complex. For a real valued signal the spectrum size will be `n / 2`, with the packing convention
+// 
+let fft = try FFT<Complex<Float>>(n: 16)
+let signal = fft.makeSignalBuffer()
+
+signal.mutateEach { (i, v) in
+    v = Complex(Float(i) + 1.0, Float(i) - 2.0)
+}
+
+let spectrum = fft.makeSpectrumBuffer()
+fft.forward(signal: signal, spectrum: spectrum)
+
+fft.inverse(spectrum: spectrum, signal: signal)
+
+```
--- a/Sources/PFFFT/Buffer.swift
+++ b/Sources/PFFFT/Buffer.swift
@@ -0,0 +1,61 @@
+import ComplexModule
+import RealModule
+
+let bufferAlignment = 32
+
+@frozen
+public struct Buffer<T>: ~Copyable {
+    public let buffer: UnsafeMutableBufferPointer<T>
+    var count: Int { buffer.count }
+    var baseAddress: UnsafeMutablePointer<T> { buffer.baseAddress! }
+
+    public init(capacity: Int) {
+        buffer = UnsafeMutableRawBufferPointer.allocate(
+            byteCount: MemoryLayout<T>.stride * capacity,
+            alignment: bufferAlignment
+        ).bindMemory(to: T.self)
+    }
+
+    deinit {
+        buffer.deallocate()
+    }
+
+    @inlinable public func withUnsafeMutableBufferPointer<R>(_ body: (UnsafeMutableBufferPointer<T>) throws -> R) rethrows -> R {
+        try body(buffer)
+    }
+
+    @inlinable public func withUnsafeBufferPointer<R>(_ body: (UnsafeBufferPointer<T>) throws -> R) rethrows -> R {
+        try body(UnsafeBufferPointer(buffer))
+    }
+
+    @inlinable public func withUnsafeMutableBytes<R>(_ body: (UnsafeMutableRawBufferPointer) throws -> R) rethrows -> R {
+        try body(UnsafeMutableRawBufferPointer(buffer))
+    }
+
+    @inlinable public func map<U>(_ transform: (T) throws -> U) rethrows -> [U] {
+        try buffer.map(transform)
+    }
+
+    @inlinable public func mutateEach(_ body: (Int, inout T) throws -> Void) rethrows {
+        for i in 0 ..< buffer.count {
+            try body(i, &buffer[i])
+        }
+    }
+}
+
+public protocol ComplexType {
+    associatedtype RealType: Real
+    var real: RealType { get set }
+    var imaginary: RealType { get set }
+}
+
+extension Complex: ComplexType {}
+
+public extension Buffer where T: ComplexType {
+    @inlinable public func mutateEachSwapLast(_ body: (Int, inout T) throws -> Void) rethrows {
+        for i in 0 ..< buffer.count {
+            try body(i, &buffer[i])
+        }
+        buffer[0].imaginary = buffer[buffer.count - 1].real
+    }
+}
--- a/Sources/PFFFT/FFT.swift
+++ b/Sources/PFFFT/FFT.swift
@@ -0,0 +1,366 @@
+internal import PFFFTLib
+import ComplexModule
+import RealModule
+
+@frozen
+public enum FFTType {
+    case real
+    case complex
+}
+
+@frozen
+public enum FFTSign {
+    case forward
+    case backward
+}
+
+public enum FFTError: Error {
+    case invalidSize
+}
+
+public protocol FFTElement {
+    associatedtype FFTScalarType: FFTScalar
+    associatedtype FFTComplexType = Complex<FFTScalarType>
+
+    static func pffftSetup(_ n: Int, _ type: FFTType) throws -> OpaquePointer
+    static func pffftMinFftSize(_ type: FFTType) -> Int
+    static func pffftIsValidSize(_ n: Int, _ type: FFTType) -> Bool
+    static func pffftNearestValidSize(_ n: Int, _ type: FFTType, _ higher: Bool) -> Int
+}
+
+public protocol FFTScalar: Real {
+    static func pffftTransformOrdered(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer<Self>, _ output: UnsafeMutablePointer<Self>, _ work: UnsafeMutablePointer<Self>?, _ dir: FFTSign)
+    static func pffftTransform(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer<Self>, _ output: UnsafeMutablePointer<Self>, _ work: UnsafeMutablePointer<Self>?, _ dir: FFTSign)
+    static func pffftZreorder(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer<Self>, _ output: UnsafeMutablePointer<Self>, _ dir: FFTSign)
+    static func pffftZconvolveAccumulate(_ ptr: OpaquePointer, _ dftA: UnsafeMutablePointer<Self>, _ dftB: UnsafeMutablePointer<Self>, _ dftAB: UnsafeMutablePointer<Self>, _ scaling: Self)
+    static func pffftZconvolveNoAccu(_ ptr: OpaquePointer, _ dftA: UnsafeMutablePointer<Self>, _ dftB: UnsafeMutablePointer<Self>, _ dftAB: UnsafeMutablePointer<Self>, _ scaling: Self)
+    static func pffftSimdArch() -> String
+}
+
+extension Float: FFTScalar {
+    public static func pffftTransformOrdered(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer<Self>, _ output: UnsafeMutablePointer<Self>, _ work: UnsafeMutablePointer<Self>?, _ dir: FFTSign) {
+        pffft_transform_ordered(ptr, input, output, work, pffft_direction_t(dir))
+    }
+
+    public static func pffftTransform(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer<Self>, _ output: UnsafeMutablePointer<Self>, _ work: UnsafeMutablePointer<Self>?, _ dir: FFTSign) {
+        pffft_transform(ptr, input, output, work, pffft_direction_t(dir))
+    }
+
+    public static func pffftZreorder(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer<Self>, _ output: UnsafeMutablePointer<Self>, _ dir: FFTSign) {
+        pffft_zreorder(ptr, input, output, pffft_direction_t(dir))
+    }
+
+    public static func pffftZconvolveAccumulate(_ ptr: OpaquePointer, _ dftA: UnsafeMutablePointer<Self>, _ dftB: UnsafeMutablePointer<Self>, _ dftAB: UnsafeMutablePointer<Self>, _ scaling: Self) {
+        pffft_zconvolve_accumulate(ptr, dftA, dftB, dftAB, scaling)
+    }
+
+    public static func pffftZconvolveNoAccu(_ ptr: OpaquePointer, _ dftA: UnsafeMutablePointer<Self>, _ dftB: UnsafeMutablePointer<Self>, _ dftAB: UnsafeMutablePointer<Self>, _ scaling: Self) {
+        pffft_zconvolve_no_accu(ptr, dftA, dftB, dftAB, scaling)
+    }
+
+    public static func pffftSimdArch() -> String {
+        String(cString: pffft_simd_arch())
+    }
+}
+
+extension Double: FFTScalar {
+    public static func pffftTransformOrdered(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer<Double>, _ output: UnsafeMutablePointer<Double>, _ work: UnsafeMutablePointer<Double>?, _ dir: FFTSign) {
+        pffftd_transform_ordered(ptr, input, output, work, pffft_direction_t(dir))
+    }
+
+    public static func pffftTransform(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer<Double>, _ output: UnsafeMutablePointer<Double>, _ work: UnsafeMutablePointer<Double>?, _ dir: FFTSign) {
+        pffftd_transform(ptr, input, output, work, pffft_direction_t(dir))
+    }
+
+    public static func pffftZreorder(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer<Double>, _ output: UnsafeMutablePointer<Double>, _ dir: FFTSign) {
+        pffftd_zreorder(ptr, input, output, pffft_direction_t(dir))
+    }
+
+    public static func pffftZconvolveAccumulate(_ ptr: OpaquePointer, _ dftA: UnsafeMutablePointer<Double>, _ dftB: UnsafeMutablePointer<Double>, _ dftAB: UnsafeMutablePointer<Double>, _ scaling: Double) {
+        pffftd_zconvolve_accumulate(ptr, dftA, dftB, dftAB, scaling)
+    }
+
+    public static func pffftZconvolveNoAccu(_ ptr: OpaquePointer, _ dftA: UnsafeMutablePointer<Double>, _ dftB: UnsafeMutablePointer<Double>, _ dftAB: UnsafeMutablePointer<Double>, _ scaling: Double) {
+        pffftd_zconvolve_no_accu(ptr, dftA, dftB, dftAB, scaling)
+    }
+
+    public static func pffftSimdArch() -> String {
+        String(cString: pffftd_simd_arch())
+    }
+}
+
+extension Complex: FFTElement where RealType: FFTElement & FFTScalar {
+    public typealias FFTScalarType = RealType
+    public typealias FFTComplexType = Self
+
+    public static func pffftSetup(_ n: Int, _: FFTType) throws -> OpaquePointer {
+        return try FFTScalarType.pffftSetup(n, .complex)
+    }
+
+    public static func pffftMinFftSize(_: FFTType) -> Int {
+        return FFTScalarType.pffftMinFftSize(.complex)
+    }
+
+    public static func pffftIsValidSize(_ n: Int, _: FFTType) -> Bool {
+        return FFTScalarType.pffftIsValidSize(n, .complex)
+    }
+
+    public static func pffftNearestValidSize(_ n: Int, _: FFTType, _ higher: Bool) -> Int {
+        return FFTScalarType.pffftNearestValidSize(n, .complex, higher)
+    }
+}
+
+extension Double: FFTElement {
+    public typealias FFTScalarType = Double
+
+    public static func pffftSetup(_ n: Int, _ type: FFTType) throws -> OpaquePointer {
+        guard let ptr = pffftd_new_setup(Int32(n), pffft_transform_t(type)) else { throw FFTError.invalidSize }
+        return ptr
+    }
+
+    public static func pffftMinFftSize(_ type: FFTType) -> Int {
+        Int(pffftd_min_fft_size(pffft_transform_t(type)))
+    }
+
+    public static func pffftIsValidSize(_ n: Int, _ type: FFTType) -> Bool {
+        pffftd_is_valid_size(Int32(n), pffft_transform_t(type)) != 0
+    }
+
+    public static func pffftNearestValidSize(_ n: Int, _ type: FFTType, _ higher: Bool) -> Int {
+        Int(pffftd_nearest_transform_size(Int32(n), pffft_transform_t(type), higher ? 1 : 0))
+    }
+}
+
+extension Float: FFTElement {
+    public typealias FFTScalarType = Float
+
+    public static func pffftSetup(_ n: Int, _ type: FFTType) throws -> OpaquePointer {
+        guard let ptr = pffft_new_setup(Int32(n), pffft_transform_t(type)) else { throw FFTError.invalidSize }
+        return ptr
+    }
+
+    public static func pffftMinFftSize(_ type: FFTType) -> Int {
+        Int(pffft_min_fft_size(pffft_transform_t(type)))
+    }
+
+    public static func pffftIsValidSize(_ n: Int, _ type: FFTType) -> Bool {
+        pffft_is_valid_size(Int32(n), pffft_transform_t(type)) != 0
+    }
+
+    public static func pffftNearestValidSize(_ n: Int, _ type: FFTType, _ higher: Bool) -> Int {
+        Int(pffft_nearest_transform_size(Int32(n), pffft_transform_t(type), higher ? 1 : 0))
+    }
+}
+
+@frozen
+public struct FFT<T: FFTElement>: ~Copyable {
+    public typealias ComplexType = T.FFTComplexType
+    public typealias ScalarType = T.FFTScalarType
+
+    let ptr: OpaquePointer
+    let n: Int
+    let work: Buffer<ScalarType>?
+    let setup: Setup
+
+    public init(setup: Setup) {
+        self.setup = setup
+        ptr = setup.ptr
+        n = setup.n
+        work = n > 4096 ? Buffer<ScalarType>(capacity: T.self == ComplexType.self ? 2 * n : n) : nil
+    }
+
+    /// Initialize the FFT implementation with the given size and type.
+    /// Since an FFT setup for a given size and element type is expensive to create but consists
+    /// of read only data, a global cache is used to reuse setups.
+    /// - Parameters:
+    /// - n: The size of the FFT.
+    /// - Throws: `FFTError.invalidSize` if the size is invalid.
+    public init(n: Int) throws {
+        try self.init(setup: SetupCache.shared.get(n: n, type: T.self))
+    }
+
+    /// Make a buffer for the FFT (time-domain) signal.
+    /// - Parameters:
+    /// - extra: An extra number of elements to allocate.
+    public func makeSignalBuffer(extra: Int = 0) -> Buffer<T> {
+        Buffer(capacity: n + extra)
+    }
+
+    /// Make a buffer for the FFT (frequency-domain) spectrum.
+    /// - Parameters:
+    /// - extra: An extra number of elements to allocate.
+    public func makeSpectrumBuffer(extra: Int = 0) -> Buffer<ComplexType> {
+        Buffer(capacity: T.self == ComplexType.self ? (n + extra) : n / 2 + extra)
+    }
+
+    /// Make a buffer for the internal layout of the FFT (frequency-domain) spectrum.
+    /// - Parameters:
+    /// - extra: An extra number of points to allocate. For complex FFTs, 2 * extra
+    /// additional elements will be allocated.
+    public func makeInternalLayoutBuffer(extra: Int = 0) -> Buffer<ScalarType> {
+        Buffer(capacity: (T.self == ComplexType.self ? 2 : 1) * (n + extra))
+    }
+
+    @inline(__always)
+    func toAddress(_ work: borrowing Buffer<ScalarType>?) -> UnsafeMutablePointer<ScalarType>? {
+        switch work {
+        case let .some(b): return b.baseAddress
+        case .none: return nil
+        }
+    }
+
+    @inline(__always)
+    func rebind<I>(_ buffer: borrowing Buffer<I>) -> UnsafeMutablePointer<ScalarType>! {
+        UnsafeMutableRawBufferPointer(buffer.buffer).bindMemory(to: ScalarType.self).baseAddress
+    }
+
+    @inline(__always)
+    func checkFftBufferCounts(signal: borrowing Buffer<T>, spectrum: borrowing Buffer<ComplexType>) {
+        guard signal.count >= n else {
+            fatalError("signal buffer too small")
+        }
+        guard spectrum.count >= (T.self == ComplexType.self ? n : n / 2) else {
+            fatalError("spectrum buffer too small")
+        }
+    }
+
+    @inline(__always)
+    func checkFftInternalLayoutBufferCounts(signal: borrowing Buffer<T>, spectrum: borrowing Buffer<ScalarType>) {
+        guard signal.count >= n else {
+            fatalError("signal buffer too small")
+        }
+        guard spectrum.count >= (T.self == ComplexType.self ? 2 * n : n) else {
+            fatalError("spectrum buffer too small")
+        }
+    }
+
+    @inline(__always)
+    func checkConvolveBufferCounts(a: borrowing Buffer<ScalarType>, b: borrowing Buffer<ScalarType>, ab: borrowing Buffer<ScalarType>) {
+        let minCount = T.self == ComplexType.self ? 2 * n : n
+
+        guard a.count >= minCount else {
+            fatalError("a buffer too small")
+        }
+        guard b.count >= minCount else {
+            fatalError("b buffer too small")
+        }
+        guard ab.count >= minCount else {
+            fatalError("ab buffer too small")
+        }
+    }
+
+    /// Perform a forward FFT on the input buffer.
+    ///
+    /// The input and output buffers may be the same.
+    /// The data is stores in order as expected (interleaved complex components ordered by frequency).
+    /// The input and output buffer must have a capacity of at least `n` for real FFTs and `2 * n` for complex FFTs.
+    /// A fatal error will occur if any buffer is too small.
+    ///
+    /// For a real forward transform with real input, the output array is organized as follows:
+    /// index k > 2 where k is even is the real part of the k/2-th complex coefficient.
+    /// index k > 2 where k is odd is the imaginary part of the k/2-th complex coefficient.
+    /// index k = 0 is the real part of the 0 frequency (DC) coefficient.
+    /// index k = 1 is the real part of the Nyquist coefficient.
+    ///
+    /// Transforms are not scaled. fft_backward(fft_forward(x)) == n * x.
+    ///
+    /// - Parameters:
+    ///  - input: The input buffer.
+    ///  - output: The output buffer.
+    ///  - work: An optional work buffer. Must have capacity of at least `n` for real FFTs and `2 * n` for complex FFTs.
+    ///  - sign: The direction of the FFT.
+    public func forward(signal: borrowing Buffer<T>, spectrum: borrowing Buffer<ComplexType>) {
+        checkFftBufferCounts(signal: signal, spectrum: spectrum)
+        ScalarType.pffftTransformOrdered(ptr, rebind(signal), rebind(spectrum), toAddress(work), .forward)
+    }
+
+    public func inverse(spectrum: borrowing Buffer<ComplexType>, signal: borrowing Buffer<T>) {
+        checkFftBufferCounts(signal: signal, spectrum: spectrum)
+        ScalarType.pffftTransformOrdered(ptr, rebind(spectrum), rebind(signal), toAddress(work), .backward)
+    }
+
+    /// Perform a forward FFT on the input buffer, with implementation defined order.
+    ///
+    /// This function behaves similarly to `fft` however the z-domain data is stored in most efficient ordering,
+    /// which is suitable for transforming back with this function, or for convolution.
+    /// - Parameters:
+    ///  - input: The input buffer.
+    ///  - output: The output buffer.
+    ///  - work: An optional work buffer. Must have capacity of at least `n` for real FFTs and `2 * n` for complex FFTs.
+    ///  - sign: The direction of the FFT.
+    public func forwardToInternalLayout(signal: borrowing Buffer<T>, spectrum: borrowing Buffer<ScalarType>) {
+        checkFftInternalLayoutBufferCounts(signal: signal, spectrum: spectrum)
+        ScalarType.pffftTransform(ptr, rebind(signal), spectrum.baseAddress, toAddress(work), .forward)
+    }
+
+    public func inverseFromInternalLayout(spectrum: borrowing Buffer<ScalarType>, signal: borrowing Buffer<T>) {
+        checkFftInternalLayoutBufferCounts(signal: signal, spectrum: spectrum)
+        ScalarType.pffftTransform(ptr, spectrum.baseAddress, rebind(signal), toAddress(work), .backward)
+    }
+
+    public func reorder(spectrum: borrowing Buffer<ScalarType>, output: borrowing Buffer<ComplexType>) {
+        guard spectrum.count >= (T.self == ComplexType.self ? 2 * n : n) else {
+            fatalError("signal buffer too small")
+        }
+        guard output.count >= n else {
+            fatalError("outputbuffer too small")
+        }
+        ScalarType.pffftZreorder(ptr, spectrum.baseAddress, rebind(output), .forward)
+    }
+
+    /// Perform a convolution of two complex signals in the frequency domain.
+    ///
+    /// Multiplies frequency domain components of `dftA` and `dftB` and stores the result in `dftAB`.
+    /// The operation performed is `dftAB = (dftA * dftB) * scaling`.
+    /// - Parameters:
+    /// - dftA: The first input buffer of frequency domain data.
+    /// - dftB: The second input buffer of frequency domain data.
+    /// - dftAB: The output buffer of frequency domain data.
+    /// - scaling: The scaling factor to apply to the result.
+    public func convolve(dftA: borrowing Buffer<ScalarType>, dftB: borrowing Buffer<ScalarType>, dftAB: borrowing Buffer<ScalarType>, scaling: ScalarType) {
+        checkConvolveBufferCounts(a: dftA, b: dftB, ab: dftAB)
+        ScalarType.pffftZconvolveNoAccu(ptr, dftA.baseAddress, dftB.baseAddress, dftAB.baseAddress, scaling)
+    }
+
+    /// Perform a convolution of two complex signals in the frequency domain.
+    ///
+    /// Multiplies frequency domain components of `dftA` and `dftB` and accumulates the result in `dftAB`.
+    /// The operation performed is `dftAB += (dftA * dftB) * scaling`.
+    /// - Parameters:
+    /// - dftA: The first input buffer of frequency domain data.
+    /// - dftB: The second input buffer of frequency domain data.
+    /// - dftAB: The output buffer of frequency domain data.
+    /// - scaling: The scaling factor to apply to the result.
+    public func convolveAccumulate(dftA: borrowing Buffer<ScalarType>, dftB: borrowing Buffer<ScalarType>, dftAB: borrowing Buffer<ScalarType>, scaling: ScalarType) {
+        checkConvolveBufferCounts(a: dftA, b: dftB, ab: dftAB)
+        ScalarType.pffftZconvolveAccumulate(ptr, dftA.baseAddress, dftB.baseAddress, dftAB.baseAddress, scaling)
+    }
+
+    /// Returns whether the given size is valid for the given type.
+    ///
+    /// The PFFFT library requires `n` to be factorizable to `minFftSize` with factors of 2, 3, 5.
+    /// - Parameters:
+    /// - n: The size to check.
+    /// - Returns: Whether the size is valid.
+    public static func isValidSize(_ n: Int) -> Bool {
+        T.pffftIsValidSize(n, .real)
+    }
+
+    /// Returns the nearest valid size for the given type.
+    /// - Parameters:
+    /// - n: The size to check.
+    /// - higher: Whether to return the next higher size if `n` is invalid.
+    /// - Returns: The nearest valid size.
+    public static func nearestValidSize(_ n: Int, higher: Bool) -> Int {
+        T.pffftNearestValidSize(n, .real, higher)
+    }
+
+    /// The minimum FFT size for this type of setup.
+    public static var minFftSize: Int {
+        T.pffftMinFftSize(.real)
+    }
+
+    public static var simdArch: String {
+        ScalarType.pffftSimdArch()
+    }
+}
--- a/Sources/PFFFT/PFFFT.swift
+++ b/Sources/PFFFT/PFFFT.swift
@@ -0,0 +1,22 @@
+internal import PFFFTLib
+
+extension pffft_transform_t {
+    @inline(__always)
+    init(_ type: FFTType) {
+        switch type {
+        case .real: self = PFFFT_REAL
+        case .complex: self = PFFFT_COMPLEX
+        }
+    }
+}
+
+extension pffft_direction_t {
+    @inline(__always)
+    init(_ sign: FFTSign) {
+        switch sign {
+        case .forward: self = PFFFT_FORWARD
+        case .backward: self = PFFFT_BACKWARD
+        }
+    }
+}
+
--- a/Sources/PFFFT/Setup.swift
+++ b/Sources/PFFFT/Setup.swift
@@ -0,0 +1,15 @@
+internal import PFFFTLib
+
+public class Setup {
+    let ptr: OpaquePointer
+    let n: Int
+
+    init<T: FFTElement>(n: Int, type: T.Type) throws {
+        ptr = try type.pffftSetup(n, .real)
+        self.n = n
+    }
+
+    deinit {
+        pffft_destroy_setup(ptr)
+    }
+}
--- a/Sources/PFFFT/SetupCache.swift
+++ b/Sources/PFFFT/SetupCache.swift
@@ -0,0 +1,46 @@
+import Foundation
+
+public class SetupCache: @unchecked Sendable {
+    struct CacheKey: Hashable {
+        let n: Int
+        let type: ObjectIdentifier
+
+        init(n: Int, type: any FFTElement.Type) {
+            self.n = n
+            self.type = ObjectIdentifier(type)
+        }
+    }
+
+    var cache: [CacheKey: Setup?] = [:]
+    let queue = DispatchQueue(label: String(describing: SetupCache.self), attributes: .concurrent)
+
+    public init() {}
+
+    public func get<T: FFTElement>(n: Int, type: T.Type) throws -> Setup {
+        var setup: Setup??
+        queue.sync {
+            setup = cache[CacheKey(n: n, type: type)]
+        }
+        if setup == nil {
+            queue.sync(flags: .barrier) {
+                let key = CacheKey(n: n, type: type)
+                setup = cache[key]
+                if setup == nil {
+                    let entry = try? Setup(n: n, type: type)
+                    cache[key] = entry
+                    setup = entry
+                }
+            }
+        }
+        guard let s = setup! else { throw FFTError.invalidSize }
+        return s
+    }
+
+    public func clear() {
+        queue.sync(flags: .barrier) {
+            cache.removeAll()
+        }
+    }
+
+    public static let shared = SetupCache()
+}
--- a/Sources/PFFFTLib/include/pffft.h
+++ b/Sources/PFFFTLib/include/pffft.h
@@ -0,0 +1,241 @@
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com ) 
+
+   Based on original fortran 77 code from FFTPACKv4 from NETLIB,
+   authored by Dr Paul Swarztrauber of NCAR, in 1985.
+
+   As confirmed by the NCAR fftpack software curators, the following
+   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
+   released under the same terms.
+
+   FFTPACK license:
+
+   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
+
+   Copyright (c) 2004 the University Corporation for Atmospheric
+   Research ("UCAR"). All rights reserved. Developed by NCAR's
+   Computational and Information Systems Laboratory, UCAR,
+   www.cisl.ucar.edu.
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.  
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+   
+/*
+   PFFFT : a Pretty Fast FFT.
+
+   This is basically an adaptation of the single precision fftpack
+   (v4) as found on netlib taking advantage of SIMD instruction found
+   on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
+   
+   For architectures where no SIMD instruction is available, the code
+   falls back to a scalar version.  
+
+   Restrictions: 
+
+   - 1D transforms only, with 32-bit single precision.
+
+   - supports only transforms for inputs of length N of the form
+   N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
+   144, 160, etc are all acceptable lengths). Performance is best for
+   128<=N<=8192.
+
+   - all (float*) pointers in the functions below are expected to
+   have an "simd-compatible" alignment, that is 16 bytes on x86 and
+   powerpc CPUs.
+  
+   You can allocate such buffers with the functions
+   pffft_aligned_malloc / pffft_aligned_free (or with stuff like
+   posix_memalign..)
+
+*/
+
+#ifndef PFFFT_H
+#define PFFFT_H
+
+#include <stddef.h> /* for size_t */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  /* opaque struct holding internal stuff (precomputed twiddle factors)
+     this struct can be shared by many threads as it contains only
+     read-only data.  
+  */
+  typedef struct PFFFT_Setup PFFFT_Setup;
+
+#ifndef PFFFT_COMMON_ENUMS
+#define PFFFT_COMMON_ENUMS
+
+  /* direction of the transform */
+  typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
+  
+  /* type of transform */
+  typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
+
+#endif
+
+  /*
+    prepare for performing transforms of size N -- the returned
+    PFFFT_Setup structure is read-only so it can safely be shared by
+    multiple concurrent threads. 
+  */
+  PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
+  void pffft_destroy_setup(PFFFT_Setup *);
+  /* 
+     Perform a Fourier transform , The z-domain data is stored in the
+     most efficient order for transforming it back, or using it for
+     convolution. If you need to have its content sorted in the
+     "usual" way, that is as an array of interleaved complex numbers,
+     either use pffft_transform_ordered , or call pffft_zreorder after
+     the forward fft, and before the backward fft.
+
+     Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
+     Typically you will want to scale the backward transform by 1/N.
+
+     The 'work' pointer should point to an area of N (2*N for complex
+     fft) floats, properly aligned. If 'work' is NULL, then stack will
+     be used instead (this is probably the best strategy for small
+     FFTs, say for N < 16384). Threads usually have a small stack, that
+     there's no sufficient amount of memory, usually leading to a crash!
+     Use the heap with pffft_aligned_malloc() in this case.
+
+     For a real forward transform (PFFFT_REAL | PFFFT_FORWARD) with real
+     input with input(=transformation) length N, the output array is
+     'mostly' complex:
+       index k in 1 .. N/2 -1  corresponds to frequency k * Samplerate / N
+       index k == 0 is a special case:
+         the real() part contains the result for the DC frequency 0,
+         the imag() part contains the result for the Nyquist frequency Samplerate/2
+     both 0-frequency and half frequency components, which are real,
+     are assembled in the first entry as  F(0)+i*F(N/2).
+     With the output size N/2 complex values (=N real/imag values), it is
+     obvious, that the result for negative frequencies are not output,
+     cause of symmetry.
+
+     input and output may alias.
+  */
+  void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+
+  /* 
+     Similar to pffft_transform, but makes sure that the output is
+     ordered as expected (interleaved complex numbers).  This is
+     similar to calling pffft_transform and then pffft_zreorder.
+     
+     input and output may alias.
+  */
+  void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+
+  /* 
+     call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
+     PFFFT_FORWARD) if you want to have the frequency components in
+     the correct "canonical" order, as interleaved complex numbers.
+     
+     (for real transforms, both 0-frequency and half frequency
+     components, which are real, are assembled in the first entry as
+     F(0)+i*F(n/2+1). Note that the original fftpack did place
+     F(n/2+1) at the end of the arrays).
+     
+     input and output should not alias.
+  */
+  void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
+
+  /* 
+     Perform a multiplication of the frequency components of dft_a and
+     dft_b and accumulate them into dft_ab. The arrays should have
+     been obtained with pffft_transform(.., PFFFT_FORWARD) and should
+     *not* have been reordered with pffft_zreorder (otherwise just
+     perform the operation yourself as the dft coefs are stored as
+     interleaved complex numbers).
+     
+     the operation performed is: dft_ab += (dft_a * fdt_b)*scaling
+     
+     The dft_a, dft_b and dft_ab pointers may alias.
+  */
+  void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
+
+  /* 
+     Perform a multiplication of the frequency components of dft_a and
+     dft_b and put result in dft_ab. The arrays should have
+     been obtained with pffft_transform(.., PFFFT_FORWARD) and should
+     *not* have been reordered with pffft_zreorder (otherwise just
+     perform the operation yourself as the dft coefs are stored as
+     interleaved complex numbers).
+
+     the operation performed is: dft_ab = (dft_a * fdt_b)*scaling
+
+     The dft_a, dft_b and dft_ab pointers may alias.
+  */
+  void pffft_zconvolve_no_accu(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
+
+  /* return 4 or 1 wether support SSE/NEON/Altivec instructions was enabled when building pffft.c */
+  int pffft_simd_size();
+
+  /* return string identifier of used architecture (SSE/NEON/Altivec/..) */
+  const char * pffft_simd_arch();
+
+
+  /* following functions are identical to the pffftd_ functions */
+
+  /* simple helper to get minimum possible fft size */
+  int pffft_min_fft_size(pffft_transform_t transform);
+
+  /* simple helper to determine next power of 2
+     - without inexact/rounding floating point operations
+  */
+  int pffft_next_power_of_two(int N);
+
+  /* simple helper to determine if power of 2 - returns bool */
+  int pffft_is_power_of_two(int N);
+
+  /* simple helper to determine size N is valid
+     - factorizable to pffft_min_fft_size() with factors 2, 3, 5
+     returns bool
+  */
+  int pffft_is_valid_size(int N, pffft_transform_t cplx);
+
+  /* determine nearest valid transform size  (by brute-force testing)
+     - factorizable to pffft_min_fft_size() with factors 2, 3, 5.
+     higher: bool-flag to find nearest higher value; else lower.
+  */
+  int pffft_nearest_transform_size(int N, pffft_transform_t cplx, int higher);
+
+  /*
+    the float buffers must have the correct alignment (16-byte boundary
+    on intel and powerpc). This function may be used to obtain such
+    correctly aligned buffers.  
+  */
+  void *pffft_aligned_malloc(size_t nb_bytes);
+  void pffft_aligned_free(void *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* PFFFT_H */
+
--- a/Sources/PFFFTLib/include/pffft_double.h
+++ b/Sources/PFFFTLib/include/pffft_double.h
@@ -0,0 +1,236 @@
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com ) 
+
+   Based on original fortran 77 code from FFTPACKv4 from NETLIB,
+   authored by Dr Paul Swarztrauber of NCAR, in 1985.
+
+   As confirmed by the NCAR fftpack software curators, the following
+   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
+   released under the same terms.
+
+   FFTPACK license:
+
+   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
+
+   Copyright (c) 2004 the University Corporation for Atmospheric
+   Research ("UCAR"). All rights reserved. Developed by NCAR's
+   Computational and Information Systems Laboratory, UCAR,
+   www.cisl.ucar.edu.
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.  
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+/*
+   NOTE: This file is adapted from Julien Pommier's original PFFFT,
+   which works on 32 bit floating point precision using SSE instructions,
+   to work with 64 bit floating point precision using AVX instructions.
+   Author: Dario Mambro @ https://github.com/unevens/pffft
+*/
+/*
+   PFFFT : a Pretty Fast FFT.
+
+   This is basically an adaptation of the single precision fftpack
+   (v4) as found on netlib taking advantage of SIMD instruction found
+   on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
+   
+   For architectures where no SIMD instruction is available, the code
+   falls back to a scalar version.  
+
+   Restrictions: 
+
+   - 1D transforms only, with 64-bit double precision.
+
+   - supports only transforms for inputs of length N of the form
+   N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
+   144, 160, etc are all acceptable lengths). Performance is best for
+   128<=N<=8192.
+
+   - all (double*) pointers in the functions below are expected to
+   have an "simd-compatible" alignment, that is 32 bytes on x86 and
+   powerpc CPUs.
+  
+   You can allocate such buffers with the functions
+   pffft_aligned_malloc / pffft_aligned_free (or with stuff like
+   posix_memalign..)
+
+*/
+
+#ifndef PFFFT_DOUBLE_H
+#define PFFFT_DOUBLE_H
+
+#include <stddef.h> /* for size_t */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  /* opaque struct holding internal stuff (precomputed twiddle factors)
+     this struct can be shared by many threads as it contains only
+     read-only data.  
+  */
+  typedef struct PFFFTD_Setup PFFFTD_Setup;
+
+#ifndef PFFFT_COMMON_ENUMS
+#define PFFFT_COMMON_ENUMS
+
+  /* direction of the transform */
+  typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
+  
+  /* type of transform */
+  typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
+
+#endif
+
+  /*
+    prepare for performing transforms of size N -- the returned
+    PFFFTD_Setup structure is read-only so it can safely be shared by
+    multiple concurrent threads. 
+  */
+  PFFFTD_Setup *pffftd_new_setup(int N, pffft_transform_t transform);
+  void pffftd_destroy_setup(PFFFTD_Setup *);
+  /* 
+     Perform a Fourier transform , The z-domain data is stored in the
+     most efficient order for transforming it back, or using it for
+     convolution. If you need to have its content sorted in the
+     "usual" way, that is as an array of interleaved complex numbers,
+     either use pffft_transform_ordered , or call pffft_zreorder after
+     the forward fft, and before the backward fft.
+
+     Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
+     Typically you will want to scale the backward transform by 1/N.
+     
+     The 'work' pointer should point to an area of N (2*N for complex
+     fft) doubles, properly aligned. If 'work' is NULL, then stack will
+     be used instead (this is probably the best strategy for small
+     FFTs, say for N < 16384). Threads usually have a small stack, that
+     there's no sufficient amount of memory, usually leading to a crash!
+     Use the heap with pffft_aligned_malloc() in this case.
+
+     input and output may alias.
+  */
+  void pffftd_transform(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction);
+
+  /* 
+     Similar to pffft_transform, but makes sure that the output is
+     ordered as expected (interleaved complex numbers).  This is
+     similar to calling pffft_transform and then pffft_zreorder.
+     
+     input and output may alias.
+  */
+  void pffftd_transform_ordered(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction);
+
+  /* 
+     call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
+     PFFFT_FORWARD) if you want to have the frequency components in
+     the correct "canonical" order, as interleaved complex numbers.
+     
+     (for real transforms, both 0-frequency and half frequency
+     components, which are real, are assembled in the first entry as
+     F(0)+i*F(n/2+1). Note that the original fftpack did place
+     F(n/2+1) at the end of the arrays).
+     
+     input and output should not alias.
+  */
+  void pffftd_zreorder(PFFFTD_Setup *setup, const double *input, double *output, pffft_direction_t direction);
+
+  /* 
+     Perform a multiplication of the frequency components of dft_a and
+     dft_b and accumulate them into dft_ab. The arrays should have
+     been obtained with pffft_transform(.., PFFFT_FORWARD) and should
+     *not* have been reordered with pffft_zreorder (otherwise just
+     perform the operation yourself as the dft coefs are stored as
+     interleaved complex numbers).
+     
+     the operation performed is: dft_ab += (dft_a * fdt_b)*scaling
+     
+     The dft_a, dft_b and dft_ab pointers may alias.
+  */
+  void pffftd_zconvolve_accumulate(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double *dft_ab, double scaling);
+
+  /* 
+     Perform a multiplication of the frequency components of dft_a and
+     dft_b and put result in dft_ab. The arrays should have
+     been obtained with pffft_transform(.., PFFFT_FORWARD) and should
+     *not* have been reordered with pffft_zreorder (otherwise just
+     perform the operation yourself as the dft coefs are stored as
+     interleaved complex numbers).
+
+     the operation performed is: dft_ab = (dft_a * fdt_b)*scaling
+
+     The dft_a, dft_b and dft_ab pointers may alias.
+  */
+  void pffftd_zconvolve_no_accu(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double*dft_ab, double scaling);
+
+  /* return 4 or 1 wether support AVX instructions was enabled when building pffft-double.c */
+  int pffftd_simd_size();
+
+  /* return string identifier of used architecture (AVX/..) */
+  const char * pffftd_simd_arch();
+
+  /* simple helper to get minimum possible fft size */
+  int pffftd_min_fft_size(pffft_transform_t transform);
+
+  /* simple helper to determine size N is valid
+     - factorizable to pffft_min_fft_size() with factors 2, 3, 5
+  */
+  int pffftd_is_valid_size(int N, pffft_transform_t cplx);
+
+  /* determine nearest valid transform size  (by brute-force testing)
+     - factorizable to pffft_min_fft_size() with factors 2, 3, 5.
+     higher: bool-flag to find nearest higher value; else lower.
+  */
+  int pffftd_nearest_transform_size(int N, pffft_transform_t cplx, int higher);
+
+
+  /* following functions are identical to the pffft_ functions - both declared */
+
+  /* simple helper to determine next power of 2
+     - without inexact/rounding floating point operations
+  */
+  int pffftd_next_power_of_two(int N);
+  int pffft_next_power_of_two(int N);
+
+  /* simple helper to determine if power of 2 - returns bool */
+  int pffftd_is_power_of_two(int N);
+  int pffft_is_power_of_two(int N);
+
+  /*
+    the double buffers must have the correct alignment (32-byte boundary
+    on intel and powerpc). This function may be used to obtain such
+    correctly aligned buffers.  
+  */
+  void *pffftd_aligned_malloc(size_t nb_bytes);
+  void *pffft_aligned_malloc(size_t nb_bytes);
+  void pffftd_aligned_free(void *);
+  void pffft_aligned_free(void *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* PFFFT_DOUBLE_H */
+
--- a/Sources/PFFFTLib/pffft.c
+++ b/Sources/PFFFTLib/pffft.c
@@ -0,0 +1,134 @@
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+   Copyright (c) 2020  Hayati Ayguen ( h_ayguen@web.de )
+
+   Based on original fortran 77 code from FFTPACKv4 from NETLIB
+   (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
+   of NCAR, in 1985.
+
+   As confirmed by the NCAR fftpack software curators, the following
+   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
+   released under the same terms.
+
+   FFTPACK license:
+
+   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
+
+   Copyright (c) 2004 the University Corporation for Atmospheric
+   Research ("UCAR"). All rights reserved. Developed by NCAR's
+   Computational and Information Systems Laboratory, UCAR,
+   www.cisl.ucar.edu.
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.  
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+
+
+   PFFFT : a Pretty Fast FFT.
+
+   This file is largerly based on the original FFTPACK implementation, modified in
+   order to take advantage of SIMD instructions of modern CPUs.
+*/
+
+/*
+  ChangeLog: 
+  - 2011/10/02, version 1: This is the very first release of this file.
+*/
+
+#include "pffft.h"
+
+/* detect compiler flavour */
+#if defined(_MSC_VER)
+#  define COMPILER_MSVC
+#elif defined(__GNUC__)
+#  define COMPILER_GCC
+#endif
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <math.h>
+#include <assert.h>
+
+#if defined(COMPILER_GCC)
+#  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
+#  define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
+#  define RESTRICT __restrict
+#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
+#elif defined(COMPILER_MSVC)
+#  define ALWAYS_INLINE(return_type) __forceinline return_type
+#  define NEVER_INLINE(return_type) __declspec(noinline) return_type
+#  define RESTRICT __restrict
+#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
+#endif
+
+
+#ifdef COMPILER_MSVC
+#pragma warning( disable : 4244 4305 4204 4456 )
+#endif
+
+/* 
+   vector support macros: the rest of the code is independant of
+   SSE/Altivec/NEON -- adding support for other platforms with 4-element
+   vectors should be limited to these macros 
+*/
+#include "simd/pf_float.h"
+
+/* have code comparable with this definition */
+#define SETUP_STRUCT               PFFFT_Setup
+#define FUNC_NEW_SETUP             pffft_new_setup
+#define FUNC_DESTROY               pffft_destroy_setup
+#define FUNC_TRANSFORM_UNORDRD     pffft_transform
+#define FUNC_TRANSFORM_ORDERED     pffft_transform_ordered
+#define FUNC_ZREORDER              pffft_zreorder
+#define FUNC_ZCONVOLVE_ACCUMULATE  pffft_zconvolve_accumulate
+#define FUNC_ZCONVOLVE_NO_ACCU     pffft_zconvolve_no_accu
+
+#define FUNC_ALIGNED_MALLOC        pffft_aligned_malloc
+#define FUNC_ALIGNED_FREE          pffft_aligned_free
+#define FUNC_SIMD_SIZE             pffft_simd_size
+#define FUNC_MIN_FFT_SIZE          pffft_min_fft_size
+#define FUNC_IS_VALID_SIZE         pffft_is_valid_size
+#define FUNC_NEAREST_SIZE          pffft_nearest_transform_size
+#define FUNC_SIMD_ARCH             pffft_simd_arch
+#define FUNC_VALIDATE_SIMD_A       validate_pffft_simd
+#define FUNC_VALIDATE_SIMD_EX      validate_pffft_simd_ex
+
+#define FUNC_CPLX_FINALIZE         pffft_cplx_finalize
+#define FUNC_CPLX_PREPROCESS       pffft_cplx_preprocess
+#define FUNC_REAL_PREPROCESS_4X4   pffft_real_preprocess_4x4
+#define FUNC_REAL_PREPROCESS       pffft_real_preprocess
+#define FUNC_REAL_FINALIZE_4X4     pffft_real_finalize_4x4
+#define FUNC_REAL_FINALIZE         pffft_real_finalize
+#define FUNC_TRANSFORM_INTERNAL    pffft_transform_internal
+
+#define FUNC_COS  cosf
+#define FUNC_SIN  sinf
+
+
+#include "pffft_priv_impl.h"
+
+
--- a/Sources/PFFFTLib/pffft_common.c
+++ b/Sources/PFFFTLib/pffft_common.c
@@ -0,0 +1,53 @@
+
+#include "pffft.h"
+
+#include <stdlib.h>
+
+/* SSE and co like 16-bytes aligned pointers
+ * with a 64-byte alignment, we are even aligned on L2 cache lines... */
+#define MALLOC_V4SF_ALIGNMENT 64
+
+static void * Valigned_malloc(size_t nb_bytes) {
+  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
+  if (!p0) return (void *) 0;
+  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
+  *((void **) p - 1) = p0;
+  return p;
+}
+
+static void Valigned_free(void *p) {
+  if (p) free(*((void **) p - 1));
+}
+
+
+static int next_power_of_two(int N) {
+  /* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */
+  /* compute the next highest power of 2 of 32-bit v */
+  unsigned v = N;
+  v--;
+  v |= v >> 1;
+  v |= v >> 2;
+  v |= v >> 4;
+  v |= v >> 8;
+  v |= v >> 16;
+  v++;
+  return v;
+}
+
+static int is_power_of_two(int N) {
+  /* https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2 */
+  int f = N && !(N & (N - 1));
+  return f;
+}
+
+
+
+void *pffft_aligned_malloc(size_t nb_bytes) { return Valigned_malloc(nb_bytes); }
+void pffft_aligned_free(void *p) { Valigned_free(p); }
+int pffft_next_power_of_two(int N) { return next_power_of_two(N); }
+int pffft_is_power_of_two(int N) { return is_power_of_two(N); }
+
+void *pffftd_aligned_malloc(size_t nb_bytes) { return Valigned_malloc(nb_bytes); }
+void pffftd_aligned_free(void *p) { Valigned_free(p); }
+int pffftd_next_power_of_two(int N) { return next_power_of_two(N); }
+int pffftd_is_power_of_two(int N) { return is_power_of_two(N); }
--- a/Sources/PFFFTLib/pffft_double.c
+++ b/Sources/PFFFTLib/pffft_double.c
@@ -0,0 +1,147 @@
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+   Copyright (c) 2020  Hayati Ayguen ( h_ayguen@web.de )
+   Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
+
+   Based on original fortran 77 code from FFTPACKv4 from NETLIB
+   (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
+   of NCAR, in 1985.
+
+   As confirmed by the NCAR fftpack software curators, the following
+   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
+   released under the same terms.
+
+   FFTPACK license:
+
+   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
+
+   Copyright (c) 2004 the University Corporation for Atmospheric
+   Research ("UCAR"). All rights reserved. Developed by NCAR's
+   Computational and Information Systems Laboratory, UCAR,
+   www.cisl.ucar.edu.
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.  
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+
+
+   PFFFT : a Pretty Fast FFT.
+
+   This file is largerly based on the original FFTPACK implementation, modified in
+   order to take advantage of SIMD instructions of modern CPUs.
+*/
+
+/*
+   NOTE: This file is adapted from Julien Pommier's original PFFFT,
+   which works on 32 bit floating point precision using SSE instructions,
+   to work with 64 bit floating point precision using AVX instructions.
+   Author: Dario Mambro @ https://github.com/unevens/pffft
+*/
+
+#include "pffft_double.h"
+
+/* detect compiler flavour */
+#if defined(_MSC_VER)
+#  define COMPILER_MSVC
+#elif defined(__GNUC__)
+#  define COMPILER_GCC
+#endif
+
+#ifdef COMPILER_MSVC
+#  define _USE_MATH_DEFINES
+#  include <malloc.h>
+#elif defined(__MINGW32__) || defined(__MINGW64__)
+#  include <malloc.h>
+#else
+#  include <alloca.h>
+#endif
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <math.h>
+#include <assert.h>
+
+#if defined(COMPILER_GCC)
+#  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
+#  define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
+#  define RESTRICT __restrict
+#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
+#elif defined(COMPILER_MSVC)
+#  define ALWAYS_INLINE(return_type) __forceinline return_type
+#  define NEVER_INLINE(return_type) __declspec(noinline) return_type
+#  define RESTRICT __restrict
+#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
+#endif
+
+
+#ifdef COMPILER_MSVC
+#pragma warning( disable : 4244 4305 4204 4456 )
+#endif
+
+/* 
+   vector support macros: the rest of the code is independant of
+   AVX -- adding support for other platforms with 4-element
+   vectors should be limited to these macros 
+*/
+#include "simd/pf_double.h"
+
+/* have code comparable with this definition */
+#define float double
+#define SETUP_STRUCT               PFFFTD_Setup
+#define FUNC_NEW_SETUP             pffftd_new_setup
+#define FUNC_DESTROY               pffftd_destroy_setup
+#define FUNC_TRANSFORM_UNORDRD     pffftd_transform
+#define FUNC_TRANSFORM_ORDERED     pffftd_transform_ordered
+#define FUNC_ZREORDER              pffftd_zreorder
+#define FUNC_ZCONVOLVE_ACCUMULATE  pffftd_zconvolve_accumulate
+#define FUNC_ZCONVOLVE_NO_ACCU     pffftd_zconvolve_no_accu
+
+#define FUNC_ALIGNED_MALLOC        pffftd_aligned_malloc
+#define FUNC_ALIGNED_FREE          pffftd_aligned_free
+#define FUNC_SIMD_SIZE             pffftd_simd_size
+#define FUNC_MIN_FFT_SIZE          pffftd_min_fft_size
+#define FUNC_IS_VALID_SIZE         pffftd_is_valid_size
+#define FUNC_NEAREST_SIZE          pffftd_nearest_transform_size
+#define FUNC_SIMD_ARCH             pffftd_simd_arch
+#define FUNC_VALIDATE_SIMD_A       validate_pffftd_simd
+#define FUNC_VALIDATE_SIMD_EX      validate_pffftd_simd_ex
+
+#define FUNC_CPLX_FINALIZE         pffftd_cplx_finalize
+#define FUNC_CPLX_PREPROCESS       pffftd_cplx_preprocess
+#define FUNC_REAL_PREPROCESS_4X4   pffftd_real_preprocess_4x4
+#define FUNC_REAL_PREPROCESS       pffftd_real_preprocess
+#define FUNC_REAL_FINALIZE_4X4     pffftd_real_finalize_4x4
+#define FUNC_REAL_FINALIZE         pffftd_real_finalize
+#define FUNC_TRANSFORM_INTERNAL    pffftd_transform_internal
+
+#define FUNC_COS  cos
+#define FUNC_SIN  sin
+
+
+#include "pffft_priv_impl.h"
+
+
--- a/Sources/PFFFTLib/pffft_priv_impl.h
+++ b/Sources/PFFFTLib/pffft_priv_impl.h
--- a/Sources/PFFFTLib/simd/pf_altivec_float.h
+++ b/Sources/PFFFTLib/simd/pf_altivec_float.h
@@ -0,0 +1,81 @@
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_ALTIVEC_FLT_H
+#define PF_ALTIVEC_FLT_H
+
+/*
+   Altivec support macros
+*/
+#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
+#pragma message( __FILE__ ": ALTIVEC float macros are defined" )
+typedef vector float v4sf;
+
+#  define SIMD_SZ 4
+
+typedef union v4sf_union {
+  v4sf  v;
+  float f[SIMD_SZ];
+} v4sf_union;
+
+#  define VREQUIRES_ALIGN 1  /* not sure, if really required */
+#  define VARCH "ALTIVEC"
+#  define VZERO() ((vector float) vec_splat_u8(0))
+#  define VMUL(a,b) vec_madd(a,b, VZERO())
+#  define VADD(a,b) vec_add(a,b)
+#  define VMADD(a,b,c) vec_madd(a,b,c)
+#  define VSUB(a,b) vec_sub(a,b)
+inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
+#  define LD_PS1(p) ld_ps1(&p)
+#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
+#  define UNINTERLEAVE2(in1, in2, out1, out2) {                           \
+    vector unsigned char vperm1 =  (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
+    vector unsigned char vperm2 =  (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
+    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
+  }
+#  define VTRANSPOSE4(x0,x1,x2,x3) {              \
+    v4sf y0 = vec_mergeh(x0, x2);               \
+    v4sf y1 = vec_mergel(x0, x2);               \
+    v4sf y2 = vec_mergeh(x1, x3);               \
+    v4sf y3 = vec_mergel(x1, x3);               \
+    x0 = vec_mergeh(y0, y2);                    \
+    x1 = vec_mergel(y0, y2);                    \
+    x2 = vec_mergeh(y1, y3);                    \
+    x3 = vec_mergel(y1, y3);                    \
+  }
+#  define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
+
+#endif
+
+#endif /* PF_SSE1_FLT_H */
+
--- a/Sources/PFFFTLib/simd/pf_avx_double.h
+++ b/Sources/PFFFTLib/simd/pf_avx_double.h
@@ -0,0 +1,145 @@
+/*
+   Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
+*/
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_AVX_DBL_H
+#define PF_AVX_DBL_H
+
+/*
+   vector support macros: the rest of the code is independant of
+   AVX -- adding support for other platforms with 4-element
+   vectors should be limited to these macros
+*/
+
+
+/*
+  AVX support macros
+*/
+#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && defined(__AVX__)
+#pragma message( __FILE__ ": AVX macros are defined" )
+
+#include <immintrin.h>
+typedef __m256d v4sf;
+
+/* 4 doubles by simd vector */
+#  define SIMD_SZ 4
+
+typedef union v4sf_union {
+  v4sf  v;
+  double f[SIMD_SZ];
+} v4sf_union;
+
+#  define VARCH "AVX"
+#  define VREQUIRES_ALIGN 1
+#  define VZERO() _mm256_setzero_pd()
+#  define VMUL(a,b) _mm256_mul_pd(a,b)
+#  define VADD(a,b) _mm256_add_pd(a,b)
+#  define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
+#  define VSUB(a,b) _mm256_sub_pd(a,b)
+#  define LD_PS1(p) _mm256_set1_pd(p)
+#  define VLOAD_UNALIGNED(ptr)  _mm256_loadu_pd(ptr)
+#  define VLOAD_ALIGNED(ptr)    _mm256_load_pd(ptr)
+
+/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
+out1 = [ in1[0], in2[0], in1[1], in2[1] ]
+out2 = [ in1[2], in2[2], in1[3], in2[3] ]
+*/
+#  define INTERLEAVE2(in1, in2, out1, out2) {							\
+	__m128d low1__ = _mm256_castpd256_pd128(in1);						\
+	__m128d low2__ = _mm256_castpd256_pd128(in2);						\
+	__m128d high1__ = _mm256_extractf128_pd(in1, 1);					\
+	__m128d high2__ = _mm256_extractf128_pd(in2, 1);					\
+	__m256d tmp__ = _mm256_insertf128_pd(								\
+		_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)),		\
+		_mm_shuffle_pd(low1__, low2__, 3),								\
+		1);																\
+	out2 = _mm256_insertf128_pd(										\
+		_mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)),	\
+		_mm_shuffle_pd(high1__, high2__, 3),							\
+		1);																\
+	out1 = tmp__;														\
+}
+
+/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
+out1 = [ in1[0], in1[2], in2[0], in2[2] ]
+out2 = [ in1[1], in1[3], in2[1], in2[3] ]
+*/
+#  define UNINTERLEAVE2(in1, in2, out1, out2) {							\
+	__m128d low1__ = _mm256_castpd256_pd128(in1);						\
+	__m128d low2__ = _mm256_castpd256_pd128(in2);						\
+	__m128d high1__ = _mm256_extractf128_pd(in1, 1);					\
+	__m128d high2__ = _mm256_extractf128_pd(in2, 1); 					\
+	__m256d tmp__ = _mm256_insertf128_pd(								\
+		_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)),		\
+		_mm_shuffle_pd(low2__, high2__, 0),								\
+		1);																\
+	out2 = _mm256_insertf128_pd(										\
+		_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)),		\
+		_mm_shuffle_pd(low2__, high2__, 3),								\
+		1);																\
+	out1 = tmp__;														\
+}
+
+#  define VTRANSPOSE4(row0, row1, row2, row3) {				\
+        __m256d tmp3, tmp2, tmp1, tmp0;                     \
+                                                            \
+        tmp0 = _mm256_shuffle_pd((row0),(row1), 0x0);       \
+        tmp2 = _mm256_shuffle_pd((row0),(row1), 0xF);       \
+        tmp1 = _mm256_shuffle_pd((row2),(row3), 0x0);       \
+        tmp3 = _mm256_shuffle_pd((row2),(row3), 0xF);       \
+                                                            \
+        (row0) = _mm256_permute2f128_pd(tmp0, tmp1, 0x20);	\
+        (row1) = _mm256_permute2f128_pd(tmp2, tmp3, 0x20);  \
+        (row2) = _mm256_permute2f128_pd(tmp0, tmp1, 0x31);  \
+        (row3) = _mm256_permute2f128_pd(tmp2, tmp3, 0x31);  \
+    }
+
+/*VSWAPHL(a, b) pseudo code:
+return [ b[0], b[1], a[2], a[3] ]
+*/
+#  define VSWAPHL(a,b)	\
+   _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1)
+
+/* reverse/flip all floats */
+#  define VREV_S(a)    _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_permute_pd(_mm256_extractf128_pd(a, 1),1)), _mm_permute_pd(_mm256_castpd256_pd128(a), 1), 1)
+
+/* reverse/flip complex floats */
+#  define VREV_C(a)    _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1)
+
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
+
+#endif
+
+#endif /* PF_AVX_DBL_H */
+
--- a/Sources/PFFFTLib/simd/pf_double.h
+++ b/Sources/PFFFTLib/simd/pf_double.h
@@ -0,0 +1,84 @@
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_DBL_H
+#define PF_DBL_H
+
+#include <assert.h>
+#include <string.h>
+#include <stdint.h>
+
+
+/*
+ *  SIMD reference material:
+ *
+ * general SIMD introduction:
+ * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
+ *
+ * SSE 1:
+ * https://software.intel.com/sites/landingpage/IntrinsicsGuide/
+ *
+ * ARM NEON:
+ * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
+ *
+ * Altivec:
+ * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
+ * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
+ * better one?
+ *
+ */
+
+typedef double vsfscalar;
+
+#include "pf_avx_double.h"
+#include "pf_sse2_double.h"
+#include "pf_neon_double.h"
+
+#ifndef SIMD_SZ
+#  if !defined(PFFFT_SIMD_DISABLE)
+#    pragma message( "building double with simd disabled !" )
+#    define PFFFT_SIMD_DISABLE /* fallback to scalar code */
+#  endif
+#endif
+
+#include "pf_scalar_double.h"
+
+/* shortcuts for complex multiplcations */
+#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
+#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
+#ifndef SVMUL
+/* multiply a scalar with a vector */
+#define SVMUL(f,v) VMUL(LD_PS1(f),v)
+#endif
+
+#endif /* PF_DBL_H */
+
--- a/Sources/PFFFTLib/simd/pf_float.h
+++ b/Sources/PFFFTLib/simd/pf_float.h
@@ -0,0 +1,84 @@
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_FLT_H
+#define PF_FLT_H
+
+#include <assert.h>
+#include <string.h>
+#include <stdint.h>
+
+
+/*
+ *  SIMD reference material:
+ *
+ * general SIMD introduction:
+ * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
+ *
+ * SSE 1:
+ * https://software.intel.com/sites/landingpage/IntrinsicsGuide/
+ *
+ * ARM NEON:
+ * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
+ *
+ * Altivec:
+ * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
+ * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
+ * better one?
+ *
+ */
+
+typedef float vsfscalar;
+
+#include "pf_sse1_float.h"
+#include "pf_neon_float.h"
+#include "pf_altivec_float.h"
+
+#ifndef SIMD_SZ
+#  if !defined(PFFFT_SIMD_DISABLE)
+#    pragma message( "building float with simd disabled !" )
+#    define PFFFT_SIMD_DISABLE /* fallback to scalar code */
+#  endif
+#endif
+
+#include "pf_scalar_float.h"
+
+/* shortcuts for complex multiplcations */
+#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
+#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
+#ifndef SVMUL
+/* multiply a scalar with a vector */
+#define SVMUL(f,v) VMUL(LD_PS1(f),v)
+#endif
+
+#endif /* PF_FLT_H */
+
--- a/Sources/PFFFTLib/simd/pf_neon_double.h
+++ b/Sources/PFFFTLib/simd/pf_neon_double.h
@@ -0,0 +1,203 @@
+/*
+   Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
+*/
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_NEON_DBL_H
+#define PF_NEON_DBL_H
+
+/*
+  NEON 64bit support macros
+*/
+#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__))
+
+#pragma message (__FILE__ ": NEON (from AVX) macros are defined" )
+
+#include "pf_neon_double_from_avx.h"
+typedef __m256d v4sf;
+
+/* 4 doubles by simd vector */
+#  define SIMD_SZ 4
+
+typedef union v4sf_union {
+  v4sf  v;
+  double f[SIMD_SZ];
+} v4sf_union;
+
+#  define VARCH "NEON"
+#  define VREQUIRES_ALIGN 1
+#  define VZERO() _mm256_setzero_pd()
+#  define VMUL(a,b) _mm256_mul_pd(a,b)
+#  define VADD(a,b) _mm256_add_pd(a,b)
+#  define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
+#  define VSUB(a,b) _mm256_sub_pd(a,b)
+#  define LD_PS1(p) _mm256_set1_pd(p)
+#  define VLOAD_UNALIGNED(ptr)  _mm256_loadu_pd(ptr)
+#  define VLOAD_ALIGNED(ptr)    _mm256_load_pd(ptr)
+
+FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b)
+{
+    __m256d res;
+    res.vect_f64[0] = a.vect_f64[0];
+    res.vect_f64[1] = b;
+    return res;
+}
+
+FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
+{
+    float64x1_t al = vget_low_f64(a);
+    float64x1_t bl = vget_low_f64(b);
+    return vcombine_f64(al, bl);
+}
+
+FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b)
+{
+    float64x1_t ah = vget_high_f64(a);
+    float64x1_t bh = vget_high_f64(b);
+    return vcombine_f64(ah, bh);
+}
+
+FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b)
+{
+    __m256d res;
+    res.vect_f64[0] = _mm_shuffle_pd_00(a.vect_f64[0],b.vect_f64[0]);
+    res.vect_f64[1] = _mm_shuffle_pd_00(a.vect_f64[1],b.vect_f64[1]);
+    return res;
+}
+
+FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b)
+{
+    __m256d res;
+    res.vect_f64[0] = _mm_shuffle_pd_11(a.vect_f64[0],b.vect_f64[0]);
+    res.vect_f64[1] = _mm_shuffle_pd_11(a.vect_f64[1],b.vect_f64[1]);
+    return res;
+}
+
+FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b) {
+    __m256d res;
+    res.vect_f64[0] = a.vect_f64[0];
+    res.vect_f64[1] = b.vect_f64[0];
+    return res;
+}
+
+
+FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b)
+{
+    __m256d res;
+    res.vect_f64[0] = a.vect_f64[1];
+    res.vect_f64[1] = b.vect_f64[1];
+    return res;
+}
+
+FORCE_INLINE __m256d _mm256_reverse(__m256d x)
+{
+    __m256d res;
+    float64x2_t low = x.vect_f64[0];
+    float64x2_t high = x.vect_f64[1];
+    float64x1_t a = vget_low_f64(low);
+    float64x1_t b = vget_high_f64(low);
+    float64x1_t c = vget_low_f64(high);
+    float64x1_t d = vget_high_f64(high);
+    res.vect_f64[0] =  vcombine_f64(d, c);
+    res.vect_f64[1] =  vcombine_f64(b, a);
+    return res;
+}
+
+/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
+out1 = [ in1[0], in2[0], in1[1], in2[1] ]
+out2 = [ in1[2], in2[2], in1[3], in2[3] ]
+*/
+#  define INTERLEAVE2(in1, in2, out1, out2) {							\
+	__m128d low1__ = _mm256_castpd256_pd128(in1);						\
+	__m128d low2__ = _mm256_castpd256_pd128(in2);						\
+	__m128d high1__ = _mm256_extractf128_pd(in1, 1);					\
+	__m128d high2__ = _mm256_extractf128_pd(in2, 1);					\
+	__m256d tmp__ = _mm256_insertf128_pd_1(								\
+		_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)),		\
+		_mm_shuffle_pd_11(low1__, low2__));								\
+	out2 = _mm256_insertf128_pd_1(										\
+		_mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)),	\
+		_mm_shuffle_pd_11(high1__, high2__));							\
+	out1 = tmp__;														\
+}
+
+/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
+out1 = [ in1[0], in1[2], in2[0], in2[2] ]
+out2 = [ in1[1], in1[3], in2[1], in2[3] ]
+*/
+#  define UNINTERLEAVE2(in1, in2, out1, out2) {							\
+	__m128d low1__ = _mm256_castpd256_pd128(in1);						\
+	__m128d low2__ = _mm256_castpd256_pd128(in2);						\
+	__m128d high1__ = _mm256_extractf128_pd(in1, 1);					\
+	__m128d high2__ = _mm256_extractf128_pd(in2, 1); 					\
+	__m256d tmp__ = _mm256_insertf128_pd_1(								\
+		_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)),		\
+		_mm_shuffle_pd_00(low2__, high2__));							\
+	out2 = _mm256_insertf128_pd_1(										\
+		_mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)),		\
+		_mm_shuffle_pd_11(low2__, high2__));							\
+	out1 = tmp__;														\
+}
+
+#  define VTRANSPOSE4(row0, row1, row2, row3) {							\
+        __m256d tmp3, tmp2, tmp1, tmp0;                     			\
+                                                            			\
+        tmp0 = _mm256_shuffle_pd_00((row0),(row1));       				\
+        tmp2 = _mm256_shuffle_pd_11((row0),(row1));       				\
+        tmp1 = _mm256_shuffle_pd_00((row2),(row3));       				\
+        tmp3 = _mm256_shuffle_pd_11((row2),(row3));       				\
+                                                            			\
+        (row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1);			    \
+        (row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3); 		        \
+        (row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1); 		        \
+        (row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3); 		        \
+    }
+
+/*VSWAPHL(a, b) pseudo code:
+return [ b[0], b[1], a[2], a[3] ]
+*/
+#  define VSWAPHL(a,b)	\
+   _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1))
+
+/* reverse/flip all floats */
+#  define VREV_S(a)   _mm256_reverse(a)
+
+/* reverse/flip complex floats */
+#  define VREV_C(a)    _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a))
+
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
+
+#endif
+
+#endif /* PF_AVX_DBL_H */
+
--- a/Sources/PFFFTLib/simd/pf_neon_double_from_avx.h
+++ b/Sources/PFFFTLib/simd/pf_neon_double_from_avx.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+
+ * http://www.apache.org/licenses/LICENSE-2.0
+
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+
+ */
+
+//see https://github.com/kunpengcompute/AvxToNeon
+
+#ifndef PF_NEON_DBL_FROM_AVX_H
+#define PF_NEON_DBL_FROM_AVX_H
+#include <arm_neon.h>
+
+
+#if defined(__GNUC__) || defined(__clang__)
+
+#pragma push_macro("FORCE_INLINE")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+
+#else
+
+#error "Macro name collisions may happens with unknown compiler"
+#ifdef FORCE_INLINE
+#undef FORCE_INLINE
+#endif
+
+#define FORCE_INLINE static inline
+
+#endif
+
+typedef struct {
+    float32x4_t vect_f32[2];
+} __m256;
+
+typedef struct {
+    float64x2_t vect_f64[2];
+} __m256d;
+
+typedef float64x2_t __m128d;
+
+FORCE_INLINE __m256d _mm256_setzero_pd(void)
+{
+    __m256d ret;
+    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0);
+    return ret;
+}
+
+FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
+{
+    __m256d res_m256d;
+    res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]);
+    res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]);
+    return res_m256d;
+}
+
+FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
+{
+    __m256d res_m256d;
+    res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]);
+    res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]);
+    return res_m256d;
+}
+
+FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
+{
+    __m256d res_m256d;
+    res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]);
+    res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]);
+    return res_m256d;
+}
+
+FORCE_INLINE __m256d _mm256_set1_pd(double a)
+{
+    __m256d ret;
+    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a);
+    return ret;
+}
+
+FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr)
+{
+    __m256d res;
+    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
+    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
+    return res;
+}
+FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr)
+{
+    __m256d res;
+    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
+    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
+    return res;
+}
+
+FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
+{
+    return a.vect_f64[0];
+}
+
+FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
+{
+    assert(imm8 >= 0 && imm8 <= 1);
+    return a.vect_f64[imm8];
+}
+
+FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
+{
+    __m256d res;
+    res.vect_f64[0] = a;
+    return res;
+}
+
+#endif /* PF_AVX_DBL_H */
+
--- a/Sources/PFFFTLib/simd/pf_neon_float.h
+++ b/Sources/PFFFTLib/simd/pf_neon_float.h
@@ -0,0 +1,87 @@
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_NEON_FLT_H
+#define PF_NEON_FLT_H
+
+/*
+  ARM NEON support macros
+*/
+#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
+#pragma message( __FILE__ ": ARM NEON macros are defined" )
+
+#  include <arm_neon.h>
+typedef float32x4_t v4sf;
+
+#  define SIMD_SZ 4
+
+typedef union v4sf_union {
+  v4sf  v;
+  float f[SIMD_SZ];
+} v4sf_union;
+
+#  define VARCH "NEON"
+#  define VREQUIRES_ALIGN 0  /* usually no alignment required */
+#  define VZERO() vdupq_n_f32(0)
+#  define VMUL(a,b) vmulq_f32(a,b)
+#  define VADD(a,b) vaddq_f32(a,b)
+#  define VMADD(a,b,c) vmlaq_f32(c,a,b)
+#  define VSUB(a,b) vsubq_f32(a,b)
+#  define LD_PS1(p) vld1q_dup_f32(&(p))
+#  define VLOAD_UNALIGNED(ptr)  (*((v4sf*)(ptr)))
+#  define VLOAD_ALIGNED(ptr)    (*((v4sf*)(ptr)))
+#  define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
+#  define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
+#  define VTRANSPOSE4(x0,x1,x2,x3) {                                    \
+    float32x4x2_t t0_ = vzipq_f32(x0, x2);                              \
+    float32x4x2_t t1_ = vzipq_f32(x1, x3);                              \
+    float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]);              \
+    float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]);              \
+    x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
+  }
+// marginally faster version
+//#  define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
+#  define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
+
+/* reverse/flip all floats */
+#  define VREV_S(a)    vcombine_f32(vrev64_f32(vget_high_f32(a)), vrev64_f32(vget_low_f32(a)))
+/* reverse/flip complex floats */
+#  define VREV_C(a)    vextq_f32(a, a, 2)
+
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
+
+#else
+/* #pragma message( __FILE__ ": ARM NEON macros are not defined" ) */
+#endif
+
+#endif /* PF_NEON_FLT_H */
+
--- a/Sources/PFFFTLib/simd/pf_scalar_double.h
+++ b/Sources/PFFFTLib/simd/pf_scalar_double.h
@@ -0,0 +1,185 @@
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+   Copyright (c) 2020  Hayati Ayguen ( h_ayguen@web.de )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_SCAL_DBL_H
+#define PF_SCAL_DBL_H
+
+/*
+  fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead
+*/
+
+#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
+#pragma message( __FILE__ ": double SCALAR4 macros are defined" )
+
+typedef struct {
+  vsfscalar a;
+  vsfscalar b;
+  vsfscalar c;
+  vsfscalar d;
+} v4sf;
+
+#  define SIMD_SZ 4
+
+typedef union v4sf_union {
+  v4sf  v;
+  vsfscalar f[SIMD_SZ];
+} v4sf_union;
+
+#  define VARCH "4xScalar"
+#  define VREQUIRES_ALIGN 0
+
+  static ALWAYS_INLINE(v4sf) VZERO() {
+    v4sf r = { 0.f, 0.f, 0.f, 0.f };
+    return r;
+  }
+
+  static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) {
+    v4sf r = { A.a * B.a, A.b * B.b, A.c * B.c, A.d * B.d };
+    return r;
+  }
+
+  static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) {
+    v4sf r = { A.a + B.a, A.b + B.b, A.c + B.c, A.d + B.d };
+    return r;
+  }
+
+  static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) {
+    v4sf r = { A.a * B.a + C.a, A.b * B.b + C.b, A.c * B.c + C.c, A.d * B.d + C.d };
+    return r;
+  }
+
+  static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) {
+    v4sf r = { A.a - B.a, A.b - B.b, A.c - B.c, A.d - B.d };
+    return r;
+  }
+
+  static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) {
+    v4sf r = { v, v, v, v };
+    return r;
+  }
+
+#  define VLOAD_UNALIGNED(ptr)  (*((v4sf*)(ptr)))
+
+#  define VLOAD_ALIGNED(ptr)    (*((v4sf*)(ptr)))
+
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0)
+
+
+  /* INTERLEAVE2() */
+  #define INTERLEAVE2( A, B, C, D) \
+  do { \
+    v4sf Cr = { A.a, B.a, A.b, B.b }; \
+    v4sf Dr = { A.c, B.c, A.d, B.d }; \
+    C = Cr; \
+    D = Dr; \
+  } while (0)
+
+
+  /* UNINTERLEAVE2() */
+  #define UNINTERLEAVE2(A, B, C, D) \
+  do { \
+    v4sf Cr = { A.a, A.c, B.a, B.c }; \
+    v4sf Dr = { A.b, A.d, B.b, B.d }; \
+    C = Cr; \
+    D = Dr; \
+  } while (0)
+
+
+  /* VTRANSPOSE4() */
+  #define VTRANSPOSE4(A, B, C, D) \
+  do { \
+    v4sf Ar = { A.a, B.a, C.a, D.a }; \
+    v4sf Br = { A.b, B.b, C.b, D.b }; \
+    v4sf Cr = { A.c, B.c, C.c, D.c }; \
+    v4sf Dr = { A.d, B.d, C.d, D.d }; \
+    A = Ar; \
+    B = Br; \
+    C = Cr; \
+    D = Dr; \
+  } while (0)
+
+
+  /* VSWAPHL() */
+  static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) {
+    v4sf r = { B.a, B.b, A.c, A.d };
+    return r;
+  }
+
+
+  /* reverse/flip all floats */
+  static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) {
+    v4sf r = { A.d, A.c, A.b, A.a };
+    return r;
+  }
+
+  /* reverse/flip complex floats */
+  static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) {
+    v4sf r = { A.c, A.d, A.a, A.b };
+    return r;
+  }
+
+#else
+/* #pragma message( __FILE__ ": double SCALAR4 macros are not defined" ) */
+#endif
+
+
+#if !defined(SIMD_SZ)
+#pragma message( __FILE__ ": float SCALAR1 macros are defined" )
+typedef vsfscalar v4sf;
+
+#  define SIMD_SZ 1
+
+typedef union v4sf_union {
+  v4sf  v;
+  vsfscalar f[SIMD_SZ];
+} v4sf_union;
+
+#  define VARCH "Scalar"
+#  define VREQUIRES_ALIGN 0
+#  define VZERO() 0.0
+#  define VMUL(a,b) ((a)*(b))
+#  define VADD(a,b) ((a)+(b))
+#  define VMADD(a,b,c) ((a)*(b)+(c))
+#  define VSUB(a,b) ((a)-(b))
+#  define LD_PS1(p) (p)
+#  define VLOAD_UNALIGNED(ptr)  (*(ptr))
+#  define VLOAD_ALIGNED(ptr)    (*(ptr))
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)
+
+#else
+/* #pragma message( __FILE__ ": double SCALAR1 macros are not defined" ) */
+#endif
+
+
+#endif /* PF_SCAL_DBL_H */
+
--- a/Sources/PFFFTLib/simd/pf_scalar_float.h
+++ b/Sources/PFFFTLib/simd/pf_scalar_float.h
@@ -0,0 +1,185 @@
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+   Copyright (c) 2020  Hayati Ayguen ( h_ayguen@web.de )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_SCAL_FLT_H
+#define PF_SCAL_FLT_H
+
+/*
+  fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead
+*/
+
+#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
+#pragma message( __FILE__ ": float SCALAR4 macros are defined" )
+
+typedef struct {
+  vsfscalar a;
+  vsfscalar b;
+  vsfscalar c;
+  vsfscalar d;
+} v4sf;
+
+#  define SIMD_SZ 4
+
+typedef union v4sf_union {
+  v4sf  v;
+  vsfscalar f[SIMD_SZ];
+} v4sf_union;
+
+#  define VARCH "4xScalar"
+#  define VREQUIRES_ALIGN 0
+
+  static ALWAYS_INLINE(v4sf) VZERO() {
+    v4sf r = { 0.f, 0.f, 0.f, 0.f };
+    return r;
+  }
+
+  static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) {
+    v4sf r = { A.a * B.a, A.b * B.b, A.c * B.c, A.d * B.d };
+    return r;
+  }
+
+  static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) {
+    v4sf r = { A.a + B.a, A.b + B.b, A.c + B.c, A.d + B.d };
+    return r;
+  }
+
+  static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) {
+    v4sf r = { A.a * B.a + C.a, A.b * B.b + C.b, A.c * B.c + C.c, A.d * B.d + C.d };
+    return r;
+  }
+
+  static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) {
+    v4sf r = { A.a - B.a, A.b - B.b, A.c - B.c, A.d - B.d };
+    return r;
+  }
+
+  static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) {
+    v4sf r = { v, v, v, v };
+    return r;
+  }
+
+#  define VLOAD_UNALIGNED(ptr)  (*((v4sf*)(ptr)))
+
+#  define VLOAD_ALIGNED(ptr)    (*((v4sf*)(ptr)))
+
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0)
+
+
+  /* INTERLEAVE2() */
+  #define INTERLEAVE2( A, B, C, D) \
+  do { \
+    v4sf Cr = { A.a, B.a, A.b, B.b }; \
+    v4sf Dr = { A.c, B.c, A.d, B.d }; \
+    C = Cr; \
+    D = Dr; \
+  } while (0)
+
+
+  /* UNINTERLEAVE2() */
+  #define UNINTERLEAVE2(A, B, C, D) \
+  do { \
+    v4sf Cr = { A.a, A.c, B.a, B.c }; \
+    v4sf Dr = { A.b, A.d, B.b, B.d }; \
+    C = Cr; \
+    D = Dr; \
+  } while (0)
+
+
+  /* VTRANSPOSE4() */
+  #define VTRANSPOSE4(A, B, C, D) \
+  do { \
+    v4sf Ar = { A.a, B.a, C.a, D.a }; \
+    v4sf Br = { A.b, B.b, C.b, D.b }; \
+    v4sf Cr = { A.c, B.c, C.c, D.c }; \
+    v4sf Dr = { A.d, B.d, C.d, D.d }; \
+    A = Ar; \
+    B = Br; \
+    C = Cr; \
+    D = Dr; \
+  } while (0)
+
+
+  /* VSWAPHL() */
+  static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) {
+    v4sf r = { B.a, B.b, A.c, A.d };
+    return r;
+  }
+
+
+  /* reverse/flip all floats */
+  static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) {
+    v4sf r = { A.d, A.c, A.b, A.a };
+    return r;
+  }
+
+  /* reverse/flip complex floats */
+  static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) {
+    v4sf r = { A.c, A.d, A.a, A.b };
+    return r;
+  }
+
+#else
+/* #pragma message( __FILE__ ": float SCALAR4 macros are not defined" ) */
+#endif
+
+
+#if !defined(SIMD_SZ)
+#pragma message( __FILE__ ": float SCALAR1 macros are defined" )
+typedef vsfscalar v4sf;
+
+#  define SIMD_SZ 1
+
+typedef union v4sf_union {
+  v4sf  v;
+  vsfscalar f[SIMD_SZ];
+} v4sf_union;
+
+#  define VARCH "Scalar"
+#  define VREQUIRES_ALIGN 0
+#  define VZERO() 0.f
+#  define VMUL(a,b) ((a)*(b))
+#  define VADD(a,b) ((a)+(b))
+#  define VMADD(a,b,c) ((a)*(b)+(c))
+#  define VSUB(a,b) ((a)-(b))
+#  define LD_PS1(p) (p)
+#  define VLOAD_UNALIGNED(ptr)  (*(ptr))
+#  define VLOAD_ALIGNED(ptr)    (*(ptr))
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)
+
+#else
+/* #pragma message( __FILE__ ": float SCALAR1 macros are not defined" ) */
+#endif
+
+
+#endif /* PF_SCAL_FLT_H */
+
--- a/Sources/PFFFTLib/simd/pf_sse1_float.h
+++ b/Sources/PFFFTLib/simd/pf_sse1_float.h
@@ -0,0 +1,82 @@
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_SSE1_FLT_H
+#define PF_SSE1_FLT_H
+
+/*
+  SSE1 support macros
+*/
+#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86))
+#pragma message( __FILE__ ": SSE1 float macros are defined" )
+
+#include <xmmintrin.h>
+typedef __m128 v4sf;
+
+/* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions
+ *  anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */
+#  define SIMD_SZ 4
+
+typedef union v4sf_union {
+  v4sf  v;
+  float f[SIMD_SZ];
+} v4sf_union;
+
+#  define VARCH "SSE1"
+#  define VREQUIRES_ALIGN 1
+#  define VZERO() _mm_setzero_ps()
+#  define VMUL(a,b) _mm_mul_ps(a,b)
+#  define VADD(a,b) _mm_add_ps(a,b)
+#  define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
+#  define VSUB(a,b) _mm_sub_ps(a,b)
+#  define LD_PS1(p) _mm_set1_ps(p)
+#  define VLOAD_UNALIGNED(ptr)  _mm_loadu_ps(ptr)
+#  define VLOAD_ALIGNED(ptr)    _mm_load_ps(ptr)
+
+#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
+#  define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
+#  define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
+#  define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
+
+/* reverse/flip all floats */
+#  define VREV_S(a)    _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3))
+/* reverse/flip complex floats */
+#  define VREV_C(a)    _mm_shuffle_ps(a, a, _MM_SHUFFLE(1,0,3,2))
+
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
+
+#else
+/* #pragma message( __FILE__ ": SSE1 float macros are not defined" ) */
+#endif
+
+#endif /* PF_SSE1_FLT_H */
+
--- a/Sources/PFFFTLib/simd/pf_sse2_double.h
+++ b/Sources/PFFFTLib/simd/pf_sse2_double.h
@@ -0,0 +1,281 @@
+/*
+   Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
+*/
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_SSE2_DBL_H
+#define PF_SSE2_DBL_H
+
+//detect sse2 support under MSVC
+#if defined ( _M_IX86_FP )
+#  if _M_IX86_FP == 2
+#    if !defined(__SSE2__)
+#      define __SSE2__
+#    endif
+#  endif
+#endif
+
+/*
+  SSE2 64bit support macros
+*/
+#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined( __SSE4_2__ ) |  defined( __SSE4_1__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || defined ( __x86_64__ ) || defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 ))
+#pragma message (__FILE__ ": SSE2 double macros are defined" )
+
+#include <emmintrin.h>
+
+typedef struct {
+    __m128d d128[2];
+} m256d;
+
+typedef m256d v4sf;
+
+#  define SIMD_SZ 4
+
+typedef union v4sf_union {
+  v4sf  v;
+  double f[SIMD_SZ];
+} v4sf_union;
+
+
+#if defined(__GNUC__) || defined(__clang__)
+
+#pragma push_macro("FORCE_INLINE")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+
+#elif defined (_MSC_VER)
+#define FORCE_INLINE static __forceinline
+
+#else
+#error "Macro name collisions may happens with unknown compiler"
+#ifdef FORCE_INLINE
+#undef FORCE_INLINE
+#endif
+#define FORCE_INLINE static inline
+#endif
+
+FORCE_INLINE m256d mm256_setzero_pd(void)
+{
+    m256d ret;
+    ret.d128[0] = ret.d128[1] = _mm_setzero_pd();
+    return ret;
+}
+
+FORCE_INLINE m256d mm256_mul_pd(m256d a, m256d b)
+{
+    m256d ret;
+    ret.d128[0] = _mm_mul_pd(a.d128[0], b.d128[0]);
+    ret.d128[1] = _mm_mul_pd(a.d128[1], b.d128[1]);
+    return ret;
+}
+
+FORCE_INLINE m256d mm256_add_pd(m256d a, m256d b)
+{
+    m256d ret;
+    ret.d128[0] = _mm_add_pd(a.d128[0], b.d128[0]);
+    ret.d128[1] = _mm_add_pd(a.d128[1], b.d128[1]);
+    return ret;
+}
+
+FORCE_INLINE m256d mm256_sub_pd(m256d a, m256d b)
+{
+    m256d ret;
+    ret.d128[0] = _mm_sub_pd(a.d128[0], b.d128[0]);
+    ret.d128[1] = _mm_sub_pd(a.d128[1], b.d128[1]);
+    return ret;
+}
+
+FORCE_INLINE m256d mm256_set1_pd(double a)
+{
+    m256d ret;
+    ret.d128[0] = ret.d128[1] = _mm_set1_pd(a);
+    return ret;
+}
+
+FORCE_INLINE m256d mm256_load_pd (double const * mem_addr)
+{
+    m256d res;
+    res.d128[0] = _mm_load_pd((const double *)mem_addr);
+    res.d128[1] = _mm_load_pd((const double *)mem_addr + 2);
+    return res;
+}
+FORCE_INLINE m256d mm256_loadu_pd (double const * mem_addr)
+{
+    m256d res;
+    res.d128[0] = _mm_loadu_pd((const double *)mem_addr);
+    res.d128[1] = _mm_loadu_pd((const double *)mem_addr + 2);
+    return res;
+}
+
+
+#  define VARCH "SSE2"
+#  define VREQUIRES_ALIGN 1
+#  define VZERO() mm256_setzero_pd()
+#  define VMUL(a,b) mm256_mul_pd(a,b)
+#  define VADD(a,b) mm256_add_pd(a,b)
+#  define VMADD(a,b,c) mm256_add_pd(mm256_mul_pd(a,b), c)
+#  define VSUB(a,b) mm256_sub_pd(a,b)
+#  define LD_PS1(p) mm256_set1_pd(p)
+#  define VLOAD_UNALIGNED(ptr)  mm256_loadu_pd(ptr)
+#  define VLOAD_ALIGNED(ptr)    mm256_load_pd(ptr)
+
+
+FORCE_INLINE __m128d mm256_castpd256_pd128(m256d a)
+{
+    return a.d128[0];
+}
+
+FORCE_INLINE __m128d mm256_extractf128_pd (m256d a, const int imm8)
+{
+    assert(imm8 >= 0 && imm8 <= 1);
+    return a.d128[imm8];
+}
+FORCE_INLINE m256d mm256_insertf128_pd_1(m256d a, __m128d b)
+{
+    m256d res;
+    res.d128[0] = a.d128[0];
+    res.d128[1] = b;
+    return res;
+}
+FORCE_INLINE m256d mm256_castpd128_pd256(__m128d a)
+{
+    m256d res;
+    res.d128[0] = a;
+    return res;
+}
+
+FORCE_INLINE m256d mm256_shuffle_pd_00(m256d a, m256d b)
+{
+    m256d res;
+    res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0],0);
+    res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1],0);
+    return res;
+}
+
+FORCE_INLINE m256d mm256_shuffle_pd_11(m256d a, m256d b)
+{
+    m256d res;
+    res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0], 3);
+    res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1], 3);
+    return res;
+}
+
+FORCE_INLINE m256d mm256_permute2f128_pd_0x20(m256d a, m256d b) {
+    m256d res;
+    res.d128[0] = a.d128[0];
+    res.d128[1] = b.d128[0];
+    return res;
+}
+
+
+FORCE_INLINE m256d mm256_permute2f128_pd_0x31(m256d a, m256d b)
+{
+    m256d res;
+    res.d128[0] = a.d128[1];
+    res.d128[1] = b.d128[1];
+    return res;
+}
+
+FORCE_INLINE m256d mm256_reverse(m256d x)
+{
+    m256d res;
+    res.d128[0] = _mm_shuffle_pd(x.d128[1],x.d128[1],1);
+    res.d128[1] = _mm_shuffle_pd(x.d128[0],x.d128[0],1);
+    return res;
+}
+
+/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
+out1 = [ in1[0], in2[0], in1[1], in2[1] ]
+out2 = [ in1[2], in2[2], in1[3], in2[3] ]
+*/
+#  define INTERLEAVE2(in1, in2, out1, out2) {							\
+	__m128d low1__ = mm256_castpd256_pd128(in1);						\
+	__m128d low2__ = mm256_castpd256_pd128(in2);						\
+	__m128d high1__ = mm256_extractf128_pd(in1, 1);					\
+	__m128d high2__ = mm256_extractf128_pd(in2, 1);					\
+	m256d tmp__ = mm256_insertf128_pd_1(								\
+		mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)),		\
+		_mm_shuffle_pd(low1__, low2__, 3));								\
+	out2 = mm256_insertf128_pd_1(										\
+		mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)),	\
+		_mm_shuffle_pd(high1__, high2__, 3));							\
+	out1 = tmp__;														\
+}
+
+/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
+out1 = [ in1[0], in1[2], in2[0], in2[2] ]
+out2 = [ in1[1], in1[3], in2[1], in2[3] ]
+*/
+#  define UNINTERLEAVE2(in1, in2, out1, out2) {							\
+	__m128d low1__ = mm256_castpd256_pd128(in1);						\
+	__m128d low2__ = mm256_castpd256_pd128(in2);						\
+	__m128d high1__ = mm256_extractf128_pd(in1, 1);					\
+	__m128d high2__ = mm256_extractf128_pd(in2, 1); 					\
+	m256d tmp__ = mm256_insertf128_pd_1(								\
+		mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)),		\
+		_mm_shuffle_pd(low2__, high2__, 0));							\
+	out2 = mm256_insertf128_pd_1(										\
+		mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)),		\
+		_mm_shuffle_pd(low2__, high2__, 3));							\
+	out1 = tmp__;														\
+}
+
+#  define VTRANSPOSE4(row0, row1, row2, row3) {							\
+        m256d tmp3, tmp2, tmp1, tmp0;                     			\
+                                                            			\
+        tmp0 = mm256_shuffle_pd_00((row0),(row1));       				\
+        tmp2 = mm256_shuffle_pd_11((row0),(row1));       				\
+        tmp1 = mm256_shuffle_pd_00((row2),(row3));       				\
+        tmp3 = mm256_shuffle_pd_11((row2),(row3));       				\
+                                                            			\
+        (row0) = mm256_permute2f128_pd_0x20(tmp0, tmp1);			    \
+        (row1) = mm256_permute2f128_pd_0x20(tmp2, tmp3); 		        \
+        (row2) = mm256_permute2f128_pd_0x31(tmp0, tmp1); 		        \
+        (row3) = mm256_permute2f128_pd_0x31(tmp2, tmp3); 		        \
+    }
+
+/*VSWAPHL(a, b) pseudo code:
+return [ b[0], b[1], a[2], a[3] ]
+*/
+#  define VSWAPHL(a,b)	\
+   mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_castpd256_pd128(b)), mm256_extractf128_pd(a, 1))
+
+/* reverse/flip all floats */
+#  define VREV_S(a)   mm256_reverse(a)
+
+/* reverse/flip complex floats */
+#  define VREV_C(a)    mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_extractf128_pd(a, 1)), mm256_castpd256_pd128(a))
+
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
+
+#endif
+#endif
--- a/Tests/PFFFTTests/PFFFTTests.swift
+++ b/Tests/PFFFTTests/PFFFTTests.swift
@@ -0,0 +1,50 @@
+import Testing
+import ComplexModule
+@testable import PFFFT
+
+@Test func fftFloat() async throws {
+
+    let fft = try FFT<Complex<Float>>(n: 16)
+    let signal = fft.makeSignalBuffer()
+    let spectrum = fft.makeSpectrumBuffer()
+
+    signal.mutateEach { (i, v) in
+        v = Complex(Float(i) + 1.0, Float(i) - 2.0)
+    }
+
+    fft.forward(signal: signal, spectrum: spectrum)
+
+    let result = spectrum.map { $0 }
+    let expected: [Complex<Float>] = [
+      .init(136.0, 88.0),
+      .init(-48.218716, 32.218716),
+      .init(-27.31371, 11.313708),
+      .init(-19.972847, 3.972846),
+      .init(-16.0, 0.0),
+      .init(-13.345428, -2.6545706),
+      .init(-11.313709, -4.6862917),
+      .init(-9.591298, -6.408703),
+      .init(-8.0, -8.0),
+      .init(-6.408703, -9.591298),
+      .init(-4.6862917, -11.313708),
+      .init(-2.6545706, -13.345429),
+      .init(0.0, -16.0),
+      .init(3.972845, -19.972847),
+      .init(11.313707, -27.31371),
+      .init(32.218716, -48.218716),
+    ]
+    zip(result, expected).forEach { r, e in
+        #expect(r.isApproximatelyEqual(to: e))
+    }
+
+    fft.inverse(spectrum: spectrum, signal: signal)
+
+    let signalResult = signal.map { $0 }
+    let signalExpected = (0..<16).map { i in
+        Complex(Float(i) + 1.0, Float(i) - 2.0) * 16
+    }
+
+    zip(signalResult, signalExpected).forEach { r, e in
+        #expect(r.isApproximatelyEqual(to: e))
+    }
+}