diff --git a/Package.resolved b/Package.resolved index 1eb44e4..d5c4ad5 100644 --- a/Package.resolved +++ b/Package.resolved @@ -1,5 +1,5 @@ { - "originHash" : "bf0b92ec2751ab82eed76f0b5f4c42d45557a303569895153367fd1d19d1045a", + "originHash" : "8b424c89cf909d23f0294225dd14ece13680ef4d122b099e92d093e9817a9b21", "pins" : [ { "identity" : "swift-algorithms", @@ -18,6 +18,24 @@ "revision" : "0a5bc04095a675662cf24757cc0640aa2204253b", "version" : "1.0.2" } + }, + { + "identity" : "swift-odeint", + "kind" : "remoteSourceControl", + "location" : "https://github.com/jkl1337/swift-odeint", + "state" : { + "branch" : "master", + "revision" : "6e195efd2276dec7cedd2745be8ed61e0cbaa370" + } + }, + { + "identity" : "swiftpffft", + "kind" : "remoteSourceControl", + "location" : "https://github.com/jkl1337/SwiftPFFFT", + "state" : { + "branch" : "master", + "revision" : "437f984b112fecbd4303d21f7c934f2085b6d9af" + } } ], "version" : 3 diff --git a/Package.swift b/Package.swift index f1f6896..a9b1598 100644 --- a/Package.swift +++ b/Package.swift @@ -14,14 +14,16 @@ let package = Package( ], dependencies: [ .package(url: "https://github.com/apple/swift-algorithms", from: "1.2.0"), - .package(path: "./Packages/PFFFT/"), + .package(url: "https://github.com/jkl1337/SwiftPFFFT", branch: "master"), + .package(url: "https://github.com/jkl1337/swift-odeint", branch: "master"), ], targets: [ .target( name: "EcgSynKit", dependencies: [ - .product(name: "PFFFT", package: "PFFFT"), + .product(name: "PFFFT", package: "SwiftPFFFT"), .product(name: "Algorithms", package: "swift-algorithms"), + .product(name: "OdeInt", package: "swift-odeint"), ] ), .testTarget( diff --git a/Packages/PFFFT/.gitignore b/Packages/PFFFT/.gitignore deleted file mode 100644 index 0023a53..0000000 --- a/Packages/PFFFT/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -.DS_Store -/.build -/Packages -xcuserdata/ -DerivedData/ -.swiftpm/configuration/registries.json -.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata -.netrc diff --git a/Packages/PFFFT/Package.swift b/Packages/PFFFT/Package.swift deleted file mode 100644 index dce638b..0000000 --- a/Packages/PFFFT/Package.swift +++ /dev/null @@ -1,37 +0,0 @@ -// swift-tools-version: 6.0 -// The swift-tools-version declares the minimum version of Swift required to build this package. - -import PackageDescription - -let package = Package( - name: "PFFFT", - products: [ - .library( - name: "PFFFT", - targets: ["PFFFT", "PFFFTLib"] - ), - ], - dependencies: [ - .package(url: "https://github.com/apple/swift-numerics", from: "1.0.0"), - ], - targets: [ - .target( - name: "PFFFTLib", - publicHeadersPath: "include", - cSettings: [ - .define("PFFFT_SCALVEC_ENABLED", to: "1"), - .define("_USE_MATH_DEFINES"), - .define("NDEBUG"), - .unsafeFlags(["-O3", "-std=c99"]), - ] - ), - .target( - name: "PFFFT", - dependencies: ["PFFFTLib", .product(name: "Numerics", package: "swift-numerics")] - ), - .testTarget( - name: "PFFFTTests", - dependencies: ["PFFFT"] - ), - ] -) diff --git a/Packages/PFFFT/Sources/PFFFT/Buffer.swift b/Packages/PFFFT/Sources/PFFFT/Buffer.swift deleted file mode 100644 index 67c1988..0000000 --- a/Packages/PFFFT/Sources/PFFFT/Buffer.swift +++ /dev/null @@ -1,31 +0,0 @@ -let bufferAlignment = 32 - -@frozen -public struct Buffer: ~Copyable { - let buffer: UnsafeMutableBufferPointer - var count: Int { buffer.count } - var baseAddress: UnsafeMutablePointer { buffer.baseAddress! } - - public init(capacity: Int) { - buffer = UnsafeMutableRawBufferPointer.allocate( - byteCount: MemoryLayout.stride * capacity, - alignment: bufferAlignment - ).bindMemory(to: T.self) - } - - deinit { - buffer.deallocate() - } - - public func withUnsafeMutableBufferPointer(_ body: (UnsafeMutableBufferPointer) throws -> R) rethrows -> R { - try body(buffer) - } - - public func withUnsafeBufferPointer(_ body: (UnsafeBufferPointer) throws -> R) rethrows -> R { - try body(UnsafeBufferPointer(buffer)) - } - - public func withUnsafeMutableBytes(_ body: (UnsafeMutableRawBufferPointer) throws -> R) rethrows -> R { - try body(UnsafeMutableRawBufferPointer(buffer)) - } -} diff --git a/Packages/PFFFT/Sources/PFFFT/FFT.swift b/Packages/PFFFT/Sources/PFFFT/FFT.swift deleted file mode 100644 index aa2a210..0000000 --- a/Packages/PFFFT/Sources/PFFFT/FFT.swift +++ /dev/null @@ -1,348 +0,0 @@ -internal import PFFFTLib -import ComplexModule -import RealModule - -@frozen -public enum FFTType { - case real - case complex -} - -@frozen -public enum FFTSign: Int { - case forward = -1 - case backward = 1 -} - -public enum FFTError: Error { - case invalidSize -} - -public protocol FFTElement { - associatedtype FFTScalarType: FFTScalar - associatedtype FFTComplexType = Complex - - static func pffftSetup(_ n: Int, _ type: FFTType) throws -> OpaquePointer - static func pffftMinFftSize(_ type: FFTType) -> Int - static func pffftIsValidSize(_ n: Int, _ type: FFTType) -> Bool - static func pffftNearestValidSize(_ n: Int, _ type: FFTType, _ higher: Bool) -> Int -} - -public protocol FFTScalar: Real { - static func pffftTransformOrdered(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer, _ output: UnsafeMutablePointer, _ work: UnsafeMutablePointer?, _ dir: FFTSign) - static func pffftTransform(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer, _ output: UnsafeMutablePointer, _ work: UnsafeMutablePointer?, _ dir: FFTSign) - static func pffftZreorder(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer, _ output: UnsafeMutablePointer, _ dir: FFTSign) - static func pffftZconvolveAccumulate(_ ptr: OpaquePointer, _ dftA: UnsafeMutablePointer, _ dftB: UnsafeMutablePointer, _ dftAB: UnsafeMutablePointer, _ scaling: Self) - static func pffftZconvolveNoAccu(_ ptr: OpaquePointer, _ dftA: UnsafeMutablePointer, _ dftB: UnsafeMutablePointer, _ dftAB: UnsafeMutablePointer, _ scaling: Self) -} - -extension Float: FFTScalar { - public static func pffftTransformOrdered(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer, _ output: UnsafeMutablePointer, _ work: UnsafeMutablePointer?, _ dir: FFTSign) { - pffft_transform_ordered(ptr, input, output, work, pffft_direction_t(dir)) - } - - public static func pffftTransform(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer, _ output: UnsafeMutablePointer, _ work: UnsafeMutablePointer?, _ dir: FFTSign) { - pffft_transform(ptr, input, output, work, pffft_direction_t(dir)) - } - - public static func pffftZreorder(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer, _ output: UnsafeMutablePointer, _ dir: FFTSign) { - pffft_zreorder(ptr, input, output, pffft_direction_t(dir)) - } - - public static func pffftZconvolveAccumulate(_ ptr: OpaquePointer, _ dftA: UnsafeMutablePointer, _ dftB: UnsafeMutablePointer, _ dftAB: UnsafeMutablePointer, _ scaling: Self) { - pffft_zconvolve_accumulate(ptr, dftA, dftB, dftAB, scaling) - } - - public static func pffftZconvolveNoAccu(_ ptr: OpaquePointer, _ dftA: UnsafeMutablePointer, _ dftB: UnsafeMutablePointer, _ dftAB: UnsafeMutablePointer, _ scaling: Self) { - pffft_zconvolve_no_accu(ptr, dftA, dftB, dftAB, scaling) - } -} - -extension Double: FFTScalar { - public static func pffftTransformOrdered(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer, _ output: UnsafeMutablePointer, _ work: UnsafeMutablePointer?, _ dir: FFTSign) { - pffftd_transform_ordered(ptr, input, output, work, pffft_direction_t(dir)) - } - - public static func pffftTransform(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer, _ output: UnsafeMutablePointer, _ work: UnsafeMutablePointer?, _ dir: FFTSign) { - pffftd_transform(ptr, input, output, work, pffft_direction_t(dir)) - } - - public static func pffftZreorder(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer, _ output: UnsafeMutablePointer, _ dir: FFTSign) { - pffftd_zreorder(ptr, input, output, pffft_direction_t(dir)) - } - - public static func pffftZconvolveAccumulate(_ ptr: OpaquePointer, _ dftA: UnsafeMutablePointer, _ dftB: UnsafeMutablePointer, _ dftAB: UnsafeMutablePointer, _ scaling: Double) { - pffftd_zconvolve_accumulate(ptr, dftA, dftB, dftAB, scaling) - } - - public static func pffftZconvolveNoAccu(_ ptr: OpaquePointer, _ dftA: UnsafeMutablePointer, _ dftB: UnsafeMutablePointer, _ dftAB: UnsafeMutablePointer, _ scaling: Double) { - pffftd_zconvolve_no_accu(ptr, dftA, dftB, dftAB, scaling) - } -} - -extension Complex: FFTElement where RealType: FFTElement & FFTScalar { - public typealias FFTScalarType = RealType - - public static func pffftSetup(_ n: Int, _: FFTType) throws -> OpaquePointer { - return try FFTScalarType.pffftSetup(n, .complex) - } - - public static func pffftMinFftSize(_: FFTType) -> Int { - return FFTScalarType.pffftMinFftSize(.complex) - } - - public static func pffftIsValidSize(_ n: Int, _: FFTType) -> Bool { - return FFTScalarType.pffftIsValidSize(n, .complex) - } - - public static func pffftNearestValidSize(_ n: Int, _: FFTType, _ higher: Bool) -> Int { - return FFTScalarType.pffftNearestValidSize(n, .complex, higher) - } -} - -extension Double: FFTElement { - public typealias FFTScalarType = Double - - public static func pffftSetup(_ n: Int, _ type: FFTType) throws -> OpaquePointer { - guard let ptr = pffftd_new_setup(Int32(n), pffft_transform_t(type)) else { throw FFTError.invalidSize } - return ptr - } - - public static func pffftMinFftSize(_ type: FFTType) -> Int { - Int(pffftd_min_fft_size(pffft_transform_t(type))) - } - - public static func pffftIsValidSize(_ n: Int, _ type: FFTType) -> Bool { - pffftd_is_valid_size(Int32(n), pffft_transform_t(type)) != 0 - } - - public static func pffftNearestValidSize(_ n: Int, _ type: FFTType, _ higher: Bool) -> Int { - Int(pffftd_nearest_transform_size(Int32(n), pffft_transform_t(type), higher ? 1 : 0)) - } -} - -extension Float: FFTElement { - public typealias FFTScalarType = Float - - public static func pffftSetup(_ n: Int, _ type: FFTType) throws -> OpaquePointer { - guard let ptr = pffft_new_setup(Int32(n), pffft_transform_t(type)) else { throw FFTError.invalidSize } - return ptr - } - - public static func pffftMinFftSize(_ type: FFTType) -> Int { - Int(pffft_min_fft_size(pffft_transform_t(type))) - } - - public static func pffftIsValidSize(_ n: Int, _ type: FFTType) -> Bool { - pffft_is_valid_size(Int32(n), pffft_transform_t(type)) != 0 - } - - public static func pffftNearestValidSize(_ n: Int, _ type: FFTType, _ higher: Bool) -> Int { - Int(pffft_nearest_transform_size(Int32(n), pffft_transform_t(type), higher ? 1 : 0)) - } -} - -public final class FFT { - public typealias ComplexType = T.FFTComplexType - public typealias ScalarType = T.FFTScalarType - - let ptr: OpaquePointer - let n: Int - let work: Buffer? - - /// Initialize the FFT implementation with the given size and type. - /// - Parameters: - /// - n: The size of the FFT. - /// - Throws: `FFTError.invalidSize` if the size is invalid. - public init(n: Int) throws { - ptr = try T.pffftSetup(n, .real) - self.n = n - let capacity = T.self == ComplexType.self ? 2 * n : n - work = n > 4096 ? Buffer(capacity: capacity) : nil - } - - /// Make a buffer for the FFT (time-domain) signal. - /// - Parameters: - /// - extra: An extra number of elements to allocate. - public func makeSignalBuffer(extra: Int = 0) -> Buffer { - Buffer(capacity: n + extra) - } - - /// Make a buffer for the FFT (frequency-domain) spectrum. - /// - Parameters: - /// - extra: An extra number of elements to allocate. - public func makeSpectrumBuffer(extra: Int = 0) -> Buffer { - Buffer(capacity: T.self == ComplexType.self ? (n + extra) : n / 2 + extra) - } - - /// Make a buffer for the internal layout of the FFT (frequency-domain) spectrum. - /// - Parameters: - /// - extra: An extra number of points to allocate. For complex FFTs, 2 * extra - /// additional elements will be allocated. - public func makeInternalLayoutBuffer(extra: Int = 0) -> Buffer { - Buffer(capacity: (T.self == ComplexType.self ? 2 : 1) * (n + extra)) - } - - @inline(__always) - func toAddress(_ work: borrowing Buffer?) -> UnsafeMutablePointer? { - switch work { - case let .some(b): return b.baseAddress - case .none: return nil - } - } - - @inline(__always) - func rebind(_ buffer: borrowing Buffer) -> UnsafeMutablePointer! { - UnsafeMutableRawBufferPointer(buffer.buffer).bindMemory(to: ScalarType.self).baseAddress - } - - @inline(__always) - func checkFftBufferCounts(signal: borrowing Buffer, spectrum: borrowing Buffer) { - guard signal.count >= n else { - fatalError("signal buffer too small") - } - guard spectrum.count >= (T.self == ComplexType.self ? n : n / 2) else { - fatalError("spectrum buffer too small") - } - } - - @inline(__always) - func checkFftInternalLayoutBufferCounts(signal: borrowing Buffer, spectrum: borrowing Buffer) { - guard signal.count >= n else { - fatalError("signal buffer too small") - } - guard spectrum.count >= (T.self == ComplexType.self ? 2 * n : n) else { - fatalError("spectrum buffer too small") - } - } - - @inline(__always) - func checkConvolveBufferCounts(a: borrowing Buffer, b: borrowing Buffer, ab: borrowing Buffer) { - let minCount = T.self == ComplexType.self ? 2 * n : n - - guard a.count >= minCount else { - fatalError("a buffer too small") - } - guard b.count >= minCount else { - fatalError("b buffer too small") - } - guard ab.count >= minCount else { - fatalError("ab buffer too small") - } - } - - /// Perform a forward FFT on the input buffer. - /// - /// The input and output buffers may be the same. - /// The data is stores in order as expected (interleaved complex components ordered by frequency). - /// The input and output buffer must have a capacity of at least `n` for real FFTs and `2 * n` for complex FFTs. - /// A fatal error will occur if any buffer is too small. - /// - /// For a real forward transform with real input, the output array is organized as follows: - /// index k > 2 where k is even is the real part of the k/2-th complex coefficient. - /// index k > 2 where k is odd is the imaginary part of the k/2-th complex coefficient. - /// index k = 0 is the real part of the 0 frequency (DC) coefficient. - /// index k = 1 is the real part of the Nyquist coefficient. - /// - /// Transforms are not scaled. fft_backward(fft_forward(x)) == n * x. - /// - /// - Parameters: - /// - input: The input buffer. - /// - output: The output buffer. - /// - work: An optional work buffer. Must have capacity of at least `n` for real FFTs and `2 * n` for complex FFTs. - /// - sign: The direction of the FFT. - public func forward(signal: borrowing Buffer, spectrum: borrowing Buffer) { - checkFftBufferCounts(signal: signal, spectrum: spectrum) - ScalarType.pffftTransformOrdered(ptr, rebind(signal), rebind(spectrum), toAddress(work), .forward) - } - - public func inverse(spectrum: borrowing Buffer, signal: borrowing Buffer) { - checkFftBufferCounts(signal: signal, spectrum: spectrum) - ScalarType.pffftTransformOrdered(ptr, rebind(spectrum), rebind(signal), toAddress(work), .backward) - } - - /// Perform a forward FFT on the input buffer, with implementation defined order. - /// - /// This function behaves similarly to `fft` however the z-domain data is stored in most efficient ordering, - /// which is suitable for transforming back with this function, or for convolution. - /// - Parameters: - /// - input: The input buffer. - /// - output: The output buffer. - /// - work: An optional work buffer. Must have capacity of at least `n` for real FFTs and `2 * n` for complex FFTs. - /// - sign: The direction of the FFT. - public func forwardToInternalLayout(signal: borrowing Buffer, spectrum: borrowing Buffer) { - checkFftInternalLayoutBufferCounts(signal: signal, spectrum: spectrum) - ScalarType.pffftTransform(ptr, rebind(signal), spectrum.baseAddress, toAddress(work), .forward) - } - - public func inverseFromInternalLayout(spectrum: borrowing Buffer, signal: borrowing Buffer) { - checkFftInternalLayoutBufferCounts(signal: signal, spectrum: spectrum) - ScalarType.pffftTransform(ptr, spectrum.baseAddress, rebind(signal), toAddress(work), .backward) - } - - public func reorder(spectrum: borrowing Buffer, output: borrowing Buffer) { - guard spectrum.count >= (T.self == ComplexType.self ? 2 * n : n) else { - fatalError("signal buffer too small") - } - guard output.count >= n else { - fatalError("outputbuffer too small") - } - ScalarType.pffftZreorder(ptr, spectrum.baseAddress, rebind(output), .forward) - } - - /// Perform a convolution of two complex signals in the frequency domain. - /// - /// Multiplies frequency domain components of `dftA` and `dftB` and stores the result in `dftAB`. - /// The operation performed is `dftAB = (dftA * dftB) * scaling`. - /// - Parameters: - /// - dftA: The first input buffer of frequency domain data. - /// - dftB: The second input buffer of frequency domain data. - /// - dftAB: The output buffer of frequency domain data. - /// - scaling: The scaling factor to apply to the result. - public func convolve(dftA: borrowing Buffer, dftB: borrowing Buffer, dftAB: borrowing Buffer, scaling: ScalarType) { - checkConvolveBufferCounts(a: dftA, b: dftB, ab: dftAB) - ScalarType.pffftZconvolveNoAccu(ptr, dftA.baseAddress, dftB.baseAddress, dftAB.baseAddress, scaling) - } - - /// Perform a convolution of two complex signals in the frequency domain. - /// - /// Multiplies frequency domain components of `dftA` and `dftB` and accumulates the result in `dftAB`. - /// The operation performed is `dftAB += (dftA * dftB) * scaling`. - /// - Parameters: - /// - dftA: The first input buffer of frequency domain data. - /// - dftB: The second input buffer of frequency domain data. - /// - dftAB: The output buffer of frequency domain data. - /// - scaling: The scaling factor to apply to the result. - public func convolveAccumulate(dftA: borrowing Buffer, dftB: borrowing Buffer, dftAB: borrowing Buffer, scaling: ScalarType) { - checkConvolveBufferCounts(a: dftA, b: dftB, ab: dftAB) - ScalarType.pffftZconvolveAccumulate(ptr, dftA.baseAddress, dftB.baseAddress, dftAB.baseAddress, scaling) - } - - /// Returns the minimum FFT size for this type of setup. - public static func minFftSize() -> Int { - T.pffftMinFftSize(.real) - } - - /// Returns whether the given size is valid for the given type. - /// - /// The PFFFT library requires `n` to be factorizable to `minFftSize` with factors of 2, 3, 5. - /// - Parameters: - /// - n: The size to check. - /// - Returns: Whether the size is valid. - public static func isValidSize(_ n: Int) -> Bool { - T.pffftIsValidSize(n, .real) - } - - /// Returns the nearest valid size for the given type. - /// - Parameters: - /// - n: The size to check. - /// - higher: Whether to return the next higher size if `n` is invalid. - /// - Returns: The nearest valid size. - public static func nearestValidSize(_ n: Int, higher: Bool) -> Int { - T.pffftNearestValidSize(n, .real, higher) - } - - deinit { - pffftd_destroy_setup(ptr) - } -} diff --git a/Packages/PFFFT/Sources/PFFFT/PFFFT.swift b/Packages/PFFFT/Sources/PFFFT/PFFFT.swift deleted file mode 100644 index fc47727..0000000 --- a/Packages/PFFFT/Sources/PFFFT/PFFFT.swift +++ /dev/null @@ -1,26 +0,0 @@ -internal import PFFFTLib - -public var simdArch: String { - String(cString: pffft_simd_arch()) -} - -extension pffft_transform_t { - @inline(__always) - init(_ type: FFTType) { - switch type { - case .real: self = PFFFT_REAL - case .complex: self = PFFFT_COMPLEX - } - } -} - -extension pffft_direction_t { - @inline(__always) - init(_ sign: FFTSign) { - switch sign { - case .forward: self = PFFFT_FORWARD - case .backward: self = PFFFT_BACKWARD - } - } -} - diff --git a/Packages/PFFFT/Sources/PFFFT/SetupCache.swift b/Packages/PFFFT/Sources/PFFFT/SetupCache.swift deleted file mode 100644 index 45e2dbf..0000000 --- a/Packages/PFFFT/Sources/PFFFT/SetupCache.swift +++ /dev/null @@ -1,35 +0,0 @@ -import Foundation - -public class SetupCache : @unchecked Sendable { - typealias FFTType = FFT - private var cache: [Int: FFT] = [:] - - private let queue = DispatchQueue(label: String(describing: SetupCache.self), attributes: .concurrent) - - public init() {} - - public func get(for n: Int) throws -> FFT { - var setup: FFTType?? - queue.sync { - setup = cache[n] - } - if setup == nil { - queue.sync(flags: .barrier) { - setup = cache[n] - if setup == nil { - let entry = try? FFTType(n: n) - cache[n] = entry - setup = entry - } - } - } - guard let s = setup! else { throw FFTError.invalidSize } - return s - } - - public func clear() { - queue.sync(flags: .barrier) { - cache.removeAll() - } - } -} diff --git a/Packages/PFFFT/Sources/PFFFTLib/include/pffft.h b/Packages/PFFFT/Sources/PFFFTLib/include/pffft.h deleted file mode 100644 index 7ad925c..0000000 --- a/Packages/PFFFT/Sources/PFFFTLib/include/pffft.h +++ /dev/null @@ -1,241 +0,0 @@ -/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) - - Based on original fortran 77 code from FFTPACKv4 from NETLIB, - authored by Dr Paul Swarztrauber of NCAR, in 1985. - - As confirmed by the NCAR fftpack software curators, the following - FFTPACKv5 license applies to FFTPACKv4 sources. My changes are - released under the same terms. - - FFTPACK license: - - http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html - - Copyright (c) 2004 the University Corporation for Atmospheric - Research ("UCAR"). All rights reserved. Developed by NCAR's - Computational and Information Systems Laboratory, UCAR, - www.cisl.ucar.edu. - - Redistribution and use of the Software in source and binary forms, - with or without modification, is permitted provided that the - following conditions are met: - - - Neither the names of NCAR's Computational and Information Systems - Laboratory, the University Corporation for Atmospheric Research, - nor the names of its sponsors or contributors may be used to - endorse or promote products derived from this Software without - specific prior written permission. - - - Redistributions of source code must retain the above copyright - notices, this list of conditions, and the disclaimer below. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions, and the disclaimer below in the - documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. -*/ - -/* - PFFFT : a Pretty Fast FFT. - - This is basically an adaptation of the single precision fftpack - (v4) as found on netlib taking advantage of SIMD instruction found - on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON). - - For architectures where no SIMD instruction is available, the code - falls back to a scalar version. - - Restrictions: - - - 1D transforms only, with 32-bit single precision. - - - supports only transforms for inputs of length N of the form - N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128, - 144, 160, etc are all acceptable lengths). Performance is best for - 128<=N<=8192. - - - all (float*) pointers in the functions below are expected to - have an "simd-compatible" alignment, that is 16 bytes on x86 and - powerpc CPUs. - - You can allocate such buffers with the functions - pffft_aligned_malloc / pffft_aligned_free (or with stuff like - posix_memalign..) - -*/ - -#ifndef PFFFT_H -#define PFFFT_H - -#include /* for size_t */ - -#ifdef __cplusplus -extern "C" { -#endif - - /* opaque struct holding internal stuff (precomputed twiddle factors) - this struct can be shared by many threads as it contains only - read-only data. - */ - typedef struct PFFFT_Setup PFFFT_Setup; - -#ifndef PFFFT_COMMON_ENUMS -#define PFFFT_COMMON_ENUMS - - /* direction of the transform */ - typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t; - - /* type of transform */ - typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t; - -#endif - - /* - prepare for performing transforms of size N -- the returned - PFFFT_Setup structure is read-only so it can safely be shared by - multiple concurrent threads. - */ - PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform); - void pffft_destroy_setup(PFFFT_Setup *); - /* - Perform a Fourier transform , The z-domain data is stored in the - most efficient order for transforming it back, or using it for - convolution. If you need to have its content sorted in the - "usual" way, that is as an array of interleaved complex numbers, - either use pffft_transform_ordered , or call pffft_zreorder after - the forward fft, and before the backward fft. - - Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x. - Typically you will want to scale the backward transform by 1/N. - - The 'work' pointer should point to an area of N (2*N for complex - fft) floats, properly aligned. If 'work' is NULL, then stack will - be used instead (this is probably the best strategy for small - FFTs, say for N < 16384). Threads usually have a small stack, that - there's no sufficient amount of memory, usually leading to a crash! - Use the heap with pffft_aligned_malloc() in this case. - - For a real forward transform (PFFFT_REAL | PFFFT_FORWARD) with real - input with input(=transformation) length N, the output array is - 'mostly' complex: - index k in 1 .. N/2 -1 corresponds to frequency k * Samplerate / N - index k == 0 is a special case: - the real() part contains the result for the DC frequency 0, - the imag() part contains the result for the Nyquist frequency Samplerate/2 - both 0-frequency and half frequency components, which are real, - are assembled in the first entry as F(0)+i*F(N/2). - With the output size N/2 complex values (=N real/imag values), it is - obvious, that the result for negative frequencies are not output, - cause of symmetry. - - input and output may alias. - */ - void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); - - /* - Similar to pffft_transform, but makes sure that the output is - ordered as expected (interleaved complex numbers). This is - similar to calling pffft_transform and then pffft_zreorder. - - input and output may alias. - */ - void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); - - /* - call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(..., - PFFFT_FORWARD) if you want to have the frequency components in - the correct "canonical" order, as interleaved complex numbers. - - (for real transforms, both 0-frequency and half frequency - components, which are real, are assembled in the first entry as - F(0)+i*F(n/2+1). Note that the original fftpack did place - F(n/2+1) at the end of the arrays). - - input and output should not alias. - */ - void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction); - - /* - Perform a multiplication of the frequency components of dft_a and - dft_b and accumulate them into dft_ab. The arrays should have - been obtained with pffft_transform(.., PFFFT_FORWARD) and should - *not* have been reordered with pffft_zreorder (otherwise just - perform the operation yourself as the dft coefs are stored as - interleaved complex numbers). - - the operation performed is: dft_ab += (dft_a * fdt_b)*scaling - - The dft_a, dft_b and dft_ab pointers may alias. - */ - void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); - - /* - Perform a multiplication of the frequency components of dft_a and - dft_b and put result in dft_ab. The arrays should have - been obtained with pffft_transform(.., PFFFT_FORWARD) and should - *not* have been reordered with pffft_zreorder (otherwise just - perform the operation yourself as the dft coefs are stored as - interleaved complex numbers). - - the operation performed is: dft_ab = (dft_a * fdt_b)*scaling - - The dft_a, dft_b and dft_ab pointers may alias. - */ - void pffft_zconvolve_no_accu(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); - - /* return 4 or 1 wether support SSE/NEON/Altivec instructions was enabled when building pffft.c */ - int pffft_simd_size(); - - /* return string identifier of used architecture (SSE/NEON/Altivec/..) */ - const char * pffft_simd_arch(); - - - /* following functions are identical to the pffftd_ functions */ - - /* simple helper to get minimum possible fft size */ - int pffft_min_fft_size(pffft_transform_t transform); - - /* simple helper to determine next power of 2 - - without inexact/rounding floating point operations - */ - int pffft_next_power_of_two(int N); - - /* simple helper to determine if power of 2 - returns bool */ - int pffft_is_power_of_two(int N); - - /* simple helper to determine size N is valid - - factorizable to pffft_min_fft_size() with factors 2, 3, 5 - returns bool - */ - int pffft_is_valid_size(int N, pffft_transform_t cplx); - - /* determine nearest valid transform size (by brute-force testing) - - factorizable to pffft_min_fft_size() with factors 2, 3, 5. - higher: bool-flag to find nearest higher value; else lower. - */ - int pffft_nearest_transform_size(int N, pffft_transform_t cplx, int higher); - - /* - the float buffers must have the correct alignment (16-byte boundary - on intel and powerpc). This function may be used to obtain such - correctly aligned buffers. - */ - void *pffft_aligned_malloc(size_t nb_bytes); - void pffft_aligned_free(void *); - -#ifdef __cplusplus -} -#endif - -#endif /* PFFFT_H */ - diff --git a/Packages/PFFFT/Sources/PFFFTLib/include/pffft_double.h b/Packages/PFFFT/Sources/PFFFTLib/include/pffft_double.h deleted file mode 100644 index afa8de0..0000000 --- a/Packages/PFFFT/Sources/PFFFTLib/include/pffft_double.h +++ /dev/null @@ -1,236 +0,0 @@ -/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) - - Based on original fortran 77 code from FFTPACKv4 from NETLIB, - authored by Dr Paul Swarztrauber of NCAR, in 1985. - - As confirmed by the NCAR fftpack software curators, the following - FFTPACKv5 license applies to FFTPACKv4 sources. My changes are - released under the same terms. - - FFTPACK license: - - http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html - - Copyright (c) 2004 the University Corporation for Atmospheric - Research ("UCAR"). All rights reserved. Developed by NCAR's - Computational and Information Systems Laboratory, UCAR, - www.cisl.ucar.edu. - - Redistribution and use of the Software in source and binary forms, - with or without modification, is permitted provided that the - following conditions are met: - - - Neither the names of NCAR's Computational and Information Systems - Laboratory, the University Corporation for Atmospheric Research, - nor the names of its sponsors or contributors may be used to - endorse or promote products derived from this Software without - specific prior written permission. - - - Redistributions of source code must retain the above copyright - notices, this list of conditions, and the disclaimer below. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions, and the disclaimer below in the - documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. -*/ -/* - NOTE: This file is adapted from Julien Pommier's original PFFFT, - which works on 32 bit floating point precision using SSE instructions, - to work with 64 bit floating point precision using AVX instructions. - Author: Dario Mambro @ https://github.com/unevens/pffft -*/ -/* - PFFFT : a Pretty Fast FFT. - - This is basically an adaptation of the single precision fftpack - (v4) as found on netlib taking advantage of SIMD instruction found - on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON). - - For architectures where no SIMD instruction is available, the code - falls back to a scalar version. - - Restrictions: - - - 1D transforms only, with 64-bit double precision. - - - supports only transforms for inputs of length N of the form - N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128, - 144, 160, etc are all acceptable lengths). Performance is best for - 128<=N<=8192. - - - all (double*) pointers in the functions below are expected to - have an "simd-compatible" alignment, that is 32 bytes on x86 and - powerpc CPUs. - - You can allocate such buffers with the functions - pffft_aligned_malloc / pffft_aligned_free (or with stuff like - posix_memalign..) - -*/ - -#ifndef PFFFT_DOUBLE_H -#define PFFFT_DOUBLE_H - -#include /* for size_t */ - -#ifdef __cplusplus -extern "C" { -#endif - - /* opaque struct holding internal stuff (precomputed twiddle factors) - this struct can be shared by many threads as it contains only - read-only data. - */ - typedef struct PFFFTD_Setup PFFFTD_Setup; - -#ifndef PFFFT_COMMON_ENUMS -#define PFFFT_COMMON_ENUMS - - /* direction of the transform */ - typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t; - - /* type of transform */ - typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t; - -#endif - - /* - prepare for performing transforms of size N -- the returned - PFFFTD_Setup structure is read-only so it can safely be shared by - multiple concurrent threads. - */ - PFFFTD_Setup *pffftd_new_setup(int N, pffft_transform_t transform); - void pffftd_destroy_setup(PFFFTD_Setup *); - /* - Perform a Fourier transform , The z-domain data is stored in the - most efficient order for transforming it back, or using it for - convolution. If you need to have its content sorted in the - "usual" way, that is as an array of interleaved complex numbers, - either use pffft_transform_ordered , or call pffft_zreorder after - the forward fft, and before the backward fft. - - Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x. - Typically you will want to scale the backward transform by 1/N. - - The 'work' pointer should point to an area of N (2*N for complex - fft) doubles, properly aligned. If 'work' is NULL, then stack will - be used instead (this is probably the best strategy for small - FFTs, say for N < 16384). Threads usually have a small stack, that - there's no sufficient amount of memory, usually leading to a crash! - Use the heap with pffft_aligned_malloc() in this case. - - input and output may alias. - */ - void pffftd_transform(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction); - - /* - Similar to pffft_transform, but makes sure that the output is - ordered as expected (interleaved complex numbers). This is - similar to calling pffft_transform and then pffft_zreorder. - - input and output may alias. - */ - void pffftd_transform_ordered(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction); - - /* - call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(..., - PFFFT_FORWARD) if you want to have the frequency components in - the correct "canonical" order, as interleaved complex numbers. - - (for real transforms, both 0-frequency and half frequency - components, which are real, are assembled in the first entry as - F(0)+i*F(n/2+1). Note that the original fftpack did place - F(n/2+1) at the end of the arrays). - - input and output should not alias. - */ - void pffftd_zreorder(PFFFTD_Setup *setup, const double *input, double *output, pffft_direction_t direction); - - /* - Perform a multiplication of the frequency components of dft_a and - dft_b and accumulate them into dft_ab. The arrays should have - been obtained with pffft_transform(.., PFFFT_FORWARD) and should - *not* have been reordered with pffft_zreorder (otherwise just - perform the operation yourself as the dft coefs are stored as - interleaved complex numbers). - - the operation performed is: dft_ab += (dft_a * fdt_b)*scaling - - The dft_a, dft_b and dft_ab pointers may alias. - */ - void pffftd_zconvolve_accumulate(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double *dft_ab, double scaling); - - /* - Perform a multiplication of the frequency components of dft_a and - dft_b and put result in dft_ab. The arrays should have - been obtained with pffft_transform(.., PFFFT_FORWARD) and should - *not* have been reordered with pffft_zreorder (otherwise just - perform the operation yourself as the dft coefs are stored as - interleaved complex numbers). - - the operation performed is: dft_ab = (dft_a * fdt_b)*scaling - - The dft_a, dft_b and dft_ab pointers may alias. - */ - void pffftd_zconvolve_no_accu(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double*dft_ab, double scaling); - - /* return 4 or 1 wether support AVX instructions was enabled when building pffft-double.c */ - int pffftd_simd_size(); - - /* return string identifier of used architecture (AVX/..) */ - const char * pffftd_simd_arch(); - - /* simple helper to get minimum possible fft size */ - int pffftd_min_fft_size(pffft_transform_t transform); - - /* simple helper to determine size N is valid - - factorizable to pffft_min_fft_size() with factors 2, 3, 5 - */ - int pffftd_is_valid_size(int N, pffft_transform_t cplx); - - /* determine nearest valid transform size (by brute-force testing) - - factorizable to pffft_min_fft_size() with factors 2, 3, 5. - higher: bool-flag to find nearest higher value; else lower. - */ - int pffftd_nearest_transform_size(int N, pffft_transform_t cplx, int higher); - - - /* following functions are identical to the pffft_ functions - both declared */ - - /* simple helper to determine next power of 2 - - without inexact/rounding floating point operations - */ - int pffftd_next_power_of_two(int N); - int pffft_next_power_of_two(int N); - - /* simple helper to determine if power of 2 - returns bool */ - int pffftd_is_power_of_two(int N); - int pffft_is_power_of_two(int N); - - /* - the double buffers must have the correct alignment (32-byte boundary - on intel and powerpc). This function may be used to obtain such - correctly aligned buffers. - */ - void *pffftd_aligned_malloc(size_t nb_bytes); - void *pffft_aligned_malloc(size_t nb_bytes); - void pffftd_aligned_free(void *); - void pffft_aligned_free(void *); - -#ifdef __cplusplus -} -#endif - -#endif /* PFFFT_DOUBLE_H */ - diff --git a/Packages/PFFFT/Sources/PFFFTLib/pffft.c b/Packages/PFFFT/Sources/PFFFTLib/pffft.c deleted file mode 100644 index 4862a4f..0000000 --- a/Packages/PFFFT/Sources/PFFFTLib/pffft.c +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) - Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de ) - - Based on original fortran 77 code from FFTPACKv4 from NETLIB - (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber - of NCAR, in 1985. - - As confirmed by the NCAR fftpack software curators, the following - FFTPACKv5 license applies to FFTPACKv4 sources. My changes are - released under the same terms. - - FFTPACK license: - - http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html - - Copyright (c) 2004 the University Corporation for Atmospheric - Research ("UCAR"). All rights reserved. Developed by NCAR's - Computational and Information Systems Laboratory, UCAR, - www.cisl.ucar.edu. - - Redistribution and use of the Software in source and binary forms, - with or without modification, is permitted provided that the - following conditions are met: - - - Neither the names of NCAR's Computational and Information Systems - Laboratory, the University Corporation for Atmospheric Research, - nor the names of its sponsors or contributors may be used to - endorse or promote products derived from this Software without - specific prior written permission. - - - Redistributions of source code must retain the above copyright - notices, this list of conditions, and the disclaimer below. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions, and the disclaimer below in the - documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. - - - PFFFT : a Pretty Fast FFT. - - This file is largerly based on the original FFTPACK implementation, modified in - order to take advantage of SIMD instructions of modern CPUs. -*/ - -/* - ChangeLog: - - 2011/10/02, version 1: This is the very first release of this file. -*/ - -#include "pffft.h" - -/* detect compiler flavour */ -#if defined(_MSC_VER) -# define COMPILER_MSVC -#elif defined(__GNUC__) -# define COMPILER_GCC -#endif - -#include -#include -#include -#include -#include - -#if defined(COMPILER_GCC) -# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) -# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline)) -# define RESTRICT __restrict -# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__]; -#elif defined(COMPILER_MSVC) -# define ALWAYS_INLINE(return_type) __forceinline return_type -# define NEVER_INLINE(return_type) __declspec(noinline) return_type -# define RESTRICT __restrict -# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__)) -#endif - - -#ifdef COMPILER_MSVC -#pragma warning( disable : 4244 4305 4204 4456 ) -#endif - -/* - vector support macros: the rest of the code is independant of - SSE/Altivec/NEON -- adding support for other platforms with 4-element - vectors should be limited to these macros -*/ -#include "simd/pf_float.h" - -/* have code comparable with this definition */ -#define SETUP_STRUCT PFFFT_Setup -#define FUNC_NEW_SETUP pffft_new_setup -#define FUNC_DESTROY pffft_destroy_setup -#define FUNC_TRANSFORM_UNORDRD pffft_transform -#define FUNC_TRANSFORM_ORDERED pffft_transform_ordered -#define FUNC_ZREORDER pffft_zreorder -#define FUNC_ZCONVOLVE_ACCUMULATE pffft_zconvolve_accumulate -#define FUNC_ZCONVOLVE_NO_ACCU pffft_zconvolve_no_accu - -#define FUNC_ALIGNED_MALLOC pffft_aligned_malloc -#define FUNC_ALIGNED_FREE pffft_aligned_free -#define FUNC_SIMD_SIZE pffft_simd_size -#define FUNC_MIN_FFT_SIZE pffft_min_fft_size -#define FUNC_IS_VALID_SIZE pffft_is_valid_size -#define FUNC_NEAREST_SIZE pffft_nearest_transform_size -#define FUNC_SIMD_ARCH pffft_simd_arch -#define FUNC_VALIDATE_SIMD_A validate_pffft_simd -#define FUNC_VALIDATE_SIMD_EX validate_pffft_simd_ex - -#define FUNC_CPLX_FINALIZE pffft_cplx_finalize -#define FUNC_CPLX_PREPROCESS pffft_cplx_preprocess -#define FUNC_REAL_PREPROCESS_4X4 pffft_real_preprocess_4x4 -#define FUNC_REAL_PREPROCESS pffft_real_preprocess -#define FUNC_REAL_FINALIZE_4X4 pffft_real_finalize_4x4 -#define FUNC_REAL_FINALIZE pffft_real_finalize -#define FUNC_TRANSFORM_INTERNAL pffft_transform_internal - -#define FUNC_COS cosf -#define FUNC_SIN sinf - - -#include "pffft_priv_impl.h" - - diff --git a/Packages/PFFFT/Sources/PFFFTLib/pffft_common.c b/Packages/PFFFT/Sources/PFFFTLib/pffft_common.c deleted file mode 100644 index 106fdd2..0000000 --- a/Packages/PFFFT/Sources/PFFFTLib/pffft_common.c +++ /dev/null @@ -1,53 +0,0 @@ - -#include "pffft.h" - -#include - -/* SSE and co like 16-bytes aligned pointers - * with a 64-byte alignment, we are even aligned on L2 cache lines... */ -#define MALLOC_V4SF_ALIGNMENT 64 - -static void * Valigned_malloc(size_t nb_bytes) { - void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT); - if (!p0) return (void *) 0; - p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1)))); - *((void **) p - 1) = p0; - return p; -} - -static void Valigned_free(void *p) { - if (p) free(*((void **) p - 1)); -} - - -static int next_power_of_two(int N) { - /* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */ - /* compute the next highest power of 2 of 32-bit v */ - unsigned v = N; - v--; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - v++; - return v; -} - -static int is_power_of_two(int N) { - /* https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2 */ - int f = N && !(N & (N - 1)); - return f; -} - - - -void *pffft_aligned_malloc(size_t nb_bytes) { return Valigned_malloc(nb_bytes); } -void pffft_aligned_free(void *p) { Valigned_free(p); } -int pffft_next_power_of_two(int N) { return next_power_of_two(N); } -int pffft_is_power_of_two(int N) { return is_power_of_two(N); } - -void *pffftd_aligned_malloc(size_t nb_bytes) { return Valigned_malloc(nb_bytes); } -void pffftd_aligned_free(void *p) { Valigned_free(p); } -int pffftd_next_power_of_two(int N) { return next_power_of_two(N); } -int pffftd_is_power_of_two(int N) { return is_power_of_two(N); } diff --git a/Packages/PFFFT/Sources/PFFFTLib/pffft_double.c b/Packages/PFFFT/Sources/PFFFTLib/pffft_double.c deleted file mode 100644 index 066782b..0000000 --- a/Packages/PFFFT/Sources/PFFFTLib/pffft_double.c +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) - Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de ) - Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com ) - - Based on original fortran 77 code from FFTPACKv4 from NETLIB - (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber - of NCAR, in 1985. - - As confirmed by the NCAR fftpack software curators, the following - FFTPACKv5 license applies to FFTPACKv4 sources. My changes are - released under the same terms. - - FFTPACK license: - - http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html - - Copyright (c) 2004 the University Corporation for Atmospheric - Research ("UCAR"). All rights reserved. Developed by NCAR's - Computational and Information Systems Laboratory, UCAR, - www.cisl.ucar.edu. - - Redistribution and use of the Software in source and binary forms, - with or without modification, is permitted provided that the - following conditions are met: - - - Neither the names of NCAR's Computational and Information Systems - Laboratory, the University Corporation for Atmospheric Research, - nor the names of its sponsors or contributors may be used to - endorse or promote products derived from this Software without - specific prior written permission. - - - Redistributions of source code must retain the above copyright - notices, this list of conditions, and the disclaimer below. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions, and the disclaimer below in the - documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. - - - PFFFT : a Pretty Fast FFT. - - This file is largerly based on the original FFTPACK implementation, modified in - order to take advantage of SIMD instructions of modern CPUs. -*/ - -/* - NOTE: This file is adapted from Julien Pommier's original PFFFT, - which works on 32 bit floating point precision using SSE instructions, - to work with 64 bit floating point precision using AVX instructions. - Author: Dario Mambro @ https://github.com/unevens/pffft -*/ - -#include "pffft_double.h" - -/* detect compiler flavour */ -#if defined(_MSC_VER) -# define COMPILER_MSVC -#elif defined(__GNUC__) -# define COMPILER_GCC -#endif - -#ifdef COMPILER_MSVC -# define _USE_MATH_DEFINES -# include -#elif defined(__MINGW32__) || defined(__MINGW64__) -# include -#else -# include -#endif - -#include -#include -#include -#include -#include - -#if defined(COMPILER_GCC) -# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) -# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline)) -# define RESTRICT __restrict -# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__]; -#elif defined(COMPILER_MSVC) -# define ALWAYS_INLINE(return_type) __forceinline return_type -# define NEVER_INLINE(return_type) __declspec(noinline) return_type -# define RESTRICT __restrict -# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__)) -#endif - - -#ifdef COMPILER_MSVC -#pragma warning( disable : 4244 4305 4204 4456 ) -#endif - -/* - vector support macros: the rest of the code is independant of - AVX -- adding support for other platforms with 4-element - vectors should be limited to these macros -*/ -#include "simd/pf_double.h" - -/* have code comparable with this definition */ -#define float double -#define SETUP_STRUCT PFFFTD_Setup -#define FUNC_NEW_SETUP pffftd_new_setup -#define FUNC_DESTROY pffftd_destroy_setup -#define FUNC_TRANSFORM_UNORDRD pffftd_transform -#define FUNC_TRANSFORM_ORDERED pffftd_transform_ordered -#define FUNC_ZREORDER pffftd_zreorder -#define FUNC_ZCONVOLVE_ACCUMULATE pffftd_zconvolve_accumulate -#define FUNC_ZCONVOLVE_NO_ACCU pffftd_zconvolve_no_accu - -#define FUNC_ALIGNED_MALLOC pffftd_aligned_malloc -#define FUNC_ALIGNED_FREE pffftd_aligned_free -#define FUNC_SIMD_SIZE pffftd_simd_size -#define FUNC_MIN_FFT_SIZE pffftd_min_fft_size -#define FUNC_IS_VALID_SIZE pffftd_is_valid_size -#define FUNC_NEAREST_SIZE pffftd_nearest_transform_size -#define FUNC_SIMD_ARCH pffftd_simd_arch -#define FUNC_VALIDATE_SIMD_A validate_pffftd_simd -#define FUNC_VALIDATE_SIMD_EX validate_pffftd_simd_ex - -#define FUNC_CPLX_FINALIZE pffftd_cplx_finalize -#define FUNC_CPLX_PREPROCESS pffftd_cplx_preprocess -#define FUNC_REAL_PREPROCESS_4X4 pffftd_real_preprocess_4x4 -#define FUNC_REAL_PREPROCESS pffftd_real_preprocess -#define FUNC_REAL_FINALIZE_4X4 pffftd_real_finalize_4x4 -#define FUNC_REAL_FINALIZE pffftd_real_finalize -#define FUNC_TRANSFORM_INTERNAL pffftd_transform_internal - -#define FUNC_COS cos -#define FUNC_SIN sin - - -#include "pffft_priv_impl.h" - - diff --git a/Packages/PFFFT/Sources/PFFFTLib/pffft_priv_impl.h b/Packages/PFFFT/Sources/PFFFTLib/pffft_priv_impl.h deleted file mode 100644 index 7ac2b0a..0000000 --- a/Packages/PFFFT/Sources/PFFFTLib/pffft_priv_impl.h +++ /dev/null @@ -1,2231 +0,0 @@ -/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) - Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de ) - Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com ) - - Based on original fortran 77 code from FFTPACKv4 from NETLIB - (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber - of NCAR, in 1985. - - As confirmed by the NCAR fftpack software curators, the following - FFTPACKv5 license applies to FFTPACKv4 sources. My changes are - released under the same terms. - - FFTPACK license: - - http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html - - Copyright (c) 2004 the University Corporation for Atmospheric - Research ("UCAR"). All rights reserved. Developed by NCAR's - Computational and Information Systems Laboratory, UCAR, - www.cisl.ucar.edu. - - Redistribution and use of the Software in source and binary forms, - with or without modification, is permitted provided that the - following conditions are met: - - - Neither the names of NCAR's Computational and Information Systems - Laboratory, the University Corporation for Atmospheric Research, - nor the names of its sponsors or contributors may be used to - endorse or promote products derived from this Software without - specific prior written permission. - - - Redistributions of source code must retain the above copyright - notices, this list of conditions, and the disclaimer below. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions, and the disclaimer below in the - documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. - - - PFFFT : a Pretty Fast FFT. - - This file is largerly based on the original FFTPACK implementation, modified in - order to take advantage of SIMD instructions of modern CPUs. -*/ - -/* this file requires architecture specific preprocessor definitions - * it's only for library internal use - */ - - -/* define own constants required to turn off g++ extensions .. */ -#ifndef M_PI - #define M_PI 3.14159265358979323846 /* pi */ -#endif - -#ifndef M_SQRT2 - #define M_SQRT2 1.41421356237309504880 /* sqrt(2) */ -#endif - - -int FUNC_SIMD_SIZE() { return SIMD_SZ; } - -int FUNC_MIN_FFT_SIZE(pffft_transform_t transform) { - /* unfortunately, the fft size must be a multiple of 16 for complex FFTs - and 32 for real FFTs -- a lot of stuff would need to be rewritten to - handle other cases (or maybe just switch to a scalar fft, I don't know..) */ - int simdSz = FUNC_SIMD_SIZE(); - if (transform == PFFFT_REAL) - return ( 2 * simdSz * simdSz ); - else if (transform == PFFFT_COMPLEX) - return ( simdSz * simdSz ); - else - return 1; -} - -int FUNC_IS_VALID_SIZE(int N, pffft_transform_t cplx) { - const int N_min = FUNC_MIN_FFT_SIZE(cplx); - int R = N; - while (R >= 5*N_min && (R % 5) == 0) R /= 5; - while (R >= 3*N_min && (R % 3) == 0) R /= 3; - while (R >= 2*N_min && (R % 2) == 0) R /= 2; - return (R == N_min) ? 1 : 0; -} - -int FUNC_NEAREST_SIZE(int N, pffft_transform_t cplx, int higher) { - int d; - const int N_min = FUNC_MIN_FFT_SIZE(cplx); - if (N < N_min) - N = N_min; - d = (higher) ? N_min : -N_min; - if (d > 0) - N = N_min * ((N+N_min-1) / N_min); /* round up */ - else - N = N_min * (N / N_min); /* round down */ - - for (; ; N += d) - if (FUNC_IS_VALID_SIZE(N, cplx)) - return N; -} - -const char * FUNC_SIMD_ARCH() { return VARCH; } - - -/* - passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2 -*/ -static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) { - int k, i; - int l1ido = l1*ido; - if (ido <= 2) { - for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) { - ch[0] = VADD(cc[0], cc[ido+0]); - ch[l1ido] = VSUB(cc[0], cc[ido+0]); - ch[1] = VADD(cc[1], cc[ido+1]); - ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]); - } - } else { - for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) { - for (i=0; i 2); - for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) { - for (i=0; i 2); - for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) { - for (i = 0; i < ido-1; i += 2) { - ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5)); - ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5)); - ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4)); - ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4)); - tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5)); - tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5)); - tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4)); - tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4)); - ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3)); - ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3)); - cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3))); - ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3))); - cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3))); - ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3))); - cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4)); - ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); - cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4)); - ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); - dr3 = VSUB(cr3, ci4); - dr4 = VADD(cr3, ci4); - di3 = VADD(ci3, cr4); - di4 = VSUB(ci3, cr4); - dr5 = VADD(cr2, ci5); - dr2 = VSUB(cr2, ci5); - di5 = VSUB(ci2, cr5); - di2 = VADD(ci2, cr5); - wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1]; - wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1]; - VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); - ch_ref(i - 1, 2) = dr2; - ch_ref(i, 2) = di2; - VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); - ch_ref(i - 1, 3) = dr3; - ch_ref(i, 3) = di3; - VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3)); - ch_ref(i - 1, 4) = dr4; - ch_ref(i, 4) = di4; - VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4)); - ch_ref(i - 1, 5) = dr5; - ch_ref(i, 5) = di5; - } - } -#undef ch_ref -#undef cc_ref -} - -static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) { - static const float minus_one = -1.f; - int i, k, l1ido = l1*ido; - for (k=0; k < l1ido; k += ido) { - v4sf a = cc[k], b = cc[k + l1ido]; - ch[2*k] = VADD(a, b); - ch[2*(k+ido)-1] = VSUB(a, b); - } - if (ido < 2) return; - if (ido != 2) { - for (k=0; k < l1ido; k += ido) { - for (i=2; i 5) { - wa[i1-1] = wa[i-1]; - wa[i1] = wa[i]; - } - } - l1 = l2; - } -} /* cffti1 */ - - -static v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) { - v4sf *in = (v4sf*)input_readonly; - v4sf *out = (in == work2 ? work1 : work2); - int nf = ifac[1], k1; - int l1 = 1; - int iw = 0; - assert(in != out && work1 != work2); - for (k1=2; k1<=nf+1; k1++) { - int ip = ifac[k1]; - int l2 = ip*l1; - int ido = n / l2; - int idot = ido + ido; - switch (ip) { - case 5: { - int ix2 = iw + idot; - int ix3 = ix2 + idot; - int ix4 = ix3 + idot; - passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); - } break; - case 4: { - int ix2 = iw + idot; - int ix3 = ix2 + idot; - passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign); - } break; - case 2: { - passf2_ps(idot, l1, in, out, &wa[iw], isign); - } break; - case 3: { - int ix2 = iw + idot; - passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign); - } break; - default: - assert(0); - } - l1 = l2; - iw += (ip - 1)*idot; - if (out == work2) { - out = work1; in = work2; - } else { - out = work2; in = work1; - } - } - - return in; /* this is in fact the output .. */ -} - - -struct SETUP_STRUCT { - int N; - int Ncvec; /* nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) */ - int ifac[15]; - pffft_transform_t transform; - v4sf *data; /* allocated room for twiddle coefs */ - float *e; /* points into 'data', N/4*3 elements */ - float *twiddle; /* points into 'data', N/4 elements */ -}; - -SETUP_STRUCT *FUNC_NEW_SETUP(int N, pffft_transform_t transform) { - SETUP_STRUCT *s = 0; - int k, m; - /* unfortunately, the fft size must be a multiple of 16 for complex FFTs - and 32 for real FFTs -- a lot of stuff would need to be rewritten to - handle other cases (or maybe just switch to a scalar fft, I don't know..) */ - if (transform == PFFFT_REAL) { if ((N%(2*SIMD_SZ*SIMD_SZ)) || N<=0) return s; } - if (transform == PFFFT_COMPLEX) { if ((N%( SIMD_SZ*SIMD_SZ)) || N<=0) return s; } - s = (SETUP_STRUCT*)malloc(sizeof(SETUP_STRUCT)); - /* assert((N % 32) == 0); */ - s->N = N; - s->transform = transform; - /* nb of complex simd vectors */ - s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ; - s->data = (v4sf*)FUNC_ALIGNED_MALLOC(2*s->Ncvec * sizeof(v4sf)); - s->e = (float*)s->data; - s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ); - - if (transform == PFFFT_REAL) { - for (k=0; k < s->Ncvec; ++k) { - int i = k/SIMD_SZ; - int j = k%SIMD_SZ; - for (m=0; m < SIMD_SZ-1; ++m) { - float A = -2*(float)M_PI*(m+1)*k / N; - s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = FUNC_COS(A); - s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = FUNC_SIN(A); - } - } - rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); - } else { - for (k=0; k < s->Ncvec; ++k) { - int i = k/SIMD_SZ; - int j = k%SIMD_SZ; - for (m=0; m < SIMD_SZ-1; ++m) { - float A = -2*(float)M_PI*(m+1)*k / N; - s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = FUNC_COS(A); - s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = FUNC_SIN(A); - } - } - cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); - } - - /* check that N is decomposable with allowed prime factors */ - for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; } - if (m != N/SIMD_SZ) { - FUNC_DESTROY(s); s = 0; - } - - return s; -} - - -void FUNC_DESTROY(SETUP_STRUCT *s) { - if (!s) - return; - FUNC_ALIGNED_FREE(s->data); - free(s); -} - -#if ( SIMD_SZ == 4 ) /* !defined(PFFFT_SIMD_DISABLE) */ - -/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */ -static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) { - v4sf g0, g1; - int k; - INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride; - - *--out = VSWAPHL(g0, g1); /* [g0l, g0h], [g1l g1h] -> [g1l, g0h] */ - for (k=1; k < N; ++k) { - v4sf h0, h1; - INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride; - *--out = VSWAPHL(g1, h0); - *--out = VSWAPHL(h0, h1); - g1 = h1; - } - *--out = VSWAPHL(g1, g0); -} - -static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) { - v4sf g0, g1, h0, h1; - int k; - g0 = g1 = in[0]; ++in; - for (k=1; k < N; ++k) { - h0 = *in++; h1 = *in++; - g1 = VSWAPHL(g1, h0); - h0 = VSWAPHL(h0, h1); - UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride; - g1 = h1; - } - h0 = *in++; h1 = g0; - g1 = VSWAPHL(g1, h0); - h0 = VSWAPHL(h0, h1); - UNINTERLEAVE2(h0, g1, out[0], out[1]); -} - -void FUNC_ZREORDER(SETUP_STRUCT *setup, const float *in, float *out, pffft_direction_t direction) { - int k, N = setup->N, Ncvec = setup->Ncvec; - const v4sf *vin = (const v4sf*)in; - v4sf *vout = (v4sf*)out; - assert(in != out); - if (setup->transform == PFFFT_REAL) { - int k, dk = N/32; - if (direction == PFFFT_FORWARD) { - for (k=0; k < dk; ++k) { - INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]); - INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]); - } - reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2)); - reversed_copy(dk, vin+6, 8, (v4sf*)(out + N)); - } else { - for (k=0; k < dk; ++k) { - UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]); - UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]); - } - unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8); - unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8); - } - } else { - if (direction == PFFFT_FORWARD) { - for (k=0; k < Ncvec; ++k) { - int kk = (k/4) + (k%4)*(Ncvec/4); - INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]); - } - } else { - for (k=0; k < Ncvec; ++k) { - int kk = (k/4) + (k%4)*(Ncvec/4); - UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]); - } - } - } -} - -void FUNC_CPLX_FINALIZE(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { - int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */ - v4sf r0, i0, r1, i1, r2, i2, r3, i3; - v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; - assert(in != out); - for (k=0; k < dk; ++k) { - r0 = in[8*k+0]; i0 = in[8*k+1]; - r1 = in[8*k+2]; i1 = in[8*k+3]; - r2 = in[8*k+4]; i2 = in[8*k+5]; - r3 = in[8*k+6]; i3 = in[8*k+7]; - VTRANSPOSE4(r0,r1,r2,r3); - VTRANSPOSE4(i0,i1,i2,i3); - VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]); - VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]); - VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]); - - sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2); - sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3); - si0 = VADD(i0,i2); di0 = VSUB(i0, i2); - si1 = VADD(i1,i3); di1 = VSUB(i1, i3); - - /* - transformation for each column is: - - [1 1 1 1 0 0 0 0] [r0] - [1 0 -1 0 0 -1 0 1] [r1] - [1 -1 1 -1 0 0 0 0] [r2] - [1 0 -1 0 0 1 0 -1] [r3] - [0 0 0 0 1 1 1 1] * [i0] - [0 1 0 -1 1 0 -1 0] [i1] - [0 0 0 0 1 -1 1 -1] [i2] - [0 -1 0 1 1 0 -1 0] [i3] - */ - - r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); - r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1); - r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); - r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1); - - *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; - *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; - } -} - -void FUNC_CPLX_PREPROCESS(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { - int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */ - v4sf r0, i0, r1, i1, r2, i2, r3, i3; - v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; - assert(in != out); - for (k=0; k < dk; ++k) { - r0 = in[8*k+0]; i0 = in[8*k+1]; - r1 = in[8*k+2]; i1 = in[8*k+3]; - r2 = in[8*k+4]; i2 = in[8*k+5]; - r3 = in[8*k+6]; i3 = in[8*k+7]; - - sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2); - sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3); - si0 = VADD(i0,i2); di0 = VSUB(i0, i2); - si1 = VADD(i1,i3); di1 = VSUB(i1, i3); - - r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); - r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1); - r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); - r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1); - - VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]); - VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]); - VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]); - - VTRANSPOSE4(r0,r1,r2,r3); - VTRANSPOSE4(i0,i1,i2,i3); - - *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; - *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; - } -} - - -static ALWAYS_INLINE(void) FUNC_REAL_FINALIZE_4X4(const v4sf *in0, const v4sf *in1, const v4sf *in, - const v4sf *e, v4sf *out) { - v4sf r0, i0, r1, i1, r2, i2, r3, i3; - v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; - r0 = *in0; i0 = *in1; - r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++; - VTRANSPOSE4(r0,r1,r2,r3); - VTRANSPOSE4(i0,i1,i2,i3); - - /* - transformation for each column is: - - [1 1 1 1 0 0 0 0] [r0] - [1 0 -1 0 0 -1 0 1] [r1] - [1 0 -1 0 0 1 0 -1] [r2] - [1 -1 1 -1 0 0 0 0] [r3] - [0 0 0 0 1 1 1 1] * [i0] - [0 -1 0 1 -1 0 1 0] [i1] - [0 -1 0 1 1 0 -1 0] [i2] - [0 0 0 0 -1 1 -1 1] [i3] - */ - - /* cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; */ - /* cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; */ - - VCPLXMUL(r1,i1,e[0],e[1]); - VCPLXMUL(r2,i2,e[2],e[3]); - VCPLXMUL(r3,i3,e[4],e[5]); - - /* cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; */ - /* cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; */ - - sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2); - sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1); - si0 = VADD(i0,i2); di0 = VSUB(i0,i2); - si1 = VADD(i1,i3); di1 = VSUB(i3,i1); - - r0 = VADD(sr0, sr1); - r3 = VSUB(sr0, sr1); - i0 = VADD(si0, si1); - i3 = VSUB(si1, si0); - r1 = VADD(dr0, di1); - r2 = VSUB(dr0, di1); - i1 = VSUB(dr1, di0); - i2 = VADD(dr1, di0); - - *out++ = r0; - *out++ = i0; - *out++ = r1; - *out++ = i1; - *out++ = r2; - *out++ = i2; - *out++ = r3; - *out++ = i3; - -} - -static NEVER_INLINE(void) FUNC_REAL_FINALIZE(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { - int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */ - /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */ - - v4sf_union cr, ci, *uout = (v4sf_union*)out; - v4sf save = in[7], zero=VZERO(); - float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3; - static const float s = (float)M_SQRT2/2; - - cr.v = in[0]; ci.v = in[Ncvec*2-1]; - assert(in != out); - FUNC_REAL_FINALIZE_4X4(&zero, &zero, in+1, e, out); - - /* - [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3] - - [Xr(1)] ] [1 1 1 1 0 0 0 0] - [Xr(N/4) ] [0 0 0 0 1 s 0 -s] - [Xr(N/2) ] [1 0 -1 0 0 0 0 0] - [Xr(3N/4)] [0 0 0 0 1 -s 0 s] - [Xi(1) ] [1 -1 1 -1 0 0 0 0] - [Xi(N/4) ] [0 0 0 0 0 -s -1 -s] - [Xi(N/2) ] [0 -1 0 1 0 0 0 0] - [Xi(3N/4)] [0 0 0 0 0 -s 1 -s] - */ - - xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0; - xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0; - xr2=(cr.f[0]-cr.f[2]); uout[4].f[0] = xr2; - xi2=(cr.f[3]-cr.f[1]); uout[5].f[0] = xi2; - xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1; - xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1; - xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3; - xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3; - - for (k=1; k < dk; ++k) { - v4sf save_next = in[8*k+7]; - FUNC_REAL_FINALIZE_4X4(&save, &in[8*k+0], in + 8*k+1, - e + k*6, out + k*8); - save = save_next; - } - -} - -static ALWAYS_INLINE(void) FUNC_REAL_PREPROCESS_4X4(const v4sf *in, - const v4sf *e, v4sf *out, int first) { - v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7]; - /* - transformation for each column is: - - [1 1 1 1 0 0 0 0] [r0] - [1 0 0 -1 0 -1 -1 0] [r1] - [1 -1 -1 1 0 0 0 0] [r2] - [1 0 0 -1 0 1 1 0] [r3] - [0 0 0 0 1 -1 1 -1] * [i0] - [0 -1 1 0 1 0 0 1] [i1] - [0 0 0 0 1 1 -1 -1] [i2] - [0 1 -1 0 1 0 0 1] [i3] - */ - - v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3); - v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2); - v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3); - v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2); - - r0 = VADD(sr0, sr1); - r2 = VSUB(sr0, sr1); - r1 = VSUB(dr0, si1); - r3 = VADD(dr0, si1); - i0 = VSUB(di0, di1); - i2 = VADD(di0, di1); - i1 = VSUB(si0, dr1); - i3 = VADD(si0, dr1); - - VCPLXMULCONJ(r1,i1,e[0],e[1]); - VCPLXMULCONJ(r2,i2,e[2],e[3]); - VCPLXMULCONJ(r3,i3,e[4],e[5]); - - VTRANSPOSE4(r0,r1,r2,r3); - VTRANSPOSE4(i0,i1,i2,i3); - - if (!first) { - *out++ = r0; - *out++ = i0; - } - *out++ = r1; - *out++ = i1; - *out++ = r2; - *out++ = i2; - *out++ = r3; - *out++ = i3; -} - -static NEVER_INLINE(void) FUNC_REAL_PREPROCESS(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { - int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */ - /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */ - - v4sf_union Xr, Xi, *uout = (v4sf_union*)out; - float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3; - static const float s = (float)M_SQRT2; - assert(in != out); - for (k=0; k < 4; ++k) { - Xr.f[k] = ((float*)in)[8*k]; - Xi.f[k] = ((float*)in)[8*k+4]; - } - - FUNC_REAL_PREPROCESS_4X4(in, e, out+1, 1); /* will write only 6 values */ - - /* - [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3] - - [cr0] [1 0 2 0 1 0 0 0] - [cr1] [1 0 0 0 -1 0 -2 0] - [cr2] [1 0 -2 0 1 0 0 0] - [cr3] [1 0 0 0 -1 0 2 0] - [ci0] [0 2 0 2 0 0 0 0] - [ci1] [0 s 0 -s 0 -s 0 -s] - [ci2] [0 0 0 0 0 -2 0 2] - [ci3] [0 -s 0 s 0 -s 0 -s] - */ - for (k=1; k < dk; ++k) { - FUNC_REAL_PREPROCESS_4X4(in+8*k, e + k*6, out-1+k*8, 0); - } - - cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0; - cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1; - cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2; - cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3; - ci0= 2*(Xr.f[1]+Xr.f[3]); uout[2*Ncvec-1].f[0] = ci0; - ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1; - ci2= 2*(Xi.f[3]-Xi.f[1]); uout[2*Ncvec-1].f[2] = ci2; - ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3; -} - - -void FUNC_TRANSFORM_INTERNAL(SETUP_STRUCT *setup, const float *finput, float *foutput, v4sf *scratch, - pffft_direction_t direction, int ordered) { - int k, Ncvec = setup->Ncvec; - int nf_odd = (setup->ifac[1] & 1); - - /* temporary buffer is allocated on the stack if the scratch pointer is NULL */ - int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); - VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); - - const v4sf *vinput = (const v4sf*)finput; - v4sf *voutput = (v4sf*)foutput; - v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack }; - int ib = (nf_odd ^ ordered ? 1 : 0); - - assert(VALIGNED(finput) && VALIGNED(foutput)); - - /* assert(finput != foutput); */ - if (direction == PFFFT_FORWARD) { - ib = !ib; - if (setup->transform == PFFFT_REAL) { - ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], - setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); - FUNC_REAL_FINALIZE(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e); - } else { - v4sf *tmp = buff[ib]; - for (k=0; k < Ncvec; ++k) { - UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]); - } - ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], - setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1); - FUNC_CPLX_FINALIZE(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e); - } - if (ordered) { - FUNC_ZREORDER(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD); - } else ib = !ib; - } else { - if (vinput == buff[ib]) { - ib = !ib; /* may happen when finput == foutput */ - } - if (ordered) { - FUNC_ZREORDER(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD); - vinput = buff[ib]; ib = !ib; - } - if (setup->transform == PFFFT_REAL) { - FUNC_REAL_PREPROCESS(Ncvec, vinput, buff[ib], (v4sf*)setup->e); - ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], - setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); - } else { - FUNC_CPLX_PREPROCESS(Ncvec, vinput, buff[ib], (v4sf*)setup->e); - ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], - setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1); - for (k=0; k < Ncvec; ++k) { - INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]); - } - } - } - - if (buff[ib] != voutput) { - /* extra copy required -- this situation should only happen when finput == foutput */ - assert(finput==foutput); - for (k=0; k < Ncvec; ++k) { - v4sf a = buff[ib][2*k], b = buff[ib][2*k+1]; - voutput[2*k] = a; voutput[2*k+1] = b; - } - ib = !ib; - } - assert(buff[ib] == voutput); -} - -void FUNC_ZCONVOLVE_ACCUMULATE(SETUP_STRUCT *s, const float *a, const float *b, float *ab, float scaling) { - int Ncvec = s->Ncvec; - const v4sf * RESTRICT va = (const v4sf*)a; - const v4sf * RESTRICT vb = (const v4sf*)b; - v4sf * RESTRICT vab = (v4sf*)ab; - -#ifdef __arm__ - __builtin_prefetch(va); - __builtin_prefetch(vb); - __builtin_prefetch(vab); - __builtin_prefetch(va+2); - __builtin_prefetch(vb+2); - __builtin_prefetch(vab+2); - __builtin_prefetch(va+4); - __builtin_prefetch(vb+4); - __builtin_prefetch(vab+4); - __builtin_prefetch(va+6); - __builtin_prefetch(vb+6); - __builtin_prefetch(vab+6); -# ifndef __clang__ -# define ZCONVOLVE_USING_INLINE_NEON_ASM -# endif -#endif - - float ar, ai, br, bi, abr, abi; -#ifndef ZCONVOLVE_USING_INLINE_ASM - v4sf vscal = LD_PS1(scaling); - int i; -#endif - - assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); - ar = ((v4sf_union*)va)[0].f[0]; - ai = ((v4sf_union*)va)[1].f[0]; - br = ((v4sf_union*)vb)[0].f[0]; - bi = ((v4sf_union*)vb)[1].f[0]; - abr = ((v4sf_union*)vab)[0].f[0]; - abi = ((v4sf_union*)vab)[1].f[0]; - -#ifdef ZCONVOLVE_USING_INLINE_ASM - /* inline asm version, unfortunately miscompiled by clang 3.2, - * at least on ubuntu.. so this will be restricted to gcc */ - const float *a_ = a, *b_ = b; float *ab_ = ab; - int N = Ncvec; - asm volatile("mov r8, %2 \n" - "vdup.f32 q15, %4 \n" - "1: \n" - "pld [%0,#64] \n" - "pld [%1,#64] \n" - "pld [%2,#64] \n" - "pld [%0,#96] \n" - "pld [%1,#96] \n" - "pld [%2,#96] \n" - "vld1.f32 {q0,q1}, [%0,:128]! \n" - "vld1.f32 {q4,q5}, [%1,:128]! \n" - "vld1.f32 {q2,q3}, [%0,:128]! \n" - "vld1.f32 {q6,q7}, [%1,:128]! \n" - "vld1.f32 {q8,q9}, [r8,:128]! \n" - - "vmul.f32 q10, q0, q4 \n" - "vmul.f32 q11, q0, q5 \n" - "vmul.f32 q12, q2, q6 \n" - "vmul.f32 q13, q2, q7 \n" - "vmls.f32 q10, q1, q5 \n" - "vmla.f32 q11, q1, q4 \n" - "vld1.f32 {q0,q1}, [r8,:128]! \n" - "vmls.f32 q12, q3, q7 \n" - "vmla.f32 q13, q3, q6 \n" - "vmla.f32 q8, q10, q15 \n" - "vmla.f32 q9, q11, q15 \n" - "vmla.f32 q0, q12, q15 \n" - "vmla.f32 q1, q13, q15 \n" - "vst1.f32 {q8,q9},[%2,:128]! \n" - "vst1.f32 {q0,q1},[%2,:128]! \n" - "subs %3, #2 \n" - "bne 1b \n" - : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory"); -#else - /* default routine, works fine for non-arm cpus with current compilers */ - for (i=0; i < Ncvec; i += 2) { - v4sf ar, ai, br, bi; - ar = va[2*i+0]; ai = va[2*i+1]; - br = vb[2*i+0]; bi = vb[2*i+1]; - VCPLXMUL(ar, ai, br, bi); - vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]); - vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]); - ar = va[2*i+2]; ai = va[2*i+3]; - br = vb[2*i+2]; bi = vb[2*i+3]; - VCPLXMUL(ar, ai, br, bi); - vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]); - vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]); - } -#endif - if (s->transform == PFFFT_REAL) { - ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling; - ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling; - } -} - -void FUNC_ZCONVOLVE_NO_ACCU(SETUP_STRUCT *s, const float *a, const float *b, float *ab, float scaling) { - v4sf vscal = LD_PS1(scaling); - const v4sf * RESTRICT va = (const v4sf*)a; - const v4sf * RESTRICT vb = (const v4sf*)b; - v4sf * RESTRICT vab = (v4sf*)ab; - float sar, sai, sbr, sbi; - const int NcvecMulTwo = 2*s->Ncvec; /* int Ncvec = s->Ncvec; */ - int k; /* was i -- but always used "2*i" - except at for() */ - -#ifdef __arm__ - __builtin_prefetch(va); - __builtin_prefetch(vb); - __builtin_prefetch(vab); - __builtin_prefetch(va+2); - __builtin_prefetch(vb+2); - __builtin_prefetch(vab+2); - __builtin_prefetch(va+4); - __builtin_prefetch(vb+4); - __builtin_prefetch(vab+4); - __builtin_prefetch(va+6); - __builtin_prefetch(vb+6); - __builtin_prefetch(vab+6); -# ifndef __clang__ -# define ZCONVOLVE_USING_INLINE_NEON_ASM -# endif -#endif - - assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); - sar = ((v4sf_union*)va)[0].f[0]; - sai = ((v4sf_union*)va)[1].f[0]; - sbr = ((v4sf_union*)vb)[0].f[0]; - sbi = ((v4sf_union*)vb)[1].f[0]; - - /* default routine, works fine for non-arm cpus with current compilers */ - for (k=0; k < NcvecMulTwo; k += 4) { - v4sf var, vai, vbr, vbi; - var = va[k+0]; vai = va[k+1]; - vbr = vb[k+0]; vbi = vb[k+1]; - VCPLXMUL(var, vai, vbr, vbi); - vab[k+0] = VMUL(var, vscal); - vab[k+1] = VMUL(vai, vscal); - var = va[k+2]; vai = va[k+3]; - vbr = vb[k+2]; vbi = vb[k+3]; - VCPLXMUL(var, vai, vbr, vbi); - vab[k+2] = VMUL(var, vscal); - vab[k+3] = VMUL(vai, vscal); - } - - if (s->transform == PFFFT_REAL) { - ((v4sf_union*)vab)[0].f[0] = sar*sbr*scaling; - ((v4sf_union*)vab)[1].f[0] = sai*sbi*scaling; - } -} - - -#else /* #if ( SIMD_SZ == 4 ) * !defined(PFFFT_SIMD_DISABLE) */ - -/* standard routine using scalar floats, without SIMD stuff. */ - -#define pffft_zreorder_nosimd FUNC_ZREORDER -void pffft_zreorder_nosimd(SETUP_STRUCT *setup, const float *in, float *out, pffft_direction_t direction) { - int k, N = setup->N; - if (setup->transform == PFFFT_COMPLEX) { - for (k=0; k < 2*N; ++k) out[k] = in[k]; - return; - } - else if (direction == PFFFT_FORWARD) { - float x_N = in[N-1]; - for (k=N-1; k > 1; --k) out[k] = in[k-1]; - out[0] = in[0]; - out[1] = x_N; - } else { - float x_N = in[1]; - for (k=1; k < N-1; ++k) out[k] = in[k+1]; - out[0] = in[0]; - out[N-1] = x_N; - } -} - -#define pffft_transform_internal_nosimd FUNC_TRANSFORM_INTERNAL -void pffft_transform_internal_nosimd(SETUP_STRUCT *setup, const float *input, float *output, float *scratch, - pffft_direction_t direction, int ordered) { - int Ncvec = setup->Ncvec; - int nf_odd = (setup->ifac[1] & 1); - - /* temporary buffer is allocated on the stack if the scratch pointer is NULL */ - int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); - VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); - float *buff[2]; - int ib; - if (scratch == 0) scratch = scratch_on_stack; - buff[0] = output; buff[1] = scratch; - - if (setup->transform == PFFFT_COMPLEX) ordered = 0; /* it is always ordered. */ - ib = (nf_odd ^ ordered ? 1 : 0); - - if (direction == PFFFT_FORWARD) { - if (setup->transform == PFFFT_REAL) { - ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], - setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); - } else { - ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], - setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1); - } - if (ordered) { - FUNC_ZREORDER(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib; - } - } else { - if (input == buff[ib]) { - ib = !ib; /* may happen when finput == foutput */ - } - if (ordered) { - FUNC_ZREORDER(setup, input, buff[!ib], PFFFT_BACKWARD); - input = buff[!ib]; - } - if (setup->transform == PFFFT_REAL) { - ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], - setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); - } else { - ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], - setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1); - } - } - if (buff[ib] != output) { - int k; - /* extra copy required -- this situation should happens only when finput == foutput */ - assert(input==output); - for (k=0; k < Ncvec; ++k) { - float a = buff[ib][2*k], b = buff[ib][2*k+1]; - output[2*k] = a; output[2*k+1] = b; - } - ib = !ib; - } - assert(buff[ib] == output); -} - -#define pffft_zconvolve_accumulate_nosimd FUNC_ZCONVOLVE_ACCUMULATE -void pffft_zconvolve_accumulate_nosimd(SETUP_STRUCT *s, const float *a, const float *b, - float *ab, float scaling) { - int NcvecMulTwo = 2*s->Ncvec; /* int Ncvec = s->Ncvec; */ - int k; /* was i -- but always used "2*i" - except at for() */ - - if (s->transform == PFFFT_REAL) { - /* take care of the fftpack ordering */ - ab[0] += a[0]*b[0]*scaling; - ab[NcvecMulTwo-1] += a[NcvecMulTwo-1]*b[NcvecMulTwo-1]*scaling; - ++ab; ++a; ++b; NcvecMulTwo -= 2; - } - for (k=0; k < NcvecMulTwo; k += 2) { - float ar, ai, br, bi; - ar = a[k+0]; ai = a[k+1]; - br = b[k+0]; bi = b[k+1]; - VCPLXMUL(ar, ai, br, bi); - ab[k+0] += ar*scaling; - ab[k+1] += ai*scaling; - } -} - -#define pffft_zconvolve_no_accu_nosimd FUNC_ZCONVOLVE_NO_ACCU -void pffft_zconvolve_no_accu_nosimd(SETUP_STRUCT *s, const float *a, const float *b, - float *ab, float scaling) { - int NcvecMulTwo = 2*s->Ncvec; /* int Ncvec = s->Ncvec; */ - int k; /* was i -- but always used "2*i" - except at for() */ - - if (s->transform == PFFFT_REAL) { - /* take care of the fftpack ordering */ - ab[0] += a[0]*b[0]*scaling; - ab[NcvecMulTwo-1] += a[NcvecMulTwo-1]*b[NcvecMulTwo-1]*scaling; - ++ab; ++a; ++b; NcvecMulTwo -= 2; - } - for (k=0; k < NcvecMulTwo; k += 2) { - float ar, ai, br, bi; - ar = a[k+0]; ai = a[k+1]; - br = b[k+0]; bi = b[k+1]; - VCPLXMUL(ar, ai, br, bi); - ab[k+0] = ar*scaling; - ab[k+1] = ai*scaling; - } -} - - -#endif /* #if ( SIMD_SZ == 4 ) * !defined(PFFFT_SIMD_DISABLE) */ - - -void FUNC_TRANSFORM_UNORDRD(SETUP_STRUCT *setup, const float *input, float *output, float *work, pffft_direction_t direction) { - FUNC_TRANSFORM_INTERNAL(setup, input, output, (v4sf*)work, direction, 0); -} - -void FUNC_TRANSFORM_ORDERED(SETUP_STRUCT *setup, const float *input, float *output, float *work, pffft_direction_t direction) { - FUNC_TRANSFORM_INTERNAL(setup, input, output, (v4sf*)work, direction, 1); -} - - -#if ( SIMD_SZ == 4 ) - -#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3)) - -/* detect bugs with the vector support macros */ -void FUNC_VALIDATE_SIMD_A() { - float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 }; - v4sf_union a0, a1, a2, a3, t, u; - memcpy(a0.f, f, 4*sizeof(float)); - memcpy(a1.f, f+4, 4*sizeof(float)); - memcpy(a2.f, f+8, 4*sizeof(float)); - memcpy(a3.f, f+12, 4*sizeof(float)); - - t = a0; u = a1; t.v = VZERO(); - printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0); - t.v = VADD(a1.v, a2.v); - printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18); - t.v = VMUL(a1.v, a2.v); - printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77); - t.v = VMADD(a1.v, a2.v,a0.v); - printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80); - - INTERLEAVE2(a1.v,a2.v,t.v,u.v); - printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); - assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11); - UNINTERLEAVE2(a1.v,a2.v,t.v,u.v); - printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); - assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11); - - t.v=LD_PS1(f[15]); - printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); - assertv4(t, 15, 15, 15, 15); - t.v = VSWAPHL(a1.v, a2.v); - printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); - assertv4(t, 8, 9, 6, 7); - VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v); - printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", - a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3], - a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]); - assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15); -} - - -static void pffft_assert1( float result, float ref, const char * vartxt, const char * functxt, int * numErrs, const char * f, int lineNo ) -{ - if ( !( fabs( result - ref ) < 0.01F ) ) - { - fprintf(stderr, "%s: assert for %s at %s(%d)\n expected %f value %f\n", functxt, vartxt, f, lineNo, ref, result); - ++(*numErrs); - } -} - -static void pffft_assert4( vsfscalar v0, vsfscalar v1, vsfscalar v2, vsfscalar v3, - float a, float b, float c, float d, const char * functxt, int * numErrs, const char * f, int lineNo ) -{ - pffft_assert1( v0, a, "[0]", functxt, numErrs, f, lineNo ); - pffft_assert1( v1, b, "[1]", functxt, numErrs, f, lineNo ); - pffft_assert1( v2, c, "[2]", functxt, numErrs, f, lineNo ); - pffft_assert1( v3, d, "[3]", functxt, numErrs, f, lineNo ); -} - -#define PFFFT_ASSERT4( V, a, b, c, d, FUNCTXT ) pffft_assert4( (V).f[0], (V).f[1], (V).f[2], (V).f[3], a, b, c, d, FUNCTXT, &numErrs, __FILE__, __LINE__ ) - - -int FUNC_VALIDATE_SIMD_EX(FILE * DbgOut) -{ - int numErrs = 0; - - { - v4sf_union C; - int k; - for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; - - if (DbgOut) { - fprintf(DbgOut, "\ninput: { }\n" ); - } - C.v = VZERO(); - if (DbgOut) { - fprintf(DbgOut, "VZERO(a) => C) => {\n" ); - fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); - fprintf(DbgOut, "}\n" ); - } - PFFFT_ASSERT4( C, 0.0F, 0.0F, 0.0F, 0.0F, "VZERO() Out C" ); - } - - { - v4sf_union C; - float a = 42.0F; - int k; - for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; - - if (DbgOut) { - fprintf(DbgOut, "\ninput: a = {\n" ); - fprintf(DbgOut, " Inp a: %f\n", a ); - fprintf(DbgOut, "}\n" ); - } - C.v = LD_PS1(a); - if (DbgOut) { - fprintf(DbgOut, "LD_PS1(a) => C) => {\n" ); - fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); - fprintf(DbgOut, "}\n" ); - } - PFFFT_ASSERT4( C, 42.0F, 42.0F, 42.0F, 42.0F, "LD_PS1() Out C" ); - } - - { - v4sf_union C; - float a[16]; - int numAligned = 0, numUnaligned = 0; - int k; - const char * pUn; - for ( k = 0; k < 16; ++k ) a[k] = k+1; - - for ( k = 0; k + 3 < 16; ++k ) - { - const float * ptr = &a[k]; - if (DbgOut) - fprintf(DbgOut, "\ninput: a = [ %f, %f, %f, %f ]\n", ptr[0], ptr[1], ptr[2], ptr[3] ); - if ( VALIGNED(ptr) ) - { - C.v = VLOAD_ALIGNED( ptr ); - pUn = ""; - ++numAligned; - } - else - { - C.v = VLOAD_UNALIGNED( ptr ); - pUn = "UN"; - ++numUnaligned; - } - if (DbgOut) { - fprintf(DbgOut, "C = VLOAD_%sALIGNED(&a[%d]) => {\n", pUn, k ); - fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); - fprintf(DbgOut, "}\n" ); - } - //PFFFT_ASSERT4( C, 32.0F, 34.0F, 36.0F, 38.0F, "VADD(): Out C" ); - - if ( numAligned >= 1 && numUnaligned >= 4 ) - break; - } - if ( numAligned < 1 ) { - fprintf(stderr, "VALIGNED() should have found at least 1 occurence!"); - ++numErrs; - } - if ( numUnaligned < 4 ) { - fprintf(stderr, "!VALIGNED() should have found at least 4 occurences!"); - ++numErrs; - } - } - - { - v4sf_union A, B, C; - int k; - for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1; - for ( k = 0; k < 4; ++k ) B.f[k] = 20 + k+1; - for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; - - if (DbgOut) { - fprintf(DbgOut, "\ninput: A,B = {\n" ); - fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); - fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] ); - fprintf(DbgOut, "}\n" ); - } - C.v = VADD(A.v, B.v); - if (DbgOut) { - fprintf(DbgOut, "C = VADD(A,B) => {\n" ); - fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); - fprintf(DbgOut, "}\n" ); - } - PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VADD(): Inp A" ); - PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "VADD(): Inp B" ); - PFFFT_ASSERT4( C, 32.0F, 34.0F, 36.0F, 38.0F, "VADD(): Out C" ); - } - - { - v4sf_union A, B, C; - int k; - for ( k = 0; k < 4; ++k ) A.f[k] = 20 + 2*k+1; - for ( k = 0; k < 4; ++k ) B.f[k] = 10 + k+1; - for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; - - if (DbgOut) { - fprintf(DbgOut, "\ninput: A,B = {\n" ); - fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); - fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] ); - fprintf(DbgOut, "}\n" ); - } - C.v = VSUB(A.v, B.v); - if (DbgOut) { - fprintf(DbgOut, "C = VSUB(A,B) => {\n" ); - fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); - fprintf(DbgOut, "}\n" ); - } - PFFFT_ASSERT4( A, 21.0F, 23.0F, 25.0F, 27.0F, "VSUB(): Inp A" ); - PFFFT_ASSERT4( B, 11.0F, 12.0F, 13.0F, 14.0F, "VSUB(): Inp B" ); - PFFFT_ASSERT4( C, 10.0F, 11.0F, 12.0F, 13.0F, "VSUB(): Out C" ); - } - - { - v4sf_union A, B, C; - int k; - for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1; - for ( k = 0; k < 4; ++k ) B.f[k] = k+1; - for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; - - if (DbgOut) { - fprintf(DbgOut, "\ninput: A,B = {\n" ); - fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); - fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] ); - fprintf(DbgOut, "}\n" ); - } - C.v = VMUL(A.v, B.v); - if (DbgOut) { - fprintf(DbgOut, "C = VMUL(A,B) => {\n" ); - fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); - fprintf(DbgOut, "}\n" ); - } - PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VMUL(): Inp A" ); - PFFFT_ASSERT4( B, 1.0F, 2.0F, 3.0F, 4.0F, "VMUL(): Inp B" ); - PFFFT_ASSERT4( C, 11.0F, 24.0F, 39.0F, 56.0F, "VMUL(): Out C" ); - } - - { - v4sf_union A, B, C, D; - int k; - for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1; - for ( k = 0; k < 4; ++k ) B.f[k] = k+1; - for ( k = 0; k < 4; ++k ) C.f[k] = 10 + k; - for ( k = 0; k < 4; ++k ) D.f[k] = 40 + k+1; - - if (DbgOut) { - fprintf(DbgOut, "\ninput: A,B,C = {\n" ); - fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); - fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] ); - fprintf(DbgOut, " Inp C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); - fprintf(DbgOut, "}\n" ); - } - D.v = VMADD(A.v, B.v, C.v); - if (DbgOut) { - fprintf(DbgOut, "D = VMADD(A,B,C) => {\n" ); - fprintf(DbgOut, " Out D: %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] ); - fprintf(DbgOut, "}\n" ); - } - PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VMADD(): Inp A" ); - PFFFT_ASSERT4( B, 1.0F, 2.0F, 3.0F, 4.0F, "VMADD(): Inp B" ); - PFFFT_ASSERT4( C, 10.0F, 11.0F, 12.0F, 13.0F, "VMADD(): Inp C" ); - PFFFT_ASSERT4( D, 21.0F, 35.0F, 51.0F, 69.0F, "VMADD(): Out D" ); - } - - { - v4sf_union A, B, C, D; - int k; - for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1; - for ( k = 0; k < 4; ++k ) B.f[k] = 20 + k+1; - for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; - for ( k = 0; k < 4; ++k ) D.f[k] = 40 + k+1; - - if (DbgOut) { - fprintf(DbgOut, "\ninput: A,B = {\n" ); - fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); - fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] ); - fprintf(DbgOut, "}\n" ); - } - INTERLEAVE2(A.v, B.v, C.v, D.v); - if (DbgOut) { - fprintf(DbgOut, "INTERLEAVE2(A,B, => C,D) => {\n" ); - fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); - fprintf(DbgOut, " Out D: %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] ); - fprintf(DbgOut, "}\n" ); - } - PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "INTERLEAVE2() Inp A" ); - PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "INTERLEAVE2() Inp B" ); - PFFFT_ASSERT4( C, 11.0F, 21.0F, 12.0F, 22.0F, "INTERLEAVE2() Out C" ); - PFFFT_ASSERT4( D, 13.0F, 23.0F, 14.0F, 24.0F, "INTERLEAVE2() Out D" ); - } - - { - v4sf_union A, B, C, D; - int k; - for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1; - for ( k = 0; k < 4; ++k ) B.f[k] = 20 + k+1; - for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; - for ( k = 0; k < 4; ++k ) D.f[k] = 40 + k+1; - - if (DbgOut) { - fprintf(DbgOut, "\ninput: A,B = {\n" ); - fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); - fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] ); - fprintf(DbgOut, "}\n" ); - } - UNINTERLEAVE2(A.v, B.v, C.v, D.v); - if (DbgOut) { - fprintf(DbgOut, "UNINTERLEAVE2(A,B, => C,D) => {\n" ); - fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); - fprintf(DbgOut, " Out D: %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] ); - fprintf(DbgOut, "}\n" ); - } - PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "UNINTERLEAVE2() Inp A" ); - PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "UNINTERLEAVE2() Inp B" ); - PFFFT_ASSERT4( C, 11.0F, 13.0F, 21.0F, 23.0F, "UNINTERLEAVE2() Out C" ); - PFFFT_ASSERT4( D, 12.0F, 14.0F, 22.0F, 24.0F, "UNINTERLEAVE2() Out D" ); - } - - { - v4sf_union A, B, C, D; - int k; - for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1; - for ( k = 0; k < 4; ++k ) B.f[k] = 20 + k+1; - for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; - for ( k = 0; k < 4; ++k ) D.f[k] = 40 + k+1; - - if (DbgOut) { - fprintf(DbgOut, "\ninput: A,B,C,D = {\n" ); - fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); - fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] ); - fprintf(DbgOut, " Inp C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); - fprintf(DbgOut, " Inp D: %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] ); - fprintf(DbgOut, "}\n" ); - } - VTRANSPOSE4(A.v, B.v, C.v, D.v); - if (DbgOut) { - fprintf(DbgOut, "VTRANSPOSE4(A,B,C,D) => {\n" ); - fprintf(DbgOut, " Out A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); - fprintf(DbgOut, " Out B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] ); - fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); - fprintf(DbgOut, " Out D: %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] ); - fprintf(DbgOut, "}\n" ); - } - PFFFT_ASSERT4( A, 11.0F, 21.0F, 31.0F, 41.0F, "VTRANSPOSE4(): Out A" ); - PFFFT_ASSERT4( B, 12.0F, 22.0F, 32.0F, 42.0F, "VTRANSPOSE4(): Out B" ); - PFFFT_ASSERT4( C, 13.0F, 23.0F, 33.0F, 43.0F, "VTRANSPOSE4(): Out C" ); - PFFFT_ASSERT4( D, 14.0F, 24.0F, 34.0F, 44.0F, "VTRANSPOSE4(): Out D" ); - } - - { - v4sf_union A, B, C; - int k; - for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1; - for ( k = 0; k < 4; ++k ) B.f[k] = 20 + k+1; - for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; - - if (DbgOut) { - fprintf(DbgOut, "\ninput: A,B = {\n" ); - fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); - fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] ); - fprintf(DbgOut, "}\n" ); - } - C.v = VSWAPHL(A.v, B.v); - if (DbgOut) { - fprintf(DbgOut, "C = VSWAPHL(A,B) => {\n" ); - fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); - fprintf(DbgOut, "}\n" ); - } - PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VSWAPHL(): Inp A" ); - PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "VSWAPHL(): Inp B" ); - PFFFT_ASSERT4( C, 21.0F, 22.0F, 13.0F, 14.0F, "VSWAPHL(): Out C" ); - } - - { - v4sf_union A, C; - int k; - for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1; - for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1; - - if (DbgOut) { - fprintf(DbgOut, "\ninput: A = {\n" ); - fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); - fprintf(DbgOut, "}\n" ); - } - C.v = VREV_S(A.v); - if (DbgOut) { - fprintf(DbgOut, "C = VREV_S(A) => {\n" ); - fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); - fprintf(DbgOut, "}\n" ); - } - PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VREV_S(): Inp A" ); - PFFFT_ASSERT4( C, 14.0F, 13.0F, 12.0F, 11.0F, "VREV_S(): Out C" ); - } - - { - v4sf_union A, C; - int k; - for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1; - - if (DbgOut) { - fprintf(DbgOut, "\ninput: A = {\n" ); - fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] ); - fprintf(DbgOut, "}\n" ); - } - C.v = VREV_C(A.v); - if (DbgOut) { - fprintf(DbgOut, "C = VREV_C(A) => {\n" ); - fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] ); - fprintf(DbgOut, "}\n" ); - } - PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VREV_C(): Inp A" ); - PFFFT_ASSERT4( C, 13.0F, 14.0F, 11.0F, 12.0F, "VREV_C(): Out A" ); - } - - return numErrs; -} - -#else /* if ( SIMD_SZ == 4 ) */ - -void FUNC_VALIDATE_SIMD_A() -{ -} - -int FUNC_VALIDATE_SIMD_EX(FILE * DbgOut) -{ - return -1; -} - -#endif /* end if ( SIMD_SZ == 4 ) */ - diff --git a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_altivec_float.h b/Packages/PFFFT/Sources/PFFFTLib/simd/pf_altivec_float.h deleted file mode 100644 index ef2526d..0000000 --- a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_altivec_float.h +++ /dev/null @@ -1,81 +0,0 @@ - -/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) - - Redistribution and use of the Software in source and binary forms, - with or without modification, is permitted provided that the - following conditions are met: - - - Neither the names of NCAR's Computational and Information Systems - Laboratory, the University Corporation for Atmospheric Research, - nor the names of its sponsors or contributors may be used to - endorse or promote products derived from this Software without - specific prior written permission. - - - Redistributions of source code must retain the above copyright - notices, this list of conditions, and the disclaimer below. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions, and the disclaimer below in the - documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. -*/ - -#ifndef PF_ALTIVEC_FLT_H -#define PF_ALTIVEC_FLT_H - -/* - Altivec support macros -*/ -#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__)) -#pragma message( __FILE__ ": ALTIVEC float macros are defined" ) -typedef vector float v4sf; - -# define SIMD_SZ 4 - -typedef union v4sf_union { - v4sf v; - float f[SIMD_SZ]; -} v4sf_union; - -# define VREQUIRES_ALIGN 1 /* not sure, if really required */ -# define VARCH "ALTIVEC" -# define VZERO() ((vector float) vec_splat_u8(0)) -# define VMUL(a,b) vec_madd(a,b, VZERO()) -# define VADD(a,b) vec_add(a,b) -# define VMADD(a,b,c) vec_madd(a,b,c) -# define VSUB(a,b) vec_sub(a,b) -inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); } -# define LD_PS1(p) ld_ps1(&p) -# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; } -# define UNINTERLEAVE2(in1, in2, out1, out2) { \ - vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \ - vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \ - v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \ - } -# define VTRANSPOSE4(x0,x1,x2,x3) { \ - v4sf y0 = vec_mergeh(x0, x2); \ - v4sf y1 = vec_mergel(x0, x2); \ - v4sf y2 = vec_mergeh(x1, x3); \ - v4sf y3 = vec_mergel(x1, x3); \ - x0 = vec_mergeh(y0, y2); \ - x1 = vec_mergel(y0, y2); \ - x2 = vec_mergeh(y1, y3); \ - x3 = vec_mergel(y1, y3); \ - } -# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15)) -# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0) - -#endif - -#endif /* PF_SSE1_FLT_H */ - diff --git a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_avx_double.h b/Packages/PFFFT/Sources/PFFFTLib/simd/pf_avx_double.h deleted file mode 100644 index fe0efa8..0000000 --- a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_avx_double.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com ) -*/ - -/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) - - Redistribution and use of the Software in source and binary forms, - with or without modification, is permitted provided that the - following conditions are met: - - - Neither the names of NCAR's Computational and Information Systems - Laboratory, the University Corporation for Atmospheric Research, - nor the names of its sponsors or contributors may be used to - endorse or promote products derived from this Software without - specific prior written permission. - - - Redistributions of source code must retain the above copyright - notices, this list of conditions, and the disclaimer below. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions, and the disclaimer below in the - documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. -*/ - -#ifndef PF_AVX_DBL_H -#define PF_AVX_DBL_H - -/* - vector support macros: the rest of the code is independant of - AVX -- adding support for other platforms with 4-element - vectors should be limited to these macros -*/ - - -/* - AVX support macros -*/ -#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && defined(__AVX__) -#pragma message( __FILE__ ": AVX macros are defined" ) - -#include -typedef __m256d v4sf; - -/* 4 doubles by simd vector */ -# define SIMD_SZ 4 - -typedef union v4sf_union { - v4sf v; - double f[SIMD_SZ]; -} v4sf_union; - -# define VARCH "AVX" -# define VREQUIRES_ALIGN 1 -# define VZERO() _mm256_setzero_pd() -# define VMUL(a,b) _mm256_mul_pd(a,b) -# define VADD(a,b) _mm256_add_pd(a,b) -# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c) -# define VSUB(a,b) _mm256_sub_pd(a,b) -# define LD_PS1(p) _mm256_set1_pd(p) -# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr) -# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr) - -/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code: -out1 = [ in1[0], in2[0], in1[1], in2[1] ] -out2 = [ in1[2], in2[2], in1[3], in2[3] ] -*/ -# define INTERLEAVE2(in1, in2, out1, out2) { \ - __m128d low1__ = _mm256_castpd256_pd128(in1); \ - __m128d low2__ = _mm256_castpd256_pd128(in2); \ - __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ - __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ - __m256d tmp__ = _mm256_insertf128_pd( \ - _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \ - _mm_shuffle_pd(low1__, low2__, 3), \ - 1); \ - out2 = _mm256_insertf128_pd( \ - _mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \ - _mm_shuffle_pd(high1__, high2__, 3), \ - 1); \ - out1 = tmp__; \ -} - -/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code: -out1 = [ in1[0], in1[2], in2[0], in2[2] ] -out2 = [ in1[1], in1[3], in2[1], in2[3] ] -*/ -# define UNINTERLEAVE2(in1, in2, out1, out2) { \ - __m128d low1__ = _mm256_castpd256_pd128(in1); \ - __m128d low2__ = _mm256_castpd256_pd128(in2); \ - __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ - __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ - __m256d tmp__ = _mm256_insertf128_pd( \ - _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \ - _mm_shuffle_pd(low2__, high2__, 0), \ - 1); \ - out2 = _mm256_insertf128_pd( \ - _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \ - _mm_shuffle_pd(low2__, high2__, 3), \ - 1); \ - out1 = tmp__; \ -} - -# define VTRANSPOSE4(row0, row1, row2, row3) { \ - __m256d tmp3, tmp2, tmp1, tmp0; \ - \ - tmp0 = _mm256_shuffle_pd((row0),(row1), 0x0); \ - tmp2 = _mm256_shuffle_pd((row0),(row1), 0xF); \ - tmp1 = _mm256_shuffle_pd((row2),(row3), 0x0); \ - tmp3 = _mm256_shuffle_pd((row2),(row3), 0xF); \ - \ - (row0) = _mm256_permute2f128_pd(tmp0, tmp1, 0x20); \ - (row1) = _mm256_permute2f128_pd(tmp2, tmp3, 0x20); \ - (row2) = _mm256_permute2f128_pd(tmp0, tmp1, 0x31); \ - (row3) = _mm256_permute2f128_pd(tmp2, tmp3, 0x31); \ - } - -/*VSWAPHL(a, b) pseudo code: -return [ b[0], b[1], a[2], a[3] ] -*/ -# define VSWAPHL(a,b) \ - _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1) - -/* reverse/flip all floats */ -# define VREV_S(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_permute_pd(_mm256_extractf128_pd(a, 1),1)), _mm_permute_pd(_mm256_castpd256_pd128(a), 1), 1) - -/* reverse/flip complex floats */ -# define VREV_C(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1) - -# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0) - -#endif - -#endif /* PF_AVX_DBL_H */ - diff --git a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_double.h b/Packages/PFFFT/Sources/PFFFTLib/simd/pf_double.h deleted file mode 100644 index 1025827..0000000 --- a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_double.h +++ /dev/null @@ -1,84 +0,0 @@ - -/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) - - Redistribution and use of the Software in source and binary forms, - with or without modification, is permitted provided that the - following conditions are met: - - - Neither the names of NCAR's Computational and Information Systems - Laboratory, the University Corporation for Atmospheric Research, - nor the names of its sponsors or contributors may be used to - endorse or promote products derived from this Software without - specific prior written permission. - - - Redistributions of source code must retain the above copyright - notices, this list of conditions, and the disclaimer below. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions, and the disclaimer below in the - documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. -*/ - -#ifndef PF_DBL_H -#define PF_DBL_H - -#include -#include -#include - - -/* - * SIMD reference material: - * - * general SIMD introduction: - * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing - * - * SSE 1: - * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ - * - * ARM NEON: - * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics - * - * Altivec: - * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf - * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html - * better one? - * - */ - -typedef double vsfscalar; - -#include "pf_avx_double.h" -#include "pf_sse2_double.h" -#include "pf_neon_double.h" - -#ifndef SIMD_SZ -# if !defined(PFFFT_SIMD_DISABLE) -# pragma message( "building double with simd disabled !" ) -# define PFFFT_SIMD_DISABLE /* fallback to scalar code */ -# endif -#endif - -#include "pf_scalar_double.h" - -/* shortcuts for complex multiplcations */ -#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); } -#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); } -#ifndef SVMUL -/* multiply a scalar with a vector */ -#define SVMUL(f,v) VMUL(LD_PS1(f),v) -#endif - -#endif /* PF_DBL_H */ - diff --git a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_float.h b/Packages/PFFFT/Sources/PFFFTLib/simd/pf_float.h deleted file mode 100644 index eab2723..0000000 --- a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_float.h +++ /dev/null @@ -1,84 +0,0 @@ - -/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) - - Redistribution and use of the Software in source and binary forms, - with or without modification, is permitted provided that the - following conditions are met: - - - Neither the names of NCAR's Computational and Information Systems - Laboratory, the University Corporation for Atmospheric Research, - nor the names of its sponsors or contributors may be used to - endorse or promote products derived from this Software without - specific prior written permission. - - - Redistributions of source code must retain the above copyright - notices, this list of conditions, and the disclaimer below. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions, and the disclaimer below in the - documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. -*/ - -#ifndef PF_FLT_H -#define PF_FLT_H - -#include -#include -#include - - -/* - * SIMD reference material: - * - * general SIMD introduction: - * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing - * - * SSE 1: - * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ - * - * ARM NEON: - * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics - * - * Altivec: - * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf - * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html - * better one? - * - */ - -typedef float vsfscalar; - -#include "pf_sse1_float.h" -#include "pf_neon_float.h" -#include "pf_altivec_float.h" - -#ifndef SIMD_SZ -# if !defined(PFFFT_SIMD_DISABLE) -# pragma message( "building float with simd disabled !" ) -# define PFFFT_SIMD_DISABLE /* fallback to scalar code */ -# endif -#endif - -#include "pf_scalar_float.h" - -/* shortcuts for complex multiplcations */ -#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); } -#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); } -#ifndef SVMUL -/* multiply a scalar with a vector */ -#define SVMUL(f,v) VMUL(LD_PS1(f),v) -#endif - -#endif /* PF_FLT_H */ - diff --git a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_neon_double.h b/Packages/PFFFT/Sources/PFFFTLib/simd/pf_neon_double.h deleted file mode 100644 index e432abc..0000000 --- a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_neon_double.h +++ /dev/null @@ -1,203 +0,0 @@ -/* - Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com ) -*/ - -/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) - - Redistribution and use of the Software in source and binary forms, - with or without modification, is permitted provided that the - following conditions are met: - - - Neither the names of NCAR's Computational and Information Systems - Laboratory, the University Corporation for Atmospheric Research, - nor the names of its sponsors or contributors may be used to - endorse or promote products derived from this Software without - specific prior written permission. - - - Redistributions of source code must retain the above copyright - notices, this list of conditions, and the disclaimer below. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions, and the disclaimer below in the - documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. -*/ - -#ifndef PF_NEON_DBL_H -#define PF_NEON_DBL_H - -/* - NEON 64bit support macros -*/ -#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__)) - -#pragma message (__FILE__ ": NEON (from AVX) macros are defined" ) - -#include "pf_neon_double_from_avx.h" -typedef __m256d v4sf; - -/* 4 doubles by simd vector */ -# define SIMD_SZ 4 - -typedef union v4sf_union { - v4sf v; - double f[SIMD_SZ]; -} v4sf_union; - -# define VARCH "NEON" -# define VREQUIRES_ALIGN 1 -# define VZERO() _mm256_setzero_pd() -# define VMUL(a,b) _mm256_mul_pd(a,b) -# define VADD(a,b) _mm256_add_pd(a,b) -# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c) -# define VSUB(a,b) _mm256_sub_pd(a,b) -# define LD_PS1(p) _mm256_set1_pd(p) -# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr) -# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr) - -FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b) -{ - __m256d res; - res.vect_f64[0] = a.vect_f64[0]; - res.vect_f64[1] = b; - return res; -} - -FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b) -{ - float64x1_t al = vget_low_f64(a); - float64x1_t bl = vget_low_f64(b); - return vcombine_f64(al, bl); -} - -FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b) -{ - float64x1_t ah = vget_high_f64(a); - float64x1_t bh = vget_high_f64(b); - return vcombine_f64(ah, bh); -} - -FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b) -{ - __m256d res; - res.vect_f64[0] = _mm_shuffle_pd_00(a.vect_f64[0],b.vect_f64[0]); - res.vect_f64[1] = _mm_shuffle_pd_00(a.vect_f64[1],b.vect_f64[1]); - return res; -} - -FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b) -{ - __m256d res; - res.vect_f64[0] = _mm_shuffle_pd_11(a.vect_f64[0],b.vect_f64[0]); - res.vect_f64[1] = _mm_shuffle_pd_11(a.vect_f64[1],b.vect_f64[1]); - return res; -} - -FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b) { - __m256d res; - res.vect_f64[0] = a.vect_f64[0]; - res.vect_f64[1] = b.vect_f64[0]; - return res; -} - - -FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b) -{ - __m256d res; - res.vect_f64[0] = a.vect_f64[1]; - res.vect_f64[1] = b.vect_f64[1]; - return res; -} - -FORCE_INLINE __m256d _mm256_reverse(__m256d x) -{ - __m256d res; - float64x2_t low = x.vect_f64[0]; - float64x2_t high = x.vect_f64[1]; - float64x1_t a = vget_low_f64(low); - float64x1_t b = vget_high_f64(low); - float64x1_t c = vget_low_f64(high); - float64x1_t d = vget_high_f64(high); - res.vect_f64[0] = vcombine_f64(d, c); - res.vect_f64[1] = vcombine_f64(b, a); - return res; -} - -/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code: -out1 = [ in1[0], in2[0], in1[1], in2[1] ] -out2 = [ in1[2], in2[2], in1[3], in2[3] ] -*/ -# define INTERLEAVE2(in1, in2, out1, out2) { \ - __m128d low1__ = _mm256_castpd256_pd128(in1); \ - __m128d low2__ = _mm256_castpd256_pd128(in2); \ - __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ - __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ - __m256d tmp__ = _mm256_insertf128_pd_1( \ - _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)), \ - _mm_shuffle_pd_11(low1__, low2__)); \ - out2 = _mm256_insertf128_pd_1( \ - _mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)), \ - _mm_shuffle_pd_11(high1__, high2__)); \ - out1 = tmp__; \ -} - -/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code: -out1 = [ in1[0], in1[2], in2[0], in2[2] ] -out2 = [ in1[1], in1[3], in2[1], in2[3] ] -*/ -# define UNINTERLEAVE2(in1, in2, out1, out2) { \ - __m128d low1__ = _mm256_castpd256_pd128(in1); \ - __m128d low2__ = _mm256_castpd256_pd128(in2); \ - __m128d high1__ = _mm256_extractf128_pd(in1, 1); \ - __m128d high2__ = _mm256_extractf128_pd(in2, 1); \ - __m256d tmp__ = _mm256_insertf128_pd_1( \ - _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)), \ - _mm_shuffle_pd_00(low2__, high2__)); \ - out2 = _mm256_insertf128_pd_1( \ - _mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)), \ - _mm_shuffle_pd_11(low2__, high2__)); \ - out1 = tmp__; \ -} - -# define VTRANSPOSE4(row0, row1, row2, row3) { \ - __m256d tmp3, tmp2, tmp1, tmp0; \ - \ - tmp0 = _mm256_shuffle_pd_00((row0),(row1)); \ - tmp2 = _mm256_shuffle_pd_11((row0),(row1)); \ - tmp1 = _mm256_shuffle_pd_00((row2),(row3)); \ - tmp3 = _mm256_shuffle_pd_11((row2),(row3)); \ - \ - (row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1); \ - (row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3); \ - (row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1); \ - (row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3); \ - } - -/*VSWAPHL(a, b) pseudo code: -return [ b[0], b[1], a[2], a[3] ] -*/ -# define VSWAPHL(a,b) \ - _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1)) - -/* reverse/flip all floats */ -# define VREV_S(a) _mm256_reverse(a) - -/* reverse/flip complex floats */ -# define VREV_C(a) _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a)) - -# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0) - -#endif - -#endif /* PF_AVX_DBL_H */ - diff --git a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_neon_double_from_avx.h b/Packages/PFFFT/Sources/PFFFTLib/simd/pf_neon_double_from_avx.h deleted file mode 100644 index 5cce17e..0000000 --- a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_neon_double_from_avx.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. - - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - - * http://www.apache.org/licenses/LICENSE-2.0 - - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - - */ - -//see https://github.com/kunpengcompute/AvxToNeon - -#ifndef PF_NEON_DBL_FROM_AVX_H -#define PF_NEON_DBL_FROM_AVX_H -#include - - -#if defined(__GNUC__) || defined(__clang__) - -#pragma push_macro("FORCE_INLINE") -#define FORCE_INLINE static inline __attribute__((always_inline)) - -#else - -#error "Macro name collisions may happens with unknown compiler" -#ifdef FORCE_INLINE -#undef FORCE_INLINE -#endif - -#define FORCE_INLINE static inline - -#endif - -typedef struct { - float32x4_t vect_f32[2]; -} __m256; - -typedef struct { - float64x2_t vect_f64[2]; -} __m256d; - -typedef float64x2_t __m128d; - -FORCE_INLINE __m256d _mm256_setzero_pd(void) -{ - __m256d ret; - ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0); - return ret; -} - -FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b) -{ - __m256d res_m256d; - res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]); - res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]); - return res_m256d; -} - -FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b) -{ - __m256d res_m256d; - res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]); - res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]); - return res_m256d; -} - -FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b) -{ - __m256d res_m256d; - res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]); - res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]); - return res_m256d; -} - -FORCE_INLINE __m256d _mm256_set1_pd(double a) -{ - __m256d ret; - ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a); - return ret; -} - -FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr) -{ - __m256d res; - res.vect_f64[0] = vld1q_f64((const double *)mem_addr); - res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2); - return res; -} -FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr) -{ - __m256d res; - res.vect_f64[0] = vld1q_f64((const double *)mem_addr); - res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2); - return res; -} - -FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a) -{ - return a.vect_f64[0]; -} - -FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8) -{ - assert(imm8 >= 0 && imm8 <= 1); - return a.vect_f64[imm8]; -} - -FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a) -{ - __m256d res; - res.vect_f64[0] = a; - return res; -} - -#endif /* PF_AVX_DBL_H */ - diff --git a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_neon_float.h b/Packages/PFFFT/Sources/PFFFTLib/simd/pf_neon_float.h deleted file mode 100644 index c7a547f..0000000 --- a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_neon_float.h +++ /dev/null @@ -1,87 +0,0 @@ - -/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) - - Redistribution and use of the Software in source and binary forms, - with or without modification, is permitted provided that the - following conditions are met: - - - Neither the names of NCAR's Computational and Information Systems - Laboratory, the University Corporation for Atmospheric Research, - nor the names of its sponsors or contributors may be used to - endorse or promote products derived from this Software without - specific prior written permission. - - - Redistributions of source code must retain the above copyright - notices, this list of conditions, and the disclaimer below. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions, and the disclaimer below in the - documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. -*/ - -#ifndef PF_NEON_FLT_H -#define PF_NEON_FLT_H - -/* - ARM NEON support macros -*/ -#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__)) -#pragma message( __FILE__ ": ARM NEON macros are defined" ) - -# include -typedef float32x4_t v4sf; - -# define SIMD_SZ 4 - -typedef union v4sf_union { - v4sf v; - float f[SIMD_SZ]; -} v4sf_union; - -# define VARCH "NEON" -# define VREQUIRES_ALIGN 0 /* usually no alignment required */ -# define VZERO() vdupq_n_f32(0) -# define VMUL(a,b) vmulq_f32(a,b) -# define VADD(a,b) vaddq_f32(a,b) -# define VMADD(a,b,c) vmlaq_f32(c,a,b) -# define VSUB(a,b) vsubq_f32(a,b) -# define LD_PS1(p) vld1q_dup_f32(&(p)) -# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr))) -# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr))) -# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } -# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } -# define VTRANSPOSE4(x0,x1,x2,x3) { \ - float32x4x2_t t0_ = vzipq_f32(x0, x2); \ - float32x4x2_t t1_ = vzipq_f32(x1, x3); \ - float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \ - float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \ - x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \ - } -// marginally faster version -//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } -# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a)) - -/* reverse/flip all floats */ -# define VREV_S(a) vcombine_f32(vrev64_f32(vget_high_f32(a)), vrev64_f32(vget_low_f32(a))) -/* reverse/flip complex floats */ -# define VREV_C(a) vextq_f32(a, a, 2) - -# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0) - -#else -/* #pragma message( __FILE__ ": ARM NEON macros are not defined" ) */ -#endif - -#endif /* PF_NEON_FLT_H */ - diff --git a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_scalar_double.h b/Packages/PFFFT/Sources/PFFFTLib/simd/pf_scalar_double.h deleted file mode 100644 index b7a1cae..0000000 --- a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_scalar_double.h +++ /dev/null @@ -1,185 +0,0 @@ - -/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) - Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de ) - - Redistribution and use of the Software in source and binary forms, - with or without modification, is permitted provided that the - following conditions are met: - - - Neither the names of NCAR's Computational and Information Systems - Laboratory, the University Corporation for Atmospheric Research, - nor the names of its sponsors or contributors may be used to - endorse or promote products derived from this Software without - specific prior written permission. - - - Redistributions of source code must retain the above copyright - notices, this list of conditions, and the disclaimer below. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions, and the disclaimer below in the - documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. -*/ - -#ifndef PF_SCAL_DBL_H -#define PF_SCAL_DBL_H - -/* - fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead -*/ - -#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED) -#pragma message( __FILE__ ": double SCALAR4 macros are defined" ) - -typedef struct { - vsfscalar a; - vsfscalar b; - vsfscalar c; - vsfscalar d; -} v4sf; - -# define SIMD_SZ 4 - -typedef union v4sf_union { - v4sf v; - vsfscalar f[SIMD_SZ]; -} v4sf_union; - -# define VARCH "4xScalar" -# define VREQUIRES_ALIGN 0 - - static ALWAYS_INLINE(v4sf) VZERO() { - v4sf r = { 0.f, 0.f, 0.f, 0.f }; - return r; - } - - static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) { - v4sf r = { A.a * B.a, A.b * B.b, A.c * B.c, A.d * B.d }; - return r; - } - - static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) { - v4sf r = { A.a + B.a, A.b + B.b, A.c + B.c, A.d + B.d }; - return r; - } - - static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) { - v4sf r = { A.a * B.a + C.a, A.b * B.b + C.b, A.c * B.c + C.c, A.d * B.d + C.d }; - return r; - } - - static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) { - v4sf r = { A.a - B.a, A.b - B.b, A.c - B.c, A.d - B.d }; - return r; - } - - static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) { - v4sf r = { v, v, v, v }; - return r; - } - -# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr))) - -# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr))) - -# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0) - - - /* INTERLEAVE2() */ - #define INTERLEAVE2( A, B, C, D) \ - do { \ - v4sf Cr = { A.a, B.a, A.b, B.b }; \ - v4sf Dr = { A.c, B.c, A.d, B.d }; \ - C = Cr; \ - D = Dr; \ - } while (0) - - - /* UNINTERLEAVE2() */ - #define UNINTERLEAVE2(A, B, C, D) \ - do { \ - v4sf Cr = { A.a, A.c, B.a, B.c }; \ - v4sf Dr = { A.b, A.d, B.b, B.d }; \ - C = Cr; \ - D = Dr; \ - } while (0) - - - /* VTRANSPOSE4() */ - #define VTRANSPOSE4(A, B, C, D) \ - do { \ - v4sf Ar = { A.a, B.a, C.a, D.a }; \ - v4sf Br = { A.b, B.b, C.b, D.b }; \ - v4sf Cr = { A.c, B.c, C.c, D.c }; \ - v4sf Dr = { A.d, B.d, C.d, D.d }; \ - A = Ar; \ - B = Br; \ - C = Cr; \ - D = Dr; \ - } while (0) - - - /* VSWAPHL() */ - static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) { - v4sf r = { B.a, B.b, A.c, A.d }; - return r; - } - - - /* reverse/flip all floats */ - static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) { - v4sf r = { A.d, A.c, A.b, A.a }; - return r; - } - - /* reverse/flip complex floats */ - static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) { - v4sf r = { A.c, A.d, A.a, A.b }; - return r; - } - -#else -/* #pragma message( __FILE__ ": double SCALAR4 macros are not defined" ) */ -#endif - - -#if !defined(SIMD_SZ) -#pragma message( __FILE__ ": float SCALAR1 macros are defined" ) -typedef vsfscalar v4sf; - -# define SIMD_SZ 1 - -typedef union v4sf_union { - v4sf v; - vsfscalar f[SIMD_SZ]; -} v4sf_union; - -# define VARCH "Scalar" -# define VREQUIRES_ALIGN 0 -# define VZERO() 0.0 -# define VMUL(a,b) ((a)*(b)) -# define VADD(a,b) ((a)+(b)) -# define VMADD(a,b,c) ((a)*(b)+(c)) -# define VSUB(a,b) ((a)-(b)) -# define LD_PS1(p) (p) -# define VLOAD_UNALIGNED(ptr) (*(ptr)) -# define VLOAD_ALIGNED(ptr) (*(ptr)) -# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0) - -#else -/* #pragma message( __FILE__ ": double SCALAR1 macros are not defined" ) */ -#endif - - -#endif /* PF_SCAL_DBL_H */ - diff --git a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_scalar_float.h b/Packages/PFFFT/Sources/PFFFTLib/simd/pf_scalar_float.h deleted file mode 100644 index 4588588..0000000 --- a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_scalar_float.h +++ /dev/null @@ -1,185 +0,0 @@ - -/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) - Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de ) - - Redistribution and use of the Software in source and binary forms, - with or without modification, is permitted provided that the - following conditions are met: - - - Neither the names of NCAR's Computational and Information Systems - Laboratory, the University Corporation for Atmospheric Research, - nor the names of its sponsors or contributors may be used to - endorse or promote products derived from this Software without - specific prior written permission. - - - Redistributions of source code must retain the above copyright - notices, this list of conditions, and the disclaimer below. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions, and the disclaimer below in the - documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. -*/ - -#ifndef PF_SCAL_FLT_H -#define PF_SCAL_FLT_H - -/* - fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead -*/ - -#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED) -#pragma message( __FILE__ ": float SCALAR4 macros are defined" ) - -typedef struct { - vsfscalar a; - vsfscalar b; - vsfscalar c; - vsfscalar d; -} v4sf; - -# define SIMD_SZ 4 - -typedef union v4sf_union { - v4sf v; - vsfscalar f[SIMD_SZ]; -} v4sf_union; - -# define VARCH "4xScalar" -# define VREQUIRES_ALIGN 0 - - static ALWAYS_INLINE(v4sf) VZERO() { - v4sf r = { 0.f, 0.f, 0.f, 0.f }; - return r; - } - - static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) { - v4sf r = { A.a * B.a, A.b * B.b, A.c * B.c, A.d * B.d }; - return r; - } - - static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) { - v4sf r = { A.a + B.a, A.b + B.b, A.c + B.c, A.d + B.d }; - return r; - } - - static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) { - v4sf r = { A.a * B.a + C.a, A.b * B.b + C.b, A.c * B.c + C.c, A.d * B.d + C.d }; - return r; - } - - static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) { - v4sf r = { A.a - B.a, A.b - B.b, A.c - B.c, A.d - B.d }; - return r; - } - - static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) { - v4sf r = { v, v, v, v }; - return r; - } - -# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr))) - -# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr))) - -# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0) - - - /* INTERLEAVE2() */ - #define INTERLEAVE2( A, B, C, D) \ - do { \ - v4sf Cr = { A.a, B.a, A.b, B.b }; \ - v4sf Dr = { A.c, B.c, A.d, B.d }; \ - C = Cr; \ - D = Dr; \ - } while (0) - - - /* UNINTERLEAVE2() */ - #define UNINTERLEAVE2(A, B, C, D) \ - do { \ - v4sf Cr = { A.a, A.c, B.a, B.c }; \ - v4sf Dr = { A.b, A.d, B.b, B.d }; \ - C = Cr; \ - D = Dr; \ - } while (0) - - - /* VTRANSPOSE4() */ - #define VTRANSPOSE4(A, B, C, D) \ - do { \ - v4sf Ar = { A.a, B.a, C.a, D.a }; \ - v4sf Br = { A.b, B.b, C.b, D.b }; \ - v4sf Cr = { A.c, B.c, C.c, D.c }; \ - v4sf Dr = { A.d, B.d, C.d, D.d }; \ - A = Ar; \ - B = Br; \ - C = Cr; \ - D = Dr; \ - } while (0) - - - /* VSWAPHL() */ - static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) { - v4sf r = { B.a, B.b, A.c, A.d }; - return r; - } - - - /* reverse/flip all floats */ - static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) { - v4sf r = { A.d, A.c, A.b, A.a }; - return r; - } - - /* reverse/flip complex floats */ - static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) { - v4sf r = { A.c, A.d, A.a, A.b }; - return r; - } - -#else -/* #pragma message( __FILE__ ": float SCALAR4 macros are not defined" ) */ -#endif - - -#if !defined(SIMD_SZ) -#pragma message( __FILE__ ": float SCALAR1 macros are defined" ) -typedef vsfscalar v4sf; - -# define SIMD_SZ 1 - -typedef union v4sf_union { - v4sf v; - vsfscalar f[SIMD_SZ]; -} v4sf_union; - -# define VARCH "Scalar" -# define VREQUIRES_ALIGN 0 -# define VZERO() 0.f -# define VMUL(a,b) ((a)*(b)) -# define VADD(a,b) ((a)+(b)) -# define VMADD(a,b,c) ((a)*(b)+(c)) -# define VSUB(a,b) ((a)-(b)) -# define LD_PS1(p) (p) -# define VLOAD_UNALIGNED(ptr) (*(ptr)) -# define VLOAD_ALIGNED(ptr) (*(ptr)) -# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0) - -#else -/* #pragma message( __FILE__ ": float SCALAR1 macros are not defined" ) */ -#endif - - -#endif /* PF_SCAL_FLT_H */ - diff --git a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_sse1_float.h b/Packages/PFFFT/Sources/PFFFTLib/simd/pf_sse1_float.h deleted file mode 100644 index df73c2e..0000000 --- a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_sse1_float.h +++ /dev/null @@ -1,82 +0,0 @@ - -/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) - - Redistribution and use of the Software in source and binary forms, - with or without modification, is permitted provided that the - following conditions are met: - - - Neither the names of NCAR's Computational and Information Systems - Laboratory, the University Corporation for Atmospheric Research, - nor the names of its sponsors or contributors may be used to - endorse or promote products derived from this Software without - specific prior written permission. - - - Redistributions of source code must retain the above copyright - notices, this list of conditions, and the disclaimer below. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions, and the disclaimer below in the - documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. -*/ - -#ifndef PF_SSE1_FLT_H -#define PF_SSE1_FLT_H - -/* - SSE1 support macros -*/ -#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86)) -#pragma message( __FILE__ ": SSE1 float macros are defined" ) - -#include -typedef __m128 v4sf; - -/* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions - * anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */ -# define SIMD_SZ 4 - -typedef union v4sf_union { - v4sf v; - float f[SIMD_SZ]; -} v4sf_union; - -# define VARCH "SSE1" -# define VREQUIRES_ALIGN 1 -# define VZERO() _mm_setzero_ps() -# define VMUL(a,b) _mm_mul_ps(a,b) -# define VADD(a,b) _mm_add_ps(a,b) -# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c) -# define VSUB(a,b) _mm_sub_ps(a,b) -# define LD_PS1(p) _mm_set1_ps(p) -# define VLOAD_UNALIGNED(ptr) _mm_loadu_ps(ptr) -# define VLOAD_ALIGNED(ptr) _mm_load_ps(ptr) - -# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; } -# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } -# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3) -# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)) - -/* reverse/flip all floats */ -# define VREV_S(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3)) -/* reverse/flip complex floats */ -# define VREV_C(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(1,0,3,2)) - -# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0) - -#else -/* #pragma message( __FILE__ ": SSE1 float macros are not defined" ) */ -#endif - -#endif /* PF_SSE1_FLT_H */ - diff --git a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_sse2_double.h b/Packages/PFFFT/Sources/PFFFTLib/simd/pf_sse2_double.h deleted file mode 100644 index da87951..0000000 --- a/Packages/PFFFT/Sources/PFFFTLib/simd/pf_sse2_double.h +++ /dev/null @@ -1,281 +0,0 @@ -/* - Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com ) -*/ - -/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) - - Redistribution and use of the Software in source and binary forms, - with or without modification, is permitted provided that the - following conditions are met: - - - Neither the names of NCAR's Computational and Information Systems - Laboratory, the University Corporation for Atmospheric Research, - nor the names of its sponsors or contributors may be used to - endorse or promote products derived from this Software without - specific prior written permission. - - - Redistributions of source code must retain the above copyright - notices, this list of conditions, and the disclaimer below. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions, and the disclaimer below in the - documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. -*/ - -#ifndef PF_SSE2_DBL_H -#define PF_SSE2_DBL_H - -//detect sse2 support under MSVC -#if defined ( _M_IX86_FP ) -# if _M_IX86_FP == 2 -# if !defined(__SSE2__) -# define __SSE2__ -# endif -# endif -#endif - -/* - SSE2 64bit support macros -*/ -#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined( __SSE4_2__ ) | defined( __SSE4_1__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || defined ( __x86_64__ ) || defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 )) -#pragma message (__FILE__ ": SSE2 double macros are defined" ) - -#include - -typedef struct { - __m128d d128[2]; -} m256d; - -typedef m256d v4sf; - -# define SIMD_SZ 4 - -typedef union v4sf_union { - v4sf v; - double f[SIMD_SZ]; -} v4sf_union; - - -#if defined(__GNUC__) || defined(__clang__) - -#pragma push_macro("FORCE_INLINE") -#define FORCE_INLINE static inline __attribute__((always_inline)) - -#elif defined (_MSC_VER) -#define FORCE_INLINE static __forceinline - -#else -#error "Macro name collisions may happens with unknown compiler" -#ifdef FORCE_INLINE -#undef FORCE_INLINE -#endif -#define FORCE_INLINE static inline -#endif - -FORCE_INLINE m256d mm256_setzero_pd(void) -{ - m256d ret; - ret.d128[0] = ret.d128[1] = _mm_setzero_pd(); - return ret; -} - -FORCE_INLINE m256d mm256_mul_pd(m256d a, m256d b) -{ - m256d ret; - ret.d128[0] = _mm_mul_pd(a.d128[0], b.d128[0]); - ret.d128[1] = _mm_mul_pd(a.d128[1], b.d128[1]); - return ret; -} - -FORCE_INLINE m256d mm256_add_pd(m256d a, m256d b) -{ - m256d ret; - ret.d128[0] = _mm_add_pd(a.d128[0], b.d128[0]); - ret.d128[1] = _mm_add_pd(a.d128[1], b.d128[1]); - return ret; -} - -FORCE_INLINE m256d mm256_sub_pd(m256d a, m256d b) -{ - m256d ret; - ret.d128[0] = _mm_sub_pd(a.d128[0], b.d128[0]); - ret.d128[1] = _mm_sub_pd(a.d128[1], b.d128[1]); - return ret; -} - -FORCE_INLINE m256d mm256_set1_pd(double a) -{ - m256d ret; - ret.d128[0] = ret.d128[1] = _mm_set1_pd(a); - return ret; -} - -FORCE_INLINE m256d mm256_load_pd (double const * mem_addr) -{ - m256d res; - res.d128[0] = _mm_load_pd((const double *)mem_addr); - res.d128[1] = _mm_load_pd((const double *)mem_addr + 2); - return res; -} -FORCE_INLINE m256d mm256_loadu_pd (double const * mem_addr) -{ - m256d res; - res.d128[0] = _mm_loadu_pd((const double *)mem_addr); - res.d128[1] = _mm_loadu_pd((const double *)mem_addr + 2); - return res; -} - - -# define VARCH "SSE2" -# define VREQUIRES_ALIGN 1 -# define VZERO() mm256_setzero_pd() -# define VMUL(a,b) mm256_mul_pd(a,b) -# define VADD(a,b) mm256_add_pd(a,b) -# define VMADD(a,b,c) mm256_add_pd(mm256_mul_pd(a,b), c) -# define VSUB(a,b) mm256_sub_pd(a,b) -# define LD_PS1(p) mm256_set1_pd(p) -# define VLOAD_UNALIGNED(ptr) mm256_loadu_pd(ptr) -# define VLOAD_ALIGNED(ptr) mm256_load_pd(ptr) - - -FORCE_INLINE __m128d mm256_castpd256_pd128(m256d a) -{ - return a.d128[0]; -} - -FORCE_INLINE __m128d mm256_extractf128_pd (m256d a, const int imm8) -{ - assert(imm8 >= 0 && imm8 <= 1); - return a.d128[imm8]; -} -FORCE_INLINE m256d mm256_insertf128_pd_1(m256d a, __m128d b) -{ - m256d res; - res.d128[0] = a.d128[0]; - res.d128[1] = b; - return res; -} -FORCE_INLINE m256d mm256_castpd128_pd256(__m128d a) -{ - m256d res; - res.d128[0] = a; - return res; -} - -FORCE_INLINE m256d mm256_shuffle_pd_00(m256d a, m256d b) -{ - m256d res; - res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0],0); - res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1],0); - return res; -} - -FORCE_INLINE m256d mm256_shuffle_pd_11(m256d a, m256d b) -{ - m256d res; - res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0], 3); - res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1], 3); - return res; -} - -FORCE_INLINE m256d mm256_permute2f128_pd_0x20(m256d a, m256d b) { - m256d res; - res.d128[0] = a.d128[0]; - res.d128[1] = b.d128[0]; - return res; -} - - -FORCE_INLINE m256d mm256_permute2f128_pd_0x31(m256d a, m256d b) -{ - m256d res; - res.d128[0] = a.d128[1]; - res.d128[1] = b.d128[1]; - return res; -} - -FORCE_INLINE m256d mm256_reverse(m256d x) -{ - m256d res; - res.d128[0] = _mm_shuffle_pd(x.d128[1],x.d128[1],1); - res.d128[1] = _mm_shuffle_pd(x.d128[0],x.d128[0],1); - return res; -} - -/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code: -out1 = [ in1[0], in2[0], in1[1], in2[1] ] -out2 = [ in1[2], in2[2], in1[3], in2[3] ] -*/ -# define INTERLEAVE2(in1, in2, out1, out2) { \ - __m128d low1__ = mm256_castpd256_pd128(in1); \ - __m128d low2__ = mm256_castpd256_pd128(in2); \ - __m128d high1__ = mm256_extractf128_pd(in1, 1); \ - __m128d high2__ = mm256_extractf128_pd(in2, 1); \ - m256d tmp__ = mm256_insertf128_pd_1( \ - mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \ - _mm_shuffle_pd(low1__, low2__, 3)); \ - out2 = mm256_insertf128_pd_1( \ - mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \ - _mm_shuffle_pd(high1__, high2__, 3)); \ - out1 = tmp__; \ -} - -/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code: -out1 = [ in1[0], in1[2], in2[0], in2[2] ] -out2 = [ in1[1], in1[3], in2[1], in2[3] ] -*/ -# define UNINTERLEAVE2(in1, in2, out1, out2) { \ - __m128d low1__ = mm256_castpd256_pd128(in1); \ - __m128d low2__ = mm256_castpd256_pd128(in2); \ - __m128d high1__ = mm256_extractf128_pd(in1, 1); \ - __m128d high2__ = mm256_extractf128_pd(in2, 1); \ - m256d tmp__ = mm256_insertf128_pd_1( \ - mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \ - _mm_shuffle_pd(low2__, high2__, 0)); \ - out2 = mm256_insertf128_pd_1( \ - mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \ - _mm_shuffle_pd(low2__, high2__, 3)); \ - out1 = tmp__; \ -} - -# define VTRANSPOSE4(row0, row1, row2, row3) { \ - m256d tmp3, tmp2, tmp1, tmp0; \ - \ - tmp0 = mm256_shuffle_pd_00((row0),(row1)); \ - tmp2 = mm256_shuffle_pd_11((row0),(row1)); \ - tmp1 = mm256_shuffle_pd_00((row2),(row3)); \ - tmp3 = mm256_shuffle_pd_11((row2),(row3)); \ - \ - (row0) = mm256_permute2f128_pd_0x20(tmp0, tmp1); \ - (row1) = mm256_permute2f128_pd_0x20(tmp2, tmp3); \ - (row2) = mm256_permute2f128_pd_0x31(tmp0, tmp1); \ - (row3) = mm256_permute2f128_pd_0x31(tmp2, tmp3); \ - } - -/*VSWAPHL(a, b) pseudo code: -return [ b[0], b[1], a[2], a[3] ] -*/ -# define VSWAPHL(a,b) \ - mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_castpd256_pd128(b)), mm256_extractf128_pd(a, 1)) - -/* reverse/flip all floats */ -# define VREV_S(a) mm256_reverse(a) - -/* reverse/flip complex floats */ -# define VREV_C(a) mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_extractf128_pd(a, 1)), mm256_castpd256_pd128(a)) - -# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0) - -#endif -#endif diff --git a/Packages/PFFFT/Tests/PFFFTTests/PFFFTTests.swift b/Packages/PFFFT/Tests/PFFFTTests/PFFFTTests.swift deleted file mode 100644 index 5a4021e..0000000 --- a/Packages/PFFFT/Tests/PFFFTTests/PFFFTTests.swift +++ /dev/null @@ -1,15 +0,0 @@ -import Testing -@testable import PFFFT - -@Test func example() async throws { - let cache = PFFFT.SetupCache() - - let ff = try cache.get(for: 1024, type: .real) - - let input = Buffer(capacity: 1023) - let output = Buffer(capacity: 1024) - - ff.fft(input: input, output: output, work: nil, sign: .forward) - - // Write your test here and use APIs like `#expect(...)` to check expected conditions. -} diff --git a/Sources/EcgSynKit/EcgSynKit.swift b/Sources/EcgSynKit/EcgSynKit.swift index 3246e7c..c1834f4 100644 --- a/Sources/EcgSynKit/EcgSynKit.swift +++ b/Sources/EcgSynKit/EcgSynKit.swift @@ -1,26 +1,12 @@ import Algorithms import ComplexModule import Foundation +import OdeInt import PFFFT import PFFFTLib import RealModule -struct Parameters { - /// The number of beats to simulate. - let numBeats: Int = 256 - - /// The ECG sampling frequency in Hz. - let sfEcg: Int = 256 - - /// The internal sampling frequency in Hz. - var sfInternal: Int = 512 - - /// The mean heart rate in beats per minute. - let hrMean: Double = 60.0 - - /// The standard deviation of the heart rate. - let hrStd: Double = 1.0 - +public struct Parameters { /// The ratio of power between low and high frequencies. let lfhfRatio: Double = 0.5 @@ -28,7 +14,7 @@ struct Parameters { let amplitude: Double = 1.4 /// RNG seed value. - let seed: Int = 0 + let seed: UInt64 = 111 /// Amplitude of the noise. let aNoise: Double = 0.0 @@ -55,8 +41,21 @@ struct Parameters { let fhistd = 0.01 } -struct EcgDerive { - let rrpc: [Double] +public struct TimeParameters { + /// The number of beats to simulate. + let numBeats: Int = 8 + + /// The ECG sampling frequency in Hz. + let sfEcg: Int = 256 + + /// The internal sampling frequency in Hz. + var sfInternal: Int = 512 + + /// The mean heart rate in beats per minute. + let hrMean: Double = 75.0 + + /// The standard deviation of the heart rate. + let hrStd: Double = 1.0 } func stdev(_ data: [Double]) -> Double { @@ -65,21 +64,31 @@ func stdev(_ data: [Double]) -> Double { return sqrt(data.lazy.map { ($0 - mean) * ($0 - mean) }.reduce(0.0, +) / (n - 1)) } -struct RRProcess: ~Copyable { +public struct RRProcess: ~Copyable { let nrr: Int + let sfInternal: Int + let fft: FFT let spectrum: Buffer> let signal: Buffer - let fft: FFT - init(nrr: Int) { - self.nrr = nrr + // mean and standard deviation of RR intervals + let rrMean: Double + let rrStd: Double + + public init(params: TimeParameters) { + sfInternal = params.sfInternal + rrMean = 60.0 / params.hrMean + rrStd = 60.0 * params.hrStd / (params.hrMean * params.hrMean) + + nrr = Int(pow(2.0, ceil(log2(Double(params.numBeats * sfInternal) * rrMean)))) + fft = try! FFT(n: nrr) spectrum = fft.makeSpectrumBuffer(extra: 1) signal = fft.makeSignalBuffer() } - func generate(params: Parameters) -> [Double] { + public func generate(params: Parameters) -> [Double] { let w1 = 2.0 * .pi * params.flo let w2 = 2.0 * .pi * params.fhi let c1 = 2.0 * .pi * params.flostd @@ -88,74 +97,159 @@ struct RRProcess: ~Copyable { let sig2 = 1.0 let sig1 = params.lfhfRatio - let rrmean = 60.0 / params.hrMean - let rrstd = 60.0 * params.hrStd / (params.hrMean * params.hrMean) + let sf = Double(sfInternal) - let sf = Double(params.sfInternal) - let df = sf / Double(nrr) + let dw = (sf / Double(nrr)) * 2.0 * .pi - spectrum.withUnsafeMutableBufferPointer { swc in - for i in 0 ..< nrr / 2 + 1 { - let w = df * Double(i) * 2.0 * .pi - let dw1 = w - w1 - let dw2 = w - w2 - let hw = sig1 * exp(-dw1 * dw1 / (2.0 * c1 * c1)) / sqrt(2.0 * .pi * c1 * c1) - + sig2 * exp(-dw2 * dw2 / (2.0 * c2 * c2)) / sqrt(2.0 * .pi * c2 * c2) + var rng = Xoshiro256Plus(seed: params.seed) - let sw = (sf / 2.0) * sqrt(hw) - let ph = 2.0 * .pi * Double.random(in: 0.0 ..< 1.0) + spectrum.mapInPlaceSwapLast { i in + let w = dw * Double(i) - swc[i].real = sw * cos(ph) - swc[i].imaginary = sw * sin(ph) - } + let dw1 = w - w1 + let dw2 = w - w2 + let hw = sig1 * exp(-dw1 * dw1 / (2.0 * c1 * c1)) / sqrt(2.0 * .pi * c1 * c1) + + sig2 * exp(-dw2 * dw2 / (2.0 * c2 * c2)) / sqrt(2.0 * .pi * c2 * c2) - // pack Nyquist frequency real to imaginary of DC - swc[0].imaginary = swc[nrr / 2].real + let sw = (sf / 2.0) * sqrt(hw) + let ph = 2.0 * .pi * rng.nextDouble() + + return Complex(length: sw, phase: ph) } fft.inverse(spectrum: spectrum, signal: signal) - var rr = signal.withUnsafeMutableBufferPointer { outptr in - outptr.map { $0 * 1.0 / Double(nrr) } - } + var rr = signal.map { $0 * 1.0 / Double(nrr) } let xstd = stdev(rr) - let ratio = rrstd / xstd + let ratio = rrStd / xstd for i in 0 ..< nrr { - rr[i] = rr[i] * ratio + rrmean + rr[i] = rr[i] * ratio + rrMean } return rr } } -// func compute(params: Parameters) { -// // adjust extrema parameters for mean heart rate -// let hrFact = sqrt(params.hrMean / 60) -// let hrFact2 = sqrt(hrFact) +public struct Generator: ~Copyable { + let rrp: RRProcess -// let bi = params.b.map { $0 * hrFact } + let hrFact: Double + let hrFactSqrt: Double -// /// XXX: Discrepancy here between Java/C and Matlab, the former uses 1.0 for ti[4] adjustment. -// let ti = zip([hrFact2, hrFact, 1, hrFact, hrFact2], params.theta).map(*) + let dt: Double -// let ai = params.a -// let x0 = SIMD3(1.0, 0.0, 0.04) // XXX: Convert to init from vector3d -// let rseed = params.seed + let timeParams: TimeParameters -// // calculate time scales -// let h = 1.0 / Double(params.sfInternal) -// let tstep = 1.0 / Double(params.sfEcg) + public init(params: TimeParameters) { + rrp = RRProcess(params: params) -// // calculate length of the RR time series -// let rrmean = (60.0 / params.hrMean) + // stretching factors for intervals based on Bazett's formula + hrFact = sqrt(params.hrMean / 60) + hrFactSqrt = sqrt(hrFact) -// let numRr = Int(pow(2.0, ceil(log2(Double(params.numBeats * params.sfInternal) * rrmean)))) + // init time scales + dt = 1.0 / Double(params.sfInternal) -// var rr = [Double](repeating: 0.0, count: numRr) + timeParams = params + } -// // TODO: check sfInternal is integer multple of sfEcg + public func compute(params: Parameters) -> [Double] { + let ai = params.a + let bi = params.b.map { $0 * hrFact } + // adjust extrema parameters for mean heart rate + let ti = zip([hrFactSqrt, hrFact, 1, hrFact, hrFactSqrt], params.theta).map(*) -// // define frequency parameters for rr process -// } + let sfInternal = timeParams.sfInternal + + let rr = rrp.generate(params: params) + + let fhi = params.fhi + + let dt = self.dt + + // generate piecewise RR + let rrpc = [Double](unsafeUninitializedCapacity: rr.count * 2) { rrpc, count in + var tecg = 0.0 + var i = 0 + var j = 0 + while i < rr.count { + tecg += rr[i] + j = Int((tecg / dt).rounded(.toNearestOrEven)) + for k in i ... j { + rrpc[k] = rr[i] + } + i = j + 1 + } + count = i + } + print("rrpc: \(rrpc.count)") + + let nt = rr.count + + let x0 = SIMD3(1.0, 0.0, 0.04) + + var mip = 0 + var mt2 = 0.0 + + let ts = (0 ..< nt).map { Double($0) * dt } + print("ts: \(ts.count) \(ts.last!)") + + let result = SIMD3.integrate(over: ts, y0: x0, tol: 1e-6) { x, t in + let ta = atan2(x[1], x[0]) + + let r0 = 1.0 + let a0 = 1.0 - sqrt(x[0] * x[0] + x[1] * x[1]) / r0 + + let ip = Int(floor(t * Double(sfInternal))) + mip = max(ip, mip) + mt2 = max(t, mt2) + //print("ip: \(ip) mip: \(mip) mt2: \(mt2)") + + let w0 = 2 * .pi / rrpc[ip] + + let zbase = 0.005 * sin(2 * .pi * fhi * t) + + var dxdt = SIMD3() + + dxdt[0] = a0 * x[0] - w0 * x[1] + dxdt[1] = a0 * x[1] + w0 * x[0] + + dxdt[2] = 0.0 + + for i in 0 ..< ti.count { + let dt = remainder(ta - ti[i], 2 * .pi) + let dt² = dt * dt + + dxdt[2] += -ai[i] * dt * exp(-0.5 * dt² / (bi[i] * bi[i])) + } + dxdt[2] += -1.0 * (x[2] - zbase) + + return dxdt + } + + print("mip: \(mip)") + + // downsample to ECG sampling frequency + let qstep = sfInternal / timeParams.sfEcg + var zresult = stride(from: 0, to: nt, by: qstep).map { result[$0][2] } + + let (zmin, zmax) = zresult.minAndMax()! + let zrange = zmax - zmin + + var rng = Xoshiro256Plus(seed: params.seed + 1) + + // Scale signal between -0.4 and 1.2 mV + // add uniformly distributed measurement noise + for i in 0 ..< zresult.count { + zresult[i] = 1.6 * (zresult[i] - zmin) / zrange - 0.4 + zresult[i] += params.aNoise * (2.0 * rng.nextDouble() - 1.0) + } + + // write zresult to text/CSV file + let filename = "ecg.csv" + try! zresult[0 ..< 800].lazy.map { String($0) }.joined(separator: "\n").write(to: URL(fileURLWithPath: filename), atomically: true, encoding: .utf8) + return zresult + } +} diff --git a/Tests/EcgSynKitTests/EcgSynKitTests.swift b/Tests/EcgSynKitTests/EcgSynKitTests.swift index 2df535f..7f85da9 100644 --- a/Tests/EcgSynKitTests/EcgSynKitTests.swift +++ b/Tests/EcgSynKitTests/EcgSynKitTests.swift @@ -1,13 +1,16 @@ import Testing @testable import EcgSynKit +import PFFFT +import ComplexModule @Test func fftTest () { - // Write your test here and use APIs like `#expect(...)` to check expected conditions. + let p0 = TimeParameters() + let c = Generator(params: p0) let p = Parameters() + let z = c.compute(params: p) - - let rrp = RRProcess(nrr: 131072) - let rr = rrp.generate(params: p) + // print("z: \(z)") + print("PFFFT.simdArch: \(FFT>.simdArch)") //fft(data: &a, isign: -1) //print("out: \(rr)")