update
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"originHash" : "bf0b92ec2751ab82eed76f0b5f4c42d45557a303569895153367fd1d19d1045a",
|
||||
"originHash" : "8b424c89cf909d23f0294225dd14ece13680ef4d122b099e92d093e9817a9b21",
|
||||
"pins" : [
|
||||
{
|
||||
"identity" : "swift-algorithms",
|
||||
@@ -18,6 +18,24 @@
|
||||
"revision" : "0a5bc04095a675662cf24757cc0640aa2204253b",
|
||||
"version" : "1.0.2"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "swift-odeint",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/jkl1337/swift-odeint",
|
||||
"state" : {
|
||||
"branch" : "master",
|
||||
"revision" : "6e195efd2276dec7cedd2745be8ed61e0cbaa370"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "swiftpffft",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/jkl1337/SwiftPFFFT",
|
||||
"state" : {
|
||||
"branch" : "master",
|
||||
"revision" : "437f984b112fecbd4303d21f7c934f2085b6d9af"
|
||||
}
|
||||
}
|
||||
],
|
||||
"version" : 3
|
||||
|
||||
@@ -14,14 +14,16 @@ let package = Package(
|
||||
],
|
||||
dependencies: [
|
||||
.package(url: "https://github.com/apple/swift-algorithms", from: "1.2.0"),
|
||||
.package(path: "./Packages/PFFFT/"),
|
||||
.package(url: "https://github.com/jkl1337/SwiftPFFFT", branch: "master"),
|
||||
.package(url: "https://github.com/jkl1337/swift-odeint", branch: "master"),
|
||||
],
|
||||
targets: [
|
||||
.target(
|
||||
name: "EcgSynKit",
|
||||
dependencies: [
|
||||
.product(name: "PFFFT", package: "PFFFT"),
|
||||
.product(name: "PFFFT", package: "SwiftPFFFT"),
|
||||
.product(name: "Algorithms", package: "swift-algorithms"),
|
||||
.product(name: "OdeInt", package: "swift-odeint"),
|
||||
]
|
||||
),
|
||||
.testTarget(
|
||||
|
||||
8
Packages/PFFFT/.gitignore
vendored
8
Packages/PFFFT/.gitignore
vendored
@@ -1,8 +0,0 @@
|
||||
.DS_Store
|
||||
/.build
|
||||
/Packages
|
||||
xcuserdata/
|
||||
DerivedData/
|
||||
.swiftpm/configuration/registries.json
|
||||
.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
|
||||
.netrc
|
||||
@@ -1,37 +0,0 @@
|
||||
// swift-tools-version: 6.0
|
||||
// The swift-tools-version declares the minimum version of Swift required to build this package.
|
||||
|
||||
import PackageDescription
|
||||
|
||||
let package = Package(
|
||||
name: "PFFFT",
|
||||
products: [
|
||||
.library(
|
||||
name: "PFFFT",
|
||||
targets: ["PFFFT", "PFFFTLib"]
|
||||
),
|
||||
],
|
||||
dependencies: [
|
||||
.package(url: "https://github.com/apple/swift-numerics", from: "1.0.0"),
|
||||
],
|
||||
targets: [
|
||||
.target(
|
||||
name: "PFFFTLib",
|
||||
publicHeadersPath: "include",
|
||||
cSettings: [
|
||||
.define("PFFFT_SCALVEC_ENABLED", to: "1"),
|
||||
.define("_USE_MATH_DEFINES"),
|
||||
.define("NDEBUG"),
|
||||
.unsafeFlags(["-O3", "-std=c99"]),
|
||||
]
|
||||
),
|
||||
.target(
|
||||
name: "PFFFT",
|
||||
dependencies: ["PFFFTLib", .product(name: "Numerics", package: "swift-numerics")]
|
||||
),
|
||||
.testTarget(
|
||||
name: "PFFFTTests",
|
||||
dependencies: ["PFFFT"]
|
||||
),
|
||||
]
|
||||
)
|
||||
@@ -1,31 +0,0 @@
|
||||
let bufferAlignment = 32
|
||||
|
||||
@frozen
|
||||
public struct Buffer<T>: ~Copyable {
|
||||
let buffer: UnsafeMutableBufferPointer<T>
|
||||
var count: Int { buffer.count }
|
||||
var baseAddress: UnsafeMutablePointer<T> { buffer.baseAddress! }
|
||||
|
||||
public init(capacity: Int) {
|
||||
buffer = UnsafeMutableRawBufferPointer.allocate(
|
||||
byteCount: MemoryLayout<T>.stride * capacity,
|
||||
alignment: bufferAlignment
|
||||
).bindMemory(to: T.self)
|
||||
}
|
||||
|
||||
deinit {
|
||||
buffer.deallocate()
|
||||
}
|
||||
|
||||
public func withUnsafeMutableBufferPointer<R>(_ body: (UnsafeMutableBufferPointer<T>) throws -> R) rethrows -> R {
|
||||
try body(buffer)
|
||||
}
|
||||
|
||||
public func withUnsafeBufferPointer<R>(_ body: (UnsafeBufferPointer<T>) throws -> R) rethrows -> R {
|
||||
try body(UnsafeBufferPointer(buffer))
|
||||
}
|
||||
|
||||
public func withUnsafeMutableBytes<R>(_ body: (UnsafeMutableRawBufferPointer) throws -> R) rethrows -> R {
|
||||
try body(UnsafeMutableRawBufferPointer(buffer))
|
||||
}
|
||||
}
|
||||
@@ -1,348 +0,0 @@
|
||||
internal import PFFFTLib
|
||||
import ComplexModule
|
||||
import RealModule
|
||||
|
||||
@frozen
|
||||
public enum FFTType {
|
||||
case real
|
||||
case complex
|
||||
}
|
||||
|
||||
@frozen
|
||||
public enum FFTSign: Int {
|
||||
case forward = -1
|
||||
case backward = 1
|
||||
}
|
||||
|
||||
public enum FFTError: Error {
|
||||
case invalidSize
|
||||
}
|
||||
|
||||
public protocol FFTElement {
|
||||
associatedtype FFTScalarType: FFTScalar
|
||||
associatedtype FFTComplexType = Complex<FFTScalarType>
|
||||
|
||||
static func pffftSetup(_ n: Int, _ type: FFTType) throws -> OpaquePointer
|
||||
static func pffftMinFftSize(_ type: FFTType) -> Int
|
||||
static func pffftIsValidSize(_ n: Int, _ type: FFTType) -> Bool
|
||||
static func pffftNearestValidSize(_ n: Int, _ type: FFTType, _ higher: Bool) -> Int
|
||||
}
|
||||
|
||||
public protocol FFTScalar: Real {
|
||||
static func pffftTransformOrdered(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer<Self>, _ output: UnsafeMutablePointer<Self>, _ work: UnsafeMutablePointer<Self>?, _ dir: FFTSign)
|
||||
static func pffftTransform(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer<Self>, _ output: UnsafeMutablePointer<Self>, _ work: UnsafeMutablePointer<Self>?, _ dir: FFTSign)
|
||||
static func pffftZreorder(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer<Self>, _ output: UnsafeMutablePointer<Self>, _ dir: FFTSign)
|
||||
static func pffftZconvolveAccumulate(_ ptr: OpaquePointer, _ dftA: UnsafeMutablePointer<Self>, _ dftB: UnsafeMutablePointer<Self>, _ dftAB: UnsafeMutablePointer<Self>, _ scaling: Self)
|
||||
static func pffftZconvolveNoAccu(_ ptr: OpaquePointer, _ dftA: UnsafeMutablePointer<Self>, _ dftB: UnsafeMutablePointer<Self>, _ dftAB: UnsafeMutablePointer<Self>, _ scaling: Self)
|
||||
}
|
||||
|
||||
extension Float: FFTScalar {
|
||||
public static func pffftTransformOrdered(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer<Self>, _ output: UnsafeMutablePointer<Self>, _ work: UnsafeMutablePointer<Self>?, _ dir: FFTSign) {
|
||||
pffft_transform_ordered(ptr, input, output, work, pffft_direction_t(dir))
|
||||
}
|
||||
|
||||
public static func pffftTransform(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer<Self>, _ output: UnsafeMutablePointer<Self>, _ work: UnsafeMutablePointer<Self>?, _ dir: FFTSign) {
|
||||
pffft_transform(ptr, input, output, work, pffft_direction_t(dir))
|
||||
}
|
||||
|
||||
public static func pffftZreorder(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer<Self>, _ output: UnsafeMutablePointer<Self>, _ dir: FFTSign) {
|
||||
pffft_zreorder(ptr, input, output, pffft_direction_t(dir))
|
||||
}
|
||||
|
||||
public static func pffftZconvolveAccumulate(_ ptr: OpaquePointer, _ dftA: UnsafeMutablePointer<Self>, _ dftB: UnsafeMutablePointer<Self>, _ dftAB: UnsafeMutablePointer<Self>, _ scaling: Self) {
|
||||
pffft_zconvolve_accumulate(ptr, dftA, dftB, dftAB, scaling)
|
||||
}
|
||||
|
||||
public static func pffftZconvolveNoAccu(_ ptr: OpaquePointer, _ dftA: UnsafeMutablePointer<Self>, _ dftB: UnsafeMutablePointer<Self>, _ dftAB: UnsafeMutablePointer<Self>, _ scaling: Self) {
|
||||
pffft_zconvolve_no_accu(ptr, dftA, dftB, dftAB, scaling)
|
||||
}
|
||||
}
|
||||
|
||||
extension Double: FFTScalar {
|
||||
public static func pffftTransformOrdered(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer<Double>, _ output: UnsafeMutablePointer<Double>, _ work: UnsafeMutablePointer<Double>?, _ dir: FFTSign) {
|
||||
pffftd_transform_ordered(ptr, input, output, work, pffft_direction_t(dir))
|
||||
}
|
||||
|
||||
public static func pffftTransform(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer<Double>, _ output: UnsafeMutablePointer<Double>, _ work: UnsafeMutablePointer<Double>?, _ dir: FFTSign) {
|
||||
pffftd_transform(ptr, input, output, work, pffft_direction_t(dir))
|
||||
}
|
||||
|
||||
public static func pffftZreorder(_ ptr: OpaquePointer, _ input: UnsafeMutablePointer<Double>, _ output: UnsafeMutablePointer<Double>, _ dir: FFTSign) {
|
||||
pffftd_zreorder(ptr, input, output, pffft_direction_t(dir))
|
||||
}
|
||||
|
||||
public static func pffftZconvolveAccumulate(_ ptr: OpaquePointer, _ dftA: UnsafeMutablePointer<Double>, _ dftB: UnsafeMutablePointer<Double>, _ dftAB: UnsafeMutablePointer<Double>, _ scaling: Double) {
|
||||
pffftd_zconvolve_accumulate(ptr, dftA, dftB, dftAB, scaling)
|
||||
}
|
||||
|
||||
public static func pffftZconvolveNoAccu(_ ptr: OpaquePointer, _ dftA: UnsafeMutablePointer<Double>, _ dftB: UnsafeMutablePointer<Double>, _ dftAB: UnsafeMutablePointer<Double>, _ scaling: Double) {
|
||||
pffftd_zconvolve_no_accu(ptr, dftA, dftB, dftAB, scaling)
|
||||
}
|
||||
}
|
||||
|
||||
extension Complex: FFTElement where RealType: FFTElement & FFTScalar {
|
||||
public typealias FFTScalarType = RealType
|
||||
|
||||
public static func pffftSetup(_ n: Int, _: FFTType) throws -> OpaquePointer {
|
||||
return try FFTScalarType.pffftSetup(n, .complex)
|
||||
}
|
||||
|
||||
public static func pffftMinFftSize(_: FFTType) -> Int {
|
||||
return FFTScalarType.pffftMinFftSize(.complex)
|
||||
}
|
||||
|
||||
public static func pffftIsValidSize(_ n: Int, _: FFTType) -> Bool {
|
||||
return FFTScalarType.pffftIsValidSize(n, .complex)
|
||||
}
|
||||
|
||||
public static func pffftNearestValidSize(_ n: Int, _: FFTType, _ higher: Bool) -> Int {
|
||||
return FFTScalarType.pffftNearestValidSize(n, .complex, higher)
|
||||
}
|
||||
}
|
||||
|
||||
extension Double: FFTElement {
|
||||
public typealias FFTScalarType = Double
|
||||
|
||||
public static func pffftSetup(_ n: Int, _ type: FFTType) throws -> OpaquePointer {
|
||||
guard let ptr = pffftd_new_setup(Int32(n), pffft_transform_t(type)) else { throw FFTError.invalidSize }
|
||||
return ptr
|
||||
}
|
||||
|
||||
public static func pffftMinFftSize(_ type: FFTType) -> Int {
|
||||
Int(pffftd_min_fft_size(pffft_transform_t(type)))
|
||||
}
|
||||
|
||||
public static func pffftIsValidSize(_ n: Int, _ type: FFTType) -> Bool {
|
||||
pffftd_is_valid_size(Int32(n), pffft_transform_t(type)) != 0
|
||||
}
|
||||
|
||||
public static func pffftNearestValidSize(_ n: Int, _ type: FFTType, _ higher: Bool) -> Int {
|
||||
Int(pffftd_nearest_transform_size(Int32(n), pffft_transform_t(type), higher ? 1 : 0))
|
||||
}
|
||||
}
|
||||
|
||||
extension Float: FFTElement {
|
||||
public typealias FFTScalarType = Float
|
||||
|
||||
public static func pffftSetup(_ n: Int, _ type: FFTType) throws -> OpaquePointer {
|
||||
guard let ptr = pffft_new_setup(Int32(n), pffft_transform_t(type)) else { throw FFTError.invalidSize }
|
||||
return ptr
|
||||
}
|
||||
|
||||
public static func pffftMinFftSize(_ type: FFTType) -> Int {
|
||||
Int(pffft_min_fft_size(pffft_transform_t(type)))
|
||||
}
|
||||
|
||||
public static func pffftIsValidSize(_ n: Int, _ type: FFTType) -> Bool {
|
||||
pffft_is_valid_size(Int32(n), pffft_transform_t(type)) != 0
|
||||
}
|
||||
|
||||
public static func pffftNearestValidSize(_ n: Int, _ type: FFTType, _ higher: Bool) -> Int {
|
||||
Int(pffft_nearest_transform_size(Int32(n), pffft_transform_t(type), higher ? 1 : 0))
|
||||
}
|
||||
}
|
||||
|
||||
public final class FFT<T: FFTElement> {
|
||||
public typealias ComplexType = T.FFTComplexType
|
||||
public typealias ScalarType = T.FFTScalarType
|
||||
|
||||
let ptr: OpaquePointer
|
||||
let n: Int
|
||||
let work: Buffer<ScalarType>?
|
||||
|
||||
/// Initialize the FFT implementation with the given size and type.
|
||||
/// - Parameters:
|
||||
/// - n: The size of the FFT.
|
||||
/// - Throws: `FFTError.invalidSize` if the size is invalid.
|
||||
public init(n: Int) throws {
|
||||
ptr = try T.pffftSetup(n, .real)
|
||||
self.n = n
|
||||
let capacity = T.self == ComplexType.self ? 2 * n : n
|
||||
work = n > 4096 ? Buffer<ScalarType>(capacity: capacity) : nil
|
||||
}
|
||||
|
||||
/// Make a buffer for the FFT (time-domain) signal.
|
||||
/// - Parameters:
|
||||
/// - extra: An extra number of elements to allocate.
|
||||
public func makeSignalBuffer(extra: Int = 0) -> Buffer<T> {
|
||||
Buffer(capacity: n + extra)
|
||||
}
|
||||
|
||||
/// Make a buffer for the FFT (frequency-domain) spectrum.
|
||||
/// - Parameters:
|
||||
/// - extra: An extra number of elements to allocate.
|
||||
public func makeSpectrumBuffer(extra: Int = 0) -> Buffer<ComplexType> {
|
||||
Buffer(capacity: T.self == ComplexType.self ? (n + extra) : n / 2 + extra)
|
||||
}
|
||||
|
||||
/// Make a buffer for the internal layout of the FFT (frequency-domain) spectrum.
|
||||
/// - Parameters:
|
||||
/// - extra: An extra number of points to allocate. For complex FFTs, 2 * extra
|
||||
/// additional elements will be allocated.
|
||||
public func makeInternalLayoutBuffer(extra: Int = 0) -> Buffer<ScalarType> {
|
||||
Buffer(capacity: (T.self == ComplexType.self ? 2 : 1) * (n + extra))
|
||||
}
|
||||
|
||||
@inline(__always)
|
||||
func toAddress(_ work: borrowing Buffer<ScalarType>?) -> UnsafeMutablePointer<ScalarType>? {
|
||||
switch work {
|
||||
case let .some(b): return b.baseAddress
|
||||
case .none: return nil
|
||||
}
|
||||
}
|
||||
|
||||
@inline(__always)
|
||||
func rebind<I>(_ buffer: borrowing Buffer<I>) -> UnsafeMutablePointer<ScalarType>! {
|
||||
UnsafeMutableRawBufferPointer(buffer.buffer).bindMemory(to: ScalarType.self).baseAddress
|
||||
}
|
||||
|
||||
@inline(__always)
|
||||
func checkFftBufferCounts(signal: borrowing Buffer<T>, spectrum: borrowing Buffer<ComplexType>) {
|
||||
guard signal.count >= n else {
|
||||
fatalError("signal buffer too small")
|
||||
}
|
||||
guard spectrum.count >= (T.self == ComplexType.self ? n : n / 2) else {
|
||||
fatalError("spectrum buffer too small")
|
||||
}
|
||||
}
|
||||
|
||||
@inline(__always)
|
||||
func checkFftInternalLayoutBufferCounts(signal: borrowing Buffer<T>, spectrum: borrowing Buffer<ScalarType>) {
|
||||
guard signal.count >= n else {
|
||||
fatalError("signal buffer too small")
|
||||
}
|
||||
guard spectrum.count >= (T.self == ComplexType.self ? 2 * n : n) else {
|
||||
fatalError("spectrum buffer too small")
|
||||
}
|
||||
}
|
||||
|
||||
@inline(__always)
|
||||
func checkConvolveBufferCounts(a: borrowing Buffer<ScalarType>, b: borrowing Buffer<ScalarType>, ab: borrowing Buffer<ScalarType>) {
|
||||
let minCount = T.self == ComplexType.self ? 2 * n : n
|
||||
|
||||
guard a.count >= minCount else {
|
||||
fatalError("a buffer too small")
|
||||
}
|
||||
guard b.count >= minCount else {
|
||||
fatalError("b buffer too small")
|
||||
}
|
||||
guard ab.count >= minCount else {
|
||||
fatalError("ab buffer too small")
|
||||
}
|
||||
}
|
||||
|
||||
/// Perform a forward FFT on the input buffer.
|
||||
///
|
||||
/// The input and output buffers may be the same.
|
||||
/// The data is stores in order as expected (interleaved complex components ordered by frequency).
|
||||
/// The input and output buffer must have a capacity of at least `n` for real FFTs and `2 * n` for complex FFTs.
|
||||
/// A fatal error will occur if any buffer is too small.
|
||||
///
|
||||
/// For a real forward transform with real input, the output array is organized as follows:
|
||||
/// index k > 2 where k is even is the real part of the k/2-th complex coefficient.
|
||||
/// index k > 2 where k is odd is the imaginary part of the k/2-th complex coefficient.
|
||||
/// index k = 0 is the real part of the 0 frequency (DC) coefficient.
|
||||
/// index k = 1 is the real part of the Nyquist coefficient.
|
||||
///
|
||||
/// Transforms are not scaled. fft_backward(fft_forward(x)) == n * x.
|
||||
///
|
||||
/// - Parameters:
|
||||
/// - input: The input buffer.
|
||||
/// - output: The output buffer.
|
||||
/// - work: An optional work buffer. Must have capacity of at least `n` for real FFTs and `2 * n` for complex FFTs.
|
||||
/// - sign: The direction of the FFT.
|
||||
public func forward(signal: borrowing Buffer<T>, spectrum: borrowing Buffer<ComplexType>) {
|
||||
checkFftBufferCounts(signal: signal, spectrum: spectrum)
|
||||
ScalarType.pffftTransformOrdered(ptr, rebind(signal), rebind(spectrum), toAddress(work), .forward)
|
||||
}
|
||||
|
||||
public func inverse(spectrum: borrowing Buffer<ComplexType>, signal: borrowing Buffer<T>) {
|
||||
checkFftBufferCounts(signal: signal, spectrum: spectrum)
|
||||
ScalarType.pffftTransformOrdered(ptr, rebind(spectrum), rebind(signal), toAddress(work), .backward)
|
||||
}
|
||||
|
||||
/// Perform a forward FFT on the input buffer, with implementation defined order.
|
||||
///
|
||||
/// This function behaves similarly to `fft` however the z-domain data is stored in most efficient ordering,
|
||||
/// which is suitable for transforming back with this function, or for convolution.
|
||||
/// - Parameters:
|
||||
/// - input: The input buffer.
|
||||
/// - output: The output buffer.
|
||||
/// - work: An optional work buffer. Must have capacity of at least `n` for real FFTs and `2 * n` for complex FFTs.
|
||||
/// - sign: The direction of the FFT.
|
||||
public func forwardToInternalLayout(signal: borrowing Buffer<T>, spectrum: borrowing Buffer<ScalarType>) {
|
||||
checkFftInternalLayoutBufferCounts(signal: signal, spectrum: spectrum)
|
||||
ScalarType.pffftTransform(ptr, rebind(signal), spectrum.baseAddress, toAddress(work), .forward)
|
||||
}
|
||||
|
||||
public func inverseFromInternalLayout(spectrum: borrowing Buffer<ScalarType>, signal: borrowing Buffer<T>) {
|
||||
checkFftInternalLayoutBufferCounts(signal: signal, spectrum: spectrum)
|
||||
ScalarType.pffftTransform(ptr, spectrum.baseAddress, rebind(signal), toAddress(work), .backward)
|
||||
}
|
||||
|
||||
public func reorder(spectrum: borrowing Buffer<ScalarType>, output: borrowing Buffer<ComplexType>) {
|
||||
guard spectrum.count >= (T.self == ComplexType.self ? 2 * n : n) else {
|
||||
fatalError("signal buffer too small")
|
||||
}
|
||||
guard output.count >= n else {
|
||||
fatalError("outputbuffer too small")
|
||||
}
|
||||
ScalarType.pffftZreorder(ptr, spectrum.baseAddress, rebind(output), .forward)
|
||||
}
|
||||
|
||||
/// Perform a convolution of two complex signals in the frequency domain.
|
||||
///
|
||||
/// Multiplies frequency domain components of `dftA` and `dftB` and stores the result in `dftAB`.
|
||||
/// The operation performed is `dftAB = (dftA * dftB) * scaling`.
|
||||
/// - Parameters:
|
||||
/// - dftA: The first input buffer of frequency domain data.
|
||||
/// - dftB: The second input buffer of frequency domain data.
|
||||
/// - dftAB: The output buffer of frequency domain data.
|
||||
/// - scaling: The scaling factor to apply to the result.
|
||||
public func convolve(dftA: borrowing Buffer<ScalarType>, dftB: borrowing Buffer<ScalarType>, dftAB: borrowing Buffer<ScalarType>, scaling: ScalarType) {
|
||||
checkConvolveBufferCounts(a: dftA, b: dftB, ab: dftAB)
|
||||
ScalarType.pffftZconvolveNoAccu(ptr, dftA.baseAddress, dftB.baseAddress, dftAB.baseAddress, scaling)
|
||||
}
|
||||
|
||||
/// Perform a convolution of two complex signals in the frequency domain.
|
||||
///
|
||||
/// Multiplies frequency domain components of `dftA` and `dftB` and accumulates the result in `dftAB`.
|
||||
/// The operation performed is `dftAB += (dftA * dftB) * scaling`.
|
||||
/// - Parameters:
|
||||
/// - dftA: The first input buffer of frequency domain data.
|
||||
/// - dftB: The second input buffer of frequency domain data.
|
||||
/// - dftAB: The output buffer of frequency domain data.
|
||||
/// - scaling: The scaling factor to apply to the result.
|
||||
public func convolveAccumulate(dftA: borrowing Buffer<ScalarType>, dftB: borrowing Buffer<ScalarType>, dftAB: borrowing Buffer<ScalarType>, scaling: ScalarType) {
|
||||
checkConvolveBufferCounts(a: dftA, b: dftB, ab: dftAB)
|
||||
ScalarType.pffftZconvolveAccumulate(ptr, dftA.baseAddress, dftB.baseAddress, dftAB.baseAddress, scaling)
|
||||
}
|
||||
|
||||
/// Returns the minimum FFT size for this type of setup.
|
||||
public static func minFftSize() -> Int {
|
||||
T.pffftMinFftSize(.real)
|
||||
}
|
||||
|
||||
/// Returns whether the given size is valid for the given type.
|
||||
///
|
||||
/// The PFFFT library requires `n` to be factorizable to `minFftSize` with factors of 2, 3, 5.
|
||||
/// - Parameters:
|
||||
/// - n: The size to check.
|
||||
/// - Returns: Whether the size is valid.
|
||||
public static func isValidSize(_ n: Int) -> Bool {
|
||||
T.pffftIsValidSize(n, .real)
|
||||
}
|
||||
|
||||
/// Returns the nearest valid size for the given type.
|
||||
/// - Parameters:
|
||||
/// - n: The size to check.
|
||||
/// - higher: Whether to return the next higher size if `n` is invalid.
|
||||
/// - Returns: The nearest valid size.
|
||||
public static func nearestValidSize(_ n: Int, higher: Bool) -> Int {
|
||||
T.pffftNearestValidSize(n, .real, higher)
|
||||
}
|
||||
|
||||
deinit {
|
||||
pffftd_destroy_setup(ptr)
|
||||
}
|
||||
}
|
||||
@@ -1,26 +0,0 @@
|
||||
internal import PFFFTLib
|
||||
|
||||
public var simdArch: String {
|
||||
String(cString: pffft_simd_arch())
|
||||
}
|
||||
|
||||
extension pffft_transform_t {
|
||||
@inline(__always)
|
||||
init(_ type: FFTType) {
|
||||
switch type {
|
||||
case .real: self = PFFFT_REAL
|
||||
case .complex: self = PFFFT_COMPLEX
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extension pffft_direction_t {
|
||||
@inline(__always)
|
||||
init(_ sign: FFTSign) {
|
||||
switch sign {
|
||||
case .forward: self = PFFFT_FORWARD
|
||||
case .backward: self = PFFFT_BACKWARD
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,35 +0,0 @@
|
||||
import Foundation
|
||||
|
||||
public class SetupCache<T: FFTElement> : @unchecked Sendable {
|
||||
typealias FFTType = FFT<T>
|
||||
private var cache: [Int: FFT<T>] = [:]
|
||||
|
||||
private let queue = DispatchQueue(label: String(describing: SetupCache.self), attributes: .concurrent)
|
||||
|
||||
public init() {}
|
||||
|
||||
public func get(for n: Int) throws -> FFT<T> {
|
||||
var setup: FFTType??
|
||||
queue.sync {
|
||||
setup = cache[n]
|
||||
}
|
||||
if setup == nil {
|
||||
queue.sync(flags: .barrier) {
|
||||
setup = cache[n]
|
||||
if setup == nil {
|
||||
let entry = try? FFTType(n: n)
|
||||
cache[n] = entry
|
||||
setup = entry
|
||||
}
|
||||
}
|
||||
}
|
||||
guard let s = setup! else { throw FFTError.invalidSize }
|
||||
return s
|
||||
}
|
||||
|
||||
public func clear() {
|
||||
queue.sync(flags: .barrier) {
|
||||
cache.removeAll()
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,241 +0,0 @@
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Based on original fortran 77 code from FFTPACKv4 from NETLIB,
|
||||
authored by Dr Paul Swarztrauber of NCAR, in 1985.
|
||||
|
||||
As confirmed by the NCAR fftpack software curators, the following
|
||||
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
|
||||
released under the same terms.
|
||||
|
||||
FFTPACK license:
|
||||
|
||||
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
|
||||
|
||||
Copyright (c) 2004 the University Corporation for Atmospheric
|
||||
Research ("UCAR"). All rights reserved. Developed by NCAR's
|
||||
Computational and Information Systems Laboratory, UCAR,
|
||||
www.cisl.ucar.edu.
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
PFFFT : a Pretty Fast FFT.
|
||||
|
||||
This is basically an adaptation of the single precision fftpack
|
||||
(v4) as found on netlib taking advantage of SIMD instruction found
|
||||
on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
|
||||
|
||||
For architectures where no SIMD instruction is available, the code
|
||||
falls back to a scalar version.
|
||||
|
||||
Restrictions:
|
||||
|
||||
- 1D transforms only, with 32-bit single precision.
|
||||
|
||||
- supports only transforms for inputs of length N of the form
|
||||
N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
|
||||
144, 160, etc are all acceptable lengths). Performance is best for
|
||||
128<=N<=8192.
|
||||
|
||||
- all (float*) pointers in the functions below are expected to
|
||||
have an "simd-compatible" alignment, that is 16 bytes on x86 and
|
||||
powerpc CPUs.
|
||||
|
||||
You can allocate such buffers with the functions
|
||||
pffft_aligned_malloc / pffft_aligned_free (or with stuff like
|
||||
posix_memalign..)
|
||||
|
||||
*/
|
||||
|
||||
#ifndef PFFFT_H
|
||||
#define PFFFT_H
|
||||
|
||||
#include <stddef.h> /* for size_t */
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* opaque struct holding internal stuff (precomputed twiddle factors)
|
||||
this struct can be shared by many threads as it contains only
|
||||
read-only data.
|
||||
*/
|
||||
typedef struct PFFFT_Setup PFFFT_Setup;
|
||||
|
||||
#ifndef PFFFT_COMMON_ENUMS
|
||||
#define PFFFT_COMMON_ENUMS
|
||||
|
||||
/* direction of the transform */
|
||||
typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
|
||||
|
||||
/* type of transform */
|
||||
typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
prepare for performing transforms of size N -- the returned
|
||||
PFFFT_Setup structure is read-only so it can safely be shared by
|
||||
multiple concurrent threads.
|
||||
*/
|
||||
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
|
||||
void pffft_destroy_setup(PFFFT_Setup *);
|
||||
/*
|
||||
Perform a Fourier transform , The z-domain data is stored in the
|
||||
most efficient order for transforming it back, or using it for
|
||||
convolution. If you need to have its content sorted in the
|
||||
"usual" way, that is as an array of interleaved complex numbers,
|
||||
either use pffft_transform_ordered , or call pffft_zreorder after
|
||||
the forward fft, and before the backward fft.
|
||||
|
||||
Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
|
||||
Typically you will want to scale the backward transform by 1/N.
|
||||
|
||||
The 'work' pointer should point to an area of N (2*N for complex
|
||||
fft) floats, properly aligned. If 'work' is NULL, then stack will
|
||||
be used instead (this is probably the best strategy for small
|
||||
FFTs, say for N < 16384). Threads usually have a small stack, that
|
||||
there's no sufficient amount of memory, usually leading to a crash!
|
||||
Use the heap with pffft_aligned_malloc() in this case.
|
||||
|
||||
For a real forward transform (PFFFT_REAL | PFFFT_FORWARD) with real
|
||||
input with input(=transformation) length N, the output array is
|
||||
'mostly' complex:
|
||||
index k in 1 .. N/2 -1 corresponds to frequency k * Samplerate / N
|
||||
index k == 0 is a special case:
|
||||
the real() part contains the result for the DC frequency 0,
|
||||
the imag() part contains the result for the Nyquist frequency Samplerate/2
|
||||
both 0-frequency and half frequency components, which are real,
|
||||
are assembled in the first entry as F(0)+i*F(N/2).
|
||||
With the output size N/2 complex values (=N real/imag values), it is
|
||||
obvious, that the result for negative frequencies are not output,
|
||||
cause of symmetry.
|
||||
|
||||
input and output may alias.
|
||||
*/
|
||||
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
|
||||
|
||||
/*
|
||||
Similar to pffft_transform, but makes sure that the output is
|
||||
ordered as expected (interleaved complex numbers). This is
|
||||
similar to calling pffft_transform and then pffft_zreorder.
|
||||
|
||||
input and output may alias.
|
||||
*/
|
||||
void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
|
||||
|
||||
/*
|
||||
call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
|
||||
PFFFT_FORWARD) if you want to have the frequency components in
|
||||
the correct "canonical" order, as interleaved complex numbers.
|
||||
|
||||
(for real transforms, both 0-frequency and half frequency
|
||||
components, which are real, are assembled in the first entry as
|
||||
F(0)+i*F(n/2+1). Note that the original fftpack did place
|
||||
F(n/2+1) at the end of the arrays).
|
||||
|
||||
input and output should not alias.
|
||||
*/
|
||||
void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
|
||||
|
||||
/*
|
||||
Perform a multiplication of the frequency components of dft_a and
|
||||
dft_b and accumulate them into dft_ab. The arrays should have
|
||||
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
|
||||
*not* have been reordered with pffft_zreorder (otherwise just
|
||||
perform the operation yourself as the dft coefs are stored as
|
||||
interleaved complex numbers).
|
||||
|
||||
the operation performed is: dft_ab += (dft_a * fdt_b)*scaling
|
||||
|
||||
The dft_a, dft_b and dft_ab pointers may alias.
|
||||
*/
|
||||
void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
|
||||
|
||||
/*
|
||||
Perform a multiplication of the frequency components of dft_a and
|
||||
dft_b and put result in dft_ab. The arrays should have
|
||||
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
|
||||
*not* have been reordered with pffft_zreorder (otherwise just
|
||||
perform the operation yourself as the dft coefs are stored as
|
||||
interleaved complex numbers).
|
||||
|
||||
the operation performed is: dft_ab = (dft_a * fdt_b)*scaling
|
||||
|
||||
The dft_a, dft_b and dft_ab pointers may alias.
|
||||
*/
|
||||
void pffft_zconvolve_no_accu(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
|
||||
|
||||
/* return 4 or 1 wether support SSE/NEON/Altivec instructions was enabled when building pffft.c */
|
||||
int pffft_simd_size();
|
||||
|
||||
/* return string identifier of used architecture (SSE/NEON/Altivec/..) */
|
||||
const char * pffft_simd_arch();
|
||||
|
||||
|
||||
/* following functions are identical to the pffftd_ functions */
|
||||
|
||||
/* simple helper to get minimum possible fft size */
|
||||
int pffft_min_fft_size(pffft_transform_t transform);
|
||||
|
||||
/* simple helper to determine next power of 2
|
||||
- without inexact/rounding floating point operations
|
||||
*/
|
||||
int pffft_next_power_of_two(int N);
|
||||
|
||||
/* simple helper to determine if power of 2 - returns bool */
|
||||
int pffft_is_power_of_two(int N);
|
||||
|
||||
/* simple helper to determine size N is valid
|
||||
- factorizable to pffft_min_fft_size() with factors 2, 3, 5
|
||||
returns bool
|
||||
*/
|
||||
int pffft_is_valid_size(int N, pffft_transform_t cplx);
|
||||
|
||||
/* determine nearest valid transform size (by brute-force testing)
|
||||
- factorizable to pffft_min_fft_size() with factors 2, 3, 5.
|
||||
higher: bool-flag to find nearest higher value; else lower.
|
||||
*/
|
||||
int pffft_nearest_transform_size(int N, pffft_transform_t cplx, int higher);
|
||||
|
||||
/*
|
||||
the float buffers must have the correct alignment (16-byte boundary
|
||||
on intel and powerpc). This function may be used to obtain such
|
||||
correctly aligned buffers.
|
||||
*/
|
||||
void *pffft_aligned_malloc(size_t nb_bytes);
|
||||
void pffft_aligned_free(void *);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* PFFFT_H */
|
||||
|
||||
@@ -1,236 +0,0 @@
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Based on original fortran 77 code from FFTPACKv4 from NETLIB,
|
||||
authored by Dr Paul Swarztrauber of NCAR, in 1985.
|
||||
|
||||
As confirmed by the NCAR fftpack software curators, the following
|
||||
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
|
||||
released under the same terms.
|
||||
|
||||
FFTPACK license:
|
||||
|
||||
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
|
||||
|
||||
Copyright (c) 2004 the University Corporation for Atmospheric
|
||||
Research ("UCAR"). All rights reserved. Developed by NCAR's
|
||||
Computational and Information Systems Laboratory, UCAR,
|
||||
www.cisl.ucar.edu.
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
/*
|
||||
NOTE: This file is adapted from Julien Pommier's original PFFFT,
|
||||
which works on 32 bit floating point precision using SSE instructions,
|
||||
to work with 64 bit floating point precision using AVX instructions.
|
||||
Author: Dario Mambro @ https://github.com/unevens/pffft
|
||||
*/
|
||||
/*
|
||||
PFFFT : a Pretty Fast FFT.
|
||||
|
||||
This is basically an adaptation of the single precision fftpack
|
||||
(v4) as found on netlib taking advantage of SIMD instruction found
|
||||
on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
|
||||
|
||||
For architectures where no SIMD instruction is available, the code
|
||||
falls back to a scalar version.
|
||||
|
||||
Restrictions:
|
||||
|
||||
- 1D transforms only, with 64-bit double precision.
|
||||
|
||||
- supports only transforms for inputs of length N of the form
|
||||
N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
|
||||
144, 160, etc are all acceptable lengths). Performance is best for
|
||||
128<=N<=8192.
|
||||
|
||||
- all (double*) pointers in the functions below are expected to
|
||||
have an "simd-compatible" alignment, that is 32 bytes on x86 and
|
||||
powerpc CPUs.
|
||||
|
||||
You can allocate such buffers with the functions
|
||||
pffft_aligned_malloc / pffft_aligned_free (or with stuff like
|
||||
posix_memalign..)
|
||||
|
||||
*/
|
||||
|
||||
#ifndef PFFFT_DOUBLE_H
|
||||
#define PFFFT_DOUBLE_H
|
||||
|
||||
#include <stddef.h> /* for size_t */
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* opaque struct holding internal stuff (precomputed twiddle factors)
|
||||
this struct can be shared by many threads as it contains only
|
||||
read-only data.
|
||||
*/
|
||||
typedef struct PFFFTD_Setup PFFFTD_Setup;
|
||||
|
||||
#ifndef PFFFT_COMMON_ENUMS
|
||||
#define PFFFT_COMMON_ENUMS
|
||||
|
||||
/* direction of the transform */
|
||||
typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
|
||||
|
||||
/* type of transform */
|
||||
typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
prepare for performing transforms of size N -- the returned
|
||||
PFFFTD_Setup structure is read-only so it can safely be shared by
|
||||
multiple concurrent threads.
|
||||
*/
|
||||
PFFFTD_Setup *pffftd_new_setup(int N, pffft_transform_t transform);
|
||||
void pffftd_destroy_setup(PFFFTD_Setup *);
|
||||
/*
|
||||
Perform a Fourier transform , The z-domain data is stored in the
|
||||
most efficient order for transforming it back, or using it for
|
||||
convolution. If you need to have its content sorted in the
|
||||
"usual" way, that is as an array of interleaved complex numbers,
|
||||
either use pffft_transform_ordered , or call pffft_zreorder after
|
||||
the forward fft, and before the backward fft.
|
||||
|
||||
Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
|
||||
Typically you will want to scale the backward transform by 1/N.
|
||||
|
||||
The 'work' pointer should point to an area of N (2*N for complex
|
||||
fft) doubles, properly aligned. If 'work' is NULL, then stack will
|
||||
be used instead (this is probably the best strategy for small
|
||||
FFTs, say for N < 16384). Threads usually have a small stack, that
|
||||
there's no sufficient amount of memory, usually leading to a crash!
|
||||
Use the heap with pffft_aligned_malloc() in this case.
|
||||
|
||||
input and output may alias.
|
||||
*/
|
||||
void pffftd_transform(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction);
|
||||
|
||||
/*
|
||||
Similar to pffft_transform, but makes sure that the output is
|
||||
ordered as expected (interleaved complex numbers). This is
|
||||
similar to calling pffft_transform and then pffft_zreorder.
|
||||
|
||||
input and output may alias.
|
||||
*/
|
||||
void pffftd_transform_ordered(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction);
|
||||
|
||||
/*
|
||||
call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
|
||||
PFFFT_FORWARD) if you want to have the frequency components in
|
||||
the correct "canonical" order, as interleaved complex numbers.
|
||||
|
||||
(for real transforms, both 0-frequency and half frequency
|
||||
components, which are real, are assembled in the first entry as
|
||||
F(0)+i*F(n/2+1). Note that the original fftpack did place
|
||||
F(n/2+1) at the end of the arrays).
|
||||
|
||||
input and output should not alias.
|
||||
*/
|
||||
void pffftd_zreorder(PFFFTD_Setup *setup, const double *input, double *output, pffft_direction_t direction);
|
||||
|
||||
/*
|
||||
Perform a multiplication of the frequency components of dft_a and
|
||||
dft_b and accumulate them into dft_ab. The arrays should have
|
||||
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
|
||||
*not* have been reordered with pffft_zreorder (otherwise just
|
||||
perform the operation yourself as the dft coefs are stored as
|
||||
interleaved complex numbers).
|
||||
|
||||
the operation performed is: dft_ab += (dft_a * fdt_b)*scaling
|
||||
|
||||
The dft_a, dft_b and dft_ab pointers may alias.
|
||||
*/
|
||||
void pffftd_zconvolve_accumulate(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double *dft_ab, double scaling);
|
||||
|
||||
/*
|
||||
Perform a multiplication of the frequency components of dft_a and
|
||||
dft_b and put result in dft_ab. The arrays should have
|
||||
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
|
||||
*not* have been reordered with pffft_zreorder (otherwise just
|
||||
perform the operation yourself as the dft coefs are stored as
|
||||
interleaved complex numbers).
|
||||
|
||||
the operation performed is: dft_ab = (dft_a * fdt_b)*scaling
|
||||
|
||||
The dft_a, dft_b and dft_ab pointers may alias.
|
||||
*/
|
||||
void pffftd_zconvolve_no_accu(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double*dft_ab, double scaling);
|
||||
|
||||
/* return 4 or 1 wether support AVX instructions was enabled when building pffft-double.c */
|
||||
int pffftd_simd_size();
|
||||
|
||||
/* return string identifier of used architecture (AVX/..) */
|
||||
const char * pffftd_simd_arch();
|
||||
|
||||
/* simple helper to get minimum possible fft size */
|
||||
int pffftd_min_fft_size(pffft_transform_t transform);
|
||||
|
||||
/* simple helper to determine size N is valid
|
||||
- factorizable to pffft_min_fft_size() with factors 2, 3, 5
|
||||
*/
|
||||
int pffftd_is_valid_size(int N, pffft_transform_t cplx);
|
||||
|
||||
/* determine nearest valid transform size (by brute-force testing)
|
||||
- factorizable to pffft_min_fft_size() with factors 2, 3, 5.
|
||||
higher: bool-flag to find nearest higher value; else lower.
|
||||
*/
|
||||
int pffftd_nearest_transform_size(int N, pffft_transform_t cplx, int higher);
|
||||
|
||||
|
||||
/* following functions are identical to the pffft_ functions - both declared */
|
||||
|
||||
/* simple helper to determine next power of 2
|
||||
- without inexact/rounding floating point operations
|
||||
*/
|
||||
int pffftd_next_power_of_two(int N);
|
||||
int pffft_next_power_of_two(int N);
|
||||
|
||||
/* simple helper to determine if power of 2 - returns bool */
|
||||
int pffftd_is_power_of_two(int N);
|
||||
int pffft_is_power_of_two(int N);
|
||||
|
||||
/*
|
||||
the double buffers must have the correct alignment (32-byte boundary
|
||||
on intel and powerpc). This function may be used to obtain such
|
||||
correctly aligned buffers.
|
||||
*/
|
||||
void *pffftd_aligned_malloc(size_t nb_bytes);
|
||||
void *pffft_aligned_malloc(size_t nb_bytes);
|
||||
void pffftd_aligned_free(void *);
|
||||
void pffft_aligned_free(void *);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* PFFFT_DOUBLE_H */
|
||||
|
||||
@@ -1,134 +0,0 @@
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
|
||||
|
||||
Based on original fortran 77 code from FFTPACKv4 from NETLIB
|
||||
(http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
|
||||
of NCAR, in 1985.
|
||||
|
||||
As confirmed by the NCAR fftpack software curators, the following
|
||||
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
|
||||
released under the same terms.
|
||||
|
||||
FFTPACK license:
|
||||
|
||||
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
|
||||
|
||||
Copyright (c) 2004 the University Corporation for Atmospheric
|
||||
Research ("UCAR"). All rights reserved. Developed by NCAR's
|
||||
Computational and Information Systems Laboratory, UCAR,
|
||||
www.cisl.ucar.edu.
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
|
||||
|
||||
PFFFT : a Pretty Fast FFT.
|
||||
|
||||
This file is largerly based on the original FFTPACK implementation, modified in
|
||||
order to take advantage of SIMD instructions of modern CPUs.
|
||||
*/
|
||||
|
||||
/*
|
||||
ChangeLog:
|
||||
- 2011/10/02, version 1: This is the very first release of this file.
|
||||
*/
|
||||
|
||||
#include "pffft.h"
|
||||
|
||||
/* detect compiler flavour */
|
||||
#if defined(_MSC_VER)
|
||||
# define COMPILER_MSVC
|
||||
#elif defined(__GNUC__)
|
||||
# define COMPILER_GCC
|
||||
#endif
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
|
||||
#if defined(COMPILER_GCC)
|
||||
# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
|
||||
# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
|
||||
# define RESTRICT __restrict
|
||||
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
|
||||
#elif defined(COMPILER_MSVC)
|
||||
# define ALWAYS_INLINE(return_type) __forceinline return_type
|
||||
# define NEVER_INLINE(return_type) __declspec(noinline) return_type
|
||||
# define RESTRICT __restrict
|
||||
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef COMPILER_MSVC
|
||||
#pragma warning( disable : 4244 4305 4204 4456 )
|
||||
#endif
|
||||
|
||||
/*
|
||||
vector support macros: the rest of the code is independant of
|
||||
SSE/Altivec/NEON -- adding support for other platforms with 4-element
|
||||
vectors should be limited to these macros
|
||||
*/
|
||||
#include "simd/pf_float.h"
|
||||
|
||||
/* have code comparable with this definition */
|
||||
#define SETUP_STRUCT PFFFT_Setup
|
||||
#define FUNC_NEW_SETUP pffft_new_setup
|
||||
#define FUNC_DESTROY pffft_destroy_setup
|
||||
#define FUNC_TRANSFORM_UNORDRD pffft_transform
|
||||
#define FUNC_TRANSFORM_ORDERED pffft_transform_ordered
|
||||
#define FUNC_ZREORDER pffft_zreorder
|
||||
#define FUNC_ZCONVOLVE_ACCUMULATE pffft_zconvolve_accumulate
|
||||
#define FUNC_ZCONVOLVE_NO_ACCU pffft_zconvolve_no_accu
|
||||
|
||||
#define FUNC_ALIGNED_MALLOC pffft_aligned_malloc
|
||||
#define FUNC_ALIGNED_FREE pffft_aligned_free
|
||||
#define FUNC_SIMD_SIZE pffft_simd_size
|
||||
#define FUNC_MIN_FFT_SIZE pffft_min_fft_size
|
||||
#define FUNC_IS_VALID_SIZE pffft_is_valid_size
|
||||
#define FUNC_NEAREST_SIZE pffft_nearest_transform_size
|
||||
#define FUNC_SIMD_ARCH pffft_simd_arch
|
||||
#define FUNC_VALIDATE_SIMD_A validate_pffft_simd
|
||||
#define FUNC_VALIDATE_SIMD_EX validate_pffft_simd_ex
|
||||
|
||||
#define FUNC_CPLX_FINALIZE pffft_cplx_finalize
|
||||
#define FUNC_CPLX_PREPROCESS pffft_cplx_preprocess
|
||||
#define FUNC_REAL_PREPROCESS_4X4 pffft_real_preprocess_4x4
|
||||
#define FUNC_REAL_PREPROCESS pffft_real_preprocess
|
||||
#define FUNC_REAL_FINALIZE_4X4 pffft_real_finalize_4x4
|
||||
#define FUNC_REAL_FINALIZE pffft_real_finalize
|
||||
#define FUNC_TRANSFORM_INTERNAL pffft_transform_internal
|
||||
|
||||
#define FUNC_COS cosf
|
||||
#define FUNC_SIN sinf
|
||||
|
||||
|
||||
#include "pffft_priv_impl.h"
|
||||
|
||||
|
||||
@@ -1,53 +0,0 @@
|
||||
|
||||
#include "pffft.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
/* SSE and co like 16-bytes aligned pointers
|
||||
* with a 64-byte alignment, we are even aligned on L2 cache lines... */
|
||||
#define MALLOC_V4SF_ALIGNMENT 64
|
||||
|
||||
static void * Valigned_malloc(size_t nb_bytes) {
|
||||
void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
|
||||
if (!p0) return (void *) 0;
|
||||
p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
|
||||
*((void **) p - 1) = p0;
|
||||
return p;
|
||||
}
|
||||
|
||||
static void Valigned_free(void *p) {
|
||||
if (p) free(*((void **) p - 1));
|
||||
}
|
||||
|
||||
|
||||
static int next_power_of_two(int N) {
|
||||
/* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */
|
||||
/* compute the next highest power of 2 of 32-bit v */
|
||||
unsigned v = N;
|
||||
v--;
|
||||
v |= v >> 1;
|
||||
v |= v >> 2;
|
||||
v |= v >> 4;
|
||||
v |= v >> 8;
|
||||
v |= v >> 16;
|
||||
v++;
|
||||
return v;
|
||||
}
|
||||
|
||||
static int is_power_of_two(int N) {
|
||||
/* https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2 */
|
||||
int f = N && !(N & (N - 1));
|
||||
return f;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void *pffft_aligned_malloc(size_t nb_bytes) { return Valigned_malloc(nb_bytes); }
|
||||
void pffft_aligned_free(void *p) { Valigned_free(p); }
|
||||
int pffft_next_power_of_two(int N) { return next_power_of_two(N); }
|
||||
int pffft_is_power_of_two(int N) { return is_power_of_two(N); }
|
||||
|
||||
void *pffftd_aligned_malloc(size_t nb_bytes) { return Valigned_malloc(nb_bytes); }
|
||||
void pffftd_aligned_free(void *p) { Valigned_free(p); }
|
||||
int pffftd_next_power_of_two(int N) { return next_power_of_two(N); }
|
||||
int pffftd_is_power_of_two(int N) { return is_power_of_two(N); }
|
||||
@@ -1,147 +0,0 @@
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
|
||||
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
|
||||
|
||||
Based on original fortran 77 code from FFTPACKv4 from NETLIB
|
||||
(http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
|
||||
of NCAR, in 1985.
|
||||
|
||||
As confirmed by the NCAR fftpack software curators, the following
|
||||
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
|
||||
released under the same terms.
|
||||
|
||||
FFTPACK license:
|
||||
|
||||
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
|
||||
|
||||
Copyright (c) 2004 the University Corporation for Atmospheric
|
||||
Research ("UCAR"). All rights reserved. Developed by NCAR's
|
||||
Computational and Information Systems Laboratory, UCAR,
|
||||
www.cisl.ucar.edu.
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
|
||||
|
||||
PFFFT : a Pretty Fast FFT.
|
||||
|
||||
This file is largerly based on the original FFTPACK implementation, modified in
|
||||
order to take advantage of SIMD instructions of modern CPUs.
|
||||
*/
|
||||
|
||||
/*
|
||||
NOTE: This file is adapted from Julien Pommier's original PFFFT,
|
||||
which works on 32 bit floating point precision using SSE instructions,
|
||||
to work with 64 bit floating point precision using AVX instructions.
|
||||
Author: Dario Mambro @ https://github.com/unevens/pffft
|
||||
*/
|
||||
|
||||
#include "pffft_double.h"
|
||||
|
||||
/* detect compiler flavour */
|
||||
#if defined(_MSC_VER)
|
||||
# define COMPILER_MSVC
|
||||
#elif defined(__GNUC__)
|
||||
# define COMPILER_GCC
|
||||
#endif
|
||||
|
||||
#ifdef COMPILER_MSVC
|
||||
# define _USE_MATH_DEFINES
|
||||
# include <malloc.h>
|
||||
#elif defined(__MINGW32__) || defined(__MINGW64__)
|
||||
# include <malloc.h>
|
||||
#else
|
||||
# include <alloca.h>
|
||||
#endif
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
|
||||
#if defined(COMPILER_GCC)
|
||||
# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
|
||||
# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
|
||||
# define RESTRICT __restrict
|
||||
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
|
||||
#elif defined(COMPILER_MSVC)
|
||||
# define ALWAYS_INLINE(return_type) __forceinline return_type
|
||||
# define NEVER_INLINE(return_type) __declspec(noinline) return_type
|
||||
# define RESTRICT __restrict
|
||||
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef COMPILER_MSVC
|
||||
#pragma warning( disable : 4244 4305 4204 4456 )
|
||||
#endif
|
||||
|
||||
/*
|
||||
vector support macros: the rest of the code is independant of
|
||||
AVX -- adding support for other platforms with 4-element
|
||||
vectors should be limited to these macros
|
||||
*/
|
||||
#include "simd/pf_double.h"
|
||||
|
||||
/* have code comparable with this definition */
|
||||
#define float double
|
||||
#define SETUP_STRUCT PFFFTD_Setup
|
||||
#define FUNC_NEW_SETUP pffftd_new_setup
|
||||
#define FUNC_DESTROY pffftd_destroy_setup
|
||||
#define FUNC_TRANSFORM_UNORDRD pffftd_transform
|
||||
#define FUNC_TRANSFORM_ORDERED pffftd_transform_ordered
|
||||
#define FUNC_ZREORDER pffftd_zreorder
|
||||
#define FUNC_ZCONVOLVE_ACCUMULATE pffftd_zconvolve_accumulate
|
||||
#define FUNC_ZCONVOLVE_NO_ACCU pffftd_zconvolve_no_accu
|
||||
|
||||
#define FUNC_ALIGNED_MALLOC pffftd_aligned_malloc
|
||||
#define FUNC_ALIGNED_FREE pffftd_aligned_free
|
||||
#define FUNC_SIMD_SIZE pffftd_simd_size
|
||||
#define FUNC_MIN_FFT_SIZE pffftd_min_fft_size
|
||||
#define FUNC_IS_VALID_SIZE pffftd_is_valid_size
|
||||
#define FUNC_NEAREST_SIZE pffftd_nearest_transform_size
|
||||
#define FUNC_SIMD_ARCH pffftd_simd_arch
|
||||
#define FUNC_VALIDATE_SIMD_A validate_pffftd_simd
|
||||
#define FUNC_VALIDATE_SIMD_EX validate_pffftd_simd_ex
|
||||
|
||||
#define FUNC_CPLX_FINALIZE pffftd_cplx_finalize
|
||||
#define FUNC_CPLX_PREPROCESS pffftd_cplx_preprocess
|
||||
#define FUNC_REAL_PREPROCESS_4X4 pffftd_real_preprocess_4x4
|
||||
#define FUNC_REAL_PREPROCESS pffftd_real_preprocess
|
||||
#define FUNC_REAL_FINALIZE_4X4 pffftd_real_finalize_4x4
|
||||
#define FUNC_REAL_FINALIZE pffftd_real_finalize
|
||||
#define FUNC_TRANSFORM_INTERNAL pffftd_transform_internal
|
||||
|
||||
#define FUNC_COS cos
|
||||
#define FUNC_SIN sin
|
||||
|
||||
|
||||
#include "pffft_priv_impl.h"
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,81 +0,0 @@
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_ALTIVEC_FLT_H
|
||||
#define PF_ALTIVEC_FLT_H
|
||||
|
||||
/*
|
||||
Altivec support macros
|
||||
*/
|
||||
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
|
||||
#pragma message( __FILE__ ": ALTIVEC float macros are defined" )
|
||||
typedef vector float v4sf;
|
||||
|
||||
# define SIMD_SZ 4
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
float f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
# define VREQUIRES_ALIGN 1 /* not sure, if really required */
|
||||
# define VARCH "ALTIVEC"
|
||||
# define VZERO() ((vector float) vec_splat_u8(0))
|
||||
# define VMUL(a,b) vec_madd(a,b, VZERO())
|
||||
# define VADD(a,b) vec_add(a,b)
|
||||
# define VMADD(a,b,c) vec_madd(a,b,c)
|
||||
# define VSUB(a,b) vec_sub(a,b)
|
||||
inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
|
||||
# define LD_PS1(p) ld_ps1(&p)
|
||||
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
|
||||
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
|
||||
vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
|
||||
vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
|
||||
v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
|
||||
}
|
||||
# define VTRANSPOSE4(x0,x1,x2,x3) { \
|
||||
v4sf y0 = vec_mergeh(x0, x2); \
|
||||
v4sf y1 = vec_mergel(x0, x2); \
|
||||
v4sf y2 = vec_mergeh(x1, x3); \
|
||||
v4sf y3 = vec_mergel(x1, x3); \
|
||||
x0 = vec_mergeh(y0, y2); \
|
||||
x1 = vec_mergel(y0, y2); \
|
||||
x2 = vec_mergeh(y1, y3); \
|
||||
x3 = vec_mergel(y1, y3); \
|
||||
}
|
||||
# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
|
||||
|
||||
#endif
|
||||
|
||||
#endif /* PF_SSE1_FLT_H */
|
||||
|
||||
@@ -1,145 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
|
||||
*/
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_AVX_DBL_H
|
||||
#define PF_AVX_DBL_H
|
||||
|
||||
/*
|
||||
vector support macros: the rest of the code is independant of
|
||||
AVX -- adding support for other platforms with 4-element
|
||||
vectors should be limited to these macros
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
AVX support macros
|
||||
*/
|
||||
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && defined(__AVX__)
|
||||
#pragma message( __FILE__ ": AVX macros are defined" )
|
||||
|
||||
#include <immintrin.h>
|
||||
typedef __m256d v4sf;
|
||||
|
||||
/* 4 doubles by simd vector */
|
||||
# define SIMD_SZ 4
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
double f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
# define VARCH "AVX"
|
||||
# define VREQUIRES_ALIGN 1
|
||||
# define VZERO() _mm256_setzero_pd()
|
||||
# define VMUL(a,b) _mm256_mul_pd(a,b)
|
||||
# define VADD(a,b) _mm256_add_pd(a,b)
|
||||
# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
|
||||
# define VSUB(a,b) _mm256_sub_pd(a,b)
|
||||
# define LD_PS1(p) _mm256_set1_pd(p)
|
||||
# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr)
|
||||
# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr)
|
||||
|
||||
/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
|
||||
out1 = [ in1[0], in2[0], in1[1], in2[1] ]
|
||||
out2 = [ in1[2], in2[2], in1[3], in2[3] ]
|
||||
*/
|
||||
# define INTERLEAVE2(in1, in2, out1, out2) { \
|
||||
__m128d low1__ = _mm256_castpd256_pd128(in1); \
|
||||
__m128d low2__ = _mm256_castpd256_pd128(in2); \
|
||||
__m128d high1__ = _mm256_extractf128_pd(in1, 1); \
|
||||
__m128d high2__ = _mm256_extractf128_pd(in2, 1); \
|
||||
__m256d tmp__ = _mm256_insertf128_pd( \
|
||||
_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \
|
||||
_mm_shuffle_pd(low1__, low2__, 3), \
|
||||
1); \
|
||||
out2 = _mm256_insertf128_pd( \
|
||||
_mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \
|
||||
_mm_shuffle_pd(high1__, high2__, 3), \
|
||||
1); \
|
||||
out1 = tmp__; \
|
||||
}
|
||||
|
||||
/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
|
||||
out1 = [ in1[0], in1[2], in2[0], in2[2] ]
|
||||
out2 = [ in1[1], in1[3], in2[1], in2[3] ]
|
||||
*/
|
||||
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
|
||||
__m128d low1__ = _mm256_castpd256_pd128(in1); \
|
||||
__m128d low2__ = _mm256_castpd256_pd128(in2); \
|
||||
__m128d high1__ = _mm256_extractf128_pd(in1, 1); \
|
||||
__m128d high2__ = _mm256_extractf128_pd(in2, 1); \
|
||||
__m256d tmp__ = _mm256_insertf128_pd( \
|
||||
_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \
|
||||
_mm_shuffle_pd(low2__, high2__, 0), \
|
||||
1); \
|
||||
out2 = _mm256_insertf128_pd( \
|
||||
_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \
|
||||
_mm_shuffle_pd(low2__, high2__, 3), \
|
||||
1); \
|
||||
out1 = tmp__; \
|
||||
}
|
||||
|
||||
# define VTRANSPOSE4(row0, row1, row2, row3) { \
|
||||
__m256d tmp3, tmp2, tmp1, tmp0; \
|
||||
\
|
||||
tmp0 = _mm256_shuffle_pd((row0),(row1), 0x0); \
|
||||
tmp2 = _mm256_shuffle_pd((row0),(row1), 0xF); \
|
||||
tmp1 = _mm256_shuffle_pd((row2),(row3), 0x0); \
|
||||
tmp3 = _mm256_shuffle_pd((row2),(row3), 0xF); \
|
||||
\
|
||||
(row0) = _mm256_permute2f128_pd(tmp0, tmp1, 0x20); \
|
||||
(row1) = _mm256_permute2f128_pd(tmp2, tmp3, 0x20); \
|
||||
(row2) = _mm256_permute2f128_pd(tmp0, tmp1, 0x31); \
|
||||
(row3) = _mm256_permute2f128_pd(tmp2, tmp3, 0x31); \
|
||||
}
|
||||
|
||||
/*VSWAPHL(a, b) pseudo code:
|
||||
return [ b[0], b[1], a[2], a[3] ]
|
||||
*/
|
||||
# define VSWAPHL(a,b) \
|
||||
_mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1)
|
||||
|
||||
/* reverse/flip all floats */
|
||||
# define VREV_S(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_permute_pd(_mm256_extractf128_pd(a, 1),1)), _mm_permute_pd(_mm256_castpd256_pd128(a), 1), 1)
|
||||
|
||||
/* reverse/flip complex floats */
|
||||
# define VREV_C(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1)
|
||||
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
|
||||
|
||||
#endif
|
||||
|
||||
#endif /* PF_AVX_DBL_H */
|
||||
|
||||
@@ -1,84 +0,0 @@
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_DBL_H
|
||||
#define PF_DBL_H
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
|
||||
/*
|
||||
* SIMD reference material:
|
||||
*
|
||||
* general SIMD introduction:
|
||||
* https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
|
||||
*
|
||||
* SSE 1:
|
||||
* https://software.intel.com/sites/landingpage/IntrinsicsGuide/
|
||||
*
|
||||
* ARM NEON:
|
||||
* https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
|
||||
*
|
||||
* Altivec:
|
||||
* https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
|
||||
* https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
|
||||
* better one?
|
||||
*
|
||||
*/
|
||||
|
||||
typedef double vsfscalar;
|
||||
|
||||
#include "pf_avx_double.h"
|
||||
#include "pf_sse2_double.h"
|
||||
#include "pf_neon_double.h"
|
||||
|
||||
#ifndef SIMD_SZ
|
||||
# if !defined(PFFFT_SIMD_DISABLE)
|
||||
# pragma message( "building double with simd disabled !" )
|
||||
# define PFFFT_SIMD_DISABLE /* fallback to scalar code */
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#include "pf_scalar_double.h"
|
||||
|
||||
/* shortcuts for complex multiplcations */
|
||||
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
|
||||
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
|
||||
#ifndef SVMUL
|
||||
/* multiply a scalar with a vector */
|
||||
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
|
||||
#endif
|
||||
|
||||
#endif /* PF_DBL_H */
|
||||
|
||||
@@ -1,84 +0,0 @@
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_FLT_H
|
||||
#define PF_FLT_H
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
|
||||
/*
|
||||
* SIMD reference material:
|
||||
*
|
||||
* general SIMD introduction:
|
||||
* https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
|
||||
*
|
||||
* SSE 1:
|
||||
* https://software.intel.com/sites/landingpage/IntrinsicsGuide/
|
||||
*
|
||||
* ARM NEON:
|
||||
* https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
|
||||
*
|
||||
* Altivec:
|
||||
* https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
|
||||
* https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
|
||||
* better one?
|
||||
*
|
||||
*/
|
||||
|
||||
typedef float vsfscalar;
|
||||
|
||||
#include "pf_sse1_float.h"
|
||||
#include "pf_neon_float.h"
|
||||
#include "pf_altivec_float.h"
|
||||
|
||||
#ifndef SIMD_SZ
|
||||
# if !defined(PFFFT_SIMD_DISABLE)
|
||||
# pragma message( "building float with simd disabled !" )
|
||||
# define PFFFT_SIMD_DISABLE /* fallback to scalar code */
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#include "pf_scalar_float.h"
|
||||
|
||||
/* shortcuts for complex multiplcations */
|
||||
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
|
||||
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
|
||||
#ifndef SVMUL
|
||||
/* multiply a scalar with a vector */
|
||||
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
|
||||
#endif
|
||||
|
||||
#endif /* PF_FLT_H */
|
||||
|
||||
@@ -1,203 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
|
||||
*/
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_NEON_DBL_H
|
||||
#define PF_NEON_DBL_H
|
||||
|
||||
/*
|
||||
NEON 64bit support macros
|
||||
*/
|
||||
#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__))
|
||||
|
||||
#pragma message (__FILE__ ": NEON (from AVX) macros are defined" )
|
||||
|
||||
#include "pf_neon_double_from_avx.h"
|
||||
typedef __m256d v4sf;
|
||||
|
||||
/* 4 doubles by simd vector */
|
||||
# define SIMD_SZ 4
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
double f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
# define VARCH "NEON"
|
||||
# define VREQUIRES_ALIGN 1
|
||||
# define VZERO() _mm256_setzero_pd()
|
||||
# define VMUL(a,b) _mm256_mul_pd(a,b)
|
||||
# define VADD(a,b) _mm256_add_pd(a,b)
|
||||
# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
|
||||
# define VSUB(a,b) _mm256_sub_pd(a,b)
|
||||
# define LD_PS1(p) _mm256_set1_pd(p)
|
||||
# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr)
|
||||
# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr)
|
||||
|
||||
FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b)
|
||||
{
|
||||
__m256d res;
|
||||
res.vect_f64[0] = a.vect_f64[0];
|
||||
res.vect_f64[1] = b;
|
||||
return res;
|
||||
}
|
||||
|
||||
FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
|
||||
{
|
||||
float64x1_t al = vget_low_f64(a);
|
||||
float64x1_t bl = vget_low_f64(b);
|
||||
return vcombine_f64(al, bl);
|
||||
}
|
||||
|
||||
FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b)
|
||||
{
|
||||
float64x1_t ah = vget_high_f64(a);
|
||||
float64x1_t bh = vget_high_f64(b);
|
||||
return vcombine_f64(ah, bh);
|
||||
}
|
||||
|
||||
FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b)
|
||||
{
|
||||
__m256d res;
|
||||
res.vect_f64[0] = _mm_shuffle_pd_00(a.vect_f64[0],b.vect_f64[0]);
|
||||
res.vect_f64[1] = _mm_shuffle_pd_00(a.vect_f64[1],b.vect_f64[1]);
|
||||
return res;
|
||||
}
|
||||
|
||||
FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b)
|
||||
{
|
||||
__m256d res;
|
||||
res.vect_f64[0] = _mm_shuffle_pd_11(a.vect_f64[0],b.vect_f64[0]);
|
||||
res.vect_f64[1] = _mm_shuffle_pd_11(a.vect_f64[1],b.vect_f64[1]);
|
||||
return res;
|
||||
}
|
||||
|
||||
FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b) {
|
||||
__m256d res;
|
||||
res.vect_f64[0] = a.vect_f64[0];
|
||||
res.vect_f64[1] = b.vect_f64[0];
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b)
|
||||
{
|
||||
__m256d res;
|
||||
res.vect_f64[0] = a.vect_f64[1];
|
||||
res.vect_f64[1] = b.vect_f64[1];
|
||||
return res;
|
||||
}
|
||||
|
||||
FORCE_INLINE __m256d _mm256_reverse(__m256d x)
|
||||
{
|
||||
__m256d res;
|
||||
float64x2_t low = x.vect_f64[0];
|
||||
float64x2_t high = x.vect_f64[1];
|
||||
float64x1_t a = vget_low_f64(low);
|
||||
float64x1_t b = vget_high_f64(low);
|
||||
float64x1_t c = vget_low_f64(high);
|
||||
float64x1_t d = vget_high_f64(high);
|
||||
res.vect_f64[0] = vcombine_f64(d, c);
|
||||
res.vect_f64[1] = vcombine_f64(b, a);
|
||||
return res;
|
||||
}
|
||||
|
||||
/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
|
||||
out1 = [ in1[0], in2[0], in1[1], in2[1] ]
|
||||
out2 = [ in1[2], in2[2], in1[3], in2[3] ]
|
||||
*/
|
||||
# define INTERLEAVE2(in1, in2, out1, out2) { \
|
||||
__m128d low1__ = _mm256_castpd256_pd128(in1); \
|
||||
__m128d low2__ = _mm256_castpd256_pd128(in2); \
|
||||
__m128d high1__ = _mm256_extractf128_pd(in1, 1); \
|
||||
__m128d high2__ = _mm256_extractf128_pd(in2, 1); \
|
||||
__m256d tmp__ = _mm256_insertf128_pd_1( \
|
||||
_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)), \
|
||||
_mm_shuffle_pd_11(low1__, low2__)); \
|
||||
out2 = _mm256_insertf128_pd_1( \
|
||||
_mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)), \
|
||||
_mm_shuffle_pd_11(high1__, high2__)); \
|
||||
out1 = tmp__; \
|
||||
}
|
||||
|
||||
/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
|
||||
out1 = [ in1[0], in1[2], in2[0], in2[2] ]
|
||||
out2 = [ in1[1], in1[3], in2[1], in2[3] ]
|
||||
*/
|
||||
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
|
||||
__m128d low1__ = _mm256_castpd256_pd128(in1); \
|
||||
__m128d low2__ = _mm256_castpd256_pd128(in2); \
|
||||
__m128d high1__ = _mm256_extractf128_pd(in1, 1); \
|
||||
__m128d high2__ = _mm256_extractf128_pd(in2, 1); \
|
||||
__m256d tmp__ = _mm256_insertf128_pd_1( \
|
||||
_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)), \
|
||||
_mm_shuffle_pd_00(low2__, high2__)); \
|
||||
out2 = _mm256_insertf128_pd_1( \
|
||||
_mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)), \
|
||||
_mm_shuffle_pd_11(low2__, high2__)); \
|
||||
out1 = tmp__; \
|
||||
}
|
||||
|
||||
# define VTRANSPOSE4(row0, row1, row2, row3) { \
|
||||
__m256d tmp3, tmp2, tmp1, tmp0; \
|
||||
\
|
||||
tmp0 = _mm256_shuffle_pd_00((row0),(row1)); \
|
||||
tmp2 = _mm256_shuffle_pd_11((row0),(row1)); \
|
||||
tmp1 = _mm256_shuffle_pd_00((row2),(row3)); \
|
||||
tmp3 = _mm256_shuffle_pd_11((row2),(row3)); \
|
||||
\
|
||||
(row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1); \
|
||||
(row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3); \
|
||||
(row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1); \
|
||||
(row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3); \
|
||||
}
|
||||
|
||||
/*VSWAPHL(a, b) pseudo code:
|
||||
return [ b[0], b[1], a[2], a[3] ]
|
||||
*/
|
||||
# define VSWAPHL(a,b) \
|
||||
_mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1))
|
||||
|
||||
/* reverse/flip all floats */
|
||||
# define VREV_S(a) _mm256_reverse(a)
|
||||
|
||||
/* reverse/flip complex floats */
|
||||
# define VREV_C(a) _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a))
|
||||
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
|
||||
|
||||
#endif
|
||||
|
||||
#endif /* PF_AVX_DBL_H */
|
||||
|
||||
@@ -1,123 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
|
||||
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
|
||||
*/
|
||||
|
||||
//see https://github.com/kunpengcompute/AvxToNeon
|
||||
|
||||
#ifndef PF_NEON_DBL_FROM_AVX_H
|
||||
#define PF_NEON_DBL_FROM_AVX_H
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
|
||||
#pragma push_macro("FORCE_INLINE")
|
||||
#define FORCE_INLINE static inline __attribute__((always_inline))
|
||||
|
||||
#else
|
||||
|
||||
#error "Macro name collisions may happens with unknown compiler"
|
||||
#ifdef FORCE_INLINE
|
||||
#undef FORCE_INLINE
|
||||
#endif
|
||||
|
||||
#define FORCE_INLINE static inline
|
||||
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
float32x4_t vect_f32[2];
|
||||
} __m256;
|
||||
|
||||
typedef struct {
|
||||
float64x2_t vect_f64[2];
|
||||
} __m256d;
|
||||
|
||||
typedef float64x2_t __m128d;
|
||||
|
||||
FORCE_INLINE __m256d _mm256_setzero_pd(void)
|
||||
{
|
||||
__m256d ret;
|
||||
ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0);
|
||||
return ret;
|
||||
}
|
||||
|
||||
FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
|
||||
{
|
||||
__m256d res_m256d;
|
||||
res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]);
|
||||
res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]);
|
||||
return res_m256d;
|
||||
}
|
||||
|
||||
FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
|
||||
{
|
||||
__m256d res_m256d;
|
||||
res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]);
|
||||
res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]);
|
||||
return res_m256d;
|
||||
}
|
||||
|
||||
FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
|
||||
{
|
||||
__m256d res_m256d;
|
||||
res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]);
|
||||
res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]);
|
||||
return res_m256d;
|
||||
}
|
||||
|
||||
FORCE_INLINE __m256d _mm256_set1_pd(double a)
|
||||
{
|
||||
__m256d ret;
|
||||
ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a);
|
||||
return ret;
|
||||
}
|
||||
|
||||
FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr)
|
||||
{
|
||||
__m256d res;
|
||||
res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
|
||||
res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
|
||||
return res;
|
||||
}
|
||||
FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr)
|
||||
{
|
||||
__m256d res;
|
||||
res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
|
||||
res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
|
||||
return res;
|
||||
}
|
||||
|
||||
FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
|
||||
{
|
||||
return a.vect_f64[0];
|
||||
}
|
||||
|
||||
FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
|
||||
{
|
||||
assert(imm8 >= 0 && imm8 <= 1);
|
||||
return a.vect_f64[imm8];
|
||||
}
|
||||
|
||||
FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
|
||||
{
|
||||
__m256d res;
|
||||
res.vect_f64[0] = a;
|
||||
return res;
|
||||
}
|
||||
|
||||
#endif /* PF_AVX_DBL_H */
|
||||
|
||||
@@ -1,87 +0,0 @@
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_NEON_FLT_H
|
||||
#define PF_NEON_FLT_H
|
||||
|
||||
/*
|
||||
ARM NEON support macros
|
||||
*/
|
||||
#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
|
||||
#pragma message( __FILE__ ": ARM NEON macros are defined" )
|
||||
|
||||
# include <arm_neon.h>
|
||||
typedef float32x4_t v4sf;
|
||||
|
||||
# define SIMD_SZ 4
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
float f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
# define VARCH "NEON"
|
||||
# define VREQUIRES_ALIGN 0 /* usually no alignment required */
|
||||
# define VZERO() vdupq_n_f32(0)
|
||||
# define VMUL(a,b) vmulq_f32(a,b)
|
||||
# define VADD(a,b) vaddq_f32(a,b)
|
||||
# define VMADD(a,b,c) vmlaq_f32(c,a,b)
|
||||
# define VSUB(a,b) vsubq_f32(a,b)
|
||||
# define LD_PS1(p) vld1q_dup_f32(&(p))
|
||||
# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr)))
|
||||
# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr)))
|
||||
# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
|
||||
# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
|
||||
# define VTRANSPOSE4(x0,x1,x2,x3) { \
|
||||
float32x4x2_t t0_ = vzipq_f32(x0, x2); \
|
||||
float32x4x2_t t1_ = vzipq_f32(x1, x3); \
|
||||
float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
|
||||
float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
|
||||
x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
|
||||
}
|
||||
// marginally faster version
|
||||
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
|
||||
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
|
||||
|
||||
/* reverse/flip all floats */
|
||||
# define VREV_S(a) vcombine_f32(vrev64_f32(vget_high_f32(a)), vrev64_f32(vget_low_f32(a)))
|
||||
/* reverse/flip complex floats */
|
||||
# define VREV_C(a) vextq_f32(a, a, 2)
|
||||
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
|
||||
|
||||
#else
|
||||
/* #pragma message( __FILE__ ": ARM NEON macros are not defined" ) */
|
||||
#endif
|
||||
|
||||
#endif /* PF_NEON_FLT_H */
|
||||
|
||||
@@ -1,185 +0,0 @@
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_SCAL_DBL_H
|
||||
#define PF_SCAL_DBL_H
|
||||
|
||||
/*
|
||||
fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead
|
||||
*/
|
||||
|
||||
#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
|
||||
#pragma message( __FILE__ ": double SCALAR4 macros are defined" )
|
||||
|
||||
typedef struct {
|
||||
vsfscalar a;
|
||||
vsfscalar b;
|
||||
vsfscalar c;
|
||||
vsfscalar d;
|
||||
} v4sf;
|
||||
|
||||
# define SIMD_SZ 4
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
vsfscalar f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
# define VARCH "4xScalar"
|
||||
# define VREQUIRES_ALIGN 0
|
||||
|
||||
static ALWAYS_INLINE(v4sf) VZERO() {
|
||||
v4sf r = { 0.f, 0.f, 0.f, 0.f };
|
||||
return r;
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) {
|
||||
v4sf r = { A.a * B.a, A.b * B.b, A.c * B.c, A.d * B.d };
|
||||
return r;
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) {
|
||||
v4sf r = { A.a + B.a, A.b + B.b, A.c + B.c, A.d + B.d };
|
||||
return r;
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) {
|
||||
v4sf r = { A.a * B.a + C.a, A.b * B.b + C.b, A.c * B.c + C.c, A.d * B.d + C.d };
|
||||
return r;
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) {
|
||||
v4sf r = { A.a - B.a, A.b - B.b, A.c - B.c, A.d - B.d };
|
||||
return r;
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) {
|
||||
v4sf r = { v, v, v, v };
|
||||
return r;
|
||||
}
|
||||
|
||||
# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr)))
|
||||
|
||||
# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr)))
|
||||
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0)
|
||||
|
||||
|
||||
/* INTERLEAVE2() */
|
||||
#define INTERLEAVE2( A, B, C, D) \
|
||||
do { \
|
||||
v4sf Cr = { A.a, B.a, A.b, B.b }; \
|
||||
v4sf Dr = { A.c, B.c, A.d, B.d }; \
|
||||
C = Cr; \
|
||||
D = Dr; \
|
||||
} while (0)
|
||||
|
||||
|
||||
/* UNINTERLEAVE2() */
|
||||
#define UNINTERLEAVE2(A, B, C, D) \
|
||||
do { \
|
||||
v4sf Cr = { A.a, A.c, B.a, B.c }; \
|
||||
v4sf Dr = { A.b, A.d, B.b, B.d }; \
|
||||
C = Cr; \
|
||||
D = Dr; \
|
||||
} while (0)
|
||||
|
||||
|
||||
/* VTRANSPOSE4() */
|
||||
#define VTRANSPOSE4(A, B, C, D) \
|
||||
do { \
|
||||
v4sf Ar = { A.a, B.a, C.a, D.a }; \
|
||||
v4sf Br = { A.b, B.b, C.b, D.b }; \
|
||||
v4sf Cr = { A.c, B.c, C.c, D.c }; \
|
||||
v4sf Dr = { A.d, B.d, C.d, D.d }; \
|
||||
A = Ar; \
|
||||
B = Br; \
|
||||
C = Cr; \
|
||||
D = Dr; \
|
||||
} while (0)
|
||||
|
||||
|
||||
/* VSWAPHL() */
|
||||
static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) {
|
||||
v4sf r = { B.a, B.b, A.c, A.d };
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
/* reverse/flip all floats */
|
||||
static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) {
|
||||
v4sf r = { A.d, A.c, A.b, A.a };
|
||||
return r;
|
||||
}
|
||||
|
||||
/* reverse/flip complex floats */
|
||||
static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) {
|
||||
v4sf r = { A.c, A.d, A.a, A.b };
|
||||
return r;
|
||||
}
|
||||
|
||||
#else
|
||||
/* #pragma message( __FILE__ ": double SCALAR4 macros are not defined" ) */
|
||||
#endif
|
||||
|
||||
|
||||
#if !defined(SIMD_SZ)
|
||||
#pragma message( __FILE__ ": float SCALAR1 macros are defined" )
|
||||
typedef vsfscalar v4sf;
|
||||
|
||||
# define SIMD_SZ 1
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
vsfscalar f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
# define VARCH "Scalar"
|
||||
# define VREQUIRES_ALIGN 0
|
||||
# define VZERO() 0.0
|
||||
# define VMUL(a,b) ((a)*(b))
|
||||
# define VADD(a,b) ((a)+(b))
|
||||
# define VMADD(a,b,c) ((a)*(b)+(c))
|
||||
# define VSUB(a,b) ((a)-(b))
|
||||
# define LD_PS1(p) (p)
|
||||
# define VLOAD_UNALIGNED(ptr) (*(ptr))
|
||||
# define VLOAD_ALIGNED(ptr) (*(ptr))
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)
|
||||
|
||||
#else
|
||||
/* #pragma message( __FILE__ ": double SCALAR1 macros are not defined" ) */
|
||||
#endif
|
||||
|
||||
|
||||
#endif /* PF_SCAL_DBL_H */
|
||||
|
||||
@@ -1,185 +0,0 @@
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_SCAL_FLT_H
|
||||
#define PF_SCAL_FLT_H
|
||||
|
||||
/*
|
||||
fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead
|
||||
*/
|
||||
|
||||
#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
|
||||
#pragma message( __FILE__ ": float SCALAR4 macros are defined" )
|
||||
|
||||
typedef struct {
|
||||
vsfscalar a;
|
||||
vsfscalar b;
|
||||
vsfscalar c;
|
||||
vsfscalar d;
|
||||
} v4sf;
|
||||
|
||||
# define SIMD_SZ 4
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
vsfscalar f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
# define VARCH "4xScalar"
|
||||
# define VREQUIRES_ALIGN 0
|
||||
|
||||
static ALWAYS_INLINE(v4sf) VZERO() {
|
||||
v4sf r = { 0.f, 0.f, 0.f, 0.f };
|
||||
return r;
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) {
|
||||
v4sf r = { A.a * B.a, A.b * B.b, A.c * B.c, A.d * B.d };
|
||||
return r;
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) {
|
||||
v4sf r = { A.a + B.a, A.b + B.b, A.c + B.c, A.d + B.d };
|
||||
return r;
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) {
|
||||
v4sf r = { A.a * B.a + C.a, A.b * B.b + C.b, A.c * B.c + C.c, A.d * B.d + C.d };
|
||||
return r;
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) {
|
||||
v4sf r = { A.a - B.a, A.b - B.b, A.c - B.c, A.d - B.d };
|
||||
return r;
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) {
|
||||
v4sf r = { v, v, v, v };
|
||||
return r;
|
||||
}
|
||||
|
||||
# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr)))
|
||||
|
||||
# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr)))
|
||||
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0)
|
||||
|
||||
|
||||
/* INTERLEAVE2() */
|
||||
#define INTERLEAVE2( A, B, C, D) \
|
||||
do { \
|
||||
v4sf Cr = { A.a, B.a, A.b, B.b }; \
|
||||
v4sf Dr = { A.c, B.c, A.d, B.d }; \
|
||||
C = Cr; \
|
||||
D = Dr; \
|
||||
} while (0)
|
||||
|
||||
|
||||
/* UNINTERLEAVE2() */
|
||||
#define UNINTERLEAVE2(A, B, C, D) \
|
||||
do { \
|
||||
v4sf Cr = { A.a, A.c, B.a, B.c }; \
|
||||
v4sf Dr = { A.b, A.d, B.b, B.d }; \
|
||||
C = Cr; \
|
||||
D = Dr; \
|
||||
} while (0)
|
||||
|
||||
|
||||
/* VTRANSPOSE4() */
|
||||
#define VTRANSPOSE4(A, B, C, D) \
|
||||
do { \
|
||||
v4sf Ar = { A.a, B.a, C.a, D.a }; \
|
||||
v4sf Br = { A.b, B.b, C.b, D.b }; \
|
||||
v4sf Cr = { A.c, B.c, C.c, D.c }; \
|
||||
v4sf Dr = { A.d, B.d, C.d, D.d }; \
|
||||
A = Ar; \
|
||||
B = Br; \
|
||||
C = Cr; \
|
||||
D = Dr; \
|
||||
} while (0)
|
||||
|
||||
|
||||
/* VSWAPHL() */
|
||||
static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) {
|
||||
v4sf r = { B.a, B.b, A.c, A.d };
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
/* reverse/flip all floats */
|
||||
static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) {
|
||||
v4sf r = { A.d, A.c, A.b, A.a };
|
||||
return r;
|
||||
}
|
||||
|
||||
/* reverse/flip complex floats */
|
||||
static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) {
|
||||
v4sf r = { A.c, A.d, A.a, A.b };
|
||||
return r;
|
||||
}
|
||||
|
||||
#else
|
||||
/* #pragma message( __FILE__ ": float SCALAR4 macros are not defined" ) */
|
||||
#endif
|
||||
|
||||
|
||||
#if !defined(SIMD_SZ)
|
||||
#pragma message( __FILE__ ": float SCALAR1 macros are defined" )
|
||||
typedef vsfscalar v4sf;
|
||||
|
||||
# define SIMD_SZ 1
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
vsfscalar f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
# define VARCH "Scalar"
|
||||
# define VREQUIRES_ALIGN 0
|
||||
# define VZERO() 0.f
|
||||
# define VMUL(a,b) ((a)*(b))
|
||||
# define VADD(a,b) ((a)+(b))
|
||||
# define VMADD(a,b,c) ((a)*(b)+(c))
|
||||
# define VSUB(a,b) ((a)-(b))
|
||||
# define LD_PS1(p) (p)
|
||||
# define VLOAD_UNALIGNED(ptr) (*(ptr))
|
||||
# define VLOAD_ALIGNED(ptr) (*(ptr))
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)
|
||||
|
||||
#else
|
||||
/* #pragma message( __FILE__ ": float SCALAR1 macros are not defined" ) */
|
||||
#endif
|
||||
|
||||
|
||||
#endif /* PF_SCAL_FLT_H */
|
||||
|
||||
@@ -1,82 +0,0 @@
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_SSE1_FLT_H
|
||||
#define PF_SSE1_FLT_H
|
||||
|
||||
/*
|
||||
SSE1 support macros
|
||||
*/
|
||||
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86))
|
||||
#pragma message( __FILE__ ": SSE1 float macros are defined" )
|
||||
|
||||
#include <xmmintrin.h>
|
||||
typedef __m128 v4sf;
|
||||
|
||||
/* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions
|
||||
* anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */
|
||||
# define SIMD_SZ 4
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
float f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
# define VARCH "SSE1"
|
||||
# define VREQUIRES_ALIGN 1
|
||||
# define VZERO() _mm_setzero_ps()
|
||||
# define VMUL(a,b) _mm_mul_ps(a,b)
|
||||
# define VADD(a,b) _mm_add_ps(a,b)
|
||||
# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
|
||||
# define VSUB(a,b) _mm_sub_ps(a,b)
|
||||
# define LD_PS1(p) _mm_set1_ps(p)
|
||||
# define VLOAD_UNALIGNED(ptr) _mm_loadu_ps(ptr)
|
||||
# define VLOAD_ALIGNED(ptr) _mm_load_ps(ptr)
|
||||
|
||||
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
|
||||
# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
|
||||
# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
|
||||
# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
|
||||
|
||||
/* reverse/flip all floats */
|
||||
# define VREV_S(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3))
|
||||
/* reverse/flip complex floats */
|
||||
# define VREV_C(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(1,0,3,2))
|
||||
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
|
||||
|
||||
#else
|
||||
/* #pragma message( __FILE__ ": SSE1 float macros are not defined" ) */
|
||||
#endif
|
||||
|
||||
#endif /* PF_SSE1_FLT_H */
|
||||
|
||||
@@ -1,281 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
|
||||
*/
|
||||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef PF_SSE2_DBL_H
|
||||
#define PF_SSE2_DBL_H
|
||||
|
||||
//detect sse2 support under MSVC
|
||||
#if defined ( _M_IX86_FP )
|
||||
# if _M_IX86_FP == 2
|
||||
# if !defined(__SSE2__)
|
||||
# define __SSE2__
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/*
|
||||
SSE2 64bit support macros
|
||||
*/
|
||||
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined( __SSE4_2__ ) | defined( __SSE4_1__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || defined ( __x86_64__ ) || defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 ))
|
||||
#pragma message (__FILE__ ": SSE2 double macros are defined" )
|
||||
|
||||
#include <emmintrin.h>
|
||||
|
||||
typedef struct {
|
||||
__m128d d128[2];
|
||||
} m256d;
|
||||
|
||||
typedef m256d v4sf;
|
||||
|
||||
# define SIMD_SZ 4
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
double f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
|
||||
#pragma push_macro("FORCE_INLINE")
|
||||
#define FORCE_INLINE static inline __attribute__((always_inline))
|
||||
|
||||
#elif defined (_MSC_VER)
|
||||
#define FORCE_INLINE static __forceinline
|
||||
|
||||
#else
|
||||
#error "Macro name collisions may happens with unknown compiler"
|
||||
#ifdef FORCE_INLINE
|
||||
#undef FORCE_INLINE
|
||||
#endif
|
||||
#define FORCE_INLINE static inline
|
||||
#endif
|
||||
|
||||
FORCE_INLINE m256d mm256_setzero_pd(void)
|
||||
{
|
||||
m256d ret;
|
||||
ret.d128[0] = ret.d128[1] = _mm_setzero_pd();
|
||||
return ret;
|
||||
}
|
||||
|
||||
FORCE_INLINE m256d mm256_mul_pd(m256d a, m256d b)
|
||||
{
|
||||
m256d ret;
|
||||
ret.d128[0] = _mm_mul_pd(a.d128[0], b.d128[0]);
|
||||
ret.d128[1] = _mm_mul_pd(a.d128[1], b.d128[1]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
FORCE_INLINE m256d mm256_add_pd(m256d a, m256d b)
|
||||
{
|
||||
m256d ret;
|
||||
ret.d128[0] = _mm_add_pd(a.d128[0], b.d128[0]);
|
||||
ret.d128[1] = _mm_add_pd(a.d128[1], b.d128[1]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
FORCE_INLINE m256d mm256_sub_pd(m256d a, m256d b)
|
||||
{
|
||||
m256d ret;
|
||||
ret.d128[0] = _mm_sub_pd(a.d128[0], b.d128[0]);
|
||||
ret.d128[1] = _mm_sub_pd(a.d128[1], b.d128[1]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
FORCE_INLINE m256d mm256_set1_pd(double a)
|
||||
{
|
||||
m256d ret;
|
||||
ret.d128[0] = ret.d128[1] = _mm_set1_pd(a);
|
||||
return ret;
|
||||
}
|
||||
|
||||
FORCE_INLINE m256d mm256_load_pd (double const * mem_addr)
|
||||
{
|
||||
m256d res;
|
||||
res.d128[0] = _mm_load_pd((const double *)mem_addr);
|
||||
res.d128[1] = _mm_load_pd((const double *)mem_addr + 2);
|
||||
return res;
|
||||
}
|
||||
FORCE_INLINE m256d mm256_loadu_pd (double const * mem_addr)
|
||||
{
|
||||
m256d res;
|
||||
res.d128[0] = _mm_loadu_pd((const double *)mem_addr);
|
||||
res.d128[1] = _mm_loadu_pd((const double *)mem_addr + 2);
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
# define VARCH "SSE2"
|
||||
# define VREQUIRES_ALIGN 1
|
||||
# define VZERO() mm256_setzero_pd()
|
||||
# define VMUL(a,b) mm256_mul_pd(a,b)
|
||||
# define VADD(a,b) mm256_add_pd(a,b)
|
||||
# define VMADD(a,b,c) mm256_add_pd(mm256_mul_pd(a,b), c)
|
||||
# define VSUB(a,b) mm256_sub_pd(a,b)
|
||||
# define LD_PS1(p) mm256_set1_pd(p)
|
||||
# define VLOAD_UNALIGNED(ptr) mm256_loadu_pd(ptr)
|
||||
# define VLOAD_ALIGNED(ptr) mm256_load_pd(ptr)
|
||||
|
||||
|
||||
FORCE_INLINE __m128d mm256_castpd256_pd128(m256d a)
|
||||
{
|
||||
return a.d128[0];
|
||||
}
|
||||
|
||||
FORCE_INLINE __m128d mm256_extractf128_pd (m256d a, const int imm8)
|
||||
{
|
||||
assert(imm8 >= 0 && imm8 <= 1);
|
||||
return a.d128[imm8];
|
||||
}
|
||||
FORCE_INLINE m256d mm256_insertf128_pd_1(m256d a, __m128d b)
|
||||
{
|
||||
m256d res;
|
||||
res.d128[0] = a.d128[0];
|
||||
res.d128[1] = b;
|
||||
return res;
|
||||
}
|
||||
FORCE_INLINE m256d mm256_castpd128_pd256(__m128d a)
|
||||
{
|
||||
m256d res;
|
||||
res.d128[0] = a;
|
||||
return res;
|
||||
}
|
||||
|
||||
FORCE_INLINE m256d mm256_shuffle_pd_00(m256d a, m256d b)
|
||||
{
|
||||
m256d res;
|
||||
res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0],0);
|
||||
res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1],0);
|
||||
return res;
|
||||
}
|
||||
|
||||
FORCE_INLINE m256d mm256_shuffle_pd_11(m256d a, m256d b)
|
||||
{
|
||||
m256d res;
|
||||
res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0], 3);
|
||||
res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1], 3);
|
||||
return res;
|
||||
}
|
||||
|
||||
FORCE_INLINE m256d mm256_permute2f128_pd_0x20(m256d a, m256d b) {
|
||||
m256d res;
|
||||
res.d128[0] = a.d128[0];
|
||||
res.d128[1] = b.d128[0];
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
FORCE_INLINE m256d mm256_permute2f128_pd_0x31(m256d a, m256d b)
|
||||
{
|
||||
m256d res;
|
||||
res.d128[0] = a.d128[1];
|
||||
res.d128[1] = b.d128[1];
|
||||
return res;
|
||||
}
|
||||
|
||||
FORCE_INLINE m256d mm256_reverse(m256d x)
|
||||
{
|
||||
m256d res;
|
||||
res.d128[0] = _mm_shuffle_pd(x.d128[1],x.d128[1],1);
|
||||
res.d128[1] = _mm_shuffle_pd(x.d128[0],x.d128[0],1);
|
||||
return res;
|
||||
}
|
||||
|
||||
/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
|
||||
out1 = [ in1[0], in2[0], in1[1], in2[1] ]
|
||||
out2 = [ in1[2], in2[2], in1[3], in2[3] ]
|
||||
*/
|
||||
# define INTERLEAVE2(in1, in2, out1, out2) { \
|
||||
__m128d low1__ = mm256_castpd256_pd128(in1); \
|
||||
__m128d low2__ = mm256_castpd256_pd128(in2); \
|
||||
__m128d high1__ = mm256_extractf128_pd(in1, 1); \
|
||||
__m128d high2__ = mm256_extractf128_pd(in2, 1); \
|
||||
m256d tmp__ = mm256_insertf128_pd_1( \
|
||||
mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \
|
||||
_mm_shuffle_pd(low1__, low2__, 3)); \
|
||||
out2 = mm256_insertf128_pd_1( \
|
||||
mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \
|
||||
_mm_shuffle_pd(high1__, high2__, 3)); \
|
||||
out1 = tmp__; \
|
||||
}
|
||||
|
||||
/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
|
||||
out1 = [ in1[0], in1[2], in2[0], in2[2] ]
|
||||
out2 = [ in1[1], in1[3], in2[1], in2[3] ]
|
||||
*/
|
||||
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
|
||||
__m128d low1__ = mm256_castpd256_pd128(in1); \
|
||||
__m128d low2__ = mm256_castpd256_pd128(in2); \
|
||||
__m128d high1__ = mm256_extractf128_pd(in1, 1); \
|
||||
__m128d high2__ = mm256_extractf128_pd(in2, 1); \
|
||||
m256d tmp__ = mm256_insertf128_pd_1( \
|
||||
mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \
|
||||
_mm_shuffle_pd(low2__, high2__, 0)); \
|
||||
out2 = mm256_insertf128_pd_1( \
|
||||
mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \
|
||||
_mm_shuffle_pd(low2__, high2__, 3)); \
|
||||
out1 = tmp__; \
|
||||
}
|
||||
|
||||
# define VTRANSPOSE4(row0, row1, row2, row3) { \
|
||||
m256d tmp3, tmp2, tmp1, tmp0; \
|
||||
\
|
||||
tmp0 = mm256_shuffle_pd_00((row0),(row1)); \
|
||||
tmp2 = mm256_shuffle_pd_11((row0),(row1)); \
|
||||
tmp1 = mm256_shuffle_pd_00((row2),(row3)); \
|
||||
tmp3 = mm256_shuffle_pd_11((row2),(row3)); \
|
||||
\
|
||||
(row0) = mm256_permute2f128_pd_0x20(tmp0, tmp1); \
|
||||
(row1) = mm256_permute2f128_pd_0x20(tmp2, tmp3); \
|
||||
(row2) = mm256_permute2f128_pd_0x31(tmp0, tmp1); \
|
||||
(row3) = mm256_permute2f128_pd_0x31(tmp2, tmp3); \
|
||||
}
|
||||
|
||||
/*VSWAPHL(a, b) pseudo code:
|
||||
return [ b[0], b[1], a[2], a[3] ]
|
||||
*/
|
||||
# define VSWAPHL(a,b) \
|
||||
mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_castpd256_pd128(b)), mm256_extractf128_pd(a, 1))
|
||||
|
||||
/* reverse/flip all floats */
|
||||
# define VREV_S(a) mm256_reverse(a)
|
||||
|
||||
/* reverse/flip complex floats */
|
||||
# define VREV_C(a) mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_extractf128_pd(a, 1)), mm256_castpd256_pd128(a))
|
||||
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
|
||||
|
||||
#endif
|
||||
#endif
|
||||
@@ -1,15 +0,0 @@
|
||||
import Testing
|
||||
@testable import PFFFT
|
||||
|
||||
@Test func example() async throws {
|
||||
let cache = PFFFT.SetupCache<Double>()
|
||||
|
||||
let ff = try cache.get(for: 1024, type: .real)
|
||||
|
||||
let input = Buffer<Double>(capacity: 1023)
|
||||
let output = Buffer<Double>(capacity: 1024)
|
||||
|
||||
ff.fft(input: input, output: output, work: nil, sign: .forward)
|
||||
|
||||
// Write your test here and use APIs like `#expect(...)` to check expected conditions.
|
||||
}
|
||||
@@ -1,26 +1,12 @@
|
||||
import Algorithms
|
||||
import ComplexModule
|
||||
import Foundation
|
||||
import OdeInt
|
||||
import PFFFT
|
||||
import PFFFTLib
|
||||
import RealModule
|
||||
|
||||
struct Parameters {
|
||||
/// The number of beats to simulate.
|
||||
let numBeats: Int = 256
|
||||
|
||||
/// The ECG sampling frequency in Hz.
|
||||
let sfEcg: Int = 256
|
||||
|
||||
/// The internal sampling frequency in Hz.
|
||||
var sfInternal: Int = 512
|
||||
|
||||
/// The mean heart rate in beats per minute.
|
||||
let hrMean: Double = 60.0
|
||||
|
||||
/// The standard deviation of the heart rate.
|
||||
let hrStd: Double = 1.0
|
||||
|
||||
public struct Parameters {
|
||||
/// The ratio of power between low and high frequencies.
|
||||
let lfhfRatio: Double = 0.5
|
||||
|
||||
@@ -28,7 +14,7 @@ struct Parameters {
|
||||
let amplitude: Double = 1.4
|
||||
|
||||
/// RNG seed value.
|
||||
let seed: Int = 0
|
||||
let seed: UInt64 = 111
|
||||
|
||||
/// Amplitude of the noise.
|
||||
let aNoise: Double = 0.0
|
||||
@@ -55,8 +41,21 @@ struct Parameters {
|
||||
let fhistd = 0.01
|
||||
}
|
||||
|
||||
struct EcgDerive {
|
||||
let rrpc: [Double]
|
||||
public struct TimeParameters {
|
||||
/// The number of beats to simulate.
|
||||
let numBeats: Int = 8
|
||||
|
||||
/// The ECG sampling frequency in Hz.
|
||||
let sfEcg: Int = 256
|
||||
|
||||
/// The internal sampling frequency in Hz.
|
||||
var sfInternal: Int = 512
|
||||
|
||||
/// The mean heart rate in beats per minute.
|
||||
let hrMean: Double = 75.0
|
||||
|
||||
/// The standard deviation of the heart rate.
|
||||
let hrStd: Double = 1.0
|
||||
}
|
||||
|
||||
func stdev(_ data: [Double]) -> Double {
|
||||
@@ -65,21 +64,31 @@ func stdev(_ data: [Double]) -> Double {
|
||||
return sqrt(data.lazy.map { ($0 - mean) * ($0 - mean) }.reduce(0.0, +) / (n - 1))
|
||||
}
|
||||
|
||||
struct RRProcess: ~Copyable {
|
||||
public struct RRProcess: ~Copyable {
|
||||
let nrr: Int
|
||||
let sfInternal: Int
|
||||
|
||||
let fft: FFT<Double>
|
||||
let spectrum: Buffer<Complex<Double>>
|
||||
let signal: Buffer<Double>
|
||||
let fft: FFT<Double>
|
||||
|
||||
init(nrr: Int) {
|
||||
self.nrr = nrr
|
||||
// mean and standard deviation of RR intervals
|
||||
let rrMean: Double
|
||||
let rrStd: Double
|
||||
|
||||
public init(params: TimeParameters) {
|
||||
sfInternal = params.sfInternal
|
||||
rrMean = 60.0 / params.hrMean
|
||||
rrStd = 60.0 * params.hrStd / (params.hrMean * params.hrMean)
|
||||
|
||||
nrr = Int(pow(2.0, ceil(log2(Double(params.numBeats * sfInternal) * rrMean))))
|
||||
|
||||
fft = try! FFT<Double>(n: nrr)
|
||||
spectrum = fft.makeSpectrumBuffer(extra: 1)
|
||||
signal = fft.makeSignalBuffer()
|
||||
}
|
||||
|
||||
func generate(params: Parameters) -> [Double] {
|
||||
public func generate(params: Parameters) -> [Double] {
|
||||
let w1 = 2.0 * .pi * params.flo
|
||||
let w2 = 2.0 * .pi * params.fhi
|
||||
let c1 = 2.0 * .pi * params.flostd
|
||||
@@ -88,74 +97,159 @@ struct RRProcess: ~Copyable {
|
||||
let sig2 = 1.0
|
||||
let sig1 = params.lfhfRatio
|
||||
|
||||
let rrmean = 60.0 / params.hrMean
|
||||
let rrstd = 60.0 * params.hrStd / (params.hrMean * params.hrMean)
|
||||
let sf = Double(sfInternal)
|
||||
|
||||
let sf = Double(params.sfInternal)
|
||||
let df = sf / Double(nrr)
|
||||
let dw = (sf / Double(nrr)) * 2.0 * .pi
|
||||
|
||||
var rng = Xoshiro256Plus(seed: params.seed)
|
||||
|
||||
spectrum.mapInPlaceSwapLast { i in
|
||||
let w = dw * Double(i)
|
||||
|
||||
spectrum.withUnsafeMutableBufferPointer { swc in
|
||||
for i in 0 ..< nrr / 2 + 1 {
|
||||
let w = df * Double(i) * 2.0 * .pi
|
||||
let dw1 = w - w1
|
||||
let dw2 = w - w2
|
||||
let hw = sig1 * exp(-dw1 * dw1 / (2.0 * c1 * c1)) / sqrt(2.0 * .pi * c1 * c1)
|
||||
+ sig2 * exp(-dw2 * dw2 / (2.0 * c2 * c2)) / sqrt(2.0 * .pi * c2 * c2)
|
||||
|
||||
let sw = (sf / 2.0) * sqrt(hw)
|
||||
let ph = 2.0 * .pi * Double.random(in: 0.0 ..< 1.0)
|
||||
let ph = 2.0 * .pi * rng.nextDouble()
|
||||
|
||||
swc[i].real = sw * cos(ph)
|
||||
swc[i].imaginary = sw * sin(ph)
|
||||
}
|
||||
|
||||
// pack Nyquist frequency real to imaginary of DC
|
||||
swc[0].imaginary = swc[nrr / 2].real
|
||||
return Complex(length: sw, phase: ph)
|
||||
}
|
||||
|
||||
fft.inverse(spectrum: spectrum, signal: signal)
|
||||
|
||||
var rr = signal.withUnsafeMutableBufferPointer { outptr in
|
||||
outptr.map { $0 * 1.0 / Double(nrr) }
|
||||
}
|
||||
var rr = signal.map { $0 * 1.0 / Double(nrr) }
|
||||
|
||||
let xstd = stdev(rr)
|
||||
let ratio = rrstd / xstd
|
||||
let ratio = rrStd / xstd
|
||||
|
||||
for i in 0 ..< nrr {
|
||||
rr[i] = rr[i] * ratio + rrmean
|
||||
rr[i] = rr[i] * ratio + rrMean
|
||||
}
|
||||
return rr
|
||||
}
|
||||
}
|
||||
|
||||
// func compute(params: Parameters) {
|
||||
// // adjust extrema parameters for mean heart rate
|
||||
// let hrFact = sqrt(params.hrMean / 60)
|
||||
// let hrFact2 = sqrt(hrFact)
|
||||
public struct Generator: ~Copyable {
|
||||
let rrp: RRProcess
|
||||
|
||||
// let bi = params.b.map { $0 * hrFact }
|
||||
let hrFact: Double
|
||||
let hrFactSqrt: Double
|
||||
|
||||
// /// XXX: Discrepancy here between Java/C and Matlab, the former uses 1.0 for ti[4] adjustment.
|
||||
// let ti = zip([hrFact2, hrFact, 1, hrFact, hrFact2], params.theta).map(*)
|
||||
let dt: Double
|
||||
|
||||
// let ai = params.a
|
||||
|
||||
// let x0 = SIMD3<Double>(1.0, 0.0, 0.04) // XXX: Convert to init from vector3d
|
||||
// let rseed = params.seed
|
||||
let timeParams: TimeParameters
|
||||
|
||||
// // calculate time scales
|
||||
// let h = 1.0 / Double(params.sfInternal)
|
||||
// let tstep = 1.0 / Double(params.sfEcg)
|
||||
public init(params: TimeParameters) {
|
||||
rrp = RRProcess(params: params)
|
||||
|
||||
// // calculate length of the RR time series
|
||||
// let rrmean = (60.0 / params.hrMean)
|
||||
// stretching factors for intervals based on Bazett's formula
|
||||
hrFact = sqrt(params.hrMean / 60)
|
||||
hrFactSqrt = sqrt(hrFact)
|
||||
|
||||
// let numRr = Int(pow(2.0, ceil(log2(Double(params.numBeats * params.sfInternal) * rrmean))))
|
||||
// init time scales
|
||||
dt = 1.0 / Double(params.sfInternal)
|
||||
|
||||
// var rr = [Double](repeating: 0.0, count: numRr)
|
||||
timeParams = params
|
||||
}
|
||||
|
||||
// // TODO: check sfInternal is integer multple of sfEcg
|
||||
public func compute(params: Parameters) -> [Double] {
|
||||
let ai = params.a
|
||||
let bi = params.b.map { $0 * hrFact }
|
||||
// adjust extrema parameters for mean heart rate
|
||||
let ti = zip([hrFactSqrt, hrFact, 1, hrFact, hrFactSqrt], params.theta).map(*)
|
||||
|
||||
// // define frequency parameters for rr process
|
||||
// }
|
||||
let sfInternal = timeParams.sfInternal
|
||||
|
||||
let rr = rrp.generate(params: params)
|
||||
|
||||
let fhi = params.fhi
|
||||
|
||||
let dt = self.dt
|
||||
|
||||
// generate piecewise RR
|
||||
let rrpc = [Double](unsafeUninitializedCapacity: rr.count * 2) { rrpc, count in
|
||||
var tecg = 0.0
|
||||
var i = 0
|
||||
var j = 0
|
||||
while i < rr.count {
|
||||
tecg += rr[i]
|
||||
j = Int((tecg / dt).rounded(.toNearestOrEven))
|
||||
for k in i ... j {
|
||||
rrpc[k] = rr[i]
|
||||
}
|
||||
i = j + 1
|
||||
}
|
||||
count = i
|
||||
}
|
||||
print("rrpc: \(rrpc.count)")
|
||||
|
||||
let nt = rr.count
|
||||
|
||||
let x0 = SIMD3<Double>(1.0, 0.0, 0.04)
|
||||
|
||||
var mip = 0
|
||||
var mt2 = 0.0
|
||||
|
||||
let ts = (0 ..< nt).map { Double($0) * dt }
|
||||
print("ts: \(ts.count) \(ts.last!)")
|
||||
|
||||
let result = SIMD3<Double>.integrate(over: ts, y0: x0, tol: 1e-6) { x, t in
|
||||
let ta = atan2(x[1], x[0])
|
||||
|
||||
let r0 = 1.0
|
||||
let a0 = 1.0 - sqrt(x[0] * x[0] + x[1] * x[1]) / r0
|
||||
|
||||
let ip = Int(floor(t * Double(sfInternal)))
|
||||
mip = max(ip, mip)
|
||||
mt2 = max(t, mt2)
|
||||
//print("ip: \(ip) mip: \(mip) mt2: \(mt2)")
|
||||
|
||||
let w0 = 2 * .pi / rrpc[ip]
|
||||
|
||||
let zbase = 0.005 * sin(2 * .pi * fhi * t)
|
||||
|
||||
var dxdt = SIMD3<Double>()
|
||||
|
||||
dxdt[0] = a0 * x[0] - w0 * x[1]
|
||||
dxdt[1] = a0 * x[1] + w0 * x[0]
|
||||
|
||||
dxdt[2] = 0.0
|
||||
|
||||
for i in 0 ..< ti.count {
|
||||
let dt = remainder(ta - ti[i], 2 * .pi)
|
||||
let dt² = dt * dt
|
||||
|
||||
dxdt[2] += -ai[i] * dt * exp(-0.5 * dt² / (bi[i] * bi[i]))
|
||||
}
|
||||
dxdt[2] += -1.0 * (x[2] - zbase)
|
||||
|
||||
return dxdt
|
||||
}
|
||||
|
||||
print("mip: \(mip)")
|
||||
|
||||
// downsample to ECG sampling frequency
|
||||
let qstep = sfInternal / timeParams.sfEcg
|
||||
var zresult = stride(from: 0, to: nt, by: qstep).map { result[$0][2] }
|
||||
|
||||
let (zmin, zmax) = zresult.minAndMax()!
|
||||
let zrange = zmax - zmin
|
||||
|
||||
var rng = Xoshiro256Plus(seed: params.seed + 1)
|
||||
|
||||
// Scale signal between -0.4 and 1.2 mV
|
||||
// add uniformly distributed measurement noise
|
||||
for i in 0 ..< zresult.count {
|
||||
zresult[i] = 1.6 * (zresult[i] - zmin) / zrange - 0.4
|
||||
zresult[i] += params.aNoise * (2.0 * rng.nextDouble() - 1.0)
|
||||
}
|
||||
|
||||
// write zresult to text/CSV file
|
||||
let filename = "ecg.csv"
|
||||
try! zresult[0 ..< 800].lazy.map { String($0) }.joined(separator: "\n").write(to: URL(fileURLWithPath: filename), atomically: true, encoding: .utf8)
|
||||
return zresult
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,13 +1,16 @@
|
||||
import Testing
|
||||
@testable import EcgSynKit
|
||||
import PFFFT
|
||||
import ComplexModule
|
||||
|
||||
@Test func fftTest () {
|
||||
// Write your test here and use APIs like `#expect(...)` to check expected conditions.
|
||||
let p0 = TimeParameters()
|
||||
let c = Generator(params: p0)
|
||||
let p = Parameters()
|
||||
let z = c.compute(params: p)
|
||||
|
||||
|
||||
let rrp = RRProcess(nrr: 131072)
|
||||
let rr = rrp.generate(params: p)
|
||||
// print("z: \(z)")
|
||||
print("PFFFT.simdArch: \(FFT<Complex<Float32>>.simdArch)")
|
||||
//fft(data: &a, isign: -1)
|
||||
//print("out: \(rr)")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user