add PFFFT target, FFTS sources
FFTS/.gitignore (vendored, new file, +8)
@@ -0,0 +1,8 @@
.DS_Store
/.build
/Packages
xcuserdata/
DerivedData/
.swiftpm/configuration/registries.json
.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
.netrc
FFTS/Package.swift (new file, +24)
@@ -0,0 +1,24 @@
// swift-tools-version: 6.0
// The swift-tools-version declares the minimum version of Swift required to build this package.

import PackageDescription

let package = Package(
    name: "FFTS",
    products: [
        // Products define the executables and libraries a package produces, making them visible to other packages.
        .library(
            name: "FFTS",
            targets: ["FFTS"]),
    ],
    targets: [
        .plugin(name: "CMakePlugin", capability: .buildTool()),
        .target(
            name: "FFTS",
            plugins: ["CMakePlugin"]),
        .testTarget(
            name: "FFTSTests",
            dependencies: ["FFTS"]
        ),
    ]
)
FFTS/Plugins/CMakePlugin/CMakePlugin.swift (new file, +3)
@@ -0,0 +1,3 @@
import PackagePlugin

@main
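As committed, CMakePlugin.swift ends at the `@main` attribute, so the FFTS package cannot compile on its own yet. For orientation, a minimal sketch of the shape the plugin presumably takes, given that the manifest registers it with `capability: .buildTool()`; the conformance, type body, and empty command list below are assumptions, not part of this commit:

import PackagePlugin

@main
struct CMakePlugin: BuildToolPlugin {
    // Hypothetical completion: the committed file stops after `@main`.
    func createBuildCommands(context: PluginContext, target: Target) async throws -> [Command] {
        // A real implementation would presumably return a command that
        // configures and builds the vendored FFTS sources via CMake.
        return []
    }
}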
FFTS/Tests/FFTSTests/FFTSTests.swift (new file, +6)
@@ -0,0 +1,6 @@
import Testing
@testable import FFTS

@Test func example() async throws {
    // Write your test here and use APIs like `#expect(...)` to check expected conditions.
}
Package.swift
@@ -19,12 +19,25 @@ let package = Package(
     targets: [
         // Targets are the basic building blocks of a package, defining a module or a test suite.
         // Targets can depend on other targets in this package and products from dependencies.
+        .target(
+            name: "PFFFT",
+            publicHeadersPath: "include",
+            cSettings: [
+                .define("PFFFT_SCALVEC_ENABLED", to: "1"),
+                .define("_USE_MATH_DEFINES"),
+                .define("NDEBUG"),
+                .unsafeFlags(["-O3", "-std=c99"]),
+                .unsafeFlags(["-mavx"], .when(platforms: [.linux, .macOS])),
+            ]
+        ),
         .target(
             name: "EcgSynKit",
             dependencies: [
+                .target(name: "PFFFT"),
                 .product(name: "Algorithms", package: "swift-algorithms"),
                 .product(name: "KissFFT", package: "kissfft")
-            ]),
+            ]
+        ),
         .testTarget(
             name: "EcgSynKitTests",
             dependencies: ["EcgSynKit"]
@@ -1,6 +1,6 @@
 import Algorithms
 import Foundation
-import KissFFT
+import PFFFT

 struct Parameters {
     /// The number of beats to simulate.
@@ -52,52 +52,43 @@ struct Parameters {
     let fhistd = 0.01
 }

-/// Compute fft or inverse (based on sign) of vector data, where [i] is real and [i+1] is imaginary, i starts
-/// from 0.
-func fft(data: inout [Double], isign: Int) {
-    let isign = Double(isign)
-    let n = data.count
-
-    var j = 0
-    for i in stride(from: 0, to: n, by: 2) {
-        if j > i {
-            data.swapAt(j, i)
-            data.swapAt(j + 1, i + 1)
-        }
-        var m = n / 2
-        while m >= 2, j >= m {
-            j -= m
-            m /= 2
-        }
-        j += m
-    }
-
-    var mmax = 2
-    while mmax < n {
-        let istep = 2 * mmax
-        let theta = isign * (2.0 * .pi / Double(mmax))
-        let wtemp = sin(0.5 * theta)
-        let wpr = -2.0 * wtemp * wtemp
-        let wpi = sin(theta)
-        var wr = 1.0
-        var wi = 0.0
-
-        for m in stride(from: 0, to: mmax, by: 2) {
-            for i in stride(from: m, to: n, by: istep) {
-                let j = i + mmax
-                let tempr = wr * data[j] - wi * data[j + 1]
-                let tempi = wr * data[j + 1] + wi * data[j]
-                data[j] = data[i] - tempr
-                data[j + 1] = data[i + 1] - tempi
-                data[i] += tempr
-                data[i + 1] += tempi
-            }
-            let wrtemp = wr
-            wr = wr * wpr - wi * wpi + wr
-            wi = wi * wpr + wrtemp * wpi + wi
-        }
-        mmax = istep
-    }
-}
+class PfftSetupCache: @unchecked Sendable {
+    private var cache: [Int: OpaquePointer?] = [:]
+
+    private let queue = DispatchQueue(label: String(describing: PfftSetupCache.self), attributes: .concurrent)
+
+    func get(for nrr: Int) -> OpaquePointer? {
+        var setup: OpaquePointer??
+        queue.sync {
+            setup = cache[nrr]
+        }
+        if setup == nil {
+            queue.sync(flags: .barrier) {
+                setup = cache[nrr]
+                if setup == nil {
+                    setup = pffftd_new_setup(Int32(nrr), PFFFT_REAL)
+                    cache[nrr] = setup
+                }
+            }
+        }
+        return setup!
+    }
+
+    deinit {
+        for (_, setup) in cache {
+            if setup != nil {
+                pffftd_destroy_setup(setup)
+            }
+        }
+    }
+
+    static let shared = PfftSetupCache()
+}
+
+func stdev(_ data: [Double]) -> Double {
+    let n = Double(data.count)
+    let mean = data.reduce(0.0, +) / n
+    return sqrt(data.lazy.map { ($0 - mean) * ($0 - mean) }.reduce(0.0, +) / (n - 1))
+}

 func rrprocess(params: Parameters, nrr: Int) -> [Double] {
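A note on the cache above: `get(for:)` is the double-checked locking pattern on a concurrent `DispatchQueue` — plain `sync` for the fast read path, `sync(flags: .barrier)` for the exclusive write path — so one `PFFFTD_Setup` is built per transform length and then shared (the setup is read-only per the PFFFT headers below). A trivial, hypothetical usage sketch:

// Reuse one setup per length; 4096 = 2^12 satisfies PFFFT's N = 2^a * 3^b * 5^c, a >= 5 rule.
let setup = PfftSetupCache.shared.get(for: 4096)
// ... hand `setup` to pffftd_transform_ordered(...), as rrprocess does below ...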
@@ -115,28 +106,35 @@ func rrprocess(params: Parameters, nrr: Int) -> [Double] {
     let sf = Double(params.sfInternal)
     let df = sf / Double(nrr)

-    var swc = (0 ..< nrr / 2 + 1).map {
-        let w = df * Double($0) * 2.0 * .pi
-        let dw1 = w - w1
-        let dw2 = w - w2
-        let hw = sig1 * exp(-dw1 * dw1 / (2.0 * c1 * c1)) / sqrt(2.0 * .pi * c1 * c1)
-            + sig2 * exp(-dw2 * dw2 / (2.0 * c2 * c2)) / sqrt(2.0 * .pi * c2 * c2)
-
-        let sw = (sf / 2.0) * sqrt(hw)
-        let ph = 2.0 * .pi * Double.random(in: 0.0 ..< 1.0)
-        return kiss_fft_cpx(r: sw * cos(ph), i: sw * sin(ph))
-    }
-    swc[0].i = 0.0
-    swc[nrr / 2].i = 0.0
-
-    let fft = kiss_fftr_alloc(Int32(nrr), 1, nil, nil)
-
-    let outptr = UnsafeMutablePointer<Double>.allocate(capacity: nrr)
-    kiss_fftri(fft, swc, outptr)
-
-    var rr = (0 ..< nrr).map { outptr[$0] * (1.0 / Double(nrr)) }
-    outptr.deallocate()
+    let fft = PfftSetupCache.shared.get(for: nrr)
+
+    var rr = withUnsafeTemporaryAllocation(byteCount: (nrr + 2) * MemoryLayout<Double>.stride, alignment: 32) {
+        let swc = $0.bindMemory(to: Double.self)
+
+        for i in 0 ..< nrr / 2 + 1 {
+            let w = df * Double(i) * 2.0 * .pi
+            let dw1 = w - w1
+            let dw2 = w - w2
+            let hw = sig1 * exp(-dw1 * dw1 / (2.0 * c1 * c1)) / sqrt(2.0 * .pi * c1 * c1)
+                + sig2 * exp(-dw2 * dw2 / (2.0 * c2 * c2)) / sqrt(2.0 * .pi * c2 * c2)
+
+            let sw = (sf / 2.0) * sqrt(hw)
+            let ph = 2.0 * .pi * Double.random(in: 0.0 ..< 1.0)
+
+            swc[i * 2] = sw * cos(ph)
+            swc[i * 2 + 1] = sw * sin(ph)
+        }
+
+        // pack Nyquist frequency real to imaginary of DC
+        swc[1] = swc[nrr]
+
+        return withUnsafeTemporaryAllocation(byteCount: nrr * MemoryLayout<Double>.stride, alignment: 32) {
+            let outptr = $0.bindMemory(to: Double.self)
+
+            pffftd_transform_ordered(fft, swc.baseAddress, outptr.baseAddress, nil, PFFFT_BACKWARD)
+            return outptr.map { $0 * 1.0 / Double(nrr) }
+        }
+    }

     let xstd = stdev(rr)
     let ratio = rrstd / xstd
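The `$0 * 1.0 / Double(nrr)` factor in the new code exists because PFFFT transforms are unscaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x, per the headers added below. A small sketch that checks that identity through the same C API; the function itself is illustrative, not part of the commit:

import Foundation
import PFFFT

// Sketch: verify backward(forward(x)) == N * x for a real transform of length n.
func pffftRoundTripHolds(n: Int) -> Bool {
    let setup = pffftd_new_setup(Int32(n), PFFFT_REAL)
    defer { pffftd_destroy_setup(setup) }

    let x = (0 ..< n).map { _ in Double.random(in: -1 ... 1) }
    return withUnsafeTemporaryAllocation(byteCount: n * MemoryLayout<Double>.stride, alignment: 32) {
        let buf = $0.bindMemory(to: Double.self)
        _ = buf.initialize(from: x)
        // Input and output may alias, so transform in place.
        pffftd_transform_ordered(setup, buf.baseAddress, buf.baseAddress, nil, PFFFT_FORWARD)
        pffftd_transform_ordered(setup, buf.baseAddress, buf.baseAddress, nil, PFFFT_BACKWARD)
        // Expect buf == n * x, up to rounding.
        return zip(buf, x).allSatisfy { abs($0 - Double(n) * $1) < 1e-9 * Double(n) }
    }
}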
@@ -144,43 +142,37 @@ func rrprocess(params: Parameters, nrr: Int) -> [Double] {
     for i in 0 ..< nrr {
         rr[i] = rr[i] * ratio + rrmean
     }

     return rr
 }

-func stdev(_ data: [Double]) -> Double {
-    let n = Double(data.count)
-    let mean = data.reduce(0.0, +) / n
-    return sqrt(data.lazy.map { ($0 - mean) * ($0 - mean) }.reduce(0.0, +) / (n - 1))
-}
-
-func compute(params: Parameters) {
-    // adjust extrema parameters for mean heart rate
-    let hrFact = sqrt(params.hrMean / 60)
-    let hrFact2 = sqrt(hrFact)
+// func compute(params: Parameters) {
+//     // adjust extrema parameters for mean heart rate
+//     let hrFact = sqrt(params.hrMean / 60)
+//     let hrFact2 = sqrt(hrFact)

-    let bi = params.b.map { $0 * hrFact }
+//     let bi = params.b.map { $0 * hrFact }

-    /// XXX: Discrepancy here between Java/C and Matlab, the former uses 1.0 for ti[4] adjustment.
-    let ti = zip([hrFact2, hrFact, 1, hrFact, hrFact2], params.theta).map(*)
+//     /// XXX: Discrepancy here between Java/C and Matlab, the former uses 1.0 for ti[4] adjustment.
+//     let ti = zip([hrFact2, hrFact, 1, hrFact, hrFact2], params.theta).map(*)

-    let ai = params.a
+//     let ai = params.a

-    let x0 = SIMD3<Double>(1.0, 0.0, 0.04) // XXX: Convert to init from vector3d
-    let rseed = params.seed
+//     let x0 = SIMD3<Double>(1.0, 0.0, 0.04) // XXX: Convert to init from vector3d
+//     let rseed = params.seed

-    // calculate time scales
-    let h = 1.0 / Double(params.sfInternal)
-    let tstep = 1.0 / Double(params.sfEcg)
+//     // calculate time scales
+//     let h = 1.0 / Double(params.sfInternal)
+//     let tstep = 1.0 / Double(params.sfEcg)

-    // calculate length of the RR time series
-    let rrmean = (60.0 / params.hrMean)
+//     // calculate length of the RR time series
+//     let rrmean = (60.0 / params.hrMean)

-    let numRr = Int(pow(2.0, ceil(log2(Double(params.numBeats * params.sfInternal) * rrmean))))
+//     let numRr = Int(pow(2.0, ceil(log2(Double(params.numBeats * params.sfInternal) * rrmean))))

-    var rr = [Double](repeating: 0.0, count: numRr)
+//     var rr = [Double](repeating: 0.0, count: numRr)

-    // TODO: check sfInternal is integer multple of sfEcg
+//     // TODO: check sfInternal is integer multple of sfEcg

-    // define frequency parameters for rr process
-}
+//     // define frequency parameters for rr process
+// }
Sources/PFFFT/include/pffft_double.h (new file, +236)
@@ -0,0 +1,236 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Based on original fortran 77 code from FFTPACKv4 from NETLIB,
   authored by Dr Paul Swarztrauber of NCAR, in 1985.

   As confirmed by the NCAR fftpack software curators, the following
   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
   released under the same terms.

   FFTPACK license:

   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html

   Copyright (c) 2004 the University Corporation for Atmospheric
   Research ("UCAR"). All rights reserved. Developed by NCAR's
   Computational and Information Systems Laboratory, UCAR,
   www.cisl.ucar.edu.

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

/*
   NOTE: This file is adapted from Julien Pommier's original PFFFT,
   which works on 32 bit floating point precision using SSE instructions,
   to work with 64 bit floating point precision using AVX instructions.
   Author: Dario Mambro @ https://github.com/unevens/pffft
*/

/*
   PFFFT : a Pretty Fast FFT.

   This is basically an adaptation of the single precision fftpack
   (v4) as found on netlib taking advantage of SIMD instruction found
   on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).

   For architectures where no SIMD instruction is available, the code
   falls back to a scalar version.

   Restrictions:

   - 1D transforms only, with 64-bit double precision.

   - supports only transforms for inputs of length N of the form
   N=(2^a)*(3^b)*(5^c), a >= 5, b >= 0, c >= 0 (32, 48, 64, 96, 128,
   144, 160, etc are all acceptable lengths). Performance is best for
   128<=N<=8192.

   - all (double*) pointers in the functions below are expected to
   have an "simd-compatible" alignment, that is 32 bytes on x86 and
   powerpc CPUs.

   You can allocate such buffers with the functions
   pffft_aligned_malloc / pffft_aligned_free (or with stuff like
   posix_memalign..)
*/

#ifndef PFFFT_DOUBLE_H
#define PFFFT_DOUBLE_H

#include <stddef.h> /* for size_t */

#ifdef __cplusplus
extern "C" {
#endif

/* opaque struct holding internal stuff (precomputed twiddle factors)
   this struct can be shared by many threads as it contains only
   read-only data.
*/
typedef struct PFFFTD_Setup PFFFTD_Setup;

#ifndef PFFFT_COMMON_ENUMS
#define PFFFT_COMMON_ENUMS

/* direction of the transform */
typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;

/* type of transform */
typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;

#endif

/*
  prepare for performing transforms of size N -- the returned
  PFFFTD_Setup structure is read-only so it can safely be shared by
  multiple concurrent threads.
*/
PFFFTD_Setup *pffftd_new_setup(int N, pffft_transform_t transform);
void pffftd_destroy_setup(PFFFTD_Setup *);

/*
   Perform a Fourier transform. The z-domain data is stored in the
   most efficient order for transforming it back, or using it for
   convolution. If you need to have its content sorted in the
   "usual" way, that is as an array of interleaved complex numbers,
   either use pffft_transform_ordered, or call pffft_zreorder after
   the forward fft, and before the backward fft.

   Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
   Typically you will want to scale the backward transform by 1/N.

   The 'work' pointer should point to an area of N (2*N for complex
   fft) doubles, properly aligned. If 'work' is NULL, then stack will
   be used instead (this is probably the best strategy for small
   FFTs, say for N < 16384). Threads usually have a small stack, so
   there may not be a sufficient amount of memory, usually leading to
   a crash! Use the heap with pffft_aligned_malloc() in this case.

   input and output may alias.
*/
void pffftd_transform(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction);

/*
   Similar to pffft_transform, but makes sure that the output is
   ordered as expected (interleaved complex numbers). This is
   similar to calling pffft_transform and then pffft_zreorder.

   input and output may alias.
*/
void pffftd_transform_ordered(PFFFTD_Setup *setup, const double *input, double *output, double *work, pffft_direction_t direction);

/*
   call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
   PFFFT_FORWARD) if you want to have the frequency components in
   the correct "canonical" order, as interleaved complex numbers.

   (for real transforms, both 0-frequency and half frequency
   components, which are real, are assembled in the first entry as
   F(0)+i*F(n/2+1). Note that the original fftpack did place
   F(n/2+1) at the end of the arrays).

   input and output should not alias.
*/
void pffftd_zreorder(PFFFTD_Setup *setup, const double *input, double *output, pffft_direction_t direction);

/*
   Perform a multiplication of the frequency components of dft_a and
   dft_b and accumulate them into dft_ab. The arrays should have
   been obtained with pffft_transform(.., PFFFT_FORWARD) and should
   *not* have been reordered with pffft_zreorder (otherwise just
   perform the operation yourself as the dft coefs are stored as
   interleaved complex numbers).

   the operation performed is: dft_ab += (dft_a * fdt_b)*scaling

   The dft_a, dft_b and dft_ab pointers may alias.
*/
void pffftd_zconvolve_accumulate(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double *dft_ab, double scaling);

/*
   Perform a multiplication of the frequency components of dft_a and
   dft_b and put result in dft_ab. The arrays should have
   been obtained with pffft_transform(.., PFFFT_FORWARD) and should
   *not* have been reordered with pffft_zreorder (otherwise just
   perform the operation yourself as the dft coefs are stored as
   interleaved complex numbers).

   the operation performed is: dft_ab = (dft_a * fdt_b)*scaling

   The dft_a, dft_b and dft_ab pointers may alias.
*/
void pffftd_zconvolve_no_accu(PFFFTD_Setup *setup, const double *dft_a, const double *dft_b, double *dft_ab, double scaling);

/* return 4 or 1 wether support AVX instructions was enabled when building pffft-double.c */
int pffftd_simd_size();

/* return string identifier of used architecture (AVX/..) */
const char * pffftd_simd_arch();

/* simple helper to get minimum possible fft size */
int pffftd_min_fft_size(pffft_transform_t transform);

/* simple helper to determine size N is valid
   - factorizable to pffft_min_fft_size() with factors 2, 3, 5
*/
int pffftd_is_valid_size(int N, pffft_transform_t cplx);

/* determine nearest valid transform size (by brute-force testing)
   - factorizable to pffft_min_fft_size() with factors 2, 3, 5.
   higher: bool-flag to find nearest higher value; else lower.
*/
int pffftd_nearest_transform_size(int N, pffft_transform_t cplx, int higher);


/* following functions are identical to the pffft_ functions - both declared */

/* simple helper to determine next power of 2
   - without inexact/rounding floating point operations
*/
int pffftd_next_power_of_two(int N);
int pffft_next_power_of_two(int N);

/* simple helper to determine if power of 2 - returns bool */
int pffftd_is_power_of_two(int N);
int pffft_is_power_of_two(int N);

/*
   the double buffers must have the correct alignment (32-byte boundary
   on intel and powerpc). This function may be used to obtain such
   correctly aligned buffers.
*/
void *pffftd_aligned_malloc(size_t nb_bytes);
void *pffft_aligned_malloc(size_t nb_bytes);
void pffftd_aligned_free(void *);
void pffft_aligned_free(void *);

#ifdef __cplusplus
}
#endif

#endif /* PFFFT_DOUBLE_H */
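Given the length restriction above, callers can validate or round a requested size before building a setup. A hedged Swift sketch using only the helpers this header declares; the wrapper function itself is hypothetical:

// Clamp an arbitrary FFT length to something PFFFT accepts.
func usableFFTSize(for n: Int) -> Int {
    if pffftd_is_valid_size(Int32(n), PFFFT_REAL) != 0 { return n }
    // Round up to the nearest valid 2^a * 3^b * 5^c length (higher = 1).
    return Int(pffftd_nearest_transform_size(Int32(n), PFFFT_REAL, 1))
}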
Sources/PFFFT/pffft.h (new file, +241)
@@ -0,0 +1,241 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Based on original fortran 77 code from FFTPACKv4 from NETLIB,
   authored by Dr Paul Swarztrauber of NCAR, in 1985.

   As confirmed by the NCAR fftpack software curators, the following
   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
   released under the same terms.

   FFTPACK license:

   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html

   Copyright (c) 2004 the University Corporation for Atmospheric
   Research ("UCAR"). All rights reserved. Developed by NCAR's
   Computational and Information Systems Laboratory, UCAR,
   www.cisl.ucar.edu.

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

/*
   PFFFT : a Pretty Fast FFT.

   This is basically an adaptation of the single precision fftpack
   (v4) as found on netlib taking advantage of SIMD instruction found
   on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).

   For architectures where no SIMD instruction is available, the code
   falls back to a scalar version.

   Restrictions:

   - 1D transforms only, with 32-bit single precision.

   - supports only transforms for inputs of length N of the form
   N=(2^a)*(3^b)*(5^c), a >= 5, b >= 0, c >= 0 (32, 48, 64, 96, 128,
   144, 160, etc are all acceptable lengths). Performance is best for
   128<=N<=8192.

   - all (float*) pointers in the functions below are expected to
   have an "simd-compatible" alignment, that is 16 bytes on x86 and
   powerpc CPUs.

   You can allocate such buffers with the functions
   pffft_aligned_malloc / pffft_aligned_free (or with stuff like
   posix_memalign..)
*/

#ifndef PFFFT_H
#define PFFFT_H

#include <stddef.h> /* for size_t */

#ifdef __cplusplus
extern "C" {
#endif

/* opaque struct holding internal stuff (precomputed twiddle factors)
   this struct can be shared by many threads as it contains only
   read-only data.
*/
typedef struct PFFFT_Setup PFFFT_Setup;

#ifndef PFFFT_COMMON_ENUMS
#define PFFFT_COMMON_ENUMS

/* direction of the transform */
typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;

/* type of transform */
typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;

#endif

/*
  prepare for performing transforms of size N -- the returned
  PFFFT_Setup structure is read-only so it can safely be shared by
  multiple concurrent threads.
*/
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
void pffft_destroy_setup(PFFFT_Setup *);

/*
   Perform a Fourier transform. The z-domain data is stored in the
   most efficient order for transforming it back, or using it for
   convolution. If you need to have its content sorted in the
   "usual" way, that is as an array of interleaved complex numbers,
   either use pffft_transform_ordered, or call pffft_zreorder after
   the forward fft, and before the backward fft.

   Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
   Typically you will want to scale the backward transform by 1/N.

   The 'work' pointer should point to an area of N (2*N for complex
   fft) floats, properly aligned. If 'work' is NULL, then stack will
   be used instead (this is probably the best strategy for small
   FFTs, say for N < 16384). Threads usually have a small stack, so
   there may not be a sufficient amount of memory, usually leading to
   a crash! Use the heap with pffft_aligned_malloc() in this case.

   For a real forward transform (PFFFT_REAL | PFFFT_FORWARD) with real
   input with input(=transformation) length N, the output array is
   'mostly' complex:
     index k in 1 .. N/2 -1 corresponds to frequency k * Samplerate / N
     index k == 0 is a special case:
       the real() part contains the result for the DC frequency 0,
       the imag() part contains the result for the Nyquist frequency Samplerate/2
   both 0-frequency and half frequency components, which are real,
   are assembled in the first entry as F(0)+i*F(N/2).
   With the output size N/2 complex values (=N real/imag values), it is
   obvious that the results for negative frequencies are not output,
   because of symmetry.

   input and output may alias.
*/
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);

/*
   Similar to pffft_transform, but makes sure that the output is
   ordered as expected (interleaved complex numbers). This is
   similar to calling pffft_transform and then pffft_zreorder.

   input and output may alias.
*/
void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);

/*
   call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
   PFFFT_FORWARD) if you want to have the frequency components in
   the correct "canonical" order, as interleaved complex numbers.

   (for real transforms, both 0-frequency and half frequency
   components, which are real, are assembled in the first entry as
   F(0)+i*F(n/2+1). Note that the original fftpack did place
   F(n/2+1) at the end of the arrays).

   input and output should not alias.
*/
void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);

/*
   Perform a multiplication of the frequency components of dft_a and
   dft_b and accumulate them into dft_ab. The arrays should have
   been obtained with pffft_transform(.., PFFFT_FORWARD) and should
   *not* have been reordered with pffft_zreorder (otherwise just
   perform the operation yourself as the dft coefs are stored as
   interleaved complex numbers).

   the operation performed is: dft_ab += (dft_a * fdt_b)*scaling

   The dft_a, dft_b and dft_ab pointers may alias.
*/
void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);

/*
   Perform a multiplication of the frequency components of dft_a and
   dft_b and put result in dft_ab. The arrays should have
   been obtained with pffft_transform(.., PFFFT_FORWARD) and should
   *not* have been reordered with pffft_zreorder (otherwise just
   perform the operation yourself as the dft coefs are stored as
   interleaved complex numbers).

   the operation performed is: dft_ab = (dft_a * fdt_b)*scaling

   The dft_a, dft_b and dft_ab pointers may alias.
*/
void pffft_zconvolve_no_accu(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);

/* return 4 or 1 wether support SSE/NEON/Altivec instructions was enabled when building pffft.c */
int pffft_simd_size();

/* return string identifier of used architecture (SSE/NEON/Altivec/..) */
const char * pffft_simd_arch();


/* following functions are identical to the pffftd_ functions */

/* simple helper to get minimum possible fft size */
int pffft_min_fft_size(pffft_transform_t transform);

/* simple helper to determine next power of 2
   - without inexact/rounding floating point operations
*/
int pffft_next_power_of_two(int N);

/* simple helper to determine if power of 2 - returns bool */
int pffft_is_power_of_two(int N);

/* simple helper to determine size N is valid
   - factorizable to pffft_min_fft_size() with factors 2, 3, 5
   returns bool
*/
int pffft_is_valid_size(int N, pffft_transform_t cplx);

/* determine nearest valid transform size (by brute-force testing)
   - factorizable to pffft_min_fft_size() with factors 2, 3, 5.
   higher: bool-flag to find nearest higher value; else lower.
*/
int pffft_nearest_transform_size(int N, pffft_transform_t cplx, int higher);

/*
   the float buffers must have the correct alignment (16-byte boundary
   on intel and powerpc). This function may be used to obtain such
   correctly aligned buffers.
*/
void *pffft_aligned_malloc(size_t nb_bytes);
void pffft_aligned_free(void *);

#ifdef __cplusplus
}
#endif

#endif /* PFFFT_H */
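The packed first entry described above — F(0) and F(N/2) sharing one complex slot — is the same convention the rrprocess change exploits in reverse with `swc[1] = swc[nrr]`. A hedged sketch of unpacking an ordered real-forward spectrum; the helper is illustrative only:

// Read bin k of an ordered real forward transform of length n:
// out[0] = F(0) (DC, real), out[1] = F(n/2) (Nyquist, real),
// out[2k], out[2k+1] = re/im of F(k) for 0 < k < n/2.
func spectrumBin(_ out: [Double], _ k: Int, n: Int) -> (re: Double, im: Double) {
    if k == 0 { return (out[0], 0.0) }
    if k == n / 2 { return (out[1], 0.0) }
    return (out[2 * k], out[2 * k + 1])
}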
Sources/PFFFT/pffft_common.c (new file, +53)
@@ -0,0 +1,53 @@
#include "pffft.h"

#include <stdlib.h>

/* SSE and co like 16-bytes aligned pointers
 * with a 64-byte alignment, we are even aligned on L2 cache lines... */
#define MALLOC_V4SF_ALIGNMENT 64

static void * Valigned_malloc(size_t nb_bytes) {
  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
  if (!p0) return (void *) 0;
  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
  *((void **) p - 1) = p0;
  return p;
}

static void Valigned_free(void *p) {
  if (p) free(*((void **) p - 1));
}


static int next_power_of_two(int N) {
  /* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */
  /* compute the next highest power of 2 of 32-bit v */
  unsigned v = N;
  v--;
  v |= v >> 1;
  v |= v >> 2;
  v |= v >> 4;
  v |= v >> 8;
  v |= v >> 16;
  v++;
  return v;
}

static int is_power_of_two(int N) {
  /* https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2 */
  int f = N && !(N & (N - 1));
  return f;
}


void *pffft_aligned_malloc(size_t nb_bytes) { return Valigned_malloc(nb_bytes); }
void pffft_aligned_free(void *p) { Valigned_free(p); }
int pffft_next_power_of_two(int N) { return next_power_of_two(N); }
int pffft_is_power_of_two(int N) { return is_power_of_two(N); }

void *pffftd_aligned_malloc(size_t nb_bytes) { return Valigned_malloc(nb_bytes); }
void pffftd_aligned_free(void *p) { Valigned_free(p); }
int pffftd_next_power_of_two(int N) { return next_power_of_two(N); }
int pffftd_is_power_of_two(int N) { return is_power_of_two(N); }
Sources/PFFFT/pffft_double.c (new file, +147)
@@ -0,0 +1,147 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
   Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
   Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )

   Based on original fortran 77 code from FFTPACKv4 from NETLIB
   (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
   of NCAR, in 1985.

   As confirmed by the NCAR fftpack software curators, the following
   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
   released under the same terms.

   FFTPACK license:

   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html

   Copyright (c) 2004 the University Corporation for Atmospheric
   Research ("UCAR"). All rights reserved. Developed by NCAR's
   Computational and Information Systems Laboratory, UCAR,
   www.cisl.ucar.edu.

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.


   PFFFT : a Pretty Fast FFT.

   This file is largerly based on the original FFTPACK implementation, modified in
   order to take advantage of SIMD instructions of modern CPUs.
*/

/*
  NOTE: This file is adapted from Julien Pommier's original PFFFT,
  which works on 32 bit floating point precision using SSE instructions,
  to work with 64 bit floating point precision using AVX instructions.
  Author: Dario Mambro @ https://github.com/unevens/pffft
*/

#include "pffft_double.h"

/* detect compiler flavour */
#if defined(_MSC_VER)
#  define COMPILER_MSVC
#elif defined(__GNUC__)
#  define COMPILER_GCC
#endif

#ifdef COMPILER_MSVC
#  define _USE_MATH_DEFINES
#  include <malloc.h>
#elif defined(__MINGW32__) || defined(__MINGW64__)
#  include <malloc.h>
#else
#  include <alloca.h>
#endif

#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <math.h>
#include <assert.h>

#if defined(COMPILER_GCC)
#  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
#  define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
#  define RESTRICT __restrict
#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
#elif defined(COMPILER_MSVC)
#  define ALWAYS_INLINE(return_type) __forceinline return_type
#  define NEVER_INLINE(return_type) __declspec(noinline) return_type
#  define RESTRICT __restrict
#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
#endif


#ifdef COMPILER_MSVC
#pragma warning( disable : 4244 4305 4204 4456 )
#endif

/*
  vector support macros: the rest of the code is independant of
  AVX -- adding support for other platforms with 4-element
  vectors should be limited to these macros
*/
#include "simd/pf_double.h"

/* have code comparable with this definition */
#define float double
#define SETUP_STRUCT PFFFTD_Setup
#define FUNC_NEW_SETUP pffftd_new_setup
#define FUNC_DESTROY pffftd_destroy_setup
#define FUNC_TRANSFORM_UNORDRD pffftd_transform
#define FUNC_TRANSFORM_ORDERED pffftd_transform_ordered
#define FUNC_ZREORDER pffftd_zreorder
#define FUNC_ZCONVOLVE_ACCUMULATE pffftd_zconvolve_accumulate
#define FUNC_ZCONVOLVE_NO_ACCU pffftd_zconvolve_no_accu

#define FUNC_ALIGNED_MALLOC pffftd_aligned_malloc
#define FUNC_ALIGNED_FREE pffftd_aligned_free
#define FUNC_SIMD_SIZE pffftd_simd_size
#define FUNC_MIN_FFT_SIZE pffftd_min_fft_size
#define FUNC_IS_VALID_SIZE pffftd_is_valid_size
#define FUNC_NEAREST_SIZE pffftd_nearest_transform_size
#define FUNC_SIMD_ARCH pffftd_simd_arch
#define FUNC_VALIDATE_SIMD_A validate_pffftd_simd
#define FUNC_VALIDATE_SIMD_EX validate_pffftd_simd_ex

#define FUNC_CPLX_FINALIZE pffftd_cplx_finalize
#define FUNC_CPLX_PREPROCESS pffftd_cplx_preprocess
#define FUNC_REAL_PREPROCESS_4X4 pffftd_real_preprocess_4x4
#define FUNC_REAL_PREPROCESS pffftd_real_preprocess
#define FUNC_REAL_FINALIZE_4X4 pffftd_real_finalize_4x4
#define FUNC_REAL_FINALIZE pffftd_real_finalize
#define FUNC_TRANSFORM_INTERNAL pffftd_transform_internal

#define FUNC_COS cos
#define FUNC_SIN sin


#include "pffft_priv_impl.h"
Sources/PFFFT/pffft_priv_impl.h (new file, +2231)
File diff suppressed because it is too large.
Sources/PFFFT/simd/pf_altivec_float.h (new file, +81)
@@ -0,0 +1,81 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_ALTIVEC_FLT_H
#define PF_ALTIVEC_FLT_H

/*
  Altivec support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
#pragma message( __FILE__ ": ALTIVEC float macros are defined" )
typedef vector float v4sf;

#  define SIMD_SZ 4

typedef union v4sf_union {
  v4sf v;
  float f[SIMD_SZ];
} v4sf_union;

#  define VREQUIRES_ALIGN 1 /* not sure, if really required */
#  define VARCH "ALTIVEC"
#  define VZERO() ((vector float) vec_splat_u8(0))
#  define VMUL(a,b) vec_madd(a,b, VZERO())
#  define VADD(a,b) vec_add(a,b)
#  define VMADD(a,b,c) vec_madd(a,b,c)
#  define VSUB(a,b) vec_sub(a,b)
inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
#  define LD_PS1(p) ld_ps1(&p)
#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
#  define UNINTERLEAVE2(in1, in2, out1, out2) { \
    vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
    vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
  }
#  define VTRANSPOSE4(x0,x1,x2,x3) { \
    v4sf y0 = vec_mergeh(x0, x2); \
    v4sf y1 = vec_mergel(x0, x2); \
    v4sf y2 = vec_mergeh(x1, x3); \
    v4sf y3 = vec_mergel(x1, x3); \
    x0 = vec_mergeh(y0, y2); \
    x1 = vec_mergel(y0, y2); \
    x2 = vec_mergeh(y1, y3); \
    x3 = vec_mergel(y1, y3); \
  }
#  define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

#endif

#endif /* PF_SSE1_FLT_H */
Sources/PFFFT/simd/pf_avx_double.h (new file, +145)
@@ -0,0 +1,145 @@
/*
  Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
*/

/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_AVX_DBL_H
#define PF_AVX_DBL_H

/*
  vector support macros: the rest of the code is independant of
  AVX -- adding support for other platforms with 4-element
  vectors should be limited to these macros
*/


/*
  AVX support macros
*/
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && defined(__AVX__)
#pragma message( __FILE__ ": AVX macros are defined" )

#include <immintrin.h>
typedef __m256d v4sf;

/* 4 doubles by simd vector */
#  define SIMD_SZ 4

typedef union v4sf_union {
  v4sf v;
  double f[SIMD_SZ];
} v4sf_union;

#  define VARCH "AVX"
#  define VREQUIRES_ALIGN 1
#  define VZERO() _mm256_setzero_pd()
#  define VMUL(a,b) _mm256_mul_pd(a,b)
#  define VADD(a,b) _mm256_add_pd(a,b)
#  define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
#  define VSUB(a,b) _mm256_sub_pd(a,b)
#  define LD_PS1(p) _mm256_set1_pd(p)
#  define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr)
#  define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr)

/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
   out1 = [ in1[0], in2[0], in1[1], in2[1] ]
   out2 = [ in1[2], in2[2], in1[3], in2[3] ]
*/
#  define INTERLEAVE2(in1, in2, out1, out2) { \
    __m128d low1__ = _mm256_castpd256_pd128(in1); \
    __m128d low2__ = _mm256_castpd256_pd128(in2); \
    __m128d high1__ = _mm256_extractf128_pd(in1, 1); \
    __m128d high2__ = _mm256_extractf128_pd(in2, 1); \
    __m256d tmp__ = _mm256_insertf128_pd( \
        _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \
        _mm_shuffle_pd(low1__, low2__, 3), \
        1); \
    out2 = _mm256_insertf128_pd( \
        _mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \
        _mm_shuffle_pd(high1__, high2__, 3), \
        1); \
    out1 = tmp__; \
  }

/* UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
   out1 = [ in1[0], in1[2], in2[0], in2[2] ]
   out2 = [ in1[1], in1[3], in2[1], in2[3] ]
*/
#  define UNINTERLEAVE2(in1, in2, out1, out2) { \
    __m128d low1__ = _mm256_castpd256_pd128(in1); \
    __m128d low2__ = _mm256_castpd256_pd128(in2); \
    __m128d high1__ = _mm256_extractf128_pd(in1, 1); \
    __m128d high2__ = _mm256_extractf128_pd(in2, 1); \
    __m256d tmp__ = _mm256_insertf128_pd( \
        _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \
        _mm_shuffle_pd(low2__, high2__, 0), \
        1); \
    out2 = _mm256_insertf128_pd( \
        _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \
        _mm_shuffle_pd(low2__, high2__, 3), \
        1); \
    out1 = tmp__; \
  }

#  define VTRANSPOSE4(row0, row1, row2, row3) { \
    __m256d tmp3, tmp2, tmp1, tmp0; \
    \
    tmp0 = _mm256_shuffle_pd((row0),(row1), 0x0); \
    tmp2 = _mm256_shuffle_pd((row0),(row1), 0xF); \
    tmp1 = _mm256_shuffle_pd((row2),(row3), 0x0); \
    tmp3 = _mm256_shuffle_pd((row2),(row3), 0xF); \
    \
    (row0) = _mm256_permute2f128_pd(tmp0, tmp1, 0x20); \
    (row1) = _mm256_permute2f128_pd(tmp2, tmp3, 0x20); \
    (row2) = _mm256_permute2f128_pd(tmp0, tmp1, 0x31); \
    (row3) = _mm256_permute2f128_pd(tmp2, tmp3, 0x31); \
  }

/* VSWAPHL(a, b) pseudo code:
   return [ b[0], b[1], a[2], a[3] ]
*/
#  define VSWAPHL(a,b) \
    _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1)

/* reverse/flip all floats */
#  define VREV_S(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_permute_pd(_mm256_extractf128_pd(a, 1),1)), _mm_permute_pd(_mm256_castpd256_pd128(a), 1), 1)

/* reverse/flip complex floats */
#  define VREV_C(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1)

#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)

#endif

#endif /* PF_AVX_DBL_H */
Sources/PFFFT/simd/pf_double.h (new file, +84)
@@ -0,0 +1,84 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_DBL_H
#define PF_DBL_H

#include <assert.h>
#include <string.h>
#include <stdint.h>


/*
 * SIMD reference material:
 *
 * general SIMD introduction:
 * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
 *
 * SSE 1:
 * https://software.intel.com/sites/landingpage/IntrinsicsGuide/
 *
 * ARM NEON:
 * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
 *
 * Altivec:
 * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
 * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
 * better one?
 *
 */

typedef double vsfscalar;

#include "pf_avx_double.h"
#include "pf_sse2_double.h"
#include "pf_neon_double.h"

#ifndef SIMD_SZ
#  if !defined(PFFFT_SIMD_DISABLE)
#    pragma message( "building double with simd disabled !" )
#    define PFFFT_SIMD_DISABLE /* fallback to scalar code */
#  endif
#endif

#include "pf_scalar_double.h"

/* shortcuts for complex multiplcations */
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
#ifndef SVMUL
/* multiply a scalar with a vector */
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
#endif

#endif /* PF_DBL_H */
Sources/PFFFT/simd/pf_float.h (new file, +84)
@@ -0,0 +1,84 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_FLT_H
#define PF_FLT_H

#include <assert.h>
#include <string.h>
#include <stdint.h>


/*
 * SIMD reference material:
 *
 * general SIMD introduction:
 * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
 *
 * SSE 1:
 * https://software.intel.com/sites/landingpage/IntrinsicsGuide/
 *
 * ARM NEON:
 * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
 *
 * Altivec:
 * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
 * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
 * better one?
 *
 */

typedef float vsfscalar;

#include "pf_sse1_float.h"
#include "pf_neon_float.h"
#include "pf_altivec_float.h"

#ifndef SIMD_SZ
#  if !defined(PFFFT_SIMD_DISABLE)
#    pragma message( "building float with simd disabled !" )
#    define PFFFT_SIMD_DISABLE /* fallback to scalar code */
#  endif
#endif

#include "pf_scalar_float.h"

/* shortcuts for complex multiplcations */
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
#ifndef SVMUL
/* multiply a scalar with a vector */
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
#endif

#endif /* PF_FLT_H */
203
Sources/PFFFT/simd/pf_neon_double.h
Normal file
203
Sources/PFFFT/simd/pf_neon_double.h
Normal file
@@ -0,0 +1,203 @@
/*
  Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
*/

/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_NEON_DBL_H
#define PF_NEON_DBL_H

/*
  NEON 64bit support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__))

#pragma message (__FILE__ ": NEON (from AVX) macros are defined" )

#include "pf_neon_double_from_avx.h"
typedef __m256d v4sf;

/* 4 doubles by simd vector */
# define SIMD_SZ 4

typedef union v4sf_union {
  v4sf v;
  double f[SIMD_SZ];
} v4sf_union;

# define VARCH "NEON"
# define VREQUIRES_ALIGN 1
# define VZERO() _mm256_setzero_pd()
# define VMUL(a,b) _mm256_mul_pd(a,b)
# define VADD(a,b) _mm256_add_pd(a,b)
# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
# define VSUB(a,b) _mm256_sub_pd(a,b)
# define LD_PS1(p) _mm256_set1_pd(p)
# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr)
# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr)

FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b)
{
    __m256d res;
    res.vect_f64[0] = a.vect_f64[0];
    res.vect_f64[1] = b;
    return res;
}

FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
{
    float64x1_t al = vget_low_f64(a);
    float64x1_t bl = vget_low_f64(b);
    return vcombine_f64(al, bl);
}

FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b)
{
    float64x1_t ah = vget_high_f64(a);
    float64x1_t bh = vget_high_f64(b);
    return vcombine_f64(ah, bh);
}

FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b)
{
    __m256d res;
    res.vect_f64[0] = _mm_shuffle_pd_00(a.vect_f64[0],b.vect_f64[0]);
    res.vect_f64[1] = _mm_shuffle_pd_00(a.vect_f64[1],b.vect_f64[1]);
    return res;
}

FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b)
{
    __m256d res;
    res.vect_f64[0] = _mm_shuffle_pd_11(a.vect_f64[0],b.vect_f64[0]);
    res.vect_f64[1] = _mm_shuffle_pd_11(a.vect_f64[1],b.vect_f64[1]);
    return res;
}

FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b) {
    __m256d res;
    res.vect_f64[0] = a.vect_f64[0];
    res.vect_f64[1] = b.vect_f64[0];
    return res;
}

FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b)
{
    __m256d res;
    res.vect_f64[0] = a.vect_f64[1];
    res.vect_f64[1] = b.vect_f64[1];
    return res;
}

FORCE_INLINE __m256d _mm256_reverse(__m256d x)
{
    __m256d res;
    float64x2_t low = x.vect_f64[0];
    float64x2_t high = x.vect_f64[1];
    float64x1_t a = vget_low_f64(low);
    float64x1_t b = vget_high_f64(low);
    float64x1_t c = vget_low_f64(high);
    float64x1_t d = vget_high_f64(high);
    res.vect_f64[0] = vcombine_f64(d, c);
    res.vect_f64[1] = vcombine_f64(b, a);
    return res;
}

/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
   out1 = [ in1[0], in2[0], in1[1], in2[1] ]
   out2 = [ in1[2], in2[2], in1[3], in2[3] ]
*/
# define INTERLEAVE2(in1, in2, out1, out2) { \
    __m128d low1__ = _mm256_castpd256_pd128(in1); \
    __m128d low2__ = _mm256_castpd256_pd128(in2); \
    __m128d high1__ = _mm256_extractf128_pd(in1, 1); \
    __m128d high2__ = _mm256_extractf128_pd(in2, 1); \
    __m256d tmp__ = _mm256_insertf128_pd_1( \
        _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)), \
        _mm_shuffle_pd_11(low1__, low2__)); \
    out2 = _mm256_insertf128_pd_1( \
        _mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)), \
        _mm_shuffle_pd_11(high1__, high2__)); \
    out1 = tmp__; \
  }

/* UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
   out1 = [ in1[0], in1[2], in2[0], in2[2] ]
   out2 = [ in1[1], in1[3], in2[1], in2[3] ]
*/
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
    __m128d low1__ = _mm256_castpd256_pd128(in1); \
    __m128d low2__ = _mm256_castpd256_pd128(in2); \
    __m128d high1__ = _mm256_extractf128_pd(in1, 1); \
    __m128d high2__ = _mm256_extractf128_pd(in2, 1); \
    __m256d tmp__ = _mm256_insertf128_pd_1( \
        _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)), \
        _mm_shuffle_pd_00(low2__, high2__)); \
    out2 = _mm256_insertf128_pd_1( \
        _mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)), \
        _mm_shuffle_pd_11(low2__, high2__)); \
    out1 = tmp__; \
  }
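/* VTRANSPOSE4(row0, row1, row2, row3) pseudo code, in the style of the
   comments above (a 4x4 in-register transpose, derivable from the
   shuffle/permute helpers used below; right-hand sides are values on entry):
   row0 = [ row0[0], row1[0], row2[0], row3[0] ]
   row1 = [ row0[1], row1[1], row2[1], row3[1] ]
   row2 = [ row0[2], row1[2], row2[2], row3[2] ]
   row3 = [ row0[3], row1[3], row2[3], row3[3] ]
*/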
# define VTRANSPOSE4(row0, row1, row2, row3) { \
    __m256d tmp3, tmp2, tmp1, tmp0; \
    \
    tmp0 = _mm256_shuffle_pd_00((row0),(row1)); \
    tmp2 = _mm256_shuffle_pd_11((row0),(row1)); \
    tmp1 = _mm256_shuffle_pd_00((row2),(row3)); \
    tmp3 = _mm256_shuffle_pd_11((row2),(row3)); \
    \
    (row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1); \
    (row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3); \
    (row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1); \
    (row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3); \
  }

/* VSWAPHL(a, b) pseudo code:
   return [ b[0], b[1], a[2], a[3] ]
*/
# define VSWAPHL(a,b) \
    _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1))

/* reverse/flip all floats */
# define VREV_S(a) _mm256_reverse(a)

/* reverse/flip complex floats */
# define VREV_C(a) _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a))

# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)

#endif

#endif /* PF_NEON_DBL_H */
123
Sources/PFFFT/simd/pf_neon_double_from_avx.h
Normal file
@@ -0,0 +1,123 @@
/*
 * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// see https://github.com/kunpengcompute/AvxToNeon

#ifndef PF_NEON_DBL_FROM_AVX_H
#define PF_NEON_DBL_FROM_AVX_H
#include <arm_neon.h>

#if defined(__GNUC__) || defined(__clang__)

#pragma push_macro("FORCE_INLINE")
#define FORCE_INLINE static inline __attribute__((always_inline))

#else

#error "Macro name collisions may happen with unknown compiler"
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif

#define FORCE_INLINE static inline

#endif

typedef struct {
    float32x4_t vect_f32[2];
} __m256;

typedef struct {
    float64x2_t vect_f64[2];
} __m256d;

typedef float64x2_t __m128d;
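/* The structs above emulate 256-bit AVX registers as pairs of 128-bit NEON
   vectors, so each AVX-style helper below decomposes into two NEON
   operations, one per .vect_f64[] half. */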

FORCE_INLINE __m256d _mm256_setzero_pd(void)
{
    __m256d ret;
    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0);
    return ret;
}

FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
{
    __m256d res_m256d;
    res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]);
    res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]);
    return res_m256d;
}

FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
{
    __m256d res_m256d;
    res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]);
    res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]);
    return res_m256d;
}

FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
{
    __m256d res_m256d;
    res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]);
    res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]);
    return res_m256d;
}

FORCE_INLINE __m256d _mm256_set1_pd(double a)
{
    __m256d ret;
    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a);
    return ret;
}

FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr)
{
    __m256d res;
    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
    return res;
}

FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr)
{
    __m256d res;
    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
    return res;
}

FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
{
    return a.vect_f64[0];
}

FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
{
    assert(imm8 >= 0 && imm8 <= 1);
    return a.vect_f64[imm8];
}

FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
{
    __m256d res;
    res.vect_f64[0] = a;
    return res;
}

#endif /* PF_NEON_DBL_FROM_AVX_H */
87
Sources/PFFFT/simd/pf_neon_float.h
Normal file
@@ -0,0 +1,87 @@

/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_NEON_FLT_H
#define PF_NEON_FLT_H

/*
  ARM NEON support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
#pragma message( __FILE__ ": ARM NEON macros are defined" )

# include <arm_neon.h>
typedef float32x4_t v4sf;

# define SIMD_SZ 4

typedef union v4sf_union {
  v4sf v;
  float f[SIMD_SZ];
} v4sf_union;

# define VARCH "NEON"
# define VREQUIRES_ALIGN 0 /* usually no alignment required */
# define VZERO() vdupq_n_f32(0)
# define VMUL(a,b) vmulq_f32(a,b)
# define VADD(a,b) vaddq_f32(a,b)
# define VMADD(a,b,c) vmlaq_f32(c,a,b)
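/* note the argument order: vmlaq_f32(c,a,b) computes c + a*b, so
   VMADD(a,b,c) = a*b + c, matching the other implementations */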
# define VSUB(a,b) vsubq_f32(a,b)
# define LD_PS1(p) vld1q_dup_f32(&(p))
# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr)))
# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr)))
# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define VTRANSPOSE4(x0,x1,x2,x3) { \
    float32x4x2_t t0_ = vzipq_f32(x0, x2); \
    float32x4x2_t t1_ = vzipq_f32(x1, x3); \
    float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
    float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
    x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
  }
// marginally faster version
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))

/* reverse/flip all floats */
# define VREV_S(a) vcombine_f32(vrev64_f32(vget_high_f32(a)), vrev64_f32(vget_low_f32(a)))
/* reverse/flip complex floats */
# define VREV_C(a) vextq_f32(a, a, 2)

# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)

#else
/* #pragma message( __FILE__ ": ARM NEON macros are not defined" ) */
#endif

#endif /* PF_NEON_FLT_H */
185
Sources/PFFFT/simd/pf_scalar_double.h
Normal file
@@ -0,0 +1,185 @@

/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
   Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_SCAL_DBL_H
#define PF_SCAL_DBL_H

/*
  fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available: use scalar mode instead
*/

#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
#pragma message( __FILE__ ": double SCALAR4 macros are defined" )

typedef struct {
  vsfscalar a;
  vsfscalar b;
  vsfscalar c;
  vsfscalar d;
} v4sf;

# define SIMD_SZ 4

typedef union v4sf_union {
  v4sf v;
  vsfscalar f[SIMD_SZ];
} v4sf_union;
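/* The union lets callers address the four lanes by index; a minimal sketch
   (hypothetical usage, not part of this header):
     v4sf_union u;
     u.v = LD_PS1(2.0);         // broadcast: all four lanes become 2.0
     assert(u.f[0] == u.f[3]);  // f[] aliases the struct fields a..d in order
*/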

# define VARCH "4xScalar"
# define VREQUIRES_ALIGN 0

static ALWAYS_INLINE(v4sf) VZERO() {
  v4sf r = { 0.f, 0.f, 0.f, 0.f };
  return r;
}

static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) {
  v4sf r = { A.a * B.a, A.b * B.b, A.c * B.c, A.d * B.d };
  return r;
}

static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) {
  v4sf r = { A.a + B.a, A.b + B.b, A.c + B.c, A.d + B.d };
  return r;
}

static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) {
  v4sf r = { A.a * B.a + C.a, A.b * B.b + C.b, A.c * B.c + C.c, A.d * B.d + C.d };
  return r;
}

static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) {
  v4sf r = { A.a - B.a, A.b - B.b, A.c - B.c, A.d - B.d };
  return r;
}

static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) {
  v4sf r = { v, v, v, v };
  return r;
}

# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr)))

# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr)))

# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0)

/* INTERLEAVE2() */
#define INTERLEAVE2( A, B, C, D) \
  do { \
    v4sf Cr = { A.a, B.a, A.b, B.b }; \
    v4sf Dr = { A.c, B.c, A.d, B.d }; \
    C = Cr; \
    D = Dr; \
  } while (0)

/* UNINTERLEAVE2() */
#define UNINTERLEAVE2(A, B, C, D) \
  do { \
    v4sf Cr = { A.a, A.c, B.a, B.c }; \
    v4sf Dr = { A.b, A.d, B.b, B.d }; \
    C = Cr; \
    D = Dr; \
  } while (0)

/* VTRANSPOSE4() */
#define VTRANSPOSE4(A, B, C, D) \
  do { \
    v4sf Ar = { A.a, B.a, C.a, D.a }; \
    v4sf Br = { A.b, B.b, C.b, D.b }; \
    v4sf Cr = { A.c, B.c, C.c, D.c }; \
    v4sf Dr = { A.d, B.d, C.d, D.d }; \
    A = Ar; \
    B = Br; \
    C = Cr; \
    D = Dr; \
  } while (0)

/* VSWAPHL() */
static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) {
  v4sf r = { B.a, B.b, A.c, A.d };
  return r;
}

/* reverse/flip all floats */
static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) {
  v4sf r = { A.d, A.c, A.b, A.a };
  return r;
}

/* reverse/flip complex floats */
static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) {
  v4sf r = { A.c, A.d, A.a, A.b };
  return r;
}

#else
/* #pragma message( __FILE__ ": double SCALAR4 macros are not defined" ) */
#endif

#if !defined(SIMD_SZ)
#pragma message( __FILE__ ": double SCALAR1 macros are defined" )
typedef vsfscalar v4sf;

# define SIMD_SZ 1

typedef union v4sf_union {
  v4sf v;
  vsfscalar f[SIMD_SZ];
} v4sf_union;

# define VARCH "Scalar"
# define VREQUIRES_ALIGN 0
# define VZERO() 0.0
# define VMUL(a,b) ((a)*(b))
# define VADD(a,b) ((a)+(b))
# define VMADD(a,b,c) ((a)*(b)+(c))
# define VSUB(a,b) ((a)-(b))
# define LD_PS1(p) (p)
# define VLOAD_UNALIGNED(ptr) (*(ptr))
# define VLOAD_ALIGNED(ptr) (*(ptr))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)

#else
/* #pragma message( __FILE__ ": double SCALAR1 macros are not defined" ) */
#endif

#endif /* PF_SCAL_DBL_H */
185
Sources/PFFFT/simd/pf_scalar_float.h
Normal file
@@ -0,0 +1,185 @@

/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
   Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_SCAL_FLT_H
#define PF_SCAL_FLT_H

/*
  fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available: use scalar mode instead
*/

#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
#pragma message( __FILE__ ": float SCALAR4 macros are defined" )

typedef struct {
  vsfscalar a;
  vsfscalar b;
  vsfscalar c;
  vsfscalar d;
} v4sf;

# define SIMD_SZ 4

typedef union v4sf_union {
  v4sf v;
  vsfscalar f[SIMD_SZ];
} v4sf_union;

# define VARCH "4xScalar"
# define VREQUIRES_ALIGN 0

static ALWAYS_INLINE(v4sf) VZERO() {
  v4sf r = { 0.f, 0.f, 0.f, 0.f };
  return r;
}

static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) {
  v4sf r = { A.a * B.a, A.b * B.b, A.c * B.c, A.d * B.d };
  return r;
}

static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) {
  v4sf r = { A.a + B.a, A.b + B.b, A.c + B.c, A.d + B.d };
  return r;
}

static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) {
  v4sf r = { A.a * B.a + C.a, A.b * B.b + C.b, A.c * B.c + C.c, A.d * B.d + C.d };
  return r;
}

static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) {
  v4sf r = { A.a - B.a, A.b - B.b, A.c - B.c, A.d - B.d };
  return r;
}

static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) {
  v4sf r = { v, v, v, v };
  return r;
}

# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr)))

# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr)))

# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0)

/* INTERLEAVE2() */
#define INTERLEAVE2( A, B, C, D) \
  do { \
    v4sf Cr = { A.a, B.a, A.b, B.b }; \
    v4sf Dr = { A.c, B.c, A.d, B.d }; \
    C = Cr; \
    D = Dr; \
  } while (0)

/* UNINTERLEAVE2() */
#define UNINTERLEAVE2(A, B, C, D) \
  do { \
    v4sf Cr = { A.a, A.c, B.a, B.c }; \
    v4sf Dr = { A.b, A.d, B.b, B.d }; \
    C = Cr; \
    D = Dr; \
  } while (0)

/* VTRANSPOSE4() */
#define VTRANSPOSE4(A, B, C, D) \
  do { \
    v4sf Ar = { A.a, B.a, C.a, D.a }; \
    v4sf Br = { A.b, B.b, C.b, D.b }; \
    v4sf Cr = { A.c, B.c, C.c, D.c }; \
    v4sf Dr = { A.d, B.d, C.d, D.d }; \
    A = Ar; \
    B = Br; \
    C = Cr; \
    D = Dr; \
  } while (0)

/* VSWAPHL() */
static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) {
  v4sf r = { B.a, B.b, A.c, A.d };
  return r;
}

/* reverse/flip all floats */
static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) {
  v4sf r = { A.d, A.c, A.b, A.a };
  return r;
}

/* reverse/flip complex floats */
static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) {
  v4sf r = { A.c, A.d, A.a, A.b };
  return r;
}

#else
/* #pragma message( __FILE__ ": float SCALAR4 macros are not defined" ) */
#endif

#if !defined(SIMD_SZ)
#pragma message( __FILE__ ": float SCALAR1 macros are defined" )
typedef vsfscalar v4sf;

# define SIMD_SZ 1

typedef union v4sf_union {
  v4sf v;
  vsfscalar f[SIMD_SZ];
} v4sf_union;

# define VARCH "Scalar"
# define VREQUIRES_ALIGN 0
# define VZERO() 0.f
# define VMUL(a,b) ((a)*(b))
# define VADD(a,b) ((a)+(b))
# define VMADD(a,b,c) ((a)*(b)+(c))
# define VSUB(a,b) ((a)-(b))
# define LD_PS1(p) (p)
# define VLOAD_UNALIGNED(ptr) (*(ptr))
# define VLOAD_ALIGNED(ptr) (*(ptr))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)

#else
/* #pragma message( __FILE__ ": float SCALAR1 macros are not defined" ) */
#endif

#endif /* PF_SCAL_FLT_H */
82
Sources/PFFFT/simd/pf_sse1_float.h
Normal file
@@ -0,0 +1,82 @@

/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_SSE1_FLT_H
#define PF_SSE1_FLT_H

/*
  SSE1 support macros
*/
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86))
#pragma message( __FILE__ ": SSE1 float macros are defined" )

#include <xmmintrin.h>
typedef __m128 v4sf;

/* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions
 * anyway, so you will have some work to do if you want to enable AVX with its 256-bit vectors. */
# define SIMD_SZ 4

typedef union v4sf_union {
  v4sf v;
  float f[SIMD_SZ];
} v4sf_union;

# define VARCH "SSE1"
# define VREQUIRES_ALIGN 1
# define VZERO() _mm_setzero_ps()
# define VMUL(a,b) _mm_mul_ps(a,b)
# define VADD(a,b) _mm_add_ps(a,b)
# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
# define VSUB(a,b) _mm_sub_ps(a,b)
# define LD_PS1(p) _mm_set1_ps(p)
# define VLOAD_UNALIGNED(ptr) _mm_loadu_ps(ptr)
# define VLOAD_ALIGNED(ptr) _mm_load_ps(ptr)

# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
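/* i.e. VSWAPHL(a,b) = [ b[0], b[1], a[2], a[3] ]: _MM_SHUFFLE(3,2,1,0) takes
   the low half from b and the high half from a, matching the NEON version above */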

/* reverse/flip all floats */
# define VREV_S(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3))
/* reverse/flip complex floats */
# define VREV_C(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(1,0,3,2))

# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

#else
/* #pragma message( __FILE__ ": SSE1 float macros are not defined" ) */
#endif

#endif /* PF_SSE1_FLT_H */
281
Sources/PFFFT/simd/pf_sse2_double.h
Normal file
@@ -0,0 +1,281 @@
/*
  Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
*/

/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_SSE2_DBL_H
#define PF_SSE2_DBL_H

// detect SSE2 support under MSVC
#if defined ( _M_IX86_FP )
# if _M_IX86_FP == 2
#  if !defined(__SSE2__)
#   define __SSE2__
#  endif
# endif
#endif

/*
  SSE2 64bit support macros
*/
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined( __SSE4_2__ ) || defined( __SSE4_1__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || defined ( __x86_64__ ) || defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 ))
#pragma message (__FILE__ ": SSE2 double macros are defined" )

#include <emmintrin.h>

typedef struct {
  __m128d d128[2];
} m256d;

typedef m256d v4sf;
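/* As in the NEON double header, a 256-bit vector of 4 doubles is emulated
   here as a pair of 128-bit SSE2 registers, so each mm256_* helper below
   costs two SSE2 operations, one per d128[] half. */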

# define SIMD_SZ 4

typedef union v4sf_union {
  v4sf v;
  double f[SIMD_SZ];
} v4sf_union;

#if defined(__GNUC__) || defined(__clang__)

#pragma push_macro("FORCE_INLINE")
#define FORCE_INLINE static inline __attribute__((always_inline))

#elif defined (_MSC_VER)
#define FORCE_INLINE static __forceinline

#else
#error "Macro name collisions may happen with unknown compiler"
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif
#define FORCE_INLINE static inline
#endif

FORCE_INLINE m256d mm256_setzero_pd(void)
{
    m256d ret;
    ret.d128[0] = ret.d128[1] = _mm_setzero_pd();
    return ret;
}

FORCE_INLINE m256d mm256_mul_pd(m256d a, m256d b)
{
    m256d ret;
    ret.d128[0] = _mm_mul_pd(a.d128[0], b.d128[0]);
    ret.d128[1] = _mm_mul_pd(a.d128[1], b.d128[1]);
    return ret;
}

FORCE_INLINE m256d mm256_add_pd(m256d a, m256d b)
{
    m256d ret;
    ret.d128[0] = _mm_add_pd(a.d128[0], b.d128[0]);
    ret.d128[1] = _mm_add_pd(a.d128[1], b.d128[1]);
    return ret;
}

FORCE_INLINE m256d mm256_sub_pd(m256d a, m256d b)
{
    m256d ret;
    ret.d128[0] = _mm_sub_pd(a.d128[0], b.d128[0]);
    ret.d128[1] = _mm_sub_pd(a.d128[1], b.d128[1]);
    return ret;
}

FORCE_INLINE m256d mm256_set1_pd(double a)
{
    m256d ret;
    ret.d128[0] = ret.d128[1] = _mm_set1_pd(a);
    return ret;
}

FORCE_INLINE m256d mm256_load_pd (double const * mem_addr)
{
    m256d res;
    res.d128[0] = _mm_load_pd((const double *)mem_addr);
    res.d128[1] = _mm_load_pd((const double *)mem_addr + 2);
    return res;
}

FORCE_INLINE m256d mm256_loadu_pd (double const * mem_addr)
{
    m256d res;
    res.d128[0] = _mm_loadu_pd((const double *)mem_addr);
    res.d128[1] = _mm_loadu_pd((const double *)mem_addr + 2);
    return res;
}

# define VARCH "SSE2"
# define VREQUIRES_ALIGN 1
# define VZERO() mm256_setzero_pd()
# define VMUL(a,b) mm256_mul_pd(a,b)
# define VADD(a,b) mm256_add_pd(a,b)
# define VMADD(a,b,c) mm256_add_pd(mm256_mul_pd(a,b), c)
# define VSUB(a,b) mm256_sub_pd(a,b)
# define LD_PS1(p) mm256_set1_pd(p)
# define VLOAD_UNALIGNED(ptr) mm256_loadu_pd(ptr)
# define VLOAD_ALIGNED(ptr) mm256_load_pd(ptr)

FORCE_INLINE __m128d mm256_castpd256_pd128(m256d a)
{
    return a.d128[0];
}

FORCE_INLINE __m128d mm256_extractf128_pd (m256d a, const int imm8)
{
    assert(imm8 >= 0 && imm8 <= 1);
    return a.d128[imm8];
}

FORCE_INLINE m256d mm256_insertf128_pd_1(m256d a, __m128d b)
{
    m256d res;
    res.d128[0] = a.d128[0];
    res.d128[1] = b;
    return res;
}

FORCE_INLINE m256d mm256_castpd128_pd256(__m128d a)
{
    m256d res;
    res.d128[0] = a;
    return res;
}

FORCE_INLINE m256d mm256_shuffle_pd_00(m256d a, m256d b)
{
    m256d res;
    res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0],0);
    res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1],0);
    return res;
}

FORCE_INLINE m256d mm256_shuffle_pd_11(m256d a, m256d b)
{
    m256d res;
    res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0], 3);
    res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1], 3);
    return res;
}

FORCE_INLINE m256d mm256_permute2f128_pd_0x20(m256d a, m256d b) {
    m256d res;
    res.d128[0] = a.d128[0];
    res.d128[1] = b.d128[0];
    return res;
}

FORCE_INLINE m256d mm256_permute2f128_pd_0x31(m256d a, m256d b)
{
    m256d res;
    res.d128[0] = a.d128[1];
    res.d128[1] = b.d128[1];
    return res;
}

FORCE_INLINE m256d mm256_reverse(m256d x)
{
    m256d res;
    res.d128[0] = _mm_shuffle_pd(x.d128[1],x.d128[1],1);
    res.d128[1] = _mm_shuffle_pd(x.d128[0],x.d128[0],1);
    return res;
}

/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
   out1 = [ in1[0], in2[0], in1[1], in2[1] ]
   out2 = [ in1[2], in2[2], in1[3], in2[3] ]
*/
# define INTERLEAVE2(in1, in2, out1, out2) { \
    __m128d low1__ = mm256_castpd256_pd128(in1); \
    __m128d low2__ = mm256_castpd256_pd128(in2); \
    __m128d high1__ = mm256_extractf128_pd(in1, 1); \
    __m128d high2__ = mm256_extractf128_pd(in2, 1); \
    m256d tmp__ = mm256_insertf128_pd_1( \
        mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \
        _mm_shuffle_pd(low1__, low2__, 3)); \
    out2 = mm256_insertf128_pd_1( \
        mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \
        _mm_shuffle_pd(high1__, high2__, 3)); \
    out1 = tmp__; \
  }

/* UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
   out1 = [ in1[0], in1[2], in2[0], in2[2] ]
   out2 = [ in1[1], in1[3], in2[1], in2[3] ]
*/
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
    __m128d low1__ = mm256_castpd256_pd128(in1); \
    __m128d low2__ = mm256_castpd256_pd128(in2); \
    __m128d high1__ = mm256_extractf128_pd(in1, 1); \
    __m128d high2__ = mm256_extractf128_pd(in2, 1); \
    m256d tmp__ = mm256_insertf128_pd_1( \
        mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \
        _mm_shuffle_pd(low2__, high2__, 0)); \
    out2 = mm256_insertf128_pd_1( \
        mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \
        _mm_shuffle_pd(low2__, high2__, 3)); \
    out1 = tmp__; \
  }
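/* VTRANSPOSE4(row0, row1, row2, row3) pseudo code: the same 4x4 in-register
   transpose as in pf_neon_double.h above, e.g. row0 becomes
   [ row0[0], row1[0], row2[0], row3[0] ] (values on entry) */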
# define VTRANSPOSE4(row0, row1, row2, row3) { \
    m256d tmp3, tmp2, tmp1, tmp0; \
    \
    tmp0 = mm256_shuffle_pd_00((row0),(row1)); \
    tmp2 = mm256_shuffle_pd_11((row0),(row1)); \
    tmp1 = mm256_shuffle_pd_00((row2),(row3)); \
    tmp3 = mm256_shuffle_pd_11((row2),(row3)); \
    \
    (row0) = mm256_permute2f128_pd_0x20(tmp0, tmp1); \
    (row1) = mm256_permute2f128_pd_0x20(tmp2, tmp3); \
    (row2) = mm256_permute2f128_pd_0x31(tmp0, tmp1); \
    (row3) = mm256_permute2f128_pd_0x31(tmp2, tmp3); \
  }

/* VSWAPHL(a, b) pseudo code:
   return [ b[0], b[1], a[2], a[3] ]
*/
# define VSWAPHL(a,b) \
    mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_castpd256_pd128(b)), mm256_extractf128_pd(a, 1))

/* reverse/flip all floats */
# define VREV_S(a) mm256_reverse(a)

/* reverse/flip complex floats */
# define VREV_C(a) mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_extractf128_pd(a, 1)), mm256_castpd256_pd128(a))

# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)

#endif

#endif /* PF_SSE2_DBL_H */