add PFFFT target, FFTS sources
Sources/PFFFT/simd/pf_altivec_float.h (new file, 81 lines)
@@ -0,0 +1,81 @@

/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_ALTIVEC_FLT_H
#define PF_ALTIVEC_FLT_H

/*
  Altivec support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
#pragma message( __FILE__ ": ALTIVEC float macros are defined" )
typedef vector float v4sf;

# define SIMD_SZ 4

typedef union v4sf_union {
  v4sf v;
  float f[SIMD_SZ];
} v4sf_union;

# define VREQUIRES_ALIGN 1  /* not sure if really required */
# define VARCH "ALTIVEC"
# define VZERO() ((vector float) vec_splat_u8(0))
# define VMUL(a,b) vec_madd(a,b, VZERO())
# define VADD(a,b) vec_add(a,b)
# define VMADD(a,b,c) vec_madd(a,b,c)
# define VSUB(a,b) vec_sub(a,b)
inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
# define LD_PS1(p) ld_ps1(&p)
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
    vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
    vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
  }
# define VTRANSPOSE4(x0,x1,x2,x3) { \
    v4sf y0 = vec_mergeh(x0, x2); \
    v4sf y1 = vec_mergel(x0, x2); \
    v4sf y2 = vec_mergeh(x1, x3); \
    v4sf y3 = vec_mergel(x1, x3); \
    x0 = vec_mergeh(y0, y2); \
    x1 = vec_mergel(y0, y2); \
    x2 = vec_mergeh(y1, y3); \
    x3 = vec_mergel(y1, y3); \
  }
# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

#endif

#endif /* PF_ALTIVEC_FLT_H */
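Illustration (not part of the commit): the two-pass merge pattern behind VTRANSPOSE4 above can be hard to follow from the macro alone. Below is a minimal scalar C sketch of the same pattern; mergeh/mergel mimic vec_mergeh/vec_mergel, and transpose4_by_merges is a made-up name for this example.

/* mergeh(a,b) = [a0,b0,a1,b1], mergel(a,b) = [a2,b2,a3,b3] */
static void mergeh(const float a[4], const float b[4], float r[4]) { r[0]=a[0]; r[1]=b[0]; r[2]=a[1]; r[3]=b[1]; }
static void mergel(const float a[4], const float b[4], float r[4]) { r[0]=a[2]; r[1]=b[2]; r[2]=a[3]; r[3]=b[3]; }

static void transpose4_by_merges(float x0[4], float x1[4], float x2[4], float x3[4])
{
    float y0[4], y1[4], y2[4], y3[4];
    mergeh(x0, x2, y0);  mergel(x0, x2, y1);   /* pass 1: interleave rows 0/2 and 1/3 */
    mergeh(x1, x3, y2);  mergel(x1, x3, y3);
    mergeh(y0, y2, x0);  mergel(y0, y2, x1);   /* pass 2: x0/x1 become columns 0/1 */
    mergeh(y1, y3, x2);  mergel(y1, y3, x3);   /* x2/x3 become columns 2/3 */
}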
Sources/PFFFT/simd/pf_avx_double.h (new file, 145 lines)
@@ -0,0 +1,145 @@
/*
  Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
*/

/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_AVX_DBL_H
#define PF_AVX_DBL_H

/*
  vector support macros: the rest of the code is independent of
  AVX -- adding support for other platforms with 4-element
  vectors should be limited to these macros
*/

/*
  AVX support macros
*/
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && defined(__AVX__)
#pragma message( __FILE__ ": AVX macros are defined" )

#include <immintrin.h>
typedef __m256d v4sf;

/* 4 doubles by simd vector */
# define SIMD_SZ 4

typedef union v4sf_union {
  v4sf v;
  double f[SIMD_SZ];
} v4sf_union;

# define VARCH "AVX"
# define VREQUIRES_ALIGN 1
# define VZERO() _mm256_setzero_pd()
# define VMUL(a,b) _mm256_mul_pd(a,b)
# define VADD(a,b) _mm256_add_pd(a,b)
# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
# define VSUB(a,b) _mm256_sub_pd(a,b)
# define LD_PS1(p) _mm256_set1_pd(p)
# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr)
# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr)

/* INTERLEAVE2(in1, in2, out1, out2) pseudo code:
   out1 = [ in1[0], in2[0], in1[1], in2[1] ]
   out2 = [ in1[2], in2[2], in1[3], in2[3] ]
*/
# define INTERLEAVE2(in1, in2, out1, out2) { \
    __m128d low1__ = _mm256_castpd256_pd128(in1); \
    __m128d low2__ = _mm256_castpd256_pd128(in2); \
    __m128d high1__ = _mm256_extractf128_pd(in1, 1); \
    __m128d high2__ = _mm256_extractf128_pd(in2, 1); \
    __m256d tmp__ = _mm256_insertf128_pd( \
        _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \
        _mm_shuffle_pd(low1__, low2__, 3), \
        1); \
    out2 = _mm256_insertf128_pd( \
        _mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \
        _mm_shuffle_pd(high1__, high2__, 3), \
        1); \
    out1 = tmp__; \
  }

/* UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
   out1 = [ in1[0], in1[2], in2[0], in2[2] ]
   out2 = [ in1[1], in1[3], in2[1], in2[3] ]
*/
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
    __m128d low1__ = _mm256_castpd256_pd128(in1); \
    __m128d low2__ = _mm256_castpd256_pd128(in2); \
    __m128d high1__ = _mm256_extractf128_pd(in1, 1); \
    __m128d high2__ = _mm256_extractf128_pd(in2, 1); \
    __m256d tmp__ = _mm256_insertf128_pd( \
        _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \
        _mm_shuffle_pd(low2__, high2__, 0), \
        1); \
    out2 = _mm256_insertf128_pd( \
        _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \
        _mm_shuffle_pd(low2__, high2__, 3), \
        1); \
    out1 = tmp__; \
  }

# define VTRANSPOSE4(row0, row1, row2, row3) { \
    __m256d tmp3, tmp2, tmp1, tmp0; \
    \
    tmp0 = _mm256_shuffle_pd((row0),(row1), 0x0); \
    tmp2 = _mm256_shuffle_pd((row0),(row1), 0xF); \
    tmp1 = _mm256_shuffle_pd((row2),(row3), 0x0); \
    tmp3 = _mm256_shuffle_pd((row2),(row3), 0xF); \
    \
    (row0) = _mm256_permute2f128_pd(tmp0, tmp1, 0x20); \
    (row1) = _mm256_permute2f128_pd(tmp2, tmp3, 0x20); \
    (row2) = _mm256_permute2f128_pd(tmp0, tmp1, 0x31); \
    (row3) = _mm256_permute2f128_pd(tmp2, tmp3, 0x31); \
  }

/* VSWAPHL(a, b) pseudo code:
   return [ b[0], b[1], a[2], a[3] ]
*/
# define VSWAPHL(a,b) \
    _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1)

/* reverse/flip all floats */
# define VREV_S(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_permute_pd(_mm256_extractf128_pd(a, 1),1)), _mm_permute_pd(_mm256_castpd256_pd128(a), 1), 1)

/* reverse/flip complex floats */
# define VREV_C(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1)

# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)

#endif

#endif /* PF_AVX_DBL_H */
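Illustration (not part of the commit): the INTERLEAVE2/UNINTERLEAVE2 pseudo code documented above describes inverse operations. A small scalar C sketch of exactly that pseudo code, with made-up helper names, makes the round trip explicit:

#include <assert.h>

static void interleave2_ref(const double in1[4], const double in2[4], double out1[4], double out2[4])
{
    out1[0] = in1[0]; out1[1] = in2[0]; out1[2] = in1[1]; out1[3] = in2[1];
    out2[0] = in1[2]; out2[1] = in2[2]; out2[2] = in1[3]; out2[3] = in2[3];
}

static void uninterleave2_ref(const double in1[4], const double in2[4], double out1[4], double out2[4])
{
    out1[0] = in1[0]; out1[1] = in1[2]; out1[2] = in2[0]; out1[3] = in2[2];
    out2[0] = in1[1]; out2[1] = in1[3]; out2[2] = in2[1]; out2[3] = in2[3];
}

static void check_roundtrip(void)
{
    double re[4] = {0, 1, 2, 3}, im[4] = {10, 11, 12, 13};
    double a[4], b[4], re2[4], im2[4];
    interleave2_ref(re, im, a, b);       /* a,b now hold interleaved (re,im) pairs */
    uninterleave2_ref(a, b, re2, im2);   /* split back into separate planes */
    for (int i = 0; i < 4; ++i) assert(re2[i] == re[i] && im2[i] == im[i]);
}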
Sources/PFFFT/simd/pf_double.h (new file, 84 lines)
@@ -0,0 +1,84 @@

/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_DBL_H
#define PF_DBL_H

#include <assert.h>
#include <string.h>
#include <stdint.h>

/*
 * SIMD reference material:
 *
 * general SIMD introduction:
 * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
 *
 * SSE 1:
 * https://software.intel.com/sites/landingpage/IntrinsicsGuide/
 *
 * ARM NEON:
 * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
 *
 * Altivec:
 * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
 * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
 * better one?
 *
 */

typedef double vsfscalar;

#include "pf_avx_double.h"
#include "pf_sse2_double.h"
#include "pf_neon_double.h"

#ifndef SIMD_SZ
#  if !defined(PFFFT_SIMD_DISABLE)
#    pragma message( "building double with simd disabled !" )
#    define PFFFT_SIMD_DISABLE /* fallback to scalar code */
#  endif
#endif

#include "pf_scalar_double.h"

/* shortcuts for complex multiplications */
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
#ifndef SVMUL
/* multiply a scalar with a vector */
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
#endif

#endif /* PF_DBL_H */
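Illustration (not part of the commit): VCPLXMUL and VCPLXMULCONJ apply the usual complex product lane-wise. The per-lane algebra, written as a scalar C sketch that follows the macro's temporary-variable sequence (cplx_mul_ref is a made-up name):

/* (ar + i*ai) * (br + i*bi)     = (ar*br - ai*bi) + i*(ai*br + ar*bi)
   (ar + i*ai) * conj(br + i*bi) = (ar*br + ai*bi) + i*(ai*br - ar*bi) */
static void cplx_mul_ref(double *ar, double *ai, double br, double bi)
{
    double tmp = *ar * bi;        /* tmp = VMUL(ar, bi) */
    *ar = *ar * br - *ai * bi;    /* ar  = VSUB(VMUL(ar,br), VMUL(ai,bi)) */
    *ai = *ai * br + tmp;         /* ai  = VADD(VMUL(ai,br), tmp) */
}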
Sources/PFFFT/simd/pf_float.h (new file, 84 lines)
@@ -0,0 +1,84 @@

/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_FLT_H
#define PF_FLT_H

#include <assert.h>
#include <string.h>
#include <stdint.h>

/*
 * SIMD reference material:
 *
 * general SIMD introduction:
 * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
 *
 * SSE 1:
 * https://software.intel.com/sites/landingpage/IntrinsicsGuide/
 *
 * ARM NEON:
 * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
 *
 * Altivec:
 * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
 * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
 * better one?
 *
 */

typedef float vsfscalar;

#include "pf_sse1_float.h"
#include "pf_neon_float.h"
#include "pf_altivec_float.h"

#ifndef SIMD_SZ
#  if !defined(PFFFT_SIMD_DISABLE)
#    pragma message( "building float with simd disabled !" )
#    define PFFFT_SIMD_DISABLE /* fallback to scalar code */
#  endif
#endif

#include "pf_scalar_float.h"

/* shortcuts for complex multiplications */
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
#ifndef SVMUL
/* multiply a scalar with a vector */
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
#endif

#endif /* PF_FLT_H */
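Illustration (not part of the commit): because pf_float.h selects exactly one backend (SSE1, NEON, Altivec or the scalar fallback) and always defines v4sf, SIMD_SZ and the V* macros, consumer code can be written once against that interface. A minimal sketch under stated assumptions: scale_buf is a made-up helper, n is assumed to be a multiple of SIMD_SZ, buf is assumed to satisfy VALIGNED() when VREQUIRES_ALIGN is 1, and the chosen backend is assumed to define VLOAD_ALIGNED (the SSE1, NEON and scalar paths here do; the Altivec path does not).

static void scale_buf(float *buf, int n, float gain)
{
    v4sf g = LD_PS1(gain);                    /* splat the scalar into all lanes */
    for (int i = 0; i < n; i += SIMD_SZ) {
        v4sf_union u;
        u.v = VMUL(VLOAD_ALIGNED(buf + i), g);
        for (int k = 0; k < SIMD_SZ; ++k)     /* write back lane by lane via the union */
            buf[i + k] = u.f[k];
    }
}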
Sources/PFFFT/simd/pf_neon_double.h (new file, 203 lines)
@@ -0,0 +1,203 @@
/*
  Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
*/

/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_NEON_DBL_H
#define PF_NEON_DBL_H

/*
  NEON 64bit support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__))

#pragma message (__FILE__ ": NEON (from AVX) macros are defined" )

#include "pf_neon_double_from_avx.h"
typedef __m256d v4sf;

/* 4 doubles by simd vector */
# define SIMD_SZ 4

typedef union v4sf_union {
  v4sf v;
  double f[SIMD_SZ];
} v4sf_union;

# define VARCH "NEON"
# define VREQUIRES_ALIGN 1
# define VZERO() _mm256_setzero_pd()
# define VMUL(a,b) _mm256_mul_pd(a,b)
# define VADD(a,b) _mm256_add_pd(a,b)
# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
# define VSUB(a,b) _mm256_sub_pd(a,b)
# define LD_PS1(p) _mm256_set1_pd(p)
# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr)
# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr)

FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b)
{
    __m256d res;
    res.vect_f64[0] = a.vect_f64[0];
    res.vect_f64[1] = b;
    return res;
}

FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
{
    float64x1_t al = vget_low_f64(a);
    float64x1_t bl = vget_low_f64(b);
    return vcombine_f64(al, bl);
}

FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b)
{
    float64x1_t ah = vget_high_f64(a);
    float64x1_t bh = vget_high_f64(b);
    return vcombine_f64(ah, bh);
}

FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b)
{
    __m256d res;
    res.vect_f64[0] = _mm_shuffle_pd_00(a.vect_f64[0],b.vect_f64[0]);
    res.vect_f64[1] = _mm_shuffle_pd_00(a.vect_f64[1],b.vect_f64[1]);
    return res;
}

FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b)
{
    __m256d res;
    res.vect_f64[0] = _mm_shuffle_pd_11(a.vect_f64[0],b.vect_f64[0]);
    res.vect_f64[1] = _mm_shuffle_pd_11(a.vect_f64[1],b.vect_f64[1]);
    return res;
}

FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b) {
    __m256d res;
    res.vect_f64[0] = a.vect_f64[0];
    res.vect_f64[1] = b.vect_f64[0];
    return res;
}

FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b)
{
    __m256d res;
    res.vect_f64[0] = a.vect_f64[1];
    res.vect_f64[1] = b.vect_f64[1];
    return res;
}

FORCE_INLINE __m256d _mm256_reverse(__m256d x)
{
    __m256d res;
    float64x2_t low = x.vect_f64[0];
    float64x2_t high = x.vect_f64[1];
    float64x1_t a = vget_low_f64(low);
    float64x1_t b = vget_high_f64(low);
    float64x1_t c = vget_low_f64(high);
    float64x1_t d = vget_high_f64(high);
    res.vect_f64[0] = vcombine_f64(d, c);
    res.vect_f64[1] = vcombine_f64(b, a);
    return res;
}

/* INTERLEAVE2(in1, in2, out1, out2) pseudo code:
   out1 = [ in1[0], in2[0], in1[1], in2[1] ]
   out2 = [ in1[2], in2[2], in1[3], in2[3] ]
*/
# define INTERLEAVE2(in1, in2, out1, out2) { \
    __m128d low1__ = _mm256_castpd256_pd128(in1); \
    __m128d low2__ = _mm256_castpd256_pd128(in2); \
    __m128d high1__ = _mm256_extractf128_pd(in1, 1); \
    __m128d high2__ = _mm256_extractf128_pd(in2, 1); \
    __m256d tmp__ = _mm256_insertf128_pd_1( \
        _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)), \
        _mm_shuffle_pd_11(low1__, low2__)); \
    out2 = _mm256_insertf128_pd_1( \
        _mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)), \
        _mm_shuffle_pd_11(high1__, high2__)); \
    out1 = tmp__; \
  }

/* UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
   out1 = [ in1[0], in1[2], in2[0], in2[2] ]
   out2 = [ in1[1], in1[3], in2[1], in2[3] ]
*/
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
    __m128d low1__ = _mm256_castpd256_pd128(in1); \
    __m128d low2__ = _mm256_castpd256_pd128(in2); \
    __m128d high1__ = _mm256_extractf128_pd(in1, 1); \
    __m128d high2__ = _mm256_extractf128_pd(in2, 1); \
    __m256d tmp__ = _mm256_insertf128_pd_1( \
        _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)), \
        _mm_shuffle_pd_00(low2__, high2__)); \
    out2 = _mm256_insertf128_pd_1( \
        _mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)), \
        _mm_shuffle_pd_11(low2__, high2__)); \
    out1 = tmp__; \
  }

# define VTRANSPOSE4(row0, row1, row2, row3) { \
    __m256d tmp3, tmp2, tmp1, tmp0; \
    \
    tmp0 = _mm256_shuffle_pd_00((row0),(row1)); \
    tmp2 = _mm256_shuffle_pd_11((row0),(row1)); \
    tmp1 = _mm256_shuffle_pd_00((row2),(row3)); \
    tmp3 = _mm256_shuffle_pd_11((row2),(row3)); \
    \
    (row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1); \
    (row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3); \
    (row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1); \
    (row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3); \
  }

/* VSWAPHL(a, b) pseudo code:
   return [ b[0], b[1], a[2], a[3] ]
*/
# define VSWAPHL(a,b) \
    _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1))

/* reverse/flip all floats */
# define VREV_S(a) _mm256_reverse(a)

/* reverse/flip complex floats */
# define VREV_C(a) _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a))

# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)

#endif

#endif /* PF_NEON_DBL_H */
Sources/PFFFT/simd/pf_neon_double_from_avx.h (new file, 123 lines)
@@ -0,0 +1,123 @@
/*
 * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.

 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at

 *     http://www.apache.org/licenses/LICENSE-2.0

 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.

 */

//see https://github.com/kunpengcompute/AvxToNeon

#ifndef PF_NEON_DBL_FROM_AVX_H
#define PF_NEON_DBL_FROM_AVX_H
#include <arm_neon.h>

#if defined(__GNUC__) || defined(__clang__)

#pragma push_macro("FORCE_INLINE")
#define FORCE_INLINE static inline __attribute__((always_inline))

#else

#error "Macro name collisions may happen with unknown compiler"
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif

#define FORCE_INLINE static inline

#endif

typedef struct {
    float32x4_t vect_f32[2];
} __m256;

typedef struct {
    float64x2_t vect_f64[2];
} __m256d;

typedef float64x2_t __m128d;

FORCE_INLINE __m256d _mm256_setzero_pd(void)
{
    __m256d ret;
    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0);
    return ret;
}

FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
{
    __m256d res_m256d;
    res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]);
    res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]);
    return res_m256d;
}

FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
{
    __m256d res_m256d;
    res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]);
    res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]);
    return res_m256d;
}

FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
{
    __m256d res_m256d;
    res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]);
    res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]);
    return res_m256d;
}

FORCE_INLINE __m256d _mm256_set1_pd(double a)
{
    __m256d ret;
    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a);
    return ret;
}

FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr)
{
    __m256d res;
    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
    return res;
}
FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr)
{
    __m256d res;
    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
    return res;
}

FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
{
    return a.vect_f64[0];
}

FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
{
    assert(imm8 >= 0 && imm8 <= 1);
    return a.vect_f64[imm8];
}

FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
{
    __m256d res;
    res.vect_f64[0] = a;
    return res;
}

#endif /* PF_NEON_DBL_FROM_AVX_H */
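Illustration (not part of the commit): the shim above represents an AVX __m256d as two NEON float64x2_t halves, and each emulated _mm256_* helper simply applies the matching vNNNq_f64 operation to both halves. A short AArch64-only sketch of using it, assuming pf_neon_double_from_avx.h has been included (demo is a made-up name):

#if defined(__aarch64__) || defined(__arm64__)
#include <stdio.h>

static void demo(void)
{
    double a[4] = {1, 2, 3, 4}, b[4] = {10, 20, 30, 40}, r[4];
    __m256d va = _mm256_loadu_pd(a);
    __m256d vb = _mm256_loadu_pd(b);
    __m256d vr = _mm256_add_pd(va, vb);   /* two vaddq_f64 under the hood */
    vst1q_f64(r,     vr.vect_f64[0]);     /* store low half (lanes 0,1)  */
    vst1q_f64(r + 2, vr.vect_f64[1]);     /* store high half (lanes 2,3) */
    printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);   /* prints 11 22 33 44 */
}
#endif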
Sources/PFFFT/simd/pf_neon_float.h (new file, 87 lines)
@@ -0,0 +1,87 @@

/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_NEON_FLT_H
#define PF_NEON_FLT_H

/*
  ARM NEON support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
#pragma message( __FILE__ ": ARM NEON macros are defined" )

# include <arm_neon.h>
typedef float32x4_t v4sf;

# define SIMD_SZ 4

typedef union v4sf_union {
  v4sf v;
  float f[SIMD_SZ];
} v4sf_union;

# define VARCH "NEON"
# define VREQUIRES_ALIGN 0 /* usually no alignment required */
# define VZERO() vdupq_n_f32(0)
# define VMUL(a,b) vmulq_f32(a,b)
# define VADD(a,b) vaddq_f32(a,b)
# define VMADD(a,b,c) vmlaq_f32(c,a,b)
# define VSUB(a,b) vsubq_f32(a,b)
# define LD_PS1(p) vld1q_dup_f32(&(p))
# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr)))
# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr)))
# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define VTRANSPOSE4(x0,x1,x2,x3) { \
    float32x4x2_t t0_ = vzipq_f32(x0, x2); \
    float32x4x2_t t1_ = vzipq_f32(x1, x3); \
    float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
    float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
    x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
  }
// marginally faster version
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))

/* reverse/flip all floats */
# define VREV_S(a) vcombine_f32(vrev64_f32(vget_high_f32(a)), vrev64_f32(vget_low_f32(a)))
/* reverse/flip complex floats */
# define VREV_C(a) vextq_f32(a, a, 2)

# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)

#else
/* #pragma message( __FILE__ ": ARM NEON macros are not defined" ) */
#endif

#endif /* PF_NEON_FLT_H */
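Illustration (not part of the commit): on this NEON path VMADD maps to vmlaq_f32, a multiply-accumulate that some cores lower to separate multiply and add, matching the separate mul+add used on the SSE/AVX paths. A hedged AArch64-only alternative with a single-rounding fused multiply-add would look like the sketch below (vmadd_fused is a made-up name, not used by the library):

#if defined(__aarch64__)
#include <arm_neon.h>

static inline float32x4_t vmadd_fused(float32x4_t a, float32x4_t b, float32x4_t c)
{
    return vfmaq_f32(c, a, b);   /* fused c + a*b, single rounding */
}
#endif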
Sources/PFFFT/simd/pf_scalar_double.h (new file, 185 lines)
@@ -0,0 +1,185 @@

/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
   Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_SCAL_DBL_H
#define PF_SCAL_DBL_H

/*
  fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead
*/

#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
#pragma message( __FILE__ ": double SCALAR4 macros are defined" )

typedef struct {
  vsfscalar a;
  vsfscalar b;
  vsfscalar c;
  vsfscalar d;
} v4sf;

# define SIMD_SZ 4

typedef union v4sf_union {
  v4sf v;
  vsfscalar f[SIMD_SZ];
} v4sf_union;

# define VARCH "4xScalar"
# define VREQUIRES_ALIGN 0

static ALWAYS_INLINE(v4sf) VZERO() {
  v4sf r = { 0.f, 0.f, 0.f, 0.f };
  return r;
}

static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) {
  v4sf r = { A.a * B.a, A.b * B.b, A.c * B.c, A.d * B.d };
  return r;
}

static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) {
  v4sf r = { A.a + B.a, A.b + B.b, A.c + B.c, A.d + B.d };
  return r;
}

static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) {
  v4sf r = { A.a * B.a + C.a, A.b * B.b + C.b, A.c * B.c + C.c, A.d * B.d + C.d };
  return r;
}

static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) {
  v4sf r = { A.a - B.a, A.b - B.b, A.c - B.c, A.d - B.d };
  return r;
}

static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) {
  v4sf r = { v, v, v, v };
  return r;
}

# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr)))

# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr)))

# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0)

/* INTERLEAVE2() */
#define INTERLEAVE2( A, B, C, D) \
  do { \
    v4sf Cr = { A.a, B.a, A.b, B.b }; \
    v4sf Dr = { A.c, B.c, A.d, B.d }; \
    C = Cr; \
    D = Dr; \
  } while (0)

/* UNINTERLEAVE2() */
#define UNINTERLEAVE2(A, B, C, D) \
  do { \
    v4sf Cr = { A.a, A.c, B.a, B.c }; \
    v4sf Dr = { A.b, A.d, B.b, B.d }; \
    C = Cr; \
    D = Dr; \
  } while (0)

/* VTRANSPOSE4() */
#define VTRANSPOSE4(A, B, C, D) \
  do { \
    v4sf Ar = { A.a, B.a, C.a, D.a }; \
    v4sf Br = { A.b, B.b, C.b, D.b }; \
    v4sf Cr = { A.c, B.c, C.c, D.c }; \
    v4sf Dr = { A.d, B.d, C.d, D.d }; \
    A = Ar; \
    B = Br; \
    C = Cr; \
    D = Dr; \
  } while (0)

/* VSWAPHL() */
static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) {
  v4sf r = { B.a, B.b, A.c, A.d };
  return r;
}

/* reverse/flip all floats */
static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) {
  v4sf r = { A.d, A.c, A.b, A.a };
  return r;
}

/* reverse/flip complex floats */
static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) {
  v4sf r = { A.c, A.d, A.a, A.b };
  return r;
}

#else
/* #pragma message( __FILE__ ": double SCALAR4 macros are not defined" ) */
#endif

#if !defined(SIMD_SZ)
#pragma message( __FILE__ ": double SCALAR1 macros are defined" )
typedef vsfscalar v4sf;

# define SIMD_SZ 1

typedef union v4sf_union {
  v4sf v;
  vsfscalar f[SIMD_SZ];
} v4sf_union;

# define VARCH "Scalar"
# define VREQUIRES_ALIGN 0
# define VZERO() 0.0
# define VMUL(a,b) ((a)*(b))
# define VADD(a,b) ((a)+(b))
# define VMADD(a,b,c) ((a)*(b)+(c))
# define VSUB(a,b) ((a)-(b))
# define LD_PS1(p) (p)
# define VLOAD_UNALIGNED(ptr) (*(ptr))
# define VLOAD_ALIGNED(ptr) (*(ptr))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)

#else
/* #pragma message( __FILE__ ": double SCALAR1 macros are not defined" ) */
#endif

#endif /* PF_SCAL_DBL_H */
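Note (not part of the commit): the ALWAYS_INLINE(type) macro used by the scalar fallbacks above is not defined in this header; it is expected to come from the including translation unit. As an assumption, a typical definition would look like the sketch below; the project's actual definition may differ.

/* Assumed-only sketch of an ALWAYS_INLINE(return_type) definition; not taken from this commit. */
#if defined(__GNUC__) || defined(__clang__)
#  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
#elif defined(_MSC_VER)
#  define ALWAYS_INLINE(return_type) __forceinline return_type
#else
#  define ALWAYS_INLINE(return_type) inline return_type
#endif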
Sources/PFFFT/simd/pf_scalar_float.h (new file, 185 lines)
@@ -0,0 +1,185 @@

/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
   Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_SCAL_FLT_H
#define PF_SCAL_FLT_H

/*
  fallback mode(s) for situations where SSE/AVX/NEON/Altivec are not available, use scalar mode instead
*/

#if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
#pragma message( __FILE__ ": float SCALAR4 macros are defined" )

typedef struct {
  vsfscalar a;
  vsfscalar b;
  vsfscalar c;
  vsfscalar d;
} v4sf;

# define SIMD_SZ 4

typedef union v4sf_union {
  v4sf v;
  vsfscalar f[SIMD_SZ];
} v4sf_union;

# define VARCH "4xScalar"
# define VREQUIRES_ALIGN 0

static ALWAYS_INLINE(v4sf) VZERO() {
  v4sf r = { 0.f, 0.f, 0.f, 0.f };
  return r;
}

static ALWAYS_INLINE(v4sf) VMUL(v4sf A, v4sf B) {
  v4sf r = { A.a * B.a, A.b * B.b, A.c * B.c, A.d * B.d };
  return r;
}

static ALWAYS_INLINE(v4sf) VADD(v4sf A, v4sf B) {
  v4sf r = { A.a + B.a, A.b + B.b, A.c + B.c, A.d + B.d };
  return r;
}

static ALWAYS_INLINE(v4sf) VMADD(v4sf A, v4sf B, v4sf C) {
  v4sf r = { A.a * B.a + C.a, A.b * B.b + C.b, A.c * B.c + C.c, A.d * B.d + C.d };
  return r;
}

static ALWAYS_INLINE(v4sf) VSUB(v4sf A, v4sf B) {
  v4sf r = { A.a - B.a, A.b - B.b, A.c - B.c, A.d - B.d };
  return r;
}

static ALWAYS_INLINE(v4sf) LD_PS1(vsfscalar v) {
  v4sf r = { v, v, v, v };
  return r;
}

# define VLOAD_UNALIGNED(ptr) (*((v4sf*)(ptr)))

# define VLOAD_ALIGNED(ptr) (*((v4sf*)(ptr)))

# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(v4sf)-1) ) == 0)

/* INTERLEAVE2() */
#define INTERLEAVE2( A, B, C, D) \
  do { \
    v4sf Cr = { A.a, B.a, A.b, B.b }; \
    v4sf Dr = { A.c, B.c, A.d, B.d }; \
    C = Cr; \
    D = Dr; \
  } while (0)

/* UNINTERLEAVE2() */
#define UNINTERLEAVE2(A, B, C, D) \
  do { \
    v4sf Cr = { A.a, A.c, B.a, B.c }; \
    v4sf Dr = { A.b, A.d, B.b, B.d }; \
    C = Cr; \
    D = Dr; \
  } while (0)

/* VTRANSPOSE4() */
#define VTRANSPOSE4(A, B, C, D) \
  do { \
    v4sf Ar = { A.a, B.a, C.a, D.a }; \
    v4sf Br = { A.b, B.b, C.b, D.b }; \
    v4sf Cr = { A.c, B.c, C.c, D.c }; \
    v4sf Dr = { A.d, B.d, C.d, D.d }; \
    A = Ar; \
    B = Br; \
    C = Cr; \
    D = Dr; \
  } while (0)

/* VSWAPHL() */
static ALWAYS_INLINE(v4sf) VSWAPHL(v4sf A, v4sf B) {
  v4sf r = { B.a, B.b, A.c, A.d };
  return r;
}

/* reverse/flip all floats */
static ALWAYS_INLINE(v4sf) VREV_S(v4sf A) {
  v4sf r = { A.d, A.c, A.b, A.a };
  return r;
}

/* reverse/flip complex floats */
static ALWAYS_INLINE(v4sf) VREV_C(v4sf A) {
  v4sf r = { A.c, A.d, A.a, A.b };
  return r;
}

#else
/* #pragma message( __FILE__ ": float SCALAR4 macros are not defined" ) */
#endif

#if !defined(SIMD_SZ)
#pragma message( __FILE__ ": float SCALAR1 macros are defined" )
typedef vsfscalar v4sf;

# define SIMD_SZ 1

typedef union v4sf_union {
  v4sf v;
  vsfscalar f[SIMD_SZ];
} v4sf_union;

# define VARCH "Scalar"
# define VREQUIRES_ALIGN 0
# define VZERO() 0.f
# define VMUL(a,b) ((a)*(b))
# define VADD(a,b) ((a)+(b))
# define VMADD(a,b,c) ((a)*(b)+(c))
# define VSUB(a,b) ((a)-(b))
# define LD_PS1(p) (p)
# define VLOAD_UNALIGNED(ptr) (*(ptr))
# define VLOAD_ALIGNED(ptr) (*(ptr))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)

#else
/* #pragma message( __FILE__ ": float SCALAR1 macros are not defined" ) */
#endif

#endif /* PF_SCAL_FLT_H */
Sources/PFFFT/simd/pf_sse1_float.h (new file, 82 lines)
@@ -0,0 +1,82 @@

/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_SSE1_FLT_H
#define PF_SSE1_FLT_H

/*
  SSE1 support macros
*/
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86))
#pragma message( __FILE__ ": SSE1 float macros are defined" )

#include <xmmintrin.h>
typedef __m128 v4sf;

/* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions
 * anyway, so you will have to do some work if you want to enable AVX with its 256-bit vectors. */
# define SIMD_SZ 4

typedef union v4sf_union {
  v4sf v;
  float f[SIMD_SZ];
} v4sf_union;

# define VARCH "SSE1"
# define VREQUIRES_ALIGN 1
# define VZERO() _mm_setzero_ps()
# define VMUL(a,b) _mm_mul_ps(a,b)
# define VADD(a,b) _mm_add_ps(a,b)
# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
# define VSUB(a,b) _mm_sub_ps(a,b)
# define LD_PS1(p) _mm_set1_ps(p)
# define VLOAD_UNALIGNED(ptr) _mm_loadu_ps(ptr)
# define VLOAD_ALIGNED(ptr) _mm_load_ps(ptr)

# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))

/* reverse/flip all floats */
# define VREV_S(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3))
/* reverse/flip complex floats */
# define VREV_C(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(1,0,3,2))

# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

#else
/* #pragma message( __FILE__ ": SSE1 float macros are not defined" ) */
#endif

#endif /* PF_SSE1_FLT_H */
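Illustration (not part of the commit): the SSE1 path sets VREQUIRES_ALIGN to 1 and VALIGNED() checks a 16-byte boundary, so buffers handed to VLOAD_ALIGNED must be 16-byte aligned. A hedged sketch of obtaining such a buffer with C11 aligned_alloc (alloc_f32_aligned16 is a made-up helper; the library may use its own aligned allocator instead):

#include <stdlib.h>

static float *alloc_f32_aligned16(size_t n_floats)
{
    /* aligned_alloc requires the size to be a multiple of the alignment */
    size_t bytes = ((n_floats * sizeof(float)) + 15u) & ~(size_t)15u;
    return (float *)aligned_alloc(16, bytes);   /* release with free() */
}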
Sources/PFFFT/simd/pf_sse2_double.h (new file, 281 lines)
@@ -0,0 +1,281 @@
/*
  Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
*/

/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_SSE2_DBL_H
#define PF_SSE2_DBL_H

//detect sse2 support under MSVC
#if defined ( _M_IX86_FP )
#  if _M_IX86_FP == 2
#    if !defined(__SSE2__)
#      define __SSE2__
#    endif
#  endif
#endif

/*
  SSE2 64bit support macros
*/
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined( __SSE4_2__ ) || defined( __SSE4_1__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || defined ( __x86_64__ ) || defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 ))
#pragma message (__FILE__ ": SSE2 double macros are defined" )

#include <emmintrin.h>

typedef struct {
  __m128d d128[2];
} m256d;

typedef m256d v4sf;

# define SIMD_SZ 4

typedef union v4sf_union {
  v4sf v;
  double f[SIMD_SZ];
} v4sf_union;

#if defined(__GNUC__) || defined(__clang__)

#pragma push_macro("FORCE_INLINE")
#define FORCE_INLINE static inline __attribute__((always_inline))

#elif defined (_MSC_VER)
#define FORCE_INLINE static __forceinline

#else
#error "Macro name collisions may happen with unknown compiler"
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif
#define FORCE_INLINE static inline
#endif

FORCE_INLINE m256d mm256_setzero_pd(void)
{
    m256d ret;
    ret.d128[0] = ret.d128[1] = _mm_setzero_pd();
    return ret;
}

FORCE_INLINE m256d mm256_mul_pd(m256d a, m256d b)
{
    m256d ret;
    ret.d128[0] = _mm_mul_pd(a.d128[0], b.d128[0]);
    ret.d128[1] = _mm_mul_pd(a.d128[1], b.d128[1]);
    return ret;
}

FORCE_INLINE m256d mm256_add_pd(m256d a, m256d b)
{
    m256d ret;
    ret.d128[0] = _mm_add_pd(a.d128[0], b.d128[0]);
    ret.d128[1] = _mm_add_pd(a.d128[1], b.d128[1]);
    return ret;
}

FORCE_INLINE m256d mm256_sub_pd(m256d a, m256d b)
{
    m256d ret;
    ret.d128[0] = _mm_sub_pd(a.d128[0], b.d128[0]);
    ret.d128[1] = _mm_sub_pd(a.d128[1], b.d128[1]);
    return ret;
}

FORCE_INLINE m256d mm256_set1_pd(double a)
{
    m256d ret;
    ret.d128[0] = ret.d128[1] = _mm_set1_pd(a);
    return ret;
}

FORCE_INLINE m256d mm256_load_pd (double const * mem_addr)
{
    m256d res;
    res.d128[0] = _mm_load_pd((const double *)mem_addr);
    res.d128[1] = _mm_load_pd((const double *)mem_addr + 2);
    return res;
}
FORCE_INLINE m256d mm256_loadu_pd (double const * mem_addr)
{
    m256d res;
    res.d128[0] = _mm_loadu_pd((const double *)mem_addr);
    res.d128[1] = _mm_loadu_pd((const double *)mem_addr + 2);
    return res;
}

# define VARCH "SSE2"
# define VREQUIRES_ALIGN 1
# define VZERO() mm256_setzero_pd()
# define VMUL(a,b) mm256_mul_pd(a,b)
# define VADD(a,b) mm256_add_pd(a,b)
# define VMADD(a,b,c) mm256_add_pd(mm256_mul_pd(a,b), c)
# define VSUB(a,b) mm256_sub_pd(a,b)
# define LD_PS1(p) mm256_set1_pd(p)
# define VLOAD_UNALIGNED(ptr) mm256_loadu_pd(ptr)
# define VLOAD_ALIGNED(ptr) mm256_load_pd(ptr)

FORCE_INLINE __m128d mm256_castpd256_pd128(m256d a)
{
    return a.d128[0];
}

FORCE_INLINE __m128d mm256_extractf128_pd (m256d a, const int imm8)
{
    assert(imm8 >= 0 && imm8 <= 1);
    return a.d128[imm8];
}
FORCE_INLINE m256d mm256_insertf128_pd_1(m256d a, __m128d b)
{
    m256d res;
    res.d128[0] = a.d128[0];
    res.d128[1] = b;
    return res;
}
FORCE_INLINE m256d mm256_castpd128_pd256(__m128d a)
{
    m256d res;
    res.d128[0] = a;
    return res;
}

FORCE_INLINE m256d mm256_shuffle_pd_00(m256d a, m256d b)
{
    m256d res;
    res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0],0);
    res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1],0);
    return res;
}

FORCE_INLINE m256d mm256_shuffle_pd_11(m256d a, m256d b)
{
    m256d res;
    res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0], 3);
    res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1], 3);
    return res;
}

FORCE_INLINE m256d mm256_permute2f128_pd_0x20(m256d a, m256d b) {
    m256d res;
    res.d128[0] = a.d128[0];
    res.d128[1] = b.d128[0];
    return res;
}

FORCE_INLINE m256d mm256_permute2f128_pd_0x31(m256d a, m256d b)
{
    m256d res;
    res.d128[0] = a.d128[1];
    res.d128[1] = b.d128[1];
    return res;
}

FORCE_INLINE m256d mm256_reverse(m256d x)
{
    m256d res;
    res.d128[0] = _mm_shuffle_pd(x.d128[1],x.d128[1],1);
    res.d128[1] = _mm_shuffle_pd(x.d128[0],x.d128[0],1);
    return res;
}

/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
   out1 = [ in1[0], in2[0], in1[1], in2[1] ]
   out2 = [ in1[2], in2[2], in1[3], in2[3] ]
*/
# define INTERLEAVE2(in1, in2, out1, out2) { \
    __m128d low1__ = mm256_castpd256_pd128(in1); \
    __m128d low2__ = mm256_castpd256_pd128(in2); \
    __m128d high1__ = mm256_extractf128_pd(in1, 1); \
    __m128d high2__ = mm256_extractf128_pd(in2, 1); \
    m256d tmp__ = mm256_insertf128_pd_1( \
        mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \
        _mm_shuffle_pd(low1__, low2__, 3)); \
    out2 = mm256_insertf128_pd_1( \
        mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \
        _mm_shuffle_pd(high1__, high2__, 3)); \
    out1 = tmp__; \
  }

/* UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
   out1 = [ in1[0], in1[2], in2[0], in2[2] ]
   out2 = [ in1[1], in1[3], in2[1], in2[3] ]
*/
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
    __m128d low1__ = mm256_castpd256_pd128(in1); \
    __m128d low2__ = mm256_castpd256_pd128(in2); \
    __m128d high1__ = mm256_extractf128_pd(in1, 1); \
    __m128d high2__ = mm256_extractf128_pd(in2, 1); \
    m256d tmp__ = mm256_insertf128_pd_1( \
        mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \
        _mm_shuffle_pd(low2__, high2__, 0)); \
    out2 = mm256_insertf128_pd_1( \
        mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \
        _mm_shuffle_pd(low2__, high2__, 3)); \
    out1 = tmp__; \
  }

# define VTRANSPOSE4(row0, row1, row2, row3) { \
    m256d tmp3, tmp2, tmp1, tmp0; \
    \
    tmp0 = mm256_shuffle_pd_00((row0),(row1)); \
    tmp2 = mm256_shuffle_pd_11((row0),(row1)); \
    tmp1 = mm256_shuffle_pd_00((row2),(row3)); \
    tmp3 = mm256_shuffle_pd_11((row2),(row3)); \
    \
    (row0) = mm256_permute2f128_pd_0x20(tmp0, tmp1); \
    (row1) = mm256_permute2f128_pd_0x20(tmp2, tmp3); \
    (row2) = mm256_permute2f128_pd_0x31(tmp0, tmp1); \
    (row3) = mm256_permute2f128_pd_0x31(tmp2, tmp3); \
  }

/* VSWAPHL(a, b) pseudo code:
   return [ b[0], b[1], a[2], a[3] ]
*/
# define VSWAPHL(a,b) \
    mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_castpd256_pd128(b)), mm256_extractf128_pd(a, 1))

/* reverse/flip all floats */
# define VREV_S(a) mm256_reverse(a)

/* reverse/flip complex floats */
# define VREV_C(a) mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_extractf128_pd(a, 1)), mm256_castpd256_pd128(a))

# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)

#endif
#endif /* PF_SSE2_DBL_H */
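Illustration (not part of the commit): the m256d type above is just two __m128d halves laid out back to back, so lane i of the 4-double "vector" lives in half i/2, element i%2, and VALIGNED() accordingly checks a 32-byte boundary. A self-contained layout check using only plain SSE2 intrinsics (m256d_demo and layout_check are made-up names for the sketch):

#include <emmintrin.h>
#include <assert.h>

typedef struct { __m128d d128[2]; } m256d_demo;   /* same shape as the header's m256d */

static void layout_check(void)
{
    double in[4] = {1.0, 2.0, 3.0, 4.0}, out[4];
    m256d_demo v;
    v.d128[0] = _mm_loadu_pd(in);        /* lanes 0,1 */
    v.d128[1] = _mm_loadu_pd(in + 2);    /* lanes 2,3 */
    _mm_storeu_pd(out,     v.d128[0]);
    _mm_storeu_pd(out + 2, v.d128[1]);
    for (int i = 0; i < 4; ++i) assert(out[i] == in[i]);
}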