| /* |
| Copyright (c) 2020 Dario Mambro ( [email protected] ) |
| */ |
| |
| /* Copyright (c) 2013 Julien Pommier ( [email protected] ) |
| |
| Redistribution and use of the Software in source and binary forms, |
| with or without modification, is permitted provided that the |
| following conditions are met: |
| |
| - Neither the names of NCAR's Computational and Information Systems |
| Laboratory, the University Corporation for Atmospheric Research, |
| nor the names of its sponsors or contributors may be used to |
| endorse or promote products derived from this Software without |
| specific prior written permission. |
| |
| - Redistributions of source code must retain the above copyright |
| notices, this list of conditions, and the disclaimer below. |
| |
| - Redistributions in binary form must reproduce the above copyright |
| notice, this list of conditions, and the disclaimer below in the |
| documentation and/or other materials provided with the |
| distribution. |
| |
| THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF |
| MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
| NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT |
| HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, |
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
| ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
| CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE |
| SOFTWARE. |
| */ |
| |
| #ifndef PF_SSE2_DBL_H |
| #define PF_SSE2_DBL_H |
| |
// detect SSE2 support under MSVC
| #if defined ( _M_IX86_FP ) |
| # if _M_IX86_FP == 2 |
| # if !defined(__SSE2__) |
| # define __SSE2__ |
| # endif |
| # endif |
| #endif |
| |
| /* |
  SSE2 64-bit support macros
| */ |
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined( __SSE4_2__ ) || defined( __SSE4_1__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || defined ( __x86_64__ ))
| #pragma message (__FILE__ ": SSE2 double macros are defined" ) |
| |
#include <emmintrin.h>
#include <assert.h> /* for the bounds assert in mm256_extractf128_pd */
| |
| typedef struct { |
| __m128d d128[2]; |
| } m256d; |
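/* Emulates a 256-bit double vector with two SSE2 registers: element i of the
   4-double vector lives in d128[i / 2], lane i % 2. */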
| |
| typedef m256d v4sf; |
| |
| # define SIMD_SZ 4 |
| |
| typedef union v4sf_union { |
| v4sf v; |
| double f[SIMD_SZ]; |
| } v4sf_union; |
| |
| |
| #if defined(__GNUC__) || defined(__clang__) |
| |
| #pragma push_macro("FORCE_INLINE") |
| #define FORCE_INLINE static inline __attribute__((always_inline)) |
| |
| #elif defined (_MSC_VER) |
| #define FORCE_INLINE static __forceinline |
| |
| #else |
#error "Macro name collisions may happen with an unknown compiler"
| #ifdef FORCE_INLINE |
| #undef FORCE_INLINE |
| #endif |
| #define FORCE_INLINE static inline |
| #endif |
| |
| FORCE_INLINE m256d mm256_setzero_pd(void) |
| { |
| m256d ret; |
| ret.d128[0] = ret.d128[1] = _mm_setzero_pd(); |
| return ret; |
| } |
| |
| FORCE_INLINE m256d mm256_mul_pd(m256d a, m256d b) |
| { |
| m256d ret; |
| ret.d128[0] = _mm_mul_pd(a.d128[0], b.d128[0]); |
| ret.d128[1] = _mm_mul_pd(a.d128[1], b.d128[1]); |
| return ret; |
| } |
| |
| FORCE_INLINE m256d mm256_add_pd(m256d a, m256d b) |
| { |
| m256d ret; |
| ret.d128[0] = _mm_add_pd(a.d128[0], b.d128[0]); |
| ret.d128[1] = _mm_add_pd(a.d128[1], b.d128[1]); |
| return ret; |
| } |
| |
| FORCE_INLINE m256d mm256_sub_pd(m256d a, m256d b) |
| { |
| m256d ret; |
| ret.d128[0] = _mm_sub_pd(a.d128[0], b.d128[0]); |
| ret.d128[1] = _mm_sub_pd(a.d128[1], b.d128[1]); |
| return ret; |
| } |
| |
| FORCE_INLINE m256d mm256_set1_pd(double a) |
| { |
| m256d ret; |
| ret.d128[0] = ret.d128[1] = _mm_set1_pd(a); |
| return ret; |
| } |
| |
| FORCE_INLINE m256d mm256_load_pd (double const * mem_addr) |
| { |
| m256d res; |
| res.d128[0] = _mm_load_pd((const double *)mem_addr); |
| res.d128[1] = _mm_load_pd((const double *)mem_addr + 2); |
| return res; |
| } |
| FORCE_INLINE m256d mm256_loadu_pd (double const * mem_addr) |
| { |
| m256d res; |
| res.d128[0] = _mm_loadu_pd((const double *)mem_addr); |
| res.d128[1] = _mm_loadu_pd((const double *)mem_addr + 2); |
| return res; |
| } |
| |
| |
| # define VARCH "SSE2" |
| # define VREQUIRES_ALIGN 1 |
| # define VZERO() mm256_setzero_pd() |
| # define VMUL(a,b) mm256_mul_pd(a,b) |
| # define VADD(a,b) mm256_add_pd(a,b) |
| # define VMADD(a,b,c) mm256_add_pd(mm256_mul_pd(a,b), c) |
| # define VSUB(a,b) mm256_sub_pd(a,b) |
| # define LD_PS1(p) mm256_set1_pd(p) |
| # define VLOAD_UNALIGNED(ptr) mm256_loadu_pd(ptr) |
| # define VLOAD_ALIGNED(ptr) mm256_load_pd(ptr) |
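
/* Usage sketch (illustrative only): the V* macros above form the generic
   SIMD interface consumed by the rest of the library, e.g.

     v4sf a   = LD_PS1(2.0);              // [2, 2, 2, 2]
     v4sf b   = LD_PS1(3.0);              // [3, 3, 3, 3]
     v4sf acc = VMADD(a, b, VZERO());     // [6, 6, 6, 6]
     v4sf d   = VSUB(acc, LD_PS1(1.0));   // [5, 5, 5, 5]
*/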
| |
| |
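/* The helpers below mirror the AVX _mm256_castpd256_pd128 / _mm256_extractf128_pd /
   _mm256_insertf128_pd / _mm256_castpd128_pd256 operations on the two-__m128d
   representation: half 0 is the low 128 bits, half 1 the high 128 bits. */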
| FORCE_INLINE __m128d mm256_castpd256_pd128(m256d a) |
| { |
| return a.d128[0]; |
| } |
| |
| FORCE_INLINE __m128d mm256_extractf128_pd (m256d a, const int imm8) |
| { |
| assert(imm8 >= 0 && imm8 <= 1); |
| return a.d128[imm8]; |
| } |
| FORCE_INLINE m256d mm256_insertf128_pd_1(m256d a, __m128d b) |
| { |
| m256d res; |
| res.d128[0] = a.d128[0]; |
| res.d128[1] = b; |
| return res; |
| } |
| FORCE_INLINE m256d mm256_castpd128_pd256(__m128d a) |
| { |
| m256d res; |
| res.d128[0] = a; |
| return res; |
| } |
| |
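/* mm256_shuffle_pd_00(a, b) pseudo code: return [ a[0], b[0], a[2], b[2] ]
   mm256_shuffle_pd_11(a, b) pseudo code: return [ a[1], b[1], a[3], b[3] ]
*/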
| FORCE_INLINE m256d mm256_shuffle_pd_00(m256d a, m256d b) |
| { |
| m256d res; |
| res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0],0); |
| res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1],0); |
| return res; |
| } |
| |
| FORCE_INLINE m256d mm256_shuffle_pd_11(m256d a, m256d b) |
| { |
| m256d res; |
| res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0], 3); |
| res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1], 3); |
| return res; |
| } |
| |
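/* mm256_permute2f128_pd_0x20(a, b) pseudo code: return [ a[0], a[1], b[0], b[1] ]
   mm256_permute2f128_pd_0x31(a, b) pseudo code: return [ a[2], a[3], b[2], b[3] ]
*/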
| FORCE_INLINE m256d mm256_permute2f128_pd_0x20(m256d a, m256d b) { |
| m256d res; |
| res.d128[0] = a.d128[0]; |
| res.d128[1] = b.d128[0]; |
| return res; |
| } |
| |
| |
| FORCE_INLINE m256d mm256_permute2f128_pd_0x31(m256d a, m256d b) |
| { |
| m256d res; |
| res.d128[0] = a.d128[1]; |
| res.d128[1] = b.d128[1]; |
| return res; |
| } |
| |
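/* mm256_reverse(x) pseudo code: return [ x[3], x[2], x[1], x[0] ] */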
| FORCE_INLINE m256d mm256_reverse(m256d x) |
| { |
| m256d res; |
| res.d128[0] = _mm_shuffle_pd(x.d128[1],x.d128[1],1); |
| res.d128[1] = _mm_shuffle_pd(x.d128[0],x.d128[0],1); |
| return res; |
| } |
| |
| /* INTERLEAVE2 (in1, in2, out1, out2) pseudo code: |
| out1 = [ in1[0], in2[0], in1[1], in2[1] ] |
| out2 = [ in1[2], in2[2], in1[3], in2[3] ] |
| */ |
| # define INTERLEAVE2(in1, in2, out1, out2) { \ |
| __m128d low1__ = mm256_castpd256_pd128(in1); \ |
| __m128d low2__ = mm256_castpd256_pd128(in2); \ |
| __m128d high1__ = mm256_extractf128_pd(in1, 1); \ |
| __m128d high2__ = mm256_extractf128_pd(in2, 1); \ |
| m256d tmp__ = mm256_insertf128_pd_1( \ |
| mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)), \ |
| _mm_shuffle_pd(low1__, low2__, 3)); \ |
| out2 = mm256_insertf128_pd_1( \ |
| mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)), \ |
| _mm_shuffle_pd(high1__, high2__, 3)); \ |
| out1 = tmp__; \ |
| } |
| |
/* UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
| out1 = [ in1[0], in1[2], in2[0], in2[2] ] |
| out2 = [ in1[1], in1[3], in2[1], in2[3] ] |
| */ |
| # define UNINTERLEAVE2(in1, in2, out1, out2) { \ |
| __m128d low1__ = mm256_castpd256_pd128(in1); \ |
| __m128d low2__ = mm256_castpd256_pd128(in2); \ |
| __m128d high1__ = mm256_extractf128_pd(in1, 1); \ |
| __m128d high2__ = mm256_extractf128_pd(in2, 1); \ |
| m256d tmp__ = mm256_insertf128_pd_1( \ |
| mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)), \ |
| _mm_shuffle_pd(low2__, high2__, 0)); \ |
| out2 = mm256_insertf128_pd_1( \ |
| mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)), \ |
| _mm_shuffle_pd(low2__, high2__, 3)); \ |
| out1 = tmp__; \ |
| } |
| |
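/* VTRANSPOSE4(row0, row1, row2, row3) transposes the 4x4 matrix whose rows
   are the four vectors, pseudo code:
   [ r00 r01 r02 r03 ]      [ r00 r10 r20 r30 ]
   [ r10 r11 r12 r13 ]  ->  [ r01 r11 r21 r31 ]
   [ r20 r21 r22 r23 ]      [ r02 r12 r22 r32 ]
   [ r30 r31 r32 r33 ]      [ r03 r13 r23 r33 ]
*/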
| # define VTRANSPOSE4(row0, row1, row2, row3) { \ |
| m256d tmp3, tmp2, tmp1, tmp0; \ |
| \ |
| tmp0 = mm256_shuffle_pd_00((row0),(row1)); \ |
| tmp2 = mm256_shuffle_pd_11((row0),(row1)); \ |
| tmp1 = mm256_shuffle_pd_00((row2),(row3)); \ |
| tmp3 = mm256_shuffle_pd_11((row2),(row3)); \ |
| \ |
| (row0) = mm256_permute2f128_pd_0x20(tmp0, tmp1); \ |
| (row1) = mm256_permute2f128_pd_0x20(tmp2, tmp3); \ |
| (row2) = mm256_permute2f128_pd_0x31(tmp0, tmp1); \ |
| (row3) = mm256_permute2f128_pd_0x31(tmp2, tmp3); \ |
| } |
| |
/* VSWAPHL(a, b) pseudo code:
| return [ b[0], b[1], a[2], a[3] ] |
| */ |
| # define VSWAPHL(a,b) \ |
| mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_castpd256_pd128(b)), mm256_extractf128_pd(a, 1)) |
| |
/* reverse/flip all doubles */
| # define VREV_S(a) mm256_reverse(a) |
| |
/* reverse/flip complex doubles */
| # define VREV_C(a) mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_extractf128_pd(a, 1)), mm256_castpd256_pd128(a)) |
| |
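/* true when ptr is 32-byte aligned, i.e. aligned to SIMD_SZ doubles */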
| # define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0) |
| |
| #endif |
| #endif |