| |
| /* Copyright (c) 2013 Julien Pommier ( [email protected] ) |
| |
| Redistribution and use of the Software in source and binary forms, |
| with or without modification, is permitted provided that the |
| following conditions are met: |
| |
| - Neither the names of NCAR's Computational and Information Systems |
| Laboratory, the University Corporation for Atmospheric Research, |
| nor the names of its sponsors or contributors may be used to |
| endorse or promote products derived from this Software without |
| specific prior written permission. |
| |
| - Redistributions of source code must retain the above copyright |
| notices, this list of conditions, and the disclaimer below. |
| |
| - Redistributions in binary form must reproduce the above copyright |
| notice, this list of conditions, and the disclaimer below in the |
| documentation and/or other materials provided with the |
| distribution. |
| |
| THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF |
| MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
| NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT |
| HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, |
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
| ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
| CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE |
| SOFTWARE. |
| */ |
| |
| #ifndef PF_FLT_H |
| #define PF_FLT_H |
| |
| #include <assert.h> |
| #include <string.h> |
| #include <stdint.h> |
| |
| |
| /* |
| * SIMD reference material: |
| * |
| * general SIMD introduction: |
| * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing |
| * |
| * SSE 1: |
| * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ |
| * |
| * ARM NEON: |
| * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics |
| * |
| * Altivec: |
| * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf |
| * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html |
| * better one? |
| * |
| */ |
| |
| |
| #include "pf_sse1_float.h" |
| #include "pf_neon_float.h" |
| #include "pf_altivec_float.h" |
| |
/* SIMD_SZ is still undefined after the backend headers included above
 * (presumably each backend defines it when its instruction set is usable):
 * warn and select the scalar code path, unless the build already requested
 * it through PFFFT_SIMD_DISABLE. */
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE)
#  warning "building float with simd disabled !\n";
#  define PFFFT_SIMD_DISABLE /* fallback to scalar code */
#endif
| |
| #include "pf_scalar_float.h" |
| |
| |
/*
 * Shortcuts for complex multiplications on split real/imaginary vectors,
 * built on the VMUL / VADD / VSUB operations of the selected backend.
 *
 *   VCPLXMUL(ar,ai,br,bi):      ar <- ar*br - ai*bi,  ai <- ai*br + ar*bi
 *   VCPLXMULCONJ(ar,ai,br,bi):  ar <- ar*br + ai*bi,  ai <- ai*br - ar*bi
 *
 * (the right-hand sides use the values ar/ai had on entry).
 *
 * ar and ai are updated in place and must be modifiable lvalues; br and bi
 * are each expanded twice, so they should be free of side effects.
 * The do { } while (0) wrapper turns "MACRO(...);" into exactly one
 * statement, which keeps the macros usable in an unbraced if/else.
 */
#define VCPLXMUL(ar,ai,br,bi) \
  do { v4sf cplxmul_tmp_; cplxmul_tmp_=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,cplxmul_tmp_); } while (0)
#define VCPLXMULCONJ(ar,ai,br,bi) \
  do { v4sf cplxmul_tmp_; cplxmul_tmp_=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,cplxmul_tmp_); } while (0)
#ifndef SVMUL
/* multiply a scalar with a vector (default used when the backend did not
 * provide its own SVMUL) */
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
#endif
| |
| typedef union v4sf_union { |
| v4sf v; |
| float f[SIMD_SZ]; |
| } v4sf_union; |
| |
| #if !defined(PFFFT_SIMD_DISABLE) |
| |
/* Assert that lanes f[0]..f[3] of v compare equal to f0..f3.
 * v is expanded four times and is parenthesized so that expressions such
 * as *ptr can be passed. Like any assert(), it is a no-op under NDEBUG. */
#define assertv4(v,f0,f1,f2,f3) \
  assert((v).f[0] == (f0) && (v).f[1] == (f1) && (v).f[2] == (f2) && (v).f[3] == (f3))
| |
| /* detect bugs with the vector support macros */ |
| static void Vvalidate_simd() { |
| float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 }; |
| v4sf_union a0, a1, a2, a3, t, u; |
| memcpy(a0.f, f, 4*sizeof(float)); |
| memcpy(a1.f, f+4, 4*sizeof(float)); |
| memcpy(a2.f, f+8, 4*sizeof(float)); |
| memcpy(a3.f, f+12, 4*sizeof(float)); |
| |
| t = a0; u = a1; t.v = VZERO(); |
| printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0); |
| t.v = VADD(a1.v, a2.v); |
| printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18); |
| t.v = VMUL(a1.v, a2.v); |
| printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77); |
| t.v = VMADD(a1.v, a2.v,a0.v); |
| printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80); |
| |
| INTERLEAVE2(a1.v,a2.v,t.v,u.v); |
| printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); |
| assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11); |
| UNINTERLEAVE2(a1.v,a2.v,t.v,u.v); |
| printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); |
| assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11); |
| |
| t.v=LD_PS1(f[15]); |
| printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); |
| assertv4(t, 15, 15, 15, 15); |
| t.v = VSWAPHL(a1.v, a2.v); |
| printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); |
| assertv4(t, 8, 9, 6, 7); |
| VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v); |
| printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", |
| a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3], |
| a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]); |
| assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15); |
| } |
| #endif /* !PFFFT_SIMD_DISABLE */ |
| |
/* SSE and co like 16-bytes aligned pointers
 * with a 64-byte alignment, we are even aligned on L2 cache lines... */
#define MALLOC_V4SF_ALIGNMENT 64

/*
 * Allocate nb_bytes of storage whose address is a multiple of
 * MALLOC_V4SF_ALIGNMENT.
 *
 * The block is over-allocated by MALLOC_V4SF_ALIGNMENT bytes with malloc();
 * the returned address is the next alignment boundary strictly above the
 * raw pointer, and the raw pointer is saved in the void* slot just below
 * the returned address. Release the result with Valigned_free() only,
 * never with free().
 *
 * Returns NULL when malloc() fails or when nb_bytes is so large that
 * adding the alignment padding would overflow size_t.
 *
 * NOTE: malloc() needs <stdlib.h>, which this header does not include
 * directly; the including file is expected to provide it.
 */
static
void *Valigned_malloc(size_t nb_bytes) {
  void *p, *p0;
  /* without this guard the padded size wraps around and a far too small
   * block would be allocated */
  if (nb_bytes > SIZE_MAX - MALLOC_V4SF_ALIGNMENT) return NULL;
  p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
  if (!p0) return NULL;
  /* round (p0 + alignment) down to a boundary: p0 < p <= p0 + alignment */
  p = (void *) (((uintptr_t) p0 + MALLOC_V4SF_ALIGNMENT) & ~((uintptr_t) (MALLOC_V4SF_ALIGNMENT-1)));
  /* malloc() results are aligned for any object type, so the gap between
   * p0 and p is large enough to hold one pointer */
  *((void **) p - 1) = p0;
  return p;
}
| |
| static |
| void Valigned_free(void *p) { |
| if (p) free(*((void **) p - 1)); |
| } |
| |
| |
| #endif /* PF_FLT_H */ |
| |