Fix a bug in real FFTs for N a multiple of 25; work around a compiler bug in clang 3.2 for ARM on Linux
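
As a sanity check (not part of this patch), a forward/backward round trip at a size containing the factor 25 exercises the fixed radix-5 real path. Below is a minimal sketch using only the public pffft.h API (pffft_new_setup, pffft_transform_ordered, pffft_aligned_malloc); the size N = 800 (25*32, since SIMD real transforms require a multiple of 32) and the error threshold are illustrative choices, not values mandated by the library.

    /* Round-trip check: with the old radix-5 code, forward+backward real
       transforms of a size that is a multiple of 25 did not reproduce the
       input. N = 800 and the 1e-3 tolerance are arbitrary test choices. */
    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include "pffft.h"

    int main(void) {
      const int N = 800;  /* multiple of 25, and of 32 as required for PFFFT_REAL */
      float *in   = pffft_aligned_malloc(N * sizeof(float));
      float *out  = pffft_aligned_malloc(N * sizeof(float));
      float *freq = pffft_aligned_malloc(N * sizeof(float));
      float *work = pffft_aligned_malloc(N * sizeof(float));
      PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
      float err = 0;
      int k;
      for (k = 0; k < N; ++k) in[k] = (float)rand() / RAND_MAX - 0.5f;
      pffft_transform_ordered(s, in, freq, work, PFFFT_FORWARD);
      pffft_transform_ordered(s, freq, out, work, PFFFT_BACKWARD);
      for (k = 0; k < N; ++k) {
        float d = fabsf(out[k] / N - in[k]);  /* backward transform is unscaled */
        if (d > err) err = d;
      }
      printf("N=%d max round-trip error: %g\n", N, err);
      pffft_destroy_setup(s);
      pffft_aligned_free(in);   pffft_aligned_free(out);
      pffft_aligned_free(freq); pffft_aligned_free(work);
      return err < 1e-3f ? 0 : 1;
    }
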
diff --git a/pffft.c b/pffft.c
index fc4e314..0603f2b 100644
--- a/pffft.c
+++ b/pffft.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011 Julien Pommier ( [email protected] )
+/* Copyright (c) 2013 Julien Pommier ( [email protected] )
Based on original fortran 77 code from FFTPACKv4 from NETLIB
(http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
@@ -71,7 +71,7 @@
#endif
#if defined(COMPILER_GCC)
-# define ALWAYS_INLINE(return_type) return_type __attribute__ ((always_inline))
+# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
# define RESTRICT __restrict
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
@@ -160,7 +160,7 @@
# define LD_PS1(p) vld1q_dup_f32(&(p))
# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
-# define VTRANSPOSE4_(x0,x1,x2,x3) { \
+# define VTRANSPOSE4(x0,x1,x2,x3) { \
float32x4x2_t t0_ = vzipq_f32(x0, x2); \
float32x4x2_t t1_ = vzipq_f32(x1, x3); \
float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
@@ -168,7 +168,7 @@
x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
}
// marginally faster version
-# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
+//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
# define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
#else
@@ -825,14 +825,14 @@
for (k = 1; k <= l1; ++k) {
for (i = 3; i <= ido; i += 2) {
ic = idp2 - i;
- dr2 = LD_PS1(wa1[i-2]); di2 = LD_PS1(wa1[i-1]);
- dr3 = LD_PS1(wa2[i-2]); di3 = LD_PS1(wa2[i-1]);
- dr4 = LD_PS1(wa3[i-2]); di4 = LD_PS1(wa3[i-1]);
- dr5 = LD_PS1(wa4[i-2]); di5 = LD_PS1(wa4[i-1]);
- VCPLXMUL(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
- VCPLXMUL(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
- VCPLXMUL(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
- VCPLXMUL(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
+ dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]);
+ dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]);
+ dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]);
+ dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]);
+ VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
+ VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
+ VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
+ VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
cr2 = VADD(dr2, dr5);
ci5 = VSUB(dr5, dr2);
cr5 = VSUB(di2, di5);
@@ -842,21 +842,21 @@
cr4 = VSUB(di3, di4);
ci3 = VADD(di3, di4);
ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
- ch_ref(i, 1, k) = VADD(cc_ref(i, k, 1), VADD(ci2, ci3));
+ ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));//
tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
- ti2 = VADD(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));
+ ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));//
tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
- ti3 = VADD(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));
+ ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));//
tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
- ch_ref(i - 1, 3, k) = VADD(tr2, tr5);
- ch_ref(ic - 1, 2, k) = VSUB(tr2, tr5);
+ ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
+ ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
ch_ref(i, 3, k) = VADD(ti2, ti5);
ch_ref(ic, 2, k) = VSUB(ti5, ti2);
- ch_ref(i - 1, 5, k) = VADD(tr3, tr4);
- ch_ref(ic - 1, 4, k) = VSUB(tr3, tr4);
+ ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
+ ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
ch_ref(i, 5, k) = VADD(ti3, ti4);
ch_ref(ic, 4, k) = VSUB(ti4, ti3);
}
@@ -939,10 +939,10 @@
dr2 = VSUB(cr2, ci5);
di5 = VSUB(ci2, cr5);
di2 = VADD(ci2, cr5);
- VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
- VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
- VCPLXMUL(dr4, di4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
- VCPLXMUL(dr5, di5, LD_PS1(wa4[i-2]), LD_PS1(wa4[i-1]));
+ VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
+ VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
+ VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
+ VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));
ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
@@ -1411,7 +1411,7 @@
}
-ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
+static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
const v4sf *e, v4sf *out) {
v4sf r0, i0, r1, i1, r2, i2, r3, i3;
v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
@@ -1512,7 +1512,7 @@
}
-ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
+static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
const v4sf *e, v4sf *out, int first) {
v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
/*
@@ -1617,8 +1617,6 @@
v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack };
int ib = (nf_odd ^ ordered ? 1 : 0);
- if (scratch == 0) scratch = scratch_on_stack;
-
assert(VALIGNED(finput) && VALIGNED(foutput));
//assert(finput != foutput);
@@ -1675,7 +1673,7 @@
}
void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
- int i, Ncvec = s->Ncvec;
+ int Ncvec = s->Ncvec;
const v4sf * RESTRICT va = (const v4sf*)a;
const v4sf * RESTRICT vb = (const v4sf*)b;
v4sf * RESTRICT vab = (v4sf*)ab;
@@ -1693,10 +1691,16 @@
__builtin_prefetch(va+6);
__builtin_prefetch(vb+6);
__builtin_prefetch(vab+6);
+# ifndef __clang__
+# define ZCONVOLVE_USING_INLINE_NEON_ASM
+# endif
#endif
float ar, ai, br, bi, abr, abi;
+#ifndef ZCONVOLVE_USING_INLINE_NEON_ASM
v4sf vscal = LD_PS1(scaling);
+ int i;
+#endif
assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
ar = ((v4sf_union*)va)[0].f[0];
@@ -1706,8 +1710,7 @@
abr = ((v4sf_union*)vab)[0].f[0];
abi = ((v4sf_union*)vab)[1].f[0];
-#ifdef __arm__
-# if 1 // inline asm version
+#ifdef ZCONVOLVE_USING_INLINE_NEON_ASM // inline asm version, unfortunately miscompiled by clang 3.2 (at least on Ubuntu), so this path is restricted to gcc
const float *a_ = a, *b_ = b; float *ab_ = ab;
int N = Ncvec;
asm volatile("mov r8, %2 \n"
@@ -1743,49 +1746,7 @@
"subs %3, #2 \n"
"bne 1b \n"
: "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
-
-# else // neon instrinsics version, 30% slower that the asm one with gcc 4.6
- v4sf a1r, a1i, b1r, b1i;
- v4sf a2r, a2i, b2r, b2i;
- v4sf ab1r, ab1i, ab2r, ab2i;
- for (i=0; i < Ncvec; i += 2) {
- __builtin_prefetch(va+8);
- __builtin_prefetch(va+10);
-
- a1r = *va++; a1i = *va++;
- a2r = *va++; a2i = *va++;
- b1r = *vb++; b1i = *vb++;
- b2r = *vb++; b2i = *vb++;
- ab1r = vab[0]; ab1i = vab[1];
- ab2r = vab[2]; ab2i = vab[3];
-
- v4sf z1r = VMUL(a1r, b1r);
- v4sf z2r = VMUL(a2r, b2r);
- v4sf z1i = VMUL(a1r, b1i);
- v4sf z2i = VMUL(a2r, b2i);
-
- __builtin_prefetch(vb+4);
- __builtin_prefetch(vb+6);
-
- z1r = vmlsq_f32(z1r, a1i, b1i);
- z2r = vmlsq_f32(z2r, a2i, b2i);
- z1i = vmlaq_f32(z1i, a1i, b1r);
- z2i = vmlaq_f32(z2i, a2i, b2r);
-
- __builtin_prefetch(vab+4);
- __builtin_prefetch(vab+6);
-
- ab1r = vmlaq_f32(ab1r, z1r, vscal);
- ab2r = vmlaq_f32(ab2r, z2r, vscal);
- ab1i = vmlaq_f32(ab1i, z1i, vscal);
- ab2i = vmlaq_f32(ab2i, z2i, vscal);
-
- *vab++ = ab1r; *vab++ = ab1i;
- *vab++ = ab2r; *vab++ = ab2i;
- }
-# endif
-
-#else // not ARM, no need to use a special routine
+#else // default routine, works fine for non-arm cpus with current compilers
for (i=0; i < Ncvec; i += 2) {
v4sf ar, ai, br, bi;
ar = va[2*i+0]; ai = va[2*i+1];